Esempio n. 1
0
def main(argv=None):
    """Count sample trees that are compatible with a reference tree.

    The reference tree is read from the file given with ``-r``; the
    sample trees are read from stdin.  Every sample tree is compared to
    the reference with ``TreeTools.IsCompatible`` and a one-line summary
    of the tallies is printed.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if no reference tree file was supplied.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below -- confirm
    # whether E.Start picks up sys.argv on its own.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/evaluate_trees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    # The long option used to be registered as "--reference=" (getopt
    # style).  optparse only accepted "--reference" through abbreviation
    # matching; register the plain name instead.
    parser.add_option("-r",
                      "--reference",
                      dest="filename_reference_tree",
                      help="filename with reference tree.",
                      type="string")

    parser.set_defaults(filename_reference_tree=None)

    (options, args) = E.Start(parser)

    if not options.filename_reference_tree:
        # Previously the message was only printed and the script then
        # failed inside open(None, "r").
        raise ValueError("please supply reference tree.")

    if options.loglevel >= 1:
        print("# reading reference tree.")

    # Pick the tree while the file is still open so that the handle can
    # be closed deterministically.
    with open(options.filename_reference_tree, "r") as infile:
        reference_tree = TreeTools.Newick2Nexus(infile).trees[0]

    if options.loglevel >= 1:
        print("# reading sample trees.")

    samples = TreeTools.Newick2Nexus(sys.stdin)

    ntotal, nok, nfailed = 0, 0, 0
    # Failure tallies keyed by the reason string reported by IsCompatible;
    # reasons other than these three only count towards nfailed.
    reasons = {"topology": 0, "taxa": 0, "leaves": 0}
    for t in samples.trees:
        ntotal += 1
        is_ok, reason = TreeTools.IsCompatible(reference_tree, t)
        if is_ok:
            nok += 1
            continue
        nfailed += 1
        if reason in reasons:
            reasons[reason] += 1

    print("# total=%i, compatible=%i, failed=%i, topology=%i, taxa=%i, leaves=%i" %
          (ntotal, nok, nfailed,
           reasons["topology"], reasons["taxa"], reasons["leaves"]))

    E.Stop()
Esempio n. 2
0
def main(argv=None):
    """Read a tree from stdin and print the result of ``TreeGraph.Run``.

    Lines starting with ``#`` are dropped before the remaining lines are
    handed to ``TreeTools.Newick2Nexus``; only the first tree is used.

    Parses command line options in sys.argv, unless *argv* is given.
    """

    # Identity test: None is a singleton, "==" may be overridden.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree2plot.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.set_defaults()

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Build a real list (not a lazy filter object) and avoid indexing
    # into the line, which would fail on an empty string.
    lines = [line for line in sys.stdin.readlines()
             if not line.startswith("#")]

    nexus = TreeTools.Newick2Nexus(lines)

    input_tree = nexus.trees[0]

    treegraph = TreeGraph(support=None, loglevel=options.loglevel)

    print(treegraph.Run(input_tree))

    E.Stop()
Esempio n. 3
0
    def run(self, grammar, tree=None, dump=0, test=False, options={}):
        """Run ``self.mExecutable`` on *grammar* in a fresh temporary directory.

        The grammar text (``grammar.getGrammar()``) is written to
        ``grammar.eg`` inside a directory created with
        ``tempfile.mkdtemp()``.  If *tree* is a string it is written to
        ``tree.nh``: directly when it looks like a Newick string (starts
        with "(" and ends with ")" or ";"), otherwise it is treated as a
        filename whose first tree is converted with
        ``TreeTools.Tree2Newick``.

        Arguments:
            grammar: object providing ``getGrammar()``.
            tree: Newick string or filename; nothing is written to the
                tree file for other types.
            dump: if true, print the program's stdout.
            test: if true, print the temporary directory and keep it.
            options: not read by this method.
                NOTE(review): mutable default kept for interface
                compatibility; it is never mutated here.

        Returns:
            whatever ``self.parseOutput`` returns for the program's
            stdout split into lines.

        Raises:
            UsageError: if the program exits with a non-zero status.  The
                temporary directory is left in place in that case and is
                named in the message.
        """

        self.mTempdir = tempfile.mkdtemp()
        self.mFilenameGrammar = "grammar.eg"
        self.mFilenameTree = "tree.nh"
        self.mFilenameOutput = None
        self.mWarnings = []

        if test:
            print("# temporary directory is %s" % self.mTempdir)

        with open(self.mTempdir + "/" + self.mFilenameGrammar, "w") as outfile:
            outfile.write(grammar.getGrammar())

        if tree:
            with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
                # check what kind of tree is given.
                if isinstance(tree, StringType):
                    t = tree.strip()
                    # startswith/endswith are safe on an empty string,
                    # unlike t[0] / t[-1].
                    if t.startswith("(") and t.endswith((")", ";")):
                        outfile.write("%s\n" % t)
                    else:
                        with open(tree, "r") as infile:
                            t = TreeTools.Newick2Nexus(infile).trees[0]
                        outfile.write("%s\n" % TreeTools.Tree2Newick(t))

        # use your own random seed. Time won't do, if simgram
        # is called in quick succession.
        # Are there any restrictions on seeds? Ian using an even number.
        # NOTE(review): "-t tree.nh" is passed even when no tree file was
        # written, and randint's upper bound (2**32) is inclusive --
        # confirm both are acceptable to the executable.
        statement = "%s -rndseed %i -g %s -t %s" % (
            self.mExecutable, random.randint(
                0, 4294967296), self.mFilenameGrammar, self.mFilenameTree)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             cwd=self.mTempdir,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise UsageError(
                "Error in running %s \n%s\n%s\nTemporary directory in %s" % (
                    self.mExecutable, err, out, self.mTempdir))

        if dump:
            print("# stdout output of %s:\n%s\n######################################" % (
                self.mExecutable, out))

        if not test:
            shutil.rmtree(self.mTempdir)

        return self.parseOutput(out.split("\n"))
Esempio n. 4
0
def trainMali( mali, options ):
    """Prepare a multiple alignment (and optional tree) for grammar training.

    Only the preparation steps are visible in this block: optional
    cleaning of the alignment, shortening of identifiers, building the
    identifier maps and relabelling the tree.

    Arguments:
        mali: alignment object; it is modified in place (gap characters,
            removed columns, renamed identifiers).
        options: option container; the attributes read here are
            ``clean_mali``, ``separator`` and ``input_filename_tree``.

    Raises:
        KeyError: if relabelling the tree with the alignment identifiers
            fails; the original message is embedded in the new one.
    """

    ## remove empty columns and masked columns
    if options.clean_mali:
        # treat "n"/"N" as gap characters as well, so they are removed too
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps( minimum_gaps = 1, frame=1 )
    
    # NOTE(review): length, input_model and ids are not used in the
    # visible part of this function.
    length = mali.getNumColumns()

    input_model = prepareGrammar( options )

    # shorten identifiers to the part before the first separator
    for id in mali.getIdentifiers():
        if options.separator in id:
            species = id.split(options.separator)[0]
            mali.rename( id, species )

    # mapIdentifiers() returns a new->old mapping; invert it so the tree
    # can be relabelled with the new names.
    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary( map_new2old, make_unique = True )
    
    ids = mali.getIdentifiers()

    if options.input_filename_tree:
        # only the first tree in the file is used
        nexus = TreeTools.Newick2Nexus( open(options.input_filename_tree,"r") )
        tree = nexus.trees[0]
        try:
            tree.relabel( map_old2new, warn = True )
        except KeyError, msg:
            raise KeyError( "names in mali and tree are not congruent: %s" % msg )
Esempio n. 5
0
    def processChunk(lines, map_strain2species, options):
        """Merge strains into species for every tree parsed from *lines*.

        Each tree is counted in ``ninput``; trees for which at least one
        merger was found are counted in ``nmerged``.  After the mergers
        have been applied, trees with at most one terminal are counted in
        ``nskipped`` and dropped; all others are written to
        ``options.stdout`` and counted in ``noutput``.
        """
        global ninput, noutput, nskipped, nmerged

        for current_tree in TreeTools.Newick2Nexus(lines).trees:
            ninput += 1

            if options.loglevel >= 3:
                current_tree.display()

            merge_nodes = getSpeciesTreeMergers(
                current_tree, map_strain2species, options)

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# found %i nodes in the tree that will be merged.\n" % (len(merge_nodes)))

            if len(merge_nodes) > 0:
                nmerged += 1

            # the mergers are applied even if the list is empty
            applySpeciesTreeMergers(
                current_tree, merge_nodes, map_strain2species, options)

            if len(current_tree.get_terminals()) < 2:
                # nothing meaningful is left of this tree
                nskipped += 1
            else:
                current_tree.writeToFile(options.stdout,
                                         format=options.output_format)
                noutput += 1
Esempio n. 6
0
def main(argv=None):
    """Write the taxa of all trees read from stdin as a table.

    With a single input tree only a ``taxon`` column is written.  With
    several trees a running tree number is added, followed by the tree
    name unless ``--skip-trees`` was given.

    Parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--skip-trees", dest="skip_trees", action="store_true",
        help="do not output tree names in third field [default=%default].")

    parser.set_defaults(skip_trees=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    trees = TreeTools.Newick2Nexus(sys.stdin).trees
    ntotal = len(trees)
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % ntotal)

    # The header depends on how many columns the rows below will have.
    single_tree = ntotal == 1
    if single_tree:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    options.stdout.write(header)

    for index, tree in enumerate(trees):
        ntree = index + 1  # tree numbers in the output are 1-based
        for taxon in TreeTools.GetTaxa(tree):
            if single_tree:
                row = "%s\n" % taxon
            elif options.skip_trees:
                row = "%s\t%i\n" % (taxon, ntree)
            else:
                row = "%s\t%i\t%s\n" % (taxon, ntree, tree.name)
            options.stdout.write(row)

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % ntotal)

    E.Stop()
Esempio n. 7
0
def main(argv=None):
    """Print the patterns computed from a reference tree.

    The reference tree given with ``-t`` is parsed with
    ``TreeTools.Newick2Nexus`` and its first tree is passed, together
    with the OTU order, to ``TreeTools.calculatePatternsFromTree``.  The
    order is taken from ``-s`` (comma-separated) or, if that is not
    given, from the terminals of the reference tree.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if no reference tree was supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-s",
                      "--sort-order",
                      dest="sort_order",
                      type="string",
                      help="output order of OTU.")

    parser.set_defaults(
        reference_tree=None,
        sort_order=[],
    )

    (options, args) = E.Start(parser)

    # Validate and parse the tree first: the default sort order below is
    # derived from it.  (The tree used to be read only afterwards, so the
    # default path failed with a NameError.)
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        # build a fresh list instead of appending to the parser default
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    if options.loglevel >= 3:
        print("# reference tree:")
        # display() is called for its output, as elsewhere in this code
        # base; printing its return value as well is presumably not
        # intended.
        reference_tree.display()

    # the original referenced an undefined name ``tree`` here
    patterns = TreeTools.calculatePatternsFromTree(reference_tree,
                                                   options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
Esempio n. 8
0
    def WriteTree(self, tree):
        """Write *tree* to the tree file in the temporary directory.

        *tree* is parsed with ``TreeTools.Newick2Nexus``; the first tree
        is relabelled with ``self.mMapOld2New`` and written in Newick
        format, preceded by a header line of the form
        ``<number of sequences> 1``.
        """

        t = TreeTools.Newick2Nexus(tree).trees[0]
        TreeTools.MapTaxa(t, self.mMapOld2New)

        # "with" closes the file even if one of the writes fails
        with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
            outfile.write("%i 1\n" % self.mNumSequences)
            outfile.write("%s\n" % TreeTools.Tree2Newick(t))
Esempio n. 9
0
def ParseTree(reference_tree, rx_species):
    """Parse *reference_tree* and number its terminal taxa.

    Returns a tuple ``(tree, map_taxon2id)``: the first tree parsed from
    *reference_tree* and a dictionary assigning consecutive integers to
    the taxa in the order of ``get_terminals()``, plus a final entry for
    ``"unknown"``.  *rx_species* is not used.
    """
    parsed_tree = TreeTools.Newick2Nexus(reference_tree).trees[0]

    if param_loglevel >= 3:
        print("# reference tree:")
        parsed_tree.display()

    map_taxon2id = {}
    for terminal in parsed_tree.get_terminals():
        otu = parsed_tree.node(terminal).get_data().taxon
        # the next free id is the current size of the map
        next_id = len(map_taxon2id)
        map_taxon2id[otu] = next_id
        if param_loglevel >= 2:
            print("# %s\t%i" % (otu, next_id))

    # catch-all bucket, numbered after all real taxa
    map_taxon2id["unknown"] = len(map_taxon2id)

    return parsed_tree, map_taxon2id
Esempio n. 10
0
    def testGetMergers(self):
        """Check that every merger reported for the test trees is expected.

        A merger matches if its (strain, gene) pair appears in the
        reference in either orientation.

        TODO: add testing for transcripts
        """
        print("testGetMergers()")

        for lines, reference, map_strain2species, options in self.mTestData:
            first_tree = TreeTools.Newick2Nexus(lines).trees[0]
            found = tree_strain2species.getMergers(
                first_tree, map_strain2species, options)

            for node_id, species, strain_x, gene_x, strain_y, gene_y in found:
                left = (strain_x, gene_x)
                right = (strain_y, gene_y)
                forward = (left, right)
                # the reference may list the pair in either order
                if forward in reference or (right, left) in reference:
                    continue
                self.fail("%s not in reference %s" %
                          (str(forward), str(reference)))
Esempio n. 11
0
    def parseOutput(self, lines, out, err):
        """Build a ``Result`` from the program output.

        *lines* are joined, all whitespace and all bracketed sections
        (``[...]``) are removed, and the remaining text is parsed with
        ``TreeTools.Newick2Nexus``.  The first tree is relabelled with
        ``self.mMapNew2Old`` and stored in ``mTree``; *out* and *err* are
        stored unchanged in ``mLog`` and ``mErr``.
        """

        # Raw strings keep the regex escapes intact; the patterns are the
        # same as before.
        text = re.sub(r"\s", "", "".join(lines))
        text = re.sub(r"\[[^\]]+\]", "", text)

        # text is already a single string, no second join is needed
        tree = TreeTools.Newick2Nexus(text).trees[0]
        TreeTools.MapTaxa(tree, self.mMapNew2Old)

        result = Result()
        result.mTree = tree
        result.mLog = out
        result.mErr = err

        return result
Esempio n. 12
0
    def processChunk(lines, map_strain2species, options):
        """Merge genes within the trees parsed from *lines* and write them out.

        Each tree is counted in ``ninput``; trees with at least one merger
        are counted in ``nmerged``.  After the mergers have been applied,
        trees with at most one terminal are counted in ``nskipped`` and
        dropped.  For the remaining trees every merged (strain, gene) is
        written with its new name to ``output_genes`` and the tree is
        written to ``options.stdout``.

        ``counters``, ``merged`` and ``output_genes`` are taken from the
        enclosing scope.
        """
        # nwarnings is incremented below and therefore has to be declared
        # as well; without it the increment raised UnboundLocalError.
        # NOTE(review): assumes nwarnings lives at module level like the
        # other counters -- confirm.
        global ninput, noutput, nskipped, nmerged, nwarnings

        nexus = TreeTools.Newick2Nexus(lines)

        for tree in nexus.trees:
            ninput += 1

            if options.loglevel >= 3:
                tree.display()

            mergers = getMergers(tree, map_strain2species, options)

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# found %i pairs of genes that will be merged.\n" %
                    (len(mergers)))

            if len(mergers) > 0:
                nmerged += 1

            new_names = applyMergers(tree, mergers, counters,
                                     map_strain2species, options)

            if len(tree.get_terminals()) <= 1:
                nskipped += 1
                continue

            for new_name, values in new_names.items():
                for strain, gene in values:
                    key = (strain, gene)
                    if key in merged:
                        # The message has three placeholders; it used to be
                        # formatted with the single stored value (always
                        # None), which raised a TypeError.
                        options.stdlog.write(
                            "# warning: strain %s and gene %s already appeared in tree %s\n"
                            % (strain, gene, merged[key]))
                        nwarnings += 1
                    # Remember the running number of the tree so that the
                    # warning above can name it.
                    # NOTE(review): the value used to be None; confirm that
                    # no other code relies on that.
                    merged[key] = ninput
                    output_genes.write("%s\t%s\n" % (options.separator.join(
                        (strain, gene)), new_name))

            tree.writeToFile(options.stdout, format=options.output_format)
            noutput += 1
Esempio n. 13
0
def GetPrunedReferenceTree(mask, present_orgs, reference_tree):
    """Return a freshly parsed reference tree restricted to *present_orgs*.

    *reference_tree* is parsed anew on every call because the resulting
    tree is modified afterwards.  Every terminal whose taxon is not in
    *present_orgs* is removed with ``Prune``.  *mask* is not used.
    """
    # reread the species tree; the parsed tree is pruned in place below
    pruned_tree = TreeTools.Newick2Nexus(reference_tree).trees[0]

    # keep only those taxa that are present in the cluster
    for terminal in pruned_tree.get_terminals():
        taxon = pruned_tree.node(terminal).get_data().taxon
        if taxon in present_orgs:
            continue
        Prune(pruned_tree, taxon)

    if param_loglevel >= 3:
        print("# pruned reference tree for %s:" % (",".join(present_orgs.keys())))
        pruned_tree.display()

    return pruned_tree
Esempio n. 14
0
def processMali(mali, options):
    """Train a grammar on *mali* and write one result row per block.

    Steps, in order:

    1. If ``options.block_size`` is set, annotate the alignment with a
       ``STATE`` line of "N" columns followed by "C" columns.
    2. Shorten identifiers to the part before ``options.separator``.
    3. Optionally clean the alignment and read a tree, relabelled with
       the alignment identifiers.
    4. Call ``prepareGrammar`` and extract the rate matrix from the
       trained model.
    5. For each block, compute the rate summaries and write a
       tab-separated row to ``options.stdout``; values are "na" if a
       ``ZeroDivisionError`` occurs.

    Raises:
        ValueError: if the alignment has no columns.
        AssertionError: if the STATE annotation contains characters
            other than "N" and "C".
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not supported any more
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # a value below 1 is used as a fraction of the codon columns
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # otherwise it is a number of codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        with open(options.input_filename_tree, "r") as infile:
            tree = TreeTools.Newick2Nexus(infile).trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        # the message used to be emitted with a literal "%s"
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c
    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed; take its only element
        blocks = (("B0_", next(iter(chars))), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(xgram, mali, tree, map_old2new, blocks,
                                       options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each value of options.shared_rates: which of the parameters
    # (Rs, Rn, Ri, Rv) are looked up with the block prefix.  The others
    # are looked up without prefix.  Unlisted values use the block prefix
    # for all four.
    uses_block_prefix = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }
    flags = uses_block_prefix.get(options.shared_rates,
                                  (True, True, True, True))

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)

        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = [
            block if flag else "" for flag in flags]

        rs = trained_model.mGrammar.getParameter('%sRs' % prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)
            # matrix with Rs and Rn replaced by their mean
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # matrix with Ri and Rv replaced by their mean; its results
            # are not reported, but the calls are kept because they can
            # trigger the ZeroDivisionError handled below.
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:

            o_kappa = o_omega = "na"
            o_dn = o_ds = "na"
            o_rn = o_rs = "na"
            o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(
            map(str, (code, block, o_dn, o_ds, o_omega, "na", "na", "na", "na",
                      o_kappa, result.getLogLikelihood(), "na", nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" +
                "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
Esempio n. 15
0
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        tree_lines = open(options.filename_tree, "r").readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        raise "please supply a species tree."

    nexus = TreeTools.Newick2Nexus(tree_lines)
    Tree.updateNexus(nexus)
    tree = nexus.trees[0]

    if options.loglevel >= 2:
        tree.display()

    plot = SVGTree(tree)

    plot.setBranchScale(options.branch_scale)
    plot.setHeightScale(options.height_scale)

    if options.colour_by_species:
        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
Esempio n. 16
0
def processMali(mali, options):
    """Select sequence pairs from *mali* and hand them to the chosen method.

    The pairs (as index pairs into ``mali.getIdentifiers()``) depend on
    ``options.iteration``:

    * ``all-vs-all``: every pair ``(x, y)`` with ``y < x``
    * ``first-vs-all``: the first sequence against every other one
    * ``pairwise``: consecutive sequences ``(0, 1), (2, 3), ...``
    * ``tree``: no pairs

    If ``options.remove_stops`` is set, stop codons are replaced by
    ``NNN`` in all sequences.  Finally ``runCodeML`` or ``runXrate`` is
    called depending on ``options.method``; any other method does
    nothing.

    Raises:
        ValueError: for an unknown iteration mode, or for an odd number
            of sequences with ``pairwise``.
    """

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()
    nids = len(ids)

    invalid_chars = options.gap_chars + options.mask_chars

    if options.iteration == "all-vs-all":
        pairs = [(x, y) for x in range(nids) for y in range(x)]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, nids)]
    elif options.iteration == "pairwise":
        if nids % 2 != 0:
            # string exceptions are not supported any more
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise"
                % nids)
        pairs = [(x, x + 1) for x in range(0, nids, 2)]
    elif options.iteration == "tree":
        pairs = []
    else:
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        for _, entry in mali.items():
            s = entry.mString.upper()
            # walk the sequence codon by codon and mask stop codons
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)

            entry.mString = "".join(fragments)

    # Look for a pair with fewer than min_overlap columns in which both
    # sequences have a valid character.
    has_non_overlaps = False
    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            # the inner loop ran to its end: the overlap is too small
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)

    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
Esempio n. 17
0
def main(argv=None):
    """Compute phylogenetic contrasts for a data table read from stdin.

    The first non-comment line of stdin is the header; every following
    line is a data row whose first field is the taxon.  The trees are
    read from ``--filename-tree``.  For every tree the ``contrast``
    program is run through ``WrapperPhylip`` and each requested method
    writes its own table to ``options.stdout``:

    * ``pearson`` / ``spearman``: the correlation reported by phylip for
      every pair of columns, with a p-value from a regression through the
      origin computed in R.
    * ``contrasts``: the contrasts returned by phylip.
    * ``compute``: contrasts computed in Python by a post-order traversal
      of the tree.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if no trees were provided.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header",
                      dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header",
                      dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree",
                      dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # column numbers on the command line are 1-based
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            # header line: the first field labels the taxon column
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        with open(options.filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                # Imported here so that rpy is only needed for these
                # methods; it used to be re-imported for every pair of
                # columns.
                import rpy
                from rpy import r as R

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # transpose the contrasts into one list per column
                columns = [[row[c] for row in result.mContrasts]
                           for c in range(ncolumns)]

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        # Various ways to calculate r. It is not possible to use
                        # cor.test or lsfit directly, as you have to perform a
                        # regression through the origin.

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(R("y ~ x - 1"),
                                            data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y], options.value_format %
                             phy_r, options.pvalue_format % p, code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    # The terminator used to be "\n ", which prefixed every
                    # following row with a blank.
                    options.stdout.write(
                        "\t".join([options.value_format % x for x in d]) +
                        "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                # One value per data column and node.  The rows used to be
                # sized by nrows although they are indexed by column.
                values = [[None] * ncolumns for _ in range(max_index)]
                contrasts = [[None] * ncolumns for _ in range(max_index)]
                branchlengths = [None] * max_index

                def update_data(
                    node_id,
                    bl,
                    c1,
                    c2,
                ):
                    """Store values and contrasts for *node_id* from its
                    children *c1* and *c2*; *bl* is the node's own
                    branch length."""

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # NOTE(review): stored as "variance", computed as the
                    # square root of the summed branch lengths.
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(node_id, node.data.branchlength, c1,
                                        c2)
                        else:
                            # only the root may have three children; the
                            # third one is combined with the root through
                            # the dummy node.
                            assert (node_id == tree.root)
                            assert (len(node.succ) == 3)
                            update_data(node_id, node.data.branchlength,
                                        node.succ[0], node.succ[1])
                            update_data(max_index - 1, node.data.branchlength,
                                        node_id, node.succ[2])
                    else:
                        # leaf: take the values from the data table
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" %
                                     "\t".join(headers))
                for node_id in range(max_index):
                    # nodes without a variance have no contrasts
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(
                            [options.value_format % x
                             for x in contrasts[node_id]]),
                    ))

    E.Stop()
Esempio n. 18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a collection of trees from stdin and writes an aggregate to
    ``options.stdout``. The aggregation is selected with ``--method``:

    * ``non-redundant``: write one representative per group of mutually
      compatible trees (as decided by ``TreeTools.IsCompatible``).
    * ``select-largest``: group trees by name (optionally reduced with
      ``--regex-id``) and write one tree per group, chosen by
      ``getBestTree``.
    * ``consensus``: run the phylip program ``consense`` on all trees.
    * ``min``/``max``/``sum``/``mean``/``counts``/``stddev``/``median``:
      combine the branch lengths node by node into the first tree and
      write that tree.

    NOTE(review): *argv* is only defaulted here; it is not forwarded to
    ``E.Start`` by this function.

    Raises:
        ValueError: if a node has no usable branch lengths and
            ``--error-branchlength`` was not given, or if the method is
            not recognised.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean", "median", "stddev", "non-redundant", "consensus",
                               "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values", type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength", dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values=None,
        error_branchlength=None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    if options.method == "non-redundant":
        # compute non-redundant trees: every tree is compared against the
        # templates collected so far and becomes a new template if it is
        # compatible with none of them.
        template_trees = []
        template_counts = []
        for ntree, tree in enumerate(nexus.trees):

            for x, template in enumerate(template_trees):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template)
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" % (ntree, len(template_trees)))

        for x, template in enumerate(template_trees):
            if options.loglevel >= 1:
                # percentage is relative to the number of input trees
                # (the original referenced an undefined name `ntotal`).
                options.stdlog.write("# tree: %i, counts: %i, percent=%5.2f\n" %
                                     (x, template_counts[x], template_counts[x] * 100.0 / ninput))
            options.stdout.write(TreeTools.Tree2Newick(template) + "\n")

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                n = re.search(options.regex_id, n).groups()[0]

            if n not in clusters:
                clusters[n] = []
            clusters[n].append(x)

        new_trees = []

        for name, cluster in clusters.items():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster], options.method))

        for new_tree in new_trees:
            options.stdout.write(">%s\n" % new_tree.name)
            options.stdout.write(TreeTools.Tree2Newick(new_tree,) + "\n")
            noutput += 1

        # every input tree that was not selected counts as skipped
        # (the original referenced an undefined name `ntotal`).
        nskipped = ninput - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" % (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if options.method in ("min", "max", "sum", "mean", "counts"):

            # the first tree is the accumulator; filtered branch lengths
            # in it are reset to 0 before combining.
            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in options.filtered_branch_lengths:
                    xtree.node(n).data.branchlength = 0

            # number of values accumulated per node id; the first tree
            # always counts once.
            # NOTE(review): because every entry starts at 1, the
            # "no counts" branch of the mean computation below cannot
            # be reached - confirm whether filtered values in the first
            # tree should start at 0 instead.
            ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method == "sum":
                f = lambda x, y: x + y
            elif options.method == "mean":
                # sum first, divide by the number of values afterwards
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                # NOTE(review): filtered values were already reset to 0
                # above, so this test no longer sees the filter values
                # in the first tree - verify the intended start count.
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise ValueError("unknown option %s" % options.method)

            for tree in nexus.trees[1:]:

                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength, tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is None:
                            raise ValueError("no counts for node %i" % n)
                        xtree.node(
                            n).data.branchlength = options.error_branchlength
                        # count the error independently of the log level
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" % (n, options.error_branchlength))

        else:
            # collect all values for trees, indexed by node id
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in options.filtered_branch_lengths:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = scipy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = scipy.median(values[n])
                else:
                    if options.error_branchlength is None:
                        raise ValueError("no counts for node %i" % n)
                    node.data.branchlength = options.error_branchlength
                    # count the error independently of the log level
                    nerrors += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# no counts for node %i - set to %f\n" % (n, options.error_branchlength))

            if options.write_values:
                # one line per node: the sorted leaf names below the node
                # and the sorted list of collected values.
                with open(options.write_values, "w") as outfile:
                    for n, node in tree.chain.items():
                        values[n].sort()
                        leaves_id = options.separator.join(
                            sorted(TreeTools.GetLeaves(tree, n)))
                        outfile.write("%s\t%s\n" %
                                      (leaves_id, ";".join(map(str, values[n]))))

        # only the first tree (now carrying the aggregate) is written
        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" % (
            ninput, nskipped, noutput, nerrors))

    E.Stop()
Esempio n. 19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads trees from stdin and, for every requested ``--method``, writes
    a tab-separated table to ``options.stdout``. The only method is
    ``branchlengths``, which writes one row per tree with the summary
    produced by ``Stats.DistributionalParameters`` over the tree's
    branch lengths.

    NOTE(review): *argv* is only defaulted here; it is not forwarded to
    ``E.Start`` by this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree2stats.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("branchlengths", ),
                      help="methods to apply.")

    parser.set_defaults(
        methods=[],
        filtered_branch_length=-999,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ninput = len(nexus.trees)

    # reported in the summary line; nothing in this function increments it
    nskipped = 0

    for method in options.methods:

        outfile = options.stdout

        if method == "branchlengths":

            outfile.write(
                "tree\t%s\n" %
                "\t".join(Stats.DistributionalParameters().getHeaders()))

            for tree in nexus.trees:
                branchlengths = []
                for node in tree.chain.values():
                    # ignore branch length of root if it is zero
                    # NOTE(review): `not node.prev` is also true when the
                    # parent id is 0 - confirm whether `node.prev is None`
                    # was intended.
                    if not node.prev and node.data.branchlength == 0:
                        continue

                    # skip the placeholder value for missing branch lengths
                    if node.data.branchlength == options.filtered_branch_length:
                        continue

                    branchlengths.append(node.data.branchlength)

                s = Stats.DistributionalParameters(branchlengths)
                outfile.write("%s\t%s\n" % (tree.name, str(s)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, nskipped=%i\n" % (ninput, nskipped))

    E.Stop()
Esempio n. 20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one or more distance matrices from stdin and builds a tree
    for each of them with a phylip program (``neighbor``, ``fitch`` or
    ``kitsch``, selected with ``--method``). Lines starting with ``#``
    are dropped; lines starting with ``>`` separate the matrices and are
    echoed in front of the corresponding tree unless
    ``--skip-separators`` is given. The resulting trees are written to
    ``options.stdout``.

    NOTE(review): *argv* is only defaulted here; it is not forwarded to
    ``E.Start`` by this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: matrix2tree.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-i",
                      "--invert-map",
                      dest="invert_map",
                      action="store_true",
                      help="""invert map.""")

    parser.add_option("--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("phylip", "full"),
                      help="""input format.""")

    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="""filename with tree to fit.""")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("nj", "kitsch", "fitch"),
                      help="""algorithm to run.""")

    parser.add_option("-e",
                      "--replicates",
                      dest="replicates",
                      action="store_true",
                      help="replicates.")

    parser.add_option("-r",
                      "--root",
                      dest="root",
                      action="store_true",
                      help="midpoint root (if it is not rooted).")

    parser.add_option("-u",
                      "--unroot",
                      dest="unroot",
                      action="store_true",
                      help="unroot tree (if it is rooted).")

    parser.add_option("--skip-separators",
                      dest="write_separators",
                      action="store_false",
                      help="do not echo separators (starting with >)")

    #    parser.add_option("-i", "--iterations", dest="iterations", type="int",
    #                      help="number of iterations." )

    parser.add_option("-p",
                      "--power",
                      dest="power",
                      type="float",
                      help="power.")

    parser.add_option(
        "--prune-tree",
        dest="prune_tree",
        action="store_true",
        help=
        "prune tree such to include only taxa which are part of the input matrix."
    )

    parser.add_option(
        "--add-random",
        dest="add_random",
        action="store_true",
        help="add small random value to off-diagonal zero elements in matrix.")

    parser.add_option(
        "--pseudo-replicates",
        dest="pseudo_replicates",
        action="store_true",
        help=
        "add small random value to off-diagonal zero elements in matrix, even if they have no replicates."
    )

    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="dump debug information.")

    parser.set_defaults(
        value=0,
        method="nj",
        input_format="phylip",
        filename_tree=None,
        outgroup=None,
        replicates=False,
        root=False,
        unroot=False,
        power=0,
        write_separators=True,
        prune_tree=False,
        add_random=False,
        debug=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    def parse_matrix(matrix_lines):
        """split the rows of a matrix (first line skipped) into ids and cells."""
        ids, rows = [], []
        for matrix_line in matrix_lines[1:]:
            # the last character (the newline) is dropped before splitting
            fields = re.split(r"\s+", matrix_line[:-1])
            ids.append(fields[0])
            rows.append([field.strip() for field in fields[1:]])
        return ids, rows

    def format_matrix(ids, rows):
        """build matrix lines (size line first) from ids and cells."""
        formatted = ["%i\n" % len(rows)]
        for identifier, cells in zip(ids, rows):
            formatted.append(identifier + "    " + "    ".join(cells) + "\n")
        return formatted

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setPruneTree(options.prune_tree)

    # real lists (not filter objects) are needed: `chunks` is appended to
    # and both are indexed below.
    lines = [line for line in sys.stdin.readlines() if line[0] != "#"]

    # indices of the separator lines
    chunks = [index for index, line in enumerate(lines) if line[0] == ">"]

    if not chunks:
        # no separators: treat the whole input as a single matrix
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    for chunk_index in range(len(chunks) - 1):

        matrix = lines[chunks[chunk_index] + 1:chunks[chunk_index + 1]]

        # parse phylip matrix and replace zero distances by a small
        # random value
        if options.add_random:
            ids, mm = parse_matrix(matrix)

            d = len(mm)
            if options.replicates:
                # with replicates each entry takes two columns: the
                # value (index 2*col) and the next column (2*col + 1)
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc] == "0" and mm[row][cc + 1] != "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v

            else:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        if mm[row][col] == "0":
                            v = str(random.random() / 10000.0)
                            mm[row][col] = v
                            mm[col][row] = v

            matrix = format_matrix(ids, mm)

        # parse phylip matrix and fill in pseudo replicates
        if options.pseudo_replicates:
            ids, mm = parse_matrix(matrix)

            d = len(mm)
            if options.replicates:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc + 1] == "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v
                        else:
                            mm[row][cc + 1] = "100"
                            mm[col][rr + 1] = "100"
            else:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        if mm[row][col] == "0":
                            v = str(random.random() / 10000.0)
                            mm[row][col] = v
                            mm[col][row] = v

            matrix = format_matrix(ids, mm)

        phylip.setMatrix(matrix)

        phylip_options = []

        if options.filename_tree:
            # the reference tree is re-read for every matrix
            with open(options.filename_tree, "r") as tree_file:
                nexus = TreeTools.Newick2Nexus(tree_file)
                ref_tree = nexus.trees[0]
            phylip.setTree(ref_tree)
            phylip_options.append("U")
        else:
            ref_tree = None

        if options.method == "nj":
            phylip.setProgram("neighbor")

        elif options.method == "fitch":
            phylip.setProgram("fitch")

        elif options.method == "kitsch":
            phylip.setProgram("kitsch")

        if options.replicates:
            phylip_options.append("S")

        if options.power > 0:
            phylip_options.append("P")
            phylip_options.append("%f" % options.power)

        phylip_options.append("Y")

        phylip.setOptions(phylip_options)

        result = phylip.run()

        # root with outgroup
        if options.root:
            if options.outgroup:
                # rooting with an outgroup is not implemented
                pass
            # midpoint root
            else:
                for tree in result.mNexus.trees:
                    tree.root_midpoint()

        # explicitly unroot
        elif options.unroot:
            phylip.setOptions(("Y", "W", "U", "Q"))
            phylip.setProgram("retree")
            # a separate index is used here so that the chunk index of
            # the enclosing loop is still valid for the separator below.
            for tree_index in range(len(result.mNexus.trees)):
                phylip.setTree(result.mNexus.trees[tree_index])
                xresult = phylip.run()
                result.mNexus.trees[tree_index] = xresult.mNexus.trees[0]

        if options.write_separators:
            options.stdout.write(lines[chunks[chunk_index]])

        if result.mNexus:
            options.stdout.write(TreeTools.Nexus2Newick(result.mNexus) + "\n")

        if options.loglevel >= 1:
            if ref_tree:
                nref = len(ref_tree.get_terminals())
            else:
                nref = 0
            for tree in result.mNexus.trees:
                options.stdlog.write(
                    "# ninput=%i, nreference=%i, noutput=%i\n" %
                    (len(matrix) - 1, nref, len(tree.get_terminals())))

    E.Stop()
Esempio n. 21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads trees from stdin and writes distances between pairs of nodes.
    Which pairs are reported is selected with ``--pairs``. Three output
    modes exist:

    * ``--graph``: write the links returned by ``TreeTools.Tree2Graph``.
    * ``--table``: collect the distances of all trees per node pair and
      write one row per pair to ``sys.stdout`` after all trees have been
      processed.
    * default: write one ``node1/node2/distance`` row per selected pair.

    With several trees and ``--output-pattern`` each tree gets its own
    output file; otherwise ``options.stdout`` is used.

    NOTE(review): *argv* is only defaulted here; it is not forwarded to
    ``E.Start`` by this function.

    Raises:
        NotImplementedError: for ``--pairs=lineage``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree2matrix.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="number format to use.")
    parser.add_option("-g",
                      "--graph",
                      dest="to_graph",
                      action="store_true",
                      help="convert tree(s) to graph(s).")
    parser.add_option("-a",
                      "--table",
                      dest="to_table",
                      action="store_true",
                      help="convert tree(s) to table.")
    parser.add_option("-t",
                      "--translate",
                      dest="do_translate",
                      action="store_true",
                      help="translate internal nodes to clades.")
    parser.add_option(
        "--output-pattern",
        dest="output_filename_pattern",
        type="string",
        help="pattern for output file if there are multiple trees in the file."
        "")
    parser.add_option("--pairs",
                      dest="pairs",
                      type="choice",
                      choices=("all", "leaves", "branches", "terminals",
                               "lineage", "between-species"),
                      help="choose pairs of nodes to output."
                      "")
    parser.add_option(
        "--species",
        dest="species",
        type="string",
        help=
        "comma separated list of species that are considered. All others are ignored."
    )

    # NOTE(review): the default below is named `do_translation` while the
    # --translate option stores into `do_translate` - confirm which name
    # TranslateNode reads.
    parser.set_defaults(
        format="%6.4f",
        to_graph=False,
        to_table=False,
        do_translation=False,
        separator=":",
        do_all_on_all=False,
        do_branches=False,
        do_terminals=False,
        output_filename_pattern=None,
        pairs="branches",
        species=None,
        regex_species=("^([^|]+)"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.species:
        options.species = options.species.split(",")
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ntree = 0
    outfile = None

    ## The table is a hash of lists
    table = {}

    def extract_species(taxon):
        """return the first group of ``options.regex_species`` in *taxon*."""
        return re.search(options.regex_species, taxon).groups()[0]

    def translate_pair(tree, n1, n2, set_terminals):
        """translate both node ids and return the results in sorted order."""
        node1 = TranslateNode(n1, tree, set_terminals, options)
        node2 = TranslateNode(n2, tree, set_terminals, options)
        if node1 > node2:
            node1, node2 = node2, node1
        return node1, node2

    for tree in nexus.trees:

        if len(nexus.trees) == 1:
            outfile = options.stdout
        elif options.output_filename_pattern:
            # one output file per tree; close the previous one first
            ntree += 1
            if outfile is not None:
                outfile.close()
            outfile = open(options.output_filename_pattern % ntree, "w")
        else:
            outfile = options.stdout

        ## prune tree, if an explicit species list is given
        if options.species:
            species = set(options.species)
            terminals = tree.get_terminals()
            for x in terminals:
                taxon = tree.node(x).data.taxon
                if extract_species(taxon) not in species:
                    tree.prune(taxon)

        ## define node list
        terminals = tree.get_terminals()
        set_terminals = set(terminals)
        node_list = []

        if options.pairs == "all":
            nodes = TreeTools.GetAllNodes(tree)
            for x in range(len(nodes)):
                for y in range(0, x):
                    node_list.append((nodes[x], nodes[y]))
        elif options.pairs == "terminals":
            for x in terminals:
                node_list.append((x, tree.node(x).prev))
        elif options.pairs == "leaves":
            nodes = terminals
            for x in range(len(nodes)):
                for y in range(0, x):
                    node_list.append((nodes[x], nodes[y]))
        elif options.pairs == "branches":
            nodes = TreeTools.GetAllNodes(tree)
            # NOTE(review): the position x is used as the node id (not
            # nodes[x]) and a parent id of 0 is treated like "no parent"
            # by the truth test - confirm both are intended.
            for x in range(len(nodes)):
                if tree.node(x).prev:
                    node_list.append((x, tree.node(x).prev))
        elif options.pairs == "between-species":
            nodes = terminals
            for x in range(len(nodes)):
                for y in range(0, x):
                    s1 = extract_species(tree.node(nodes[x]).data.taxon)
                    s2 = extract_species(tree.node(nodes[y]).data.taxon)
                    if s1 != s2:
                        node_list.append((nodes[x], nodes[y]))

        elif options.pairs == "lineage":
            raise NotImplementedError("not implemented.")

        if options.to_graph:
            outfile.write("node1\tnode2\tdistance\n")

            links = TreeTools.Tree2Graph(tree)
            for n1, n2, weight in links:
                node1, node2 = translate_pair(tree, n1, n2, set_terminals)
                outfile.write("%s\t%s\t%s\n" %
                              (node1, node2, options.format % weight))

        elif options.to_table:

            # distances are only collected here; the table is written
            # once all trees have been processed.
            for n1, n2 in node_list:
                node1, node2 = translate_pair(tree, n1, n2, set_terminals)

                if options.do_terminals:
                    key = "%s" % node2
                else:
                    key = "%s-%s" % (node1, node2)

                if key not in table:
                    table[key] = []

                table[key].append(options.format % tree.distance(n1, n2))
        else:
            outfile.write("node1\tnode2\tdistance\n")

            for n1, n2 in node_list:
                node1, node2 = translate_pair(tree, n1, n2, set_terminals)
                outfile.write("%s\t%s\t%s\n" %
                              (node1, node2,
                               options.format % tree.distance(n1, n2)))

    if options.to_table:
        # the table always goes to sys.stdout; a separate name is used so
        # that a per-tree output file is still closed below.
        table_out = sys.stdout
        table_out.write("branch\t%s\n" %
                        ("\t".join(map(str, range(0, len(nexus.trees))))))

        for key, values in table.items():
            table_out.write("%s\t%s\n" % (key, "\t".join(values)))

    # outfile is None if no tree was read
    if outfile is not None and outfile != sys.stdout:
        outfile.close()

    E.Stop()
Esempio n. 22
0
def main():

    parser = E.OptionParser(
        version=
        "%prog version: $Id: plot_tree.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-i",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-s",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree.")
    parser.add_option("-t", "--tree", dest="tree", type="string", help="tree.")
    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option("--colour-by-species",
                      dest="colour_by_species",
                      action="store_true",
                      help="colour by species.")
    parser.add_option("--support-style",
                      dest="support_style",
                      type="choice",
                      choices=("pie", "number"),
                      help="style for support information.")
    parser.add_option("--error-style",
                      dest="error_style",
                      type="choice",
                      choices=("pie", "number"),
                      help="style for error information.")
    parser.add_option("--branch-scale",
                      dest="branch_scale",
                      type="float",
                      help="branch length scale factor.")
    parser.add_option("--height-scale",
                      dest="height_scale",
                      type="float",
                      help="height scale factor.")
    parser.add_option("-a",
                      "--annotations",
                      dest="annotations",
                      type="choice",
                      action="append",
                      choices=("support", "error", "kaks", "master", "value",
                               "tables"),
                      help="annotations given by further trees.")
    parser.add_option(
        "--filename-tables",
        dest="filename_tables",
        type="string",
        help="add tables from file (need also set options -a tables) [%default]"
    )
    parser.add_option("--show-branchlengths",
                      dest="show_branchlengths",
                      action="store_true",
                      help="show branch lengths.")
    parser.add_option("--leaf-symbol",
                      dest="plot_leaf_symbol",
                      type="choice",
                      choices=("square", "circle"),
                      help="Symbol for leaves.")
    parser.add_option("--font-size-branches",
                      dest="font_size_branches",
                      type="int",
                      help="set font size for branches.")
    parser.add_option("--font-size-tips",
                      dest="font_size_tips",
                      type="int",
                      help="set font size for tips.")
    parser.add_option("--font-style-tips",
                      dest="font_style_tips",
                      type="choice",
                      choices=(
                          "normal",
                          "italic",
                      ),
                      help="set font style for tips.")
    parser.add_option("--filename-map",
                      dest="filename_map",
                      type="string",
                      help="filename with a name translation table.")
    parser.add_option("--filename-map-species2colour",
                      dest="filename_colour_map",
                      type="string",
                      help="filename with a map of species to colour.")
    parser.add_option("--no-leaf-labels",
                      dest="plot_leaf_labels",
                      action="store_false",
                      help="do not show labels at leafs.")
    parser.add_option("--no-ruler",
                      dest="plot_ruler",
                      action="store_false",
                      help="do not plot ruler.")

    parser.set_defaults(
        titles="",
        title="",
        footer="",
        filename_tree=None,
        species_regex="^([^|]+)\|",
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
        support_style=None,
        error_style="number",
        kaks_style="number",
        annotations=None,
        show_branchlengths=False,
        branch_length_format="%5.2f",
        font_size_tips=None,
        font_size_branches=None,
        font_style_tips=None,
        filename_map=None,
        filename_colour_map=None,
        plot_leaf_labels=True,
        plot_leaf_symbol=None,
        plot_ruler=True,
        filename_tables=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        tree_lines = open(options.filename_tree, "r").readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        tree_lines = sys.stdin.readlines()

    nexus = TreeTools.Newick2Nexus(tree_lines)
    master_tree = nexus.trees[0]

    if options.filename_map:
        map_names = IOTools.ReadMap(open(options.filename_map, "r"))

        for id, node in master_tree.chain.items():
            if node.data.taxon in map_names:
                node.data.taxon = map_names[node.data.taxon]

    if options.loglevel >= 2:
        master_tree.display()

    plot = SVGTree.SVGTree(master_tree)

    if options.branch_scale:
        plot.setBranchScale(options.branch_scale)

    if options.height_scale != None:
        plot.setHeightScale(options.height_scale)

    if options.font_size_tips != None:
        plot.setFontSize(options.font_size_tips)

    if options.plot_ruler == False:
        plot.setRulerElements([])

    if options.show_branchlengths:
        b = SVGTree.BranchDecoratorHorizontalBranchLength(master_tree)
        if options.font_size_branches:
            b.setFontSize(options.font_size_branches)
        plot.setDecoratorHorizontalBranches(b)

    if options.colour_by_species:
        if options.filename_colour_map:
            map_species2colour = IOTools.ReadMap(
                open(options.filename_colour_map, "r"))
        else:
            map_species2colour = None

        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
            SVGTree.NodeDecoratorBySpecies(
                master_tree,
                plot_symbol=options.plot_leaf_symbol,
                plot_label=options.plot_leaf_labels,
                map_species2colour=map_species2colour,
                extract_species=extract_species))

    if options.font_style_tips:
        plot.getDecoratorExternalNodes().setFontStyle(options.font_style_tips)

    plot.getDecoratorExternalNodes().setPlotLabel(options.plot_leaf_labels)

    current_tree = 1

    ## add annotations by further trees given on the command line
    branch_length_annotations = []

    current_reference_tree = master_tree

    if options.annotations:
        for annotation in options.annotations:

            tree = nexus.trees[current_tree]

            if annotation == "support":

                tree.branchlength2support()
                for id, node in tree.chain.items():
                    node.data.branchlength = 1.0

                if options.support_style == "pie":
                    plot.setDecoratorInternalNodes(
                        NodeDecoratorSupportPieChart(
                            nexus.trees[current_tree]))

            elif annotation == "error":

                if options.error_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthError(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "kaks":

                if options.kaks_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthWithKaks(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "value":

                b = SVGTree.BranchDecoratorHorizontalBranchLength(tree)
                if options.font_size_branches:
                    b.setFontSize(options.font_size_branches)
                branch_length_annotations.append(b)

            elif annotation == "master":
                current_reference_tree = tree

            elif annotation == "tables":
                b = BranchDecoratorTable(tree,
                                         filename=options.filename_tables)
                plot.setDecoratorHorizontalBranches(b)

            current_tree += 1

        if len(branch_length_annotations) == 1:
            b = branch_length_annotations[0]
        elif len(branch_length_annotations) == 2:
            b1, b2 = branch_length_annotations
            b1.setFontColour(SVGTree.BLUE)
            b2.setFontColour(SVGTree.RED)
            b = SVGTree.BranchDecoratorHorizontalAboveBelow(
                master_tree, b1, b2)
        elif len(branch_length_annotations) > 2:
            raise "obtained more than three branch length annotations. Layout not implemented"

        plot.setDecoratorHorizontalBranches(b)

    plot.initializePlot()

    plot.writeToFile(sys.stdout)

    E.Stop()
Esempio n. 23
0
    print E.GetHeader()
    print E.GetParams()

    keys = {}
    if param_apply:
        infile = open(param_apply, "r")
        for line in infile:
            if line[0] == "#":
                continue
            a, b = line[:-1].split("\t")[:2]
            if param_invert:
                a, b = b, a
            keys[a] = b

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    notu = 0

    for tree in nexus.trees:
        if param_loglevel >= 2:
            tree.display()

        for nx in tree.get_terminals():
            t1 = tree.node(nx).get_data().taxon

            if param_create:
                if t1 not in keys:
                    keys[t1] = "otu%i" % notu
                    notu += 1
Esempio n. 24
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Trees are read from stdin, run through the selected filters and then
    written either to one file per tree (``--method=split``) or to stdout
    (``--method=filter``). Counters are reported to the log at
    loglevel >= 1.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` in
    this function -- confirm whether ``E.Start`` falls back to sys.argv.

    Raises ValueError if --filter-by-monophyly lists fewer than two taxa.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2trees.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--output-filename-map", dest="output_filename_map", type="string",
                      help="filename of map to output.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "split"),
                      help="method to use: filter removed trees, while split writes them to individual files. DEFAULT=%default")

    parser.add_option("-d", "--output-pattern", dest="output_pattern", type="string",
                      help="filename pattern for output multiple alignment files.")

    # Each length filter stores into the option attribute that the filtering
    # code below actually reads (previously three of them shared one dest,
    # so the terminal filters could never be activated).
    parser.add_option("--filter-terminal-max-length", dest="filter_terminal_max_length", type="float",
                      help="remove terminal branches with a branch length larger than this.")

    parser.add_option("--filter-terminal-min-length", dest="filter_terminal_min_length", type="float",
                      help="remove terminal branches with a branch length smaller than this.")

    parser.add_option("--filter-min-length", dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this.")

    parser.add_option("--filter-max-length", dest="filter_max_length", type="float",
                      help="remove any branches with a branch length larger than this.")

    parser.add_option("--filter-by-trees", dest="filter_by_trees", type="string", action="append",
                      help="mask branches according to trees. Give filenames with mask trees. These trees need to have the same names and structure as the input trees, but can be in any order.")

    parser.add_option("--filter-by-monophyly", dest="filter_by_monophyly", type="string",
                      help="only retain trees where the given taxa are monphyletic. Supply taxa as a comma-separated list.")

    parser.add_option("--min-support", dest="min_support", type="float",
                      help="for monophyly filtering, only accept trees with minimum support.")

    parser.add_option("--filter-ntaxa", dest="filter_ntaxa", type="int",
                      help="filter by number of taxa.")

    parser.add_option("--filter-simple-orthologs", dest="filter_simple_orthologs", action="store_true",
                      help="filter for trees for simple orhtologs. This works by counting the number of taxa.")

    parser.add_option("--filter", dest="filter", type="choice",
                      choices=("taxa", "trees"),
                      help="filter removes taxa or whole trees.")

    parser.set_defaults(
        output_pattern="%s.tree",
        output_filename_map=None,
        filter_terminal_max_length=None,
        filter_terminal_min_length=None,
        filter_max_length=None,
        filter_min_length=None,
        method="split",
        filter="taxa",
        # sentinel branch length used to mark a branch as filtered/masked
        filtered_branch_length=-999,
        filter_by_trees=[],
        filter_by_monophyly=None,
        filter_ntaxa=None,
        filter_simple_orthologs=None,
        min_support=0.0,
        regex_species="^([^|]+)",
    )

    (options, args) = E.Start(parser)

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % len(nexus.trees))

    ninput, noutput, nskipped = 0, 0, 0
    ndiscarded = 0
    ndiscarded_taxa = 0
    ndiscarded_branches = 0

    extract_species = lambda x: re.search(options.regex_species, x).groups()[0]

    if options.filter_by_trees:
        nexus_filter = []
        nexus_maps = []
        for filename in options.filter_by_trees:
            nexus_filter.append(TreeTools.Newick2Nexus(open(filename, "r")))
            trees = nexus_filter[-1].trees
            if options.loglevel >= 1:
                options.stdlog.write("# read %i trees for filtering from %s\n" % (len(trees), filename))

            # tree name -> position of that tree within this mask file
            nexus_maps.append(
                dict((mask_tree.name, pos) for pos, mask_tree in enumerate(trees)))

    if options.filter_by_monophyly:
        monophyly_taxa = options.filter_by_monophyly.split(",")
        # split() never returns an empty list, so test for what the message
        # actually demands.
        if len(monophyly_taxa) < 2:
            raise ValueError("please supply at least two taxa for the monophyly test.")

    if options.output_filename_map:
        outfile_map = open(options.output_filename_map, "a")
    else:
        outfile_map = None

    for tree in nexus.trees:

        ninput += 1
        tree_id = tree.name
        has_discarded = False

        if options.filter_ntaxa is not None:

            ntaxa = len(tree.get_terminals())
            if ntaxa != options.filter_ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because number of taxa (%i) different\n" %
                                         (tree_id, ntaxa))
                has_discarded = True

        if options.filter_simple_orthologs:
            ntaxa = len(tree.get_terminals())
            nspecies = len(set(extract_species(tree.node(x).data.taxon)
                               for x in tree.get_terminals()))
            if nspecies != ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because not a simple ortholog cluster: ntaxa!=nspecies (%i!=%i)\n" %
                                         (tree_id, ntaxa, nspecies))

                has_discarded = True

        if options.filter_terminal_max_length is not None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength >= options.filter_terminal_max_length:
                    has_discarded = True
                    ndiscarded_taxa += 1
                    tree.prune(node.data.taxon)
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to large: %s\n" %
                                             (tree_id, node.data.taxon, str(node.data.branchlength)))

        if options.filter_terminal_min_length is not None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength <= options.filter_terminal_min_length:
                    has_discarded = True
                    ndiscarded_taxa += 1
                    tree.prune(node.data.taxon)
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to small: %s\n" %
                                             (tree_id, node.data.taxon, str(node.data.branchlength)))

        if options.filter_max_length is not None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                node = tree.node(x)
                if node.data.branchlength >= options.filter_max_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        # three placeholders, three arguments (a stray fourth
                        # argument used to raise TypeError here)
                        options.stdlog.write("# tree %s: removed branch %i because branchlength to large: %s\n" %
                                             (tree_id, x, str(node.data.branchlength)))
                    node.data.branchlength = options.filtered_branch_length

        if options.filter_min_length is not None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                node = tree.node(x)
                if node.data.branchlength <= options.filter_min_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because internal branchlength too small: %s\n" %
                                             (tree_id, x, str(node.data.branchlength)))
                    node.data.branchlength = options.filtered_branch_length

        if options.filter_by_trees:
            # collect the mask trees that carry the same name as this tree
            found = []
            for y in range(len(nexus_maps)):
                if tree_id in nexus_maps[y]:
                    found.append((y, nexus_filter[y].trees[nexus_maps[y][tree_id]]))

            if not found:
                ndiscarded += 1
                continue

            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                for y, other_tree in found:
                    other_node = other_tree.node(x)
                    if other_node.data.branchlength == options.filtered_branch_length:
                        node = tree.node(x)
                        if options.loglevel >= 2:
                            options.stdlog.write("# tree %s: removed branch %i because internal branchlength masked by tree %i:%s.\n" %
                                                 (tree_id, x, y, other_tree.name))

                        node.data.branchlength = options.filtered_branch_length
                        has_discarded = True
                        ndiscarded_branches += 1
                        # one masking tree is enough for this branch
                        break

        if options.filter_by_monophyly:

            terminals = set(tree.node(x).data.taxon for x in tree.get_terminals())

            # NOTE(review): a missing taxon is only counted and logged; the
            # tree still goes through the monophyly test below.
            for t in monophyly_taxa:
                if t not in terminals:
                    if options.loglevel >= 2:
                        options.stdlog.write("taxon %s not in tree %s\n" % (t, tree.name))
                    nskipped += 1
            succ = tree.node(tree.root).succ
            # use minimum support at root, if it is not the same (if trees
            # are rooted)
            if len(succ) == 2:
                m = min(tree.node(x).data.support for x in succ)
                for x in succ:
                    tree.node(x).data.support = m

            if not TreeTools.IsMonophyleticForTaxa(tree, monophyly_taxa, support=options.min_support):
                ndiscarded += 1
                continue

        if has_discarded:
            ndiscarded += 1
            if options.filter == "trees" or options.filter_ntaxa:
                continue

        if options.method == "split":

            # plain substitution: the tree name must not be interpreted as a
            # regular-expression replacement template
            output_filename = options.output_pattern.replace("%s", tree_id)

            dirname = os.path.dirname(output_filename)

            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)

            if not os.path.exists(output_filename):
                outfile = open(output_filename, "w")
                try:
                    outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                finally:
                    outfile.close()
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write("# skipping because output for tree %s already exists: %s\n" % (tree_id, output_filename))
                nskipped += 1
                continue

        elif options.method == "filter":
            options.stdout.write(">%s\n%s\n" % (tree.name, TreeTools.Tree2Newick(tree)))
            noutput += 1

        if outfile_map:
            for t in TreeTools.GetTaxa(tree):
                outfile_map.write("%s\t%s\n" % (t, tree_id))

    if outfile_map:
        outfile_map.close()

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i, with_discarded=%i, discarded_taxa=%i, discarded_branches=%i.\n" %
                             (ninput, noutput, nskipped,
                              ndiscarded, ndiscarded_taxa, ndiscarded_branches))

    E.Stop()
Esempio n. 25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a reference tree (given inline or as a filename) and a set of
    trees from stdin, prunes the reference tree to the species found in the
    input trees and calls ``writeOrthologSets`` once per requested method.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` in
    this function -- confirm whether ``E.Start`` falls back to sys.argv.

    Raises ValueError if method 'outgroup' is requested without
    --outgroups, if no reference tree is supplied, or if species cannot be
    extracted from the taxa of an input tree.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: trees2sets.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-e",
                      "--enumeration",
                      dest="enumeration",
                      type="choice",
                      choices=("monophyletic", "full", "pairwise",
                               "exhaustive", "explicit", "lineage"),
                      help="enumeration of ortholog groups.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("strict", "degenerate", "any", "outgroup",
                               "lineage"),
                      help="sets to extract.")

    parser.add_option("-s",
                      "--species-set",
                      dest="species_set",
                      type="string",
                      help="comma separated list of species.")

    parser.add_option("-g",
                      "--outgroups",
                      dest="outgroups",
                      type="string",
                      help="comma separated list of outgroup species.")

    parser.add_option(
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--reroot",
                      dest="reroot",
                      type="choice",
                      choices=("outgroup", "midpoint"),
                      help="reroot trees before computing sets.")

    parser.set_defaults(
        reference_tree=None,
        enumeration="full",
        column2org=None,
        separator="|",
        # raw strings: same patterns as before, without relying on "\|"
        # being passed through as an unknown string escape
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_summary=None,
        methods=[],
        species_set=None,
        outgroups=None,
        reroot=None,
    )

    (options, args) = E.Start(parser)

    if len(options.methods) == 0:
        options.methods.append("strict")

    # an explicit species list implies explicit enumeration
    if options.species_set:
        options.species_set = options.species_set.split(",")
        options.enumeration = "explicit"

    #######################################################################
    # warning: outgroup method is useless, as it requires
    # only a single outgroup per tree and the tree rooted
    # with the outgroup.
    if "outgroup" in options.methods and not options.outgroups:
        raise ValueError(
            "please supply --outgroups if method 'outgroup' is chosen.")

    if options.outgroups:
        options.outgroups = options.outgroups.split(",")

    ########################################################################
    # read the reference tree
    ########################################################################
    if not options.reference_tree:
        raise ValueError("please supply a reference tree")

    # a value starting with "(" is taken as an inline tree, anything else
    # as a filename
    if options.reference_tree[0] == "(":
        nexus = TreeTools.Newick2Nexus(options.reference_tree)
    else:
        nexus = TreeTools.Newick2Nexus(open(options.reference_tree, "r"))
    reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        options.stdlog.write("# reference tree:\n%s\n" %
                             reference_tree.display())

    ########################################################################
    # read all trees
    ########################################################################
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    ########################################################################
    # sort out reference tree
    ########################################################################
    # The compiled patterns are not used below (extraction goes through
    # parseIdentifier); compiling still rejects a malformed pattern before
    # any tree is processed.
    re.compile(options.species_regex)
    re.compile(options.gene_regex)
    extract_species = lambda x: parseIdentifier(x, options)[0]
    extract_gene = lambda x: parseIdentifier(x, options)[2]

    # prune reference tree to species present
    species_set = set()
    for tree in nexus.trees:
        try:
            species_set.update(map(extract_species, tree.get_taxa()))
        except AttributeError:
            raise ValueError(
                "parsing error while extracting species from %s" % str(
                    tree.get_taxa()))

    TreeTools.PruneTree(reference_tree, species_set)

    if options.loglevel >= 1:
        options.stdlog.write("# reference tree after pruning has %i taxa.\n" %
                             len(reference_tree.get_taxa()))

    # column order: user supplied, otherwise the terminals of the pruned
    # reference tree
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]

    # reverse lookup: organism -> column index
    options.org2column = dict(
        (org, column) for column, org in enumerate(options.column2org))

    # print out a list of ortholog clusters for every requested method
    for method in options.methods:
        writeOrthologSets(options.stdout,
                          nexus,
                          extract_species,
                          extract_gene,
                          options=options,
                          reference_tree=reference_tree,
                          method=method,
                          outgroups=options.outgroups)

    E.Stop()
Esempio n. 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads trees from stdin. With ``--filter`` the trees are filtered by
    branch length and written back in one go; otherwise the input is split
    into chunks at header lines starting with ">" and each chunk is handed
    to ``Process``.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` in
    this function -- confirm whether ``E.Start`` falls back to sys.argv.

    Raises ValueError if --filename-tree2 is given but contains no trees.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree2tree.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--value",
                      dest="value",
                      type="float",
                      help="normalizing value.")
    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply [normalize|divide-by-tree|divide-by-tree|rename|set-uniform-branch-length|extract-with-pattern|build-map|remove-pattern|unroot|midpoint-root|balanced-root|add-node-names"""
    )
    parser.add_option("-2",
                      "--filename-tree2",
                      dest="filename_tree2",
                      type="string",
                      help="filename with second tree.")
    parser.add_option("-o",
                      "--outgroup",
                      dest="outgroup",
                      type="string",
                      help="reroot with outgroup before processing.")
    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameters for methods.")
    parser.add_option(
        "-e",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="""template identifier [%default]. A %i is replaced by the position
                      of the sequence in the file.""")
    parser.add_option("-i",
                      "--invert-map",
                      dest="invert_map",
                      action="store_true",
                      help="""invert map.""")
    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      type="choice",
                      choices=("max-branch-length", ),
                      help="filter trees")
    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("nh", "nhx"),
                      help=("output format for trees."))
    parser.add_option(
        "-b",
        "--no-branch-lengths",
        dest="with_branchlengths",
        action="store_false",
        help=
        """do not write branchlengths. Per default, 0 branch lengths are added."""
    )

    parser.set_defaults(
        value=0,
        methods="",
        filename_tree2=None,
        outgroup=None,
        parameters="",
        template_identifier="ID%06i",
        write_map=False,
        invert_map=False,
        filter=None,
        output_format="nh",
        with_branchlengths=True,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # both options are comma-separated lists on the command line
    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    other_trees = []
    # read other trees
    if options.filename_tree2:
        other_nexus = TreeTools.Newick2Nexus(open(options.filename_tree2, "r"))
        other_trees = other_nexus.trees
        # an empty file used to end in an IndexError on trees[0]
        if len(other_trees) == 0:
            raise ValueError("no trees found in %s" % options.filename_tree2)

    lines = sys.stdin.readlines()

    ntotal, nskipped, ntree = 0, 0, 0

    if options.filter:

        nexus = TreeTools.Newick2Nexus(lines)

        new_trees = []

        # the first parameter is the threshold of the filter and is consumed
        value = float(options.parameters[0])
        del options.parameters[0]

        # decision functions: return true, if tree
        # is to be skipped
        if options.filter == "max-branch-length":
            f = lambda x: x >= value

        for tree in nexus.trees:
            ntotal += 1

            # a single offending branch is enough to skip the whole tree
            if any(f(node.data.branchlength)
                   for node_id, node in tree.chain.items()):
                nskipped += 1
            else:
                new_trees.append(tree)
                ntree += 1

        nexus.trees = new_trees

        options.stdout.write(
            TreeTools.Nexus2Newick(nexus, with_names=True) + "\n")

    else:

        # iterate over chunks: positions of the header lines
        chunks = [pos for pos, line in enumerate(lines) if line[0] == ">"]

        map_old2new = {}

        if chunks:
            # Pair each header with the start of the next one; the last
            # chunk runs to the end of the input. Lines before the first
            # header are not processed.
            bounds = chunks + [len(lines)]
            for start, end in zip(bounds[:-1], bounds[1:]):
                options.stdout.write(lines[start])
                # Process returns (ntotal, nskipped, ntree), as unpacked by
                # the headerless call below. Accumulate over every chunk
                # and carry ntree forward (previously only the last chunk
                # was counted).
                t, s, ntree = Process(lines[start + 1:end], other_trees,
                                      options, map_old2new, ntree)
                ntotal += t
                nskipped += s
        else:
            ntotal, nskipped, ntree = Process(lines, other_trees, options,
                                              map_old2new, ntree)

        if options.write_map:
            # the first parameter, if non-empty, names the map output file
            p = options.parameters[0]
            if p:
                outfile = open(p, "w")
            else:
                outfile = options.stdout

            outfile.write("old\tnew\n")
            for old_id, new_id in map_old2new.items():
                outfile.write("%s\t%s\n" % (old_id, new_id))
            if p:
                outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i\n" % (ntotal, nskipped))

    E.Stop()
Esempio n. 27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene trees in Newick format from stdin, applies the configured
    filters, optionally reroots each tree on the outgroup species and then
    collects, for every node in the analysis set, its distance to the root
    ("height") and that distance divided by a per-tree reference height
    ("relative height"). Counts are reported per tree (``--print-subtotals``)
    and/or accumulated over all trees (``--print-totals``) via ``printCounts``.

    NOTE: *argv* is accepted for interface compatibility, but it is not
    forwarded to ``E.Start`` in this function.

    Raises:
        ValueError: if location-based filtering was requested
            (``--remove-unplaced`` or ``--filter-location`` other than
            "all") without supplying ``--filename-locations``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives",
                      type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s",
                      "--filename-species-tree",
                      dest="filename_species_tree",
                      type="string",
                      help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour",
        dest="filename_species2colour",
        type="string",
        help=
        "filename with map of species to colours. If not given, random colours are assigned to species."
    )

    parser.add_option("-t",
                      "--species-tree",
                      dest="species_tree",
                      type="string",
                      help="species tree.")

    parser.add_option(
        "-e",
        "--filename-locations",
        dest="filename_locations",
        type="string",
        help=
        "filename with map of transcript information to location information.")

    parser.add_option("--no-create",
                      dest="create",
                      action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation",
        dest="max_separation",
        type="int",
        help=
        "maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough)."
    )

    parser.add_option(
        "--filename-species2url",
        dest="filename_species2url",
        type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species",
        dest="outgroup_species",
        type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees",
                      dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers",
                      dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids",
                      dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus",
                      dest="svg_otus",
                      type="string",
                      help="otus to output in svg species tree.")

    # NOTE: the option name is misspelled ("lenghts"); it is kept as-is
    # because existing command lines depend on it.
    parser.add_option("--svg-branch-lenghts",
                      dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals",
                      dest="print_totals",
                      action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals",
                      dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")

    parser.add_option(
        "--print-best",
        dest="print_best",
        action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg",
                      dest="print_svg",
                      action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg",
                      dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        """output pattern for separate output of sections [default: %default].
                       Set to None, if output to stdout. Can contain one %s to be substituted with section."""
    )

    parser.add_option(
        "--output-pattern-svg",
        dest="output_pattern_svg",
        type="string",
        help=
        "filename for svg output. If it contains %s, this is replaced by gene_tree name."
    )

    parser.add_option(
        "--filename-node-types",
        dest="filename_node_types",
        type="string",
        help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data",
                      type="choice",
                      action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality",
                      dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location",
                      dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced",
                      dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups",
                      dest="skip_without_outgroups",
                      action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        # raw strings: "\|" is not a valid string escape, the value is
        # unchanged but the literal no longer relies on that.
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    # comma-separated lists on the command line become sets
    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]

    # extract_gene is not used further down in this function; the pattern is
    # still compiled so that an invalid --gene-regex fails early.
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    # the quality field is the 4th separator-delimited field of an identifier
    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        with open(options.filename_filter_positives, "r") as infile:
            filter_positives, nerrors = IOTools.ReadList(infile)
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    # read location info
    #########################################################################
    if (options.remove_unplaced or options.filter_location != "all"
        ) and not options.filename_locations:
        # raising a plain string is not a valid exception
        raise ValueError("please supply a file with location information.")

    if options.filename_locations:
        with open(options.filename_locations, "r") as infile:
            map_id2location = TreeReconciliation.readLocations(
                infile, extract_species)
    else:
        map_id2location = {}

    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "trees", "nodes",
                        "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts, accumulated over all trees that are output
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        # strip the "_tree..." suffix and "subtree_" marker to obtain the
        # name that is looked up in the positive list
        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        # NOTE(review): inside this loop len(gene_nexus.trees) is always
        # > 0, so the else-branches below are never taken; the test may have
        # been meant to be "> 1" - confirm before changing.
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
            else:
                prefix_header, prefix_row = "", ""

        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        # build the set once: it is tested twice below and a lazy map()
        # result would be exhausted after the first test
        this_species = set(extract_species(otu) for otu in otus)

        # check, if only outgroups
        if options.outgroup_species:
            if not this_species.difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n"
                        % gene_tree.name)
                continue

            if options.skip_without_outgroups and \
                    not this_species.intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to the reference height)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = \
            TreeReconciliation.getAnalysisSets(
                gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        # a reference height of 0 is skipped as well (as the log message
        # states); it would otherwise cause a division by zero below
        if not reference_height:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n"
                    % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name, options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height

            # per-species counts include outgroup species
            heights_per_species.setdefault(species, []).append(height)
            relheights_per_species.setdefault(species, []).append(relheight)

            # do not use outgroup species for the per-tree counts
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree, options,
                        prefix_header, prefix_row)

        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)

        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree, options,
                    prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n"
            % (ninput, nfiltered, nskipped, nskipped_filter,
               nskipped_outgroups, noutput))

    E.Stop()
Esempio n. 28
0
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the requested manipulations to one chunk of Newick trees.

    The trees parsed from *lines* are modified by every method listed in
    ``options.methods`` (in order) and then written to ``options.stdout``.
    Methods that need arguments consume them from the front of
    ``options.parameters``.

    Arguments:
        lines: input lines; the last character of each line is removed
            before parsing (presumably the trailing newline).
        other_trees: trees used by the "divide-by-tree" method. If more than
            one tree is given, the tree at position *ntree* is used,
            otherwise the first one.
        options: parsed command line options. Attributes read here are
            loglevel, stdlog, stdout, outgroup, methods, parameters, value,
            invert_map, template_identifier, with_branchlengths and
            output_format; ``write_map`` is set by the "build-map" method.
        map_old2new: dictionary mapping old to new taxon names. It is
            filled in place by "rename" and "build-map", so that it is shared
            between successive calls.
        ntree: running index of the first tree of this chunk.

    Returns:
        A tuple ``(ntotal, nskipped, ntree)``: number of trees read, number
        of trees flagged as skipped by "divide-by-tree", and *ntree* advanced
        by the number of trees processed.
    """

    nexus = TreeTools.Newick2Nexus([line[:-1] for line in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)

    # helpers that are set up lazily, the first time a method needs them
    extract_pattern = None
    species2remove = None
    phylip = None

    for index, tree in enumerate(nexus.trees):

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if phylip is None:
                    # first parameter: executable, second: "@"-separated
                    # list of options
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                # continue with the tree returned by phylip, so that
                # subsequent methods do not modify the discarded input tree
                tree = result.mNexus.trees[0]
                nexus.trees[index] = tree

            elif method == "normalize":
                if options.value == 0:
                    # no explicit divisor: use the largest branch length
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v:
                    divisor = float(v)
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= divisor
                else:
                    # all branch lengths are 0 - nothing to scale
                    options.stdlog.write(
                        "# Warning: tree %i not normalized: maximum branch length is 0\n"
                        % index)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    # single-argument form works as statement and function
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    # NOTE(review): this only moves on to the next method;
                    # the tree itself is still written to the output.
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:

                    with open(options.parameters[0], "r") as infile:
                        loaded = IOTools.ReadMap(infile, columns=(0, 1))

                    if options.invert_map:
                        loaded = IOTools.getInvertedDictionary(
                            loaded, make_unique=True)

                    del options.parameters[0]

                    # fill the caller's dictionary in place: the filename
                    # has been consumed and can not be read again when the
                    # next chunk is processed
                    if map_old2new is None:
                        map_old2new = loaded
                    else:
                        map_old2new.update(loaded)

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                # taxa without a new name are removed from the tree
                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) +
                                                             1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # consume only the pattern, not the whole parameter list
                    del options.parameters[0]

                # keep all terminal taxa that do not match the pattern
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    if not species2remove.search(t):
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        ntree += 1

    if options.output_format == "nh":
        # names of all nodes (including internal ones) are always written
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
Esempio n. 29
0
def _read_map_clusters(infile):
    """Parse ``member<TAB>cluster`` lines into a dict cluster -> member list.

    Lines starting with ``#`` and blank lines are skipped.  A line that
    does not consist of exactly two tab-separated fields raises
    ValueError (from the tuple unpacking).
    """
    clusters = {}
    for line in infile:
        if line.startswith("#"):
            continue
        # strip only the newline: slicing with [:-1] would eat a real
        # character if the last line has no trailing newline.
        line = line.rstrip("\n")
        if not line:
            continue
        member, cluster = line.split("\t")
        clusters.setdefault(cluster, []).append(member)
    return clusters


def _read_link_clusters(infile):
    """Parse ``>``-delimited blocks of links into a dict cluster -> member set.

    A header ``>cluster #N`` names the cluster ``N``; any other header uses
    the text following ``>`` as the cluster name.  For every other line the
    first two tab-separated fields are added to the current cluster.  Lines
    starting with ``#`` and blank lines are skipped.  Links seen before the
    first header, or below a header with an empty name, are discarded.
    """
    clusters = {}
    cluster_id = None
    members = set()
    for line in infile:
        if line.startswith("#"):
            continue
        line = line.rstrip("\n")
        if not line:
            continue

        if line.startswith(">"):
            if cluster_id:
                clusters[cluster_id] = members
            match = re.match(r">cluster #(\d+)", line)
            if match:
                cluster_id = match.group(1)
            else:
                cluster_id = line[1:]
            members = set()
            continue

        fields = line.split("\t")[:2]
        if len(fields) < 2:
            raise ValueError("malformed link line: %s" % line)
        members.update(fields)

    if cluster_id:
        clusters[cluster_id] = members
    return clusters


def main(argv=None):
    """Count genes and transcripts per organism for clusters read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.
    NOTE(review): *argv* is defaulted here but is not forwarded to
    ``E.Start`` -- confirm whether that is intended.

    Clusters are read from stdin in one of three formats (``--format``):

    ``map``
        tab-separated ``member<TAB>cluster`` lines.
    ``trees``
        newick trees; each tree name is a cluster, its taxa are the members.
    ``links``
        ``>``-headed blocks; the first two columns of each line are members.

    Member identifiers are split on ``options.separator`` and must have
    either four fields (species, transcript, gene, and one unused field) or
    two fields (species, gene).

    One row per cluster is written to ``options.stdout``.  A pattern summary
    and a per-species summary are written to ``--filename-patterns`` and
    ``--filename-summary`` respectively, or to ``sys.stdout`` if those are
    not given.  If a reference tree is supplied it is pruned to the species
    present and used to flag each pattern as consistent ("1") or not ("0").

    Raises:
        ValueError: if no clusters are read, if the species regex does not
            match an identifier, if an identifier does not have 2 or 4
            fields, or if its species is not one of the output columns.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option(
        "-s",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g",
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ########################################################################
    ## read the reference tree: either a literal newick string (starts
    ## with an opening bracket) or the name of a file containing one.
    ########################################################################
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
            reference_tree = nexus.trees[0]
        else:
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
                reference_tree = nexus.trees[0]
            finally:
                infile.close()

        if options.loglevel >= 3:
            sys.stdout.write("# reference tree:\n")
            sys.stdout.write(str(reference_tree.display()) + "\n")

    ########################################################################
    ## read clusters
    ########################################################################
    clusters = {}
    if options.format == "map":
        clusters = _read_map_clusters(sys.stdin)
    elif options.format == "trees":
        nexus = TreeTools.Newick2Nexus(sys.stdin)
        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()
    elif options.format == "links":
        clusters = _read_link_clusters(sys.stdin)

    if not clusters:
        # string exceptions are not supported since python 2.6
        raise ValueError("empty input.")

    ########################################################################
    ## sort out reference tree
    ########################################################################
    rs = re.compile(options.species_regex)

    def extract_species(identifier):
        """return the first group of the species regex in *identifier*."""
        match = rs.search(identifier)
        if match is None:
            raise ValueError("could not extract species from identifier %s" %
                             identifier)
        return match.groups()[0]

    ## prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(m) for m in members)

    if reference_tree:

        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    ## column order: user supplied > leaves of the reference tree > species seen
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]
    else:
        options.column2org = list(species_set)

    options.org2column = dict(
        (org, column) for column, org in enumerate(options.column2org))

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, options.column2org)

        if options.loglevel >= 3:
            sys.stdout.write("# reference patterns:\n")
            sys.stdout.write(str(reference_patterns) + "\n")

    ##############################################################################
    notus = len(options.column2org)
    patterns = {}
    species_counts = [SpeciesCounts() for x in options.column2org]

    ## first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(options.column2org), "\t".join(options.column2org)))

    ncolumns = len(options.org2column)
    for cluster in sorted(clusters):
        members = clusters[cluster]

        # per column: distinct genes and number of transcripts in this cluster
        genes_seen = [set() for x in range(ncolumns)]
        count_transcripts = [0] * ncolumns

        for m in members:
            data = m.split(options.separator)

            if len(data) == 4:
                s, t, g, q = data
            elif len(data) == 2:
                s, g = data
                t = g
            else:
                # previously fell through and silently re-used the values
                # of the preceding member (or failed with a NameError).
                raise ValueError(
                    "cannot parse identifier %s: expected 2 or 4 fields "
                    "separated by '%s'" % (m, options.separator))

            if s not in options.org2column:
                raise ValueError("unknown species %s" % s)

            col = options.org2column[s]

            count_transcripts[col] += 1
            genes_seen[col].add(g)

            species_counts[col].mGenes.add(g)
            species_counts[col].mTranscripts.add(t)
            species_counts[col].mTrees.add(cluster)

        count_genes = [len(genes) for genes in genes_seen]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(count_genes)
        npresent_genes = len([n for n in count_genes if n > 0])

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write("\t".join(
            (cluster, pattern, str(npresent_genes), str(ntotal_genes),
             "\t".join(map(str, count_genes)), str(ntotal_transcripts),
             "\t".join(map(str, count_transcripts)))) + "\n")

    #######################################################################################
    ## write pattern summary
    #######################################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for column, org in enumerate(options.column2org):
            outfile.write("# %i = %s\n" % (column, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    #######################################################################################
    ## write summary counts per species
    #######################################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        for species, col in options.org2column.items():
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species, len(counts.mTranscripts),
                           len(counts.mGenes), len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
Esempio n. 30
0
def AnalysePatterns(patterns,
                    map_id2org,
                    min_cluster_support=100,
                    min_report_support=90):
    """Evaluate bipartition patterns against the reference species tree.

    *patterns* is a list of ``(support, pattern)`` tuples, where a pattern
    is a string with a ``*`` for every position on one side of the split.
    The list is modified in place: one single-position pattern per position
    (with support 100) is inserted in front of the supplied patterns.
    *map_id2org* maps a position in the pattern to an
    ``(organism, name, id)`` triple; positions whose organism is
    ``"unknown"`` are ignored.

    A pattern with support of at least *min_cluster_support* whose two
    sides each contain as many organisms as the reference tree has leaves
    is turned into two masks, one per side.  If no such pattern exists, a
    single mask covering all positions is used.  Each mask is passed to
    AnalyseMask() together with a reference tree obtained from
    GetPrunedReferenceTree(), and the masks are then filtered by
    SelectMasks(), whose result is returned.

    Diagnostic output goes to stdout, depending on the module level
    ``param_loglevel``.  The reference tree is read from the module level
    ``param_reference_tree``.
    """

    # The species tree is parsed anew on every call.
    species_tree = TreeTools.Newick2Nexus(param_reference_tree).trees[0]
    norgs = len(species_tree.get_terminals())
    notus = len(patterns[0][1])

    # Put the single species patterns in front of the given patterns.  They
    # end up in descending order of the marked position.
    singles = []
    for pos in range(notus - 1, -1, -1):
        singles.append((100, "." * pos + "*" + "." * (notus - pos - 1)))
    patterns[:0] = singles

    ##########################################################################
    # first pass: separate well supported full species trees
    masks = []
    present_orgs = {}
    mask_id = 0

    for support, pattern in patterns:

        orgs_in, orgs_out = {}, {}
        names_in, names_out = [], []

        for pos, symbol in enumerate(pattern):
            org, name, nid = map_id2org[pos]
            if org == "unknown":
                continue
            present_orgs[org] = 1
            if symbol == "*":
                orgs_in[org] = 1
                names_in.append(name)
            else:
                orgs_out[org] = 1
                names_out.append(name)

        norgs_in = len(orgs_in)
        norgs_out = len(orgs_out)

        if param_loglevel >= 4:
            fields = ("# ", pattern, norgs_in, norgs_out, names_in, names_out)
            sys.stdout.write(" ".join(str(f) for f in fields) + "\n")
            sys.stdout.flush()

        if support < min_cluster_support:
            continue
        if not (norgs_in == norgs_out == norgs):
            continue

        # both sides hold a full complement of organisms: split here.
        mask_in = [int(symbol == "*") for symbol in pattern]
        mask_out = [1 - bit for bit in mask_in]
        notus_in = mask_in.count(1)
        notus_out = mask_out.count(1)

        mask_id += 1
        masks.append(Results(mask_in, notus_in, norgs_in, mask_id=mask_id))
        mask_id += 1
        masks.append(Results(mask_out, notus_out, norgs_out, mask_id=mask_id))

        if param_loglevel >= 2:
            for bits, n, norg, names in ((mask_in, notus_in, norgs_in,
                                          names_in),
                                         (mask_out, notus_out, norgs_out,
                                          names_out)):
                sys.stdout.write(
                    "# split\tfull\t%i\t%s\t%i\t%i\t%s" %
                    (support, "".join(map(str, bits)), n, norg,
                     ";".join(names)) + "\n")

    # add full mask
    if not masks:
        masks.append(Results([1] * notus, notus, len(present_orgs), mask_id=1))

    ##########################################################################
    # second pass: check subtrees for each mask
    # external: edges leading to external nodes (i.e., leaves): total number = norgs
    # internal: all other edges: maximum number = 2 * (2 * norgs - 3 - norgs) = 2 * (norgs - 3)
    # 1st factor 2: two directions
    # 2nd factor: 2n-3 is number of edges in unrooted tree.
    # 3rd factor: -n = number of external edges
    for mask in masks:
        pruned_tree = GetPrunedReferenceTree(mask, GetOrgs(map_id2org),
                                             param_reference_tree)
        AnalyseMask(mask, patterns, norgs, pruned_tree, map_id2org,
                    min_report_support)

    if param_loglevel >= 1:
        sys.stdout.write("# partitions after evaluation:\n")
        sys.stdout.write("# " + str(Results().printHeader()) + "\n")
        for m in masks:
            sys.stdout.write("# " + str(m) + "\n")

    # NOTE(review): the tree built here (for the last mask of the loop
    # above) is never used; the call is kept in case it has side effects.
    GetPrunedReferenceTree(mask, GetOrgs(map_id2org), param_reference_tree)
    selected = SelectMasks(masks, patterns, norgs, map_id2org,
                           min_report_support)

    if param_loglevel >= 1:
        sys.stdout.write("# partitions after selection:\n")
        sys.stdout.write("# " + str(Results().printHeader()) + "\n")
        for m in selected:
            sys.stdout.write("# " + str(m) + "\n")

    return selected