Esempi in Python per PhyloTree, esempi in Python per ete_dev.PhyloTree

Esempio n. 1

0

Mostra file

File: prx5.py Progetto: jhcepas/prx5_supplementary_data

def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    for line in open(trees_file):
        if not line.strip() or line.startswith('#'):
            continue

        t = PhyloTree(line, sp_naming_function=spname)
        #t.set_outgroup(t.get_midpoint_outgroup())

        for lf in t:
            lf.add_features(coded_name = lf.name)            
            if lf.name in NAME2SP:
                lf.name = "%s {%s}" %(lf.name, NAME2SP[lf.name])
         
        t.dist = 0
        ncbi.connect_database()
        name2sp = ncbi.get_name_translator(t.get_species())
        for lf in t.iter_leaves():
            lf.add_features(taxid=name2sp.get(lf.species, 0))

        t.set_outgroup(t.search_nodes(taxid=9606)[0])
        ncbi.annotate_tree(t, attr_name='taxid')
        t.set_outgroup(t.get_common_ancestor([lf for lf in t if outgroup_lineage in lf.named_lineage]))
        ncbi.annotate_tree(t, attr_name='taxid')
            
        #print t.write(features=[])
        #print t.write()
        yield t

Esempio n. 2

0

Mostra file

def get_example_tree():

    # Performs a tree reconciliation analysis
    gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
    genetree = PhyloTree(gene_tree_nw)
    sptree = PhyloTree(species_tree_nw)
    recon_tree, events = genetree.reconcile(sptree)
    recon_tree.link_to_alignment(alg)
    return recon_tree, TreeStyle()

Esempio n. 3

0

Mostra file

File: phylotree_visualization.py Progetto: MikeTrizna/ete

def get_example_tree():

    # Performs a tree reconciliation analysis 
    gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
    genetree = PhyloTree(gene_tree_nw)
    sptree = PhyloTree(species_tree_nw)
    recon_tree, events = genetree.reconcile(sptree)
    recon_tree.link_to_alignment(alg)
    return recon_tree, TreeStyle()

Esempio n. 4

0

Mostra file

File: test_ncbiquery.py Progetto: MikeTrizna/ete

  def test_tree_annotation(self):
    t = PhyloTree( "((9598, 9606), 10090);" )
    t.annotate_ncbi_taxa()
    self.assertEqual(t.sci_name, 'Euarchontoglires')
    homi = (t&'9606').up
    self.assertEqual(homi.sci_name, 'Homininae')
    self.assertEqual(homi.taxid, 207598)
    self.assertEqual(homi.rank, 'subfamily')
    self.assertEqual(homi.named_lineage, [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae'])
    self.assertEqual(homi.lineage, [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598] )

    human = t&'9606'
    self.assertEqual(human.sci_name, 'H**o sapiens')
    self.assertEqual(human.taxid, 9606)
    self.assertEqual(human.rank, 'species')
    self.assertEqual(human.named_lineage, [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae', u'H**o', u'H**o sapiens'])
    self.assertEqual(human.lineage, [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605, 9606])

Esempio n. 5

0

Mostra file

File: test_ncbiquery.py Progetto: tarah28/ete

    def test_tree_annotation(self):
        t = PhyloTree("((9598, 9606), 10090);")
        t.annotate_ncbi_taxa()
        self.assertEqual(t.sci_name, 'Euarchontoglires')
        homi = (t & '9606').up
        self.assertEqual(homi.sci_name, 'Homininae')
        self.assertEqual(homi.taxid, 207598)
        self.assertEqual(homi.rank, 'subfamily')
        self.assertEqual(homi.named_lineage, [
            u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
            u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
            u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
            u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
            u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
            u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
            u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
            u'Hominoidea', u'Hominidae', u'Homininae'
        ])
        self.assertEqual(homi.lineage, [
            1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
            7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
            32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
            9604, 207598
        ])

        human = t & '9606'
        self.assertEqual(human.sci_name, 'H**o sapiens')
        self.assertEqual(human.taxid, 9606)
        self.assertEqual(human.rank, 'species')
        self.assertEqual(human.named_lineage, [
            u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
            u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
            u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
            u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
            u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
            u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
            u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
            u'Hominoidea', u'Hominidae', u'Homininae', u'H**o', u'H**o sapiens'
        ])
        self.assertEqual(human.lineage, [
            1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
            7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
            32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
            9604, 207598, 9605, 9606
        ])

Esempio n. 6

0

Mostra file

File: phylomeDB.py Progetto: tarah28/ete

    def get_tree(self, protid, method, phylome_id):
        """ Returns the method-tree associated to a given protid. """

        cmd = 'SELECT newick,lk FROM %s WHERE phylome_id=%s AND species="%s" AND protid="%s" AND method ="%s"' %\
            (self._trees_table, phylome_id, protid[:3],protid[3:],method)
        if self._SQL.execute(cmd):
            entry = self._SQL.fetchone()
            nw = entry[0]
            lk = float(entry[1])
            t  = PhyloTree(nw)
        else:
            t  = None
            lk = None
        return t,lk

Esempio n. 7

0

Mostra file

File: phylomeDB.py Progetto: tarah28/ete

    def get_best_tree(self, protid, phylome_id):
        """ Returns the winner ML tree"""

        likelihoods    = {}
        winner_model   = None
        winner_lk      = None
        winner_newick  = None
        t = None
        command ='SELECT newick,method,lk FROM %s WHERE phylome_id=%s AND species="%s" and protid="%s";' \
            % (self._trees_table,phylome_id, protid[:3], protid[3:])
        self._SQL.execute(command)
        result = self._SQL.fetchall()
        for r in result:
            nw,m,lk = r
            if lk < 0:
                likelihoods[m] = lk
                if  winner_lk==None or lk > winner_lk:
                    winner_lk     = lk
                    winner_model  = m
                    winner_newick = nw
        if winner_newick:
            t = PhyloTree(winner_newick)
        return winner_model,likelihoods,t

Esempio n. 8

0

Mostra file

File: new_seq_face.py Progetto: MikeTrizna/ete

    '''
    layout for CodemlTree
    '''
    if hasattr(node, "collapsed"):
        if node.collapsed == 1:
            node.img_style["draw_descendants"]= False
    if node.is_leaf():
        if hasattr (node, "sequence"):
            seqface =  MySequenceFace(node.sequence, "nt",
                                      fsize=10,
                                      col_w=11, interactive=True)
            faces.add_face_to_node(seqface, node, 1, aligned=True)

            
if __name__ == "__main__":
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment("""
                           >Chimp
                           HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    nt_sequences = {"Human"    : "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
                    "Chimp"    : "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
                    "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
                }
    for l in nt_sequences:
        (tree & l).nt_sequence = nt_sequences[l]
    tree.dist = 0

Esempio n. 9

0

Mostra file

     #    tree.run_model("fb")
     #    tree.run_model("M2")
     #except:
     #    pass
     tree.dist = 0
     ts = TreeStyle()
     ts.title.add_face(TextFace(
         "Example for EvolTree, interactivity shows codons", fsize=15),
                       column=0)
     ts.layout_fn = test_layout_evol
     #try:
     #    tree.show(tree_style=ts, histfaces=["M2"])
     #except:
     tree.show(tree_style=ts)
 except:
     tree = PhyloTree('(Orangutan,Human,Chimp);')
     tree.link_to_alignment("""
                            >Chimp
                            HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                            >Orangutan
                            DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                            >Human
                            DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                            """)
     nt_sequences = {
         "Human":
         "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
         "Chimp":
         "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
         "Orangutan":
         "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"

Esempio n. 10

0

Mostra file

File: ete_ncbicomp.py Progetto: tarah28/ete

def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    parser.add_argument("--show",
                        dest="show_tree",
                        action="store_true",
                        help="""Display tree after the analysis.""")

    parser.add_argument("--render",
                        dest="render",
                        action="store_true",
                        help="""Render tree.""")

    parser.add_argument("--dump",
                        dest="dump",
                        action="store_true",
                        help="""Dump analysis""")

    parser.add_argument(
        "--explore",
        dest="explore",
        type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t",
                            "--tree",
                            dest="target_tree",
                            nargs="+",
                            type=str,
                            help="""Tree file in newick format""")

    input_args.add_argument("-tf",
                            dest="tree_list_file",
                            type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax",
                        dest="tax_info",
                        type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument(
        "--sp_delimiter",
        dest="sp_delimiter",
        type=str,
        help=
        "If taxid is part of the leaf name, delimiter used to split the string"
    )

    parser.add_argument(
        "--sp_field",
        dest="sp_field",
        type=int,
        default=0,
        help="field position for taxid after splitting leaf names")

    parser.add_argument("--ref",
                        dest="ref_tree",
                        type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only",
                        dest="rf_only",
                        action="store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        type=str,
        nargs="+",
        help="A list of node names defining the trees outgroup")

    parser.add_argument("--is_sptree",
                        dest="is_sptree",
                        action="store_true",
                        help="Assumes no duplication nodes in the tree")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")

    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")

    parser.add_argument("--dump_tax_info",
                        dest="dump_tax_info",
                        action="store_true",
                        help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >> sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        print >> sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >> sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Clade sizes", "RF (avg)", "RF (med)",
              "RF (std)", "RF (max)", "Shared tips")
    print >> OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >> sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees = t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(
                t, subtrees, show_tree=SHOW_TREE)

            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf

        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0  # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" % ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(
                        set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append(
                        (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes),
                                               numpy.median(broken_sizes),
                                               numpy.std(broken_sizes))
        iter_values = [
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std,
            rf_max, common_names
        ]
        print >> OUT, '|'.join(
            map(lambda x: str(x).strip().ljust(15), iter_values))
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems),
                                "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" % fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %
                  problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, fixed_string,
            problems_string
        ])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1

                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))

    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes",
              "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)

    if args.output:
        OUT.close()

Esempio n. 11

0

Mostra file

File: ete_dist.py Progetto: MikeTrizna/ete

def main(argv):
    
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                            formatter_class=argparse.RawDescriptionHelpFormatter)


    input_args = parser.add_argument_group("INPUT OPTIONS")
    input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*",
                   help='a list of source tree files')
    
    input_args.add_argument("--source_file", dest="source_file", 
                        type=str, 
                        help="""path to a file containing many source trees, one per line""")

    input_args.add_argument("-r", dest="reftree", 
                        type=str, required=True,
                        help="""Reference tree""")

    input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", 
                            type=str, default="name",
                            help=("attribute in ref tree used as leaf name"))
    
    input_args.add_argument("--src_tree_attr", dest="src_tree_attr", 
                            type=str, default="name",
                            help=("attribute in source tree used as leaf name"))

    input_args.add_argument("--min_support_ref",
                            type=float, default=0.0,
                        help=("min support for branches to be considered from the ref tree"))
    input_args.add_argument("--min_support_src",
                        type=float, default=0.0,
                        help=("min support for branches to be considered from the source tree"))

    
    output_args = parser.add_argument_group("OUTPUT OPTIONS")
    
    output_args.add_argument("-o", dest="output", 
                            type=str,
                            help="""Path to the tab delimited report file""")

    
    opt_args = parser.add_argument_group("DISTANCE OPTIONS")
    

    opt_args.add_argument("--outgroup", dest="outgroup", 
                        nargs = "+",
                        help="""outgroup used to root reference and source trees before distance computation""")
  
    opt_args.add_argument("--expand_polytomies", dest="polytomies", 
                        action = "store_true",
                        help="""expand politomies if necessary""")
  
    opt_args.add_argument("--unrooted", dest="unrooted", 
                        action = "store_true",
                        help="""compare trees as unrooted""")

    opt_args.add_argument("--min_support", dest="min_support", 
                        type=float, default=0.0,
                        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    opt_args = parser.add_argument_group("PHYLOGENETICS OPTIONS")
    
    opt_args.add_argument("--extract_species",
                        action = "store_true",
                        help="When used, leaf names in the reference and source trees are assumed to represent species."
                          " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name,"
                          " it can be automatically extracted by providing a Perl regular expression that extract a "
                          " valid species code (see --sp_regexp). Such information will be also used to detect duplication"
                          " events. ")

    opt_args.add_argument("--sp_regexp", 
                          type=str,
                         help=("Specifies a Perl regular expression to automatically extract species names"
                          " from the name string in source trees. If not used, leaf names are assumed to represent species names."
                          " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'."))
        
    opt_args.add_argument("--collateral", 
                        action='store_true', 
                        help=(""))

    
    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.source_file and args.source_trees:
        print >>sys.stderr, 'The use of targets_file and targets at the same time is not supported.'
        sys.exit(1)
        
    if args.source_file:
        source_trees = tree_iterator(args.source_file)
    else:
        source_trees = args.source_trees
        
    ref_tree = Tree(reftree)

    if args.ref_tree_attr:
        for lf in ref_tree.iter_leaves():
            lf._origname = lf.name
            if args.ref_tree_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_tree_attr)
    
    if args.outgroup:
        if len(args.outgroup) > 1:
            out = ref_tree.get_common_ancestor(args.outgroup)
        else:
            out = ref_tree.search_nodes(name=args.outgroup[0])[0]
        ref_tree.set_outgroup(out)
                     

    HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist")
    if args.output:
        OUT = open(args.output, "w")
        print >>OUT, '# ' + ctime()
        print >>OUT, '# ' + ' '.join(sys.argv) 
        print >>OUT, '#'+'\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv) 
        COL_WIDTHS = [20, 20] + [9] * 10
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')
        
                
    prev_tree = None
    ref_fname = os.path.basename(args.reftree)
    for counter, tfile in enumerate(source_trees):
        if args.source_file:
            seedid, tfile = tfile
        else:
            seedid = None
           
        if args.extract_species:

            if args.sp_regexp:
                SPMATCHER = re.compile(args.sp_regexp)
                get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0]
            else:
                get_sp_name = lambda x: x
                
            tt = PhyloTree(tfile, sp_naming_function = get_sp_name)
        else:
            tt = Tree(tfile)

        if args.src_tree_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.src_tree_attr)
            
        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)
        
        if args.source_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' %counter                          


            
        r = tt.compare(ref_tree, 
                       ref_tree_attr=args.ref_tree_attr,
                       source_tree_attr=args.src_tree_attr,
                       min_support_ref=args.min_support_ref,
                       min_support_source = args.min_support_src,
                       unrooted=args.unrooted,
                       has_duplications=args.extract_species)

                          

        print_table([map(istr, [fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'],
                               r['rf'], r['max_rf'], r["source_edges_in_ref"],
                               r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist']])],
                    fix_col_width = COL_WIDTHS, wrap_style='cut')
                          

    if args.output:
        OUT.close()

Esempio n. 12

0

Mostra file

    '''
    if hasattr(node, "collapsed"):
        if node.collapsed == 1:
            node.img_style["draw_descendants"] = False
    if node.is_leaf():
        if hasattr(node, "sequence"):
            seqface = MySequenceFace(node.sequence,
                                     "nt",
                                     fsize=10,
                                     col_w=11,
                                     interactive=True)
            faces.add_face_to_node(seqface, node, 1, aligned=True)


if __name__ == "__main__":
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment("""
                           >Chimp
                           HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    nt_sequences = {
        "Human":
        "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
        "Chimp":
        "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
        "Orangutan":
        "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"

Esempio n. 13

0

Mostra file

File: orthology_and_paralogy_prediction.py Progetto: MikeTrizna/ete

from ete_dev import PhyloTree
# Loads an example tree
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),
(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
print t
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# To obtain all the evolutionary events involving a given leaf node we
# use get_my_evol_events method
matches = t.search_nodes(name="Hsa_001")

Esempio n. 14

0

Mostra file

        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    #header = "filename", "refname", "# subtrees", "# dups", "broken subtrees", "ncbi_mistakes", "RF", "avg RF", "RF std", "max RF", "")
    #print '\t'.join(header)
    header = ("Tree".center(50), "Total subtrees", "Broken subtrees",
              "Broken NCBI clades", "RF (avg)", "RF (med)", "RF (std)",
              "RF (max possible)")
    print >> OUT, "#" + ' '.join([h.center(15) for h in header])
    for tfile in target_trees:
        print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

Esempio n. 15

0

Mostra file

File: link_sequences_to_phylogenies.py Progetto: MikeTrizna/ete

 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
"""
iphylip_txt = """
 4 76
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta")

#We can now access the sequence of every leaf node
print "These are the nodes and its sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"
for leaf in t.iter_leaves():

Esempio n. 16

0

Mostra file

File: test_ncbiquery.py Progetto: tarah28/ete

 def test_ncbi_compare(self):
     t = PhyloTree("((9606, (9598, 9606)), 10090);",
                   sp_naming_function=lambda x: x.name)
     t.annotate_ncbi_taxa()

Esempio n. 17

0

Mostra file

File: alignment_visualization.py Progetto: tarah28/ete

from ete_dev import PhyloTree, PhylomeDBConnector, SeqGroup

p = PhylomeDBConnector()
w,x, t =  p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)
A = SeqGroup(a, "iphylip")
for s in A.id2seq:
    A.id2seq[s]=A.id2seq[s][:30]
t.link_to_alignment(A)
print t.get_species()
print t
t.set_outgroup(t&"Ddi0002240")

sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);")
reconciled, evs = t.reconcile(sp)
print reconciled
reconciled.show()

Esempio n. 18

0

Mostra file

File: ete_ncbiquery.py Progetto: MikeTrizna/ete

def main(argv):
    
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",  dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')
    
    input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+",  
                        type=int, 
                        help="""taxids (space separated)""")

    input_args.add_argument("-if", "--taxid_file", dest="taxid_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")

    input_args.add_argument("-t", "--reftree", dest="reftree",   
                        type=str, 
                        help="""Read taxids from the provided tree.""")
    
    input_args.add_argument("--reftree_attr", dest="reftree_attr",   
                        type=str, default="name",
                        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')
    
    name_input_args.add_argument("-n", "--name", dest="names", nargs="+",  
                        type=str, 
                        help="""species or taxa names (comma separated)""")

    name_input_args.add_argument("-nf", "--names_file", dest="names_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")
    name_input_args.add_argument("--fuzzy", dest="fuzzy", type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')
    
    output_args.add_argument("-x", "--taxonomy", dest="taxonomy",   
                        type=str, 
                        help=("dump a pruned version of the NCBI taxonomy"
                              " tree containing target species into the specified file"))

    output_args.add_argument("-l", "--list", dest="info_list",   
                        type=str, 
                        help="""dump NCBI taxonmy information for each target species into the specified file. """)

    output_args.add_argument("-a", "--annotated", dest="annotated_tree",   
                        type=str, 
                        help="dump the annotated tree of the input reftree provided with -t into the specified file.")                             
    
    output_args.add_argument("--collapse_subspecies", dest="collapse_subspecies",   
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    output_args.add_argument("--rank_limit", dest="rank_limit",   
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))
    
    output_args.add_argument("--full_lineage", dest="full_lineage",   
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))
        
    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')
        
    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')
        
    
    ncbi = NCBITaxa(args.dbfile)
        
    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(map(strip, open(args.names_file, "rU").read().split("\n")))
        
    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))
        
    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations in %s.name_translation.txt ... ")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" %sim
                    
        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(map(str, [score, name, realname.capitalize(), taxid]))
            
    if args.taxid_file:
        all_taxids.extend(map(strip, open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)
        
    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()])))
       
    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list+".info.txt"
        log.info("Dumping %d taxid translations in %s ..." %(len(all_taxids), outfile))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        OUT = open(outfile, "w")
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print >>OUT, "\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ]))
            
        OUT.close()    
        for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()):
            print >>sys.stderr, notfound, "NOT FOUND"
    
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." %(len(all_taxids), args.taxonomy))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
            
        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse() if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" %n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")
                
            
        t.write(format=9, outfile=args.taxonomy+".names.nw")
        t.write(format=8, outfile=args.taxonomy+".allnames.nw")
        t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"],
                outfile=args.taxonomy+".full_annotation.nw")
        
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy+".taxids.nw")
        t.write(format=8, outfile=args.taxonomy+".alltaxids.nw")


    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name = translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi._translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)
            
        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])

Esempio n. 19

0

Mostra file

File: species_aware_phylogenies.py Progetto: tarah28/ete

from ete_dev import PhyloTree
# Reads a phylogenetic tree (using default species name encoding)
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));")
#                              /-Hsa_001
#                    /--------|
#                   |          \-Ptr_001
#          /--------|
#         |         |          /-Cfa_001
#         |          \--------|
#---------|                    \-Mms_001
#         |
#         |          /-Dme_001
#          \--------|
#                    \-Dme_002
#
# Prints current leaf names and species codes
print "Deafult mode:"
for n in t.get_leaves():
    print "node:", n.name, "Species name:", n.species
# node: Dme_001 Species name: Dme
# node: Dme_002 Species name: Dme
# node: Hsa_001 Species name: Hsa
# node: Ptr_001 Species name: Ptr
# node: Cfa_001 Species name: Cfa
# node: Mms_001 Species name: Mms
#
# We can also use our own leaf name parsing function to obtain species
# names. All we need to do is create a python function that takes
# node's name as argument and return its corresponding species name.
def get_species_name(node_name_string):
    # Species code is the first part of leaf name (separated by an

Esempio n. 20

0

Mostra file

File: webplugin_example.py Progetto: tarah28/ete

def my_tree_loader(tree):
    """ This is function is used to load trees within the
    WebTreeApplication object. """
    t = PhyloTree(tree, sp_naming_function=extract_species_code)
    return t

Esempio n. 21

0

Mostra file

File: ete_dist.py Progetto: tarah28/ete

def main(argv):

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_args = parser.add_argument_group("INPUT OPTIONS")
    input_args.add_argument("source_trees",
                            metavar='source_trees',
                            type=str,
                            nargs="*",
                            help='a list of source tree files')

    input_args.add_argument(
        "--source_file",
        dest="source_file",
        type=str,
        help="""path to a file containing many source trees, one per line""")

    input_args.add_argument("-r",
                            dest="reftree",
                            type=str,
                            required=True,
                            help="""Reference tree""")

    input_args.add_argument("--ref_tree_attr",
                            dest="ref_tree_attr",
                            type=str,
                            default="name",
                            help=("attribute in ref tree used as leaf name"))

    input_args.add_argument(
        "--src_tree_attr",
        dest="src_tree_attr",
        type=str,
        default="name",
        help=("attribute in source tree used as leaf name"))

    input_args.add_argument(
        "--min_support_ref",
        type=float,
        default=0.0,
        help=("min support for branches to be considered from the ref tree"))
    input_args.add_argument(
        "--min_support_src",
        type=float,
        default=0.0,
        help=(
            "min support for branches to be considered from the source tree"))

    output_args = parser.add_argument_group("OUTPUT OPTIONS")

    output_args.add_argument("-o",
                             dest="output",
                             type=str,
                             help="""Path to the tab delimited report file""")

    opt_args = parser.add_argument_group("DISTANCE OPTIONS")

    opt_args.add_argument(
        "--outgroup",
        dest="outgroup",
        nargs="+",
        help=
        """outgroup used to root reference and source trees before distance computation"""
    )

    opt_args.add_argument("--expand_polytomies",
                          dest="polytomies",
                          action="store_true",
                          help="""expand politomies if necessary""")

    opt_args.add_argument("--unrooted",
                          dest="unrooted",
                          action="store_true",
                          help="""compare trees as unrooted""")

    opt_args.add_argument(
        "--min_support",
        dest="min_support",
        type=float,
        default=0.0,
        help=
        ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"
         ))

    opt_args = parser.add_argument_group("PHYLOGENETICS OPTIONS")

    opt_args.add_argument(
        "--extract_species",
        action="store_true",
        help=
        "When used, leaf names in the reference and source trees are assumed to represent species."
        " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name,"
        " it can be automatically extracted by providing a Perl regular expression that extract a "
        " valid species code (see --sp_regexp). Such information will be also used to detect duplication"
        " events. ")

    opt_args.add_argument(
        "--sp_regexp",
        type=str,
        help=
        ("Specifies a Perl regular expression to automatically extract species names"
         " from the name string in source trees. If not used, leaf names are assumed to represent species names."
         " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'."
         ))

    opt_args.add_argument("--collateral", action='store_true', help=(""))

    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.source_file and args.source_trees:
        print >> sys.stderr, 'The use of targets_file and targets at the same time is not supported.'
        sys.exit(1)

    if args.source_file:
        source_trees = tree_iterator(args.source_file)
    else:
        source_trees = args.source_trees

    ref_tree = Tree(reftree)

    if args.ref_tree_attr:
        for lf in ref_tree.iter_leaves():
            lf._origname = lf.name
            if args.ref_tree_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_tree_attr)

    if args.outgroup:
        if len(args.outgroup) > 1:
            out = ref_tree.get_common_ancestor(args.outgroup)
        else:
            out = ref_tree.search_nodes(name=args.outgroup[0])[0]
        ref_tree.set_outgroup(out)

    HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF',
              'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist")
    if args.output:
        OUT = open(args.output, "w")
        print >> OUT, '# ' + ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '#' + '\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv)
        COL_WIDTHS = [20, 20] + [9] * 10
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    prev_tree = None
    ref_fname = os.path.basename(args.reftree)
    for counter, tfile in enumerate(source_trees):
        if args.source_file:
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:

            if args.sp_regexp:
                SPMATCHER = re.compile(args.sp_regexp)
                get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0]
            else:
                get_sp_name = lambda x: x

            tt = PhyloTree(tfile, sp_naming_function=get_sp_name)
        else:
            tt = Tree(tfile)

        if args.src_tree_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.src_tree_attr)

        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)

        if args.source_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' % counter

        r = tt.compare(ref_tree,
                       ref_tree_attr=args.ref_tree_attr,
                       source_tree_attr=args.src_tree_attr,
                       min_support_ref=args.min_support_ref,
                       min_support_source=args.min_support_src,
                       unrooted=args.unrooted,
                       has_duplications=args.extract_species)

        print_table([
            map(istr, [
                fname[-30:], ref_fname[-30:], r['effective_tree_size'],
                r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"],
                r["ref_edges_in_source"], r['source_subtrees'],
                r['treeko_dist']
            ])
        ],
                    fix_col_width=COL_WIDTHS,
                    wrap_style='cut')

    if args.output:
        OUT.close()

Esempio n. 22

0

Mostra file

File: test_ncbiquery.py Progetto: MikeTrizna/ete

 def test_ncbi_compare(self):
   t = PhyloTree( "((9606, (9598, 9606)), 10090);", sp_naming_function=lambda x: x.name )
   t.annotate_ncbi_taxa()

Esempio n. 23

0

Mostra file

File: phylotree_visualization.py Progetto: xguse/ete

 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Mms_001
 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Hsa_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_002
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_002
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Hsa_002
 MAEAPDETIQQFM-LTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Mmu_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_001
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_001
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
"""

# Performs a tree reconciliation analysis 
gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
genetree = PhyloTree(gene_tree_nw)
sptree = PhyloTree(species_tree_nw)
recon_tree, events = genetree.reconcile(sptree)
recon_tree.link_to_alignment(alg)

# Visualize the reconciled tree
recon_tree.render("phylotree.png", w=750)

Esempio n. 24

0

Mostra file

File: ete_ncbiquery.py Progetto: tarah28/ete

def main(argv):

    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",
                        dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')

    input_args.add_argument("-i",
                            "--taxid",
                            dest="taxid",
                            nargs="+",
                            type=int,
                            help="""taxids (space separated)""")

    input_args.add_argument(
        "-if",
        "--taxid_file",
        dest="taxid_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    input_args.add_argument("-t",
                            "--reftree",
                            dest="reftree",
                            type=str,
                            help="""Read taxids from the provided tree.""")

    input_args.add_argument(
        "--reftree_attr",
        dest="reftree_attr",
        type=str,
        default="name",
        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')

    name_input_args.add_argument(
        "-n",
        "--name",
        dest="names",
        nargs="+",
        type=str,
        help="""species or taxa names (comma separated)""")

    name_input_args.add_argument(
        "-nf",
        "--names_file",
        dest="names_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")
    name_input_args.add_argument(
        "--fuzzy",
        dest="fuzzy",
        type=float,
        help=("Tries a fuzzy (and SLOW) search for those"
              " species names that could not be translated"
              " into taxids. A float number must be provided"
              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')

    output_args.add_argument(
        "-x",
        "--taxonomy",
        dest="taxonomy",
        type=str,
        help=("dump a pruned version of the NCBI taxonomy"
              " tree containing target species into the specified file"))

    output_args.add_argument(
        "-l",
        "--list",
        dest="info_list",
        type=str,
        help=
        """dump NCBI taxonmy information for each target species into the specified file. """
    )

    output_args.add_argument(
        "-a",
        "--annotated",
        dest="annotated_tree",
        type=str,
        help=
        "dump the annotated tree of the input reftree provided with -t into the specified file."
    )

    output_args.add_argument(
        "--collapse_subspecies",
        dest="collapse_subspecies",
        action="store_true",
        help=("When used, all nodes under the the species rank"
              " are collapsed, so all species and subspecies"
              " are seen as sister nodes"))

    output_args.add_argument(
        "--rank_limit",
        dest="rank_limit",
        type=str,
        help=("When used, all nodes under the provided rank"
              " are discarded"))

    output_args.add_argument(
        "--full_lineage",
        dest="full_lineage",
        action="store_true",
        help=("When used, topology is not pruned to avoid "
              " one-child-nodes, so the complete lineage"
              " track leading from root to tips is kept."))

    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')

    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')

    ncbi = NCBITaxa(args.dbfile)

    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(
            map(strip,
                open(args.names_file, "rU").read().split("\n")))

    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))

    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations in %s.name_translation.txt ... ")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(
                map(str,
                    [score, name, realname.capitalize(), taxid]))

    if args.taxid_file:
        all_taxids.extend(
            map(strip,
                open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            list(
                set([
                    getattr(n, args.reftree_attr)
                    for n in reftree.iter_leaves()
                ])))

    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list + ".info.txt"
        log.info("Dumping %d taxid translations in %s ..." %
                 (len(all_taxids), outfile))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        OUT = open(outfile, "w")
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print >> OUT, "\t".join(
                map(str, [
                    merge_conversion.get(int(taxid), taxid), name,
                    named_lineage, lineage
                ]))

        OUT.close()
        for notfound in set(map(str, all_taxids)) - set(
                str(k) for k in translator.iterkeys()):
            print >> sys.stderr, notfound, "NOT FOUND"

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." %
                 (len(all_taxids), args.taxonomy))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [
                n for n in t.traverse() if n.rank == "species"
                if int(n.taxid) in all_taxids
            ]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        t.write(format=9, outfile=args.taxonomy + ".names.nw")
        t.write(format=8, outfile=args.taxonomy + ".allnames.nw")
        t.write(format=9,
                features=[
                    "taxid", "name", "rank", "bgcolor", "sci_name",
                    "collapse_subspecies", "named_lineage"
                ],
                outfile=args.taxonomy + ".full_annotation.nw")

        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy + ".taxids.nw")
        t.write(format=8, outfile=args.taxonomy + ".alltaxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi._translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])

Esempio n. 25

0

Mostra file

File: orthology_and_paralogy_prediction.py Progetto: tarah28/ete

from ete_dev import PhyloTree
# Loads an example tree
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),
(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
print t
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# To obtain all the evolutionary events involving a given leaf node we
# use get_my_evol_events method
matches = t.search_nodes(name="Hsa_001")

Esempio n. 26

0

Mostra file

File: prx5.py Progetto: jhcepas/prx5_supplementary_data

def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    for line in open(trees_file):
        if not line.strip() or line.startswith('#'):
            continue

        t = PhyloTree(line, sp_naming_function=spname)
        #t.set_outgroup(t.get_midpoint_outgroup())

        for lf in t:
            lf.add_features(coded_name=lf.name)
            if lf.name in NAME2SP:
                lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])

        t.dist = 0
        ncbi.connect_database()
        name2sp = ncbi.get_name_translator(t.get_species())
        for lf in t.iter_leaves():
            lf.add_features(taxid=name2sp.get(lf.species, 0))

        t.set_outgroup(t.search_nodes(taxid=9606)[0])
        ncbi.annotate_tree(t, attr_name='taxid')
        t.set_outgroup(
            t.get_common_ancestor(
                [lf for lf in t if outgroup_lineage in lf.named_lineage]))
        ncbi.annotate_tree(t, attr_name='taxid')

        #print t.write(features=[])
        #print t.write()
        yield t

Esempio n. 27

0

Mostra file

File: test_treeko.py Progetto: alxndrsPittis/ete

import sys
from collections import defaultdict
from ete_dev import PhyloTree

if len(sys.argv) > 1:
    t = PhyloTree(sys.argv[1])
else:
    t = PhyloTree()
    #t.populate(5000, reuse_names=True, names_library=map(lambda x: "%03d" %x, range(100)))

    #t.populate(5000, reuse_names=True, names_library=["aaa", "bbb", "ccc","dddd"])
    #t.set_species_naming_function(lambda x: x[:3])
    #t = PhyloTree("((((Kla0008018:0.226825,(Kwa0003593:0.270871,(((((((Sce0006606:0.020101,(Smi0000169:0.045626,Sku0001100:0.091634)0.9:0.021336)0.473:0.004546,Spa0001368:0)0.806:0.040152,Sba0000063:0.059101)0.967:0.124536,Sca0004780:0.57162)0.36:0.045976,Cgl0005705:0.244154)0.94:0.080608,(((Spa0003632:0.005291,Sce0012358:0.019313)0.879:0.014349,Smi0005102:0.031246)0.028:0.000541,(Sba0002319:0.027948,Sku0001858:0.037758)0.873:0.023849)0.995:0.14497)0.859:0.056767,(Sca0004490:0.235469,Kpo0005032:0.313188)0.699:0.077825)0.807:0.085287)0.523:0.049374)0.606:0.167197,Ago0006484:0.438321)0.976:0.605273,Cal0012751:1.95721)0.975:0.332581,(Cal0010356:0.478947,((Ago0007434:1.13211,Kwa0002043:1.20443)0.282:0.216219,(Skl0001126:0.276168,Cgl0008719:0.5381)0.454:0.191735)0.934:0.438082)0.975:0.332581);")
    #t = PhyloTree("((((((AAA1, AAA2),((BBB1,BBB2), AAA3)D1),(CCC1,CCC2)), AAA8)D2, (((AAA5, AAA6),((BBB5,BBB6), AAA4)D3),(CCC3,CCC4)))D4, D);", format=1)
    t = PhyloTree(
        "((((((((AAA1, AAA2:0.111)a1,(((BBB1,ZZZ1)a2,MMM1)a3,AAA4)a4)a5, AAA3)a6,(AAA4, (AAA5, XXX1)a8)a9)a10,DDD)a11,DDD)a12,DDD)a13,DDD)root;",
        format=1)
    print t.get_ascii()

ntrees, ndups, sp_trees = t.get_speciation_trees(map_features=["dist"])

for sptree in sp_trees:
    print sptree.get_ascii(attributes=["dist"])

Esempio n. 28

0

Mostra file

File: ete_ncbicomp.py Progetto: MikeTrizna/ete

def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
    
    parser.add_argument("--show", dest="show_tree",
                        action="store_true", 
                        help="""Display tree after the analysis.""")
    
    parser.add_argument("--render", dest="render",
                        action="store_true", 
                        help="""Render tree.""")

    parser.add_argument("--dump", dest="dump",
                        action="store_true", 
                        help="""Dump analysis""")

    parser.add_argument("--explore", dest="explore",
                        type=str,
                        help="""Reads a previously analyzed tree and visualize it""")
    
    input_args = parser.add_mutually_exclusive_group()
    input_args.required=True
    input_args.add_argument("-t", "--tree", dest="target_tree",  nargs="+",
                        type=str,
                        help="""Tree file in newick format""")

    input_args.add_argument("-tf", dest="tree_list_file",
                        type=str, 
                        help="File with the list of tree files")
    
    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str,
                        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument("--sp_field", dest="sp_field", type=int, default=0,
                        help="field position for taxid after splitting leaf names")
    
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only", dest="rf_only",
                        action = "store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument("--outgroup", dest="outgroup",
                        type=str, nargs="+",
                        help="A list of node names defining the trees outgroup")
    
    parser.add_argument("--is_sptree", dest="is_sptree",
                        action = "store_true",
                        help="Assumes no duplication nodes in the tree")
    
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str,
                        help="")
    
    parser.add_argument("--tax2track", dest="tax2track", type=str,
                        help="")
    
    parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true",
                        help="")
    
    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None
    
    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >>sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout 
        ts.mode = "r"
        t.show(tree_style=ts)
        print >>sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()
    
    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >>sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    print >>OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >>sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

        
    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)
        
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
       
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
            
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

          
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE)
            
            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0
            
        ndups = 0
        nsubtrees = len(subtrees)
           
        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf
            
        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0 # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" %ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                       rf_std, rf_max, common_names] 
        print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) 
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems =  sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems), "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" %fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout 
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1
                    
                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))
            
    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER)
            
    if args.output:
        OUT.close()

Esempio n. 29

0

Mostra file

File: ncbi_consensus.py Progetto: Bioinfo2015/ncbi_taxonomy

        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    #header = "filename", "refname", "# subtrees", "# dups", "broken subtrees", "ncbi_mistakes", "RF", "avg RF", "RF std", "max RF", "")
    #print '\t'.join(header)
    header = ("Tree".center(50), "Total subtrees", "Broken subtrees", "Broken NCBI clades", "RF (avg)", "RF (med)", "RF (std)", "RF (max possible)")
    print >>OUT, "#"+' '.join([h.center(15) for h in header])
    for tfile in target_trees:
        print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

Esempio n. 30

0

Mostra file

File: tree_reconciliation.py Progetto: tarah28/ete

from ete_dev import PhyloTree

# Loads a gene tree and its corresponding species tree. Note that
# species names in sptree are the 3 firs letters of leaf nodes in
# genetree.
gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
genetree = PhyloTree(gene_tree_nw)
sptree = PhyloTree(species_tree_nw)
print genetree
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# Let's reconcile our genetree with the species tree

Esempio n. 31

0

Mostra file

"""
iphylip_txt = """
 4 76
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
t = PhyloTree("(((seqA,seqB),seqC),seqD);",
              alignment=fasta_txt,
              alg_format="fasta")

#We can now access the sequence of every leaf node
print "These are the nodes and its sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"

Esempio n. 32

0

Mostra file

from ete_dev import PhyloTree
# Creates a gene phylogeny with several duplication events at
# different levels. Note that we are using the default method for
# detecting the species code of leaves (three first lettes in the node
# name are considered the species code).
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((((Hsa_001,Hsa_003),Ptr_001)
,Mmu_001),((Hsa_004,Ptr_004),Mmu_004))),(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
print "Original tree:",
print t
#
#             /-Dme_001
#   /--------|
#  |          \-Dme_002
#  |
#  |                              /-Cfa_001
#  |                    /--------|
#  |                   |          \-Mms_001
#  |                   |
#--|                   |                                        /-Hsa_001
#  |                   |                              /--------|
#  |          /--------|                    /--------|          \-Hsa_003
#  |         |         |                   |         |
#  |         |         |          /--------|          \-Ptr_001
#  |         |         |         |         |
#  |         |         |         |          \-Mmu_001
#  |         |          \--------|
#   \--------|                   |                    /-Hsa_004
#            |                   |          /--------|