Esempio n. 1
0
def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    """Yield one annotated and rooted PhyloTree per newick line of a file.

    Blank lines and lines starting with ``#`` are skipped. For every other
    line a ``PhyloTree`` is built (species names extracted with ``spname``),
    each leaf keeps its original name in the ``coded_name`` feature, and
    leaves found in ``NAME2SP`` get their name expanded to
    ``"<name> {<NAME2SP value>}"``. Leaves are then tagged with a ``taxid``
    feature and the tree is annotated through ``ncbi``.

    Args:
        trees_file: path to a text file holding one newick tree per line.
        outgroup_lineage: lineage name; the common ancestor of all leaves
            whose ``named_lineage`` contains it becomes the final outgroup.

    Yields:
        The annotated ``PhyloTree`` built from each tree line.

    Raises:
        IndexError: if a tree has no node with ``taxid == 9606``.
    """
    # ``with`` guarantees the handle is released once the generator is
    # exhausted or closed; the bare ``open()`` used before leaked it.
    with open(trees_file) as trees:
        for line in trees:
            if not line.strip() or line.startswith('#'):
                continue

            t = PhyloTree(line, sp_naming_function=spname)

            for lf in t:
                lf.add_features(coded_name=lf.name)
                if lf.name in NAME2SP:
                    lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])

            t.dist = 0
            ncbi.connect_database()
            name2sp = ncbi.get_name_translator(t.get_species())
            for lf in t.iter_leaves():
                # Species missing from the translator get taxid 0.
                lf.add_features(taxid=name2sp.get(lf.species, 0))

            # First pass: root on the leaf tagged 9606 and annotate, which is
            # what makes ``named_lineage`` available for the real rooting.
            t.set_outgroup(t.search_nodes(taxid=9606)[0])
            ncbi.annotate_tree(t, attr_name='taxid')

            # Second pass: root on the requested lineage and re-annotate.
            outgroup_leaves = [
                lf for lf in t if outgroup_lineage in lf.named_lineage
            ]
            t.set_outgroup(t.get_common_ancestor(outgroup_leaves))
            ncbi.annotate_tree(t, attr_name='taxid')

            yield t
Esempio n. 2
0
def get_example_tree():
    """Build the demo tree: a gene tree reconciled with its species tree.

    The reconciled tree is linked to the alignment ``alg`` (expected to be
    defined elsewhere in the module) and returned together with a default
    ``TreeStyle``.

    Returns:
        tuple: ``(reconciled_tree, tree_style)``.
    """
    gene_newick = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_newick = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"

    gene_tree = PhyloTree(gene_newick)
    species_tree = PhyloTree(species_newick)

    # reconcile() hands back the reconciled tree plus the detected events;
    # only the tree is needed here.
    reconciled, _events = gene_tree.reconcile(species_tree)
    reconciled.link_to_alignment(alg)

    style = TreeStyle()
    return reconciled, style
Esempio n. 3
0
def get_example_tree():
    """Return a reconciled example tree and a default tree style.

    A hard-coded gene tree is reconciled against a hard-coded species tree;
    the result is linked to ``alg`` (an alignment expected to exist at module
    level) before being returned.

    Returns:
        tuple: ``(reconciled_tree, TreeStyle instance)``.
    """
    newicks = {
        "gene": '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));',
        "species": "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);",
    }
    gtree = PhyloTree(newicks["gene"])
    stree = PhyloTree(newicks["species"])

    # Tree reconciliation: the second item (the events) is not used.
    outcome = gtree.reconcile(stree)
    rtree, _unused_events = outcome
    rtree.link_to_alignment(alg)
    return rtree, TreeStyle()
Esempio n. 4
0
  def test_tree_annotation(self):
    """Annotate a three-taxid tree with NCBI data and check two nodes.

    Verifies the root name, then the scientific name, taxid, rank and both
    lineage representations of the human leaf (9606) and of its parent node.
    """
    t = PhyloTree("((9598, 9606), 10090);")
    t.annotate_ncbi_taxa()
    self.assertEqual(t.sci_name, 'Euarchontoglires')

    # Expected lineage shared by the parent node and, as a prefix, by the
    # human leaf.
    homininae_names = [
        u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
        u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
        u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
        u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
        u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
        u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
        u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
        u'Hominoidea', u'Hominidae', u'Homininae'
    ]
    homininae_taxids = [
        1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
        7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
        32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
        9604, 207598
    ]

    homi = (t & '9606').up
    self.assertEqual(homi.sci_name, 'Homininae')
    self.assertEqual(homi.taxid, 207598)
    self.assertEqual(homi.rank, 'subfamily')
    self.assertEqual(homi.named_lineage, homininae_names)
    self.assertEqual(homi.lineage, homininae_taxids)

    human = t & '9606'
    # The genus/species names were garbled to 'H**o' in the expected values;
    # the scientific names are 'Homo' and 'Homo sapiens'.
    self.assertEqual(human.sci_name, 'Homo sapiens')
    self.assertEqual(human.taxid, 9606)
    self.assertEqual(human.rank, 'species')
    self.assertEqual(human.named_lineage,
                     homininae_names + [u'Homo', u'Homo sapiens'])
    self.assertEqual(human.lineage, homininae_taxids + [9605, 9606])
Esempio n. 5
0
    def test_tree_annotation(self):
        """Annotate a three-taxid tree with NCBI data and check two nodes.

        Checks the root's scientific name, then the name, taxid, rank and
        lineages of the human leaf (9606) and of its direct parent.
        """
        t = PhyloTree("((9598, 9606), 10090);")
        t.annotate_ncbi_taxa()
        self.assertEqual(t.sci_name, 'Euarchontoglires')

        # Parent of the human leaf.
        homi = (t & '9606').up
        self.assertEqual(homi.sci_name, 'Homininae')
        self.assertEqual(homi.taxid, 207598)
        self.assertEqual(homi.rank, 'subfamily')
        self.assertEqual(homi.named_lineage, [
            u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
            u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
            u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
            u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
            u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
            u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
            u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
            u'Hominoidea', u'Hominidae', u'Homininae'
        ])
        self.assertEqual(homi.lineage, [
            1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
            7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
            32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
            9604, 207598
        ])

        human = t & '9606'
        # The expected names previously read 'H**o' (garbled text); the
        # scientific names are 'Homo' and 'Homo sapiens'.
        self.assertEqual(human.sci_name, 'Homo sapiens')
        self.assertEqual(human.taxid, 9606)
        self.assertEqual(human.rank, 'species')
        self.assertEqual(human.named_lineage, [
            u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
            u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
            u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
            u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
            u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
            u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
            u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
            u'Hominoidea', u'Hominidae', u'Homininae', u'Homo',
            u'Homo sapiens'
        ])
        self.assertEqual(human.lineage, [
            1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
            7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
            32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
            9604, 207598, 9605, 9606
        ])
Esempio n. 6
0
    def get_tree(self, protid, method, phylome_id):
        """Look up the tree built with *method* for *protid* in a phylome.

        The first three characters of *protid* are matched against the
        ``species`` column and the remainder against the ``protid`` column.

        Returns:
            tuple: ``(PhyloTree, likelihood as float)`` when the query
            reports a match, otherwise ``(None, None)``.
        """
        query = 'SELECT newick,lk FROM %s WHERE phylome_id=%s AND species="%s" AND protid="%s" AND method ="%s"' % (
            self._trees_table, phylome_id, protid[:3], protid[3:], method)

        # execute() is treated as a hit counter: a falsy result means no row.
        if not self._SQL.execute(query):
            return None, None

        row = self._SQL.fetchone()
        newick = row[0]
        likelihood = float(row[1])
        return PhyloTree(newick), likelihood
Esempio n. 7
0
    def get_best_tree(self, protid, phylome_id):
        """Return the best-likelihood tree stored for *protid* in a phylome.

        All rows matching the phylome, the species (first three characters of
        *protid*) and the protein id (remaining characters) are fetched. Only
        rows with a negative likelihood are considered; among those the
        highest likelihood wins.

        Returns:
            tuple: ``(winner_model, likelihoods, tree)`` where *likelihoods*
            maps each considered method to its likelihood and *tree* is a
            ``PhyloTree`` of the winning newick, or ``None`` when there is no
            winner or its newick is empty.
        """
        likelihoods = {}
        winner_model = None
        winner_lk = None
        winner_newick = None

        command = 'SELECT newick,method,lk FROM %s WHERE phylome_id=%s AND species="%s" and protid="%s";' \
            % (self._trees_table, phylome_id, protid[:3], protid[3:])
        self._SQL.execute(command)

        for nw, m, lk in self._SQL.fetchall():
            # Rows whose likelihood is not negative are ignored.
            if not lk < 0:
                continue
            likelihoods[m] = lk
            # Identity test: ``== None`` would defer to the value's __eq__.
            if winner_lk is None or lk > winner_lk:
                winner_lk = lk
                winner_model = m
                winner_newick = nw

        t = PhyloTree(winner_newick) if winner_newick else None
        return winner_model, likelihoods, t
Esempio n. 8
0
    '''
    layout for CodemlTree
    '''
    if hasattr(node, "collapsed"):
        if node.collapsed == 1:
            node.img_style["draw_descendants"]= False
    if node.is_leaf():
        if hasattr (node, "sequence"):
            seqface =  MySequenceFace(node.sequence, "nt",
                                      fsize=10,
                                      col_w=11, interactive=True)
            faces.add_face_to_node(seqface, node, 1, aligned=True)

            
# Demo entry point: builds a small three-leaf tree, attaches sequence data to
# it and resets the root branch length.
if __name__ == "__main__":
    # Unrooted example topology with three named leaves.
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    # Alignment passed inline as FASTA-style text, one record per leaf name.
    tree.link_to_alignment("""
                           >Chimp
                           HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    # Nucleotide sequence for each leaf, keyed by leaf name.
    nt_sequences = {"Human"    : "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
                    "Chimp"    : "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
                    "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
                }
    # ``tree & name`` is used here to fetch the node with that name
    # (presumably ETE's search shortcut) and store its nucleotide sequence
    # in the ``nt_sequence`` attribute.
    for l in nt_sequences:
        (tree & l).nt_sequence = nt_sequences[l]
    # Zero the root's branch length.
    tree.dist = 0
Esempio n. 9
0
     #    tree.run_model("fb")
     #    tree.run_model("M2")
     #except:
     #    pass
     tree.dist = 0
     ts = TreeStyle()
     ts.title.add_face(TextFace(
         "Example for EvolTree, interactivity shows codons", fsize=15),
                       column=0)
     ts.layout_fn = test_layout_evol
     #try:
     #    tree.show(tree_style=ts, histfaces=["M2"])
     #except:
     tree.show(tree_style=ts)
 except:
     tree = PhyloTree('(Orangutan,Human,Chimp);')
     tree.link_to_alignment("""
                            >Chimp
                            HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                            >Orangutan
                            DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                            >Human
                            DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                            """)
     nt_sequences = {
         "Human":
         "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
         "Chimp":
         "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
         "Orangutan":
         "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
Esempio n. 10
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    parser.add_argument("--show",
                        dest="show_tree",
                        action="store_true",
                        help="""Display tree after the analysis.""")

    parser.add_argument("--render",
                        dest="render",
                        action="store_true",
                        help="""Render tree.""")

    parser.add_argument("--dump",
                        dest="dump",
                        action="store_true",
                        help="""Dump analysis""")

    parser.add_argument(
        "--explore",
        dest="explore",
        type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t",
                            "--tree",
                            dest="target_tree",
                            nargs="+",
                            type=str,
                            help="""Tree file in newick format""")

    input_args.add_argument("-tf",
                            dest="tree_list_file",
                            type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax",
                        dest="tax_info",
                        type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument(
        "--sp_delimiter",
        dest="sp_delimiter",
        type=str,
        help=
        "If taxid is part of the leaf name, delimiter used to split the string"
    )

    parser.add_argument(
        "--sp_field",
        dest="sp_field",
        type=int,
        default=0,
        help="field position for taxid after splitting leaf names")

    parser.add_argument("--ref",
                        dest="ref_tree",
                        type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only",
                        dest="rf_only",
                        action="store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        type=str,
        nargs="+",
        help="A list of node names defining the trees outgroup")

    parser.add_argument("--is_sptree",
                        dest="is_sptree",
                        action="store_true",
                        help="Assumes no duplication nodes in the tree")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")

    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")

    parser.add_argument("--dump_tax_info",
                        dest="dump_tax_info",
                        action="store_true",
                        help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >> sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        print >> sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >> sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Clade sizes", "RF (avg)", "RF (med)",
              "RF (std)", "RF (max)", "Shared tips")
    print >> OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >> sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees = t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(
                t, subtrees, show_tree=SHOW_TREE)

            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf

        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0  # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" % ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(
                        set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append(
                        (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes),
                                               numpy.median(broken_sizes),
                                               numpy.std(broken_sizes))
        iter_values = [
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std,
            rf_max, common_names
        ]
        print >> OUT, '|'.join(
            map(lambda x: str(x).strip().ljust(15), iter_values))
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems),
                                "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" % fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %
                  problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, fixed_string,
            problems_string
        ])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1

                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))

    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes",
              "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)

    if args.output:
        OUT.close()
Esempio n. 11
0
def main(argv):
    
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                            formatter_class=argparse.RawDescriptionHelpFormatter)


    input_args = parser.add_argument_group("INPUT OPTIONS")
    input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*",
                   help='a list of source tree files')
    
    input_args.add_argument("--source_file", dest="source_file", 
                        type=str, 
                        help="""path to a file containing many source trees, one per line""")

    input_args.add_argument("-r", dest="reftree", 
                        type=str, required=True,
                        help="""Reference tree""")

    input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", 
                            type=str, default="name",
                            help=("attribute in ref tree used as leaf name"))
    
    input_args.add_argument("--src_tree_attr", dest="src_tree_attr", 
                            type=str, default="name",
                            help=("attribute in source tree used as leaf name"))

    input_args.add_argument("--min_support_ref",
                            type=float, default=0.0,
                        help=("min support for branches to be considered from the ref tree"))
    input_args.add_argument("--min_support_src",
                        type=float, default=0.0,
                        help=("min support for branches to be considered from the source tree"))

    
    output_args = parser.add_argument_group("OUTPUT OPTIONS")
    
    output_args.add_argument("-o", dest="output", 
                            type=str,
                            help="""Path to the tab delimited report file""")

    
    opt_args = parser.add_argument_group("DISTANCE OPTIONS")
    

    opt_args.add_argument("--outgroup", dest="outgroup", 
                        nargs = "+",
                        help="""outgroup used to root reference and source trees before distance computation""")
  
    opt_args.add_argument("--expand_polytomies", dest="polytomies", 
                        action = "store_true",
                        help="""expand politomies if necessary""")
  
    opt_args.add_argument("--unrooted", dest="unrooted", 
                        action = "store_true",
                        help="""compare trees as unrooted""")

    opt_args.add_argument("--min_support", dest="min_support", 
                        type=float, default=0.0,
                        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    opt_args = parser.add_argument_group("PHYLOGENETICS OPTIONS")
    
    opt_args.add_argument("--extract_species",
                        action = "store_true",
                        help="When used, leaf names in the reference and source trees are assumed to represent species."
                          " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name,"
                          " it can be automatically extracted by providing a Perl regular expression that extract a "
                          " valid species code (see --sp_regexp). Such information will be also used to detect duplication"
                          " events. ")

    opt_args.add_argument("--sp_regexp", 
                          type=str,
                         help=("Specifies a Perl regular expression to automatically extract species names"
                          " from the name string in source trees. If not used, leaf names are assumed to represent species names."
                          " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'."))
        
    opt_args.add_argument("--collateral", 
                        action='store_true', 
                        help=(""))

    
    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.source_file and args.source_trees:
        print >>sys.stderr, 'The use of targets_file and targets at the same time is not supported.'
        sys.exit(1)
        
    if args.source_file:
        source_trees = tree_iterator(args.source_file)
    else:
        source_trees = args.source_trees
        
    ref_tree = Tree(reftree)

    if args.ref_tree_attr:
        for lf in ref_tree.iter_leaves():
            lf._origname = lf.name
            if args.ref_tree_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_tree_attr)
    
    if args.outgroup:
        if len(args.outgroup) > 1:
            out = ref_tree.get_common_ancestor(args.outgroup)
        else:
            out = ref_tree.search_nodes(name=args.outgroup[0])[0]
        ref_tree.set_outgroup(out)
                     

    HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist")
    if args.output:
        OUT = open(args.output, "w")
        print >>OUT, '# ' + ctime()
        print >>OUT, '# ' + ' '.join(sys.argv) 
        print >>OUT, '#'+'\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv) 
        COL_WIDTHS = [20, 20] + [9] * 10
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')
        
                
    prev_tree = None
    ref_fname = os.path.basename(args.reftree)
    for counter, tfile in enumerate(source_trees):
        if args.source_file:
            seedid, tfile = tfile
        else:
            seedid = None
           
        if args.extract_species:

            if args.sp_regexp:
                SPMATCHER = re.compile(args.sp_regexp)
                get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0]
            else:
                get_sp_name = lambda x: x
                
            tt = PhyloTree(tfile, sp_naming_function = get_sp_name)
        else:
            tt = Tree(tfile)

        if args.src_tree_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.src_tree_attr)
            
        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)
        
        if args.source_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' %counter                          


            
        r = tt.compare(ref_tree, 
                       ref_tree_attr=args.ref_tree_attr,
                       source_tree_attr=args.src_tree_attr,
                       min_support_ref=args.min_support_ref,
                       min_support_source = args.min_support_src,
                       unrooted=args.unrooted,
                       has_duplications=args.extract_species)

                          

        print_table([map(istr, [fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'],
                               r['rf'], r['max_rf'], r["source_edges_in_ref"],
                               r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist']])],
                    fix_col_width = COL_WIDTHS, wrap_style='cut')
                          

    if args.output:
        OUT.close()
Esempio n. 12
0
    '''
    if hasattr(node, "collapsed"):
        if node.collapsed == 1:
            node.img_style["draw_descendants"] = False
    if node.is_leaf():
        if hasattr(node, "sequence"):
            seqface = MySequenceFace(node.sequence,
                                     "nt",
                                     fsize=10,
                                     col_w=11,
                                     interactive=True)
            faces.add_face_to_node(seqface, node, 1, aligned=True)


if __name__ == "__main__":
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment("""
                           >Chimp
                           HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    nt_sequences = {
        "Human":
        "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
        "Chimp":
        "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
        "Orangutan":
        "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
from ete_dev import PhyloTree
# Build an example gene tree from its newick representation. Leaf names
# follow a "<species code>_<gene number>" pattern (e.g. Hsa_001).
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),
(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# Print the ASCII representation of the tree; expected output:
print t
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# To obtain all the evolutionary events involving a given leaf node we
# use the get_my_evol_events method. First, look up the node(s) named
# "Hsa_001"; the search result is kept in `matches`.
matches = t.search_nodes(name="Hsa_001")
Esempio n. 14
0
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    #header = "filename", "refname", "# subtrees", "# dups", "broken subtrees", "ncbi_mistakes", "RF", "avg RF", "RF std", "max RF", "")
    #print '\t'.join(header)
    header = ("Tree".center(50), "Total subtrees", "Broken subtrees",
              "Broken NCBI clades", "RF (avg)", "RF (med)", "RF (std)",
              "RF (max possible)")
    print >> OUT, "#" + ' '.join([h.center(15) for h in header])
    for tfile in target_trees:
        print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
"""
iphylip_txt = """
 4 76
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta")

#We can now access the sequence of every leaf node
print "These are the nodes and its sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"
for leaf in t.iter_leaves():
Esempio n. 16
0
 def test_ncbi_compare(self):
     """Build a tree with numeric leaf labels and annotate it with NCBI taxa.

     The leaf labels (9606, 9598, 10090) are presumably NCBI taxids; each
     node's ``name`` attribute is used as its species identifier.
     """
     newick = "((9606, (9598, 9606)), 10090);"
     species_from_name = lambda node: node.name
     t = PhyloTree(newick, sp_naming_function=species_from_name)
     t.annotate_ncbi_taxa()
Esempio n. 17
0
from ete_dev import PhyloTree, PhylomeDBConnector, SeqGroup

p = PhylomeDBConnector()
w,x, t =  p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)
A = SeqGroup(a, "iphylip")
for s in A.id2seq:
    A.id2seq[s]=A.id2seq[s][:30]
t.link_to_alignment(A)
print t.get_species()
print t
t.set_outgroup(t&"Ddi0002240")

sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);")
reconciled, evs = t.reconcile(sp)
print reconciled
reconciled.show()
Esempio n. 18
0
def main(argv):
    
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",  dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')
    
    input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+",  
                        type=int, 
                        help="""taxids (space separated)""")

    input_args.add_argument("-if", "--taxid_file", dest="taxid_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")

    input_args.add_argument("-t", "--reftree", dest="reftree",   
                        type=str, 
                        help="""Read taxids from the provided tree.""")
    
    input_args.add_argument("--reftree_attr", dest="reftree_attr",   
                        type=str, default="name",
                        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')
    
    name_input_args.add_argument("-n", "--name", dest="names", nargs="+",  
                        type=str, 
                        help="""species or taxa names (comma separated)""")

    name_input_args.add_argument("-nf", "--names_file", dest="names_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")
    name_input_args.add_argument("--fuzzy", dest="fuzzy", type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')
    
    output_args.add_argument("-x", "--taxonomy", dest="taxonomy",   
                        type=str, 
                        help=("dump a pruned version of the NCBI taxonomy"
                              " tree containing target species into the specified file"))

    output_args.add_argument("-l", "--list", dest="info_list",   
                        type=str, 
                        help="""dump NCBI taxonmy information for each target species into the specified file. """)

    output_args.add_argument("-a", "--annotated", dest="annotated_tree",   
                        type=str, 
                        help="dump the annotated tree of the input reftree provided with -t into the specified file.")                             
    
    output_args.add_argument("--collapse_subspecies", dest="collapse_subspecies",   
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    output_args.add_argument("--rank_limit", dest="rank_limit",   
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))
    
    output_args.add_argument("--full_lineage", dest="full_lineage",   
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))
        
    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')
        
    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')
        
    
    ncbi = NCBITaxa(args.dbfile)
        
    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(map(strip, open(args.names_file, "rU").read().split("\n")))
        
    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))
        
    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations in %s.name_translation.txt ... ")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" %sim
                    
        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(map(str, [score, name, realname.capitalize(), taxid]))
            
    if args.taxid_file:
        all_taxids.extend(map(strip, open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)
        
    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()])))
       
    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list+".info.txt"
        log.info("Dumping %d taxid translations in %s ..." %(len(all_taxids), outfile))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        OUT = open(outfile, "w")
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print >>OUT, "\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ]))
            
        OUT.close()    
        for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()):
            print >>sys.stderr, notfound, "NOT FOUND"
    
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." %(len(all_taxids), args.taxonomy))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
            
        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse() if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" %n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")
                
            
        t.write(format=9, outfile=args.taxonomy+".names.nw")
        t.write(format=8, outfile=args.taxonomy+".allnames.nw")
        t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"],
                outfile=args.taxonomy+".full_annotation.nw")
        
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy+".taxids.nw")
        t.write(format=8, outfile=args.taxonomy+".alltaxids.nw")


    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name = translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi._translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)
            
        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
Esempio n. 19
0
from ete_dev import PhyloTree
# Reads a phylogenetic tree (using default species name encoding)
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));")
#                              /-Hsa_001
#                    /--------|
#                   |          \-Ptr_001
#          /--------|
#         |         |          /-Cfa_001
#         |          \--------|
#---------|                    \-Mms_001
#         |
#         |          /-Dme_001
#          \--------|
#                    \-Dme_002
#
# Prints current leaf names and species codes
print "Deafult mode:"
for n in t.get_leaves():
    print "node:", n.name, "Species name:", n.species
# node: Dme_001 Species name: Dme
# node: Dme_002 Species name: Dme
# node: Hsa_001 Species name: Hsa
# node: Ptr_001 Species name: Ptr
# node: Cfa_001 Species name: Cfa
# node: Mms_001 Species name: Mms
#
# We can also use our own leaf name parsing function to obtain species
# names. All we need to do is create a python function that takes
# node's name as argument and return its corresponding species name.
def get_species_name(node_name_string):
    # Species code is the first part of leaf name (separated by an
Esempio n. 20
0
def my_tree_loader(tree):
    """Tree loader used by the WebTreeApplication object.

    Builds a PhyloTree from ``tree``, passing ``extract_species_code`` as the
    species naming function, and returns the resulting tree.
    """
    return PhyloTree(tree, sp_naming_function=extract_species_code)
Esempio n. 21
0
def main(argv):

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_args = parser.add_argument_group("INPUT OPTIONS")
    input_args.add_argument("source_trees",
                            metavar='source_trees',
                            type=str,
                            nargs="*",
                            help='a list of source tree files')

    input_args.add_argument(
        "--source_file",
        dest="source_file",
        type=str,
        help="""path to a file containing many source trees, one per line""")

    input_args.add_argument("-r",
                            dest="reftree",
                            type=str,
                            required=True,
                            help="""Reference tree""")

    input_args.add_argument("--ref_tree_attr",
                            dest="ref_tree_attr",
                            type=str,
                            default="name",
                            help=("attribute in ref tree used as leaf name"))

    input_args.add_argument(
        "--src_tree_attr",
        dest="src_tree_attr",
        type=str,
        default="name",
        help=("attribute in source tree used as leaf name"))

    input_args.add_argument(
        "--min_support_ref",
        type=float,
        default=0.0,
        help=("min support for branches to be considered from the ref tree"))
    input_args.add_argument(
        "--min_support_src",
        type=float,
        default=0.0,
        help=(
            "min support for branches to be considered from the source tree"))

    output_args = parser.add_argument_group("OUTPUT OPTIONS")

    output_args.add_argument("-o",
                             dest="output",
                             type=str,
                             help="""Path to the tab delimited report file""")

    opt_args = parser.add_argument_group("DISTANCE OPTIONS")

    opt_args.add_argument(
        "--outgroup",
        dest="outgroup",
        nargs="+",
        help=
        """outgroup used to root reference and source trees before distance computation"""
    )

    opt_args.add_argument("--expand_polytomies",
                          dest="polytomies",
                          action="store_true",
                          help="""expand politomies if necessary""")

    opt_args.add_argument("--unrooted",
                          dest="unrooted",
                          action="store_true",
                          help="""compare trees as unrooted""")

    opt_args.add_argument(
        "--min_support",
        dest="min_support",
        type=float,
        default=0.0,
        help=
        ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"
         ))

    opt_args = parser.add_argument_group("PHYLOGENETICS OPTIONS")

    opt_args.add_argument(
        "--extract_species",
        action="store_true",
        help=
        "When used, leaf names in the reference and source trees are assumed to represent species."
        " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name,"
        " it can be automatically extracted by providing a Perl regular expression that extract a "
        " valid species code (see --sp_regexp). Such information will be also used to detect duplication"
        " events. ")

    opt_args.add_argument(
        "--sp_regexp",
        type=str,
        help=
        ("Specifies a Perl regular expression to automatically extract species names"
         " from the name string in source trees. If not used, leaf names are assumed to represent species names."
         " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'."
         ))

    opt_args.add_argument("--collateral", action='store_true', help=(""))

    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.source_file and args.source_trees:
        print >> sys.stderr, 'The use of targets_file and targets at the same time is not supported.'
        sys.exit(1)

    if args.source_file:
        source_trees = tree_iterator(args.source_file)
    else:
        source_trees = args.source_trees

    ref_tree = Tree(reftree)

    if args.ref_tree_attr:
        for lf in ref_tree.iter_leaves():
            lf._origname = lf.name
            if args.ref_tree_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_tree_attr)

    if args.outgroup:
        if len(args.outgroup) > 1:
            out = ref_tree.get_common_ancestor(args.outgroup)
        else:
            out = ref_tree.search_nodes(name=args.outgroup[0])[0]
        ref_tree.set_outgroup(out)

    HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF',
              'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist")
    if args.output:
        OUT = open(args.output, "w")
        print >> OUT, '# ' + ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '#' + '\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv)
        COL_WIDTHS = [20, 20] + [9] * 10
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    prev_tree = None
    ref_fname = os.path.basename(args.reftree)
    for counter, tfile in enumerate(source_trees):
        if args.source_file:
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:

            if args.sp_regexp:
                SPMATCHER = re.compile(args.sp_regexp)
                get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0]
            else:
                get_sp_name = lambda x: x

            tt = PhyloTree(tfile, sp_naming_function=get_sp_name)
        else:
            tt = Tree(tfile)

        if args.src_tree_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.src_tree_attr)

        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)

        if args.source_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' % counter

        r = tt.compare(ref_tree,
                       ref_tree_attr=args.ref_tree_attr,
                       source_tree_attr=args.src_tree_attr,
                       min_support_ref=args.min_support_ref,
                       min_support_source=args.min_support_src,
                       unrooted=args.unrooted,
                       has_duplications=args.extract_species)

        print_table([
            map(istr, [
                fname[-30:], ref_fname[-30:], r['effective_tree_size'],
                r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"],
                r["ref_edges_in_source"], r['source_subtrees'],
                r['treeko_dist']
            ])
        ],
                    fix_col_width=COL_WIDTHS,
                    wrap_style='cut')

    if args.output:
        OUT.close()
Esempio n. 22
0
 def test_ncbi_compare(self):
   """Build a tree with numeric leaf labels and annotate it with NCBI taxa.

   The leaf labels (9606, 9598, 10090) are presumably NCBI taxids; each
   node's ``name`` attribute is used as its species identifier.
   """
   newick = "((9606, (9598, 9606)), 10090);"
   species_from_name = lambda node: node.name
   t = PhyloTree(newick, sp_naming_function=species_from_name)
   t.annotate_ncbi_taxa()
Esempio n. 23
0
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Mms_001
 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Hsa_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_002
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_002
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Hsa_002
 MAEAPDETIQQFM-LTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Mmu_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_001
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_001
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
"""

# Perform a tree reconciliation analysis: the gene tree (leaf names such as
# Hsa_001) is reconciled against the species tree (leaf names such as Hsa).
gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
genetree = PhyloTree(gene_tree_nw)
sptree = PhyloTree(species_tree_nw)
# reconcile() returns the reconciled tree together with a second value,
# kept here as `events` and not used further in this snippet.
recon_tree, events = genetree.reconcile(sptree)
# `alg` is defined earlier in the script (presumably the alignment text
# ending just above -- confirm against the full file).
recon_tree.link_to_alignment(alg)

# Render the reconciled tree to the image file "phylotree.png" (w=750)
recon_tree.render("phylotree.png", w=750)

Esempio n. 24
0
def main(argv):

    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",
                        dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')

    input_args.add_argument("-i",
                            "--taxid",
                            dest="taxid",
                            nargs="+",
                            type=int,
                            help="""taxids (space separated)""")

    input_args.add_argument(
        "-if",
        "--taxid_file",
        dest="taxid_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    input_args.add_argument("-t",
                            "--reftree",
                            dest="reftree",
                            type=str,
                            help="""Read taxids from the provided tree.""")

    input_args.add_argument(
        "--reftree_attr",
        dest="reftree_attr",
        type=str,
        default="name",
        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')

    name_input_args.add_argument(
        "-n",
        "--name",
        dest="names",
        nargs="+",
        type=str,
        help="""species or taxa names (comma separated)""")

    name_input_args.add_argument(
        "-nf",
        "--names_file",
        dest="names_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")
    name_input_args.add_argument(
        "--fuzzy",
        dest="fuzzy",
        type=float,
        help=("Tries a fuzzy (and SLOW) search for those"
              " species names that could not be translated"
              " into taxids. A float number must be provided"
              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')

    output_args.add_argument(
        "-x",
        "--taxonomy",
        dest="taxonomy",
        type=str,
        help=("dump a pruned version of the NCBI taxonomy"
              " tree containing target species into the specified file"))

    output_args.add_argument(
        "-l",
        "--list",
        dest="info_list",
        type=str,
        help=
        """dump NCBI taxonmy information for each target species into the specified file. """
    )

    output_args.add_argument(
        "-a",
        "--annotated",
        dest="annotated_tree",
        type=str,
        help=
        "dump the annotated tree of the input reftree provided with -t into the specified file."
    )

    output_args.add_argument(
        "--collapse_subspecies",
        dest="collapse_subspecies",
        action="store_true",
        help=("When used, all nodes under the the species rank"
              " are collapsed, so all species and subspecies"
              " are seen as sister nodes"))

    output_args.add_argument(
        "--rank_limit",
        dest="rank_limit",
        type=str,
        help=("When used, all nodes under the provided rank"
              " are discarded"))

    output_args.add_argument(
        "--full_lineage",
        dest="full_lineage",
        action="store_true",
        help=("When used, topology is not pruned to avoid "
              " one-child-nodes, so the complete lineage"
              " track leading from root to tips is kept."))

    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')

    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')

    ncbi = NCBITaxa(args.dbfile)

    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(
            map(strip,
                open(args.names_file, "rU").read().split("\n")))

    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))

    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations in %s.name_translation.txt ... ")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(
                map(str,
                    [score, name, realname.capitalize(), taxid]))

    if args.taxid_file:
        all_taxids.extend(
            map(strip,
                open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            list(
                set([
                    getattr(n, args.reftree_attr)
                    for n in reftree.iter_leaves()
                ])))

    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list + ".info.txt"
        log.info("Dumping %d taxid translations in %s ..." %
                 (len(all_taxids), outfile))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        OUT = open(outfile, "w")
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print >> OUT, "\t".join(
                map(str, [
                    merge_conversion.get(int(taxid), taxid), name,
                    named_lineage, lineage
                ]))

        OUT.close()
        for notfound in set(map(str, all_taxids)) - set(
                str(k) for k in translator.iterkeys()):
            print >> sys.stderr, notfound, "NOT FOUND"

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." %
                 (len(all_taxids), args.taxonomy))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [
                n for n in t.traverse() if n.rank == "species"
                if int(n.taxid) in all_taxids
            ]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        t.write(format=9, outfile=args.taxonomy + ".names.nw")
        t.write(format=8, outfile=args.taxonomy + ".allnames.nw")
        t.write(format=9,
                features=[
                    "taxid", "name", "rank", "bgcolor", "sci_name",
                    "collapse_subspecies", "named_lineage"
                ],
                outfile=args.taxonomy + ".full_annotation.nw")

        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy + ".taxids.nw")
        t.write(format=8, outfile=args.taxonomy + ".alltaxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi._translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
from ete_dev import PhyloTree
# Load an example gene tree from a newick string. Leaf names follow the
# "<species code>_<gene id>" pattern (e.g. Hsa_001, Ptr_002).
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),
(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# Print the tree; the ASCII rendering shown in the comment below is the
# expected output.
print t
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# To obtain all the evolutionary events involving a given leaf node we
# use the get_my_evol_events method. The leaf has to be located first:
# search_nodes collects the nodes whose name is "Hsa_001".
matches = t.search_nodes(name="Hsa_001")
Esempio n. 26
0
def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    """Yield one NCBI-annotated, re-rooted PhyloTree per newick line of a file.

    Blank lines and lines starting with ``#`` are skipped.  For every other
    line the tree is loaded, each leaf keeps its original name in the
    ``coded_name`` feature (and gets a ``"name {species}"`` label when the
    name is a key of ``NAME2SP``), leaves receive a ``taxid`` feature, and
    the tree is rooted on the common ancestor of all leaves whose
    ``named_lineage`` contains ``outgroup_lineage``.

    Args:
        trees_file: path of a text file holding one newick tree per line.
        outgroup_lineage: lineage name used to pick the outgroup leaves.

    Yields:
        The annotated PhyloTree built from each tree line.

    NOTE(review): the leaf with taxid 9606 is looked up with ``[0]``, so a
    tree without such a leaf raises IndexError; what happens when no leaf
    matches ``outgroup_lineage`` depends on ``get_common_ancestor`` and
    should be confirmed.
    """
    # The context manager guarantees the file is closed once the generator
    # is exhausted or discarded (the bare open() used to leak the handle).
    with open(trees_file) as trees:
        for line in trees:
            if not line.strip() or line.startswith('#'):
                continue

            t = PhyloTree(line, sp_naming_function=spname)

            for lf in t:
                # Keep the original identifier before decorating the name.
                lf.add_features(coded_name=lf.name)
                if lf.name in NAME2SP:
                    lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])

            t.dist = 0
            ncbi.connect_database()
            name2sp = ncbi.get_name_translator(t.get_species())
            for lf in t.iter_leaves():
                # Species missing from the translator get taxid 0.
                lf.add_features(taxid=name2sp.get(lf.species, 0))

            # First root on the leaf with taxid 9606 and annotate, which
            # provides the named_lineage read below; then root on the
            # requested outgroup and annotate again for the new topology.
            t.set_outgroup(t.search_nodes(taxid=9606)[0])
            ncbi.annotate_tree(t, attr_name='taxid')
            outgroup_leaves = [
                lf for lf in t if outgroup_lineage in lf.named_lineage
            ]
            t.set_outgroup(t.get_common_ancestor(outgroup_leaves))
            ncbi.annotate_tree(t, attr_name='taxid')

            yield t
Esempio n. 27
0
import sys
from collections import defaultdict
from ete_dev import PhyloTree

# Use the tree given as first command line argument when there is one;
# otherwise fall back to a built-in demo tree.
if len(sys.argv) > 1:
    t = PhyloTree(sys.argv[1])
else:
    # Empty tree kept as the starting point for the alternative
    # (commented-out) random-population inputs below.
    t = PhyloTree()
    #t.populate(5000, reuse_names=True, names_library=map(lambda x: "%03d" %x, range(100)))

    #t.populate(5000, reuse_names=True, names_library=["aaa", "bbb", "ccc","dddd"])
    #t.set_species_naming_function(lambda x: x[:3])
    #t = PhyloTree("((((Kla0008018:0.226825,(Kwa0003593:0.270871,(((((((Sce0006606:0.020101,(Smi0000169:0.045626,Sku0001100:0.091634)0.9:0.021336)0.473:0.004546,Spa0001368:0)0.806:0.040152,Sba0000063:0.059101)0.967:0.124536,Sca0004780:0.57162)0.36:0.045976,Cgl0005705:0.244154)0.94:0.080608,(((Spa0003632:0.005291,Sce0012358:0.019313)0.879:0.014349,Smi0005102:0.031246)0.028:0.000541,(Sba0002319:0.027948,Sku0001858:0.037758)0.873:0.023849)0.995:0.14497)0.859:0.056767,(Sca0004490:0.235469,Kpo0005032:0.313188)0.699:0.077825)0.807:0.085287)0.523:0.049374)0.606:0.167197,Ago0006484:0.438321)0.976:0.605273,Cal0012751:1.95721)0.975:0.332581,(Cal0010356:0.478947,((Ago0007434:1.13211,Kwa0002043:1.20443)0.282:0.216219,(Skl0001126:0.276168,Cgl0008719:0.5381)0.454:0.191735)0.934:0.438082)0.975:0.332581);")
    #t = PhyloTree("((((((AAA1, AAA2),((BBB1,BBB2), AAA3)D1),(CCC1,CCC2)), AAA8)D2, (((AAA5, AAA6),((BBB5,BBB6), AAA4)D3),(CCC3,CCC4)))D4, D);", format=1)
    # Demo tree actually used; it replaces the empty tree created above and
    # is parsed with format=1.
    t = PhyloTree(
        "((((((((AAA1, AAA2:0.111)a1,(((BBB1,ZZZ1)a2,MMM1)a3,AAA4)a4)a5, AAA3)a6,(AAA4, (AAA5, XXX1)a8)a9)a10,DDD)a11,DDD)a12,DDD)a13,DDD)root;",
        format=1)
    # The demo tree is printed only in this fallback branch.
    print t.get_ascii()

# get_speciation_trees returns three values; judging by the names they are
# the number of trees, the number of duplications and the trees themselves
# (TODO confirm). The "dist" feature is requested through map_features.
ntrees, ndups, sp_trees = t.get_speciation_trees(map_features=["dist"])

# Show every returned tree together with its "dist" attribute.
for sptree in sp_trees:
    print sptree.get_ascii(attributes=["dist"])
Esempio n. 28
0
def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
    
    parser.add_argument("--show", dest="show_tree",
                        action="store_true", 
                        help="""Display tree after the analysis.""")
    
    parser.add_argument("--render", dest="render",
                        action="store_true", 
                        help="""Render tree.""")

    parser.add_argument("--dump", dest="dump",
                        action="store_true", 
                        help="""Dump analysis""")

    parser.add_argument("--explore", dest="explore",
                        type=str,
                        help="""Reads a previously analyzed tree and visualize it""")
    
    input_args = parser.add_mutually_exclusive_group()
    input_args.required=True
    input_args.add_argument("-t", "--tree", dest="target_tree",  nargs="+",
                        type=str,
                        help="""Tree file in newick format""")

    input_args.add_argument("-tf", dest="tree_list_file",
                        type=str, 
                        help="File with the list of tree files")
    
    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str,
                        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument("--sp_field", dest="sp_field", type=int, default=0,
                        help="field position for taxid after splitting leaf names")
    
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only", dest="rf_only",
                        action = "store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument("--outgroup", dest="outgroup",
                        type=str, nargs="+",
                        help="A list of node names defining the trees outgroup")
    
    parser.add_argument("--is_sptree", dest="is_sptree",
                        action = "store_true",
                        help="Assumes no duplication nodes in the tree")
    
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str,
                        help="")
    
    parser.add_argument("--tax2track", dest="tax2track", type=str,
                        help="")
    
    parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true",
                        help="")
    
    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None
    
    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >>sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout 
        ts.mode = "r"
        t.show(tree_style=ts)
        print >>sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()
    
    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >>sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    print >>OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >>sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

        
    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)
        
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
       
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
            
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

          
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE)
            
            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0
            
        ndups = 0
        nsubtrees = len(subtrees)
           
        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf
            
        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0 # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" %ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                       rf_std, rf_max, common_names] 
        print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) 
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems =  sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems), "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" %fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout 
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1
                    
                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))
            
    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER)
            
    if args.output:
        OUT.close()
Esempio n. 29
0
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    #header = "filename", "refname", "# subtrees", "# dups", "broken subtrees", "ncbi_mistakes", "RF", "avg RF", "RF std", "max RF", "")
    #print '\t'.join(header)
    header = ("Tree".center(50), "Total subtrees", "Broken subtrees", "Broken NCBI clades", "RF (avg)", "RF (med)", "RF (std)", "RF (max possible)")
    print >>OUT, "#"+' '.join([h.center(15) for h in header])
    for tfile in target_trees:
        print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
        
Esempio n. 30
0
from ete_dev import PhyloTree

# Loads a gene tree and its corresponding species tree. Note that
# species names in sptree are the first 3 letters of the leaf names in
# genetree (e.g. leaf "Hsa_001" belongs to species "Hsa").
gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
genetree = PhyloTree(gene_tree_nw)
sptree = PhyloTree(species_tree_nw)
# Print the gene tree; the expected ASCII rendering is reproduced below.
print genetree
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# Let's reconcile our genetree with the species tree
Esempio n. 31
0
"""
iphylip_txt = """
 4 76
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
# NOTE(review): fasta_txt is not defined in the visible part of this
# example; presumably it is the text block that ends just above
# iphylip_txt -- confirm.
t = PhyloTree("(((seqA,seqB),seqC),seqD);",
              alignment=fasta_txt,
              alg_format="fasta")

# We can now access the sequence of every leaf node
print "These are the nodes and its sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
# Expected output:
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time; here the tree is
# re-linked to the interleaved phylip data held in iphylip_txt.
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"
Esempio n. 32
0
from ete_dev import PhyloTree
# Creates a gene phylogeny with several duplication events at
# different levels. Note that we are using the default method for
# detecting the species code of leaves (the first three letters of the
# node name are considered the species code).
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((((Hsa_001,Hsa_003),Ptr_001)
,Mmu_001),((Hsa_004,Ptr_004),Mmu_004))),(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# The trailing comma keeps the label and the tree in the same print
# sequence (Python 2 print statement: no newline after the label).
print "Original tree:",
print t
#
#             /-Dme_001
#   /--------|
#  |          \-Dme_002
#  |
#  |                              /-Cfa_001
#  |                    /--------|
#  |                   |          \-Mms_001
#  |                   |
#--|                   |                                        /-Hsa_001
#  |                   |                              /--------|
#  |          /--------|                    /--------|          \-Hsa_003
#  |         |         |                   |         |
#  |         |         |          /--------|          \-Ptr_001
#  |         |         |         |         |
#  |         |         |         |          \-Mmu_001
#  |         |          \--------|
#   \--------|                   |                    /-Hsa_004
#            |                   |          /--------|