def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    """Yield one annotated, re-rooted ``PhyloTree`` per newick line of a file.

    Blank lines and lines starting with ``#`` are skipped.  For every other
    line a tree is built, leaves are tagged with their original name
    (``coded_name``) and a ``taxid`` obtained from the NCBI name translator,
    and the tree is rooted on the common ancestor of the leaves whose
    ``named_lineage`` contains *outgroup_lineage*.

    Args:
        trees_file: path to a text file holding one newick tree per line.
        outgroup_lineage: lineage name used to select the outgroup leaves.

    Yields:
        The processed tree for each non-comment line.

    Raises:
        IndexError: if a tree has no node with ``taxid == 9606`` (the first
            match returned by ``search_nodes`` is indexed unconditionally).
    """
    # ``with`` guarantees the handle is released when the generator is
    # exhausted or closed, instead of leaking it as a bare ``open()`` would.
    with open(trees_file) as trees:
        for line in trees:
            if not line.strip() or line.startswith('#'):
                continue
            t = PhyloTree(line, sp_naming_function=spname)
            for lf in t:
                # Keep the raw name before decorating it for display.
                lf.add_features(coded_name=lf.name)
                if lf.name in NAME2SP:
                    lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])
            t.dist = 0
            ncbi.connect_database()
            name2sp = ncbi.get_name_translator(t.get_species())
            for lf in t.iter_leaves():
                # Species without a translation get taxid 0.
                lf.add_features(taxid=name2sp.get(lf.species, 0))
            # NOTE(review): the tree is first rooted on the taxid-9606 node
            # and annotated, presumably so that ``named_lineage`` exists for
            # the real outgroup selection below -- confirm.
            t.set_outgroup(t.search_nodes(taxid=9606)[0])
            ncbi.annotate_tree(t, attr_name='taxid')
            outgroup_leaves = [lf for lf in t
                               if outgroup_lineage in lf.named_lineage]
            t.set_outgroup(t.get_common_ancestor(outgroup_leaves))
            ncbi.annotate_tree(t, attr_name='taxid')
            yield t
def get_example_tree():
    """Build the demo tree: a gene tree reconciled against a species tree.

    Returns:
        A ``(tree, tree_style)`` pair: the reconciled tree, linked to the
        module-level alignment ``alg``, and a fresh default ``TreeStyle``.
    """
    # Gene family tree and the species tree used to reconcile it.
    gene_family = PhyloTree(
        '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));')
    species = PhyloTree("((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);")

    # Only the reconciled tree is needed; the inferred events are discarded.
    reconciled, _events = gene_family.reconcile(species)
    reconciled.link_to_alignment(alg)

    style = TreeStyle()
    return reconciled, style
def test_tree_annotation(self):
    """Check NCBI annotation of the root, an internal node and a leaf.

    The tree ``((9598, 9606), 10090)`` is annotated with
    ``annotate_ncbi_taxa`` and the scientific name, taxid, rank and both
    lineage representations are compared with the expected values for the
    parent of leaf 9606 and for leaf 9606 itself.
    """
    # Lineage shared by both checked nodes, from the root down to Homininae.
    homininae_names = [
        u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
        u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
        u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
        u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
        u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
        u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
        u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
        u'Hominoidea', u'Hominidae', u'Homininae']
    homininae_taxids = [
        1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
        7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
        32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
        9604, 207598]

    t = PhyloTree("((9598, 9606), 10090);")
    t.annotate_ncbi_taxa()
    self.assertEqual(t.sci_name, 'Euarchontoglires')

    # Parent of the human leaf.
    homi = (t & '9606').up
    self.assertEqual(homi.sci_name, 'Homininae')
    self.assertEqual(homi.taxid, 207598)
    self.assertEqual(homi.rank, 'subfamily')
    self.assertEqual(homi.named_lineage, homininae_names)
    self.assertEqual(homi.lineage, homininae_taxids)

    # The human leaf extends the lineage with genus (9605) and species (9606).
    # The expected names previously read 'H**o' / 'H**o sapiens', a garbled
    # form of the NCBI scientific names that could never match.
    human = t & '9606'
    self.assertEqual(human.sci_name, 'Homo sapiens')
    self.assertEqual(human.taxid, 9606)
    self.assertEqual(human.rank, 'species')
    self.assertEqual(human.named_lineage,
                     homininae_names + [u'Homo', u'Homo sapiens'])
    self.assertEqual(human.lineage, homininae_taxids + [9605, 9606])
def test_tree_annotation(self):
    """Verify ``annotate_ncbi_taxa`` on the tree ``((9598, 9606), 10090)``.

    Asserts the scientific name of the root, then name, taxid, rank,
    ``named_lineage`` and ``lineage`` for the parent of leaf 9606 and for
    leaf 9606.
    """
    t = PhyloTree("((9598, 9606), 10090);")
    t.annotate_ncbi_taxa()
    self.assertEqual(t.sci_name, 'Euarchontoglires')

    # Expected lineage of the node directly above the human leaf.
    parent_names = [
        u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta',
        u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia',
        u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata',
        u'Teleostomi', u'Euteleostomi', u'Sarcopterygii',
        u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia',
        u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
        u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini',
        u'Hominoidea', u'Hominidae', u'Homininae',
    ]
    parent_taxids = [
        1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593,
        7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674,
        32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295,
        9604, 207598,
    ]

    homi = (t & '9606').up
    self.assertEqual(homi.sci_name, 'Homininae')
    self.assertEqual(homi.taxid, 207598)
    self.assertEqual(homi.rank, 'subfamily')
    self.assertEqual(homi.named_lineage, parent_names)
    self.assertEqual(homi.lineage, parent_taxids)

    # The leaf adds two levels below Homininae: genus 9605 and species 9606.
    # These names were previously spelled 'H**o' / 'H**o sapiens', which is
    # a corrupted literal and not a valid NCBI scientific name.
    human = t & '9606'
    self.assertEqual(human.sci_name, 'Homo sapiens')
    self.assertEqual(human.taxid, 9606)
    self.assertEqual(human.rank, 'species')
    self.assertEqual(human.named_lineage,
                     parent_names + [u'Homo', u'Homo sapiens'])
    self.assertEqual(human.lineage, parent_taxids + [9605, 9606])
def get_tree(self, protid, method, phylome_id):
    """ Returns the tree stored for *protid* under the given method.

    The first three characters of *protid* are used as the species code
    and the remainder as the protein id in the query.

    Returns a ``(tree, likelihood)`` pair, or ``(None, None)`` when the
    query execution reports a falsy result.
    """
    species_code = protid[:3]
    protein_number = protid[3:]
    # NOTE(review): the statement is assembled by string interpolation; if
    # any argument can come from untrusted input it should be bound through
    # the driver's parameter mechanism instead.
    query = 'SELECT newick,lk FROM %s WHERE phylome_id=%s AND species="%s" AND protid="%s" AND method ="%s"' % (
        self._trees_table, phylome_id, species_code, protein_number, method)

    if not self._SQL.execute(query):
        return None, None

    row = self._SQL.fetchone()
    newick = row[0]
    likelihood = float(row[1])
    return PhyloTree(newick), likelihood
def get_best_tree(self, protid, phylome_id):
    """ Returns the winner ML tree.

    All trees stored for *protid* in *phylome_id* are fetched; only rows
    with a negative likelihood are considered.  The row with the highest
    likelihood wins (the first one on ties).

    Returns a ``(winner_model, likelihoods, tree)`` triple where
    *likelihoods* maps every considered method to its likelihood.
    *winner_model* and *tree* are ``None`` when no row qualifies; *tree* is
    also ``None`` when the winning newick string is empty.
    """
    likelihoods = {}
    winner_model = None
    winner_lk = None
    winner_newick = None
    t = None
    # NOTE(review): statement built by string interpolation; bind the values
    # through the driver if the ids can come from untrusted input.
    command = 'SELECT newick,method,lk FROM %s WHERE phylome_id=%s AND species="%s" and protid="%s";' \
        % (self._trees_table, phylome_id, protid[:3], protid[3:])
    self._SQL.execute(command)
    for nw, m, lk in self._SQL.fetchall():
        # Non-negative likelihoods are ignored entirely.
        if lk >= 0:
            continue
        likelihoods[m] = lk
        # Identity test against None (was ``== None``).
        if winner_lk is None or lk > winner_lk:
            winner_lk = lk
            winner_model = m
            winner_newick = nw
    if winner_newick:
        t = PhyloTree(winner_newick)
    return winner_model, likelihoods, t
''' layout for CodemlTree ''' if hasattr(node, "collapsed"): if node.collapsed == 1: node.img_style["draw_descendants"]= False if node.is_leaf(): if hasattr (node, "sequence"): seqface = MySequenceFace(node.sequence, "nt", fsize=10, col_w=11, interactive=True) faces.add_face_to_node(seqface, node, 1, aligned=True) if __name__ == "__main__": tree = PhyloTree('(Orangutan,Human,Chimp);') tree.link_to_alignment(""" >Chimp HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA >Orangutan DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP >Human DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA """) nt_sequences = {"Human" : "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG", "Chimp" : "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG", "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG" } for l in nt_sequences: (tree & l).nt_sequence = nt_sequences[l] tree.dist = 0
# tree.run_model("fb") # tree.run_model("M2") #except: # pass tree.dist = 0 ts = TreeStyle() ts.title.add_face(TextFace( "Example for EvolTree, interactivity shows codons", fsize=15), column=0) ts.layout_fn = test_layout_evol #try: # tree.show(tree_style=ts, histfaces=["M2"]) #except: tree.show(tree_style=ts) except: tree = PhyloTree('(Orangutan,Human,Chimp);') tree.link_to_alignment(""" >Chimp HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA >Orangutan DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP >Human DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA """) nt_sequences = { "Human": "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG", "Chimp": "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG", "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
def main(argv): parser = argparse.ArgumentParser( description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo. # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version) # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) ) # const - A constant value required by some action and nargs selections. # default - The value produced if the argument is absent from the command line. # type - The type to which the command-line argument should be converted. # choices - A container of the allowable values for the argument. # required - Whether or not the command-line option may be omitted (optionals only). # help - A brief description of what the argument does. # metavar - A name for the argument in usage messages. # dest - The name of the attribute to be added to the object returned by parse_args(). 
parser.add_argument("--show", dest="show_tree", action="store_true", help="""Display tree after the analysis.""") parser.add_argument("--render", dest="render", action="store_true", help="""Render tree.""") parser.add_argument("--dump", dest="dump", action="store_true", help="""Dump analysis""") parser.add_argument( "--explore", dest="explore", type=str, help="""Reads a previously analyzed tree and visualize it""") input_args = parser.add_mutually_exclusive_group() input_args.required = True input_args.add_argument("-t", "--tree", dest="target_tree", nargs="+", type=str, help="""Tree file in newick format""") input_args.add_argument("-tf", dest="tree_list_file", type=str, help="File with the list of tree files") parser.add_argument("--tax", dest="tax_info", type=str, help="If the taxid attribute is not set in the" " newick file for all leaf nodes, a tab file file" " with the translation of name and taxid can be" " provided with this option.") parser.add_argument( "--sp_delimiter", dest="sp_delimiter", type=str, help= "If taxid is part of the leaf name, delimiter used to split the string" ) parser.add_argument( "--sp_field", dest="sp_field", type=int, default=0, help="field position for taxid after splitting leaf names") parser.add_argument("--ref", dest="ref_tree", type=str, help="Uses ref tree to compute robinson foulds" " distances of the different subtrees") parser.add_argument("--rf-only", dest="rf_only", action="store_true", help="Skip ncbi consensus analysis") parser.add_argument( "--outgroup", dest="outgroup", type=str, nargs="+", help="A list of node names defining the trees outgroup") parser.add_argument("--is_sptree", dest="is_sptree", action="store_true", help="Assumes no duplication nodes in the tree") parser.add_argument("-o", dest="output", type=str, help="Writes result into a file") parser.add_argument("--tax2name", dest="tax2name", type=str, help="") parser.add_argument("--tax2track", dest="tax2track", type=str, help="") 
parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true", help="") args = parser.parse_args(argv) if args.sp_delimiter: GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field] else: GET_TAXID = None reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else "" if args.explore: print >> sys.stderr, "Reading tree from file:", args.explore t = cPickle.load(open(args.explore)) ts = TreeStyle() ts.force_topology = True ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.show(tree_style=ts) print >> sys.stderr, "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) sys.exit() if args.output: OUT = open(args.output, "w") else: OUT = sys.stdout print >> sys.stderr, "Dumping results into", OUT target_trees = [] if args.tree_list_file: target_trees = [line.strip() for line in open(args.tree_list_file)] if args.target_tree: target_trees += args.target_tree prev_tree = None if args.tax2name: tax2name = cPickle.load(open(args.tax2name)) else: tax2name = {} if args.tax2track: tax2track = cPickle.load(open(args.tax2track)) else: tax2track = {} print len(tax2track), len(tax2name) header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips") print >> OUT, '|'.join([h.ljust(15) for h in header]) if args.ref_tree: print >> sys.stderr, "Reading ref tree from", args.ref_tree reft = Tree(args.ref_tree, format=1) else: reft = None SHOW_TREE = False if args.show_tree or args.render: SHOW_TREE = True prev_broken = set() ENTRIES = [] ncbi.connect_database() for tfile in target_trees: #print tfile t = PhyloTree(tfile, sp_naming_function=None) if GET_TAXID: for n in t.iter_leaves(): n.name = GET_TAXID(n.name) if args.outgroup: if len(args.outgroup) == 1: out = t & args.outgroup[0] else: out = t.get_common_ancestor(args.outgroup) if set(out.get_leaf_names()) ^ set(args.outgroup): raise ValueError("Outgroup is not 
monophyletic") t.set_outgroup(out) t.ladderize() if prev_tree: tree_compare(t, prev_tree) prev_tree = t if args.tax_info: tax2name, tax2track = annotate_tree_with_taxa( t, args.tax_info, tax2name, tax2track) if args.dump_tax_info: cPickle.dump(tax2track, open("tax2track.pkl", "w")) cPickle.dump(tax2name, open("tax2name.pkl", "w")) print "Tax info written into pickle files" else: for n in t.iter_leaves(): spcode = n.name n.add_features(taxid=spcode) n.add_features(species=spcode) tax2name, tax2track = annotate_tree_with_taxa( t, None, tax2name, tax2track) # Split tree into species trees #subtrees = t.get_speciation_trees() if not args.rf_only: #print "Calculating tree subparts..." t1 = time.time() if not args.is_sptree: subtrees = t.split_by_dups() #print "Subparts:", len(subtrees), time.time()-t1 else: subtrees = [t] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees( t, subtrees, show_tree=SHOW_TREE) #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf else: subtrees = [] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0 ndups = 0 nsubtrees = len(subtrees) rf = 0 rf_max = 0 rf_std = 0 rf_med = 0 common_names = 0 max_size = 0 if reft and len(subtrees) == 1: rf = t.robinson_foulds(reft, attr_t1="realname") rf_max = rf[1] rf = rf[0] rf_med = rf elif reft: #print "Calculating avg RF..." 
nsubtrees, ndups, subtrees = t.get_speciation_trees( map_features=["taxid"]) #print len(subtrees), "Sub-Species-trees found" avg_rf = [] rf_max = 0.0 # reft.robinson_foulds(reft)[1] sum_size = 0.0 print nsubtrees, "subtrees", ndups, "duplications" for ii, subt in enumerate(subtrees): print "\r%d" % ii, sys.stdout.flush() try: partial_rf = subt.robinson_foulds(reft, attr_t1="taxid") except ValueError: pass else: sptree_size = len( set([n.taxid for n in subt.iter_leaves()])) sum_size += sptree_size avg_rf.append( (partial_rf[0] / float(partial_rf[1])) * sptree_size) common_names = len(partial_rf[3]) max_size = max(max_size, sptree_size) rf_max = max(rf_max, partial_rf[1]) #print partial_rf[:2] rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist rf_std = numpy.std(avg_rf) rf_med = numpy.median(avg_rf) sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes)) iter_values = [ os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std, rf_max, common_names ] print >> OUT, '|'.join( map(lambda x: str(x).strip().ljust(15), iter_values)) fixed = sorted([n for n in prev_broken if n not in broken_clades]) new_problems = sorted(broken_clades - prev_broken) fixed_string = color(', '.join(fixed), "green") if fixed else "" problems_string = color(', '.join(new_problems), "red") if new_problems else "" OUT.write(" Fixed clades: %s\n" % fixed_string) if fixed else None OUT.write(" New broken: %s\n" % problems_string) if new_problems else None prev_broken = broken_clades ENTRIES.append([ os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string ]) OUT.flush() if args.show_tree or args.render: ts = TreeStyle() ts.force_topology = True #ts.tree_width = 500 ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.dist = 0 if args.show_tree: #if args.hide_monophyletic: # 
tax2monophyletic = {} # n2content = t.get_node2content() # for node in t.traverse(): # term2count = defaultdict(int) # for leaf in n2content[node]: # if leaf.lineage: # for term in leaf.lineage: # term2count[term] += 1 # expected_size = len(n2content) # for term, count in term2count.iteritems(): # if count > 1 print "Showing tree..." t.show(tree_style=ts) else: t.render("img.svg", tree_style=ts, dpi=300) print "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) if args.dump: cPickle.dump(t, open("ncbi_analysis.pkl", "w")) print print HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades") print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER) if args.output: OUT.close()
def main(argv): parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) input_args = parser.add_argument_group("INPUT OPTIONS") input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*", help='a list of source tree files') input_args.add_argument("--source_file", dest="source_file", type=str, help="""path to a file containing many source trees, one per line""") input_args.add_argument("-r", dest="reftree", type=str, required=True, help="""Reference tree""") input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", type=str, default="name", help=("attribute in ref tree used as leaf name")) input_args.add_argument("--src_tree_attr", dest="src_tree_attr", type=str, default="name", help=("attribute in source tree used as leaf name")) input_args.add_argument("--min_support_ref", type=float, default=0.0, help=("min support for branches to be considered from the ref tree")) input_args.add_argument("--min_support_src", type=float, default=0.0, help=("min support for branches to be considered from the source tree")) output_args = parser.add_argument_group("OUTPUT OPTIONS") output_args.add_argument("-o", dest="output", type=str, help="""Path to the tab delimited report file""") opt_args = parser.add_argument_group("DISTANCE OPTIONS") opt_args.add_argument("--outgroup", dest="outgroup", nargs = "+", help="""outgroup used to root reference and source trees before distance computation""") opt_args.add_argument("--expand_polytomies", dest="polytomies", action = "store_true", help="""expand politomies if necessary""") opt_args.add_argument("--unrooted", dest="unrooted", action = "store_true", help="""compare trees as unrooted""") opt_args.add_argument("--min_support", dest="min_support", type=float, default=0.0, help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)")) opt_args = 
parser.add_argument_group("PHYLOGENETICS OPTIONS") opt_args.add_argument("--extract_species", action = "store_true", help="When used, leaf names in the reference and source trees are assumed to represent species." " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name," " it can be automatically extracted by providing a Perl regular expression that extract a " " valid species code (see --sp_regexp). Such information will be also used to detect duplication" " events. ") opt_args.add_argument("--sp_regexp", type=str, help=("Specifies a Perl regular expression to automatically extract species names" " from the name string in source trees. If not used, leaf names are assumed to represent species names." " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'.")) opt_args.add_argument("--collateral", action='store_true', help=("")) args = parser.parse_args(argv) print __DESCRIPTION__ reftree = args.reftree if args.source_file and args.source_trees: print >>sys.stderr, 'The use of targets_file and targets at the same time is not supported.' 
sys.exit(1) if args.source_file: source_trees = tree_iterator(args.source_file) else: source_trees = args.source_trees ref_tree = Tree(reftree) if args.ref_tree_attr: for lf in ref_tree.iter_leaves(): lf._origname = lf.name if args.ref_tree_attr not in lf.features: print lf lf.name = getattr(lf, args.ref_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = ref_tree.get_common_ancestor(args.outgroup) else: out = ref_tree.search_nodes(name=args.outgroup[0])[0] ref_tree.set_outgroup(out) HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist") if args.output: OUT = open(args.output, "w") print >>OUT, '# ' + ctime() print >>OUT, '# ' + ' '.join(sys.argv) print >>OUT, '#'+'\t'.join(HEADER) else: print '# ' + ctime() print '# ' + ' '.join(sys.argv) COL_WIDTHS = [20, 20] + [9] * 10 print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap') prev_tree = None ref_fname = os.path.basename(args.reftree) for counter, tfile in enumerate(source_trees): if args.source_file: seedid, tfile = tfile else: seedid = None if args.extract_species: if args.sp_regexp: SPMATCHER = re.compile(args.sp_regexp) get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0] else: get_sp_name = lambda x: x tt = PhyloTree(tfile, sp_naming_function = get_sp_name) else: tt = Tree(tfile) if args.src_tree_attr: for lf in tt.iter_leaves(): lf._origname = lf.name lf.name = getattr(lf, args.src_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = tt.get_common_ancestor(args.outgroup) else: out = tt.search_nodes(name=args.outgroup[0])[0] tt.set_outgroup(out) if args.source_trees: fname = os.path.basename(tfile) else: fname = '%05d' %counter r = tt.compare(ref_tree, ref_tree_attr=args.ref_tree_attr, source_tree_attr=args.src_tree_attr, min_support_ref=args.min_support_ref, min_support_source = args.min_support_src, unrooted=args.unrooted, has_duplications=args.extract_species) print_table([map(istr, 
[fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"], r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist']])], fix_col_width = COL_WIDTHS, wrap_style='cut') if args.output: OUT.close()
''' if hasattr(node, "collapsed"): if node.collapsed == 1: node.img_style["draw_descendants"] = False if node.is_leaf(): if hasattr(node, "sequence"): seqface = MySequenceFace(node.sequence, "nt", fsize=10, col_w=11, interactive=True) faces.add_face_to_node(seqface, node, 1, aligned=True) if __name__ == "__main__": tree = PhyloTree('(Orangutan,Human,Chimp);') tree.link_to_alignment(""" >Chimp HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA >Orangutan DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP >Human DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA """) nt_sequences = { "Human": "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG", "Chimp": "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG", "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
from ete_dev import PhyloTree # Loads an example tree nw = """ ((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)), (Ptr_002,(Hsa_002,Mmu_002)))); """ t = PhyloTree(nw) print t # /-Dme_001 # /--------| # | \-Dme_002 # | # | /-Cfa_001 # | /--------| #---------| | \-Mms_001 # | /--------| # | | | /-Hsa_001 # | | | /--------| # | | \--------| \-Ptr_001 # \--------| | # | \-Mmu_001 # | # | /-Ptr_002 # \--------| # | /-Hsa_002 # \--------| # \-Mmu_002 # # To obtain all the evolutionary events involving a given leaf node we # use get_my_evol_events method matches = t.search_nodes(name="Hsa_001")
tax2name = {} if args.tax2track: tax2track = cPickle.load(open(args.tax2track)) else: tax2track = {} print len(tax2track), len(tax2name) #header = "filename", "refname", "# subtrees", "# dups", "broken subtrees", "ncbi_mistakes", "RF", "avg RF", "RF std", "max RF", "") #print '\t'.join(header) header = ("Tree".center(50), "Total subtrees", "Broken subtrees", "Broken NCBI clades", "RF (avg)", "RF (med)", "RF (std)", "RF (max possible)") print >> OUT, "#" + ' '.join([h.center(15) for h in header]) for tfile in target_trees: print tfile t = PhyloTree(tfile, sp_naming_function=None) if args.outgroup: if len(args.outgroup) == 1: out = t & args.outgroup[0] else: out = t.get_common_ancestor(args.outgroup) if set(out.get_leaf_names()) ^ set(args.outgroup): raise ValueError("Outgroup is not monophyletic") t.set_outgroup(out) t.ladderize() if prev_tree: tree_compare(t, prev_tree) prev_tree = t
MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH """ iphylip_txt = """ 4 76 seqA MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA seqB MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA seqC MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA seqD MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ--- LTNVSHQFMA LTNVSH LTNVSH---- ------ LTNVSH---- ------ -------FMA LTNVSH """ # Load a tree and link it to an alignment. As usual, 'alignment' can # be the path to a file or data in text format. t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta") #We can now access the sequence of every leaf node print "These are the nodes and its sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH #seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH # # The associated alignment can be changed at any time t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip") # Let's check that sequences have changed print "These are the nodes and its re-linked sequences:" for leaf in t.iter_leaves():
def test_ncbi_compare(self):
    """Annotate a tree in which taxid 9606 appears on two leaves.

    The species of each leaf is taken from the ``name`` attribute of the
    object handed to the naming function.  The visible part of this test
    only builds and annotates the tree; it makes no assertions.
    """
    def species_from_name(x):
        return x.name

    newick = "((9606, (9598, 9606)), 10090);"
    t = PhyloTree(newick, sp_naming_function=species_from_name)
    t.annotate_ncbi_taxa()
from ete_dev import PhyloTree, PhylomeDBConnector, SeqGroup p = PhylomeDBConnector() w,x, t = p.get_best_tree("Hsa0000001", 1) a, l = p.get_clean_alg("Hsa0000001", 1) A = SeqGroup(a, "iphylip") for s in A.id2seq: A.id2seq[s]=A.id2seq[s][:30] t.link_to_alignment(A) print t.get_species() print t t.set_outgroup(t&"Ddi0002240") sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);") reconciled, evs = t.reconcile(sp) print reconciled reconciled.show()
def main(argv): parser = ArgumentParser(description=__DESCRIPTION__) parser.add_argument("--db", dest="dbfile", type=str, help="""NCBI sqlite3 db file.""") input_args = parser.add_argument_group('TAXID INPUT OPTIONS') input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+", type=int, help="""taxids (space separated)""") input_args.add_argument("-if", "--taxid_file", dest="taxid_file", type=str, help="""file containing a list of taxids (one per line)""") input_args.add_argument("-t", "--reftree", dest="reftree", type=str, help="""Read taxids from the provided tree.""") input_args.add_argument("--reftree_attr", dest="reftree_attr", type=str, default="name", help="""tree attribute encoding for taxid numbers.""") name_input_args = parser.add_argument_group('NAME INPUT OPTIONS') name_input_args.add_argument("-n", "--name", dest="names", nargs="+", type=str, help="""species or taxa names (comma separated)""") name_input_args.add_argument("-nf", "--names_file", dest="names_file", type=str, help="""file containing a list of taxids (one per line)""") name_input_args.add_argument("--fuzzy", dest="fuzzy", type=float, help=("Tries a fuzzy (and SLOW) search for those" " species names that could not be translated" " into taxids. A float number must be provided" " indicating the minimum string similarity.")) output_args = parser.add_argument_group('OUTPUT OPTIONS') output_args.add_argument("-x", "--taxonomy", dest="taxonomy", type=str, help=("dump a pruned version of the NCBI taxonomy" " tree containing target species into the specified file")) output_args.add_argument("-l", "--list", dest="info_list", type=str, help="""dump NCBI taxonmy information for each target species into the specified file. 
""") output_args.add_argument("-a", "--annotated", dest="annotated_tree", type=str, help="dump the annotated tree of the input reftree provided with -t into the specified file.") output_args.add_argument("--collapse_subspecies", dest="collapse_subspecies", action="store_true", help=("When used, all nodes under the the species rank" " are collapsed, so all species and subspecies" " are seen as sister nodes")) output_args.add_argument("--rank_limit", dest="rank_limit", type=str, help=("When used, all nodes under the provided rank" " are discarded")) output_args.add_argument("--full_lineage", dest="full_lineage", action="store_true", help=("When used, topology is not pruned to avoid " " one-child-nodes, so the complete lineage" " track leading from root to tips is kept.")) args = parser.parse_args(argv) taxid_source = args.taxid or args.taxid_file or args.reftree name_source = args.names or args.names_file if not taxid_source and not name_source: parser.error('At least one input source is required') if taxid_source and name_source: parser.error('taxid and name options are mutually exclusive') if not args.taxonomy and not args.info_list and not args.annotated_tree: parser.error('At least one output option is required') ncbi = NCBITaxa(args.dbfile) all_names = set([]) all_taxids = [] if args.names_file: all_names.update(map(strip, open(args.names_file, "rU").read().split("\n"))) if args.names: all_names.update(map(strip, " ".join(args.names).split(","))) all_names.discard("") #all_names = set([n.lower() for n in all_names]) not_found = set() name2realname = {} name2score = {} if all_names: log.info("Dumping name translations in %s.name_translation.txt ... 
") name2id = ncbi.get_name_translator(all_names) not_found = all_names - set(name2id.keys()) if args.fuzzy and not_found: log.info("%s unknown names", len(not_found)) for name in not_found: # enable extension loading tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy) if tax: name2id[name] = tax name2realname[name] = realname name2score[name] = "Fuzzy:%0.2f" %sim for name in all_names: taxid = name2id.get(name, "???") realname = name2realname.get(name, name) score = name2score.get(name, "Exact:1.0") print "\t".join(map(str, [score, name, realname.capitalize(), taxid])) if args.taxid_file: all_taxids.extend(map(strip, open(args.taxid_file, "rU").read().split("\n"))) if args.taxid: all_taxids.extend(args.taxid) reftree = None if args.reftree: reftree = PhyloTree(args.reftree) all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()]))) if all_taxids and args.info_list: all_taxids = set(all_taxids) all_taxids.discard("") all_taxids, merge_conversion = ncbi._translate_merged(all_taxids) outfile = args.info_list+".info.txt" log.info("Dumping %d taxid translations in %s ..." %(len(all_taxids), outfile)) all_taxids.discard("") translator = ncbi.get_taxid_translator(all_taxids) OUT = open(outfile, "w") for taxid, name in translator.iteritems(): lineage = ncbi.get_sp_lineage(taxid) named_lineage = ','.join(ncbi.translate_to_names(lineage)) lineage = ','.join(map(str, lineage)) print >>OUT, "\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ])) OUT.close() for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()): print >>sys.stderr, notfound, "NOT FOUND" if all_taxids and args.taxonomy: all_taxids = set(all_taxids) all_taxids.discard("") log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." 
%(len(all_taxids), args.taxonomy)) t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit) id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()]) for n in t.traverse(): n.add_features(taxid=n.name) n.add_features(sci_name=str(id2name.get(int(n.name), "?"))) n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name) lineage = ncbi.get_sp_lineage(n.taxid) n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage))) if args.collapse_subspecies: species_nodes = [n for n in t.traverse() if n.rank == "species" if int(n.taxid) in all_taxids] for sp_node in species_nodes: bellow = sp_node.get_descendants() if bellow: # creates a copy of the species node connector = sp_node.__class__() for f in sp_node.features: connector.add_feature(f, getattr(sp_node, f)) connector.name = connector.name + "{species}" for n in bellow: n.detach() n.name = n.name + "{%s}" %n.rank sp_node.add_child(n) sp_node.add_child(connector) sp_node.add_feature("collapse_subspecies", "1") t.write(format=9, outfile=args.taxonomy+".names.nw") t.write(format=8, outfile=args.taxonomy+".allnames.nw") t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"], outfile=args.taxonomy+".full_annotation.nw") for i in t.iter_leaves(): i.name = i.taxid t.write(format=9, outfile=args.taxonomy+".taxids.nw") t.write(format=8, outfile=args.taxonomy+".alltaxids.nw") if all_taxids and reftree: translator = ncbi.get_taxid_translator(all_taxids) for n in reftree.iter_leaves(): if not hasattr(n, "taxid"): n.add_features(taxid=int(getattr(n, args.reftree_attr))) n.add_features(sci_name = translator.get(int(n.taxid), n.name)) lineage = ncbi.get_sp_lineage(n.taxid) named_lineage = '|'.join(ncbi._translate_to_names(lineage)) n.add_features(ncbi_track=named_lineage) print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
from ete_dev import PhyloTree # Reads a phylogenetic tree (using default species name encoding) t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));") # /-Hsa_001 # /--------| # | \-Ptr_001 # /--------| # | | /-Cfa_001 # | \--------| #---------| \-Mms_001 # | # | /-Dme_001 # \--------| # \-Dme_002 # # Prints current leaf names and species codes print "Deafult mode:" for n in t.get_leaves(): print "node:", n.name, "Species name:", n.species # node: Dme_001 Species name: Dme # node: Dme_002 Species name: Dme # node: Hsa_001 Species name: Hsa # node: Ptr_001 Species name: Ptr # node: Cfa_001 Species name: Cfa # node: Mms_001 Species name: Mms # # We can also use our own leaf name parsing function to obtain species # names. All we need to do is create a python function that takes # node's name as argument and return its corresponding species name. def get_species_name(node_name_string): # Species code is the first part of leaf name (separated by an
def my_tree_loader(tree):
    """Build the tree object used by the WebTreeApplication.

    ``tree`` is handed unchanged to ``PhyloTree``; species codes are derived
    from leaf names through ``extract_species_code``.
    """
    return PhyloTree(tree, sp_naming_function=extract_species_code)
def main(argv): parser = argparse.ArgumentParser( description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) input_args = parser.add_argument_group("INPUT OPTIONS") input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*", help='a list of source tree files') input_args.add_argument( "--source_file", dest="source_file", type=str, help="""path to a file containing many source trees, one per line""") input_args.add_argument("-r", dest="reftree", type=str, required=True, help="""Reference tree""") input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", type=str, default="name", help=("attribute in ref tree used as leaf name")) input_args.add_argument( "--src_tree_attr", dest="src_tree_attr", type=str, default="name", help=("attribute in source tree used as leaf name")) input_args.add_argument( "--min_support_ref", type=float, default=0.0, help=("min support for branches to be considered from the ref tree")) input_args.add_argument( "--min_support_src", type=float, default=0.0, help=( "min support for branches to be considered from the source tree")) output_args = parser.add_argument_group("OUTPUT OPTIONS") output_args.add_argument("-o", dest="output", type=str, help="""Path to the tab delimited report file""") opt_args = parser.add_argument_group("DISTANCE OPTIONS") opt_args.add_argument( "--outgroup", dest="outgroup", nargs="+", help= """outgroup used to root reference and source trees before distance computation""" ) opt_args.add_argument("--expand_polytomies", dest="polytomies", action="store_true", help="""expand politomies if necessary""") opt_args.add_argument("--unrooted", dest="unrooted", action="store_true", help="""compare trees as unrooted""") opt_args.add_argument( "--min_support", dest="min_support", type=float, default=0.0, help= ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)" )) opt_args = 
parser.add_argument_group("PHYLOGENETICS OPTIONS") opt_args.add_argument( "--extract_species", action="store_true", help= "When used, leaf names in the reference and source trees are assumed to represent species." " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name," " it can be automatically extracted by providing a Perl regular expression that extract a " " valid species code (see --sp_regexp). Such information will be also used to detect duplication" " events. ") opt_args.add_argument( "--sp_regexp", type=str, help= ("Specifies a Perl regular expression to automatically extract species names" " from the name string in source trees. If not used, leaf names are assumed to represent species names." " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'." )) opt_args.add_argument("--collateral", action='store_true', help=("")) args = parser.parse_args(argv) print __DESCRIPTION__ reftree = args.reftree if args.source_file and args.source_trees: print >> sys.stderr, 'The use of targets_file and targets at the same time is not supported.' 
sys.exit(1) if args.source_file: source_trees = tree_iterator(args.source_file) else: source_trees = args.source_trees ref_tree = Tree(reftree) if args.ref_tree_attr: for lf in ref_tree.iter_leaves(): lf._origname = lf.name if args.ref_tree_attr not in lf.features: print lf lf.name = getattr(lf, args.ref_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = ref_tree.get_common_ancestor(args.outgroup) else: out = ref_tree.search_nodes(name=args.outgroup[0])[0] ref_tree.set_outgroup(out) HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist") if args.output: OUT = open(args.output, "w") print >> OUT, '# ' + ctime() print >> OUT, '# ' + ' '.join(sys.argv) print >> OUT, '#' + '\t'.join(HEADER) else: print '# ' + ctime() print '# ' + ' '.join(sys.argv) COL_WIDTHS = [20, 20] + [9] * 10 print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap') prev_tree = None ref_fname = os.path.basename(args.reftree) for counter, tfile in enumerate(source_trees): if args.source_file: seedid, tfile = tfile else: seedid = None if args.extract_species: if args.sp_regexp: SPMATCHER = re.compile(args.sp_regexp) get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0] else: get_sp_name = lambda x: x tt = PhyloTree(tfile, sp_naming_function=get_sp_name) else: tt = Tree(tfile) if args.src_tree_attr: for lf in tt.iter_leaves(): lf._origname = lf.name lf.name = getattr(lf, args.src_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = tt.get_common_ancestor(args.outgroup) else: out = tt.search_nodes(name=args.outgroup[0])[0] tt.set_outgroup(out) if args.source_trees: fname = os.path.basename(tfile) else: fname = '%05d' % counter r = tt.compare(ref_tree, ref_tree_attr=args.ref_tree_attr, source_tree_attr=args.src_tree_attr, min_support_ref=args.min_support_ref, min_support_source=args.min_support_src, unrooted=args.unrooted, has_duplications=args.extract_species) print_table([ map(istr, [ 
fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"], r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist'] ]) ], fix_col_width=COL_WIDTHS, wrap_style='cut') if args.output: OUT.close()
def test_ncbi_compare(self):
    """Annotate a taxid-named tree in which taxid 9606 appears twice."""
    newick = "((9606, (9598, 9606)), 10090);"
    tree = PhyloTree(newick, sp_naming_function=lambda x: x.name)
    tree.annotate_ncbi_taxa()
MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH >Mms_001 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH >Hsa_001 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH >Ptr_002 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH >Mmu_002 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH >Hsa_002 MAEAPDETIQQFM-LTNVSHNIAVQYLSEFGDLNEAL--------------REEAH >Mmu_001 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH >Ptr_001 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH >Mmu_001 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH """ # Performs a tree reconciliation analysis gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));' species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);" genetree = PhyloTree(gene_tree_nw) sptree = PhyloTree(species_tree_nw) recon_tree, events = genetree.reconcile(sptree) recon_tree.link_to_alignment(alg) # Visualize the reconciled tree recon_tree.render("phylotree.png", w=750)
def main(argv): parser = ArgumentParser(description=__DESCRIPTION__) parser.add_argument("--db", dest="dbfile", type=str, help="""NCBI sqlite3 db file.""") input_args = parser.add_argument_group('TAXID INPUT OPTIONS') input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+", type=int, help="""taxids (space separated)""") input_args.add_argument( "-if", "--taxid_file", dest="taxid_file", type=str, help="""file containing a list of taxids (one per line)""") input_args.add_argument("-t", "--reftree", dest="reftree", type=str, help="""Read taxids from the provided tree.""") input_args.add_argument( "--reftree_attr", dest="reftree_attr", type=str, default="name", help="""tree attribute encoding for taxid numbers.""") name_input_args = parser.add_argument_group('NAME INPUT OPTIONS') name_input_args.add_argument( "-n", "--name", dest="names", nargs="+", type=str, help="""species or taxa names (comma separated)""") name_input_args.add_argument( "-nf", "--names_file", dest="names_file", type=str, help="""file containing a list of taxids (one per line)""") name_input_args.add_argument( "--fuzzy", dest="fuzzy", type=float, help=("Tries a fuzzy (and SLOW) search for those" " species names that could not be translated" " into taxids. A float number must be provided" " indicating the minimum string similarity.")) output_args = parser.add_argument_group('OUTPUT OPTIONS') output_args.add_argument( "-x", "--taxonomy", dest="taxonomy", type=str, help=("dump a pruned version of the NCBI taxonomy" " tree containing target species into the specified file")) output_args.add_argument( "-l", "--list", dest="info_list", type=str, help= """dump NCBI taxonmy information for each target species into the specified file. """ ) output_args.add_argument( "-a", "--annotated", dest="annotated_tree", type=str, help= "dump the annotated tree of the input reftree provided with -t into the specified file." 
) output_args.add_argument( "--collapse_subspecies", dest="collapse_subspecies", action="store_true", help=("When used, all nodes under the the species rank" " are collapsed, so all species and subspecies" " are seen as sister nodes")) output_args.add_argument( "--rank_limit", dest="rank_limit", type=str, help=("When used, all nodes under the provided rank" " are discarded")) output_args.add_argument( "--full_lineage", dest="full_lineage", action="store_true", help=("When used, topology is not pruned to avoid " " one-child-nodes, so the complete lineage" " track leading from root to tips is kept.")) args = parser.parse_args(argv) taxid_source = args.taxid or args.taxid_file or args.reftree name_source = args.names or args.names_file if not taxid_source and not name_source: parser.error('At least one input source is required') if taxid_source and name_source: parser.error('taxid and name options are mutually exclusive') if not args.taxonomy and not args.info_list and not args.annotated_tree: parser.error('At least one output option is required') ncbi = NCBITaxa(args.dbfile) all_names = set([]) all_taxids = [] if args.names_file: all_names.update( map(strip, open(args.names_file, "rU").read().split("\n"))) if args.names: all_names.update(map(strip, " ".join(args.names).split(","))) all_names.discard("") #all_names = set([n.lower() for n in all_names]) not_found = set() name2realname = {} name2score = {} if all_names: log.info("Dumping name translations in %s.name_translation.txt ... 
") name2id = ncbi.get_name_translator(all_names) not_found = all_names - set(name2id.keys()) if args.fuzzy and not_found: log.info("%s unknown names", len(not_found)) for name in not_found: # enable extension loading tax, realname, sim = ncbi.get_fuzzy_name_translation( name, args.fuzzy) if tax: name2id[name] = tax name2realname[name] = realname name2score[name] = "Fuzzy:%0.2f" % sim for name in all_names: taxid = name2id.get(name, "???") realname = name2realname.get(name, name) score = name2score.get(name, "Exact:1.0") print "\t".join( map(str, [score, name, realname.capitalize(), taxid])) if args.taxid_file: all_taxids.extend( map(strip, open(args.taxid_file, "rU").read().split("\n"))) if args.taxid: all_taxids.extend(args.taxid) reftree = None if args.reftree: reftree = PhyloTree(args.reftree) all_taxids.extend( list( set([ getattr(n, args.reftree_attr) for n in reftree.iter_leaves() ]))) if all_taxids and args.info_list: all_taxids = set(all_taxids) all_taxids.discard("") all_taxids, merge_conversion = ncbi._translate_merged(all_taxids) outfile = args.info_list + ".info.txt" log.info("Dumping %d taxid translations in %s ..." % (len(all_taxids), outfile)) all_taxids.discard("") translator = ncbi.get_taxid_translator(all_taxids) OUT = open(outfile, "w") for taxid, name in translator.iteritems(): lineage = ncbi.get_sp_lineage(taxid) named_lineage = ','.join(ncbi.translate_to_names(lineage)) lineage = ','.join(map(str, lineage)) print >> OUT, "\t".join( map(str, [ merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ])) OUT.close() for notfound in set(map(str, all_taxids)) - set( str(k) for k in translator.iterkeys()): print >> sys.stderr, notfound, "NOT FOUND" if all_taxids and args.taxonomy: all_taxids = set(all_taxids) all_taxids.discard("") log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." 
% (len(all_taxids), args.taxonomy)) t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit) id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()]) for n in t.traverse(): n.add_features(taxid=n.name) n.add_features(sci_name=str(id2name.get(int(n.name), "?"))) n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name) lineage = ncbi.get_sp_lineage(n.taxid) n.add_features( named_lineage='|'.join(ncbi.translate_to_names(lineage))) if args.collapse_subspecies: species_nodes = [ n for n in t.traverse() if n.rank == "species" if int(n.taxid) in all_taxids ] for sp_node in species_nodes: bellow = sp_node.get_descendants() if bellow: # creates a copy of the species node connector = sp_node.__class__() for f in sp_node.features: connector.add_feature(f, getattr(sp_node, f)) connector.name = connector.name + "{species}" for n in bellow: n.detach() n.name = n.name + "{%s}" % n.rank sp_node.add_child(n) sp_node.add_child(connector) sp_node.add_feature("collapse_subspecies", "1") t.write(format=9, outfile=args.taxonomy + ".names.nw") t.write(format=8, outfile=args.taxonomy + ".allnames.nw") t.write(format=9, features=[ "taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage" ], outfile=args.taxonomy + ".full_annotation.nw") for i in t.iter_leaves(): i.name = i.taxid t.write(format=9, outfile=args.taxonomy + ".taxids.nw") t.write(format=8, outfile=args.taxonomy + ".alltaxids.nw") if all_taxids and reftree: translator = ncbi.get_taxid_translator(all_taxids) for n in reftree.iter_leaves(): if not hasattr(n, "taxid"): n.add_features(taxid=int(getattr(n, args.reftree_attr))) n.add_features(sci_name=translator.get(int(n.taxid), n.name)) lineage = ncbi.get_sp_lineage(n.taxid) named_lineage = '|'.join(ncbi._translate_to_names(lineage)) n.add_features(ncbi_track=named_lineage) print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    """Yield one taxonomy-annotated PhyloTree per newick line of a file.

    Blank lines and lines starting with ``#`` are skipped. For every other
    line the tree is loaded, each leaf keeps its original name in
    ``coded_name`` (and is renamed to ``"name {species}"`` when listed in
    ``NAME2SP``), leaves receive a ``taxid`` feature (0 when the species
    name is unknown), and the tree is rooted twice: first on the node with
    taxid 9606 so lineages can be annotated, then on the common ancestor of
    all leaves whose ``named_lineage`` contains ``outgroup_lineage``.

    trees_file       -- path to a text file with one newick tree per line.
    outgroup_lineage -- lineage name identifying the outgroup leaves.

    Indexing ``[0]`` on the taxid=9606 search fails (presumably with
    IndexError - confirm ``search_nodes`` returns a list) when a tree has no
    such node.
    """
    # The context manager closes the file once the generator is exhausted
    # or closed, instead of leaking the handle.
    with open(trees_file) as trees:
        for line in trees:
            if not line.strip() or line.startswith('#'):
                continue
            t = PhyloTree(line, sp_naming_function=spname)
            #t.set_outgroup(t.get_midpoint_outgroup())
            for lf in t:
                lf.add_features(coded_name=lf.name)
                if lf.name in NAME2SP:
                    lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])
            t.dist = 0
            ncbi.connect_database()
            name2sp = ncbi.get_name_translator(t.get_species())
            for lf in t.iter_leaves():
                lf.add_features(taxid=name2sp.get(lf.species, 0))
            # Temporary rooting; annotation then provides the named_lineage
            # used to locate the requested outgroup.
            t.set_outgroup(t.search_nodes(taxid=9606)[0])
            ncbi.annotate_tree(t, attr_name='taxid')
            outgroup_leaves = [lf for lf in t
                               if outgroup_lineage in lf.named_lineage]
            t.set_outgroup(t.get_common_ancestor(outgroup_leaves))
            # Annotate again after the final re-rooting.
            ncbi.annotate_tree(t, attr_name='taxid')
            yield t
# Scratch script: prints a gene tree and every tree returned by
# PhyloTree.get_speciation_trees() for it.
import sys
from collections import defaultdict
from ete_dev import PhyloTree

# Load the tree given as first command line argument, or start empty.
# NOTE(review): this tree is replaced by the hard-coded one below before it
# is ever used - confirm whether the command line tree should be kept.
if len(sys.argv) > 1:
    t = PhyloTree(sys.argv[1])
else:
    t = PhyloTree()
    # Alternative inputs kept for manual experiments:
    #t.populate(5000, reuse_names=True, names_library=map(lambda x: "%03d" %x, range(100)))
    #t.populate(5000, reuse_names=True, names_library=["aaa", "bbb", "ccc","dddd"])
    #t.set_species_naming_function(lambda x: x[:3])

#t = PhyloTree("((((Kla0008018:0.226825,(Kwa0003593:0.270871,(((((((Sce0006606:0.020101,(Smi0000169:0.045626,Sku0001100:0.091634)0.9:0.021336)0.473:0.004546,Spa0001368:0)0.806:0.040152,Sba0000063:0.059101)0.967:0.124536,Sca0004780:0.57162)0.36:0.045976,Cgl0005705:0.244154)0.94:0.080608,(((Spa0003632:0.005291,Sce0012358:0.019313)0.879:0.014349,Smi0005102:0.031246)0.028:0.000541,(Sba0002319:0.027948,Sku0001858:0.037758)0.873:0.023849)0.995:0.14497)0.859:0.056767,(Sca0004490:0.235469,Kpo0005032:0.313188)0.699:0.077825)0.807:0.085287)0.523:0.049374)0.606:0.167197,Ago0006484:0.438321)0.976:0.605273,Cal0012751:1.95721)0.975:0.332581,(Cal0010356:0.478947,((Ago0007434:1.13211,Kwa0002043:1.20443)0.282:0.216219,(Skl0001126:0.276168,Cgl0008719:0.5381)0.454:0.191735)0.934:0.438082)0.975:0.332581);")
#t = PhyloTree("((((((AAA1, AAA2),((BBB1,BBB2), AAA3)D1),(CCC1,CCC2)), AAA8)D2, (((AAA5, AAA6),((BBB5,BBB6), AAA4)D3),(CCC3,CCC4)))D4, D);", format=1)

# Tree actually analysed; format=1 so the internal node names (a1..a13,
# root) in the newick string are read.
t = PhyloTree(
    "((((((((AAA1, AAA2:0.111)a1,(((BBB1,ZZZ1)a2,MMM1)a3,AAA4)a4)a5, AAA3)a6,(AAA4, (AAA5, XXX1)a8)a9)a10,DDD)a11,DDD)a12,DDD)a13,DDD)root;",
    format=1)
print t.get_ascii()

# get_speciation_trees returns a (tree count, duplication count, trees)
# triple; "dist" is requested through map_features so it can be shown below.
ntrees, ndups, sp_trees = t.get_speciation_trees(map_features=["dist"])
for sptree in sp_trees:
    print sptree.get_ascii(attributes=["dist"])
def main(argv): parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo. # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version) # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) ) # const - A constant value required by some action and nargs selections. # default - The value produced if the argument is absent from the command line. # type - The type to which the command-line argument should be converted. # choices - A container of the allowable values for the argument. # required - Whether or not the command-line option may be omitted (optionals only). # help - A brief description of what the argument does. # metavar - A name for the argument in usage messages. # dest - The name of the attribute to be added to the object returned by parse_args(). 
parser.add_argument("--show", dest="show_tree", action="store_true", help="""Display tree after the analysis.""") parser.add_argument("--render", dest="render", action="store_true", help="""Render tree.""") parser.add_argument("--dump", dest="dump", action="store_true", help="""Dump analysis""") parser.add_argument("--explore", dest="explore", type=str, help="""Reads a previously analyzed tree and visualize it""") input_args = parser.add_mutually_exclusive_group() input_args.required=True input_args.add_argument("-t", "--tree", dest="target_tree", nargs="+", type=str, help="""Tree file in newick format""") input_args.add_argument("-tf", dest="tree_list_file", type=str, help="File with the list of tree files") parser.add_argument("--tax", dest="tax_info", type=str, help="If the taxid attribute is not set in the" " newick file for all leaf nodes, a tab file file" " with the translation of name and taxid can be" " provided with this option.") parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str, help="If taxid is part of the leaf name, delimiter used to split the string") parser.add_argument("--sp_field", dest="sp_field", type=int, default=0, help="field position for taxid after splitting leaf names") parser.add_argument("--ref", dest="ref_tree", type=str, help="Uses ref tree to compute robinson foulds" " distances of the different subtrees") parser.add_argument("--rf-only", dest="rf_only", action = "store_true", help="Skip ncbi consensus analysis") parser.add_argument("--outgroup", dest="outgroup", type=str, nargs="+", help="A list of node names defining the trees outgroup") parser.add_argument("--is_sptree", dest="is_sptree", action = "store_true", help="Assumes no duplication nodes in the tree") parser.add_argument("-o", dest="output", type=str, help="Writes result into a file") parser.add_argument("--tax2name", dest="tax2name", type=str, help="") parser.add_argument("--tax2track", dest="tax2track", type=str, help="") 
parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true", help="") args = parser.parse_args(argv) if args.sp_delimiter: GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field] else: GET_TAXID = None reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else "" if args.explore: print >>sys.stderr, "Reading tree from file:", args.explore t = cPickle.load(open(args.explore)) ts = TreeStyle() ts.force_topology = True ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.show(tree_style=ts) print >>sys.stderr, "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) sys.exit() if args.output: OUT = open(args.output, "w") else: OUT = sys.stdout print >>sys.stderr, "Dumping results into", OUT target_trees = [] if args.tree_list_file: target_trees = [line.strip() for line in open(args.tree_list_file)] if args.target_tree: target_trees += args.target_tree prev_tree = None if args.tax2name: tax2name = cPickle.load(open(args.tax2name)) else: tax2name = {} if args.tax2track: tax2track = cPickle.load(open(args.tax2track)) else: tax2track = {} print len(tax2track), len(tax2name) header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips") print >>OUT, '|'.join([h.ljust(15) for h in header]) if args.ref_tree: print >>sys.stderr, "Reading ref tree from", args.ref_tree reft = Tree(args.ref_tree, format=1) else: reft = None SHOW_TREE = False if args.show_tree or args.render: SHOW_TREE = True prev_broken = set() ENTRIES = [] ncbi.connect_database() for tfile in target_trees: #print tfile t = PhyloTree(tfile, sp_naming_function=None) if GET_TAXID: for n in t.iter_leaves(): n.name = GET_TAXID(n.name) if args.outgroup: if len(args.outgroup) == 1: out = t & args.outgroup[0] else: out = t.get_common_ancestor(args.outgroup) if set(out.get_leaf_names()) ^ set(args.outgroup): raise ValueError("Outgroup is not 
monophyletic") t.set_outgroup(out) t.ladderize() if prev_tree: tree_compare(t, prev_tree) prev_tree = t if args.tax_info: tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track) if args.dump_tax_info: cPickle.dump(tax2track, open("tax2track.pkl", "w")) cPickle.dump(tax2name, open("tax2name.pkl", "w")) print "Tax info written into pickle files" else: for n in t.iter_leaves(): spcode = n.name n.add_features(taxid=spcode) n.add_features(species=spcode) tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track) # Split tree into species trees #subtrees = t.get_speciation_trees() if not args.rf_only: #print "Calculating tree subparts..." t1 = time.time() if not args.is_sptree: subtrees = t.split_by_dups() #print "Subparts:", len(subtrees), time.time()-t1 else: subtrees = [t] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE) #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf else: subtrees = [] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0 ndups = 0 nsubtrees = len(subtrees) rf = 0 rf_max = 0 rf_std = 0 rf_med = 0 common_names = 0 max_size = 0 if reft and len(subtrees) == 1: rf = t.robinson_foulds(reft, attr_t1="realname") rf_max = rf[1] rf = rf[0] rf_med = rf elif reft: #print "Calculating avg RF..." 
nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"]) #print len(subtrees), "Sub-Species-trees found" avg_rf = [] rf_max = 0.0 # reft.robinson_foulds(reft)[1] sum_size = 0.0 print nsubtrees, "subtrees", ndups, "duplications" for ii, subt in enumerate(subtrees): print "\r%d" %ii, sys.stdout.flush() try: partial_rf = subt.robinson_foulds(reft, attr_t1="taxid") except ValueError: pass else: sptree_size = len(set([n.taxid for n in subt.iter_leaves()])) sum_size += sptree_size avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size) common_names = len(partial_rf[3]) max_size = max(max_size, sptree_size) rf_max = max(rf_max, partial_rf[1]) #print partial_rf[:2] rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist rf_std = numpy.std(avg_rf) rf_med = numpy.median(avg_rf) sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes)) iter_values = [os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std, rf_max, common_names] print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) fixed = sorted([n for n in prev_broken if n not in broken_clades]) new_problems = sorted(broken_clades - prev_broken) fixed_string = color(', '.join(fixed), "green") if fixed else "" problems_string = color(', '.join(new_problems), "red") if new_problems else "" OUT.write(" Fixed clades: %s\n" %fixed_string) if fixed else None OUT.write(" New broken: %s\n" %problems_string) if new_problems else None prev_broken = broken_clades ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string]) OUT.flush() if args.show_tree or args.render: ts = TreeStyle() ts.force_topology = True #ts.tree_width = 500 ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.dist = 0 if args.show_tree: #if args.hide_monophyletic: # tax2monophyletic = 
{} # n2content = t.get_node2content() # for node in t.traverse(): # term2count = defaultdict(int) # for leaf in n2content[node]: # if leaf.lineage: # for term in leaf.lineage: # term2count[term] += 1 # expected_size = len(n2content) # for term, count in term2count.iteritems(): # if count > 1 print "Showing tree..." t.show(tree_style=ts) else: t.render("img.svg", tree_style=ts, dpi=300) print "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) if args.dump: cPickle.dump(t, open("ncbi_analysis.pkl", "w")) print print HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades") print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER) if args.output: OUT.close()
tax2name = cPickle.load(open(args.tax2name)) else: tax2name = {} if args.tax2track: tax2track = cPickle.load(open(args.tax2track)) else: tax2track = {} print len(tax2track), len(tax2name) #header = "filename", "refname", "# subtrees", "# dups", "broken subtrees", "ncbi_mistakes", "RF", "avg RF", "RF std", "max RF", "") #print '\t'.join(header) header = ("Tree".center(50), "Total subtrees", "Broken subtrees", "Broken NCBI clades", "RF (avg)", "RF (med)", "RF (std)", "RF (max possible)") print >>OUT, "#"+' '.join([h.center(15) for h in header]) for tfile in target_trees: print tfile t = PhyloTree(tfile, sp_naming_function=None) if args.outgroup: if len(args.outgroup) == 1: out = t & args.outgroup[0] else: out = t.get_common_ancestor(args.outgroup) if set(out.get_leaf_names()) ^ set(args.outgroup): raise ValueError("Outgroup is not monophyletic") t.set_outgroup(out) t.ladderize() if prev_tree: tree_compare(t, prev_tree) prev_tree = t
from ete_dev import PhyloTree # Loads a gene tree and its corresponding species tree. Note that # species names in sptree are the 3 firs letters of leaf nodes in # genetree. gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));' species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);" genetree = PhyloTree(gene_tree_nw) sptree = PhyloTree(species_tree_nw) print genetree # /-Dme_001 # /--------| # | \-Dme_002 # | # | /-Cfa_001 # | /--------| #---------| | \-Mms_001 # | /--------| # | | | /-Hsa_001 # | | | /--------| # | | \--------| \-Ptr_001 # \--------| | # | \-Mmu_001 # | # | /-Ptr_002 # \--------| # | /-Hsa_002 # \--------| # \-Mmu_002 # # Let's reconcile our genetree with the species tree
""" iphylip_txt = """ 4 76 seqA MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA seqB MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA seqC MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA seqD MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ--- LTNVSHQFMA LTNVSH LTNVSH---- ------ LTNVSH---- ------ -------FMA LTNVSH """ # Load a tree and link it to an alignment. As usual, 'alignment' can # be the path to a file or data in text format. t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta") #We can now access the sequence of every leaf node print "These are the nodes and its sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH #seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH # # The associated alignment can be changed at any time t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip") # Let's check that sequences have changed print "These are the nodes and its re-linked sequences:"
from ete_dev import PhyloTree # Creates a gene phylogeny with several duplication events at # different levels. Note that we are using the default method for # detecting the species code of leaves (three first lettes in the node # name are considered the species code). nw = """ ((Dme_001,Dme_002),(((Cfa_001,Mms_001),((((Hsa_001,Hsa_003),Ptr_001) ,Mmu_001),((Hsa_004,Ptr_004),Mmu_004))),(Ptr_002,(Hsa_002,Mmu_002)))); """ t = PhyloTree(nw) print "Original tree:", print t # # /-Dme_001 # /--------| # | \-Dme_002 # | # | /-Cfa_001 # | /--------| # | | \-Mms_001 # | | #--| | /-Hsa_001 # | | /--------| # | /--------| /--------| \-Hsa_003 # | | | | | # | | | /--------| \-Ptr_001 # | | | | | # | | | | \-Mmu_001 # | | \--------| # \--------| | /-Hsa_004 # | | /--------|