def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    """Yield one rooted, taxid-annotated PhyloTree per newick line of a file.

    Blank lines and lines starting with '#' are skipped. For every other
    line a PhyloTree is built (species names come from ``spname``), leaves
    found in ``NAME2SP`` are renamed to "<name> {<species>}", and each leaf
    gets ``coded_name`` (its original name) and ``taxid`` features.

    :param trees_file: path to a text file with one newick tree per line.
    :param outgroup_lineage: lineage name; the common ancestor of all leaves
        whose ``named_lineage`` contains it becomes the final outgroup.
    :returns: a generator of PhyloTree objects.

    An IndexError is raised if a tree has no node with ``taxid=9606``, since
    the first match of that search is used for the temporary rooting.
    """
    # Opening the file in a ``with`` block releases the handle as soon as the
    # generator is exhausted or closed, instead of leaving it to the GC.
    with open(trees_file) as trees_fh:
        for line in trees_fh:
            if not line.strip() or line.startswith('#'):
                continue

            t = PhyloTree(line, sp_naming_function=spname)
            for lf in t:
                # Keep the original identifier before decorating the name.
                lf.add_features(coded_name=lf.name)
                if lf.name in NAME2SP:
                    lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])
            t.dist = 0

            ncbi.connect_database()
            name2sp = ncbi.get_name_translator(t.get_species())
            for lf in t.iter_leaves():
                # Species without a translation get taxid 0.
                lf.add_features(taxid=name2sp.get(lf.species, 0))

            # Temporary rooting on the first node tagged with taxid 9606.
            t.set_outgroup(t.search_nodes(taxid=9606)[0])
            # NOTE(review): presumably annotate_tree is what sets the
            # ``named_lineage`` attribute read just below -- confirm.
            ncbi.annotate_tree(t, attr_name='taxid')

            outgroup_leaves = [lf for lf in t
                               if outgroup_lineage in lf.named_lineage]
            t.set_outgroup(t.get_common_ancestor(outgroup_leaves))
            # Annotate again after the final re-rooting.
            ncbi.annotate_tree(t, attr_name='taxid')
            yield t
def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    """Generate rooted, taxid-annotated PhyloTrees from a one-tree-per-line file.

    Lines that are blank or begin with '#' are ignored. Each remaining line
    is parsed into a PhyloTree using ``spname`` as species naming function.
    Every leaf keeps its original name in a ``coded_name`` feature, is
    renamed to "<name> {<species>}" when present in ``NAME2SP``, and receives
    a ``taxid`` feature (0 when its species cannot be translated).

    :param trees_file: path of the file holding the newick trees.
    :param outgroup_lineage: lineage name used to pick the final outgroup:
        the common ancestor of every leaf whose ``named_lineage`` contains it.
    :returns: generator yielding one PhyloTree per parsed line.

    Raises IndexError when no node carries ``taxid=9606`` (that node is used
    for a first, temporary rooting).
    """
    # ``with`` guarantees the file handle is closed once iteration ends or
    # the generator is closed; the bare open() used before never closed it.
    with open(trees_file) as handle:
        for raw_line in handle:
            if raw_line.startswith('#') or not raw_line.strip():
                continue

            t = PhyloTree(raw_line, sp_naming_function=spname)
            for leaf in t:
                leaf.add_features(coded_name=leaf.name)
                if leaf.name in NAME2SP:
                    leaf.name = "%s {%s}" % (leaf.name, NAME2SP[leaf.name])
            t.dist = 0

            ncbi.connect_database()
            name2sp = ncbi.get_name_translator(t.get_species())
            for leaf in t.iter_leaves():
                leaf.add_features(taxid=name2sp.get(leaf.species, 0))

            # First rooting: the first node found with taxid 9606.
            human_node = t.search_nodes(taxid=9606)[0]
            t.set_outgroup(human_node)
            # NOTE(review): ``named_lineage`` is read right after this call,
            # so it is presumably populated by annotate_tree -- confirm.
            ncbi.annotate_tree(t, attr_name='taxid')

            # Final rooting on the clade matching the requested lineage.
            lineage_leaves = [leaf for leaf in t
                              if outgroup_lineage in leaf.named_lineage]
            t.set_outgroup(t.get_common_ancestor(lineage_leaves))
            ncbi.annotate_tree(t, attr_name='taxid')
            yield t
seqC MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA seqD MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ--- LTNVSHQFMA LTNVSH LTNVSH---- ------ LTNVSH---- ------ -------FMA LTNVSH """ # Load a tree and link it to an alignment. As usual, 'alignment' can # be the path to a file or data in text format. t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta") #We can now access the sequence of every leaf node print "These are the nodes and its sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH #seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH # # The associated alignment can be changed at any time t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip") # Let's check that sequences have changed print "These are the nodes and its re-linked sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAHQ----------FMALTNVSH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAHQFMALTNVSH---------- #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAHQFMALTNVSHQFMALTNVSH
def main(argv): parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) input_args = parser.add_argument_group("INPUT OPTIONS") input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*", help='a list of source tree files') input_args.add_argument("--source_file", dest="source_file", type=str, help="""path to a file containing many source trees, one per line""") input_args.add_argument("-r", dest="reftree", type=str, required=True, help="""Reference tree""") input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", type=str, default="name", help=("attribute in ref tree used as leaf name")) input_args.add_argument("--src_tree_attr", dest="src_tree_attr", type=str, default="name", help=("attribute in source tree used as leaf name")) input_args.add_argument("--min_support_ref", type=float, default=0.0, help=("min support for branches to be considered from the ref tree")) input_args.add_argument("--min_support_src", type=float, default=0.0, help=("min support for branches to be considered from the source tree")) output_args = parser.add_argument_group("OUTPUT OPTIONS") output_args.add_argument("-o", dest="output", type=str, help="""Path to the tab delimited report file""") opt_args = parser.add_argument_group("DISTANCE OPTIONS") opt_args.add_argument("--outgroup", dest="outgroup", nargs = "+", help="""outgroup used to root reference and source trees before distance computation""") opt_args.add_argument("--expand_polytomies", dest="polytomies", action = "store_true", help="""expand politomies if necessary""") opt_args.add_argument("--unrooted", dest="unrooted", action = "store_true", help="""compare trees as unrooted""") opt_args.add_argument("--min_support", dest="min_support", type=float, default=0.0, help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)")) opt_args = 
parser.add_argument_group("PHYLOGENETICS OPTIONS") opt_args.add_argument("--extract_species", action = "store_true", help="When used, leaf names in the reference and source trees are assumed to represent species." " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name," " it can be automatically extracted by providing a Perl regular expression that extract a " " valid species code (see --sp_regexp). Such information will be also used to detect duplication" " events. ") opt_args.add_argument("--sp_regexp", type=str, help=("Specifies a Perl regular expression to automatically extract species names" " from the name string in source trees. If not used, leaf names are assumed to represent species names." " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'.")) opt_args.add_argument("--collateral", action='store_true', help=("")) args = parser.parse_args(argv) print __DESCRIPTION__ reftree = args.reftree if args.source_file and args.source_trees: print >>sys.stderr, 'The use of targets_file and targets at the same time is not supported.' 
sys.exit(1) if args.source_file: source_trees = tree_iterator(args.source_file) else: source_trees = args.source_trees ref_tree = Tree(reftree) if args.ref_tree_attr: for lf in ref_tree.iter_leaves(): lf._origname = lf.name if args.ref_tree_attr not in lf.features: print lf lf.name = getattr(lf, args.ref_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = ref_tree.get_common_ancestor(args.outgroup) else: out = ref_tree.search_nodes(name=args.outgroup[0])[0] ref_tree.set_outgroup(out) HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist") if args.output: OUT = open(args.output, "w") print >>OUT, '# ' + ctime() print >>OUT, '# ' + ' '.join(sys.argv) print >>OUT, '#'+'\t'.join(HEADER) else: print '# ' + ctime() print '# ' + ' '.join(sys.argv) COL_WIDTHS = [20, 20] + [9] * 10 print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap') prev_tree = None ref_fname = os.path.basename(args.reftree) for counter, tfile in enumerate(source_trees): if args.source_file: seedid, tfile = tfile else: seedid = None if args.extract_species: if args.sp_regexp: SPMATCHER = re.compile(args.sp_regexp) get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0] else: get_sp_name = lambda x: x tt = PhyloTree(tfile, sp_naming_function = get_sp_name) else: tt = Tree(tfile) if args.src_tree_attr: for lf in tt.iter_leaves(): lf._origname = lf.name lf.name = getattr(lf, args.src_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = tt.get_common_ancestor(args.outgroup) else: out = tt.search_nodes(name=args.outgroup[0])[0] tt.set_outgroup(out) if args.source_trees: fname = os.path.basename(tfile) else: fname = '%05d' %counter r = tt.compare(ref_tree, ref_tree_attr=args.ref_tree_attr, source_tree_attr=args.src_tree_attr, min_support_ref=args.min_support_ref, min_support_source = args.min_support_src, unrooted=args.unrooted, has_duplications=args.extract_species) print_table([map(istr, 
[fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"], r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist']])], fix_col_width = COL_WIDTHS, wrap_style='cut') if args.output: OUT.close()
if args.ref_tree: print "Reading ref tree from", args.ref_tree reft = Tree(args.ref_tree, format=1) else: reft = None if args.tax_info: tax2name, tax2track = annotate_tree_with_taxa( t, args.tax_info, tax2name, tax2track) if args.dump_tax_info: cPickle.dump(tax2track, open("tax2track.pkl", "w")) cPickle.dump(tax2name, open("tax2name.pkl", "w")) print "Tax info written into pickle files" else: for n in t.iter_leaves(): spcode = n.name n.add_features(taxid=spcode) n.add_features(species=spcode) tax2name, tax2track = annotate_tree_with_taxa( t, None, tax2name, tax2track) # Split tree into species trees #subtrees = t.get_speciation_trees() if not args.rf_only: print "Calculating tree subparts..." t1 = time.time() if not args.is_sptree: subtrees = t.split_by_dups() print "Subparts:", len(subtrees), time.time() - t1 else:
def main(argv): parser = argparse.ArgumentParser( description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) input_args = parser.add_argument_group("INPUT OPTIONS") input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*", help='a list of source tree files') input_args.add_argument( "--source_file", dest="source_file", type=str, help="""path to a file containing many source trees, one per line""") input_args.add_argument("-r", dest="reftree", type=str, required=True, help="""Reference tree""") input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", type=str, default="name", help=("attribute in ref tree used as leaf name")) input_args.add_argument( "--src_tree_attr", dest="src_tree_attr", type=str, default="name", help=("attribute in source tree used as leaf name")) input_args.add_argument( "--min_support_ref", type=float, default=0.0, help=("min support for branches to be considered from the ref tree")) input_args.add_argument( "--min_support_src", type=float, default=0.0, help=( "min support for branches to be considered from the source tree")) output_args = parser.add_argument_group("OUTPUT OPTIONS") output_args.add_argument("-o", dest="output", type=str, help="""Path to the tab delimited report file""") opt_args = parser.add_argument_group("DISTANCE OPTIONS") opt_args.add_argument( "--outgroup", dest="outgroup", nargs="+", help= """outgroup used to root reference and source trees before distance computation""" ) opt_args.add_argument("--expand_polytomies", dest="polytomies", action="store_true", help="""expand politomies if necessary""") opt_args.add_argument("--unrooted", dest="unrooted", action="store_true", help="""compare trees as unrooted""") opt_args.add_argument( "--min_support", dest="min_support", type=float, default=0.0, help= ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)" )) opt_args = 
parser.add_argument_group("PHYLOGENETICS OPTIONS") opt_args.add_argument( "--extract_species", action="store_true", help= "When used, leaf names in the reference and source trees are assumed to represent species." " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name," " it can be automatically extracted by providing a Perl regular expression that extract a " " valid species code (see --sp_regexp). Such information will be also used to detect duplication" " events. ") opt_args.add_argument( "--sp_regexp", type=str, help= ("Specifies a Perl regular expression to automatically extract species names" " from the name string in source trees. If not used, leaf names are assumed to represent species names." " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'." )) opt_args.add_argument("--collateral", action='store_true', help=("")) args = parser.parse_args(argv) print __DESCRIPTION__ reftree = args.reftree if args.source_file and args.source_trees: print >> sys.stderr, 'The use of targets_file and targets at the same time is not supported.' 
sys.exit(1) if args.source_file: source_trees = tree_iterator(args.source_file) else: source_trees = args.source_trees ref_tree = Tree(reftree) if args.ref_tree_attr: for lf in ref_tree.iter_leaves(): lf._origname = lf.name if args.ref_tree_attr not in lf.features: print lf lf.name = getattr(lf, args.ref_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = ref_tree.get_common_ancestor(args.outgroup) else: out = ref_tree.search_nodes(name=args.outgroup[0])[0] ref_tree.set_outgroup(out) HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist") if args.output: OUT = open(args.output, "w") print >> OUT, '# ' + ctime() print >> OUT, '# ' + ' '.join(sys.argv) print >> OUT, '#' + '\t'.join(HEADER) else: print '# ' + ctime() print '# ' + ' '.join(sys.argv) COL_WIDTHS = [20, 20] + [9] * 10 print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap') prev_tree = None ref_fname = os.path.basename(args.reftree) for counter, tfile in enumerate(source_trees): if args.source_file: seedid, tfile = tfile else: seedid = None if args.extract_species: if args.sp_regexp: SPMATCHER = re.compile(args.sp_regexp) get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0] else: get_sp_name = lambda x: x tt = PhyloTree(tfile, sp_naming_function=get_sp_name) else: tt = Tree(tfile) if args.src_tree_attr: for lf in tt.iter_leaves(): lf._origname = lf.name lf.name = getattr(lf, args.src_tree_attr) if args.outgroup: if len(args.outgroup) > 1: out = tt.get_common_ancestor(args.outgroup) else: out = tt.search_nodes(name=args.outgroup[0])[0] tt.set_outgroup(out) if args.source_trees: fname = os.path.basename(tfile) else: fname = '%05d' % counter r = tt.compare(ref_tree, ref_tree_attr=args.ref_tree_attr, source_tree_attr=args.src_tree_attr, min_support_ref=args.min_support_ref, min_support_source=args.min_support_src, unrooted=args.unrooted, has_duplications=args.extract_species) print_table([ map(istr, [ 
fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"], r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist'] ]) ], fix_col_width=COL_WIDTHS, wrap_style='cut') if args.output: OUT.close()
if args.ref_tree: print "Reading ref tree from", args.ref_tree reft = Tree(args.ref_tree, format=1) else: reft = None if args.tax_info: tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track) if args.dump_tax_info: cPickle.dump(tax2track, open("tax2track.pkl", "w")) cPickle.dump(tax2name, open("tax2name.pkl", "w")) print "Tax info written into pickle files" else: for n in t.iter_leaves(): spcode = n.name n.add_features(taxid=spcode) n.add_features(species=spcode) tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track) # Split tree into species trees #subtrees = t.get_speciation_trees() if not args.rf_only: print "Calculating tree subparts..." t1 = time.time() if not args.is_sptree: subtrees = t.split_by_dups() print "Subparts:", len(subtrees), time.time()-t1 else: subtrees = [t]
def main(argv): parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo. # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version) # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) ) # const - A constant value required by some action and nargs selections. # default - The value produced if the argument is absent from the command line. # type - The type to which the command-line argument should be converted. # choices - A container of the allowable values for the argument. # required - Whether or not the command-line option may be omitted (optionals only). # help - A brief description of what the argument does. # metavar - A name for the argument in usage messages. # dest - The name of the attribute to be added to the object returned by parse_args(). 
parser.add_argument("--show", dest="show_tree", action="store_true", help="""Display tree after the analysis.""") parser.add_argument("--render", dest="render", action="store_true", help="""Render tree.""") parser.add_argument("--dump", dest="dump", action="store_true", help="""Dump analysis""") parser.add_argument("--explore", dest="explore", type=str, help="""Reads a previously analyzed tree and visualize it""") input_args = parser.add_mutually_exclusive_group() input_args.required=True input_args.add_argument("-t", "--tree", dest="target_tree", nargs="+", type=str, help="""Tree file in newick format""") input_args.add_argument("-tf", dest="tree_list_file", type=str, help="File with the list of tree files") parser.add_argument("--tax", dest="tax_info", type=str, help="If the taxid attribute is not set in the" " newick file for all leaf nodes, a tab file file" " with the translation of name and taxid can be" " provided with this option.") parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str, help="If taxid is part of the leaf name, delimiter used to split the string") parser.add_argument("--sp_field", dest="sp_field", type=int, default=0, help="field position for taxid after splitting leaf names") parser.add_argument("--ref", dest="ref_tree", type=str, help="Uses ref tree to compute robinson foulds" " distances of the different subtrees") parser.add_argument("--rf-only", dest="rf_only", action = "store_true", help="Skip ncbi consensus analysis") parser.add_argument("--outgroup", dest="outgroup", type=str, nargs="+", help="A list of node names defining the trees outgroup") parser.add_argument("--is_sptree", dest="is_sptree", action = "store_true", help="Assumes no duplication nodes in the tree") parser.add_argument("-o", dest="output", type=str, help="Writes result into a file") parser.add_argument("--tax2name", dest="tax2name", type=str, help="") parser.add_argument("--tax2track", dest="tax2track", type=str, help="") 
parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true", help="") args = parser.parse_args(argv) if args.sp_delimiter: GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field] else: GET_TAXID = None reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else "" if args.explore: print >>sys.stderr, "Reading tree from file:", args.explore t = cPickle.load(open(args.explore)) ts = TreeStyle() ts.force_topology = True ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.show(tree_style=ts) print >>sys.stderr, "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) sys.exit() if args.output: OUT = open(args.output, "w") else: OUT = sys.stdout print >>sys.stderr, "Dumping results into", OUT target_trees = [] if args.tree_list_file: target_trees = [line.strip() for line in open(args.tree_list_file)] if args.target_tree: target_trees += args.target_tree prev_tree = None if args.tax2name: tax2name = cPickle.load(open(args.tax2name)) else: tax2name = {} if args.tax2track: tax2track = cPickle.load(open(args.tax2track)) else: tax2track = {} print len(tax2track), len(tax2name) header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips") print >>OUT, '|'.join([h.ljust(15) for h in header]) if args.ref_tree: print >>sys.stderr, "Reading ref tree from", args.ref_tree reft = Tree(args.ref_tree, format=1) else: reft = None SHOW_TREE = False if args.show_tree or args.render: SHOW_TREE = True prev_broken = set() ENTRIES = [] ncbi.connect_database() for tfile in target_trees: #print tfile t = PhyloTree(tfile, sp_naming_function=None) if GET_TAXID: for n in t.iter_leaves(): n.name = GET_TAXID(n.name) if args.outgroup: if len(args.outgroup) == 1: out = t & args.outgroup[0] else: out = t.get_common_ancestor(args.outgroup) if set(out.get_leaf_names()) ^ set(args.outgroup): raise ValueError("Outgroup is not 
monophyletic") t.set_outgroup(out) t.ladderize() if prev_tree: tree_compare(t, prev_tree) prev_tree = t if args.tax_info: tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track) if args.dump_tax_info: cPickle.dump(tax2track, open("tax2track.pkl", "w")) cPickle.dump(tax2name, open("tax2name.pkl", "w")) print "Tax info written into pickle files" else: for n in t.iter_leaves(): spcode = n.name n.add_features(taxid=spcode) n.add_features(species=spcode) tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track) # Split tree into species trees #subtrees = t.get_speciation_trees() if not args.rf_only: #print "Calculating tree subparts..." t1 = time.time() if not args.is_sptree: subtrees = t.split_by_dups() #print "Subparts:", len(subtrees), time.time()-t1 else: subtrees = [t] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE) #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf else: subtrees = [] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0 ndups = 0 nsubtrees = len(subtrees) rf = 0 rf_max = 0 rf_std = 0 rf_med = 0 common_names = 0 max_size = 0 if reft and len(subtrees) == 1: rf = t.robinson_foulds(reft, attr_t1="realname") rf_max = rf[1] rf = rf[0] rf_med = rf elif reft: #print "Calculating avg RF..." 
nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"]) #print len(subtrees), "Sub-Species-trees found" avg_rf = [] rf_max = 0.0 # reft.robinson_foulds(reft)[1] sum_size = 0.0 print nsubtrees, "subtrees", ndups, "duplications" for ii, subt in enumerate(subtrees): print "\r%d" %ii, sys.stdout.flush() try: partial_rf = subt.robinson_foulds(reft, attr_t1="taxid") except ValueError: pass else: sptree_size = len(set([n.taxid for n in subt.iter_leaves()])) sum_size += sptree_size avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size) common_names = len(partial_rf[3]) max_size = max(max_size, sptree_size) rf_max = max(rf_max, partial_rf[1]) #print partial_rf[:2] rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist rf_std = numpy.std(avg_rf) rf_med = numpy.median(avg_rf) sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes)) iter_values = [os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std, rf_max, common_names] print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) fixed = sorted([n for n in prev_broken if n not in broken_clades]) new_problems = sorted(broken_clades - prev_broken) fixed_string = color(', '.join(fixed), "green") if fixed else "" problems_string = color(', '.join(new_problems), "red") if new_problems else "" OUT.write(" Fixed clades: %s\n" %fixed_string) if fixed else None OUT.write(" New broken: %s\n" %problems_string) if new_problems else None prev_broken = broken_clades ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string]) OUT.flush() if args.show_tree or args.render: ts = TreeStyle() ts.force_topology = True #ts.tree_width = 500 ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.dist = 0 if args.show_tree: #if args.hide_monophyletic: # tax2monophyletic = 
{} # n2content = t.get_node2content() # for node in t.traverse(): # term2count = defaultdict(int) # for leaf in n2content[node]: # if leaf.lineage: # for term in leaf.lineage: # term2count[term] += 1 # expected_size = len(n2content) # for term, count in term2count.iteritems(): # if count > 1 print "Showing tree..." t.show(tree_style=ts) else: t.render("img.svg", tree_style=ts, dpi=300) print "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) if args.dump: cPickle.dump(t, open("ncbi_analysis.pkl", "w")) print print HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades") print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER) if args.output: OUT.close()
seqA MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA seqB MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA seqC MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA seqD MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ--- LTNVSHQFMA LTNVSH LTNVSH---- ------ LTNVSH---- ------ -------FMA LTNVSH """ # Load a tree and link it to an alignment. As usual, 'alignment' can # be the path to a file or data in text format. t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta") #We can now access the sequence of every leaf node print "These are the nodes and its sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH #seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH # # The associated alignment can be changed at any time t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip") # Let's check that sequences have changed print "These are the nodes and its re-linked sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAHQ----------FMALTNVSH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAHQFMALTNVSH---------- #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAHQFMALTNVSHQFMALTNVSH
def main(argv): parser = ArgumentParser(description=__DESCRIPTION__) parser.add_argument("--db", dest="dbfile", type=str, help="""NCBI sqlite3 db file.""") input_args = parser.add_argument_group('TAXID INPUT OPTIONS') input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+", type=int, help="""taxids (space separated)""") input_args.add_argument("-if", "--taxid_file", dest="taxid_file", type=str, help="""file containing a list of taxids (one per line)""") input_args.add_argument("-t", "--reftree", dest="reftree", type=str, help="""Read taxids from the provided tree.""") input_args.add_argument("--reftree_attr", dest="reftree_attr", type=str, default="name", help="""tree attribute encoding for taxid numbers.""") name_input_args = parser.add_argument_group('NAME INPUT OPTIONS') name_input_args.add_argument("-n", "--name", dest="names", nargs="+", type=str, help="""species or taxa names (comma separated)""") name_input_args.add_argument("-nf", "--names_file", dest="names_file", type=str, help="""file containing a list of taxids (one per line)""") name_input_args.add_argument("--fuzzy", dest="fuzzy", type=float, help=("Tries a fuzzy (and SLOW) search for those" " species names that could not be translated" " into taxids. A float number must be provided" " indicating the minimum string similarity.")) output_args = parser.add_argument_group('OUTPUT OPTIONS') output_args.add_argument("-x", "--taxonomy", dest="taxonomy", type=str, help=("dump a pruned version of the NCBI taxonomy" " tree containing target species into the specified file")) output_args.add_argument("-l", "--list", dest="info_list", type=str, help="""dump NCBI taxonmy information for each target species into the specified file. 
""") output_args.add_argument("-a", "--annotated", dest="annotated_tree", type=str, help="dump the annotated tree of the input reftree provided with -t into the specified file.") output_args.add_argument("--collapse_subspecies", dest="collapse_subspecies", action="store_true", help=("When used, all nodes under the the species rank" " are collapsed, so all species and subspecies" " are seen as sister nodes")) output_args.add_argument("--rank_limit", dest="rank_limit", type=str, help=("When used, all nodes under the provided rank" " are discarded")) output_args.add_argument("--full_lineage", dest="full_lineage", action="store_true", help=("When used, topology is not pruned to avoid " " one-child-nodes, so the complete lineage" " track leading from root to tips is kept.")) args = parser.parse_args(argv) taxid_source = args.taxid or args.taxid_file or args.reftree name_source = args.names or args.names_file if not taxid_source and not name_source: parser.error('At least one input source is required') if taxid_source and name_source: parser.error('taxid and name options are mutually exclusive') if not args.taxonomy and not args.info_list and not args.annotated_tree: parser.error('At least one output option is required') ncbi = NCBITaxa(args.dbfile) all_names = set([]) all_taxids = [] if args.names_file: all_names.update(map(strip, open(args.names_file, "rU").read().split("\n"))) if args.names: all_names.update(map(strip, " ".join(args.names).split(","))) all_names.discard("") #all_names = set([n.lower() for n in all_names]) not_found = set() name2realname = {} name2score = {} if all_names: log.info("Dumping name translations in %s.name_translation.txt ... 
") name2id = ncbi.get_name_translator(all_names) not_found = all_names - set(name2id.keys()) if args.fuzzy and not_found: log.info("%s unknown names", len(not_found)) for name in not_found: # enable extension loading tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy) if tax: name2id[name] = tax name2realname[name] = realname name2score[name] = "Fuzzy:%0.2f" %sim for name in all_names: taxid = name2id.get(name, "???") realname = name2realname.get(name, name) score = name2score.get(name, "Exact:1.0") print "\t".join(map(str, [score, name, realname.capitalize(), taxid])) if args.taxid_file: all_taxids.extend(map(strip, open(args.taxid_file, "rU").read().split("\n"))) if args.taxid: all_taxids.extend(args.taxid) reftree = None if args.reftree: reftree = PhyloTree(args.reftree) all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()]))) if all_taxids and args.info_list: all_taxids = set(all_taxids) all_taxids.discard("") all_taxids, merge_conversion = ncbi._translate_merged(all_taxids) outfile = args.info_list+".info.txt" log.info("Dumping %d taxid translations in %s ..." %(len(all_taxids), outfile)) all_taxids.discard("") translator = ncbi.get_taxid_translator(all_taxids) OUT = open(outfile, "w") for taxid, name in translator.iteritems(): lineage = ncbi.get_sp_lineage(taxid) named_lineage = ','.join(ncbi.translate_to_names(lineage)) lineage = ','.join(map(str, lineage)) print >>OUT, "\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ])) OUT.close() for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()): print >>sys.stderr, notfound, "NOT FOUND" if all_taxids and args.taxonomy: all_taxids = set(all_taxids) all_taxids.discard("") log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." 
%(len(all_taxids), args.taxonomy)) t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit) id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()]) for n in t.traverse(): n.add_features(taxid=n.name) n.add_features(sci_name=str(id2name.get(int(n.name), "?"))) n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name) lineage = ncbi.get_sp_lineage(n.taxid) n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage))) if args.collapse_subspecies: species_nodes = [n for n in t.traverse() if n.rank == "species" if int(n.taxid) in all_taxids] for sp_node in species_nodes: bellow = sp_node.get_descendants() if bellow: # creates a copy of the species node connector = sp_node.__class__() for f in sp_node.features: connector.add_feature(f, getattr(sp_node, f)) connector.name = connector.name + "{species}" for n in bellow: n.detach() n.name = n.name + "{%s}" %n.rank sp_node.add_child(n) sp_node.add_child(connector) sp_node.add_feature("collapse_subspecies", "1") t.write(format=9, outfile=args.taxonomy+".names.nw") t.write(format=8, outfile=args.taxonomy+".allnames.nw") t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"], outfile=args.taxonomy+".full_annotation.nw") for i in t.iter_leaves(): i.name = i.taxid t.write(format=9, outfile=args.taxonomy+".taxids.nw") t.write(format=8, outfile=args.taxonomy+".alltaxids.nw") if all_taxids and reftree: translator = ncbi.get_taxid_translator(all_taxids) for n in reftree.iter_leaves(): if not hasattr(n, "taxid"): n.add_features(taxid=int(getattr(n, args.reftree_attr))) n.add_features(sci_name = translator.get(int(n.taxid), n.name)) lineage = ncbi.get_sp_lineage(n.taxid) named_lineage = '|'.join(ncbi._translate_to_names(lineage)) n.add_features(ncbi_track=named_lineage) print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
def _read_stripped_entries(path):
    """Return the lines of the text file *path*, each passed through ``strip``.

    The content is split on newline characters, so blank lines (including
    the one that follows a trailing newline) come back as empty strings;
    callers are expected to discard them.  The file handle is closed before
    returning.
    """
    # "rU" (universal newlines) is the mode the original code used.
    with open(path, "rU") as handle:
        content = handle.read()
    # ``strip`` is the module-level helper this file already relies on.
    return list(map(strip, content.split("\n")))


def main(argv):
    """Command-line entry point for querying the NCBI taxonomy database.

    Taxids (``-i``, ``-if``, ``-t``) or names (``-n``, ``-nf``) are accepted
    as input; the two kinds are mutually exclusive and at least one output
    option (``-x``, ``-l`` or ``-a``) must be given.

    * Name input: prints one tab-separated line per name to stdout
      (score, given name, capitalised matched name, taxid or "???").
    * ``-l PREFIX``: writes ``PREFIX.info.txt`` with taxid, name, named
      lineage and numeric lineage per translated taxid; untranslated taxids
      are reported on stderr.
    * ``-x PREFIX``: writes the pruned taxonomy as ``PREFIX.names.nw``,
      ``.allnames.nw``, ``.full_annotation.nw``, ``.taxids.nw`` and
      ``.alltaxids.nw``.
    * ``-t TREE``: annotates the leaves of the reference tree with
      ``taxid``, ``sci_name`` and ``ncbi_track``, prints the result to
      stdout and, when ``-a FILE`` is given, also writes it to FILE.

    Args:
        argv: list of command-line arguments handed to ``parse_args``.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)
    parser.add_argument("--db", dest="dbfile", type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')
    input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+",
                            type=int,
                            help="""taxids (space separated)""")
    input_args.add_argument(
        "-if", "--taxid_file", dest="taxid_file", type=str,
        help="""file containing a list of taxids (one per line)""")
    input_args.add_argument("-t", "--reftree", dest="reftree", type=str,
                            help="""Read taxids from the provided tree.""")
    input_args.add_argument(
        "--reftree_attr", dest="reftree_attr", type=str, default="name",
        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')
    name_input_args.add_argument(
        "-n", "--name", dest="names", nargs="+", type=str,
        help="""species or taxa names (comma separated)""")
    # Help text fixed: this file holds names, not taxids.
    name_input_args.add_argument(
        "-nf", "--names_file", dest="names_file", type=str,
        help="""file containing a list of names (one per line)""")
    name_input_args.add_argument(
        "--fuzzy", dest="fuzzy", type=float,
        help=("Tries a fuzzy (and SLOW) search for those"
              " species names that could not be translated"
              " into taxids. A float number must be provided"
              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')
    output_args.add_argument(
        "-x", "--taxonomy", dest="taxonomy", type=str,
        help=("dump a pruned version of the NCBI taxonomy"
              " tree containing target species into the specified file"))
    output_args.add_argument(
        "-l", "--list", dest="info_list", type=str,
        help="""dump NCBI taxonmy information for each target species into the specified file. """)
    output_args.add_argument(
        "-a", "--annotated", dest="annotated_tree", type=str,
        help="dump the annotated tree of the input reftree provided with -t into the specified file.")
    output_args.add_argument(
        "--collapse_subspecies", dest="collapse_subspecies",
        action="store_true",
        help=("When used, all nodes under the the species rank"
              " are collapsed, so all species and subspecies"
              " are seen as sister nodes"))
    output_args.add_argument(
        "--rank_limit", dest="rank_limit", type=str,
        help=("When used, all nodes under the provided rank"
              " are discarded"))
    output_args.add_argument(
        "--full_lineage", dest="full_lineage", action="store_true",
        help=("When used, topology is not pruned to avoid "
              " one-child-nodes, so the complete lineage"
              " track leading from root to tips is kept."))

    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')
    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')

    ncbi = NCBITaxa(args.dbfile)

    # ---- name -> taxid translation (printed to stdout) -------------------
    all_names = set()
    all_taxids = []
    if args.names_file:
        all_names.update(_read_stripped_entries(args.names_file))
    if args.names:
        # -n values are re-joined with spaces and then split on commas, so
        # multi-word names survive shell word splitting.
        all_names.update(map(strip, " ".join(args.names).split(",")))
    all_names.discard("")

    name2realname = {}
    name2score = {}
    if all_names:
        # The original message carried a "%s" placeholder without any
        # argument (and no file is written here: results go to stdout).
        log.info("Dumping name translations ...")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print("\t".join(
                map(str, [score, name, realname.capitalize(), taxid])))

    # ---- collect taxids from every taxid source --------------------------
    # NOTE: entries read from a file or a tree are strings, -i entries are
    # ints; the code below converts explicitly wherever the type matters.
    if args.taxid_file:
        all_taxids.extend(_read_stripped_entries(args.taxid_file))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            set(getattr(n, args.reftree_attr)
                for n in reftree.iter_leaves()))

    # ---- -l: per-taxid information table ---------------------------------
    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list + ".info.txt"
        log.info("Dumping %d taxid translations in %s ...",
                 len(all_taxids), outfile)
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        # ``with`` guarantees the table is flushed and closed even if a
        # lineage lookup fails half-way through.
        with open(outfile, "w") as OUT:
            for taxid, name in translator.items():
                lineage = ncbi.get_sp_lineage(taxid)
                named_lineage = ','.join(ncbi.translate_to_names(lineage))
                lineage = ','.join(map(str, lineage))
                fields = [merge_conversion.get(int(taxid), taxid), name,
                          named_lineage, lineage]
                OUT.write("\t".join(map(str, fields)) + "\n")

        # Compare as strings because requested taxids may be str or int.
        translated = set(str(k) for k in translator)
        for notfound in set(map(str, all_taxids)) - translated:
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    # ---- -x: pruned taxonomy tree ----------------------------------------
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ...",
                 len(all_taxids), args.taxonomy)

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Node names hold the taxid at this point; keep it as a feature
            # before the name is replaced by "sci_name{taxid}".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            # The requested taxids may be strings (file / tree input) or
            # ints (-i), so membership is tested on a normalised string
            # form; the original ``int(n.taxid) in all_taxids`` never
            # matched string entries.
            requested = set(str(x) for x in all_taxids)
            species_nodes = [n for n in t.traverse()
                             if n.rank == "species"
                             and str(int(n.taxid)) in requested]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # Create a copy of the species node so the species
                    # itself becomes a sister of its former descendants.
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        t.write(format=9, outfile=args.taxonomy + ".names.nw")
        t.write(format=8, outfile=args.taxonomy + ".allnames.nw")
        t.write(format=9,
                features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"],
                outfile=args.taxonomy + ".full_annotation.nw")
        # NOTE(review): only leaves are renamed to their taxid, so internal
        # nodes in ".alltaxids.nw" keep the "name{taxid}" label -- confirm
        # this is intended.
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy + ".taxids.nw")
        t.write(format=8, outfile=args.taxonomy + ".alltaxids.nw")

    # ---- -t: annotate the reference tree ---------------------------------
    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            # Uses the same public translate_to_names() as the branches
            # above (the original called a private-looking variant here).
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        annotated = reftree.write(features=["taxid", "sci_name", "ncbi_track"])
        print(annotated)
        if args.annotated_tree:
            # Honour -a/--annotated: the option was accepted but its file
            # was never written.  Stdout output is kept for compatibility.
            with open(args.annotated_tree, "w") as handle:
                handle.write(annotated + "\n")
def _load_pickle_file(path):
    """Return the object unpickled from the file at *path* (handle closed)."""
    # SECURITY: unpickling can execute arbitrary code.  Only feed this
    # files that come from a trusted source (e.g. written by this tool).
    with open(path) as handle:
        return cPickle.load(handle)


def _dump_pickle_file(obj, path):
    """Pickle *obj* into the file at *path*, closing (and flushing) it."""
    with open(path, "w") as handle:
        cPickle.dump(obj, handle)


def _size_weighted_rf(t, reft):
    """Compare every speciation subtree of *t* against *reft*.

    Each subtree's normalised Robinson-Foulds distance (rf / max_rf) is
    weighted by the number of distinct taxids in the subtree.  Subtrees for
    which ``robinson_foulds`` raises ValueError are skipped.

    Returns:
        (nsubtrees, ndups, rf, rf_med, rf_std, rf_max, common_names) where
        rf is the sum of weighted distances divided by the summed sizes,
        rf_med / rf_std are the median / standard deviation of the weighted
        distances, rf_max is the largest maximum-RF seen and common_names is
        ``len(result[3])`` of the last successful comparison.
    """
    nsubtrees, ndups, subtrees = t.get_speciation_trees(
        map_features=["taxid"])
    weighted_rf = []
    rf_max = 0.0
    sum_size = 0.0
    common_names = 0
    print("%s subtrees %s duplications" % (nsubtrees, ndups))

    for ii, subt in enumerate(subtrees):
        # Progress counter, rewritten in place on the same terminal line.
        sys.stdout.write("\r%d" % ii)
        sys.stdout.flush()
        try:
            partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
        except ValueError:
            # Deliberate best-effort: this subtree cannot be compared.
            continue
        sptree_size = len(set(n.taxid for n in subt.iter_leaves()))
        sum_size += sptree_size
        weighted_rf.append(
            (partial_rf[0] / float(partial_rf[1])) * sptree_size)
        common_names = len(partial_rf[3])
        rf_max = max(rf_max, partial_rf[1])

    rf = numpy.sum(weighted_rf) / float(sum_size)  # Treeko dist
    rf_std = numpy.std(weighted_rf)
    rf_med = numpy.median(weighted_rf)
    return nsubtrees, ndups, rf, rf_med, rf_std, rf_max, common_names


def _compare_target_trees(args, OUT):
    """Analyse every target tree named in *args*, writing results to *OUT*.

    One '|'-separated row per tree is written to *OUT*; a summary table of
    all trees is printed at the end through ``print_table``.
    """
    get_taxid = None
    if args.sp_delimiter:
        def get_taxid(leaf_name):
            return leaf_name.split(args.sp_delimiter)[args.sp_field]

    sys.stderr.write("Dumping results into %s\n" % (OUT,))

    target_trees = []
    if args.tree_list_file:
        with open(args.tree_list_file) as handle:
            target_trees = [line.strip() for line in handle]
    if args.target_tree:
        target_trees += args.target_tree

    tax2name = _load_pickle_file(args.tax2name) if args.tax2name else {}
    tax2track = _load_pickle_file(args.tax2track) if args.tax2track else {}
    print("%s %s" % (len(tax2track), len(tax2name)))

    # One title per value of ``row_values`` below.  "Broken branches" was
    # missing, which left the header one column short of every data row.
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes", "RF (avg)",
              "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    OUT.write('|'.join([h.ljust(15) for h in header]) + "\n")

    if args.ref_tree:
        sys.stderr.write("Reading ref tree from %s\n" % args.ref_tree)
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    show_or_render = args.show_tree or args.render
    prev_tree = None
    prev_broken = set()
    entries = []
    ncbi.connect_database()

    for tfile in target_trees:
        t = PhyloTree(tfile, sp_naming_function=None)
        if get_taxid:
            for n in t.iter_leaves():
                n.name = get_taxid(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                # Any leaf under the ancestor that was not requested means
                # the outgroup is not a clean clade.
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                _dump_pickle_file(tax2track, "tax2track.pkl")
                _dump_pickle_file(tax2name, "tax2name.pkl")
                print("Tax info written into pickle files")
        else:
            # Without a translation table the leaf name is taken to be
            # both the taxid and the species code.
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        if not args.rf_only:
            # Split tree into species trees unless it already is one.
            subtrees = [t] if args.is_sptree else t.split_by_dups()
            (_valid_subtrees, broken_subtrees, ncbi_mistakes,
             broken_branches, _total_rf, broken_clades,
             broken_sizes) = analyze_subtrees(t, subtrees,
                                              show_tree=show_or_render)
        else:
            # The original unpacked six zeros into seven names (ValueError)
            # and would have handed ints to the set / numpy code below.
            subtrees = []
            broken_subtrees = ncbi_mistakes = broken_branches = 0
            broken_clades = set()
            broken_sizes = []

        ndups = 0
        nsubtrees = len(subtrees)
        rf = rf_max = rf_std = rf_med = 0
        common_names = 0
        if reft and len(subtrees) == 1:
            rf_result = t.robinson_foulds(reft, attr_t1="realname")
            rf, rf_max = rf_result[0], rf_result[1]
            rf_med = rf
        elif reft:
            (nsubtrees, ndups, rf, rf_med, rf_std, rf_max,
             common_names) = _size_weighted_rf(t, reft)

        if len(broken_sizes):
            sizes_info = "%0.1f/%0.1f +- %0.1f" % (
                numpy.mean(broken_sizes), numpy.median(broken_sizes),
                numpy.std(broken_sizes))
        else:
            # Same text formatting NaN statistics would give, without
            # asking numpy for statistics of an empty sequence.
            sizes_info = "nan/nan +- nan"

        tree_name = os.path.basename(tfile)
        row_values = [tree_name, nsubtrees, ndups, broken_subtrees,
                      ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                      rf_std, rf_max, common_names]
        OUT.write('|'.join([str(x).strip().ljust(15) for x in row_values])
                  + "\n")

        # Report how the set of broken clades changed since the last tree.
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = (color(', '.join(new_problems), "red")
                           if new_problems else "")
        if fixed:
            OUT.write(" Fixed clades: %s\n" % fixed_string)
        if new_problems:
            OUT.write(" New broken: %s\n" % problems_string)
        prev_broken = broken_clades

        entries.append([tree_name, nsubtrees, ndups, broken_subtrees,
                        ncbi_mistakes, broken_branches, sizes_info,
                        fixed_string, problems_string])
        OUT.flush()

        if show_or_render:
            ts = TreeStyle()
            ts.force_topology = True
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                print("Showing tree...")
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print("dumping color config")
            _dump_pickle_file(name2color, "ncbi_colors.pkl")

        if args.dump:
            _dump_pickle_file(t, "ncbi_analysis.pkl")

    # Two blank lines separate the per-tree rows from the summary table.
    print("")
    print("")
    summary_header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                      "Broken clades", "Broken branches", "Clade sizes",
                      "Fixed Groups", "New Broken Clades")
    print_table(entries, max_col_width=50, row_line=True,
                header=summary_header)


def main(argv):
    """Command-line entry point: compare target trees with NCBI taxonomy.

    Parses *argv*, then either

    * ``--explore FILE``: unpickles a previously analysed tree, shows it,
      dumps the colour configuration to ``ncbi_colors.pkl`` and exits; or
    * analyses every tree given with ``-t`` / ``-tf``, writing one result
      row per tree to ``-o FILE`` (stdout when omitted) followed by a
      summary table on stdout.

    Args:
        argv: list of command-line arguments handed to ``parse_args``.

    Raises:
        ValueError: if several ``--outgroup`` names are given and their
            common ancestor contains other leaves.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # add_argument() quick reference:
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered
    #          (store, store_const, store_true, store_false, append, append_const, version).
    # nargs - The number of command-line arguments that should be consumed
    #         (N, ? (one or default), * (zero or more), + (one or more)).
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
    parser.add_argument("--show", dest="show_tree", action="store_true",
                        help="""Display tree after the analysis.""")
    parser.add_argument("--render", dest="render", action="store_true",
                        help="""Render tree.""")
    parser.add_argument("--dump", dest="dump", action="store_true",
                        help="""Dump analysis""")
    parser.add_argument(
        "--explore", dest="explore", type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t", "--tree", dest="target_tree", nargs="+",
                            type=str,
                            help="""Tree file in newick format""")
    input_args.add_argument("-tf", dest="tree_list_file", type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")
    parser.add_argument(
        "--sp_delimiter", dest="sp_delimiter", type=str,
        help="If taxid is part of the leaf name, delimiter used to split the string")
    parser.add_argument(
        "--sp_field", dest="sp_field", type=int, default=0,
        help="field position for taxid after splitting leaf names")
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")
    parser.add_argument("--rf-only", dest="rf_only", action="store_true",
                        help="Skip ncbi consensus analysis")
    parser.add_argument(
        "--outgroup", dest="outgroup", type=str, nargs="+",
        help="A list of node names defining the trees outgroup")
    parser.add_argument("--is_sptree", dest="is_sptree", action="store_true",
                        help="Assumes no duplication nodes in the tree")
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")
    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")
    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")
    parser.add_argument("--dump_tax_info", dest="dump_tax_info",
                        action="store_true", help="")

    args = parser.parse_args(argv)

    if args.explore:
        sys.stderr.write("Reading tree from file: %s\n" % args.explore)
        t = _load_pickle_file(args.explore)
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        sys.stderr.write("dumping color config\n")
        _dump_pickle_file(name2color, "ncbi_colors.pkl")
        sys.exit()

    OUT = open(args.output, "w") if args.output else sys.stdout
    try:
        _compare_target_trees(args, OUT)
    finally:
        # Close the results file even when the analysis fails; stdout is
        # never closed.
        if args.output:
            OUT.close()