def dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch,
                 losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
                 coll_losses_per_branch, coll_losses_per_dup_branch,
                 coll_refbranch_supports):
    """Annotate the reference tree one last time and write every output file.

    All counter arguments are forwarded unchanged to ``annotate_tree``.
    Output names are derived from the module-level ``args.output`` tag:

    * ``<tag>.tree_analysis.png`` -- only when ``IMG_REPORT`` is true
    * ``<tag>.nwx`` -- newick file carrying the summary features
    * ``<tag>.log`` -- the command line (``sys.argv``) joined by spaces

    Returns:
        None.
    """
    # Final annotation of the refTree
    annotate_tree(reftree, informed_branches, dup_per_branch,
                  losses_per_branch, losses_per_dup_branch,
                  refbranch_supports, coll_dup_per_branch,
                  coll_losses_per_branch, coll_losses_per_dup_branch,
                  coll_refbranch_supports)

    # Optional image report of the annotated tree
    if IMG_REPORT:
        sys.stderr.write("Generating tree analysis image\n")
        ts = TreeStyle()
        ts.layout_fn = info_layout
        reftree.render("%s.tree_analysis.png" % args.output, tree_style=ts)

    # Summary newick tree with all features
    summary_features = [
        "ntrees", "nid", "ndups", "dup_rate", "losses", "losses_rate",
        "nduplosses", "duplosses_rate", "gt_support", "nsupport_trees",
        "coll_ndups", "coll_dup_rate", "coll_losses", "coll_losses_rate",
        "coll_nduplosses", "coll_duplosses_rate", "coll_gt_support",
        "coll_nsupport_trees",
    ]
    sys.stderr.write("Dumping annotated newick...\n")
    reftree.write(outfile="%s.nwx" % args.output, features=summary_features)

    # Use a context manager so the log handle is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open("%s.log" % args.output, "w") as log_file:
        log_file.write(' '.join(sys.argv))
def dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch,
                 losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
                 coll_losses_per_branch, coll_losses_per_dup_branch,
                 coll_refbranch_supports):
    """Write the final results of the analysis to disk.

    The reference tree is first annotated by ``annotate_tree`` using the
    counters received as arguments (they are passed through untouched).
    Then, using the module-level ``args.output`` value as file-name prefix:

    1. if ``IMG_REPORT`` is set, the tree is rendered to
       ``<prefix>.tree_analysis.png`` with ``info_layout``;
    2. the annotated tree is written to ``<prefix>.nwx``;
    3. the invoking command line is saved to ``<prefix>.log``.

    Returns:
        None.
    """
    output_tag = args.output

    # Final annotation of the refTree
    annotate_tree(
        reftree, informed_branches, dup_per_branch, losses_per_branch,
        losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
        coll_losses_per_branch, coll_losses_per_dup_branch,
        coll_refbranch_supports)

    if IMG_REPORT:
        sys.stderr.write("Generating tree analysis image\n")
        ts = TreeStyle()
        ts.layout_fn = info_layout
        reftree.render("%s.tree_analysis.png" % output_tag, tree_style=ts)

    # Node features exported in the summary newick tree. The "coll_" names
    # are the collateral counterparts of the plain ones.
    base_features = [
        "ndups", "dup_rate", "losses", "losses_rate", "nduplosses",
        "duplosses_rate", "gt_support", "nsupport_trees",
    ]
    summary_features = (["ntrees", "nid"] + base_features +
                        ["coll_" + name for name in base_features])

    sys.stderr.write("Dumping annotated newick...\n")
    reftree.write(outfile="%s.nwx" % output_tag, features=summary_features)

    # The original left this handle open; close it explicitly.
    with open("%s.log" % output_tag, "w") as log_file:
        log_file.write(' '.join(sys.argv))
def main(argv):
    """Command-line entry point: load a newick tree, edit it, show or render it.

    Args:
        argv: sequence of command-line arguments, handed to
            ``parser.parse_args``.

    Raises:
        ValueError: if ``--sort-branches`` and ``--ladderize`` are combined.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # add_argument() keywords used below: action, nargs, const, default,
    # type, choices, required, help, metavar and dest (see argparse docs).

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group(
        "TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    # NOTE(review): args.resolution is parsed but never used below --
    # confirm whether it was meant to be passed to render().
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group(
        "TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")
    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # Rewrite "<length>[<support>]" as an NHX support annotation
        # before parsing. The file is closed as soon as it has been read.
        with open(tfile) as tree_handle:
            raw_newick = tree_handle.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        # NOTE(review): this binds a *local* name (there is no ``global``
        # statement in this function). If user_species_naming_function
        # reads a module-level SPCODE_REGEXP, the option has no effect --
        # confirm against that helper.
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # ``with`` guarantees the file is closed even if a subtree fails
        # to serialize.
        with open(args.subtrees_output_file, "w") as OUT:
            for tree in treeiter:
                OUT.write(tree.write() + "\n")

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        del LEAF_ATTRIBUTES["name"]
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: 0 (the option default) is turned into None
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        sys.stdout.write("Processed Newick dumped into %s\n" % args.newick)
def main(argv):
    """Command-line entry point: analyse target trees and report broken clades.

    For every target tree the function optionally re-roots it, annotates it
    through ``annotate_tree_with_taxa``, runs ``analyze_subtrees`` (unless
    ``--rf-only``), optionally computes Robinson-Foulds values against
    ``--ref``, and writes one ``|``-separated row to the output. A summary
    table is printed at the end with ``print_table``.

    Args:
        argv: sequence of command-line arguments, handed to
            ``parser.parse_args``.

    Raises:
        ValueError: if a multi-node ``--outgroup`` is not monophyletic.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # add_argument() keywords used below: action, nargs, const, default,
    # type, choices, required, help, metavar and dest (see argparse docs).

    parser.add_argument("--show", dest="show_tree", action="store_true",
                        help="""Display tree after the analysis.""")
    parser.add_argument("--render", dest="render", action="store_true",
                        help="""Render tree.""")
    parser.add_argument("--dump", dest="dump", action="store_true",
                        help="""Dump analysis""")
    parser.add_argument(
        "--explore", dest="explore", type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group(required=True)
    input_args.add_argument("-t", "--tree", dest="target_tree", nargs="+",
                            type=str,
                            help="""Tree file in newick format""")
    input_args.add_argument("-tf", dest="tree_list_file", type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")
    parser.add_argument(
        "--sp_delimiter", dest="sp_delimiter", type=str,
        help="If taxid is part of the leaf name, delimiter used to split the string")
    parser.add_argument(
        "--sp_field", dest="sp_field", type=int, default=0,
        help="field position for taxid after splitting leaf names")
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")
    parser.add_argument("--rf-only", dest="rf_only", action="store_true",
                        help="Skip ncbi consensus analysis")
    parser.add_argument(
        "--outgroup", dest="outgroup", type=str, nargs="+",
        help="A list of node names defining the trees outgroup")
    parser.add_argument("--is_sptree", dest="is_sptree", action="store_true",
                        help="Assumes no duplication nodes in the tree")
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")
    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")
    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")
    parser.add_argument("--dump_tax_info", dest="dump_tax_info",
                        action="store_true", help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        def get_taxid(leaf_name):
            # taxid is one delimiter-separated field of the leaf name
            return leaf_name.split(args.sp_delimiter)[args.sp_field]
    else:
        get_taxid = None

    if args.explore:
        sys.stderr.write("Reading tree from file: %s\n" % args.explore)
        # SECURITY: unpickling executes arbitrary code; only open files
        # produced by a trusted run of this tool.
        with open(args.explore) as pickle_in:
            t = cPickle.load(pickle_in)
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        sys.stderr.write("dumping color config\n")
        with open("ncbi_colors.pkl", "w") as pickle_out:
            cPickle.dump(name2color, pickle_out)
        sys.exit()

    OUT = open(args.output, "w") if args.output else sys.stdout
    try:
        sys.stderr.write("Dumping results into %s\n" % OUT)

        target_trees = []
        if args.tree_list_file:
            with open(args.tree_list_file) as list_handle:
                target_trees = [line.strip() for line in list_handle]
        if args.target_tree:
            target_trees += args.target_tree

        prev_tree = None

        # SECURITY: same pickle caveat as above for both lookup tables.
        if args.tax2name:
            with open(args.tax2name) as pickle_in:
                tax2name = cPickle.load(pickle_in)
        else:
            tax2name = {}
        if args.tax2track:
            with open(args.tax2track) as pickle_in:
                tax2track = cPickle.load(pickle_in)
        else:
            tax2track = {}
        sys.stdout.write("%s %s\n" % (len(tax2track), len(tax2name)))

        # One column per value in ``iter_values`` below. The original
        # header lacked "Broken branches", which shifted every later
        # column title by one.
        header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                  "Broken clades", "Broken branches", "Clade sizes",
                  "RF (avg)", "RF (med)", "RF (std)", "RF (max)",
                  "Shared tips")
        OUT.write('|'.join([h.ljust(15) for h in header]) + "\n")

        if args.ref_tree:
            sys.stderr.write("Reading ref tree from %s\n" % args.ref_tree)
            reft = Tree(args.ref_tree, format=1)
        else:
            reft = None

        SHOW_TREE = bool(args.show_tree or args.render)

        prev_broken = set()
        ENTRIES = []
        ncbi.connect_database()
        for tfile in target_trees:
            t = PhyloTree(tfile, sp_naming_function=None)
            if get_taxid:
                for n in t.iter_leaves():
                    n.name = get_taxid(n.name)

            if args.outgroup:
                if len(args.outgroup) == 1:
                    out = t & args.outgroup[0]
                else:
                    out = t.get_common_ancestor(args.outgroup)
                    # NOTE(review): nesting of this check was reconstructed
                    # from whitespace-collapsed source -- confirm.
                    if set(out.get_leaf_names()) ^ set(args.outgroup):
                        raise ValueError("Outgroup is not monophyletic")
                t.set_outgroup(out)
            t.ladderize()

            if prev_tree:
                tree_compare(t, prev_tree)
            prev_tree = t

            if args.tax_info:
                tax2name, tax2track = annotate_tree_with_taxa(
                    t, args.tax_info, tax2name, tax2track)
                if args.dump_tax_info:
                    with open("tax2track.pkl", "w") as pickle_out:
                        cPickle.dump(tax2track, pickle_out)
                    with open("tax2name.pkl", "w") as pickle_out:
                        cPickle.dump(tax2name, pickle_out)
                    sys.stdout.write("Tax info written into pickle files\n")
            else:
                # No translation table: leaf names double as taxid/species.
                for n in t.iter_leaves():
                    spcode = n.name
                    n.add_features(taxid=spcode)
                    n.add_features(species=spcode)
                tax2name, tax2track = annotate_tree_with_taxa(
                    t, None, tax2name, tax2track)

            # Split tree into species trees
            if not args.rf_only:
                if not args.is_sptree:
                    subtrees = t.split_by_dups()
                else:
                    subtrees = [t]
                (valid_subtrees, broken_subtrees, ncbi_mistakes,
                 broken_branches, total_rf, broken_clades,
                 broken_sizes) = analyze_subtrees(t, subtrees,
                                                  show_tree=SHOW_TREE)
            else:
                subtrees = []
                # The original assigned six zeros to these seven names,
                # so --rf-only always raised ValueError. broken_clades
                # takes part in set arithmetic and broken_sizes is fed to
                # numpy below, hence the empty set / list.
                valid_subtrees = broken_subtrees = ncbi_mistakes = 0
                broken_branches = total_rf = 0
                broken_clades = set()
                broken_sizes = []

            ndups = 0
            nsubtrees = len(subtrees)

            rf = 0
            rf_max = 0
            rf_std = 0
            rf_med = 0
            common_names = 0

            if reft and len(subtrees) == 1:
                rf_result = t.robinson_foulds(reft, attr_t1="realname")
                rf = rf_result[0]
                rf_max = rf_result[1]
                rf_med = rf
            elif reft:
                nsubtrees, ndups, subtrees = t.get_speciation_trees(
                    map_features=["taxid"])
                avg_rf = []
                rf_max = 0.0
                sum_size = 0.0
                sys.stdout.write("%s subtrees %s duplications\n"
                                 % (nsubtrees, ndups))
                for ii, subt in enumerate(subtrees):
                    # In-place progress counter
                    sys.stdout.write("\r%d" % ii)
                    sys.stdout.flush()
                    try:
                        partial_rf = subt.robinson_foulds(reft,
                                                          attr_t1="taxid")
                    except ValueError:
                        # Subtrees that cannot be compared are skipped.
                        pass
                    else:
                        sptree_size = len(
                            set([n.taxid for n in subt.iter_leaves()]))
                        sum_size += sptree_size
                        # Normalized RF weighted by subtree size
                        avg_rf.append(
                            (partial_rf[0] / float(partial_rf[1]))
                            * sptree_size)
                        common_names = len(partial_rf[3])
                        rf_max = max(rf_max, partial_rf[1])

                rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
                rf_std = numpy.std(avg_rf)
                rf_med = numpy.median(avg_rf)

            sizes_info = "%0.1f/%0.1f +- %0.1f" % (
                numpy.mean(broken_sizes), numpy.median(broken_sizes),
                numpy.std(broken_sizes))
            iter_values = [
                os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
                ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                rf_std, rf_max, common_names,
            ]
            OUT.write('|'.join(
                [str(x).strip().ljust(15) for x in iter_values]) + "\n")

            # Differences with respect to the previously analysed tree
            fixed = sorted([n for n in prev_broken
                            if n not in broken_clades])
            new_problems = sorted(broken_clades - prev_broken)
            fixed_string = color(', '.join(fixed), "green") if fixed else ""
            problems_string = (color(', '.join(new_problems), "red")
                               if new_problems else "")
            if fixed:
                OUT.write(" Fixed clades: %s\n" % fixed_string)
            if new_problems:
                OUT.write(" New broken: %s\n" % problems_string)
            prev_broken = broken_clades

            ENTRIES.append([
                os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
                ncbi_mistakes, broken_branches, sizes_info, fixed_string,
                problems_string,
            ])
            OUT.flush()

            if args.show_tree or args.render:
                ts = TreeStyle()
                ts.force_topology = True
                ts.show_leaf_name = False
                ts.layout_fn = ncbi_layout
                ts.mode = "r"
                t.dist = 0
                if args.show_tree:
                    sys.stdout.write("Showing tree...\n")
                    t.show(tree_style=ts)
                else:
                    t.render("img.svg", tree_style=ts, dpi=300)
                sys.stdout.write("dumping color config\n")
                with open("ncbi_colors.pkl", "w") as pickle_out:
                    cPickle.dump(name2color, pickle_out)

            if args.dump:
                with open("ncbi_analysis.pkl", "w") as pickle_out:
                    cPickle.dump(t, pickle_out)

        sys.stdout.write("\n\n")
        HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                  "Broken clades", "Broken branches", "Clade sizes",
                  "Fixed Groups", "New Broken Clades")
        print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)
    finally:
        # Close the report file even when a tree fails; never close stdout.
        if args.output:
            OUT.close()
def main(argv):
    """Parse viewer options, load the tree, apply edits and display it.

    The tree named by the positional argument is loaded as a ``PhyloTree``
    (after rewriting raxml-style supports when ``--raxml`` is set), then
    optionally linked to an alignment, sorted or ladderized, re-rooted and
    split into species subtrees. Finally it is rendered to ``--image`` or
    shown, and written to ``--newick`` if requested.

    Args:
        argv: sequence of command-line arguments, handed to
            ``parser.parse_args``.

    Raises:
        ValueError: if ``--sort-branches`` and ``--ladderize`` are combined.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # add_argument() keywords used below: action, nargs, const, default,
    # type, choices, required, help, metavar and dest (see argparse docs).

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group(
        "TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    # NOTE(review): args.resolution is parsed but never used below --
    # confirm whether it was meant to be passed to render().
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group(
        "TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")
    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    # Validate option combinations before touching any file.
    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    # LOAD TREE
    if not args.raxml:
        t = PhyloTree(tfile)
    else:
        # Turn "<length>[<support>]" into an NHX support tag. The handle
        # is closed right after reading (the original leaked it).
        with open(tfile) as newick_file:
            newick_text = newick_file.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    newick_text)
        t = PhyloTree(nw)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        # NOTE(review): without a ``global`` statement this only binds a
        # local name; if user_species_naming_function reads a module-level
        # SPCODE_REGEXP the user's pattern is ignored -- confirm.
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # EDIT TREE
    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # Closed by the context manager even if writing a subtree fails.
        with open(args.subtrees_output_file, "w") as OUT:
            for tree in treeiter:
                OUT.write(tree.write() + "\n")

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        del LEAF_ATTRIBUTES["name"]
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: 0 (the option default) is turned into None
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        sys.stdout.write("Processed Newick dumped into %s\n" % args.newick)
def process_trees(iter_data, reftree, total_trees, thread_name=""):
    """Scan gene trees and collect dup/loss/support counters per refTree branch.

    Args:
        iter_data: iterable yielding ``(treeid, tree, tree_content)``
            triples; ``tree_content`` maps each node of ``tree`` to the
            collection of leaf nodes under it.
        reftree: reference tree whose nodes carry a ``nid`` attribute.
        total_trees: total number of trees, used only for the progress
            line (percentage and remaining-time estimate).
        thread_name: prefix printed in front of the progress line.

    Returns:
        A 9-tuple ``(informed_branches, dup_per_branch, losses_per_branch,
        losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
        coll_losses_per_branch, coll_losses_per_dup_branch,
        coll_refbranch_supports)`` of ``defaultdict`` objects; all except
        the two ``*losses_per_dup_branch`` dicts are keyed by ``nid``
        exactly as in the loops below.

    Raises:
        ValueError: if an internal gene-tree node is not binary.
    """
    # cache some common data
    reftree_content = reftree.get_cached_content(store_attr="name")
    sorted_ref_branches = [(n, reftree_content[n])
                           for n in reftree.traverse("preorder")]
    refclades = [(n, reftree_content[n.children[0]],
                  reftree_content[n.children[1]])
                 for n in reftree.traverse("preorder") if not n.is_leaf()]

    # How many trees were used to inform about each refTree branch
    informed_branches = defaultdict(int)
    # Number of losses in each refTree branch
    losses_per_branch = defaultdict(int)
    coll_losses_per_branch = defaultdict(int)
    # Number of losses for duplication in each refTree branch
    losses_per_dup_branch = defaultdict(list)
    coll_losses_per_dup_branch = defaultdict(list)
    # Duplication events sorted by refTree branch
    dup_per_branch = defaultdict(list)
    coll_dup_per_branch = defaultdict(list)
    # Gene tree support values for each refTree branch
    refbranch_supports = defaultdict(list)
    coll_refbranch_supports = defaultdict(list)

    def is_informative(ref_node):
        # True when no constraint is loaded or the constraint accepts the
        # current ``treeid`` for this refTree branch.
        return (IS_VALID_TREEID is None or
                IS_VALID_TREEID(treeid,
                                extract_species(reftree_content[ref_node])))

    skipped_trees = 0
    time0 = time.time()
    tracked_times = []
    for tree_counter, (treeid, t, tree_content) in enumerate(iter_data):
        if DEBUG:
            sys.stdout.write("%s %s\n" % (treeid, t))
            ts = TreeStyle()
            ts.title.add_face(faces.TextFace("Seedid = %s" % treeid), 1)
            t.render("%s.png" % treeid, tree_style=ts)

        # Progress line every 100 trees; the estimate uses the mean time
        # of the 100-tree batches seen so far.
        # NOTE(review): extent of this block was reconstructed from
        # whitespace-collapsed source -- confirm flush/gc belong here.
        if tree_counter % 100 == 0:
            etime = time.time() - time0
            tracked_times.append(etime)
            total_etime = (((total_trees - tree_counter) / 100.0)
                           * numpy.mean(tracked_times))
            percent = (tree_counter / float(total_trees)) * 100
            sys.stderr.write(
                "\r%s% 10d (%0.1f%%) skipped trees:% 5d. Remaining time ~= %d min"
                % (thread_name, tree_counter, percent, skipped_trees,
                   total_etime / 60.) + "\n")
            time0 = time.time()
            sys.stderr.flush()
            gc.collect()

        # Periodic snapshot of the partially annotated reference tree
        if tree_counter and MONITOR_STEP and tree_counter % MONITOR_STEP == 0:
            annotate_tree(reftree, informed_branches, dup_per_branch,
                          losses_per_branch, losses_per_dup_branch,
                          refbranch_supports, coll_dup_per_branch,
                          coll_losses_per_branch, coll_losses_per_dup_branch,
                          coll_refbranch_supports)
            ts = TreeStyle()
            ts.layout_fn = info_layout
            reftree.render("temp_tree_analysis.png", tree_style=ts)

        # Compute support of this tree over the whole refTree
        seedid = None if USE_COLLATERAL else treeid
        seedsp = None if USE_COLLATERAL else extract_species(treeid)
        branch2supports, branch2coll_supports = get_supported_branches(
            t, tree_content, refclades=refclades, seedid=seedid)
        if branch2supports == {} and branch2coll_supports == {}:
            skipped_trees += 1

        # We combine the information of all treeko trees, by averaging the
        # number of subtrees that supported or not a given refTree branch.
        for refbranch, supports in branch2supports.iteritems():
            if is_informative(refbranch):
                refbranch_supports[refbranch.nid].append(
                    numpy.mean(supports))
        for refbranch, coll_supports in branch2coll_supports.iteritems():
            if is_informative(refbranch):
                coll_refbranch_supports[refbranch.nid].append(
                    numpy.mean(coll_supports))

        all_observed_sp = extract_species([n.name for n in tree_content[t]])

        if REPORT_PER_TREE_SUPPORTS:
            if branch2supports:
                mean_seed_support = numpy.mean(
                    [numpy.mean(branch2supports[_b])
                     for _b in branch2supports])
            else:
                mean_seed_support = 0.0
            if branch2coll_supports:
                mean_coll_support = numpy.mean(
                    [numpy.mean(branch2coll_supports[_b])
                     for _b in branch2coll_supports])
            else:
                mean_coll_support = 0.0
            species_coverage = (float(len(all_observed_sp))
                                / len(REFTREE_SPECIES))
            REPORT_SUPPORT_FILE.write('\t'.join(map(str, [
                treeid, species_coverage, mean_seed_support,
                mean_coll_support, len(branch2supports),
                len(branch2coll_supports)])) + "\n")

        # Here I keep a counter on how many trees were potentially able to
        # inform about specific reftree branches. For instance, if outgroup
        # species X does not appear in a genetree, I dont want to count this
        # tree as a source for duplication in the X branch.
        if len(all_observed_sp) == 1:
            max_ref_branch = reftree.search_nodes(
                name=list(all_observed_sp)[0])[0]
        else:
            max_ref_branch = reftree.get_common_ancestor(all_observed_sp)
        for refbranch in max_ref_branch.traverse():
            if is_informative(refbranch):
                informed_branches[refbranch.nid] += 1

        # Start analyzing internal nodes
        for node in t.traverse("preorder"):
            if node.is_leaf():
                continue
            if len(node.children) != 2:
                sys.stdout.write(str(node) + "\n")
                raise ValueError("Binary trees are required")

            # Extract the species set at both sides of the node
            ch_left = node.children[0]
            ch_right = node.children[1]
            seqs_left = set([n.name for n in tree_content[ch_left]])
            seqs_right = set([n.name for n in tree_content[ch_right]])
            species_left = extract_species(seqs_left)
            species_right = extract_species(seqs_right)

            # Decide whether this node is a duplication or not
            if DETECT_DUPLICATIONS:
                if SP_OVERLAP == 0:
                    isdup = bool(species_left & species_right)
                else:
                    # Shared species over the union of both sides
                    overlap = len(species_left & species_right) / float(
                        len(species_left | species_right))
                    isdup = overlap >= SP_OVERLAP
                    if DEBUG and overlap:
                        sys.stdout.write("%s %s\n"
                                         % (species_left, species_right))
                        sys.stdout.write("%s %s\n" % (
                            len(species_left & species_right),
                            float(len(species_left | species_right))))
                        sys.stdout.write("%s %s\n" % (overlap, isdup))
            else:
                # Use the annotation of the node being visited. The
                # original read ``n.evoltype``, where ``n`` was a name
                # leaked from the comprehensions above (a leaf), not the
                # current node.
                isdup = node.evoltype == "D"

            # Only duplications and the root of the tree are mapped to
            # the refTree.
            if not (isdup or node is t):
                continue

            # Map the node to its corresponding refTree branch and infer
            # the expected list of species
            observed_sp = species_left | species_right
            if len(observed_sp) == 1:
                ref_branch = reftree.search_nodes(
                    name=list(observed_sp)[0])[0]
            else:
                ref_branch = reftree.get_common_ancestor(observed_sp)
            expected_sp = reftree_content[ref_branch]

            if isdup and is_informative(ref_branch):
                # updates duplications per branch in ref tree
                # (dup rate analysis)
                if USE_COLLATERAL or seedsp in observed_sp:
                    dup_per_branch[ref_branch.nid].append(
                        [seqs_left, seqs_right])
                else:
                    coll_dup_per_branch[ref_branch.nid].append(
                        [seqs_left, seqs_right])

            # Count losses observed after a duplication or at the root of
            # the tree: get a list of losses at both sides of the node.
            if not isdup:
                # Non-duplicated root: both sides are evaluated together.
                losses_left = get_lost_branches(
                    observed_sp, expected_sp, ref_branch,
                    sorted_ref_branches)
                losses_right = []
            else:
                losses_left = get_lost_branches(
                    species_left, expected_sp, ref_branch,
                    sorted_ref_branches)
                losses_right = get_lost_branches(
                    species_right, expected_sp, ref_branch,
                    sorted_ref_branches)

            # NOTE(review): nesting of this filter was reconstructed from
            # whitespace-collapsed source -- confirm it applies to the
            # non-duplicated root case too.
            if IS_VALID_TREEID is not None:
                losses_left = [branch for branch in losses_left
                               if is_informative(branch)]
                losses_right = [branch for branch in losses_right
                                if is_informative(branch)]

            if USE_COLLATERAL:
                losses = losses_left + losses_right
                coll_losses = []
            elif treeid in seqs_left:
                # if the seed species is not found at the other side of
                # the dup, we can assume that its losses will never be
                # counted, so we combine data from both sides. Otherwise,
                # we wait for info for a different seed tree.
                if seedsp not in species_right:
                    losses = losses_left + losses_right
                else:
                    losses = losses_left
                # No collateral information as data come from a
                # duplication including the seed
                coll_losses = []
            elif treeid in seqs_right:
                # Mirror image of the previous case.
                if seedsp not in species_left:
                    losses = losses_left + losses_right
                else:
                    losses = losses_right
                coll_losses = []
            else:
                # If this is a collateral duplication, process losses as
                # such
                losses = []
                coll_losses = losses_left + losses_right

            if len(reftree_content[ref_branch]) == 1 and losses + coll_losses:
                # NOTE(review): this pauses on stdin; it will block
                # unattended runs. Kept as in the original.
                raw_input("This should never happen")

            # update gene loss counters
            for lost_branch in losses:
                losses_per_branch[lost_branch.nid] += 1
                if isdup:
                    # if losses come from a dup event
                    losses_per_dup_branch[ref_branch.nid].append(lost_branch)
            for lost_branch in coll_losses:
                coll_losses_per_branch[lost_branch.nid] += 1
                if isdup:
                    # if losses come from a dup event
                    coll_losses_per_dup_branch[ref_branch.nid].append(
                        lost_branch)

    return (informed_branches, dup_per_branch, losses_per_branch,
            losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
            coll_losses_per_branch, coll_losses_per_dup_branch,
            coll_refbranch_supports)
def main(argv): global args parser = argparse.ArgumentParser( description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-r", dest="reftree", type=str, required=True, help="""Reference tree""") parser.add_argument( "--source_trees", dest="source_trees", type=str, required=True, help= ("A list of *rooted* genetrees, one per line, in the format: TreeID/SeedID [TAB] newick " )) parser.add_argument("--plot_newick", dest="plot_newick", type=str, help=("")) parser.add_argument("--spname_delimiter", dest="spname_delimiter", type=str, default="_", help=("species code delimiter in node names")) parser.add_argument( "--spname_field", dest="spname_field", type=int, default=-1, help= ("position of the species code extracted from node names. -1 = last field" )) parser.add_argument( "--collateral", dest="use_collateral", action="store_true", help=("If enabled, collateral information will be used as" " equally qualified data. Otherwise, such data will" " be reported separatedly. Use this if your set of" " trees are not overlaping. ")) parser.add_argument( "--skip_dup_detection", dest="skip_dup_detection", action="store_true", help=('If used, duplications will be expected to be annotated' ' in the source gene trees with the evoltype="D" tag.' ' Otherwise they will be inferred on the fly using' ' the species overlap algorithm.')) parser.add_argument( "--spoverlap", dest="species_overlap", type=float, default=0.0, help=("Species overlap cutoff. A number between 0 and 1 " "representing the percentage of species that should be " "shared between two sister partitions to be considered a" " duplication. 0 = any overlap represents a duplication. 
")) parser.add_argument( "--debug", dest="debug", action="store_true", help= ("generate an image of every input gene tree tree, so the result can be inspected" )) parser.add_argument( "--snapshot_step", dest="snapshot_step", type=int, default=1000, help=("How many trees should be processed between snapshots dumps?")) parser.add_argument( "--reftree_constraint", dest="reftree_constraint", type=str, help=("A python module from from which a function called " "*is_valid_treeid(treeid, refbranch)* should be importable. " "The function will be used to decide if the info of a given " "source tree is informative or not for each reftree branch. ")) parser.add_argument("-o", dest="output", type=str, required=True, help=("output tag name (extensions will be added)")) parser.add_argument("--cpu", dest="cpu", type=int, default=1, help=("enable parallel computation")) parser.add_argument( "--img_report", dest="img_report", action="store_true", help= ("If true, it generates a summary image results with all the computed data" )) parser.add_argument( "--report_supports", dest="report_supports", action="store_true", help= ("If used, supported ref tree branches are individually reported for each gene tree " )) args = parser.parse_args(argv) if args.plot_newick: t = Tree(args.plot_newick) ts = TreeStyle() ts.layout_fn = info_layout t.render("tree_analysis.png", tree_style=ts) sys.exit(0) SPNAME_FIELD, SPNAME_DELIMITER = args.spname_field, args.spname_delimiter USE_COLLATERAL = args.use_collateral DETECT_DUPLICATIONS = True if not args.skip_dup_detection else False REPORT_PER_TREE_SUPPORTS = True if args.report_supports else False SP_OVERLAP = args.species_overlap DEBUG = args.debug IMG_REPORT = args.img_report reftree = PhyloTree(args.reftree, sp_naming_function=None) for nid, n in enumerate(reftree.traverse()): n.add_features(nid=nid) REFTREE_SPECIES = set(reftree.get_leaf_names()) print __DESCRIPTION__ if REPORT_PER_TREE_SUPPORTS: REPORT_SUPPORT_FILE = open("%s.gentree_supports" % 
args.output, "w") print >> REPORT_SUPPORT_FILE, '#' + '\t'.join( map(str, [ "treeId", "spCoverage", "mean_support", "mean_coll_support", "tested_branches", 'tested_coll_branches' ])) TOTAL_TREES = int( commands.getoutput("wc -l %s" % args.source_trees).split()[0]) + 1 print >> sys.stderr, "Processing %d source trees" % TOTAL_TREES if args.reftree_constraint: import imp constraint = imp.load_source('constraint', args.reftree_constraint) IS_VALID_TREEID = constraint.is_valid_treeid else: IS_VALID_TREEID = None if args.cpu > 1: MONITOR_STEP = 0 #return (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, # coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) # The output of the process_trees function are 9 dictionaries in which keys are refbranches target_dicts = [{} for x in range(9)] def merge_dict_results(target, source): def merge_dict(target, source): for k, v in source.iteritems(): if k not in target: target[k] = v elif isinstance(v, list): target[k].extend(v) elif isinstance(v, set): target[k].update(v) elif isinstance(v, int): target[k] += v else: raise ValueError("Impossible to merge str results") for index in xrange(len(target)): merge_dict(target[index], out[index]) from multiprocessing import Process, Queue from Queue import Empty as QueueEmpty outputs_queue = Queue() if TOTAL_TREES > args.cpu: trees_per_cpu = TOTAL_TREES / args.cpu trees_per_cpu += 1 if TOTAL_TREES % args.cpu else 0 else: trees_per_cpu = 1 args.cpu = TOTAL_TREES all_workers = set() for cpu_num in xrange(args.cpu): sline = (cpu_num * trees_per_cpu) eline = (cpu_num * trees_per_cpu) + trees_per_cpu data_iter = tree_iterator(args.source_trees, restrict_species=REFTREE_SPECIES, start_line=sline, end_line=eline) print >> sys.stderr, "Launching worker %d from %d to %d" % ( cpu_num, sline, eline) worker = Process(target=run_parallel, args=(cpu_num, outputs_queue, process_trees, data_iter, reftree, 
trees_per_cpu)) worker.name = "Worker_%d" % cpu_num all_workers.add(worker) worker.start() while all_workers: # clear done threads for w in list(all_workers): if not w.is_alive(): print >> sys.stderr, "%s thread is done!" % w.name all_workers.discard(w) # get and merge results while 1: try: out = outputs_queue.get(False) except QueueEmpty: break else: # This merge depends on process_trees return output!!!!! merge_dict_results(target_dicts, out) # Dump a snapshot dump_results(reftree, *target_dicts) time.sleep(0.1) if all_workers: time.sleep(1) # collected data (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) = target_dicts else: MONITOR_STEP = args.snapshot_step data_iter = tree_iterator(args.source_trees, restrict_species=REFTREE_SPECIES) (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) = process_trees(data_iter, reftree, TOTAL_TREES) if REPORT_PER_TREE_SUPPORTS: REPORT_SUPPORT_FILE.close() dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) print >> sys.stderr, "Dumping full analysis..." # Full dump, including duplication details cPickle.dump(reftree, open("%s.pkl" % args.output, "w"))
def main(argv): parser = argparse.ArgumentParser( description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) input_gr = parser.add_argument_group( "TREE INPUT OPTIONS\n=================") input_gr.add_argument( 'tree', metavar='tree_file', type=str, nargs=1, help='A tree file (or text string) in newick format.') input_gr.add_argument("--raxml", dest="raxml", action="store_true", help="""Process newick as raxml bootstrap values""") img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================") img_gr.add_argument("-m", "--mode", dest="mode", choices=["c", "r"], default="r", help="""(r)ectangular or (c)ircular visualization""") img_gr.add_argument( "-i", "--image", dest="image", type=str, help="Render tree image instead of showing it. A filename " " should be provided. PDF, SVG and PNG file extensions are" " supported (i.e. -i tree.svg)") img_gr.add_argument("--text", dest="text_mode", action="store_true", help="Shows the tree using ASCII characters") img_gr.add_argument( "--attr", "--show_attributes", dest="show_attributes", nargs="+", help="Display the value of the specified attributes, if available") img_gr.add_argument( "--Iw", "--width", dest="width", type=int, default=0, help="width of the rendered image in pixels (see --size-units).") img_gr.add_argument( "--Ih", "--height", dest="height", type=int, default=0, help="height of the rendered image in pixels (see --size-units).") img_gr.add_argument("--Ir", "--resolution", dest="resolution", type=int, default=300, help="Resolution if the tree image (DPI)") img_gr.add_argument("--Iu", "--size_units", dest="size_units", choices=["px", "mm", "in"], default="px", help="Units used to specify the size of the image." " (px:pixels, mm:millimeters, in:inches). 
") img_gr.add_argument( "-mbs", "--min_branch_separation", dest="branch_separation", type=int, default=3, help="Min number of pixels to separate branches vertically.") img_gr.add_argument("--ss", "--show_support", dest="show_support", action="store_true", help="""Shows branch bootstrap/support values""") img_gr.add_argument("--sbl", "--show_branch_length", dest="show_branch_length", action="store_true", help="""Show branch lengths.""") img_gr.add_argument( "--ft", "--force_topology", dest="force_topology", action="store_true", help="""Force branch length to have a minimum length in the image""") img_gr.add_argument("--hln", "--hide_leaf_names", dest="hide_leaf_names", action="store_true", help="""Hide leaf names.""") img_gr.add_argument( "--sin", "--show_internal_names", dest="show_internal_names", action="store_true", help="""Show the name attribute of all internal nodes.""") edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================") edit_gr.add_argument( "-r", "--root", dest="root", type=str, nargs="*", help="Roots the tree to the node grouping the list" " of node names provided (space separated). In example:" "'--root human rat mouse'") edit_gr.add_argument("-s", "--sort_branches", dest="sort", action="store_true", help="""Sort branches according to node names.""") edit_gr.add_argument("-l", "--ladderize", dest="ladderize", action="store_true", help="""Sort branches by partition size.""") edit_gr.add_argument("--color_by_rank", dest="color_by_rank", type=str, nargs="+", help="""If the attribute rank is present in nodes """) edit_gr.add_argument( "--ncbi", dest="ncbi", action="store_true", help=""" Annotate tree using the NCBI taxonomy database""") edit_gr.add_argument( "--taxid_attr", dest="taxid_attr", type=str, default="name", help="node attribute encoding for valid taxid numbers.") edit_gr.add_argument( "--taxid_attr_regexp", dest="taxid_attr_regexp", type=str, help= "If taxid number is encoded as part of another text string, i.e. 
gene name, use this argument to define a Perl regular expression to extract taxid numbers." ) phylo_gr = parser.add_argument_group( "PHYLOGENETIC OPTIONS\n=================") phylo_gr.add_argument( "--alg", dest="alg", type=str, help="""Link tree to a multiple sequence alignment.""") phylo_gr.add_argument( "--alg_format", dest="alg_format", type=str, default="fasta", help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""") phylo_gr.add_argument( "--sp_discovery", dest="species_discovery_regexp", type=str, default="^[^_]+_(.+)", help="Perl regular expression used to capture species" " code from node names. By default, node names" " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ") args = parser.parse_args(argv) tfile = args.tree[0] if args.ladderize and args.sort: raise ValueError( "--sort-branches and --ladderize options are mutually exclusive") if args.raxml: nw = re.sub(":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]", open(tfile).read()) t = PhyloTree(nw) else: t = PhyloTree(tfile) if args.ncbi: if args.taxid_attr_regexp: TAXIDMATCHER = re.compile(args.taxid_attr_regexp) for lf in t: if args.taxid_attr_regexp: lf.taxid = re.search(TAXIDMATCHER, getattr(lf, args.taxid_attr)).groups()[0] else: lf.taxid = getattr(lf, args.taxid_attr) t.annotate_ncbi_taxa(taxid_attr="taxid") if args.alg: t.link_to_alignment(args.alg, alg_format=args.alg_format) LEAF_ATTRIBUTES["sequence"] = 1 if args.species_discovery_regexp: SPCODE_REGEXP = re.compile(args.species_discovery_regexp) t.set_species_naming_function(user_species_naming_function) if args.ladderize: t.ladderize() if args.sort: t.sort_descendants() if args.root: if len(args.root) > 1: outgroup = t.get_common_ancestor(args.root) else: outgroup = t & args.root[0] t.set_outgroup(outgroup) # VISUALIZATION ts = TreeStyle() ts.mode = args.mode ts.show_leaf_name = False ts.branch_vertical_margin = args.branch_separation if args.show_support: ts.show_branch_support = True if args.show_branch_length: 
ts.show_branch_length = True if args.force_topology: ts.force_topology = True if args.hide_leaf_names: del LEAF_ATTRIBUTES["name"] if args.show_internal_names: INTERNAL_ATTRIBUTES["name"] = 1 # scale the tree if not args.height: args.height = None if not args.width: args.width = None if args.text_mode: print t.get_ascii(show_internal=args.show_internal_names, attributes=args.show_attributes) else: ts.layout_fn = master_layout if args.image: t.render(args.image, tree_style=ts, w=args.width, h=args.height, units=args.size_units) else: t.show(None, tree_style=ts)
def main(argv): parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) input_gr = parser.add_argument_group("TREE INPUT OPTIONS\n=================") input_gr.add_argument('tree', metavar='tree_file', type=str, nargs=1, help='A tree file (or text string) in newick format.') input_gr.add_argument("--raxml", dest="raxml", action="store_true", help="""Process newick as raxml bootstrap values""") img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================") img_gr.add_argument("-m", "--mode", dest="mode", choices=["c", "r"], default="r", help="""(r)ectangular or (c)ircular visualization""") img_gr.add_argument("-i", "--image", dest="image", type=str, help="Render tree image instead of showing it. A filename " " should be provided. PDF, SVG and PNG file extensions are" " supported (i.e. -i tree.svg)" ) img_gr.add_argument("--text", dest="text_mode", action="store_true", help="Shows the tree using ASCII characters") img_gr.add_argument("--attr", "--show_attributes", dest="show_attributes", nargs="+", help="Display the value of the specified attributes, if available") img_gr.add_argument("--Iw", "--width", dest="width", type=int, default=0, help="width of the rendered image in pixels (see --size-units)." ) img_gr.add_argument("--Ih", "--height", dest="height", type=int, default=0, help="height of the rendered image in pixels (see --size-units)." ) img_gr.add_argument("--Ir", "--resolution", dest="resolution", type=int, default=300, help="Resolution if the tree image (DPI)" ) img_gr.add_argument("--Iu", "--size_units", dest="size_units", choices=["px", "mm", "in"], default="px", help="Units used to specify the size of the image." " (px:pixels, mm:millimeters, in:inches). " ) img_gr.add_argument("-mbs", "--min_branch_separation", dest="branch_separation", type=int, default = 3, help="Min number of pixels to separate branches vertically." 
) img_gr.add_argument("--ss", "--show_support", dest="show_support", action="store_true", help="""Shows branch bootstrap/support values""") img_gr.add_argument("--sbl", "--show_branch_length", dest="show_branch_length", action="store_true", help="""Show branch lengths.""") img_gr.add_argument("--ft", "--force_topology", dest="force_topology", action="store_true", help="""Force branch length to have a minimum length in the image""") img_gr.add_argument("--hln", "--hide_leaf_names", dest="hide_leaf_names", action="store_true", help="""Hide leaf names.""") img_gr.add_argument("--sin", "--show_internal_names", dest="show_internal_names", action="store_true", help="""Show the name attribute of all internal nodes.""") edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================") edit_gr.add_argument("-r", "--root", dest="root", type=str, nargs="*", help="Roots the tree to the node grouping the list" " of node names provided (space separated). In example:" "'--root human rat mouse'") edit_gr.add_argument("-s", "--sort_branches", dest="sort", action="store_true", help="""Sort branches according to node names.""") edit_gr.add_argument("-l", "--ladderize", dest="ladderize", action="store_true", help="""Sort branches by partition size.""") edit_gr.add_argument("--color_by_rank", dest="color_by_rank", type=str, nargs="+", help="""If the attribute rank is present in nodes """) edit_gr.add_argument("--ncbi", dest="ncbi", action="store_true", help=""" Annotate tree using the NCBI taxonomy database""") edit_gr.add_argument("--taxid_attr", dest="taxid_attr", type=str, default="name", help="node attribute encoding for valid taxid numbers.") edit_gr.add_argument("--taxid_attr_regexp", dest="taxid_attr_regexp", type=str, help="If taxid number is encoded as part of another text string, i.e. 
gene name, use this argument to define a Perl regular expression to extract taxid numbers.") phylo_gr = parser.add_argument_group("PHYLOGENETIC OPTIONS\n=================") phylo_gr.add_argument("--alg", dest="alg", type=str, help="""Link tree to a multiple sequence alignment.""") phylo_gr.add_argument("--alg_format", dest="alg_format", type=str, default="fasta", help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""") phylo_gr.add_argument("--sp_discovery", dest="species_discovery_regexp", type=str, default="^[^_]+_(.+)", help="Perl regular expression used to capture species" " code from node names. By default, node names" " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ") args = parser.parse_args(argv) tfile = args.tree[0] if args.ladderize and args.sort: raise ValueError("--sort-branches and --ladderize options are mutually exclusive") if args.raxml: nw = re.sub(":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]", open(tfile).read()) t = PhyloTree(nw) else: t = PhyloTree(tfile) if args.ncbi: if args.taxid_attr_regexp: TAXIDMATCHER = re.compile(args.taxid_attr_regexp) for lf in t: if args.taxid_attr_regexp: lf.taxid = re.search(TAXIDMATCHER, getattr(lf, args.taxid_attr)).groups()[0] else: lf.taxid = getattr(lf, args.taxid_attr) t.annotate_ncbi_taxa(taxid_attr="taxid") if args.alg: t.link_to_alignment(args.alg, alg_format=args.alg_format) LEAF_ATTRIBUTES["sequence"] = 1 if args.species_discovery_regexp: SPCODE_REGEXP = re.compile(args.species_discovery_regexp) t.set_species_naming_function(user_species_naming_function) if args.ladderize: t.ladderize() if args.sort: t.sort_descendants() if args.root: if len(args.root) > 1: outgroup = t.get_common_ancestor(args.root) else: outgroup = t & args.root[0] t.set_outgroup(outgroup) # VISUALIZATION ts = TreeStyle() ts.mode = args.mode ts.show_leaf_name = False ts.branch_vertical_margin = args.branch_separation if args.show_support: ts.show_branch_support = True if args.show_branch_length: 
ts.show_branch_length = True if args.force_topology: ts.force_topology = True if args.hide_leaf_names: del LEAF_ATTRIBUTES["name"] if args.show_internal_names: INTERNAL_ATTRIBUTES["name"] = 1 # scale the tree if not args.height: args.height = None if not args.width: args.width = None if args.text_mode: print t.get_ascii(show_internal=args.show_internal_names, attributes = args.show_attributes) else: ts.layout_fn = master_layout if args.image: t.render(args.image, tree_style=ts, w=args.width, h=args.height, units=args.size_units) else: t.show(None, tree_style=ts)
def process_trees(iter_data, reftree, total_trees, thread_name=""): # cache some common data reftree_content = reftree.get_cached_content(store_attr="name") sorted_ref_branches = [(n, reftree_content[n]) for n in reftree.traverse("preorder")] refclades = [(n, reftree_content[n.children[0]], reftree_content[n.children[1]]) for n in reftree.traverse("preorder") if not n.is_leaf()] informed_branches = defaultdict(int) # How many trees were used to # inform about each refTree branch losses_per_branch = defaultdict(int) # Number of losses in each refTree branch coll_losses_per_branch = defaultdict(int) losses_per_dup_branch = defaultdict(list) # Number of losses for duplication # in each refTreeBranch coll_losses_per_dup_branch = defaultdict(list) dup_per_branch = defaultdict(list) # dUplication events sorted by # refTree branch coll_dup_per_branch = defaultdict(list) refbranch_supports = defaultdict(list) # gene tree support values for # each refTree branch coll_refbranch_supports = defaultdict(list) skipped_trees = 0 time0 = time.time() tracked_times = [] for tree_counter, (treeid, t, tree_content) in enumerate(iter_data): if DEBUG: print treeid, t ts = TreeStyle() ts.title.add_face(faces.TextFace("Seedid = %s"%treeid), 1) t.render("%s.png"%treeid, tree_style=ts) if tree_counter % 100 == 0: etime = time.time() - time0 tracked_times.append(etime) total_etime = ((total_trees - tree_counter) / 100.0) * numpy.mean(tracked_times) percent = (tree_counter / float(total_trees)) * 100 print >>sys.stderr, "\r%s% 10d (%0.1f%%) skipped trees:% 5d. Remaining time ~= %d min" %(thread_name, tree_counter, percent, skipped_trees, total_etime/60.) 
time0 = time.time() sys.stderr.flush() gc.collect() if tree_counter and MONITOR_STEP and tree_counter % MONITOR_STEP == 0: annotate_tree(reftree, informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) ts = TreeStyle() ts.layout_fn = info_layout reftree.render("temp_tree_analysis.png", tree_style=ts) # Compute support of this tree over the whole refTree seedid = None if USE_COLLATERAL else treeid seedsp = None if USE_COLLATERAL else extract_species(treeid) branch2supports, branch2coll_supports = get_supported_branches(t, tree_content, refclades=refclades, seedid=seedid) if branch2supports == {} and branch2coll_supports == {}: skipped_trees +=1 # We combine the information of all treeko trees, by averaging the # number of subtrees that supported or not a given refTree branch. for refbranch, supports in branch2supports.iteritems(): if IS_VALID_TREEID is None or IS_VALID_TREEID(treeid, extract_species(reftree_content[refbranch])): refbranch_supports[refbranch.nid].append(numpy.mean(supports)) for refbranch, coll_supports in branch2coll_supports.iteritems(): if IS_VALID_TREEID is None or IS_VALID_TREEID(treeid, extract_species(reftree_content[refbranch])): coll_refbranch_supports[refbranch.nid].append(numpy.mean(coll_supports)) all_observed_sp = extract_species([n.name for n in tree_content[t]]) if REPORT_PER_TREE_SUPPORTS: if branch2supports: mean_seed_support = numpy.mean([numpy.mean(branch2supports[_b]) for _b in branch2supports]) else: mean_seed_support = 0.0 if branch2coll_supports: mean_coll_support = numpy.mean([numpy.mean(branch2coll_supports[_b]) for _b in branch2coll_supports]) else: mean_coll_support = 0.0 species_coverage = float(len(all_observed_sp))/len(REFTREE_SPECIES) print >>REPORT_SUPPORT_FILE, '\t'.join(map(str, [treeid, species_coverage, mean_seed_support, mean_coll_support, len(branch2supports), 
len(branch2coll_supports)])) # Here I keep a counter on how many trees were potentially able to # inform about specific reftree branches. For instance, if outgroup # species X does not appear in a genetree, I dont want to count this # tree as a source for duplication in the X branch. if len(all_observed_sp) == 1: max_ref_branch = reftree.search_nodes(name=list(all_observed_sp)[0])[0] else: max_ref_branch = reftree.get_common_ancestor(all_observed_sp) for refbranch in max_ref_branch.traverse(): if IS_VALID_TREEID is None or IS_VALID_TREEID(treeid, extract_species(reftree_content[refbranch])): informed_branches[refbranch.nid] += 1 # Start analyzing internal nodes for node in t.traverse("preorder"): if node.is_leaf(): continue if len(node.children) != 2: print node raise ValueError("Binary trees are required") # Extract the species set at both sides of the node ch_left = node.children[0] ch_right = node.children[1] seqs_left = set([n.name for n in tree_content[ch_left]]) seqs_right = set([n.name for n in tree_content[ch_right]]) species_left = extract_species(seqs_left) species_right = extract_species(seqs_right) # Decide whether this node is a duplication or not if DETECT_DUPLICATIONS: if SP_OVERLAP == 0: isdup = True if species_left & species_right else False else: #overlap = len(species_left & species_right) / float(max(len(species_left), len(species_right))) overlap = len(species_left & species_right) / float(len(species_left | species_right)) isdup = True if overlap >= SP_OVERLAP else False if DEBUG and overlap: print species_left, species_right print len(species_left & species_right), float(len(species_left | species_right)) print overlap, isdup else: isdup = True if n.evoltype == "D" else False # if this is a dup or the root of tree, map the to node to its # corresponding refTree branch and infer the expected list of # species if isdup or node is t: observed_sp = species_left | species_right if len(observed_sp) == 1: ref_branch = 
reftree.search_nodes(name=list(observed_sp)[0])[0] else: ref_branch = reftree.get_common_ancestor(observed_sp) expected_sp = reftree_content[ref_branch] if isdup: if IS_VALID_TREEID is None or IS_VALID_TREEID(treeid, extract_species(reftree_content[ref_branch])): # updates duplications per branch in ref tree (dup rate analysis) if USE_COLLATERAL or seedsp in observed_sp: dup_per_branch[ref_branch.nid].append([seqs_left, seqs_right]) __seed = True elif not USE_COLLATERAL: coll_dup_per_branch[ref_branch.nid].append([seqs_left, seqs_right]) __seed = False # Count losses observed after a duplication or at the root of the tree. if isdup or node is t: # get a list of losses at both sides of the dupli if not isdup and node is t: losses_left = get_lost_branches(observed_sp, expected_sp, ref_branch, sorted_ref_branches) losses_right = [] else: losses_left = get_lost_branches(species_left, expected_sp, ref_branch, sorted_ref_branches) losses_right = get_lost_branches(species_right, expected_sp, ref_branch, sorted_ref_branches) if IS_VALID_TREEID is not None: losses_left = [branch for branch in losses_left if IS_VALID_TREEID(treeid, extract_species(reftree_content[branch]))] losses_right = [branch for branch in losses_right if IS_VALID_TREEID(treeid, extract_species(reftree_content[branch]))] if USE_COLLATERAL: losses = losses_left + losses_right coll_losses = [] else: if treeid in seqs_left: # if the seed species is not found at the other side of # the dup, we can assume that its losses will never be # counted, so we combine data from both sides. if seedsp not in species_right: losses = losses_left + losses_right # otherwise, we wait for info for a different seed tree else: losses = losses_left # No collateral information as data come from a duplication including the seed coll_losses = [] elif treeid in seqs_right: # if the seed species is not found at the other side of # the dup, we can assume that its losses will never be # counted, so we combine data from both sides. 
if seedsp not in species_left: losses = losses_left + losses_right # otherwise, we wait for info for a different seed tree else: losses = losses_right # No collateral information as data come from a duplication including the seed coll_losses = [] else: # If this is a collateral duplication, process losses as such losses = [] coll_losses = losses_left + losses_right if len(reftree_content[ref_branch]) == 1 and losses + coll_losses: raw_input("This should never happen") # update gene loss counters for lost_branch in losses: losses_per_branch[lost_branch.nid] += 1 if isdup: # if losses come from a dup event losses_per_dup_branch[ref_branch.nid].append(lost_branch) for lost_branch in coll_losses: coll_losses_per_branch[lost_branch.nid] += 1 if isdup: # if losses come from a dup event coll_losses_per_dup_branch[ref_branch.nid].append(lost_branch) return (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports)
def main(argv): global args parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-r", dest="reftree", type=str, required=True, help="""Reference tree""") parser.add_argument("--source_trees", dest="source_trees", type=str, required = True, help=("A list of *rooted* genetrees, one per line, in the format: TreeID/SeedID [TAB] newick ")) parser.add_argument("--plot_newick", dest="plot_newick", type=str, help=("")) parser.add_argument("--spname_delimiter", dest="spname_delimiter", type=str, default="_", help=("species code delimiter in node names")) parser.add_argument("--spname_field", dest="spname_field", type=int, default=-1, help=("position of the species code extracted from node names. -1 = last field")) parser.add_argument("--collateral", dest="use_collateral", action="store_true", help=("If enabled, collateral information will be used as" " equally qualified data. Otherwise, such data will" " be reported separatedly. Use this if your set of" " trees are not overlaping. ")) parser.add_argument("--skip_dup_detection", dest="skip_dup_detection", action="store_true", help=('If used, duplications will be expected to be annotated' ' in the source gene trees with the evoltype="D" tag.' ' Otherwise they will be inferred on the fly using' ' the species overlap algorithm.')) parser.add_argument("--spoverlap", dest="species_overlap", type=float, default=0.0, help=("Species overlap cutoff. A number between 0 and 1 " "representing the percentage of species that should be " "shared between two sister partitions to be considered a" " duplication. 0 = any overlap represents a duplication. 
")) parser.add_argument("--debug", dest="debug", action="store_true", help=("generate an image of every input gene tree tree, so the result can be inspected")) parser.add_argument("--snapshot_step", dest="snapshot_step", type=int, default=1000, help=("How many trees should be processed between snapshots dumps?")) parser.add_argument("--reftree_constraint", dest="reftree_constraint", type=str, help=("A python module from from which a function called " "*is_valid_treeid(treeid, refbranch)* should be importable. " "The function will be used to decide if the info of a given " "source tree is informative or not for each reftree branch. ")) parser.add_argument("-o", dest="output", type=str, required=True, help=("output tag name (extensions will be added)")) parser.add_argument("--cpu", dest="cpu", type=int, default=1, help=("enable parallel computation")) parser.add_argument("--img_report", dest="img_report", action="store_true", help=("If true, it generates a summary image results with all the computed data")) parser.add_argument("--report_supports", dest="report_supports", action="store_true", help=("If used, supported ref tree branches are individually reported for each gene tree ")) args = parser.parse_args(argv) if args.plot_newick: t = Tree(args.plot_newick) ts = TreeStyle() ts.layout_fn = info_layout t.render("tree_analysis.png", tree_style=ts) sys.exit(0) SPNAME_FIELD, SPNAME_DELIMITER = args.spname_field, args.spname_delimiter USE_COLLATERAL = args.use_collateral DETECT_DUPLICATIONS = True if not args.skip_dup_detection else False REPORT_PER_TREE_SUPPORTS = True if args.report_supports else False SP_OVERLAP = args.species_overlap DEBUG = args.debug IMG_REPORT = args.img_report reftree = PhyloTree(args.reftree, sp_naming_function=None) for nid, n in enumerate(reftree.traverse()): n.add_features(nid = nid) REFTREE_SPECIES = set(reftree.get_leaf_names()) print __DESCRIPTION__ if REPORT_PER_TREE_SUPPORTS: REPORT_SUPPORT_FILE = open("%s.gentree_supports" 
%args.output, "w") print >>REPORT_SUPPORT_FILE, '#'+'\t'.join(map(str, ["treeId", "spCoverage", "mean_support", "mean_coll_support", "tested_branches", 'tested_coll_branches'])) TOTAL_TREES = int(commands.getoutput("wc -l %s" %args.source_trees).split()[0]) + 1 print >>sys.stderr, "Processing %d source trees" %TOTAL_TREES if args.reftree_constraint: import imp constraint = imp.load_source('constraint', args.reftree_constraint) IS_VALID_TREEID = constraint.is_valid_treeid else: IS_VALID_TREEID = None if args.cpu > 1: MONITOR_STEP = 0 #return (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, # coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) # The output of the process_trees function are 9 dictionaries in which keys are refbranches target_dicts = [{} for x in range(9)] def merge_dict_results(target, source): def merge_dict(target, source): for k, v in source.iteritems(): if k not in target: target[k] = v elif isinstance(v, list): target[k].extend(v) elif isinstance(v, set): target[k].update(v) elif isinstance(v, int): target[k] += v else: raise ValueError("Impossible to merge str results") for index in xrange(len(target)): merge_dict(target[index], out[index]) from multiprocessing import Process, Queue from Queue import Empty as QueueEmpty outputs_queue = Queue() if TOTAL_TREES > args.cpu: trees_per_cpu = TOTAL_TREES / args.cpu trees_per_cpu += 1 if TOTAL_TREES % args.cpu else 0 else: trees_per_cpu = 1 args.cpu = TOTAL_TREES all_workers = set() for cpu_num in xrange(args.cpu): sline = (cpu_num*trees_per_cpu) eline = (cpu_num*trees_per_cpu) + trees_per_cpu data_iter = tree_iterator(args.source_trees, restrict_species=REFTREE_SPECIES, start_line=sline, end_line=eline) print >>sys.stderr, "Launching worker %d from %d to %d" %(cpu_num, sline, eline) worker = Process(target=run_parallel, args=(cpu_num, outputs_queue, process_trees, data_iter, reftree, trees_per_cpu)) 
worker.name = "Worker_%d" %cpu_num all_workers.add(worker) worker.start() while all_workers: # clear done threads for w in list(all_workers): if not w.is_alive(): print >>sys.stderr, "%s thread is done!" %w.name all_workers.discard(w) # get and merge results while 1: try: out = outputs_queue.get(False) except QueueEmpty: break else: # This merge depends on process_trees return output!!!!! merge_dict_results(target_dicts, out) # Dump a snapshot dump_results(reftree, *target_dicts) time.sleep(0.1) if all_workers: time.sleep(1) # collected data (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) = target_dicts else: MONITOR_STEP = args.snapshot_step data_iter = tree_iterator(args.source_trees, restrict_species=REFTREE_SPECIES) (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) = process_trees(data_iter, reftree, TOTAL_TREES) if REPORT_PER_TREE_SUPPORTS: REPORT_SUPPORT_FILE.close() dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports, coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports) print >>sys.stderr, "Dumping full analysis..." # Full dump, including duplication details cPickle.dump(reftree, open("%s.pkl"%args.output, "w"))
def main(argv): parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter) # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo. # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version) # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) ) # const - A constant value required by some action and nargs selections. # default - The value produced if the argument is absent from the command line. # type - The type to which the command-line argument should be converted. # choices - A container of the allowable values for the argument. # required - Whether or not the command-line option may be omitted (optionals only). # help - A brief description of what the argument does. # metavar - A name for the argument in usage messages. # dest - The name of the attribute to be added to the object returned by parse_args(). 
parser.add_argument("--show", dest="show_tree", action="store_true", help="""Display tree after the analysis.""") parser.add_argument("--render", dest="render", action="store_true", help="""Render tree.""") parser.add_argument("--dump", dest="dump", action="store_true", help="""Dump analysis""") parser.add_argument("--explore", dest="explore", type=str, help="""Reads a previously analyzed tree and visualize it""") input_args = parser.add_mutually_exclusive_group() input_args.required=True input_args.add_argument("-t", "--tree", dest="target_tree", nargs="+", type=str, help="""Tree file in newick format""") input_args.add_argument("-tf", dest="tree_list_file", type=str, help="File with the list of tree files") parser.add_argument("--tax", dest="tax_info", type=str, help="If the taxid attribute is not set in the" " newick file for all leaf nodes, a tab file file" " with the translation of name and taxid can be" " provided with this option.") parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str, help="If taxid is part of the leaf name, delimiter used to split the string") parser.add_argument("--sp_field", dest="sp_field", type=int, default=0, help="field position for taxid after splitting leaf names") parser.add_argument("--ref", dest="ref_tree", type=str, help="Uses ref tree to compute robinson foulds" " distances of the different subtrees") parser.add_argument("--rf-only", dest="rf_only", action = "store_true", help="Skip ncbi consensus analysis") parser.add_argument("--outgroup", dest="outgroup", type=str, nargs="+", help="A list of node names defining the trees outgroup") parser.add_argument("--is_sptree", dest="is_sptree", action = "store_true", help="Assumes no duplication nodes in the tree") parser.add_argument("-o", dest="output", type=str, help="Writes result into a file") parser.add_argument("--tax2name", dest="tax2name", type=str, help="") parser.add_argument("--tax2track", dest="tax2track", type=str, help="") 
parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true", help="") args = parser.parse_args(argv) if args.sp_delimiter: GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field] else: GET_TAXID = None reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else "" if args.explore: print >>sys.stderr, "Reading tree from file:", args.explore t = cPickle.load(open(args.explore)) ts = TreeStyle() ts.force_topology = True ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.show(tree_style=ts) print >>sys.stderr, "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) sys.exit() if args.output: OUT = open(args.output, "w") else: OUT = sys.stdout print >>sys.stderr, "Dumping results into", OUT target_trees = [] if args.tree_list_file: target_trees = [line.strip() for line in open(args.tree_list_file)] if args.target_tree: target_trees += args.target_tree prev_tree = None if args.tax2name: tax2name = cPickle.load(open(args.tax2name)) else: tax2name = {} if args.tax2track: tax2track = cPickle.load(open(args.tax2track)) else: tax2track = {} print len(tax2track), len(tax2name) header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips") print >>OUT, '|'.join([h.ljust(15) for h in header]) if args.ref_tree: print >>sys.stderr, "Reading ref tree from", args.ref_tree reft = Tree(args.ref_tree, format=1) else: reft = None SHOW_TREE = False if args.show_tree or args.render: SHOW_TREE = True prev_broken = set() ENTRIES = [] ncbi.connect_database() for tfile in target_trees: #print tfile t = PhyloTree(tfile, sp_naming_function=None) if GET_TAXID: for n in t.iter_leaves(): n.name = GET_TAXID(n.name) if args.outgroup: if len(args.outgroup) == 1: out = t & args.outgroup[0] else: out = t.get_common_ancestor(args.outgroup) if set(out.get_leaf_names()) ^ set(args.outgroup): raise ValueError("Outgroup is not 
monophyletic") t.set_outgroup(out) t.ladderize() if prev_tree: tree_compare(t, prev_tree) prev_tree = t if args.tax_info: tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track) if args.dump_tax_info: cPickle.dump(tax2track, open("tax2track.pkl", "w")) cPickle.dump(tax2name, open("tax2name.pkl", "w")) print "Tax info written into pickle files" else: for n in t.iter_leaves(): spcode = n.name n.add_features(taxid=spcode) n.add_features(species=spcode) tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track) # Split tree into species trees #subtrees = t.get_speciation_trees() if not args.rf_only: #print "Calculating tree subparts..." t1 = time.time() if not args.is_sptree: subtrees = t.split_by_dups() #print "Subparts:", len(subtrees), time.time()-t1 else: subtrees = [t] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE) #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf else: subtrees = [] valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0 ndups = 0 nsubtrees = len(subtrees) rf = 0 rf_max = 0 rf_std = 0 rf_med = 0 common_names = 0 max_size = 0 if reft and len(subtrees) == 1: rf = t.robinson_foulds(reft, attr_t1="realname") rf_max = rf[1] rf = rf[0] rf_med = rf elif reft: #print "Calculating avg RF..." 
nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"]) #print len(subtrees), "Sub-Species-trees found" avg_rf = [] rf_max = 0.0 # reft.robinson_foulds(reft)[1] sum_size = 0.0 print nsubtrees, "subtrees", ndups, "duplications" for ii, subt in enumerate(subtrees): print "\r%d" %ii, sys.stdout.flush() try: partial_rf = subt.robinson_foulds(reft, attr_t1="taxid") except ValueError: pass else: sptree_size = len(set([n.taxid for n in subt.iter_leaves()])) sum_size += sptree_size avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size) common_names = len(partial_rf[3]) max_size = max(max_size, sptree_size) rf_max = max(rf_max, partial_rf[1]) #print partial_rf[:2] rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist rf_std = numpy.std(avg_rf) rf_med = numpy.median(avg_rf) sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes)) iter_values = [os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std, rf_max, common_names] print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) fixed = sorted([n for n in prev_broken if n not in broken_clades]) new_problems = sorted(broken_clades - prev_broken) fixed_string = color(', '.join(fixed), "green") if fixed else "" problems_string = color(', '.join(new_problems), "red") if new_problems else "" OUT.write(" Fixed clades: %s\n" %fixed_string) if fixed else None OUT.write(" New broken: %s\n" %problems_string) if new_problems else None prev_broken = broken_clades ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups, broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string]) OUT.flush() if args.show_tree or args.render: ts = TreeStyle() ts.force_topology = True #ts.tree_width = 500 ts.show_leaf_name = False ts.layout_fn = ncbi_layout ts.mode = "r" t.dist = 0 if args.show_tree: #if args.hide_monophyletic: # tax2monophyletic = 
{} # n2content = t.get_node2content() # for node in t.traverse(): # term2count = defaultdict(int) # for leaf in n2content[node]: # if leaf.lineage: # for term in leaf.lineage: # term2count[term] += 1 # expected_size = len(n2content) # for term, count in term2count.iteritems(): # if count > 1 print "Showing tree..." t.show(tree_style=ts) else: t.render("img.svg", tree_style=ts, dpi=300) print "dumping color config" cPickle.dump(name2color, open("ncbi_colors.pkl", "w")) if args.dump: cPickle.dump(t, open("ncbi_analysis.pkl", "w")) print print HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades") print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER) if args.output: OUT.close()