def main(argv):
    """Load a newick tree, apply the requested edits and show or render it.

    Command-line driven entry point: the tree is read from the positional
    ``tree`` argument, optionally linked to an alignment, sorted, rooted,
    dumped as species subtrees, and finally rendered to an image file
    (``--image``) or shown through ``t.show``.

    Args:
        argv: list of command-line argument strings handed to
            ``parser.parse_args`` (i.e. without the program name).

    Raises:
        ValueError: if both ``--sort-branches`` and ``--ladderize`` are given.
    """
    # NOTE(review): the original bound SPCODE_REGEXP as a function local that
    # nothing read, so --sp-discovery had no visible effect.  It is presumably
    # meant to update a module-level pattern consulted by
    # user_species_naming_function -- confirm against that helper.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # add_argument() keyword reference:
    #   name or flags - Either a name or a list of option strings,
    #                   e.g. foo or -f, --foo.
    #   action   - The basic type of action to be taken when this argument is
    #              encountered at the command line (store, store_const,
    #              store_true, store_false, append, append_const, version).
    #   nargs    - The number of command-line arguments that should be
    #              consumed (N, ? (one or default), * (zero or more),
    #              + (one or more)).
    #   const    - A constant value required by some action and nargs
    #              selections.
    #   default  - The value produced if the argument is absent from the
    #              command line.
    #   type     - The type to which the command-line argument should be
    #              converted.
    #   choices  - A container of the allowable values for the argument.
    #   required - Whether or not the command-line option may be omitted
    #              (optionals only).
    #   help     - A brief description of what the argument does.
    #   metavar  - A name for the argument in usage messages.
    #   dest     - The name of the attribute to be added to the object
    #              returned by parse_args().

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")
    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # RAxML writes bootstrap values as ":<length>[<support>]"; rewrite
        # them as NHX "support" annotations before parsing.
        with open(tfile) as newick_file:
            raw_newick = newick_file.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # The context manager closes the dump even if a subtree fails to
        # serialise half-way through.
        with open(args.subtrees_output_file, "w") as OUT:
            for tree in treeiter:
                OUT.write("%s\n" % tree.write())

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        # pop() instead of del: do not crash if "name" was already removed.
        LEAF_ATTRIBUTES.pop("name", None)
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: a size of 0 means "not specified", passed on as None
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        # --resolution used to be parsed and then ignored; forward it.
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units, dpi=args.resolution)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        print("Processed Newick dumped into %s" % args.newick)
def main(argv):
    """Compare one or more trees against the NCBI taxonomy and report.

    For every target tree the function optionally renames leaves to taxids,
    roots and ladderizes the tree, annotates it with taxonomy information,
    runs ``analyze_subtrees`` (unless ``--rf-only``), optionally computes
    Robinson-Foulds distances against a reference tree, and writes one
    ``|``-separated row to the output.  A summary table of all trees is
    printed at the end.

    With ``--explore`` a previously pickled tree is shown and the process
    exits without analysing anything.

    Args:
        argv: list of command-line argument strings handed to
            ``parser.parse_args`` (i.e. without the program name).

    Raises:
        ValueError: if a multi-node ``--outgroup`` is not monophyletic.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # add_argument() keyword reference:
    #   name or flags - Either a name or a list of option strings,
    #                   e.g. foo or -f, --foo.
    #   action   - The basic type of action to be taken when this argument is
    #              encountered at the command line (store, store_const,
    #              store_true, store_false, append, append_const, version).
    #   nargs    - The number of command-line arguments that should be
    #              consumed (N, ? (one or default), * (zero or more),
    #              + (one or more)).
    #   const    - A constant value required by some action and nargs
    #              selections.
    #   default  - The value produced if the argument is absent from the
    #              command line.
    #   type     - The type to which the command-line argument should be
    #              converted.
    #   choices  - A container of the allowable values for the argument.
    #   required - Whether or not the command-line option may be omitted
    #              (optionals only).
    #   help     - A brief description of what the argument does.
    #   metavar  - A name for the argument in usage messages.
    #   dest     - The name of the attribute to be added to the object
    #              returned by parse_args().

    parser.add_argument(
        "--show", dest="show_tree", action="store_true",
        help="""Display tree after the analysis.""")
    parser.add_argument(
        "--render", dest="render", action="store_true",
        help="""Render tree.""")
    parser.add_argument(
        "--dump", dest="dump", action="store_true",
        help="""Dump analysis""")
    parser.add_argument(
        "--explore", dest="explore", type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument(
        "-t", "--tree", dest="target_tree", nargs="+", type=str,
        help="""Tree file in newick format""")
    input_args.add_argument(
        "-tf", dest="tree_list_file", type=str,
        help="File with the list of tree files")

    parser.add_argument(
        "--tax", dest="tax_info", type=str,
        help="If the taxid attribute is not set in the"
        " newick file for all leaf nodes, a tab file file"
        " with the translation of name and taxid can be"
        " provided with this option.")
    parser.add_argument(
        "--sp_delimiter", dest="sp_delimiter", type=str,
        help="If taxid is part of the leaf name, delimiter used to split the string")
    parser.add_argument(
        "--sp_field", dest="sp_field", type=int, default=0,
        help="field position for taxid after splitting leaf names")
    parser.add_argument(
        "--ref", dest="ref_tree", type=str,
        help="Uses ref tree to compute robinson foulds"
        " distances of the different subtrees")
    parser.add_argument(
        "--rf-only", dest="rf_only", action="store_true",
        help="Skip ncbi consensus analysis")
    parser.add_argument(
        "--outgroup", dest="outgroup", type=str, nargs="+",
        help="A list of node names defining the trees outgroup")
    parser.add_argument(
        "--is_sptree", dest="is_sptree", action="store_true",
        help="Assumes no duplication nodes in the tree")
    parser.add_argument(
        "-o", dest="output", type=str,
        help="Writes result into a file")
    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")
    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")
    parser.add_argument(
        "--dump_tax_info", dest="dump_tax_info", action="store_true",
        help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    if args.explore:
        sys.stderr.write("Reading tree from file: %s\n" % args.explore)
        # SECURITY NOTE: unpickling executes arbitrary code; only explore
        # files produced by a trusted run of this tool.
        with open(args.explore, "rb") as pickled_tree:
            t = cPickle.load(pickled_tree)
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        sys.stderr.write("dumping color config\n")
        with open("ncbi_colors.pkl", "wb") as colors_file:
            cPickle.dump(name2color, colors_file)
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    # try/finally so that a user-supplied output file is closed even when the
    # analysis of one of the trees raises.
    try:
        sys.stderr.write("Dumping results into %s\n" % OUT)

        target_trees = []
        if args.tree_list_file:
            with open(args.tree_list_file) as tree_list:
                target_trees = [line.strip() for line in tree_list]
        if args.target_tree:
            target_trees += args.target_tree
        prev_tree = None

        # SECURITY NOTE: both caches are pickles; load trusted files only.
        if args.tax2name:
            with open(args.tax2name, "rb") as cache_file:
                tax2name = cPickle.load(cache_file)
        else:
            tax2name = {}

        if args.tax2track:
            with open(args.tax2track, "rb") as cache_file:
                tax2track = cPickle.load(cache_file)
        else:
            tax2track = {}
        print("%d %d" % (len(tax2track), len(tax2name)))

        # One label per value written in each row below.  "Broken branches"
        # was missing from the original header, leaving every column after
        # "Broken clades" shifted by one.
        header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                  "Broken clades", "Broken branches", "Clade sizes",
                  "RF (avg)", "RF (med)", "RF (std)", "RF (max)",
                  "Shared tips")
        OUT.write('|'.join([h.ljust(15) for h in header]) + "\n")

        if args.ref_tree:
            sys.stderr.write("Reading ref tree from %s\n" % args.ref_tree)
            reft = Tree(args.ref_tree, format=1)
        else:
            reft = None

        SHOW_TREE = False
        if args.show_tree or args.render:
            SHOW_TREE = True

        prev_broken = set()
        ENTRIES = []
        ncbi.connect_database()
        for tfile in target_trees:
            t = PhyloTree(tfile, sp_naming_function=None)
            if GET_TAXID:
                for n in t.iter_leaves():
                    n.name = GET_TAXID(n.name)

            if args.outgroup:
                if len(args.outgroup) == 1:
                    out = t & args.outgroup[0]
                else:
                    out = t.get_common_ancestor(args.outgroup)
                    # Any leaf under the ancestor that was not requested (or
                    # vice versa) means the outgroup is not a clean clade.
                    if set(out.get_leaf_names()) ^ set(args.outgroup):
                        raise ValueError("Outgroup is not monophyletic")
                t.set_outgroup(out)
            t.ladderize()

            if prev_tree:
                tree_compare(t, prev_tree)
            prev_tree = t

            if args.tax_info:
                tax2name, tax2track = annotate_tree_with_taxa(
                    t, args.tax_info, tax2name, tax2track)
                if args.dump_tax_info:
                    with open("tax2track.pkl", "wb") as cache_file:
                        cPickle.dump(tax2track, cache_file)
                    with open("tax2name.pkl", "wb") as cache_file:
                        cPickle.dump(tax2name, cache_file)
                    print("Tax info written into pickle files")
            else:
                # No translation table: leaf names are used as taxids.
                for n in t.iter_leaves():
                    spcode = n.name
                    n.add_features(taxid=spcode)
                    n.add_features(species=spcode)
                tax2name, tax2track = annotate_tree_with_taxa(
                    t, None, tax2name, tax2track)

            # Split tree into species trees
            if not args.rf_only:
                if not args.is_sptree:
                    subtrees = t.split_by_dups()
                else:
                    subtrees = [t]

                (valid_subtrees, broken_subtrees, ncbi_mistakes,
                 broken_branches, total_rf, broken_clades,
                 broken_sizes) = analyze_subtrees(
                     t, subtrees, show_tree=SHOW_TREE)
            else:
                subtrees = []
                # The original unpacked six zeros into seven names, which
                # always raised ValueError.  broken_clades takes part in set
                # arithmetic below and broken_sizes is fed to numpy, so they
                # need an empty set / list rather than 0.
                valid_subtrees = broken_subtrees = ncbi_mistakes = 0
                broken_branches = total_rf = 0
                broken_clades = set()
                broken_sizes = []

            ndups = 0
            nsubtrees = len(subtrees)

            rf = 0
            rf_max = 0
            rf_std = 0
            rf_med = 0
            common_names = 0
            max_size = 0
            if reft and len(subtrees) == 1:
                rf_result = t.robinson_foulds(reft, attr_t1="realname")
                rf = rf_result[0]
                rf_max = rf_result[1]
                rf_med = rf
            elif reft:
                nsubtrees, ndups, subtrees = t.get_speciation_trees(
                    map_features=["taxid"])
                avg_rf = []
                rf_max = 0.0
                sum_size = 0.0
                print("%s subtrees %s duplications" % (nsubtrees, ndups))
                for ii, subt in enumerate(subtrees):
                    # Progress counter, rewritten in place on one line.
                    sys.stdout.write("\r%d" % ii)
                    sys.stdout.flush()
                    try:
                        partial_rf = subt.robinson_foulds(
                            reft, attr_t1="taxid")
                    except ValueError:
                        # Deliberate best-effort: subtrees that cannot be
                        # compared with the reference are left out.
                        pass
                    else:
                        sptree_size = len(
                            set([n.taxid for n in subt.iter_leaves()]))
                        sum_size += sptree_size
                        # Normalised RF weighted by the subtree size.
                        avg_rf.append(
                            (partial_rf[0] / float(partial_rf[1]))
                            * sptree_size)
                        common_names = len(partial_rf[3])
                        max_size = max(max_size, sptree_size)
                        rf_max = max(rf_max, partial_rf[1])

                rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
                rf_std = numpy.std(avg_rf)
                rf_med = numpy.median(avg_rf)

            sizes_info = "%0.1f/%0.1f +- %0.1f" % (
                numpy.mean(broken_sizes),
                numpy.median(broken_sizes),
                numpy.std(broken_sizes))
            iter_values = [
                os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
                ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                rf_std, rf_max, common_names
            ]
            OUT.write('|'.join(
                [str(value).strip().ljust(15) for value in iter_values])
                + "\n")

            # Clades broken in the previous tree but not here are "fixed";
            # clades broken only here are new problems.
            fixed = sorted([n for n in prev_broken if n not in broken_clades])
            new_problems = sorted(broken_clades - prev_broken)
            fixed_string = color(', '.join(fixed), "green") if fixed else ""
            problems_string = (color(', '.join(new_problems), "red")
                               if new_problems else "")
            if fixed:
                OUT.write(" Fixed clades: %s\n" % fixed_string)
            if new_problems:
                OUT.write(" New broken: %s\n" % problems_string)
            prev_broken = broken_clades
            ENTRIES.append([
                os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
                ncbi_mistakes, broken_branches, sizes_info, fixed_string,
                problems_string
            ])
            OUT.flush()

            if args.show_tree or args.render:
                ts = TreeStyle()
                ts.force_topology = True
                ts.show_leaf_name = False
                ts.layout_fn = ncbi_layout
                ts.mode = "r"
                t.dist = 0
                if args.show_tree:
                    print("Showing tree...")
                    t.show(tree_style=ts)
                else:
                    t.render("img.svg", tree_style=ts, dpi=300)
                print("dumping color config")
                with open("ncbi_colors.pkl", "wb") as colors_file:
                    cPickle.dump(name2color, colors_file)

            if args.dump:
                with open("ncbi_analysis.pkl", "wb") as analysis_file:
                    cPickle.dump(t, analysis_file)

        # Two blank lines before the summary table.
        print("")
        print("")
        HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                  "Broken clades", "Broken branches", "Clade sizes",
                  "Fixed Groups", "New Broken Clades")
        print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)
    finally:
        if args.output:
            OUT.close()
def main(argv):
    """Load a newick tree, apply the requested edits and show or render it.

    Command-line driven entry point: the tree is read from the positional
    ``tree`` argument, optionally linked to an alignment, sorted, rooted,
    dumped as species subtrees, and finally rendered to an image file
    (``--image``) or shown through ``t.show``.

    Args:
        argv: list of command-line argument strings handed to
            ``parser.parse_args`` (i.e. without the program name).

    Raises:
        ValueError: if both ``--sort-branches`` and ``--ladderize`` are given.
    """
    # NOTE(review): the original bound SPCODE_REGEXP as a function local that
    # nothing read, so --sp-discovery had no visible effect.  It is presumably
    # meant to update a module-level pattern consulted by
    # user_species_naming_function -- confirm against that helper.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # add_argument() keyword reference:
    #   name or flags - Either a name or a list of option strings,
    #                   e.g. foo or -f, --foo.
    #   action   - The basic type of action to be taken when this argument is
    #              encountered at the command line (store, store_const,
    #              store_true, store_false, append, append_const, version).
    #   nargs    - The number of command-line arguments that should be
    #              consumed (N, ? (one or default), * (zero or more),
    #              + (one or more)).
    #   const    - A constant value required by some action and nargs
    #              selections.
    #   default  - The value produced if the argument is absent from the
    #              command line.
    #   type     - The type to which the command-line argument should be
    #              converted.
    #   choices  - A container of the allowable values for the argument.
    #   required - Whether or not the command-line option may be omitted
    #              (optionals only).
    #   help     - A brief description of what the argument does.
    #   metavar  - A name for the argument in usage messages.
    #   dest     - The name of the attribute to be added to the object
    #              returned by parse_args().

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")
    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # RAxML writes bootstrap values as ":<length>[<support>]"; rewrite
        # them as NHX "support" annotations before parsing.
        with open(tfile) as newick_file:
            raw_newick = newick_file.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # The context manager closes the dump even if a subtree fails to
        # serialise half-way through.
        with open(args.subtrees_output_file, "w") as OUT:
            for tree in treeiter:
                OUT.write("%s\n" % tree.write())

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        # pop() instead of del: do not crash if "name" was already removed.
        LEAF_ATTRIBUTES.pop("name", None)
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: a size of 0 means "not specified", passed on as None
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        # --resolution used to be parsed and then ignored; forward it.
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units, dpi=args.resolution)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        print("Processed Newick dumped into %s" % args.newick)
def main(argv):
    """Load a newick tree, annotate/edit it and display it.

    Command-line driven entry point: the tree is read from the positional
    ``tree`` argument, optionally annotated with NCBI taxonomy data, linked
    to an alignment, sorted and rooted.  It is then printed as ASCII
    (``--text``), rendered to an image file (``--image``) or shown through
    ``t.show``.

    Args:
        argv: list of command-line argument strings handed to
            ``parser.parse_args`` (i.e. without the program name).

    Raises:
        ValueError: if both ``--sort_branches`` and ``--ladderize`` are
            given, or if ``--taxid_attr_regexp`` does not match the taxid
            attribute of some leaf.
    """
    # NOTE(review): the original bound SPCODE_REGEXP as a function local that
    # nothing read, so --sp_discovery had no visible effect.  It is presumably
    # meant to update a module-level pattern consulted by
    # user_species_naming_function -- confirm against that helper.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--text", dest="text_mode", action="store_true",
        help="Shows the tree using ASCII characters")
    img_gr.add_argument(
        "--attr", "--show_attributes", dest="show_attributes", nargs="+",
        help="Display the value of the specified attributes, if available")
    # The two help strings below referred to "--size-units", which this
    # parser does not define; the option is spelled "--size_units".
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size_units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size_units).")
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size_units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min_branch_separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show_support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--show_branch_length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force_topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide_leaf_names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show_internal_names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort_branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)
    edit_gr.add_argument(
        "--ncbi", dest="ncbi", action="store_true",
        help=""" Annotate tree using the NCBI taxonomy database""")
    edit_gr.add_argument(
        "--taxid_attr", dest="taxid_attr", type=str, default="name",
        help="node attribute encoding for valid taxid numbers.")
    edit_gr.add_argument(
        "--taxid_attr_regexp", dest="taxid_attr_regexp", type=str,
        help="If taxid number is encoded as part of another text string, i.e. gene name, use this argument to define a Perl regular expression to extract taxid numbers.")

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Link tree to a multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg_format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp_discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression used to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        # The message used to name "--sort-branches", an option this parser
        # does not define.
        raise ValueError(
            "--sort_branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # RAxML writes bootstrap values as ":<length>[<support>]"; rewrite
        # them as NHX "support" annotations before parsing.
        with open(tfile) as newick_file:
            raw_newick = newick_file.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.ncbi:
        if args.taxid_attr_regexp:
            taxid_matcher = re.compile(args.taxid_attr_regexp)
        else:
            taxid_matcher = None

        for lf in t:
            raw_value = getattr(lf, args.taxid_attr)
            if taxid_matcher is None:
                lf.taxid = raw_value
                continue
            match = taxid_matcher.search(raw_value)
            if match is None:
                # Previously surfaced as an opaque AttributeError on None.
                raise ValueError(
                    "--taxid_attr_regexp %r does not match %r"
                    % (args.taxid_attr_regexp, raw_value))
            lf.taxid = match.groups()[0]
        t.annotate_ncbi_taxa(taxid_attr="taxid")

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        # pop() instead of del: do not crash if "name" was already removed.
        LEAF_ATTRIBUTES.pop("name", None)
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: a size of 0 means "not specified", passed on as None
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    if args.text_mode:
        print(t.get_ascii(show_internal=args.show_internal_names,
                          attributes=args.show_attributes))
    else:
        ts.layout_fn = master_layout
        if args.image:
            # --resolution used to be parsed and then ignored; forward it.
            t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                     units=args.size_units, dpi=args.resolution)
        else:
            t.show(None, tree_style=ts)
def main(argv):
    """Parse command-line options, then print, render or show a tree.

    The tree is loaded as a ``PhyloTree``.  Depending on the options it is
    annotated (NCBI taxa, linked alignment, species naming function),
    re-arranged (ladderize / sort / re-root) and finally either printed as
    ASCII text, rendered to an image file, or opened with ``t.show``.

    Args:
        argv: list of command-line argument strings, handed unchanged to
            ``ArgumentParser.parse_args``.

    Raises:
        ValueError: if both the sort and the ladderize options are given, or
            if ``--taxid_attr_regexp`` does not match a leaf attribute.
    """
    # NOTE(review): the compiled species pattern was previously bound to an
    # unused local.  It is presumably the module-level pattern read by
    # ``user_species_naming_function`` -- confirm against that helper.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # ---- input options ---------------------------------------------------
    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    # ---- image options ---------------------------------------------------
    img_gr = parser.add_argument_group(
        "TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--text", dest="text_mode", action="store_true",
        help="Shows the tree using ASCII characters")
    img_gr.add_argument(
        "--attr", "--show_attributes", dest="show_attributes", nargs="+",
        help="Display the value of the specified attributes, if available")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size_units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min_branch_separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show_support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--show_branch_length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force_topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide_leaf_names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show_internal_names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    # ---- edit options ----------------------------------------------------
    edit_gr = parser.add_argument_group(
        "TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort_branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)
    edit_gr.add_argument(
        "--ncbi", dest="ncbi", action="store_true",
        help=""" Annotate tree using the NCBI taxonomy database""")
    edit_gr.add_argument(
        "--taxid_attr", dest="taxid_attr", type=str, default="name",
        help="node attribute encoding for valid taxid numbers.")
    edit_gr.add_argument(
        "--taxid_attr_regexp", dest="taxid_attr_regexp", type=str,
        help="If taxid number is encoded as part of another text string,"
        " i.e. gene name, use this argument to define a Perl regular"
        " expression to extract taxid numbers.")

    # ---- phylogenetic options --------------------------------------------
    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Link tree to a multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg_format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp_discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression used to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        # The message names the flag exactly as this parser defines it.
        raise ValueError(
            "--sort_branches and --ladderize options are mutually exclusive")

    # ---- load ------------------------------------------------------------
    if args.raxml:
        # Rewrite "dist[support]" branch labels into NHX support comments.
        with open(tfile) as newick_file:
            raw_newick = newick_file.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]",
                    r":\1[&&NHX:support=\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    # ---- annotate --------------------------------------------------------
    if args.ncbi:
        taxid_matcher = None
        if args.taxid_attr_regexp:
            taxid_matcher = re.compile(args.taxid_attr_regexp)
        for lf in t:
            raw_value = getattr(lf, args.taxid_attr)
            if taxid_matcher is None:
                lf.taxid = raw_value
                continue
            found = taxid_matcher.search(raw_value)
            if found is None:
                # Fail with a readable message instead of an AttributeError
                # on ``None.groups()``.
                raise ValueError(
                    "--taxid_attr_regexp %r does not match %r"
                    % (args.taxid_attr_regexp, raw_value))
            lf.taxid = found.groups()[0]
        t.annotate_ncbi_taxa(taxid_attr="taxid")

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # ---- re-arrange ------------------------------------------------------
    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # ---- visualization ---------------------------------------------------
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        # pop() keeps repeated calls from failing once the key is gone.
        LEAF_ATTRIBUTES.pop("name", None)
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # A size of 0 (the option default) is handed to render() as None.
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    if args.text_mode:
        print(t.get_ascii(show_internal=args.show_internal_names,
                          attributes=args.show_attributes))
        return

    ts.layout_fn = master_layout
    if args.image:
        # --resolution used to be parsed and then ignored; forward it.
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units, dpi=args.resolution)
    else:
        t.show(None, tree_style=ts)
def main(argv):
    """Analyze target trees and report one summary row per tree.

    For every target tree the function loads it as a ``PhyloTree``,
    optionally re-roots it, annotates it through ``annotate_tree_with_taxa``,
    runs ``analyze_subtrees`` (unless ``--rf-only``), optionally computes
    Robinson-Foulds figures against ``--ref``, and writes a ``|``-separated
    row to the output.  A summary table is printed with ``print_table`` once
    all trees are processed.

    With ``--explore`` a pickled tree is loaded and shown, after which the
    process exits.

    Args:
        argv: list of command-line argument strings, handed unchanged to
            ``ArgumentParser.parse_args``.

    Raises:
        ValueError: if several ``--outgroup`` names are given and their
            common ancestor contains other leaves.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        "--show", dest="show_tree", action="store_true",
        help="""Display tree after the analysis.""")
    parser.add_argument(
        "--render", dest="render", action="store_true",
        help="""Render tree.""")
    parser.add_argument(
        "--dump", dest="dump", action="store_true",
        help="""Dump analysis""")
    parser.add_argument(
        "--explore", dest="explore", type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument(
        "-t", "--tree", dest="target_tree", nargs="+", type=str,
        help="""Tree file in newick format""")
    input_args.add_argument(
        "-tf", dest="tree_list_file", type=str,
        help="File with the list of tree files")

    parser.add_argument(
        "--tax", dest="tax_info", type=str,
        help="If the taxid attribute is not set in the"
        " newick file for all leaf nodes, a tab file file"
        " with the translation of name and taxid can be"
        " provided with this option.")
    parser.add_argument(
        "--sp_delimiter", dest="sp_delimiter", type=str,
        help="If taxid is part of the leaf name, delimiter used to split"
        " the string")
    parser.add_argument(
        "--sp_field", dest="sp_field", type=int, default=0,
        help="field position for taxid after splitting leaf names")
    parser.add_argument(
        "--ref", dest="ref_tree", type=str,
        help="Uses ref tree to compute robinson foulds"
        " distances of the different subtrees")
    parser.add_argument(
        "--rf-only", dest="rf_only", action="store_true",
        help="Skip ncbi consensus analysis")
    parser.add_argument(
        "--outgroup", dest="outgroup", type=str, nargs="+",
        help="A list of node names defining the trees outgroup")
    parser.add_argument(
        "--is_sptree", dest="is_sptree", action="store_true",
        help="Assumes no duplication nodes in the tree")
    parser.add_argument(
        "-o", dest="output", type=str,
        help="Writes result into a file")
    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")
    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")
    parser.add_argument(
        "--dump_tax_info", dest="dump_tax_info", action="store_true",
        help="")

    args = parser.parse_args(argv)

    # Optional leaf-name -> taxid extractor driven by --sp_delimiter.
    get_taxid = None
    if args.sp_delimiter:
        def get_taxid(leaf_name):
            return leaf_name.split(args.sp_delimiter)[args.sp_field]

    # NOTE: every cPickle.load below runs on a user-supplied path; unpickling
    # can execute arbitrary code, so only trusted files should be passed.
    if args.explore:
        sys.stderr.write("Reading tree from file: %s\n" % args.explore)
        with open(args.explore) as handle:
            t = cPickle.load(handle)
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        sys.stderr.write("dumping color config\n")
        with open("ncbi_colors.pkl", "w") as handle:
            cPickle.dump(name2color, handle)
        sys.exit()

    if args.output:
        out = open(args.output, "w")
    else:
        out = sys.stdout
    sys.stderr.write("Dumping results into %s\n" % out)

    target_trees = []
    if args.tree_list_file:
        with open(args.tree_list_file) as handle:
            # Blank lines are not tree paths; skip them.
            target_trees = [ln.strip() for ln in handle if ln.strip()]
    if args.target_tree:
        target_trees += args.target_tree

    if args.tax2name:
        with open(args.tax2name) as handle:
            tax2name = cPickle.load(handle)
    else:
        tax2name = {}

    if args.tax2track:
        with open(args.tax2track) as handle:
            tax2track = cPickle.load(handle)
    else:
        tax2track = {}
    print("%d %d" % (len(tax2track), len(tax2name)))

    # One title per value written for each tree below.  "Broken branches"
    # was missing, leaving 11 titles over 12 columns.
    row_header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                  "Broken clades", "Broken branches", "Clade sizes",
                  "RF (avg)", "RF (med)", "RF (std)", "RF (max)",
                  "Shared tips")
    out.write('|'.join([h.ljust(15) for h in row_header]) + "\n")

    if args.ref_tree:
        sys.stderr.write("Reading ref tree from %s\n" % args.ref_tree)
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    show_or_render = bool(args.show_tree or args.render)

    prev_tree = None
    prev_broken = set()
    entries = []
    ncbi.connect_database()
    for tfile in target_trees:
        t = PhyloTree(tfile, sp_naming_function=None)
        if get_taxid:
            for n in t.iter_leaves():
                n.name = get_taxid(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out_node = t & args.outgroup[0]
            else:
                out_node = t.get_common_ancestor(args.outgroup)
                if set(out_node.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
            t.set_outgroup(out_node)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                with open("tax2track.pkl", "w") as handle:
                    cPickle.dump(tax2track, handle)
                with open("tax2name.pkl", "w") as handle:
                    cPickle.dump(tax2name, handle)
                print("Tax info written into pickle files")
        else:
            # Without a translation table the leaf name is used as both
            # taxid and species.
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        if not args.rf_only:
            if not args.is_sptree:
                subtrees = t.split_by_dups()
            else:
                subtrees = [t]
            (valid_subtrees, broken_subtrees, ncbi_mistakes,
             broken_branches, total_rf, broken_clades,
             broken_sizes) = analyze_subtrees(
                 t, subtrees, show_tree=show_or_render)
        else:
            subtrees = []
            # Seven placeholders for seven names (there used to be six,
            # which raised ValueError).  broken_clades must be a set and
            # broken_sizes a sequence for the code further down.
            (valid_subtrees, broken_subtrees, ncbi_mistakes,
             broken_branches, total_rf, broken_clades,
             broken_sizes) = 0, 0, 0, 0, 0, set(), []

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        if reft and len(subtrees) == 1:
            rf_result = t.robinson_foulds(reft, attr_t1="realname")
            rf = rf_result[0]
            rf_max = rf_result[1]
            rf_med = rf
        elif reft:
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            avg_rf = []
            rf_max = 0.0
            sum_size = 0.0
            print("%s subtrees %s duplications" % (nsubtrees, ndups))

            for ii, subt in enumerate(subtrees):
                # In-place progress counter on stdout.
                sys.stdout.write("\r%d " % ii)
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    # Subtrees that cannot be compared are left out.
                    continue
                sptree_size = len({n.taxid for n in subt.iter_leaves()})
                sum_size += sptree_size
                # Normalized RF weighted by the number of distinct taxids.
                avg_rf.append(
                    (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                common_names = len(partial_rf[3])
                rf_max = max(rf_max, partial_rf[1])
            rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (
            numpy.mean(broken_sizes),
            numpy.median(broken_sizes),
            numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                       broken_subtrees, ncbi_mistakes, broken_branches,
                       sizes_info, rf, rf_med, rf_std, rf_max, common_names]
        out.write(
            '|'.join(str(v).strip().ljust(15) for v in iter_values) + "\n")

        # Differences in broken clades relative to the previous tree.
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = (color(', '.join(new_problems), "red")
                           if new_problems else "")
        if fixed:
            out.write(" Fixed clades: %s\n" % fixed_string)
        if new_problems:
            out.write(" New broken: %s\n" % problems_string)
        prev_broken = broken_clades

        entries.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches,
                        sizes_info, fixed_string, problems_string])
        out.flush()

        if show_or_render:
            ts = TreeStyle()
            ts.force_topology = True
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                print("Showing tree...")
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print("dumping color config")
            with open("ncbi_colors.pkl", "w") as handle:
                cPickle.dump(name2color, handle)

        if args.dump:
            with open("ncbi_analysis.pkl", "w") as handle:
                cPickle.dump(t, handle)

    print("")
    print("")
    table_header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
                    "Broken clades", "Broken branches", "Clade sizes",
                    "Fixed Groups", "New Broken Clades")
    print_table(entries, max_col_width=50, row_line=True,
                header=table_header)
    if args.output:
        out.close()