def main(argv):
    """Command-line entry point: load a newick tree, edit it and draw it.

    Args:
        argv: list of command-line arguments (without the program name);
            passed unchanged to ``ArgumentParser.parse_args``.

    Raises:
        ValueError: if --sort-branches and --ladderize are both requested.
    """
    # The compiled pattern has to be stored in the module-level name; a
    # plain assignment would only create a local that nothing reads.
    # NOTE(review): presumably user_species_naming_function reads
    # SPCODE_REGEXP from module scope -- confirm against its definition.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # add_argument() keyword reference:
    #   name or flags - a name or a list of option strings (foo, -f, --foo)
    #   action   - store, store_const, store_true, store_false, append,
    #              append_const, version
    #   nargs    - N, ? (one or default), * (zero or more), + (one or more)
    #   const    - constant required by some action/nargs selections
    #   default  - value produced when the argument is absent
    #   type     - type the command-line string is converted to
    #   choices  - container of allowed values
    #   required - whether an optional may be omitted
    #   help     - brief description of the argument
    #   metavar  - name for the argument in usage messages
    #   dest     - attribute name on the object returned by parse_args()

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group(
        "TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    # NOTE(review): --resolution is parsed but never used below; it looks
    # like it was meant to be forwarded to t.render() -- confirm the
    # keyword that render() accepts before wiring it in.
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support",
        action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group(
        "TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")
    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # Rewrite raxml-style ":<length>[<bootstrap>]" annotations as NHX
        # support tags before the tree is parsed.
        with open(tfile) as handle:
            raw_newick = handle.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # The context manager closes the file even if a subtree fails to
        # serialise half-way through.
        with open(args.subtrees_output_file, "w") as out:
            for tree in treeiter:
                out.write("%s\n" % tree.write())

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        # pop() instead of del: calling main() twice must not raise KeyError.
        LEAF_ATTRIBUTES.pop("name", None)
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # A size of 0 means "not specified"; it is handed to render() as None.
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        print("Processed Newick dumped into %s" % args.newick)
def main(argv):
    """Command-line entry point: load a newick tree, edit it and draw it.

    Args:
        argv: list of command-line arguments (without the program name);
            passed unchanged to ``ArgumentParser.parse_args``.

    Raises:
        ValueError: if --sort-branches and --ladderize are both requested.
    """
    # The compiled pattern has to be stored in the module-level name; a
    # plain assignment would only create a local that nothing reads.
    # NOTE(review): presumably user_species_naming_function reads
    # SPCODE_REGEXP from module scope -- confirm against its definition.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # add_argument() keyword reference:
    #   name or flags - a name or a list of option strings (foo, -f, --foo)
    #   action   - store, store_const, store_true, store_false, append,
    #              append_const, version
    #   nargs    - N, ? (one or default), * (zero or more), + (one or more)
    #   const    - constant required by some action/nargs selections
    #   default  - value produced when the argument is absent
    #   type     - type the command-line string is converted to
    #   choices  - container of allowed values
    #   required - whether an optional may be omitted
    #   help     - brief description of the argument
    #   metavar  - name for the argument in usage messages
    #   dest     - attribute name on the object returned by parse_args()

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")
    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group(
        "TREE IMAGE OPTIONS\n=================")
    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")
    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")
    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")
    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")
    # NOTE(review): --resolution is parsed but never used below; it looks
    # like it was meant to be forwarded to t.render() -- confirm the
    # keyword that render() accepts before wiring it in.
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")
    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")
    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")
    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support",
        action="store_true",
        help="""Shows branch bootstrap/support values""")
    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")
    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")
    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")
    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group(
        "TREE EDIT OPTIONS\n=================")
    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")
    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")
    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")
    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")
    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # Rewrite raxml-style ":<length>[<bootstrap>]" annotations as NHX
        # support tags before the tree is parsed.
        with open(tfile) as handle:
            raw_newick = handle.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # The context manager closes the file even if a subtree fails to
        # serialise half-way through.
        with open(args.subtrees_output_file, "w") as out:
            for tree in treeiter:
                out.write("%s\n" % tree.write())

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
    if args.hide_leaf_names:
        # pop() instead of del: calling main() twice must not raise KeyError.
        LEAF_ATTRIBUTES.pop("name", None)
    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # A size of 0 means "not specified"; it is handed to render() as None.
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        print("Processed Newick dumped into %s" % args.newick)
def main(argv):
    """Command-line entry point for querying the NCBI taxonomy database.

    Translates species names and/or taxids, optionally dumps lineage
    information (--info), writes a pruned taxonomy tree to
    ``your_ncbi_query.*.nw`` files (--taxonomy) and/or annotates a
    reference tree (--reftree).

    Args:
        argv: list of command-line arguments (without the program name);
            passed unchanged to ``ArgumentParser.parse_args``.

    Exits with status 0 after printing usage when none of --taxonomy,
    --info or --reftree is given.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)
    parser.add_argument(
        "--db", dest="dbfile", type=str,
        help="""NCBI sqlite3 db file.""")
    parser.add_argument(
        "-t", "--taxid", dest="taxid", nargs="+", type=int,
        help="""taxids (space separated)""")
    parser.add_argument(
        "-tf", "--taxid_file", dest="taxid_file", type=str,
        help="""file containing a list of taxids (one per line)""")
    parser.add_argument(
        "-r", "--reftree", dest="reftree", type=str,
        help="""tree file containing taxids as node names.""")
    parser.add_argument(
        "--reftree_attr", dest="reftree_attr", type=str, default="name",
        help="""Where taxid should be read from""")
    parser.add_argument(
        "-n", "--name", dest="names", nargs="+", type=str,
        help="""species or taxa names (comma separated)""")
    # The help text used to say "taxids"; this file is read as names.
    parser.add_argument(
        "-nf", "--names_file", dest="names_file", type=str,
        help="""file containing a list of names (one per line)""")
    parser.add_argument(
        "-x", "--taxonomy", dest="taxonomy", action="store_true",
        help=("returns a pruned version of the NCBI taxonomy"
              " tree containing target species"))
    parser.add_argument(
        "--show_tree", dest="show_tree", action="store_true",
        help="""shows the NCBI taxonomy tree of the provided species""")
    parser.add_argument(
        "--collapse_subspecies", dest="collapse_subspecies",
        action="store_true",
        help=("When used, all nodes under the the species rank"
              " are collapsed, so all species and subspecies"
              " are seen as sister nodes"))
    parser.add_argument(
        "--rank_limit", dest="rank_limit", type=str,
        help=("When used, all nodes under the provided rank"
              " are discarded"))
    parser.add_argument(
        "--full_lineage", dest="full_lineage", action="store_true",
        help=("When used, topology is not pruned to avoid "
              " one-child-nodes, so the complete lineage"
              " track leading from root to tips is kept."))
    parser.add_argument(
        "-i", "--info", dest="info", action="store_true",
        help="""shows NCBI information about the species""")
    parser.add_argument(
        "--fuzzy", dest="fuzzy", type=float,
        help=("Tries a fuzzy (and SLOW) search for those"
              " species names that could not be translated"
              " into taxids. A float number must be provided"
              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    if not args.taxonomy and not args.info and not args.reftree:
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set()
    all_taxids = []

    if args.names_file:
        # splitlines() copes with \n, \r\n and \r endings, so the "U" open
        # mode (removed in recent Python 3) is not needed.
        with open(args.names_file) as handle:
            all_names.update(
                line.strip() for line in handle.read().splitlines())
    if args.names:
        all_names.update(
            name.strip() for name in " ".join(args.names).split(","))
    all_names.discard("")

    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            # Load the Levenshtein extension once, not once per name. The
            # path is anchored on MODULE_PATH like the connect() call
            # above (the previous lowercase `module_path` is not a name
            # used anywhere else in this function) and is bound as a
            # parameter instead of being formatted into the SQL text.
            c.enable_load_extension(True)
            c.execute(
                "select load_extension(?)",
                (os.path.join(MODULE_PATH,
                              "SQLite-Levenshtein/levenshtein.sqlext"),))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print("\t".join(
                map(str, [score, name, realname.capitalize(), taxid])))

    if args.taxid_file:
        with open(args.taxid_file) as handle:
            all_taxids.extend(
                line.strip() for line in handle.read().splitlines())
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            set(getattr(leaf, args.reftree_attr)
                for leaf in reftree.iter_leaves()))

    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" % len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print("\t".join(map(str, [
                merge_conversion.get(int(taxid), taxid),
                name, named_lineage, lineage])))

        # Compare as strings: ids read from files are str, -t ids are int.
        missing = set(map(str, all_taxids)) - set(map(str, translator))
        for notfound in missing:
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" % len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse()
                             if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # Copy the species node so that it becomes a sister of
                    # everything that used to hang below it.
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        # The message now matches the file names written just below.
        print("\n\n ===== Newick files saved as 'your_ncbi_query.*' ===== ")
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9,
                features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"],
                outfile="your_ncbi_query.extended.nw")
        # The last dump uses bare taxids as leaf names.
        for leaf in t.iter_leaves():
            leaf.name = leaf.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print(reftree.write(features=["taxid", "sci_name", "ncbi_track"]))
def main(argv):
    """Command-line entry point for querying the NCBI taxonomy database.

    Translates species names and/or taxids, optionally dumps lineage
    information (--info), writes a pruned taxonomy tree to
    ``your_ncbi_query.*.nw`` files (--taxonomy) and/or annotates a
    reference tree (--reftree).

    Args:
        argv: list of command-line arguments (without the program name);
            passed unchanged to ``ArgumentParser.parse_args``.

    Exits with status 0 after printing usage when none of --taxonomy,
    --info or --reftree is given.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)
    parser.add_argument(
        "--db", dest="dbfile", type=str,
        help="""NCBI sqlite3 db file.""")
    parser.add_argument(
        "-t", "--taxid", dest="taxid", nargs="+", type=int,
        help="""taxids (space separated)""")
    parser.add_argument(
        "-tf", "--taxid_file", dest="taxid_file", type=str,
        help="""file containing a list of taxids (one per line)""")
    parser.add_argument(
        "-r", "--reftree", dest="reftree", type=str,
        help="""tree file containing taxids as node names.""")
    parser.add_argument(
        "--reftree_attr", dest="reftree_attr", type=str, default="name",
        help="""Where taxid should be read from""")
    parser.add_argument(
        "-n", "--name", dest="names", nargs="+", type=str,
        help="""species or taxa names (comma separated)""")
    # The help text used to say "taxids"; this file is read as names.
    parser.add_argument(
        "-nf", "--names_file", dest="names_file", type=str,
        help="""file containing a list of names (one per line)""")
    parser.add_argument(
        "-x", "--taxonomy", dest="taxonomy", action="store_true",
        help=("returns a pruned version of the NCBI taxonomy"
              " tree containing target species"))
    parser.add_argument(
        "--show_tree", dest="show_tree", action="store_true",
        help="""shows the NCBI taxonomy tree of the provided species""")
    parser.add_argument(
        "--collapse_subspecies", dest="collapse_subspecies",
        action="store_true",
        help=("When used, all nodes under the the species rank"
              " are collapsed, so all species and subspecies"
              " are seen as sister nodes"))
    parser.add_argument(
        "--rank_limit", dest="rank_limit", type=str,
        help=("When used, all nodes under the provided rank"
              " are discarded"))
    parser.add_argument(
        "--full_lineage", dest="full_lineage", action="store_true",
        help=("When used, topology is not pruned to avoid "
              " one-child-nodes, so the complete lineage"
              " track leading from root to tips is kept."))
    parser.add_argument(
        "-i", "--info", dest="info", action="store_true",
        help="""shows NCBI information about the species""")
    parser.add_argument(
        "--fuzzy", dest="fuzzy", type=float,
        help=("Tries a fuzzy (and SLOW) search for those"
              " species names that could not be translated"
              " into taxids. A float number must be provided"
              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    if not args.taxonomy and not args.info and not args.reftree:
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set()
    all_taxids = []

    if args.names_file:
        # splitlines() copes with \n, \r\n and \r endings, so the "U" open
        # mode (removed in recent Python 3) is not needed.
        with open(args.names_file) as handle:
            all_names.update(
                line.strip() for line in handle.read().splitlines())
    if args.names:
        all_names.update(
            name.strip() for name in " ".join(args.names).split(","))
    all_names.discard("")

    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            # Load the Levenshtein extension once, not once per name. The
            # path is anchored on MODULE_PATH like the connect() call
            # above (the previous lowercase `module_path` is not a name
            # used anywhere else in this function) and is bound as a
            # parameter instead of being formatted into the SQL text.
            c.enable_load_extension(True)
            c.execute(
                "select load_extension(?)",
                (os.path.join(MODULE_PATH,
                              "SQLite-Levenshtein/levenshtein.sqlext"),))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print("\t".join(
                map(str, [score, name, realname.capitalize(), taxid])))

    if args.taxid_file:
        with open(args.taxid_file) as handle:
            all_taxids.extend(
                line.strip() for line in handle.read().splitlines())
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            set(getattr(leaf, args.reftree_attr)
                for leaf in reftree.iter_leaves()))

    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" % len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print("\t".join(map(str, [
                merge_conversion.get(int(taxid), taxid),
                name, named_lineage, lineage])))

        # Compare as strings: ids read from files are str, -t ids are int.
        missing = set(map(str, all_taxids)) - set(map(str, translator))
        for notfound in missing:
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" % len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse()
                             if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # Copy the species node so that it becomes a sister of
                    # everything that used to hang below it.
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        # The message now matches the file names written just below.
        print("\n\n ===== Newick files saved as 'your_ncbi_query.*' ===== ")
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9,
                features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"],
                outfile="your_ncbi_query.extended.nw")
        # The last dump uses bare taxids as leaf names.
        for leaf in t.iter_leaves():
            leaf.name = leaf.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print(reftree.write(features=["taxid", "sci_name", "ncbi_track"]))
def main(argv):
    """Command-line entry point: export a newick gene tree as orthoXML.

    Loads the tree, optionally re-roots it, optionally runs speciation /
    duplication detection, prints the requested previews and finally calls
    ``export_as_orthoXML``.

    Args:
        argv: list of command-line arguments (without the program name);
            passed to ``ArgumentParser.parse_args``. ``None`` makes
            argparse fall back to ``sys.argv[1:]``.
    """
    # These settings have to be stored at module level; plain assignments
    # would only create locals that are never read.
    # NOTE(review): presumably extract_spname reads both names from module
    # scope -- confirm against its definition.
    global SPECIES_NAME_POS, SPECIES_NAME_DELIMITER

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    parser.add_argument(
        "--sp_delimiter", dest="species_delimiter", type=str, default="_",
        help=("When species names are guessed from node names,"
              " this argument specifies how to split node name to guess"
              " the species code"))
    parser.add_argument(
        "--sp_field", dest="species_field", type=int, default=1,
        help=("When species names are guessed from node names,"
              " this argument specifies the position of the species"
              " name code relative to the name splitting delimiter"))
    parser.add_argument(
        "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    parser.add_argument(
        "--skip_ortholog_detection", dest="skip_ortholog_detection",
        action="store_true",
        help=("Skip automatic detection of"
              " speciation and duplication events, thus relying in the"
              " correct annotation of the provided tree using"
              " the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"))
    parser.add_argument(
        "--evoltype_attr", dest="evoltype_attr", type=str,
        default="evoltype",
        help=("When orthology detection is disabled,"
              " the attribute name provided here will be expected to exist"
              " in all internal nodes and read from the extended newick format"))
    parser.add_argument(
        "--database", dest="database", type=str, default="",
        help=("Database name"))
    # store_true flags default to False on their own; the former
    # default="" was only ever used for its falsiness.
    parser.add_argument(
        "--show", dest="show", action="store_true",
        help=("Show the tree and its evolutionary events before orthoXML export"))
    parser.add_argument(
        "--ascii", dest="ascii", action="store_true",
        help=("Show the tree using ASCII representation and all its evolutionary"
              " events before orthoXML export"))
    parser.add_argument(
        "--newick", dest="newick", action="store_true",
        help=("print the extended newick format for provided tree using"
              " ASCII representation and all its evolutionary events"
              " before orthoXML export"))

    # Honour the argv handed in by the caller; it used to be ignored in
    # favour of sys.argv.
    args = parser.parse_args(argv)
    newick = args.tree[0]

    SPECIES_NAME_POS = args.species_field
    SPECIES_NAME_DELIMITER = args.species_delimiter

    # load a phylomeDB Tree provided as a newick file in the command line
    t = PhyloTree(newick, sp_naming_function=extract_spname)

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    if not args.skip_ortholog_detection:
        # detect speciation and duplication events using the species
        # overlap algorithm used in phylomeDB
        t.get_descendant_evol_events()

    if args.ascii:
        print(t.get_ascii(attributes=[args.evoltype_attr, "name"],
                          show_internal=True))

    if args.newick:
        print(t.write(features=[args.evoltype_attr], format_root_node=True))

    if args.show:
        t.show()

    export_as_orthoXML(t, args.database, args.evoltype_attr)
def main(argv):
    """Command-line entry point: export a newick gene tree as orthoXML.

    Loads the tree, optionally re-roots it, optionally runs speciation /
    duplication detection, prints the requested previews and finally calls
    ``export_as_orthoXML``.

    Args:
        argv: list of command-line arguments (without the program name);
            passed to ``ArgumentParser.parse_args``. ``None`` makes
            argparse fall back to ``sys.argv[1:]``.
    """
    # These settings have to be stored at module level; plain assignments
    # would only create locals that are never read.
    # NOTE(review): presumably extract_spname reads both names from module
    # scope -- confirm against its definition.
    global SPECIES_NAME_POS, SPECIES_NAME_DELIMITER

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')
    parser.add_argument(
        "--sp_delimiter", dest="species_delimiter", type=str, default="_",
        help=("When species names are guessed from node names,"
              " this argument specifies how to split node name to guess"
              " the species code"))
    parser.add_argument(
        "--sp_field", dest="species_field", type=int, default=1,
        help=("When species names are guessed from node names,"
              " this argument specifies the position of the species"
              " name code relative to the name splitting delimiter"))
    parser.add_argument(
        "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")
    parser.add_argument(
        "--skip_ortholog_detection", dest="skip_ortholog_detection",
        action="store_true",
        help=("Skip automatic detection of"
              " speciation and duplication events, thus relying in the"
              " correct annotation of the provided tree using"
              " the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"))
    parser.add_argument(
        "--evoltype_attr", dest="evoltype_attr", type=str,
        default="evoltype",
        help=("When orthology detection is disabled,"
              " the attribute name provided here will be expected to exist"
              " in all internal nodes and read from the extended newick format"))
    parser.add_argument(
        "--database", dest="database", type=str, default="",
        help=("Database name"))
    # store_true flags default to False on their own; the former
    # default="" was only ever used for its falsiness.
    parser.add_argument(
        "--show", dest="show", action="store_true",
        help=("Show the tree and its evolutionary events before orthoXML export"))
    parser.add_argument(
        "--ascii", dest="ascii", action="store_true",
        help=("Show the tree using ASCII representation and all its evolutionary"
              " events before orthoXML export"))
    parser.add_argument(
        "--newick", dest="newick", action="store_true",
        help=("print the extended newick format for provided tree using"
              " ASCII representation and all its evolutionary events"
              " before orthoXML export"))

    # Honour the argv handed in by the caller; it used to be ignored in
    # favour of sys.argv.
    args = parser.parse_args(argv)
    newick = args.tree[0]

    SPECIES_NAME_POS = args.species_field
    SPECIES_NAME_DELIMITER = args.species_delimiter

    # load a phylomeDB Tree provided as a newick file in the command line
    t = PhyloTree(newick, sp_naming_function=extract_spname)

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    if not args.skip_ortholog_detection:
        # detect speciation and duplication events using the species
        # overlap algorithm used in phylomeDB
        t.get_descendant_evol_events()

    if args.ascii:
        print(t.get_ascii(attributes=[args.evoltype_attr, "name"],
                          show_internal=True))

    if args.newick:
        print(t.write(features=[args.evoltype_attr], format_root_node=True))

    if args.show:
        t.show()

    export_as_orthoXML(t, args.database, args.evoltype_attr)