def test_ncbiquery(self):
    """Check taxid/name translation and descendant queries of NCBITaxa.

    Runs against the database file at DATABASE_PATH and verifies:
    * taxid -> scientific name translation for taxids 9606 and 7507,
    * name -> taxid translation, where every name maps to a list of taxids
      and the result is keyed by the name exactly as it was queried,
    * get_descendant_taxa() for taxid 9605 with and without intermediate
      nodes, and with a "species" rank limit.
    """
    ncbi = NCBITaxa(dbfile=DATABASE_PATH)

    # Taxids are passed as strings, but the result is indexed by int below.
    id2name = ncbi.get_taxid_translator(['9606', '7507'])
    self.assertEqual(id2name[7507], 'Mantis religiosa')
    # The species name had been mangled to 'H**o sapiens', which can never
    # equal the scientific name stored for taxid 9606.
    self.assertEqual(id2name[9606], 'Homo sapiens')

    # The second query is deliberately lower-case; the assertion looks the
    # result up under that same lower-case spelling.
    name2id = ncbi.get_name_translator(['Mantis religiosa', 'homo sapiens'])
    self.assertEqual(name2id['Mantis religiosa'], [7507])
    self.assertEqual(name2id['homo sapiens'], [9606])

    # A single name may translate into several taxids.
    name2id = ncbi.get_name_translator(['Bacteria'])
    self.assertEqual(set(name2id['Bacteria']), {2, 629395})

    # Result order is not asserted, hence the set comparisons.
    out = ncbi.get_descendant_taxa("9605", intermediate_nodes=True)
    self.assertEqual(set(out), {1425170, 741158, 63221, 9606})

    out = ncbi.get_descendant_taxa("9605", intermediate_nodes=False)
    self.assertEqual(set(out), {1425170, 741158, 63221})

    out = ncbi.get_descendant_taxa("9605", intermediate_nodes=False,
                                   rank_limit="species")
    self.assertEqual(set(out), {9606, 1425170})
def test_ncbiquery(self):
    """Check taxid <-> name translation of NCBITaxa for two known taxa.

    Runs against the database file at DATABASE_PATH.

    NOTE(review): the name -> taxid assertions expect a single taxid per
    name (not a list). That only holds for NCBITaxa versions whose
    get_name_translator() returns scalar taxids -- confirm against the
    installed version.
    """
    ncbi = NCBITaxa(dbfile=DATABASE_PATH)

    # Taxids are passed as strings, but the result is indexed by int below.
    id2name = ncbi.get_taxid_translator(["9606", "7507"])
    self.assertEqual(id2name[7507], "Mantis religiosa")
    # The species name had been mangled to "H**o sapiens", which can never
    # equal the scientific name stored for taxid 9606.
    self.assertEqual(id2name[9606], "Homo sapiens")

    # The second query is deliberately lower-case; the assertion looks the
    # result up under that same lower-case spelling.
    name2id = ncbi.get_name_translator(["Mantis religiosa", "homo sapiens"])
    self.assertEqual(name2id["Mantis religiosa"], 7507)
    self.assertEqual(name2id["homo sapiens"], 9606)
def getNcbiTaxonomy():
    """Write the named NCBI lineage of every organism in ORGANISM_NAMES_LIST.

    Each organism name is translated into its taxids; for every taxid one
    line is written to OUTPUT_FILE holding the tab-separated names of the
    taxa along its lineage, in the order get_lineage() returns them.
    Organisms are processed in the order of ORGANISM_NAMES_LIST.

    Raises:
        KeyError: if the mapping returned by get_name_translator() has no
            entry for one of the organism names.
    """
    taxonomy = NCBITaxa()
    taxids_by_name = taxonomy.get_name_translator(ORGANISM_NAMES_LIST)

    with open(OUTPUT_FILE, "w") as sink:
        for organism in ORGANISM_NAMES_LIST:
            for taxid in taxids_by_name[organism]:
                lineage = taxonomy.get_lineage(str(taxid))
                labels = taxonomy.get_taxid_translator(lineage)
                row = "\t".join(labels[node] for node in lineage)
                sink.write(row + "\n")
from pandas import DataFrame
from Bio import SeqIO
from pandas import Index
from ete2 import NCBITaxa

# Input locations and the prefix used to build per-genome identifiers.
data_path = "/home/moritz/people/MoreData/genomes/img_od1s"
img_fasta = "/home/moritz/people/MoreData/raw_imgs/od1s.fasta"
img_xls = "/home/moritz/people/MoreData/raw_imgs/od1s.xls"
name = "parcu_from_img_"

taxDb = NCBITaxa()

# The ".xls" file is read as tab-separated text: header on the first row,
# first column used as the index.
contigs = DataFrame.from_csv(img_xls, sep="\t", header=0, index_col=0)

# Every genome gets the same taxid: the first taxid of the first entry
# returned for 'Candidatus Parcubacteria'.
manual_taxo = taxDb.get_name_translator(
    ['Candidatus Parcubacteria']
).values()[0][0]

# One metadata record per distinct 'Genome ID', keyed by prefix + id.
# 'long_name' is the 'Genome' value of the first row with that id.
metadata = dict(
    (
        name + str(g),
        {
            'IMG_ID': g,
            'name': name + str(g),
            'species_taxid': manual_taxo,
            'long_name':
                contigs.loc[contigs['Genome ID'] == g]['Genome'].iloc[0],
        },
    )
    for g in set(contigs['Genome ID'])
)

# Sequence records collected per genome key.
seq_dict = dict((k, []) for k in metadata)

with open(img_fasta, "r") as file:
    # The i-th FASTA record is assigned to the genome of the i-th table row,
    # so both inputs must list the contigs in the same order.
    for i, c in enumerate(SeqIO.parse(file, "fasta")):
        seq_dict[name + str(contigs.iloc[i]['Genome ID'])].append(c)
def run(args):
    """Resolve the search terms to NCBI taxids and dump a tree or info table.

    Attributes read from ``args``:
        search: iterable of query strings; a term that parses as an int is
            used as a taxid, anything else is treated as a taxon name.
        fuzzy: when truthy, passed to get_fuzzy_name_translation() for the
            names that had no exact translation.
        taxonomy / info: output mode. ``taxonomy`` is switched on when
            neither is set (``args`` is modified in place).
        full_lineage, rank_limit, collapse_subspecies: forwarded to
            get_topology() in taxonomy mode.

    In taxonomy mode the annotated topology is handed to ``dump``; in info
    mode a tab-separated table is printed to stdout.

    Exits the process with status -1 when no search terms are given.
    """
    from ete2 import NCBITaxa

    # Dump the taxonomy by default when no output mode was requested.
    if not args.taxonomy and not args.info:
        args.taxonomy = True

    # Validate the query before the taxonomy database is opened.
    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    ncbi = NCBITaxa()

    # all_taxids is a dict used as a set of taxids (values are always None).
    all_taxids = {}
    all_names = set()
    for term in args.search:
        try:
            all_taxids[int(term)] = None
        except ValueError:
            all_names.add(term.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    # NOTE(review): each translated value is used directly as a dict key, so
    # this assumes one hashable taxid per name -- confirm against the
    # installed ete2 version.
    for taxid in name2tax.values():
        all_taxids[taxid] = None

    # Bookkeeping for fuzzy matches. These dicts were previously assigned to
    # without ever being created, so the first successful fuzzy match raised
    # NameError.
    name2realname = {}
    name2score = {}

    not_found_names = all_names - set(name2tax)
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if args.taxonomy:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        t = ncbi.get_topology(list(all_taxids.keys()),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is
            # replaced by the "<scientific name> - <taxid>" label.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=[
            "taxid", "name", "rank", "bgcolor", "sci_name",
            "collapse_subspecies", "named_lineage"
        ])
    elif args.info:
        # Single-argument print(...) behaves the same as the print statement.
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        # Only taxids present in the translator get a row.
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name, ranks.get(taxid, ''), named_lineage,
                lineage_string
            ]))
def run(args):
    """Resolve the search terms to NCBI taxids and dump the requested view.

    Attributes read from ``args``:
        search: iterable of query strings; a term that parses as an int is
            used as a taxid, anything else is treated as a taxon name.
        fuzzy: when truthy, passed to get_fuzzy_name_translation() for the
            names that had no exact translation.
        tree / descendants / info: output mode, checked in that order.
            ``tree`` is switched on when none is set (``args`` is modified
            in place).
        full_lineage, rank_limit, collapse_subspecies: forwarded to the
            topology / descendant queries.

    In tree mode the annotated tree is handed to ``dump``; the other modes
    print a tab-separated table to stdout.

    Exits the process with status -1 when no search terms are given.
    """
    from ete2 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    # Validate the query before the taxonomy database is opened.
    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    ncbi = NCBITaxa()

    # all_taxids is a dict used as a set of taxids (values are always None).
    all_taxids = {}
    all_names = set()
    for term in args.search:
        try:
            all_taxids[int(term)] = None
        except ValueError:
            all_names.add(term.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    # NOTE(review): each translated value is used directly as a dict key, so
    # this assumes one hashable taxid per name -- confirm against the
    # installed ete2 version.
    for taxid in name2tax.values():
        all_taxids[taxid] = None

    # Bookkeeping for fuzzy matches. These dicts were previously assigned to
    # without ever being created, so the first successful fuzzy match raised
    # NameError.
    name2realname = {}
    name2score = {}

    not_found_names = all_names - set(name2tax)
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim
        # Names rescued by the fuzzy search are no longer missing, so they
        # must not appear in the warning below.
        not_found_names = all_names - set(name2tax)

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!"
                 % ','.join(sorted(not_found_names)))

    if args.tree:
        if len(all_taxids) == 1:
            # A single query dumps the tree of its descendants.
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(
                target_taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit,
                return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..."
                     % (len(all_taxids)))
            t = ncbi.get_topology(
                list(all_taxids.keys()),
                intermediate_nodes=args.full_lineage,
                rank_limit=args.rank_limit,
                collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is
            # replaced by the "<scientific name> - <taxid>" label.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])

    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        # Single-argument print(...) behaves the same as the print statement.
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank",
                                "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(
                taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit)
            print('\t'.join([
                str(taxid),
                # Fall back to the taxid as text: str.join() rejects the
                # bare int that was used as the default before.
                translator.get(taxid, str(taxid)),
                ranks.get(taxid, ''),
                '|'.join(map(str, descendants)),
                '|'.join(map(str, ncbi.translate_to_names(descendants))),
            ]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank",
                                "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        # Only taxids present in the translator get a row.
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))
from ete2 import NCBITaxa
from ete2 import Tree, TreeStyle, AttrFace

ncbi = NCBITaxa()

# One query name per line. The file is now closed deterministically instead
# of being left to the garbage collector.
# NOTE: the list keeps its original name ``input`` (which shadows the
# builtin) so that any code relying on this module-level name still works.
with open("db/example_input", "r") as handle:
    input = [l.rstrip("\n") for l in handle]

taxid = ncbi.get_name_translator(input)
tree = ncbi.get_topology(taxid.values())
#print tree.get_ascii(attributes=["sci_name", "rank", "taxid"])


# custom layout: adds "rank" on top of branches, and sci_name as tip names
def my_layout(node):
    """Attach the faces drawn for ``node`` when the tree is rendered.

    Nodes with a truthy ``rank`` attribute get it shown above their branch;
    leaves additionally get their ``sci_name`` shown to the right.
    """
    if getattr(node, "rank", None):
        rank_face = AttrFace("rank", fsize=7, fgcolor="indianred")
        node.add_face(rank_face, column=0, position="branch-top")
    if node.is_leaf():
        sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(sciname_face, column=0, position="branch-right")


ts = TreeStyle()
ts.layout_fn = my_layout
# Default leaf names are hidden because my_layout draws sci_name instead.
ts.show_leaf_name = False

tree.render("tree.pdf", tree_style=ts)