def test_getRankedDescendants(self):
    """getRankedDescendants should return exactly the nodes of the given rank"""
    # One entry per taxon, in the pipe-delimited layout handed to
    # NcbiTaxonomyFromFiles as the nodes input. The leading fields are
    # taxon id, parent id and rank.
    node_rows = [
        '3\t|\t3\t|\tsuperkingdom\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '11\t|\t3\t|\tkingdom\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '22\t|\t11\t|\tclass\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '44\t|\t22\t|\torder\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '66\t|\t22\t|\torder\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t|\t',
        '77\t|\t66\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '99\t|\t66\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '88\t|\t44\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '101\t|\t77\t|\tgenus\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '202\t|\t77\t|\tgenus\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '606\t|\t99\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t|\t',
        '707\t|\t88\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '909\t|\t88\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '123\t|\t909\t|\tgroup\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '1111\t|\t123\t|\tspecies\t|\tAT\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '2222\t|\t707\t|\tspecies\t|\tTT\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
        '6666\t|\t606\t|\tspecies\t|\tGG\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t|\t',
        '7777\t|\t606\t|\tspecies\t|\tAC\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '9999\t|\t202\t|\tspecies\t|\tBA\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '1010\t|\t101\t|\tspecies\t|\tAC\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '5555\t|\t555\t|\tspecies\t|\tAC\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|',
        '555\t|\t3\t|\tsuperclass\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|',
    ]

    # (taxon id, scientific name) pairs; each becomes one names record.
    id_name_pairs = [
        (3, 'a'), (11, 'b'), (555, 'c'), (22, 'd'), (44, 'e'), (66, 'f'),
        (88, 'g'), (77, 'h'), (99, 'i'), (707, 'j'), (909, 'k'), (101, 'l'),
        (202, 'm'), (606, 'n'), (2222, 'o'), (123, 'p'), (1111, 'q'),
        (1010, 'r'), (9999, 's'), (7777, 't'), (6666, 'u'), (5555, 'z'),
    ]
    name_rows = ['%d|%s||scientific name|' % pair for pair in id_name_pairs]

    taxonomy = NcbiTaxonomyFromFiles(node_rows, name_rows)

    # Exactly one superclass (555) sits below the root node 3.
    superclasses = taxonomy[3].getRankedDescendants('superclass')
    self.assertEqual(len(superclasses), 1)
    assert superclasses[0] is taxonomy[555]

    # Lookup by name ('f' is taxon 66): species are gathered from every
    # level below it, in no particular order.
    species_under_f = taxonomy['f'].getRankedDescendants('species')
    self.assertSameItems(
        species_under_f,
        [taxonomy[1010], taxonomy[9999], taxonomy[7777], taxonomy[6666]])

    # Node 11 has no superclass below it, so the result is an empty list.
    no_match = taxonomy[11].getRankedDescendants('superclass')
    self.assertEqual(no_match, [])

    groups = taxonomy[3].getRankedDescendants('group')
    self.assertEqual(groups, [taxonomy[123]])

    # Id-based and name-based lookups must yield the very same node object.
    assert taxonomy[3] is taxonomy['a']
def test_init_bad(self):
    """NcbiTaxonomyFromFiles should collect deadbeats when not strict"""
    taxonomy = NcbiTaxonomyFromFiles(bad_nodes, good_names)

    # Two entries are expected in Deadbeats, keyed by 777 and 666.
    self.assertEqual(len(taxonomy.Deadbeats), 2)
    for deadbeat_key in (777, 666):
        assert deadbeat_key in taxonomy.Deadbeats

    # The entry stored under 777 must compare equal to node 9.
    assert taxonomy.Deadbeats[777] == taxonomy[9]
def main():
    """Run the whole workflow, driven by the module-level ``args`` object.

    Steps, in order:
      1. Resolve input and output paths and store them on ``args``.
      2. Turn ``args.output_ranks`` into a list; both ``_`` and ``,`` act
         as separators.
      3. Read the accessions from the input file and load the NCBI
         taxonomy from the nodes/names dump files.
      4. Look up the nodes for each accession, passing along the merged
         and deleted taxids.
      5. Build the per-taxid taxonomy, write the output files and append
         a completion timestamp to the log file.
    """
    (args.infile_nodesdmp_path, args.infile_namesdmp_path,
     args.infile_mergeddmp_path, args.infile_delnodesdmp_path) = input_files()
    args.outfile_path, args.logfile_path = output_files()
    args.output_ranks = args.output_ranks.replace('_', ',').split(',')

    accessions_in_input_file = get_accessions_from_input_file()

    # Open the dump files in ``with`` blocks so the handles are released
    # as soon as the taxonomy has been built.
    # NOTE(review): relies on NcbiTaxonomyFromFiles reading both files
    # completely during construction (PyCogent's implementation does).
    with open(args.infile_nodesdmp_path) as nodes_file:
        with open(args.infile_namesdmp_path) as names_file:
            ncbi_full_taxonomy = NcbiTaxonomyFromFiles(nodes_file, names_file)

    merged_taxids = get_merged_nodes()
    deleted_taxids = get_deleted_nodes()

    included_nodes, taxid, missing_accessions = obtain_nodes_for_each_accession(
        accessions_in_input_file, args.infile_acc2taxid_path,
        ncbi_full_taxonomy, merged_taxids, deleted_taxids)

    taxid_taxonomy, missing_taxonomy = generate_taxonid_taxonomy(
        included_nodes, ncbi_full_taxonomy, args.output_ranks)

    generate_output_files(args.outfile_path, taxid_taxonomy, taxid,
                          missing_accessions, missing_taxonomy)

    # Append, never truncate: the log may already hold earlier entries.
    with open(args.logfile_path, 'a') as log_file:
        log_file.write('Finished running entrez_qiime.py at '
                       + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
def get_tax_local(taxid_dict, ranks):
    """Fetch lineages from the local NCBI taxonomy dump (better for huge queries).

    Parameters:
        taxid_dict: mapping of GI -> taxon id. Each taxon id is looked up
            in ``tree.ById`` as-is.
        ranks: sequence of rank names that define the lineage columns.

    Returns:
        dict mapping each GI to a list with one slot per entry of
        ``ranks`` (same order). A slot holds the name of the ancestor at
        that rank, or None when no ancestor with that rank was found.
        GIs whose taxon id is missing from the tree are reported on
        stdout and left out of the result.
    """
    from cogent.parse.ncbi_taxonomy import NcbiTaxonomyFromFiles

    # Close both dump files as soon as the tree is built.
    # NOTE(review): relies on NcbiTaxonomyFromFiles consuming the files
    # eagerly during construction (PyCogent's implementation does).
    with open(DB_PATH + 'nodes.dmp') as nodes_file:
        with open(DB_PATH + 'names.dmp') as names_file:
            tree = NcbiTaxonomyFromFiles(nodes_file, names_file)

    # rank name -> column index; built once instead of once per GI.
    rank_index = dict((rank, idx) for idx, rank in enumerate(ranks))

    def get_lineage(node):
        """Walk from node towards the root, filling the requested ranks."""
        lineage = [None] * len(ranks)
        curr = node
        # Stops at the node whose Parent is None, so that top node itself
        # is never recorded.
        while curr.Parent is not None:
            if curr.Rank in rank_index:
                lineage[rank_index[curr.Rank]] = curr.Name
            curr = curr.Parent
        return lineage

    tax_dict = {}
    for gi in taxid_dict:
        # Get lineage for each GI (i.e. taxid)
        try:
            node = tree.ById[taxid_dict[gi]]
        except KeyError:
            print("Cannot Fetch taxonomy for GI: " + gi)
            continue
        tax_dict[gi] = get_lineage(node)

    print("DONE: Complete taxonomy retrived for each GI")
    return tax_dict
def load_ncbi_tree(ncbi_nodes_file, ncbi_names_file):
    """Load the NCBI taxonomy tree from a nodes file and a names file.

    Parameters:
        ncbi_nodes_file: path to the nodes dump file.
        ncbi_names_file: path to the names dump file.

    Returns:
        The object returned by NcbiTaxonomyFromFiles.

    Exits the program (via ``sys.exit``) with an error message when an
    IOError occurs while opening or reading either file.
    """
    try:
        # Context managers make sure both handles are closed, including
        # the first one when opening the second fails.
        with open(ncbi_nodes_file, "rt") as nodes_handle:
            with open(ncbi_names_file, "rt") as names_handle:
                ncbi_tree = NcbiTaxonomyFromFiles(nodes_handle, names_handle)
    except IOError:
        sys.exit("Error cannot open {0} or {1}".format(ncbi_nodes_file,
                                                        ncbi_names_file))
    return ncbi_tree
def test_init_strict(self):
    """Strict mode should raise MissingParentError only for the bad nodes"""
    # The good_nodes/good_names fixtures must build without raising even
    # when strict is on; the result itself is not inspected.
    NcbiTaxonomyFromFiles(good_nodes, good_names, strict=True)

    # The same call with bad_nodes has to raise.
    self.assertRaises(
        MissingParentError,
        NcbiTaxonomyFromFiles, bad_nodes, good_names, strict=True)
def setUp(self):
    """Store a taxonomy built from good_nodes/good_names on ``self.tx``."""
    taxonomy = NcbiTaxonomyFromFiles(good_nodes, good_names)
    self.tx = taxonomy
#!/usr/bin/python2
import sys
import re
## RUN FROM INFO115
from cogent.parse.ncbi_taxonomy import NcbiTaxonomyFromFiles

# Build the taxonomy tree from the local NCBI dump files. The files are
# opened in ``with`` blocks so both handles are closed once the tree exists.
# NOTE(review): relies on NcbiTaxonomyFromFiles reading both files
# completely during construction (PyCogent's implementation does).
with open('/2/scratch/brianne/taxonomy/nodes.dmp') as nodes_file:
    with open('/2/scratch/brianne/taxonomy/names.dmp') as names_file:
        tree = NcbiTaxonomyFromFiles(nodes_file, names_file)
root = tree.Root

# Here is an example of how to get the NCBI taxon IDs for all Eukaryota phyla
# euks = root.getNodeMatchingName('Eukaryota')
# for node in euks.getRankedDescendants('phylum'):
#     print node.Name, node.TaxonId
#
# Chlorophyta 3041
# Streptophyta 35493
# Chytridiomycota 4761
# Microsporidia 6029
# Glomeromycota 214504
# Neocallimastigomycota 451455
# Blastocladiomycota 451459
# Ascomycota 4890
#
# etc

# Ranks used to get the full lineage based on an input Taxon id
ranks = [
    'superkingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]
NCBI_DB + '\" | tee -a blast_taxid.txt', shell=True) # New code ends map_dict = {} with open('blast_taxid.txt') as gi_taxid: for each in gi_taxid: each = each.split() gi = each[0] taxid = each[1] map_dict[gi] = taxid print 'reading in files for ' + SAMPLE_ID # tree = NcbiTaxonomyFromFiles(open('/home/roli/scripts/Python/GI_annotations/nodes.dmp'), open('/home/roli/scripts/Python/GI_annotations/names.dmp')) tree = NcbiTaxonomyFromFiles(open('nodes.dmp'), open('names.dmp')) root = tree.Root print 'generating lineages for ' + SAMPLE_ID error = open(SAMPLE_ID + '.no.lineage.found.out', 'w') error.write(header2 + "\n") if re.search("1|superkingdom", TAX_LEVEL): output_kingdom = open(SAMPLE_ID + '.superkingdom.blast_lineage.out', 'w') output_kingdom.write(header + "\n") if re.search("2|phylum", TAX_LEVEL): output_phylum = open(SAMPLE_ID + '.phylum.blast_lineage.out', 'w') output_phylum.write(header + "\n")
ranks= [a] ## Define function for pulling lineage info from NCBI nodes and names files def get_lineage(node, my_ranks): ranks_lookup = dict([(r,idx) for idx,r in enumerate(my_ranks)]) lineage = [None] * len(my_ranks) curr = node while curr.Parent is not None: if curr.Rank in ranks_lookup: lineage[ranks_lookup[curr.Rank]] = curr.Name curr = curr.Parent return lineage ## IMPORT DIR NAMES into DICTIONARY tree = NcbiTaxonomyFromFiles(open('/home/roli/db/nodes.dmp'), open('/home/roli/db/names.dmp')) root = tree.Root output = open("LINEAGES.tsv", "w") with open(FILE_LIST) as f: for line in f: line = line.strip("\r\n") line = line.split("\t") NAME = line[0] TaxID = int(line[1]) node = tree.ById[TaxID] tax = get_lineage(node, ranks) tax = str(tax[0]).lower()