def get_taxonomy(species_name, name_format="Genus species", ranks=None, update_db=False):
    """Look up the NCBI lineage of a species by its scientific name.

    Parameters
    ----------
    species_name : object
        Species name; converted with ``str()`` before the lookup.
    name_format : str
        ``"Genus_species"`` replaces underscores with spaces before the
        lookup; any other value (including the default ``"Genus species"``)
        uses the name unchanged.
    ranks : sequence of str, optional
        Rank names to extract from the lineage.
    update_db : bool
        When true, refresh the local taxonomy database first.

    Returns
    -------
    list or dict
        * name not found and ``ranks`` is None: ``['unknown']``
        * name not found and ``ranks`` given: ``['unknown'] * len(ranks)``
        * name found and ``ranks`` is None: taxid -> name dict of the lineage
        * name found and ``ranks`` given: names of the lineage nodes whose
          rank matches, following the order of ``ranks``. A rank that does not
          occur in the lineage contributes nothing, so the list can be shorter
          than ``ranks``.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        if ranks is None:
            return ['unknown']
        return ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    # ``names`` already covers every taxid of the lineage, so no further
    # database round-trip is needed per matched rank.
    return [
        names[rk_id]
        for rk in ranks
        for rk_id, rk_rk in lineage_rk.items()
        if rk_rk == rk
    ]
def get_tax(cma_file):
    """Find the 'lowest common ancestor' of species represented in a cma file.

    Header lines (starting with ``>``) are scanned for an organism name in
    square brackets; each name is translated to a taxid. Names that do not
    translate, and taxids 32630 and 10239 (synthetic constructs and viruses),
    are omitted.

    Returns
    -------
    tuple
        ``(lca_name, tax_list, tree_labeled)``: the name of the root of the
        topology built from the collected taxids, the taxid -> name dict for
        those taxids, and an ASCII rendering of the topology.
    """
    ncbi = NCBITaxa()
    org_regex = re.compile(r'\[(.*)\]')
    taxid_list = []
    # The context manager closes the file; the original left it open.
    with open(cma_file, 'r') as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            find_org_name = org_regex.search(line)
            if find_org_name is None:
                continue
            org_name = find_org_name.group(1)
            # The translator returns {name: [taxid, ...]}; the taxid text is
            # cut out of the dict's string form between '[' and ']'.
            taxid = str(ncbi.get_name_translator([org_name]))
            taxid = re.sub(r'^.*\[', '', taxid)
            taxid = re.sub(r'\].*$', '', taxid)
            # '{}' is what remains when the name was not found.
            if taxid != '{}' and taxid != '32630' and taxid != '10239':
                taxid_list.append(taxid)

    tax_list = ncbi.get_taxid_translator(taxid_list)
    tree = ncbi.get_topology(taxid_list)
    tree_labeled = tree.get_ascii(attributes=['sci_name', 'taxid'])

    # The root's name is recovered from the text of the bound method
    # (note: the method is not called), between "node '" and the next quote.
    lca_id = str(tree.get_tree_root)
    lca_id = re.sub(r"^.*node '", '', lca_id)
    lca_id = re.sub(r"'.*$", '', lca_id)

    # Same string-slicing approach to pull the name out of {taxid: 'name'}.
    # NOTE(review): a name containing an apostrophe would not survive this.
    lca_name = str(ncbi.get_taxid_translator([lca_id]))
    lca_name = re.sub(r"'}$", '', lca_name)
    lca_name = re.sub(r"^.*'", '', lca_name)
    return (lca_name, tax_list, tree_labeled)
def main():
    """Make queries against NCBI Taxa databases"""
    args = get_args()
    ncbi = NCBITaxa()

    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Refresh the local database only on explicit request.
    if args.update is True:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # A name on the command line takes the place of the TaxID.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        # Use the supplied name if there is one, otherwise translate the TaxID.
        if args.name:
            shown_name = args.name
        else:
            shown_name = ncbi.get_taxid_translator([args.taxid])[args.taxid]
        tax_dict = {
            'Name': shown_name,
            'TaxID': args.taxid,
            'Rank': ncbi.get_rank([args.taxid]),
            'Lineage': ncbi.get_taxid_translator(ncbi.get_lineage(args.taxid)),
        }
        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script: every taxon below the selected one.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # zip() is lazy under python3; under python2 it builds the full list.
    if args.verbose > 0:
        for name_and_id in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % name_and_id)

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for taxon in descendent_taxa:
                ofh.write(str(taxon) + '\n')
def main():
    """Sum the counts of a tab-separated taxonomy table per taxon name.

    The header row of the input is skipped. Columns 0-6 are read as taxids
    for superkingdom..species (see ``level``) and column 7 as the count.
    For each row the taxid at the requested rank is translated to a name; if
    that cell is 'NA', up to four higher ranks are tried and one '_NA' suffix
    is appended per rank stepped back. Rows with no usable taxid are counted
    under 'NA'. Totals are written to the output file, largest first.
    """
    ncbi = NCBITaxa()
    args = get_args()
    infile = args.infile
    rank = args.classify_order
    out_file = args.outfile
    level = {
        'superkingdom': 0,
        'phylum': 1,
        'class': 2,
        'order': 3,
        'family': 4,
        'genus': 5,
        'species': 6
    }
    h_num = level[rank]
    # Never step back past column 0: a negative index would wrap around to the
    # end of the row and treat the count column as a taxid.
    max_steps = min(4, h_num)

    d_count = {}
    with open(infile, 'r') as in_f:
        reader = csv.reader(in_f, delimiter='\t')
        next(reader)
        for row in reader:
            name = 'NA'
            for steps in range(max_steps + 1):
                cell = row[h_num - steps]
                if cell != 'NA':
                    taxid2name = ncbi.get_taxid_translator([cell])
                    name = taxid2name[int(cell)] + '_NA' * steps
                    break
            d_count[name] = d_count.get(name, 0) + int(row[7])

    with open(out_file, 'w') as o_f:
        for key in sorted(d_count, key=d_count.get, reverse=True):
            print(key, '\t', d_count[key], file=o_f)
def from_taxid(cls, taxid: int) -> "Lineage":
    """
    Build a `Lineage` from an NCBI taxid

    Parameters
    ----------
    taxid : int
        A valid NCBI taxonomy id

    Returns
    -------
    "Lineage"
        Instance of the `Lineage` class, filled field by field in the order of
        ``cls._fields`` and stopping at the first field whose rank is absent
        from the lineage.
    """
    ncbi = NCBITaxa()
    lineage_taxids = ncbi.get_lineage(taxid)
    lineage_names = ncbi.get_taxid_translator(lineage_taxids)

    # Capitalised rank name -> taxid.
    rank_to_taxid = {}
    for node_id, rank_name in ncbi.get_rank(lineage_taxids).items():
        rank_to_taxid[rank_name.capitalize()] = node_id

    # The superkingdom entry, when present, is stored as (and replaces) Kingdom.
    if "Superkingdom" in rank_to_taxid:
        rank_to_taxid["Kingdom"] = rank_to_taxid.pop("Superkingdom")

    taxa: Dict[str, str] = {}
    for field in cls._fields:
        if field not in rank_to_taxid:
            break
        taxa[field] = lineage_names[rank_to_taxid[field]]
    return cls(**taxa)
def getTaxName(taxId):
    """Return the NCBI name for *taxId*, or ``'UNK'`` when the lookup fails."""
    ncbi = NCBITaxa()
    try:
        name = ncbi.get_taxid_translator([taxId])[int(taxId)]
    except Exception:
        # Best-effort lookup; unlike a bare ``except`` this still lets
        # KeyboardInterrupt and SystemExit through.
        name = 'UNK'
    return name
def get_lineage(self,taxid):
    """Return the names of the lineage of *taxid*, in lineage order."""
    taxa_db = NCBITaxa()
    lineage_ids = taxa_db.get_lineage(taxid)
    id_to_name = taxa_db.get_taxid_translator(lineage_ids)
    ordered_names = []
    for node_id in lineage_ids:
        ordered_names.append(id_to_name[node_id])
    return ordered_names
def get_metadata(records: List[SeqRecord]):
    """Build a taxonomy table for *records* and save it as ``metadata.csv``.

    Each record's ``annotations["organism"]`` is translated to a taxid; the
    names at the sought ranks of its lineage plus the record id (column
    ``aid``) form one row. The resulting DataFrame is written to
    ``metadata.csv`` and returned.
    """
    ncbi = NCBITaxa()
    organisms = [rec.annotations["organism"] for rec in records]
    name_translator = ncbi.get_name_translator(organisms)
    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    rows = []
    for rec in records:
        taxid = name_translator[rec.annotations["organism"]][0]
        lineage = ncbi.get_lineage(taxid)
        ranks = ncbi.get_rank(lineage)
        names = ncbi.get_taxid_translator(lineage)

        row = {}
        for node in lineage:
            if ranks[node] in sought_ranks:
                row[ranks[node]] = names[node]
        row["aid"] = rec.id
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv("metadata.csv")
    return df
def get_full_lineage(otus):
    """Write the updated lineage file (full_lineages_updated.tsv).

    Requires the ete3 library.

    Input: list with the keys of the updated_input_dic. The ids 0, 1 and 2
    get hard-coded lineages and are removed from ``otus`` in place.
    Output: generates the file full_lineages_updated.tsv with the columns
    ``OTU`` and ``lineage`` (names joined with ';').
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    lineages = {}
    # These ids are not sent to the lineage lookup (the original comment notes
    # an error if there is a 0).
    for special_id, special_lineage in ((0, ""), (1, "root"),
                                        (2, "root;Bacteria")):
        if special_id in otus:
            lineages[special_id] = special_lineage
            otus.remove(special_id)
    for entrie in otus:
        lineage = ncbi.get_lineage(entrie)  # list of lineage taxids
        names = ncbi.get_taxid_translator(lineage)  # dict taxid (int) -> name
        # Walk the lineage list rather than the dict: the dict's order is not
        # guaranteed to follow the lineage.
        lineages[entrie] = ";".join(
            names[taxid] for taxid in lineage if taxid in names)
    lineages_df = pd.DataFrame(list(lineages.items()),
                               columns=["OTU", "lineage"])
    lineages_df.to_csv("full_lineages_updated.tsv",
                       sep="\t",
                       index=False,
                       header=True)
def get_full_lineages(otus):
    """Write the full lineage file (full_lineages.tsv).

    Requires the ete3 library.

    Input: list of the otus in the table obtained from the get_otus function.
    The ids 0, 1 and 2 get hard-coded lineages and are removed from ``otus``
    in place.
    Output: makes the full_lineages.tsv file with the columns ``OTU`` and
    ``LINEAGE`` (names joined with ';').
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    lineages = {}
    # These ids are not sent to the lineage lookup (the original comment notes
    # an error if there is a 0).
    for special_id, special_lineage in ((0, ""), (1, "root"),
                                        (2, "root;Bacteria")):
        if special_id in otus:
            lineages[special_id] = special_lineage
            otus.remove(special_id)
    for entrie in otus:
        lineage = ncbi.get_lineage(entrie)  # list of lineage taxids
        names = ncbi.get_taxid_translator(lineage)  # dict taxid (int) -> name
        # Walk the lineage list rather than the dict: the dict's order is not
        # guaranteed to follow the lineage.
        lineages[entrie] = ";".join(
            names[taxid] for taxid in lineage if taxid in names)
    lineages_df = pd.DataFrame(list(lineages.items()),
                               columns=["OTU", "LINEAGE"])
    lineages_df.to_csv("full_lineages.tsv",
                       sep="\t",
                       index=False,
                       header=True)
    print("full lineage file created")
# correct_erroneous_repres_of_taxon_instances
#
# For every row returned by protDB.get_erroneous_repres_of_taxon_instances()
# this looks up the NCBI lineage, ranks and names of instance[1], reads field
# [5] of the protein entry instance[2] as ``representative_id``, and then
# issues protDB.update_protein_entry(...) calls that reset or reassign the
# 'representative_of_taxon' / 'represented_by' / 'taxon_name_representative' /
# 'representative_taxon_rank' fields. When the rank of instance[1] equals the
# rank of representative_id, the two child counts from
# get_count_of_children_of_repres decide which of the two is cleared.
#
# NOTE(review): the original indentation was lost when this definition was
# collapsed onto one line. In particular it is not recoverable from here which
# ``if`` the second ``else:`` belongs to, nor which of the following update
# calls sit inside it - restore from version control before editing.
# NOTE(review): ``protDB.get_protein_entry(instance[2])[5]`` is indexed before
# the truthiness check on the same call, so a missing entry would presumably
# fail there first - confirm against get_protein_entry's return contract.
def correct_erroneous_repres_of_taxon_instances(): protDB = db_handling.ProteinDatabase() ncbi = NCBITaxa() erroneous_instances = protDB.get_erroneous_repres_of_taxon_instances() for instance in erroneous_instances: lineage = ncbi.get_lineage(instance[1]) lineage_ranks = ncbi.get_rank(lineage) representative_id = protDB.get_protein_entry(instance[2])[5] lineage_translation = ncbi.get_taxid_translator(lineage) if protDB.get_protein_entry(instance[2]): if representative_id != None: representative_id = protDB.get_protein_entry( instance[2])[5] print('representative_id', representative_id) lineage_representative = ncbi.get_lineage( representative_id) print('lineage_representative', lineage_representative) lineage_representative_ranks = ncbi.get_rank( lineage_representative) print('lineage_representative_ranks', lineage_representative_ranks) if lineage_ranks[ instance[1]] == lineage_representative_ranks[ representative_id]: count_instance = protDB.get_count_of_children_of_repres( instance[1]) count_representative = protDB.get_count_of_children_of_repres( representative_id) if count_representative >= count_instance: protDB.update_protein_entry( {'representative_of_taxon': None}, instance[1]) else: protDB.update_protein_entry_by_repres_by( {'represented_by': instance[1]}, representative_id) protDB.update_protein_entry( {'represented_by': instance[1]}, representative_id) protDB.update_protein_entry( {'representative_of_taxon': None}, representative_id) else: protDB.update_protein_entry({'representative_of_taxon': None}, instance[0]) protDB.update_protein_entry( {'representative_of_taxon': instance[1]}, instance[2]) protDB.update_protein_entry( { 'taxon_name_representative': lineage_translation[instance[1]] }, representative_id) protDB.update_protein_entry( {'representative_taxon_rank': lineage_ranks[instance[1]]}, representative_id) print( '---------------------------------------------------------------------------------' )
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """This function takes the annotation table generated by
    viral_contig_maps.py and generates a table that provides the taxonomic
    lineage of each viral contig, based on the corresponding ViPhOG
    annotations.

    Parameters
    ----------
    annot_df : pandas.DataFrame
        Must provide the columns ``Contig``, ``Best_hit`` and ``Label``.
    ncbi_db : str
        Passed to ``NCBITaxa(dbfile=...)``.
    min_prot : int
        Minimum number of hits carrying a taxon at a rank for that rank to
        be considered.
    prop_annot : float
        Minimum fraction of a contig's rows whose ``Best_hit`` is not
        "No hit"; below it the contig gets four empty fields.
    tax_thres : float
        Minimum share of hits the most frequent taxon must reach.

    Yields
    ------
    list
        ``[contig, ...]`` with one entry per rank in the order genus,
        subfamily, family, order. A rank with too few hits yields ``""``; a
        rank whose best taxon falls below ``tax_thres`` yields that proportion;
        once a rank is accepted, its name and the names of all remaining
        ranks (from the accepted taxon's lineage, ``""`` if absent) follow.
    """
    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]
    contig_set = set(annot_df["Contig"])
    for contig in contig_set:
        contig_lineage = [contig]
        contig_df = annot_df[annot_df["Contig"] == contig]
        total_prot = len(contig_df)
        annot_prot = sum(contig_df["Best_hit"] != "No hit")
        if annot_prot < prop_annot * total_prot:
            contig_lineage.extend([""] * 4)
            yield contig_lineage
            continue

        contig_hits = contig_df[pd.notnull(
            contig_df["Label"])]["Label"].values
        taxid_list = [
            ncbi.get_name_translator([item])[item][0] for item in contig_hits
        ]
        # One {rank: taxid} dict per hit, restricted to the ranks of interest.
        hit_lineages = [{
            rank: taxid
            for taxid, rank in ncbi.get_rank(ncbi.get_lineage(item)).items()
            if rank in tax_rank_order
        } for item in taxid_list]

        for position, rank in enumerate(tax_rank_order):
            assigned = [
                taxon for taxon in (item.get(rank) for item in hit_lineages)
                if pd.notnull(taxon)
            ]
            total_hits = len(assigned)
            if total_hits < min_prot:
                contig_lineage.append("")
                continue

            # Most frequent taxon; ties go to the one seen first.
            best_taxid, best_count = Counter(assigned).most_common(1)[0]
            prop_hits = best_count / total_hits
            if prop_hits < tax_thres:
                contig_lineage.append(prop_hits)
                continue

            best_lineage = ncbi.get_lineage(best_taxid)
            # Built once instead of once per remaining rank.
            rank_to_taxid = {
                lineage_rank: taxid
                for taxid, lineage_rank in ncbi.get_rank(best_lineage).items()
            }
            for remaining_rank in tax_rank_order[position:]:
                key = rank_to_taxid.get(remaining_rank)
                if pd.notnull(key):
                    contig_lineage.append(
                        ncbi.get_taxid_translator([key])[key])
                else:
                    contig_lineage.append("")
            break
        yield contig_lineage
def ellect_taxon_rank_representatives_step(self, taxon_rank_id, taxon_rank):
    """Elect one representative protein for the given taxon.

    The pairs returned by ``protDB.get_comparisons_same_taxon_id`` are turned
    into a symmetric adjacency map; the entry with the most partners is
    chosen, all its partners are marked as ``represented_by`` it, and the
    chosen entry is flagged as representative of the taxon.

    Returns True when a representative was elected, False when there were no
    pairs to work with.
    """
    protDB = db_handling.ProteinDatabase()
    ncbi = NCBITaxa()
    paralell = Parallelization()
    taxon_name = ncbi.get_taxid_translator([taxon_rank_id])[taxon_rank_id]
    representative_seqs = protDB.get_comparisons_same_taxon_id(
        taxon_rank_id, taxon_rank)

    # Each pair is registered in both directions.
    partners = {}
    for seq_tuple in representative_seqs:
        first, second = seq_tuple[0], seq_tuple[1]
        partners.setdefault(first, []).append(second)
        partners.setdefault(second, []).append(first)

    if not partners:
        sys.stdout.write("\033[K")
        print(taxon_rank + ' ' + taxon_name)
        return False

    chosen_item = max(partners, key=lambda k: len(partners[k]))
    represented = partners[chosen_item]
    # Parallel argument lists for the bulk update: one dict per protein id.
    params_a = [{'represented_by': chosen_item} for _ in represented]
    params_b = list(represented)

    bar = Bar(taxon_rank + ' ' + taxon_name, max=len(params_a))
    paralell.parallelize_7(protDB.update_protein_entry, [params_a, params_b],
                           bar=bar)
    bar.finish()

    protDB.update_protein_entry(
        {
            'representative_of_taxon': taxon_rank_id,
            'representative_taxon_rank': taxon_rank,
            'taxon_name_representative': taxon_name
        }, chosen_item)
    return True
def getNcbiName(taxonName):
    """Return the NCBI name for the taxid embedded in *taxonName*.

    The taxid is the second '@'-separated field of *taxonName*. When the
    lookup fails, *taxonName* itself is returned. A *taxonName* without an
    '@' raises IndexError, as before.
    """
    ncbi = NCBITaxa()
    taxId = taxonName.split('@')[1]
    try:
        name = ncbi.get_taxid_translator([taxId])[int(taxId)]
    except Exception:
        # Best-effort lookup; unlike a bare ``except`` this still lets
        # KeyboardInterrupt and SystemExit through.
        name = taxonName
    return name
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """
    retrieve ids and metadata from genbank file

    :param phylum_name: one or more taxon names separated by ';'
    :param dataset: prefix of the assembly summary file name
        (``{dataset}_{domain}_assembly_summary.txt`` under metadata_files_dir)
    :param return_d2ids: when true, return only the mapping
        superkingdom name -> descendant taxids
    :return: ``domain2dids`` if ``return_d2ids`` else
        ``(domain2aids, collect_info)``: lower-cased superkingdom name -> first
        column of the matching rows, and the matching rows themselves
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    ncbi = NCBITaxa()
    p2tid = ncbi.get_name_translator(phylum_names)
    for name in phylum_names:
        if not p2tid.get(name):
            print(f" '{name}'' not found. please check the name")

    found_names = [name for name in phylum_names if p2tid.get(name)]
    tids = [p2tid[name][0] for name in found_names]
    tid2name = {p2tid[name][0]: name for name in found_names}

    domain2dids = defaultdict(list)
    descend_ids = []
    tid2dids = {}
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        rank2tid = {rank: node for node, rank in ncbi.get_rank(lineages).items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[rank2tid['superkingdom']]
        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        tid2dids[tid2name[tid]] = len(_descend_ids)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # The context manager closes the file; the original left it open.
        with open(metadata) as handle:
            for row in tqdm(handle):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    # Column 5 is compared against the descendant taxids of
                    # all requested names, not only this domain's.
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
def name_output(taxid, gcf):
    """
    Build "<organism name> <gcf>" for a taxid and a gcf.

    Apostrophes are dropped from the organism name and slashes become
    underscores.
    """
    ncbi = NCBITaxa()
    key = int(taxid)
    organism = ncbi.get_taxid_translator([key])[key]
    for unwanted, substitute in (("'", ''), ("/", '_')):
        organism = organism.replace(unwanted, substitute)
    return " ".join((organism, gcf))
def getTaxName(taxId):
    """Return a five-letter code for *taxId* built from its NCBI name.

    The code is the first three letters of the first word plus the first two
    letters of the second word of the name, upper-cased, after removing every
    character that is not a letter, one of the digits 1-9, or whitespace.
    When the lookup fails or the name has fewer than two words, the result is
    ``"UNK"`` followed by the taxid.
    """
    ncbi = NCBITaxa()
    try:
        ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)]
        # NOTE(review): the class is 1-9, so the digit 0 is stripped as well;
        # possibly meant to be 0-9. Left as is to keep existing codes stable.
        ncbiName = re.sub(r'[^a-zA-Z1-9\s]+', '', ncbiName)
        taxName = ncbiName.split()
        name = taxName[0][:3].upper() + taxName[1][:2].upper()
    except Exception:
        # str() keeps the fallback working when taxId is passed as an int.
        name = "UNK" + str(taxId)
    return name
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file
    """
    ncbi = NCBITaxa()
    wanted_ranks = ["phylum", "class", "order", "family", "genus", "species"]
    taxon_ids = []
    phylum_count = {}

    with open(taxon_output_file, "w") as phylum_file:
        csvwriter = csv.writer(phylum_file, delimiter="\t")
        csvwriter.writerow([
            "species", "taxid", "phylum_number", "phylum", "class", "order",
            "family", "genus", "species"
        ])
        with open(mpwt_taxon_file, "r") as taxon_file:
            for line in csv.reader(taxon_file, delimiter="\t"):
                # Rows whose second column mentions "taxon" (the header) are
                # skipped.
                if "taxon" in line[1]:
                    continue
                taxon_ids.append(line[1])

                lineage = ncbi.get_lineage(line[1])
                lineage2ranks = ncbi.get_rank(lineage)
                names = ncbi.get_taxid_translator(lineage)
                ranks2lineage = {
                    rank: names[taxid]
                    for taxid, rank in lineage2ranks.items()
                }
                ranks = [
                    ranks2lineage.get(rank, "no_information")
                    for rank in wanted_ranks
                ]

                # Phylum label: first four letters of the phylum name.
                phylum = (ranks[0][:4]
                          if ranks[0] != "no_information" else "no_information")

                # Running number per phylum label; repeated "no_information"
                # rows get an empty suffix instead of a number.
                if phylum not in phylum_count:
                    phylum_count[phylum] = 1
                elif phylum == "no_information":
                    phylum_count[phylum] = ""
                else:
                    phylum_count[phylum] += 1

                csvwriter.writerow([line[0], line[1]] +
                                   [phylum + str(phylum_count[phylum])] + ranks)

    tree = ncbi.get_topology(taxon_ids)
    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
def get_rank_dict(taxa_name=None):
    """Return the names at the seven main ranks of a taxon's lineage.

    The taxon is looked up by name; if that fails and the name has several
    words, the first word alone (which may be a genus name) is tried.

    Returns
    -------
    dict or None
        Mapping of 'kingdom', 'phylum', 'class', 'order', 'family', 'genus'
        and 'species' to the name found in the lineage ('NA' where the rank
        is absent), or None when no taxid could be found. Diagnostics go to
        stderr.
    """
    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split()
        if len(words) > 1:
            # Try only the first word.
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])
        if not name_dict:
            print("can not find taxid for %s, maybe it's a misspelling.\n" %
                  taxa_name,
                  file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])
    rank_dict = dict.fromkeys([
        'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'
    ], 'NA')
    # One query each for the whole lineage instead of two per node.
    ranks = ncbi.get_rank(lineage_taxid_list)
    names = ncbi.get_taxid_translator(lineage_taxid_list)
    for taxid in lineage_taxid_list:
        rank = ranks[taxid]
        if rank in rank_dict:
            rank_dict[rank] = names[taxid]
    return rank_dict
def taxid2lineage(taxid):
    """Return ``{rank: name}`` for the main ranks found in *taxid*'s lineage.

    Keys appear in the order superkingdom, phylum, class, order, family,
    genus, species; ranks absent from the lineage are left out.
    """
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # Queried once; the original repeated this query for every rank.
    lineage_ranks = ncbi.get_rank(lineage)
    lineage_dict = dict()
    for rank in ranks:
        for node_id, node_rank in lineage_ranks.items():
            if node_rank == rank:
                lineage_dict[rank] = names[node_id]
    return lineage_dict
def checkTaxId(taxId):
    """Print whether *taxId* is known to the NCBI taxonomy and is a species.

    Prints the taxon's name when its rank is 'species', a warning with the
    actual rank otherwise, and a "not found" warning when the lookup fails.
    Returns None.
    """
    ncbi = NCBITaxa()
    try:
        # The lookup now happens only inside the try, so a failing lookup
        # reaches the "not found" message instead of escaping beforehand.
        rank = ncbi.get_rank([taxId])[int(taxId)]
        if rank != 'species':
            print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (taxId, rank))
        else:
            print('\033[92mNCBI taxon info: %s %s\033[0m' % (taxId, ncbi.get_taxid_translator([taxId])[int(taxId)]))
    except Exception:
        print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % taxId)
def get_taxo(self, function_get_taxid):
    """Set ``self.taxo`` to a ``Taxo(taxid, name, rank)`` for this object.

    ``function_get_taxid`` is called with ``self`` to obtain the taxid. Name
    and rank stay ``None`` when the corresponding lookup returns nothing.
    """
    ncbi = NCBITaxa()
    taxid = function_get_taxid(self)

    names_by_id = ncbi.get_taxid_translator([taxid])
    taxname = names_by_id[int(taxid)] if names_by_id else None

    ranks_by_id = ncbi.get_rank([taxid])
    taxrank = ranks_by_id[int(taxid)] if ranks_by_id else None

    self.taxo = Taxo(taxid, taxname, taxrank)
def get_tax_lineage(taxonid, source):
    """Return taxonomy lineage information

    This function uses Biopython library to connect NCBI database
    and search for taxonomy information or ete3 to download
    taxdump file and search the information locally. Results are cached in
    ``LINEAGES``; a cached taxon id is returned without any lookup.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid;
        "taxdump" selects the local ete3 database, anything else Entrez

    Returns
    -------------
    lineage: dict
        Species lineage (rank -> scientific name)

    """
    import time  # local import: only needed to pace the Entrez retries

    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    if source == "taxdump":
        ncbi_taxdump = NCBITaxa()
        lineage_ids = ncbi_taxdump.get_lineage(taxonid)
        ranks = ncbi_taxdump.get_rank(lineage_ids)
        names = ncbi_taxdump.get_taxid_translator(lineage_ids)
        LINEAGES[taxonid] = {ranks[i]: names[i] for i in lineage_ids}
        return LINEAGES[taxonid]

    Entrez.email = "*****@*****.**"
    # Retry until Entrez returns data; failures are appended to LOG.
    while True:
        data = ""
        try:
            handle = Entrez.efetch(id=taxonid, db="taxonomy", retmode="xml")
            try:
                data = Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails.
                handle.close()
        except Exception:
            with open(LOG, "a") as log:
                print("Error when searching information about {}".format(taxonid), file=log)
        if data:
            break
        # Pause between attempts rather than retrying in a tight loop.
        time.sleep(1)

    lineage = {d["Rank"]: d["ScientificName"] for d in data[0]["LineageEx"]}
    lineage[data[0]["Rank"]] = data[0]["ScientificName"]
    LINEAGES[taxonid] = lineage
    return LINEAGES[taxonid]
def get_lineage_sciname_at_desired_ranks(taxid, desired_ranks):
    'Retrieve lineage information at desired taxonomic ranks'
    # Returns (taxids, names), both aligned with desired_ranks; a rank missing
    # from the lineage is reported as taxid 0 and name 'NA'.
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    rank_by_taxid = ncbi.get_rank(lineage)

    # Invert to rank -> taxid.
    taxid_by_rank = {}
    for node_id, node_rank in rank_by_taxid.items():
        taxid_by_rank[node_rank] = node_id

    ranks = []
    for wanted in desired_ranks:
        found = taxid_by_rank.get(wanted)
        ranks.append(0 if found is None else found)

    ranks_translation = ncbi.get_taxid_translator(ranks)
    ranks_sciname = [
        'NA' if node_id == 0 else ranks_translation[node_id]
        for node_id in ranks
    ]
    return (ranks, ranks_sciname)
# assign_rank_representation
#
# For every entry from protDB.get_entries_no_representative() this fetches the
# NCBI lineage of entry[1] while recording warnings. If a warning was
# recorded, the last whitespace-separated token of its message is taken as
# the new taxon id and written to 'representative_of_taxon' for entry[0]
# (presumably ete3's "taxid was translated" notice - TODO confirm). It then
# searches the lineage for the node whose rank equals ``rank`` and, depending
# on self.bigger_than_rank_taxon(...) and on entry[2] being None, stores that
# node's id, rank and name on the entry via protDB.update_protein_entry.
#
# NOTE(review): the original indentation was lost when this definition was
# collapsed onto one line. The placement of ``insert = False``, of
# ``print(entry[0])`` and of the warning handling relative to the ``with``
# block cannot be recovered from here - restore from version control before
# editing.
def assign_rank_representation(self, rank='species'): protDB = db_handling.ProteinDatabase() ncbi = NCBITaxa() entries_no_representative = protDB.get_entries_no_representative() for entry in entries_no_representative: taxon_id = entry[1] with warnings.catch_warnings(record=True) as w: warn_msg = None warnings.simplefilter("always") lineage = ncbi.get_lineage(taxon_id) for a in w: warn_msg = a.message if warn_msg: warn_data = str(warn_msg).split() taxon_id = int(warn_data[-1]) protDB.update_protein_entry( {'representative_of_taxon': taxon_id}, entry[0]) lineage_ranks = ncbi.get_rank(lineage) lineage_translation = ncbi.get_taxid_translator(lineage) insert = True ellected_rank_id = '' for rank_id, lineage_rank in lineage_ranks.items(): if rank == lineage_rank: ellected_rank_id = rank_id print(entry[0]) if lineage_ranks[taxon_id] != rank: if not self.bigger_than_rank_taxon( lineage_ranks[taxon_id], rank) and ellected_rank_id != '': protDB.update_protein_entry( { 'representative_of_taxon': ellected_rank_id, 'representative_taxon_rank': rank, 'taxon_name_representative': lineage_translation[ellected_rank_id] }, entry[0]) insert = False if entry[2] == None and insert and ellected_rank_id != '': protDB.update_protein_entry( { 'representative_of_taxon': ellected_rank_id, 'representative_taxon_rank': rank, 'taxon_name_representative': lineage_translation[ellected_rank_id] }, entry[0])
def make_krona_table(f, db):
    """Turn a tab-separated taxonomic report into a krona-style table.

    ``f`` is read as a header-less TSV with the columns clade_percent,
    clade_reads, reads, rank (one-letter code), taxid and name. Rows with
    ``reads > 0`` are kept. ``db`` is an optional argument for ``NCBITaxa``.

    Each kept row becomes one output row holding the read count and the names
    of the superkingdom..species nodes of the taxid's lineage; rows whose rank
    is not one of those seven (unclassified, or an unranked node directly
    below a species) also get their own name in the ``leaf`` column. Rows with
    other rank codes are skipped.
    """
    ncbi_taxa = NCBITaxa(db) if db else NCBITaxa()

    main_ranks = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]
    krona_table = pd.DataFrame(columns=["abundance"] + main_ranks + ["leaf"])
    one_letter_ranks = dict(zip("DPCOFGS", main_ranks))

    df = pd.read_csv(f, header=None,
                     names=["clade_percent", "clade_reads", "reads", "rank", "taxid", "name"],
                     sep="\t")
    df = df.loc[df.reads > 0]

    for j, i in enumerate(df.index):
        r = df.loc[i]
        taxid, reads, name, one_letter_rank = r["taxid"], r["reads"], r["name"], r["rank"]

        if one_letter_rank == "-":
            rank = ncbi_taxa.get_rank([taxid])[taxid]
            try:
                parent_taxid = ncbi_taxa.get_lineage(taxid)[-2]
            except IndexError:
                parent_taxid = taxid
            parent_rank = ncbi_taxa.get_rank([parent_taxid])[parent_taxid]
            # Only unranked nodes sitting directly under a species are kept.
            if not (rank == "no rank" and parent_rank == "species"):
                continue
            rank = "leaf"
        elif one_letter_rank == "U":
            rank = "unclassified"
        elif one_letter_rank in one_letter_ranks:
            #TODO: Shouldn't be too many reads mapped directly to ranks not in the krona table, but check eventually
            rank = one_letter_ranks[one_letter_rank]
        else:
            continue

        res = {"abundance": reads}
        for column in main_ranks + ["leaf"]:
            res[column] = ""

        if rank != "unclassified":
            rank_dict = ncbi_taxa.get_rank(ncbi_taxa.get_lineage(taxid))
            name_dict = ncbi_taxa.get_taxid_translator(ncbi_taxa.get_lineage(taxid))
            for dict_taxid, dict_rank in rank_dict.items():
                if dict_rank in res:
                    res[dict_rank] = name_dict[dict_taxid]

        if rank not in main_ranks:
            res["leaf"] = name

        _df = pd.DataFrame(res, index=[j])[krona_table.columns]
        krona_table = pd.concat([krona_table, _df])
    return krona_table
def taxid_to_lineage_string(taxid):
    """Format the lineage of *taxid* as ``k_...;d_...;p_...;...``.

    Each part is the first letter of the level, an underscore and the taxon
    name. Parts follow the order of ``tax_order``; the NCBI rank
    'superkingdom' is reported as 'domain'. Levels absent from the lineage
    are omitted; an empty string is returned when nothing matches.
    """
    # NOTE(review): 'kingdom' is listed ahead of 'domain', so a kingdom is
    # emitted before the domain - confirm this is the intended order.
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # One query for all nodes instead of one per (level, node) pair.
    ranks = ncbi.get_rank(list(names))
    parts = []
    for level in tax_order:
        for tid, name in names.items():
            rank = ranks[tid]
            if rank == 'superkingdom':
                rank = 'domain'
            if rank == level:
                parts.append(level[0] + '_' + name)
    return ';'.join(parts)
class SynTax:
    """Synopsis: SynTax class contains all the relevant taxonomy to mine"""

    def __init__(self):
        self.ncbi = NCBITaxa()

    def get_descendants(self, domain: str, taxon_rank: str) -> List[str]:
        """Synopsis: Fetch the names of all descendant taxa of *domain*.

        Descendants are limited to ``taxon_rank`` with subspecies collapsed.
        """
        # Domain must be in title case
        taxids = self.ncbi.get_descendant_taxa(domain,
                                               rank_limit=taxon_rank,
                                               collapse_subspecies=True)
        names = []
        for taxid in taxids:
            # Translated one id at a time, in the order the ids were returned.
            names.extend(self.ncbi.get_taxid_translator([taxid]).values())
        return names
def get_taxonomic_group_mapping(group_ids: List[str],
                                selected_rank: str) -> Tuple[Dict, Dict]:
    """
    Function to create a mapping from NCBI-taxon ids to groups which are used to split
    the provided training records into training and validation sets

    :param group_ids: List of identifiers that should be NCBI taxon ids
    :param selected_rank: selected standard rank determining on which level the set is split in
    training and validation-set; matched case-insensitively. A value that is
    not a standard rank is replaced by ``auto_select_rank(group_ids)``.
    :return: Two dicts keyed by the input taxon ids: the name of the ancestor
    at the selected rank ("unknown" if the lineage has none), and an integer
    group number per distinct ancestor
    """
    ncbi = NCBITaxa()
    standard_ranks = [
        "superkingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]
    # NCBI rank names are lower case, so compare in lower case throughout;
    # previously "Species" passed validation but never matched a lineage rank.
    if selected_rank.lower() in standard_ranks:
        selected_rank = selected_rank.lower()
    else:
        selected_rank = auto_select_rank(group_ids)

    taxon_ancestor_mapping = {}
    for taxon in set(group_ids):
        lineage = ncbi.get_lineage(int(taxon))
        ids_of_ranks = ncbi.get_rank(lineage)
        # 0 is the fall-back if the lineage has no entry on this level.
        taxon_ancestor_mapping[taxon] = 0
        for ancestor_id, rank in ids_of_ranks.items():
            if rank == selected_rank:
                taxon_ancestor_mapping[taxon] = ancestor_id

    ancestor_ids = set(taxon_ancestor_mapping.values())
    ancestor_names = ncbi.get_taxid_translator(ancestor_ids)
    ancestor_names[0] = "unknown"
    ancestor_enumeration = {
        ancestor_id: x
        for x, ancestor_id in enumerate(ancestor_ids)
    }
    group_name_mapping = {
        taxon: ancestor_names[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    group_id_mapping = {
        taxon: ancestor_enumeration[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    return group_name_mapping, group_id_mapping
def get_ncbi_taxonomy(taxid):
    """Return the ranked part of *taxid*'s lineage as ``;name;name;...``.

    Nodes whose rank is "no rank" are left out. Every name is preceded by a
    semicolon, so a non-empty result starts with ';'.
    """
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    ranks = ncbi.get_rank(lineage)
    pieces = [
        ";" + names[node_id]
        for node_id in lineage
        if ranks[node_id] != "no rank"
    ]
    return "".join(pieces)
def run(args):
    """Answer an NCBI taxonomy query described by the parsed arguments.

    ``args.search`` holds taxids and/or names. Depending on ``args.tree``,
    ``args.descendants`` and ``args.info`` (tree is the default when none is
    set) this dumps a taxonomy tree, prints a table of descendants, or prints
    a table with rank and lineage per taxid. Exits with status -1 when no
    search terms were given.
    """
    # add lineage profiles/stats
    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names; each name maps to a list of taxids, which cannot be
    # used as a dict key itself, so register every taxid individually
    name2tax = ncbi.get_name_translator(all_names)
    for taxid_list in name2tax.values():
        for taxid in taxid_list:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax.keys())

    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                # NOTE(review): name2realname / name2score are not defined in
                # this function; presumably module-level dicts - confirm.
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    # Report only the names that are still untranslated after fuzzy matching.
    unresolved_names = [name for name in not_found_names if name not in name2tax]
    if unresolved_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(unresolved_names))

    if args.tree:
        if len(all_taxids) == 1:
            # a dict view is not an iterator; iter() is needed before next()
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print('\t'.join([str(taxid),
                             # str(): the fallback is the integer taxid
                             str(translator.get(taxid, taxid)),
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))
    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))
#!/usr/bin/python Usage = """ Print taxid's lineage and ranks by default prints to the stdout Usage: taxid_ranks.py taxid > ouput.txt Arun Seetharam [email protected] taxid_ranks.py -version 1.0 04/13/2017 """ from ete3 import NCBITaxa import sys ncbi = NCBITaxa() if len(sys.argv)<2: print Usage else: cmdargs = str(sys.argv) lineage = ncbi.get_lineage((sys.argv[1])) names = ncbi.get_taxid_translator(lineage) for taxid in lineage: print [ncbi.get_rank([taxid])], [names[taxid]] # print [names[taxid] for taxid in lineage] # print [ncbi.get_rank([taxid]) for taxid in lineage] # print [ncbi.get_rank([name]) for name in names]
#!/usr/bin/python3 from ete3 import NCBITaxa ncbi = NCBITaxa() diamond_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond.tsv" out_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond_annotation.tsv" taxids = [] with open(diamond_path) as input_f: for line in input_f: newtaxid = line.split("\t")[1] taxids.append(newtaxid) taxids_nr = list(set(taxids)) tax_names = ncbi.get_taxid_translator(taxids_nr) input_f = open(diamond_path, "r") output_f = open(out_path, 'w') for line in input_f: line_split = line.rstrip().split("\t") id = line_split[0] taxid = line_split[1] evalue = line_split[2] if taxid == "0": name = "None" is_bacteria = 0 else: name = tax_names[int(taxid)] is_bacteria = 1 if 2 in ncbi.get_lineage(taxid) else 0