def correct_erroneous_repres_of_taxon_instances():
    """Resolve protein entries flagged as erroneous taxon representatives.

    Iterates over the rows returned by
    ``ProteinDatabase.get_erroneous_repres_of_taxon_instances()``. For each
    row it compares the NCBI rank of the row's taxon (``instance[1]``) with
    the rank of the value stored in field 5 of the protein entry looked up
    via ``instance[2]``, and then rewrites ``representative_of_taxon``,
    ``represented_by`` and the representative name/rank fields through the
    database's update methods.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source. Which ``if`` the final ``else`` pairs with,
    and how many of the trailing updates belong to it, must be confirmed
    against the original file.
    """
    protDB = db_handling.ProteinDatabase()
    ncbi = NCBITaxa()
    erroneous_instances = protDB.get_erroneous_repres_of_taxon_instances()
    for instance in erroneous_instances:
        lineage = ncbi.get_lineage(instance[1])
        lineage_ranks = ncbi.get_rank(lineage)
        # NOTE(review): the entry is indexed here before the truthiness check
        # on the same lookup below; if get_protein_entry can return an empty
        # value this line fails first -- confirm.
        representative_id = protDB.get_protein_entry(instance[2])[5]
        lineage_translation = ncbi.get_taxid_translator(lineage)
        if protDB.get_protein_entry(instance[2]):
            if representative_id != None:
                # Same lookup as above, repeated.
                representative_id = protDB.get_protein_entry(
                    instance[2])[5]
                print('representative_id', representative_id)
                lineage_representative = ncbi.get_lineage(
                    representative_id)
                print('lineage_representative', lineage_representative)
                lineage_representative_ranks = ncbi.get_rank(
                    lineage_representative)
                print('lineage_representative_ranks',
                      lineage_representative_ranks)
                if lineage_ranks[
                        instance[1]] == lineage_representative_ranks[
                            representative_id]:
                    # Both taxa sit at the same rank: the one with more
                    # children (ties go to the stored representative) stays.
                    count_instance = protDB.get_count_of_children_of_repres(
                        instance[1])
                    count_representative = protDB.get_count_of_children_of_repres(
                        representative_id)
                    if count_representative >= count_instance:
                        protDB.update_protein_entry(
                            {'representative_of_taxon': None}, instance[1])
                    else:
                        protDB.update_protein_entry_by_repres_by(
                            {'represented_by': instance[1]},
                            representative_id)
                        protDB.update_protein_entry(
                            {'represented_by': instance[1]},
                            representative_id)
                        protDB.update_protein_entry(
                            {'representative_of_taxon': None},
                            representative_id)
                else:
                    # Ranks differ: clear the flag on instance[0] and record
                    # instance[1] (with its name and rank) as represented
                    # taxon.
                    protDB.update_protein_entry({'representative_of_taxon': None},
                                                instance[0])
                    protDB.update_protein_entry(
                        {'representative_of_taxon': instance[1]}, instance[2])
                    protDB.update_protein_entry(
                        {
                            'taxon_name_representative':
                            lineage_translation[instance[1]]
                        }, representative_id)
                    protDB.update_protein_entry(
                        {'representative_taxon_rank': lineage_ranks[instance[1]]},
                        representative_id)
        # Visual separator between processed instances in the console output.
        print(
            '---------------------------------------------------------------------------------'
        )
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield one taxonomic-lineage row per contig of the annotation table.

    Each yielded list starts with the contig name, followed by one entry per
    rank in the order genus, subfamily, family, order.

    * If fewer than ``prop_annot`` of a contig's proteins have a hit
      (``Best_hit != "No hit"``), the four rank entries are empty strings.
    * Otherwise ranks are tried from genus upwards. A rank with fewer than
      ``min_prot`` labelled hits contributes ``""``. A rank whose most
      frequent taxon covers less than ``tax_thres`` of the hits contributes
      that proportion. The first rank passing both tests contributes its
      name and the names of all remaining higher ranks, and ends the search.

    ``ncbi_db`` is passed to ``NCBITaxa`` as ``dbfile``.
    """
    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]

    for contig in set(annot_df["Contig"]):
        row = [contig]
        contig_df = annot_df[annot_df["Contig"] == contig]
        total_prot = len(contig_df)
        annot_prot = sum(contig_df["Best_hit"] != "No hit")

        # Too few annotated proteins: emit blanks for every rank.
        if annot_prot < prop_annot * total_prot:
            row.extend([""] * 4)
            yield row
            continue

        labels = contig_df[pd.notnull(contig_df["Label"])]["Label"].values
        taxids = [
            ncbi.get_name_translator([label])[label][0] for label in labels
        ]
        # For every hit: rank -> taxid, restricted to the ranks of interest.
        hit_lineages = []
        for taxid in taxids:
            rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
            hit_lineages.append({
                rank: member
                for member, rank in rank_by_taxid.items()
                if rank in tax_rank_order
            })

        for position, rank in enumerate(tax_rank_order):
            taxa_at_rank = [hit.get(rank) for hit in hit_lineages]
            total_hits = sum(pd.notnull(taxa_at_rank))
            if total_hits < min_prot:
                row.append("")
                continue

            votes = Counter(
                [taxon for taxon in taxa_at_rank if pd.notnull(taxon)])
            # Stable descending sort: ties keep first-seen order.
            best_taxid, best_count = sorted(
                votes.items(),
                key=lambda pair: pair[1],
                reverse=True,
            )[0]
            prop_hits = best_count / total_hits
            if prop_hits < tax_thres:
                row.append(prop_hits)
                continue

            # Consensus reached: fill this rank and every higher one from
            # the lineage of the winning taxon, then stop.
            taxid_by_rank = {
                member_rank: member
                for member, member_rank in ncbi.get_rank(
                    ncbi.get_lineage(best_taxid)).items()
            }
            for higher_rank in tax_rank_order[position:]:
                key = taxid_by_rank.get(higher_rank)
                if pd.notnull(key):
                    row.append(ncbi.get_taxid_translator([key])[key])
                else:
                    row.append("")
            break

        yield row
def make_krona_table(f, db):
    """Build a Krona-style abundance table from a tab-separated report file.

    ``f`` is read with the columns ``clade_percent, clade_reads, reads, rank,
    taxid, name``; only rows with ``reads > 0`` are used. ``db`` is passed to
    ``NCBITaxa`` when truthy, otherwise the default database is opened.

    Returns a DataFrame with the columns ``abundance``, the seven standard
    ranks and ``leaf``. Rows whose one-letter rank is ``-`` are kept only
    when the taxon is "no rank" directly below a species (stored as a leaf).
    ``U`` rows are kept as unclassified, and any other unknown rank code is
    skipped.
    """
    ncbi_taxa = NCBITaxa(db) if db else NCBITaxa()

    standard_ranks = ["superkingdom", "phylum", "class", "order", "family",
                      "genus", "species"]
    columns = ["abundance"] + standard_ranks + ["leaf"]
    krona_table = pd.DataFrame(columns=columns)
    one_letter_ranks = {"D": "superkingdom", "P": "phylum", "C": "class",
                        "O": "order", "F": "family", "G": "genus",
                        "S": "species"}

    df = pd.read_csv(f, header=None,
                     names=["clade_percent", "clade_reads", "reads", "rank",
                            "taxid", "name"],
                     sep="\t")
    df = df.loc[df.reads > 0]

    for row_number, label in enumerate(df.index):
        record = df.loc[label]
        taxid = record["taxid"]
        rank_code = record["rank"]

        if rank_code == "-":
            rank = ncbi_taxa.get_rank([taxid])[taxid]
            try:
                parent_taxid = ncbi_taxa.get_lineage(taxid)[-2]
            except IndexError:
                # Lineage has a single member: treat the taxon as its parent.
                parent_taxid = taxid
            parent_rank = ncbi_taxa.get_rank([parent_taxid])[parent_taxid]
            if not (rank == "no rank" and parent_rank == "species"):
                continue
            rank = "leaf"
        elif rank_code == "U":
            rank = "unclassified"
        elif rank_code in one_letter_ranks:
            rank = one_letter_ranks[rank_code]
        else:
            # Rank code that has no column in the krona table.
            continue

        res = dict.fromkeys(columns, "")
        res["abundance"] = record["reads"]

        if rank != "unclassified":
            lineage = ncbi_taxa.get_lineage(taxid)
            rank_dict = ncbi_taxa.get_rank(lineage)
            name_dict = ncbi_taxa.get_taxid_translator(lineage)
            for member, member_rank in rank_dict.items():
                if member_rank in res:
                    res[member_rank] = name_dict[member]

        if rank not in standard_ranks:
            res["leaf"] = record["name"]

        new_row = pd.DataFrame(res, index=[row_number])[krona_table.columns]
        krona_table = pd.concat([krona_table, new_row])

    return krona_table
def main(tree_path):
    """Rename the leaves of a Newick tree to NCBI taxids and prune the tree.

    For every leaf the name (underscores replaced by spaces) is translated to
    an NCBI taxid. Names unknown to NCBI fall back to the module-level
    ``byhand`` mapping, or to the literal ``"not found"``. A name that maps
    to several taxids keeps the string form of the whole id list. A taxid
    that was already assigned to an earlier leaf gets the suffix ``_B``.

    Side effects: writes ``names_to_ids.csv`` (name, id, Magnoliophyta
    membership), reads the species list from a hard-coded path, prunes the
    tree to that list and writes ``tree_ids.tree``.

    Args:
        tree_path: path of the Newick tree, parsed with ``format=1``.
    """
    magnoliophyta_taxid = 3398

    ncbi = NCBITaxa()
    tree = Tree(tree_path, format=1)

    names = []
    ids = []
    in_magnoliophyta = []

    for leaf in tree:
        name = leaf.name.replace("_", ' ')
        name2taxid = ncbi.get_name_translator([name])

        if not name2taxid:
            if name in byhand:
                taxid = byhand[name]
                magno = "yes"
            else:
                taxid = "not found"
                magno = ""
        elif len(name2taxid[name]) > 1:
            # The translator maps each name to a LIST of taxids. The result
            # dict itself never holds more than the single queried name, so
            # the ambiguity test has to look at the list.
            taxid = str(name2taxid[name])
            magno = ""
            print("two ids:  %s" % name)
        else:
            taxid = str(name2taxid[name][0])
            lineage = ncbi.get_lineage(taxid)
            magno = "yes" if magnoliophyta_taxid in lineage else "no"

        if taxid != "not found" and taxid in ids:
            print("duplicate:  %s %s" % (name, taxid))
            taxid += "_B"

        leaf.name = taxid
        names.append(name)
        ids.append(taxid)
        in_magnoliophyta.append(magno)

    df = pd.DataFrame({
        'name': names,
        'id': ids,
        'in magnoliophyta': in_magnoliophyta
    })
    df.to_csv('names_to_ids.csv')

    p = "/groups/itay_mayrose/nomihadar/trees/magnoliophyta_tree/sequences_filtered_zanne/species/intersect_mytree_zannetree_mangoete3.ls"
    with open(p, 'r') as f:
        lines = f.read().splitlines()

    # These three ids are excluded from the pruned tree.
    species = [x for x in lines if x not in ['58454', '142615', '77013']]

    tree.prune(list(set(species)), preserve_branch_length=True)
    tree.write(outfile="tree_ids.tree")
def from_taxid(cls, taxid: int) -> "Lineage":
    """
    Build a `Lineage` from an NCBI taxonomy id.

    The ranks of the taxid's lineage are capitalised and ``Superkingdom`` is
    stored under ``Kingdom``. The fields of `cls` are then filled in
    declaration order, stopping at the first field with no matching rank.

    Parameters
    ----------
    taxid : int
        A valid NCBI taxonomy id

    Returns
    -------
    "Lineage"
        Instance of the `Lineage` class
    """
    ncbi = NCBITaxa()
    lineage_taxids = ncbi.get_lineage(taxid)
    names_by_taxid = ncbi.get_taxid_translator(lineage_taxids)

    taxid_by_rank = {}
    for member, rank in ncbi.get_rank(lineage_taxids).items():
        taxid_by_rank[rank.capitalize()] = member

    if "Superkingdom" in taxid_by_rank:
        taxid_by_rank["Kingdom"] = taxid_by_rank.pop("Superkingdom")

    taxa: Dict[str, str] = {}
    for field in cls._fields:
        if field not in taxid_by_rank:
            break
        taxa[field] = names_by_taxid[taxid_by_rank[field]]

    return cls(**taxa)
def get_metadata(records: List[SeqRecord]):
    """Collect selected taxonomy ranks for each record and save them as CSV.

    The organism annotation of every record is translated to an NCBI taxid
    (first match). For that taxid's lineage the names of the ranks
    superkingdom, order, family, subfamily, genus and species are gathered
    together with the record id (column ``aid``).

    The table is written to ``metadata.csv`` and also returned.
    """
    ncbi = NCBITaxa()
    organisms = [record.annotations["organism"] for record in records]
    name_translator = ncbi.get_name_translator(organisms)
    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    metadata = []
    for record in records:
        taxid = name_translator[record.annotations["organism"]][0]
        lineage = ncbi.get_lineage(taxid)
        ranks = ncbi.get_rank(lineage)
        names = ncbi.get_taxid_translator(lineage)

        row = {}
        for member in lineage:
            if ranks[member] in sought_ranks:
                row[ranks[member]] = names[member]
        row["aid"] = record.id
        metadata.append(row)

    df = pd.DataFrame(metadata)
    df.to_csv("metadata.csv")
    return df
def get_full_lineages(otus):
    """Write ``full_lineages.tsv`` with the full NCBI lineage of every OTU.

    Requires ete3.

    Input: list of the OTU taxids in the table obtained from the get_otus
    function. The list is no longer modified.
    Output: the tab-separated file ``full_lineages.tsv`` with the columns
    ``OTU`` and ``LINEAGE``; nothing is returned.

    The ids 0, 1 and 2 are not looked up; they get the fixed lineages ``""``,
    ``"root"`` and ``"root;Bacteria"`` and are listed first.
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    # Fixed entries: these ids are never sent to the database.
    fixed_lineages = ((0, ""), (1, "root"), (2, "root;Bacteria"))

    lineages = {}
    for otu, text in fixed_lineages:
        if otu in otus:
            lineages[otu] = text

    for entrie in otus:
        if entrie in lineages:
            continue
        lineage = ncbi.get_lineage(entrie)  # list of lineage taxids
        names = ncbi.get_taxid_translator(lineage)  # dict taxid -> name
        # The translator dict is not ordered like the lineage, so the names
        # are picked in lineage order instead of joining the dict values.
        lineages[entrie] = ";".join(
            names[taxid] for taxid in lineage if taxid in names)

    lineages_df = pd.DataFrame(list(lineages.items()),
                               columns=["OTU", "LINEAGE"])
    lineages_df.to_csv("full_lineages.tsv", sep="\t", index=False, header=True)
    print("full lineage file created")
def get_lineage(self, taxid):
    """Return the names of all taxa in the NCBI lineage of ``taxid``.

    The names follow the order of the taxid list returned by
    ``NCBITaxa.get_lineage``.
    """
    taxa_db = NCBITaxa()
    lineage_ids = taxa_db.get_lineage(taxid)
    id_to_name = taxa_db.get_taxid_translator(lineage_ids)

    ordered_names = []
    for lineage_id in lineage_ids:
        ordered_names.append(id_to_name[lineage_id])
    return ordered_names
def sort_collection_by_taxon_rank(collection, key, rank='species', rank_id=None):
    """Lift taxids to a given rank and return the items sorted by that field.

    Every item of ``collection`` is an indexable record whose position
    ``key`` holds an NCBI taxid. If the taxid's lineage contains a taxon of
    rank ``rank`` and the taxid itself is not of that rank, the record is
    replaced by a tuple in which position ``key`` holds that ancestor.

    Args:
        collection: sequence of records; it is not modified.
        key: integer position of the taxid inside each record.
        rank: rank the taxids are lifted to.
        rank_id: when given, only records whose ancestor at ``rank`` equals
            this taxid are replaced.

    Returns:
        tuple of records sorted by position ``key``.
    """
    ncbi = NCBITaxa()
    # Work on a copy: the original aliased the caller's list and overwrote
    # its elements in place.
    new_collection = list(collection)

    for i, item in enumerate(collection):
        lineage = ncbi.get_lineage(item[key])
        lineage_ranks = ncbi.get_rank(lineage)
        needs_lift = (rank in lineage_ranks.values()
                      and item[key] in lineage_ranks
                      and lineage_ranks[item[key]] != rank)
        if not needs_lift:
            continue
        for taxon, taxon_rank in lineage_ranks.items():
            if taxon_rank != rank:
                continue
            if rank_id is None or taxon == rank_id:
                new_collection[i] = tuple(
                    taxon if k == key else value
                    for k, value in enumerate(item))

    return tuple(sorted(new_collection, key=operator.itemgetter(key)))
def get_taxonomy(species_name, name_format="Genus species", ranks=None,
                 update_db=False):
    """Look up the NCBI lineage of a species name.

    Args:
        species_name: species name; converted with ``str``.
        name_format: ``"Genus species"`` (used as is) or ``"Genus_species"``
            (underscores are replaced by spaces).
        ranks: optional list of rank names to extract.
        update_db: when equal to ``True`` the taxonomy database is updated
            first.

    Returns:
        * name unknown: ``['unknown']``, or one ``'unknown'`` per requested
          rank;
        * ``ranks`` not given: the taxid -> name dict of the whole lineage;
        * otherwise: list with the name of every lineage member whose rank
          matches, in the order of ``ranks``. Ranks absent from the lineage
          add nothing.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db == True:  # noqa: E712 - equality kept on purpose
        ncbi.update_taxonomy_database()

    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if len(species_id) == 0:
        if ranks is None:
            return ['unknown']
        return ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    parsed_names = []
    for wanted in ranks:
        matching = [member for member, member_rank in lineage_rk.items()
                    if member_rank == wanted]
        parsed_names.extend(names[member] for member in matching)
    return parsed_names
def get_full_lineage(otus):
    """Write ``full_lineages_updated.tsv`` with the NCBI lineage of every OTU.

    Requires the ete3 library.

    Input: list with the keys of the updated_input_dic (OTU taxids). The
    list is no longer modified.
    Output: the tab-separated file ``full_lineages_updated.tsv`` with the
    columns ``OTU`` and ``lineage``; nothing is returned.

    The ids 0, 1 and 2 are not looked up; they get the fixed lineages ``""``,
    ``"root"`` and ``"root;Bacteria"`` and are listed first.
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    # Fixed entries: these ids are never sent to the database.
    fixed_lineages = ((0, ""), (1, "root"), (2, "root;Bacteria"))

    lineages = {}
    for otu, text in fixed_lineages:
        if otu in otus:
            lineages[otu] = text

    for entrie in otus:
        if entrie in lineages:
            continue
        lineage = ncbi.get_lineage(entrie)  # list of lineage taxids
        names = ncbi.get_taxid_translator(lineage)  # dict taxid -> name
        # The translator dict is not ordered like the lineage, so the names
        # are picked in lineage order instead of joining the dict values.
        lineages[entrie] = ";".join(
            names[taxid] for taxid in lineage if taxid in names)

    lineages_df = pd.DataFrame(list(lineages.items()),
                               columns=["OTU", "lineage"])
    lineages_df.to_csv("full_lineages_updated.tsv",
                       sep="\t",
                       index=False,
                       header=True)
def get_parent_taxa(self):
    """ Get parent taxa

    The lineage consists of the NCBI lineage of the nearest NCBI taxon,
    followed by one taxon per word of the additional name beyond that taxon
    (each extending the previous name by one word). The last element, which
    is the taxon itself, is dropped.

    Returns:
        :obj:`list` of :obj:`Taxon`: list of parent taxa, or :obj:`None` if
        the taxon has no nearest NCBI taxon
    """
    if self.id_of_nearest_ncbi_taxon is None:
        return None

    cls = self.__class__
    ncbi_taxa = NCBITaxa()
    lineage = [
        cls(ncbi_id=ncbi_id)
        for ncbi_id in ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)
    ]

    if self.additional_name_beyond_nearest_ncbi_taxon:
        base_name = ncbi_taxa.translate_to_names(
            [self.id_of_nearest_ncbi_taxon])[0]
        # The additional name starts with a separator character, hence [1:].
        names = self.additional_name_beyond_nearest_ncbi_taxon[1:].split(' ')
        # Build "base w1", "base w1 w2", ... from the WORD list. The original
        # sliced the current word, which appended single characters.
        for n_words in range(1, len(names) + 1):
            lineage.append(
                cls(name=base_name + ''.join(' ' + word
                                             for word in names[0:n_words])))

    return lineage[0:-1]
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """
    Retrieve assembly ids and metadata rows for all taxa below the given names.

    :param phylum_name: one or more taxon names separated by ';'
    :param dataset: prefix of the assembly summary files, which are read from
        ``metadata_files_dir`` as ``{dataset}_{domain}_assembly_summary.txt``
    :param return_d2ids: when true, return only the mapping from domain name
        to descendant taxids and do not read any summary file
    :return: ``(domain2aids, collect_info)``: assembly ids per lower-cased
        domain, and the raw summary rows whose taxid (column 6) is one of the
        descendant taxids; or ``domain2dids`` when ``return_d2ids`` is true
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    # phylum_name = "Nitrospirae;"
    # phylum_tid = "40117"
    ncbi = NCBITaxa()

    p2tid = ncbi.get_name_translator(phylum_names)
    for name in phylum_names:
        if not p2tid.get(name):
            print(f" '{name}'' not found. please check the name")
    # First taxid of every name that could be translated.
    tids = [p2tid[name][0] for name in phylum_names if p2tid.get(name)]

    domain2dids = defaultdict(list)
    descend_ids = []
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        rank2tid = {rank: member
                    for member, rank in ncbi.get_rank(lineages).items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[rank2tid['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # Context manager so the summary file is closed after scanning.
        with open(metadata) as summary:
            for row in tqdm(summary):
                if not row.startswith("GC"):
                    continue
                rows = row.split('\t')
                if int(rows[5]) in descend_ids:
                    collect_info.append(row)
                    domain2aids[d].append(rows[0])
    return domain2aids, collect_info
def main():
    """Make queries against NCBI Taxa databases"""
    # Command line arguments and the ete NCBI taxa object.
    args = get_args()
    ncbi = NCBITaxa()
    chatty = args.verbose > 0
    very_chatty = args.verbose > 1

    if very_chatty:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Refresh the local database on request.
    if args.update is True:
        if very_chatty:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # A name takes precedence over a TaxID: translate it and store the id.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if chatty:
        # Use the given name, or translate the TaxID back to its name.
        if args.name:
            display_name = args.name
        else:
            display_name = ncbi.get_taxid_translator([args.taxid
                                                      ])[args.taxid]
        tax_dict = {
            'Name': display_name,
            'TaxID': args.taxid,
            'Rank': ncbi.get_rank([args.taxid]),
            'Lineage': ncbi.get_taxid_translator(
                ncbi.get_lineage(args.taxid)),
        }
        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script: all taxa within the given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under python3 zip is lazy; under python2 this builds a possibly very
    # large list, so python3 is the suggested interpreter.
    if chatty:
        for taxon_name, taxon_id in zip(descendent_taxa_names,
                                        descendent_taxa):
            print("%s\t%s" % (taxon_name, taxon_id))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for taxon_id in descendent_taxa:
                ofh.write(str(taxon_id) + '\n')
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    Every row of the taxon file whose second column does not contain the
    word "taxon" is looked up in the NCBI taxonomy. Its phylum, class, order,
    family, genus and species are written to the output table together with
    a label made of the first four letters of the phylum and a running
    counter. The topology of all taxon ids is written as ASCII art to the
    tree file.

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file
    """
    ncbi = NCBITaxa()
    rank_columns = ["phylum", "class", "order", "family", "genus", "species"]
    unknown = "no_information"

    taxon_ids = []
    phylum_count = {}

    with open(taxon_output_file, "w") as phylum_file:
        csvwriter = csv.writer(phylum_file, delimiter="\t")
        csvwriter.writerow(["species", "taxid", "phylum_number"] +
                           rank_columns)

        with open(mpwt_taxon_file, "r") as taxon_file:
            for line in csv.reader(taxon_file, delimiter="\t"):
                species_label, taxon_id = line[0], line[1]
                if "taxon" in taxon_id:
                    # Header row.
                    continue
                taxon_ids.append(taxon_id)

                lineage = ncbi.get_lineage(taxon_id)
                lineage2ranks = ncbi.get_rank(lineage)
                names = ncbi.get_taxid_translator(lineage)
                ranks2lineage = {
                    rank: names[taxid]
                    for taxid, rank in lineage2ranks.items()
                }
                ranks = [ranks2lineage.get(rank, unknown)
                         for rank in rank_columns]

                phylum = ranks[0][:4] if ranks[0] != unknown else unknown

                # First sighting starts the counter at 1; later unknown
                # phyla carry no number; known phyla count upwards.
                already_seen = phylum in phylum_count
                if not already_seen:
                    phylum_count[phylum] = 1
                elif phylum == unknown:
                    phylum_count[phylum] = ""
                else:
                    phylum_count[phylum] += 1

                csvwriter.writerow(
                    [species_label, taxon_id,
                     phylum + str(phylum_count[phylum])] + ranks)

    tree = ncbi.get_topology(taxon_ids)
    ascii_tree = tree.get_ascii(attributes=["sci_name", "rank"])
    with open(tree_output_file, "w") as tree_file:
        tree_file.write(ascii_tree)
def get_distance_to_root(self):
    """ Get the distance from the taxon to the root of the NCBI taxonomy tree

    The distance is the number of ancestors of the nearest NCBI taxon plus
    the taxon's distance from that nearest NCBI taxon.

    Returns:
        :obj:`int`: distance from the taxon to the root, or :obj:`None` if
        the taxon has no nearest NCBI taxon
    """
    if self.id_of_nearest_ncbi_taxon is None:
        # The original returned the bare name `id_of_nearest_ncbi_taxon`,
        # which is undefined here and raised NameError.
        return None

    ncbi_taxa = NCBITaxa()
    lineage = ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)
    return len(lineage) - 1 + self.distance_from_nearest_ncbi_taxon
def get_lineage_at_desired_ranks(taxid, desired_ranks):
    'Retrieve lineage information at desired taxonomic ranks'
    # Open the NCBI taxonomy database and fetch the lineage of the taxid.
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)

    # Invert taxid -> rank into rank -> taxid.
    taxid_by_rank = {}
    for member, member_rank in ncbi.get_rank(lineage).items():
        taxid_by_rank[member_rank] = member

    # One taxid per requested rank; 0 stands for "rank not in lineage".
    found = []
    for wanted in desired_ranks:
        member = taxid_by_rank.get(wanted)
        found.append(0 if member is None else member)
    return found
def get_rank_dict(taxa_name=None):
    """Return the names of the main taxonomic ranks for a taxon name.

    If the full name is unknown and consists of several words, the first
    word (possibly a genus name) is tried instead. Progress and failures are
    reported on stderr.

    Args:
        taxa_name: scientific name to look up.

    Returns:
        dict with the keys kingdom, phylum, class, order, family, genus and
        species (``'NA'`` where the lineage has no such rank), or ``None``
        when no name was given or no taxid could be found.
    """
    if not taxa_name:
        # The default None (or an empty name) cannot be translated.
        return None

    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        # try only the first word (which may be a genus name?)
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split()
        if len(words) > 1:
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])
        if not name_dict:
            # taxa_name stays a string here; the original replaced it with
            # the word list and printed the list repr for one-word names.
            print("can not find taxid for %s, maybe it's a misspelling.\n" %
                  taxa_name,
                  file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])
    # One query each for ranks and names instead of two per lineage member.
    ranks = ncbi.get_rank(lineage_taxid_list)
    names = ncbi.get_taxid_translator(lineage_taxid_list)

    rank_dict = {
        rank: 'NA'
        for rank in ('kingdom', 'phylum', 'class', 'order', 'family',
                     'genus', 'species')
    }
    for taxid in lineage_taxid_list:
        rank = ranks.get(taxid)
        if rank in rank_dict and taxid in names:
            rank_dict[rank] = names[taxid]
    return rank_dict
def taxid2lineage(taxid):
    """Map the seven standard ranks to their names in the lineage of ``taxid``.

    Returns:
        dict rank -> scientific name, ordered from superkingdom to species;
        ranks that do not occur in the lineage are left out.
    """
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
             'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)

    # Query the ranks once; the original repeated this call for every rank.
    taxid_by_rank = {rank: member
                     for member, rank in ncbi.get_rank(lineage).items()}

    lineage_dict = dict()
    for rank in ranks:
        if rank in taxid_by_rank:
            lineage_dict[rank] = names[taxid_by_rank[rank]]
    return lineage_dict
def check_ancestor(name: str, tax_id: int, rank: str = None) -> bool:
    """Tell whether the taxon called ``name`` is in the lineage of ``tax_id``.

    Args:
        name: scientific name of the candidate ancestor.
        tax_id: NCBI taxid whose lineage is inspected.
        rank: when given, only lineage members of this rank are considered.

    Returns:
        True if a lineage member (of the requested rank, if any) is one of
        the taxids registered for ``name``.

    Raises:
        ValueError: if ``name`` has no taxonomy id.
    """
    ncbi = NCBITaxa()
    ancestor_ids = ncbi.get_name_translator([name]).get(name, [])
    if not ancestor_ids:
        raise ValueError("No taxonomy id for {}".format(name))

    def has_wanted_rank(member):
        # No rank filter means every lineage member qualifies.
        return rank is None or ncbi.get_rank([member]).get(member, '') == rank

    return any(
        has_wanted_rank(member) and member in ancestor_ids
        for member in ncbi.get_lineage(tax_id))
def get_tax_lineage(taxonid, source):
    """Return taxonomy lineage information

    Results are cached in the module-level ``LINEAGES`` dict. With
    ``source == "taxdump"`` the lineage comes from the local ete3 database;
    any other source queries the NCBI taxonomy database through Biopython's
    Entrez.

    NOTE: the Entrez query is repeated until it returns data. Every failure
    is appended to the ``LOG`` file and there is no retry limit.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid

    Returns
    -------------
    lineage: dict
        Species lineage, rank -> scientific name
    """
    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    if source == "taxdump":
        ncbi_taxdump = NCBITaxa()
        lineage_ids = ncbi_taxdump.get_lineage(taxonid)
        ranks = ncbi_taxdump.get_rank(lineage_ids)
        names = ncbi_taxdump.get_taxid_translator(lineage_ids)
        LINEAGES[taxonid] = {
            ranks[member]: names[member] for member in lineage_ids
        }
        return LINEAGES[taxonid]

    data = ""
    while not data:
        data = ""
        try:
            Entrez.email = "*****@*****.**"
            handle = Entrez.efetch(id = taxonid, db = "taxonomy",
                                   retmode = "xml")
            data = Entrez.read(handle)
            handle.close()
        except Exception:
            with open(LOG, "a") as log:
                print("Error when searching information about {}".format(taxonid),
                      file=log)

    record = data[0]
    lineage = {}
    for ancestor in record["LineageEx"]:
        lineage[ancestor["Rank"]] = ancestor["ScientificName"]
    # The taxon itself is not part of LineageEx.
    lineage[record["Rank"]] = record["ScientificName"]
    LINEAGES[taxonid] = lineage
    return LINEAGES[taxonid]
def assign_rank_representation(self, rank='species'):
    """Assign a representative taxon at ``rank`` to entries that lack one.

    For every row returned by
    ``ProteinDatabase.get_entries_no_representative()`` the lineage of the
    row's taxon (``entry[1]``) is fetched, the lineage member whose rank
    equals ``rank`` is picked, and the entry ``entry[0]`` is updated with that
    taxon's id, rank and name.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source. The position of ``insert = False`` and of
    the warning handling should be confirmed against the original file.
    """
    protDB = db_handling.ProteinDatabase()
    ncbi = NCBITaxa()
    entries_no_representative = protDB.get_entries_no_representative()
    for entry in entries_no_representative:
        taxon_id = entry[1]
        # Record warnings raised by the lineage lookup; the last token of the
        # last warning message is parsed as the taxid to use from here on.
        # NOTE(review): presumably this is ete3's "taxid was translated"
        # warning for merged ids -- confirm.
        with warnings.catch_warnings(record=True) as w:
            warn_msg = None
            warnings.simplefilter("always")
            lineage = ncbi.get_lineage(taxon_id)
            for a in w:
                warn_msg = a.message
            if warn_msg:
                warn_data = str(warn_msg).split()
                taxon_id = int(warn_data[-1])
                protDB.update_protein_entry(
                    {'representative_of_taxon': taxon_id}, entry[0])
        lineage_ranks = ncbi.get_rank(lineage)
        lineage_translation = ncbi.get_taxid_translator(lineage)
        insert = True
        # '' means "no lineage member has the requested rank".
        ellected_rank_id = ''
        for rank_id, lineage_rank in lineage_ranks.items():
            if rank == lineage_rank:
                ellected_rank_id = rank_id
        print(entry[0])
        if lineage_ranks[taxon_id] != rank:
            if not self.bigger_than_rank_taxon(
                    lineage_ranks[taxon_id],
                    rank) and ellected_rank_id != '':
                protDB.update_protein_entry(
                    {
                        'representative_of_taxon':
                        ellected_rank_id,
                        'representative_taxon_rank':
                        rank,
                        'taxon_name_representative':
                        lineage_translation[ellected_rank_id]
                    }, entry[0])
                # Already written above; skip the fallback update below.
                insert = False
        if entry[2] == None and insert and ellected_rank_id != '':
            protDB.update_protein_entry(
                {
                    'representative_of_taxon':
                    ellected_rank_id,
                    'representative_taxon_rank':
                    rank,
                    'taxon_name_representative':
                    lineage_translation[ellected_rank_id]
                }, entry[0])
def taxid_to_lineage_string(taxid):
    """Format the lineage of ``taxid`` as ``<level initial>_<name>`` entries.

    The entries are joined with ``;`` and follow the order kingdom, domain,
    phylum, class, order, family, genus, species. The NCBI rank
    ``superkingdom`` is reported as ``domain``. Levels missing from the
    lineage are left out.
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family',
                 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)

    # Fetch all ranks with one query; the original asked the database once
    # per (level, taxid) pair.
    ranks = ncbi.get_rank(list(names))
    level_of = {}
    for tid in names:
        rank = ranks[tid]
        level_of[tid] = 'domain' if rank == 'superkingdom' else rank

    parts = []
    for level in tax_order:
        for tid in names:
            if level_of[tid] == level:
                parts.append(level[0] + '_' + names[tid])
    return ';'.join(parts)
def taxid_to_lineage(taxid):
    """
    Look up the lineage members of a taxid at the module-level ``desired_ranks``.

    :param taxid: NCBI taxid
    :return: dict desired rank -> taxid of the lineage member with that rank,
        or None when the lineage has no member of that rank
    """
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)

    taxid_by_rank = {}
    for member, member_rank in ncbi.get_rank(lineage).items():
        taxid_by_rank[member_rank] = member

    return {wanted: taxid_by_rank.get(wanted) for wanted in desired_ranks}
def get_taxonomic_group_mapping(group_ids: List[str],
                                selected_rank: str) -> Tuple[Dict, Dict]:
    """
    Function to create a mapping from NCBI-taxon ids to groups which are used to split
    the provided training records into training and validation sets

    :param group_ids: List of identifiers that should be NCBI taxon ids
    :param selected_rank: selected standard rank determining on which level the set is split in
        training and validation-set; matched case-insensitively. Any other
        value is replaced by the result of ``auto_select_rank(group_ids)``
    :return: two dicts keyed by the input taxon ids: the name of the ancestor
        at the selected rank ("unknown" if there is none) and an integer
        group number per distinct ancestor
    """
    ncbi = NCBITaxa()
    standard_ranks = [
        "superkingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]
    if selected_rank.lower() in standard_ranks:
        # NCBI ranks are lower case. Without normalising, a value such as
        # "Genus" passed validation but never matched a lineage rank.
        selected_rank = selected_rank.lower()
    else:
        selected_rank = auto_select_rank(group_ids)

    taxon_ancestor_mapping = {}
    for taxon in set(group_ids):
        lineage = ncbi.get_lineage(int(taxon))
        ids_of_ranks = ncbi.get_rank(lineage)
        # fall-back value if sample does not have an entry on this level
        taxon_ancestor_mapping[taxon] = 0
        for ancestor_id, rank in ids_of_ranks.items():
            if rank == selected_rank:
                taxon_ancestor_mapping[taxon] = ancestor_id

    ancestor_ids = set(taxon_ancestor_mapping.values())
    ancestor_names = ncbi.get_taxid_translator(ancestor_ids)
    ancestor_names[0] = "unknown"
    # Consecutive integer label per distinct ancestor.
    ancestor_enumeration = {
        ancestor_id: number
        for number, ancestor_id in enumerate(ancestor_ids)
    }

    group_name_mapping = {
        taxon: ancestor_names[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    group_id_mapping = {
        taxon: ancestor_enumeration[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    return group_name_mapping, group_id_mapping
def get_ncbi_taxonomy(taxid):
    """Return the ranked part of a taxid's lineage as a ';'-prefixed path.

    Every lineage member whose rank is not "no rank" contributes
    ``";" + name``, so a non-empty result starts with a semicolon.
    """
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    ranks = ncbi.get_rank(lineage)

    ranked_members = [member for member in lineage
                      if ranks[member] != "no rank"]
    return "".join(";" + names[member] for member in ranked_members)
def get_taxonomy(updateBool, spName):
    """Return the lineage taxids of the genus of ``spName``.

    The genus is the part of ``spName`` before the first underscore. The
    first two members of its lineage are dropped from the result. The
    taxonomy database is updated first when ``updateBool`` is ``True``.
    """
    ncbi = NCBITaxa()
    if updateBool is True:
        ncbi.update_taxonomy_database()

    genus = spName.split('_', 1)[0]
    genus_taxids = ncbi.get_name_translator([genus])[genus]
    full_lineage = ncbi.get_lineage(genus_taxids[0])
    return full_lineage[2:]
def main(InputMSA, output):
    """Write the alignment as FASTA with lineage-annotated headers.

    Each header becomes ``>header|<last lineage name>|<lineage names after
    the first, comma separated>``, built from the ``taxid`` parsed out of the
    original header.

    Args:
        InputMSA: alignment read with ``readAlg``.
        output: path of the FASTA file to write (overwritten).
    """
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    headers, seqs = readAlg(InputMSA)
    sys.stdout.write("Annotating headers for %d sequences..." % len(headers))

    # Open the output once. The original reopened it in 'w+' mode, which
    # truncates, so only the last sequence was left in the file.
    with open(output, 'w') as output_fasta:
        for header, seq in zip(headers, seqs):
            head_terms = read_header(header)
            lin = ncbi.get_lineage(head_terms["taxid"])
            lin_name = ncbi.translate_to_names(lin)
            output_fasta.write(">%s|%s|%s\n%s\n" %
                               (head_terms["header"], lin_name[-1],
                                ", ".join(lin_name[1:]), seq))

    sys.stdout.write("Done\n")
def get_leaves_taxid(nodeset):
    """Return the taxids from ``stats.csv`` that descend from any given node.

    Args:
        nodeset: iterable of NCBI taxids of the nodes of interest.

    Returns:
        set of the taxids listed in ``stats.csv`` whose lineage contains at
        least one id of ``nodeset``. Taxids whose lineage lookup raises
        ValueError are reported on stdout and skipped.
    """
    ncbi = NCBITaxa()
    df = pd.read_csv("stats.csv",
                     names=['taxid', 'CDS', 'CDS_Mean', 'exon', 'exon_Mean',
                            'gene', 'gene_Mean', 'mRNA', 'mRNA_Mean'])

    wanted_nodes = set(nodeset)
    gff_set = set()
    if not wanted_nodes:
        return gff_set

    # One lineage lookup per taxid; the original repeated the lookup for
    # every (node, taxid) pair.
    for taxid in df['taxid']:
        try:
            lineage = ncbi.get_lineage(taxid)
        except ValueError:
            print ("error in getting get_lineage()")
            continue
        if not wanted_nodes.isdisjoint(lineage):
            gff_set.add(taxid)
    return gff_set
def fetchinfomap(self):
    """Fill ``self.modelinfomap`` with the lineage of every known model.

    Does nothing when the map already has entries. Otherwise ``info.csv`` is
    fetched from the module-level ``url`` and, for every line with at least
    two comma-separated fields, the first field is mapped to the NCBI
    lineage of the second.
    """
    ncbi = NCBITaxa()
    if len(self.modelinfomap) != 0:
        return

    info = self.fetch_info("{}info.csv".format(url))
    logging.debug("Fetching models from {}".format(url))
    for record in info.split("\n"):
        fields = record.split(",")
        # The lineage is looked up at load time so that it follows changes
        # in the NCBI taxonomy (time consuming).
        if len(fields) > 1:
            self.modelinfomap[fields[0]] = ncbi.get_lineage(fields[1])
def get_tags_leaves(tree, taxid_dict):
    """Tag every leaf of ``tree`` as "dpapi", "bacteria" or "other".

    A leaf is "dpapi" when its name contains "DIPPA" or its taxid is 91374,
    and "bacteria" when taxid 2 is in the lineage of its taxid. Everything
    else is "other", including leaves missing from ``taxid_dict`` (which are
    also reported on stdout).

    Returns:
        dict leaf name -> tag
    """
    ncbi_taxa = NCBITaxa()
    bacteria_taxid = 2
    dpapi_taxid = 91374

    leaf_tags = {}
    for leaf in tree.iter_leaves():
        seqid = leaf.name
        if "DIPPA" in seqid:
            tag = "dpapi"
        elif seqid not in taxid_dict:
            print (seqid, "is not in taxid dict!")
            tag = "other"
        else:
            taxid = int(taxid_dict[seqid])
            if taxid == dpapi_taxid:
                tag = "dpapi"
            elif bacteria_taxid in ncbi_taxa.get_lineage(taxid):
                tag = "bacteria"
            else:
                tag = "other"
        leaf_tags[seqid] = tag
    return leaf_tags
#!/usr/bin/env python
"""Print the NCBI lineage (list of taxids) of every taxid in a file.

Usage: <script> IDs   -- IDs is a text file with one taxid per line.
"""
from ete3 import NCBITaxa
import sys
import os

args = sys.argv

if len(args) < 2:
    print("Usage: %s [IDs]" % args[0])
    sys.exit(1)

ncbi = NCBITaxa()

# The context manager closes the input file. Blank lines are skipped because
# they are not valid taxids.
with open(args[1]) as id_file:
    for raw_line in id_file:
        taxid = raw_line.strip()
        if not taxid:
            continue
        print(ncbi.get_lineage(taxid))
filenum = args.filenum

# filename = 'C:/Users/Andrew.Hwang/Desktop/fastaq2phylo/output/blastout.txt'
# dbType = 'nt'
# filenum = "0"

# Cache of Entrez lookups: sequence id -> lineage string.
memory = {}
# One "<lineage>::<pos>-<pos2>" entry per input line.
writeLines = []

with open(filename, 'r') as f:
    for line in f:
        line_arr = line.split("\t")
        ID = line_arr[1]
        # Columns 3 and 4 as a percentage of column 5.
        pos = int(round(100 * int(line_arr[2]) / int(line_arr[4])))
        pos2 = int(round(100 * int(line_arr[3]) / int(line_arr[4])))
        span = "::" + str(pos) + "-" + str(pos2)

        if dbType == 'nt':
            # The id is handed to the local NCBI taxonomy as a taxid.
            lineage = ncbi.get_lineage(ID)
            names = ncbi.get_taxid_translator(lineage)
            strList = [str(names[taxid]) for taxid in lineage]
            writeLines.append('; '.join(strList) + span)
        elif dbType == 'viruses' or dbType == 'blood':
            # `in` replaces dict.has_key, which exists only in Python 2.
            if ID not in memory:
                handle = Entrez.efetch(db='nucleotide', id=ID, retmode="xml")
                try:
                    records = Entrez.read(handle)
                finally:
                    handle.close()
                lineage = records[0]['GBSeq_taxonomy']
                organism = records[0]['GBSeq_organism']
                # Replace the last taxonomy element by the organism name.
                lineage = "; ".join(lineage.split("; ")[:-1] + [organism])
                memory[ID] = lineage
            writeLines.append(memory[ID] + span)
        else:
            # Column 6 without its last character (the line break).
            writeLines.append(line_arr[5][:-1] + span)
```
# Reads a tab-separated file (path in argv[1]); the last column of each line
# is used as an NCBI taxid. For every rank of that taxid's lineage that is
# listed in `hier`, the input line is printed followed by the rank and the
# taxon name.
if len(sys.argv) == 1:
    sys.exit("USAGE: python %s <path/to/ncbi_gi_taxid_file> > <output.txt>" % sys.argv[0])

ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

# NOTE(review): `fp` and `missing` are not used in the visible part of this
# fragment; presumably the truncated `except` body below writes to them.
fp = open('taxa-ids-not-found.txt', 'w')

# Ranks that are reported.
hier = ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]

missing = []

for x in open(sys.argv[1]):
    dat = x.rstrip().split('\t')[-1]
    try:
        lineage = ncbi.get_lineage(dat)
        names = ncbi.get_taxid_translator(lineage)
        ranks = ncbi.get_rank(lineage)
        # Keep only the lineage members whose rank is listed in `hier`.
        new_ranks = {}
        for keys in ranks:
            if ranks[keys] in hier:
                new_ranks[keys]=ranks[keys]
        d = {}
        # `lineage and new_ranks` evaluates to `new_ranks` whenever `lineage`
        # is non-empty, so this loop walks the keys of `new_ranks`.
        for taxid in lineage and new_ranks:
            d[new_ranks[taxid]] = names[taxid]
        # Output is sorted alphabetically by rank name.
        for key in sorted(d):
            print x.rstrip() + "\t"+ str(key)+"\t"+d[key]
    except ValueError:
#!/usr/bin/python Usage = """ Print taxid's lineage and ranks by default prints to the stdout Usage: taxid_ranks.py taxid > ouput.txt Arun Seetharam [email protected] taxid_ranks.py -version 1.0 04/13/2017 """ from ete3 import NCBITaxa import sys ncbi = NCBITaxa() if len(sys.argv)<2: print Usage else: cmdargs = str(sys.argv) lineage = ncbi.get_lineage((sys.argv[1])) names = ncbi.get_taxid_translator(lineage) for taxid in lineage: print [ncbi.get_rank([taxid])], [names[taxid]] # print [names[taxid] for taxid in lineage] # print [ncbi.get_rank([taxid]) for taxid in lineage] # print [ncbi.get_rank([name]) for name in names]
# Switch for the optional NCBI taxonomy annotation of `geneList`. With the
# value False nothing below runs.
# NOTE(review): the nesting of the `tree` part was reconstructed from a
# whitespace-collapsed source; it is placed inside `if NCBI` because it needs
# `ncbi` and `taxIDlist` -- confirm against the original file.
NCBI = False
if NCBI :
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    taxIDlist=[]
    for gene in geneList:
        # First taxid registered for the gene's organism name.
        name2taxID = ncbi.get_name_translator([gene.organism])
        gene.taxID = name2taxID[gene.organism][0]
        # Store every member of the lineage on the gene object.
        for i in ncbi.get_lineage(gene.taxID):
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)
    #taxid2name = ncbi.get_taxid_translator([9606, 9443])
    #print taxid2name
    # Second switch: print the taxonomy tree of all collected taxids.
    tree = False
    if tree :
        tree = ncbi.get_topology(taxIDlist)
        print tree.get_ascii(attributes=["sci_name", "rank"])
# First pass: collect the taxids (second column) of all DIAMOND hits so that
# their names can be fetched with a single query.
taxids = []
with open(diamond_path) as input_f:
    for line in input_f:
        newtaxid = line.split("\t")[1]
        taxids.append(newtaxid)

taxids_nr = list(set(taxids))
tax_names = ncbi.get_taxid_translator(taxids_nr)

# Second pass: copy id, taxid and e-value, then append the taxon name and a
# 0/1 flag that tells whether taxid 2 is part of the lineage. The flag is
# cached per taxid, so every lineage is looked up once instead of once per
# row. Both files are closed by the context manager, even on errors.
bacteria_flags = {}
with open(diamond_path, "r") as input_f, open(out_path, 'w') as output_f:
    for line in input_f:
        line_split = line.rstrip().split("\t")
        query_id = line_split[0]
        taxid = line_split[1]
        evalue = line_split[2]
        if taxid == "0":
            # Taxid 0: no name and no lineage to look up.
            name = "None"
            is_bacteria = 0
        else:
            name = tax_names[int(taxid)]
            if taxid not in bacteria_flags:
                bacteria_flags[taxid] = (
                    1 if 2 in ncbi.get_lineage(taxid) else 0)
            is_bacteria = bacteria_flags[taxid]
        output_f.write('{}\t{}\t{}\t{}\t{}\n'.format(query_id, taxid, evalue,
                                                     name, is_bacteria))
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if not args.no_annot and not pexists(EGGNOGDB_FILE): print colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB): print colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error('--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))') if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
def run(args):
    """Resolve the requested taxa and print an NCBI taxonomy report.

    The search terms in ``args.search`` are read as taxids when they parse as
    integers and as taxon names otherwise. Names are translated to taxids
    (optionally with fuzzy matching, controlled by ``args.fuzzy``). One report
    is then produced, chosen in this order of precedence:

    * ``args.tree`` (default when no mode is selected): dump a tree annotated
      with taxid, scientific name and named lineage;
    * ``args.descendants``: print one row per taxid listing its descendants;
    * ``args.info``: print one row per translated taxid with its lineage.

    Args:
        args: parsed options; reads ``tree``, ``info``, ``descendants``,
            ``search``, ``fuzzy``, ``collapse_subspecies``, ``rank_limit`` and
            ``full_lineage``. ``args.tree`` is set to True when no report mode
            was selected.

    Raises:
        SystemExit: with status -1 when no search term is given.
    """
    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    # Validate the cheap precondition before opening the taxonomy database.
    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    # A dict with None values is used as an insertion-ordered set of taxids.
    all_taxids = {}
    all_names = set()
    for term in args.search:
        try:
            all_taxids[int(term)] = None
        except ValueError:
            all_names.add(term.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hits in name2tax.values():
        # ete3 maps every name to a *list* of taxids; a bare taxid is also
        # tolerated. Using the list itself as a key would raise TypeError.
        if not isinstance(hits, (list, tuple, set)):
            hits = [hits]
        for taxid in hits:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax)
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        fuzzy_resolved = set()
        for name in not_found_names:
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                fuzzy_resolved.add(name)
                log.info("'%s' matched '%s' (taxid %s, Fuzzy:%0.2f)",
                         name, realname, tax, sim)
        # Names recovered by fuzzy matching are no longer "not found".
        not_found_names = not_found_names - fuzzy_resolved

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict views are not iterators, so wrap in iter() before next().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Node names hold the taxid at this point; keep it as a feature
            # before the name is replaced by a human-readable label.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            # str() guards against entries that are not strings.
            n.add_features(named_lineage='|'.join(map(str, ncbi.translate_to_names(lineage))))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print('\t'.join([str(taxid),
                             # Falls back to the (integer) taxid when unnamed.
                             str(translator.get(taxid, taxid)),
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))
    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(map(str, ncbi.translate_to_names(lineage)))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))