def search_regulator(args):
    """Validate the search arguments and resolve the required taxon.

    Checks ``args.min_abundance``, creates the ``findmitoscaf`` working
    directory (inside ``args.temp_dir`` when that attribute exists, otherwise
    inside the current working directory) and looks ``args.required_taxa`` up
    in the NCBI taxonomy.

    Side effects:
        Sets ``args.findmitoscaf_dir`` and, when the taxon name is known,
        ``args.taxa_ids`` (the taxid list returned for that name).

    Returns:
        bool: ``False`` when the taxon name is unknown or when any earlier
        check failed; ``True`` otherwise.
    """
    valid = True
    if args.min_abundance <= 0:
        print("Input minimum abundance is not valid.")
        valid = False

    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    base_dir = args.temp_dir if hasattr(args, 'temp_dir') else os.getcwd()
    args.findmitoscaf_dir = os.path.join(base_dir, 'findmitoscaf')

    try:
        safe_makedirs(args.findmitoscaf_dir)
    except FileExistsError:
        valid = False
        print('Diretory exist before validating, please check and remove it to prevent data loss.')
    except Exception:
        valid = False
        print('Error occured when validating the directories, please check your permissions or things could be related.')

    # Query the taxonomy once and reuse the answer for both the membership
    # test and the taxid extraction.
    name_to_taxids = ncbi.get_name_translator([args.required_taxa])
    if args.required_taxa not in name_to_taxids:
        print("Specified taxanomy name not in NCBI taxanomy database.")
        return False

    args.taxa_ids = name_to_taxids[args.required_taxa]
    return valid
def get_rank_dict(taxa_name=None):
    """Map the seven main taxonomic ranks to the names in a taxon's lineage.

    The name is looked up in the NCBI taxonomy. When it is unknown and
    consists of several words, the first word alone is tried instead (it may
    be a genus name).

    Args:
        taxa_name: scientific name to look up.

    Returns:
        dict | None: a dict with the keys ``kingdom``, ``phylum``, ``class``,
        ``order``, ``family``, ``genus`` and ``species``; ranks absent from
        the lineage are ``'NA'``. ``None`` when no taxid could be found.
    """
    ncbi = NCBITaxa()
    main_ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                  'species')

    name_dict = ncbi.get_name_translator([taxa_name]) if taxa_name else {}
    if not name_dict:
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split() if taxa_name else []
        if len(words) > 1:
            # try only the first word (which may be a genus name)
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])
        if not name_dict:
            print("can not find taxid for %s, maybe it's a misspelling.\n"
                  % taxa_name, file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])
    # One batched query per table instead of two queries per lineage member.
    taxid_to_rank = ncbi.get_rank(lineage_taxid_list)
    taxid_to_name = ncbi.get_taxid_translator(lineage_taxid_list)

    rank_dict = dict.fromkeys(main_ranks, 'NA')
    for taxid in lineage_taxid_list:
        rank = taxid_to_rank.get(taxid)
        if rank in rank_dict:
            rank_dict[rank] = taxid_to_name[taxid]
    return rank_dict
def read_old_kjV2(filename, artifact_threshold=0):
    """Read a kaiju output file into a ``{taxid: read count}`` dictionary.

    The first line and the last three lines of the file are ignored. Only
    rows with exactly five tab-separated fields are used: field 3 is the
    read count and field 5 is a ``;``-separated lineage whose last character
    is dropped before splitting. The lineage is scanned from its last label
    backwards until a label known to the NCBI taxonomy is found.

    Args:
        filename (str): location of the kaiju output file.
        artifact_threshold (int): rows whose count is not strictly greater
            than this value are skipped.

    Returns:
        dict: NCBI taxonomy id (first id reported for the resolved label)
        mapped to the number of reads. Rows whose lineage contains no label
        known to the taxonomy are skipped.
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    with open(filename, 'r') as readFile:
        all_lines = readFile.readlines()

    readsDict = {}
    for line in all_lines[1:-3]:
        tokens = line.rstrip().split('\t')
        if len(tokens) != 5:
            continue
        count = int(tokens[2])
        if count <= artifact_threshold:
            # Filtered rows need no taxonomy lookup at all.
            continue
        taxa = [t.strip() for t in tokens[4][:-1].split(';')]
        # Walk from the most specific label towards the root.
        for otu_name in reversed(taxa):
            taxa_id = ncbi.get_name_translator([otu_name]).get(otu_name)
            if taxa_id:
                readsDict[taxa_id[0]] = count
                break
    return readsDict
def get_ncbi_taxonomy_species_tree(names_list):
    """Return the NCBI taxonomy topology connecting the given names.

    Each name is translated to its taxids and the first taxid of every name
    is passed to ``NCBITaxa.get_topology``. A name missing from the
    translation raises ``KeyError``.
    """
    taxonomy = NCBITaxa()
    taxids_by_name = taxonomy.get_name_translator(names_list)
    first_taxids = [taxids_by_name[name][0] for name in names_list]
    return taxonomy.get_topology(first_taxids)
def get_species_by_rank(species_list, group, rank):
    """Group the species of ``species_list`` by their ancestor of a given rank.

    Args:
        species_list: collection of species taxids given as strings.
        group: scientific name of the taxon whose descendants are examined.
        rank: rank name (as stored on the tree nodes) that defines a group.

    Returns:
        list: one ``Group`` per node of the requested rank that has at least
        one leaf whose taxid (as a string) is in ``species_list``. Groups are
        in level-order of the descendant tree.
    """
    ncbi = NCBITaxa()

    # convert group name to id
    group_id = ncbi.get_name_translator([group])[group][0]

    # get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id, collapse_subspecies=True,
                                    return_tree=True)

    # Build the lookup set once; the membership test runs for every leaf of
    # every matching node.
    wanted_species = set(species_list)

    groups = []
    for node in tree.traverse("levelorder"):
        if node.rank != rank:
            continue
        leaf_ids = (str(leaf.taxid) for leaf in node.get_leaves())
        species_inlist = [s for s in leaf_ids if s in wanted_species]
        if species_inlist:
            groups.append(Group(taxid=node.taxid, species=species_inlist))
    return groups
def get_tax(cma_file):
    """Find the lowest common ancestor of the species named in a CMA file.

    Every line starting with ``>`` is searched for an organism name in
    square brackets. Names known to the NCBI taxonomy contribute their first
    taxid, except taxids 32630 and 10239 (synthetic constructs and viruses).

    Args:
        cma_file: path of the CMA file to read.

    Returns:
        tuple: ``(lca_name, tax_list, tree_labeled)`` where ``lca_name`` is
        the scientific name of the root of the topology built from the
        collected taxids, ``tax_list`` is the taxid-to-name dict for those
        taxids and ``tree_labeled`` is the ASCII rendering of the topology
        annotated with ``sci_name`` and ``taxid``.
    """
    ncbi = NCBITaxa()
    org_pattern = re.compile(r'\[(.*)\]')
    # omit sequences from viruses and synthetic constructs
    excluded_taxids = {32630, 10239}

    taxid_list = []
    with open(cma_file, 'r') as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            found = org_pattern.search(line)
            if found is None:
                continue
            org_name = found.group(1)
            # Read the taxid from the returned dict directly instead of
            # parsing its string representation.
            taxids = ncbi.get_name_translator([org_name]).get(org_name)
            if taxids and taxids[0] not in excluded_taxids:
                taxid_list.append(taxids[0])

    tax_list = ncbi.get_taxid_translator(taxid_list)
    tree = ncbi.get_topology(taxid_list)
    tree_labeled = tree.get_ascii(attributes=['sci_name', 'taxid'])

    # The root node of the topology is named after the ancestor's taxid.
    lca_id = tree.get_tree_root().name
    lca_names = ncbi.get_taxid_translator([lca_id])
    # '{}' is what the previous string-based extraction produced when the
    # lookup came back empty; kept so callers see the same sentinel.
    lca_name = lca_names.get(int(lca_id), '{}')
    return (lca_name, tax_list, tree_labeled)
def get_species_by_rank(group, rank):
    """Collect the leaf taxids under every node of ``rank`` inside ``group``.

    Args:
        group: scientific name of the taxon whose descendants are examined.
        rank: rank name (as stored on the tree nodes) to collect.

    Returns:
        dict: taxid of each node with the requested rank mapped to the list
        of taxids of its leaves. The number of such nodes is also printed.
    """
    ncbi = NCBITaxa()
    group_id = ncbi.get_name_translator([group])[group][0]

    # get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id, collapse_subspecies=True,
                                    return_tree=True)

    dic_ids = {
        node.taxid: [leaf.taxid for leaf in node.get_leaves()]
        for node in tree.traverse("levelorder")
        if node.rank == rank
    }

    # Single-argument call form prints the same text on Python 2 and 3.
    print("# of {rank}: {num}".format(rank=rank, num=len(dic_ids)))
    return dic_ids
def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate the internal nodes of a time tree with NCBI taxonomy names.

    Leaves are matched to NCBI taxids by name (underscores read as spaces);
    leaves without a match are logged and removed from the tree. Ancestors
    are then placed either with ``NCBITaxa.annotate_tree`` (``ete3_algo``)
    or with ``myannotate``, and ``matchrename_ncbitax`` is applied.

    Output goes to stdout: the newick tree by default, or one
    ``oldname<TAB>sci_name`` line per internal node when ``to_table`` is set.
    """
    logger.info('Loading data')
    # NOTE: quoted_node_names is only supported from ete3 v3.1.1
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    species_names = [leaf_name.replace('_', ' ')
                     for leaf_name in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(species_names)

    for leaf in timetree.get_leaves():
        try:
            taxid = name2taxid[leaf.name.replace('_', ' ')][0]
        except KeyError:
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True, preserve_branch_length=True)
        else:
            leaf.add_feature('taxid', taxid)

    logger.info('Placing common ancestors')
    if ete3_algo:
        ncbi.annotate_tree(timetree, 'taxid')
    else:
        myannotate(timetree, ncbi)

    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
def main(tree_path):
    """Rename the leaves of a tree to NCBI taxids and prune it to a species list.

    For every leaf the name (underscores read as spaces) is translated to a
    taxid. Unknown names fall back to the ``byhand`` table or are labelled
    ``"not found"``; names with several taxids are labelled with the whole
    taxid list. A label already used by an earlier leaf gets a ``_B`` suffix.

    Side effects:
        Writes ``names_to_ids.csv`` (name, id, Magnoliophyta membership),
        reads the species list from a fixed path, prunes the tree to it and
        writes ``tree_ids.tree``.

    Args:
        tree_path: newick file readable with ``Tree(..., format=1)``.
    """
    ncbi = NCBITaxa()
    tree = Tree(tree_path, format=1)

    names = []
    ids = []
    in_magnoliophyta = []
    seen_labels = set()  # O(1) duplicate detection alongside the ordered list

    for leaf in tree:
        name = leaf.name.replace("_", ' ')
        taxids = ncbi.get_name_translator([name]).get(name)

        if not taxids:
            if name in byhand:
                label = byhand[name]
                magno = "yes"
            else:
                label = "not found"
                magno = ""
        elif len(taxids) > 1:
            # Ambiguity is a property of the taxid list, not of the
            # one-entry translation dict.
            label = str(taxids)
            magno = ""
            print("two ids:  %s" % name)
        else:
            label = str(taxids[0])
            lineage = ncbi.get_lineage(taxids[0])
            # 3398 - magnoliophyta id
            magno = "yes" if 3398 in lineage else "no"

        if label != "not found" and label in seen_labels:
            print("duplicate:  %s %s" % (name, label))
            label += "_B"

        leaf.name = label
        names.append(name)
        ids.append(label)
        seen_labels.add(label)
        in_magnoliophyta.append(magno)

    df = pd.DataFrame({
        'name': names,
        'id': ids,
        'in magnoliophyta': in_magnoliophyta
    })
    df.to_csv('names_to_ids.csv')

    p = "/groups/itay_mayrose/nomihadar/trees/magnoliophyta_tree/sequences_filtered_zanne/species/intersect_mytree_zannetree_mangoete3.ls"
    with open(p, 'r') as f:
        lines = f.read().splitlines()

    species = [x for x in lines if x not in ['58454', '142615', '77013']]
    tree.prune(list(set(species)), preserve_branch_length=True)
    tree.write(outfile="tree_ids.tree")
def get_taxonomy(species_name, name_format="Genus species", ranks=None,
                 update_db=False):
    """Return taxonomy names for a species from the NCBI taxonomy.

    Args:
        species_name: species name; converted with ``str``.
        name_format: ``"Genus species"`` (used as is) or ``"Genus_species"``
            (underscores are replaced by spaces before the lookup).
        ranks: optional iterable of rank names to extract.
        update_db: when true, refresh the local taxonomy database first.

    Returns:
        * ``['unknown']`` (or one ``'unknown'`` per requested rank) when the
          species name is not found;
        * the full taxid-to-name dict of the lineage when ``ranks`` is None;
        * otherwise a list with the name of every lineage member whose rank
          matches, in the order of ``ranks``. A requested rank that is absent
          from the lineage contributes no entry, so the list may be shorter
          than ``ranks``.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()

    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        if ranks is None:
            return ['unknown']
        return ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    # ``names`` already holds every lineage member, so no per-rank query is
    # needed to resolve a taxid.
    return [
        names[rk_id]
        for rk in ranks
        for rk_id, rk_rk in lineage_rk.items()
        if rk_rk == rk
    ]
def get_metadata(records: List[SeqRecord]):
    """Build a taxonomy table for the given records and save it as CSV.

    The ``organism`` annotation of each record is translated to its first
    NCBI taxid; the names of the lineage members whose rank is one of
    superkingdom, order, family, subfamily, genus or species become columns
    keyed by rank. The record id is stored in the ``aid`` column.

    Side effects:
        Writes the table to ``metadata.csv``.

    Returns:
        pandas.DataFrame: one row per record.
    """
    taxonomy_db = NCBITaxa()
    organisms = [record.annotations["organism"] for record in records]
    taxids_by_name = taxonomy_db.get_name_translator(organisms)

    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    rows = []
    for record in records:
        first_taxid = taxids_by_name[record.annotations["organism"]][0]
        lineage = taxonomy_db.get_lineage(first_taxid)
        rank_of = taxonomy_db.get_rank(lineage)
        name_of = taxonomy_db.get_taxid_translator(lineage)

        row = {}
        for taxid in lineage:
            if rank_of[taxid] in sought_ranks:
                row[rank_of[taxid]] = name_of[taxid]
        row["aid"] = record.id
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv("metadata.csv")
    return df
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield one taxonomic-lineage row per contig of the annotation table.

    Takes the annotation table generated by viral_contig_maps.py and, for
    each distinct value of its ``Contig`` column, yields a list that starts
    with the contig id followed by one entry per rank in the order genus,
    subfamily, family, order.

    Args:
        annot_df: table with (at least) the columns ``Contig``, ``Best_hit``
            and ``Label``; ``Label`` values are looked up as NCBI taxon names.
        ncbi_db: path passed to ``NCBITaxa(dbfile=...)``.
        min_prot: minimum number of hits carrying a given rank for that rank
            to be considered.
        prop_annot: minimum proportion of rows whose ``Best_hit`` is not
            ``"No hit"`` for the contig to be processed at all.
        tax_thres: minimum proportion of hits the most frequent taxon must
            reach to be accepted.

    Yields:
        list: ``[contig, ...]``. Per rank the entry is ``""`` when too few
        hits carry that rank, the (float) proportion of the best taxon when
        it stays below ``tax_thres``, or a taxon name. Once a rank is
        accepted, that rank and all remaining ranks are filled from the
        lineage of the accepted taxon (``""`` where the lineage lacks the
        rank).
    """
    ncbi = NCBITaxa(dbfile=ncbi_db)
    # Ranks are tried from the most specific to the most general.
    tax_rank_order = ["genus", "subfamily", "family", "order"]
    contig_set = set(annot_df["Contig"])
    for contig in contig_set:
        contig_lineage = [contig]
        contig_df = annot_df[annot_df["Contig"] == contig]
        total_prot = len(contig_df)
        annot_prot = sum(contig_df["Best_hit"] != "No hit")
        if annot_prot < prop_annot * total_prot:
            # Too few annotated rows: leave all four rank columns empty.
            contig_lineage.extend([""] * 4)
        else:
            contig_hits = contig_df[pd.notnull(
                contig_df["Label"])]["Label"].values
            # First taxid reported for each label.
            taxid_list = [
                ncbi.get_name_translator([item])[item][0]
                for item in contig_hits
            ]
            # Per hit: {rank: taxid} restricted to the ranks of interest.
            hit_lineages = [{
                y: x
                for x, y in ncbi.get_rank(ncbi.get_lineage(item)).items()
                if y in tax_rank_order
            } for item in taxid_list]
            for rank in tax_rank_order:
                taxon_list = [item.get(rank) for item in hit_lineages]
                total_hits = sum(pd.notnull(taxon_list))
                if total_hits < min_prot:
                    contig_lineage.append("")
                    continue
                else:
                    count_hits = Counter(
                        [item for item in taxon_list if pd.notnull(item)])
                    # Most frequent taxid at this rank and its count.
                    best_hit = sorted(
                        [(x, y) for x, y in count_hits.items()],
                        key=lambda x: x[1],
                        reverse=True,
                    )[0]
                    prop_hits = best_hit[1] / total_hits
                    if prop_hits < tax_thres:
                        # Below threshold: the proportion itself is recorded
                        # in place of a name.
                        contig_lineage.append(prop_hits)
                        continue
                    else:
                        best_lineage = ncbi.get_lineage(best_hit[0])
                        # Fill this rank and every remaining rank from the
                        # accepted taxon's lineage, then stop.
                        contig_lineage.extend([
                            ncbi.get_taxid_translator([key])[key]
                            if pd.notnull(key) else "" for key in [{
                                y: x
                                for x, y in ncbi.get_rank(
                                    best_lineage).items()
                            }.get(item
                                  ) for item in tax_rank_order[tax_rank_order.
                                                               index(rank):]]
                        ])
                        break
        yield contig_lineage
def main():
    """Query the NCBI taxonomy for a taxon and list its descendant taxa.

    Command-line arguments come from ``get_args()``. The taxon is given as a
    TaxID or as a name (converted to its first TaxID). With verbosity above
    zero the taxon's name, TaxID, rank and lineage are shown through
    ``pretty`` and every descendant is printed as ``name<TAB>taxid``. When an
    output file is given, the descendant TaxIDs are written to it, one per
    line.
    """
    args = get_args()

    # The ete NCBI taxa object backs every query below.
    ncbi = NCBITaxa()
    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    if args.update is True:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # A name on the command line takes the place of the TaxID.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        if args.name:
            taxon_name = args.name
        else:
            # No name given: translate the TaxID back to its name.
            taxon_name = ncbi.get_taxid_translator([args.taxid])[args.taxid]
        tax_dict = {
            'Name': taxon_name,
            'TaxID': args.taxid,
            'Rank': ncbi.get_rank([args.taxid]),
            'Lineage': ncbi.get_taxid_translator(ncbi.get_lineage(args.taxid)),
        }
        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script: every taxon inside the given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # zip is lazy on Python 3; on Python 2 it builds the full list, which can
    # be large, so Python 3 is the suggested interpreter.
    if args.verbose > 0:
        for taxon_name, taxon_id in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (taxon_name, taxon_id))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            ofh.writelines(str(taxon_id) + '\n' for taxon_id in descendent_taxa)
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """Collect assembly ids and summary rows for the given taxon names.

    Args:
        phylum_name: one or more taxon names separated by ``;``
            (e.g. ``"Nitrospirae;"``); empty parts are ignored.
        dataset: prefix of the assembly summary files that are read,
            ``{dataset}_{domain}_assembly_summary.txt`` inside
            ``metadata_files_dir``.
        return_d2ids: when true, stop after the taxonomy step and return the
            descendant taxids per domain.

    Returns:
        * ``return_d2ids`` true: dict mapping each superkingdom name to the
          descendant taxids (intermediate nodes included) found under it;
        * otherwise a tuple ``(domain2aids, collect_info)`` where
          ``domain2aids`` maps the lower-cased domain name to the first
          column of every matching summary row and ``collect_info`` is the
          list of those raw rows. A row matches when it starts with ``GC``
          and its sixth column (as int) is one of the descendant taxids.

    Names that are not found in the taxonomy are reported and skipped.
    """
    phylum_names = [part for part in phylum_name.split(';') if part]
    ncbi = NCBITaxa()
    p2tid = ncbi.get_name_translator(phylum_names)

    for name in phylum_names:
        if not p2tid.get(name):
            print(f" '{name}'' not found. please check the name")
    tids = [p2tid[name][0] for name in phylum_names if p2tid.get(name)]

    domain2dids = defaultdict(list)
    descend_ids = []
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        # Invert taxid -> rank so the superkingdom taxid can be looked up.
        rank2tid = {rank: taxid
                    for taxid, rank in ncbi.get_rank(lineages).items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[rank2tid['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)

    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # Context manager closes the summary file even if a row fails to parse.
        with open(metadata) as summary:
            for row in tqdm(summary):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
def check_ancestor(name: str, tax_id: int, rank: str = None) -> bool:
    """Tell whether the taxon called ``name`` is in the lineage of ``tax_id``.

    Args:
        name: scientific name of the candidate ancestor.
        tax_id: taxid whose lineage is inspected.
        rank: when given, the matching lineage member must also have this
            rank.

    Returns:
        bool: ``True`` when one of the taxids of ``name`` occurs in the
        lineage (and has the requested rank, if any).

    Raises:
        ValueError: when ``name`` has no taxid in the taxonomy.
    """
    ncbi = NCBITaxa()
    ancestor_ids = ncbi.get_name_translator([name]).get(name, [])
    if not ancestor_ids:
        raise ValueError("No taxonomy id for {}".format(name))

    # ``get_lineage`` can return None; treat that as an empty lineage.
    lineage = ncbi.get_lineage(tax_id) or []
    matches = sorted(set(ancestor_ids).intersection(lineage))
    if not matches:
        return False
    if rank is None:
        return True

    # Only the matching taxids need a rank, fetched in a single query.
    ranks = ncbi.get_rank(matches)
    return any(ranks.get(anc_id, '') == rank for anc_id in matches)
def phylo_tree_2_ete_tree(tree, names_list):
    """Convert a Bio.Phylo tree to an ete ``Tree`` labelled with NCBI taxids.

    The tree is serialised to newick, every name of ``names_list`` is
    replaced by its first NCBI taxid, single quotes are removed and the
    result is parsed with ``Tree(..., format=1)``.

    Args:
        tree: tree accepted by ``Phylo.write``.
        names_list: names occurring in the tree; each must be known to the
            NCBI taxonomy (``KeyError`` otherwise).
    """
    ncbi = NCBITaxa()
    handle = StringIO()
    Phylo.write(tree, handle, 'newick')
    newick_tree = handle.getvalue()

    name2taxid_dic = ncbi.get_name_translator(names_list)
    # Replace longer names first: a name that is a prefix of another one
    # (e.g. a species and its subspecies) would otherwise corrupt the longer
    # label before it gets its own replacement.
    for name in sorted(names_list, key=len, reverse=True):
        newick_tree = newick_tree.replace(name, str(name2taxid_dic[name][0]))

    newick_tree = newick_tree.replace('\'', '')
    return Tree(newick_tree, format=1)
def get_group_tree(group, rank):
    """Return the annotated descendant tree of the taxon called ``group``.

    The first taxid of ``group`` is used and subspecies are collapsed.
    ``rank`` is accepted but not used by this function.
    """
    taxonomy = NCBITaxa()
    group_id = taxonomy.get_name_translator([group])[group][0]
    # get an annotated tree
    return taxonomy.get_descendant_taxa(group_id,
                                        collapse_subspecies=True,
                                        return_tree=True)
def get_ncbi_taxa_rank(taxa_name):
    """Return ``(rank, taxid)`` for a taxon name.

    The last taxid reported for the name is used. Both values are ``"N/A"``
    when the name is not found in the NCBI taxonomy.
    """
    taxonomy = NCBITaxa()
    matches = taxonomy.get_name_translator([taxa_name])
    if not matches:
        return ("N/A", "N/A")

    last_taxid = matches[taxa_name][-1]
    rank_by_taxid = taxonomy.get_rank([last_taxid])
    return (rank_by_taxid[last_taxid], last_taxid)
def NamesToTaxIDs(names):
    """Translate taxon names to NCBI taxids using the ete3 database file.

    The database at ``ETE3DBTAXAFILE`` is initialised first when it does not
    exist. If the file is still missing after ``NCBITaxa`` has been created,
    an error is logged and an empty dict is returned; otherwise the result of
    ``get_name_translator(names)`` is returned.
    """
    db_missing = not isETE3DBTAXAFILEexists(ETE3DBTAXAFILE)
    if db_missing:
        logging.info("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database".format(ETE3DBTAXAFILE))
        initETE3Database(ETE3DBTAXAFILE)

    ncbi = NCBITaxa(dbfile=ETE3DBTAXAFILE)

    if isETE3DBTAXAFILEexists(ETE3DBTAXAFILE):
        return ncbi.get_name_translator(names)

    logging.error(
        "Tried ete3 init, but still was not able to find taxa.sqlite file for ete3 lib in {}. Aborting".format(
            ETE3DBTAXAFILE))
    return {}
def get_taxonomy(updateBool, spName):
    """Return the lineage taxids of the genus of ``spName``.

    The genus is the part of ``spName`` before the first underscore. The
    first two entries of the lineage are dropped from the result. The local
    taxonomy database is refreshed first when ``updateBool`` is ``True``.
    """
    taxonomy = NCBITaxa()
    if updateBool is True:
        taxonomy.update_taxonomy_database()

    # keep only the genus name
    genus, _sep, _rest = spName.partition('_')
    genus_taxids = taxonomy.get_name_translator([genus])[genus]
    full_lineage = taxonomy.get_lineage(genus_taxids[0])
    return full_lineage[2:]
def _first_taxid_for_name(ncbi, name):
    """Return the first taxid NCBI reports for ``name``, or None if unknown."""
    taxids = ncbi.get_name_translator([name]).get(name)
    return taxids[0] if taxids else None


def get_taxonid_to_readfilenames(name_ftpdirpaths_filename):
    """Map NCBI taxids to the genome read files listed in a text file.

    Each line holds a species name followed by a space and an FTP directory
    path. The read file name is derived from the last path component as
    ``./Bacteria_Genomes/<component>_genomic.fna.gz``.

    Args:
        name_ftpdirpaths_filename: path of the ``name ftpdirpath`` file.

    Returns:
        defaultdict(list): taxid mapped to its read file names; one taxid may
        map to several files. Names unknown to the taxonomy are skipped.
    """
    ncbi = NCBITaxa()
    # a given taxonid may map to multiple readfiles, hence list values
    taxonid_to_readfilenames = defaultdict(list)

    with open(name_ftpdirpaths_filename) as listing:
        for line in listing:
            splits = line.strip().split(' ')
            name = ' '.join(splits[:-1])  # everything before the last space
            ftpdirpath = splits[-1]  # everything after the last space
            readfilename = ('./Bacteria_Genomes/' + ftpdirpath.split('/')[-1]
                            + '_genomic.fna.gz')

            taxonid = _first_taxid_for_name(ncbi, name)
            if taxonid is None and name == 'Donghicola sp. JLT3646':
                # this name in assembly_summary.txt has been updated in NCBI
                print('Changing', name, 'to Marivivens sp. JLT3646')
                name = 'Marivivens sp. JLT3646'
                taxonid = _first_taxid_for_name(ncbi, name)

            if taxonid is not None:
                taxonid_to_readfilenames[taxonid].append(readfilename)
            elif name in ('Mycobacterium intracellulare MOTT-64',
                          'Marivivens sp. JLT3646'):
                # known names that may be absent from the NCBI database
                print(name, 'is not in the NCBI database')

    return taxonid_to_readfilenames
def run_ete_ncbiquery_py(query):
    """Render the NCBI topology of a comma-separated list of taxa as ASCII.

    Args:
        query: comma-separated taxids and/or scientific names. Surrounding
            whitespace is ignored and empty entries are skipped.

    Returns:
        str: ASCII tree annotated with ``sci_name`` and ``rank``.

    Raises:
        KeyError: when a name is not found in the NCBI taxonomy.
    """
    ncbi = NCBITaxa()
    final_query = []
    for token in query.split(','):
        token = token.strip()
        if not token:
            continue
        try:
            final_query.append(int(token))
        except ValueError:
            # Not a number: treat it as a name and add all of its taxids.
            final_query += ncbi.get_name_translator([token])[token]

    tree = ncbi.get_topology(final_query)
    return tree.get_ascii(attributes=["sci_name", "rank"])
def filter_species(species_ids, group):
    """Keep the species whose lineage contains the taxon called ``group``.

    Args:
        species_ids: iterable of taxids.
        group: scientific name of the taxon to filter by (its first taxid is
            used).

    Returns:
        list: the ids of ``species_ids`` that descend from ``group``, in
        input order. Each id and its lineage are also printed.
    """
    ncbi = NCBITaxa()
    group_id = ncbi.get_name_translator([group])[group][0]

    filtered_ids = []
    for species_id in species_ids:
        lineage = ncbi.get_lineage(species_id)
        # Single formatted argument prints the same on Python 2 and 3.
        print("%s %s" % (species_id, lineage))
        if group_id in lineage:
            filtered_ids.append(species_id)
    return filtered_ids
def get_species_by_rank(input_tree, group, rank):
    """Group the descendants of ``group`` by their ancestor of a given rank.

    Args:
        input_tree: iterable of leaves whose ``name`` is a taxid string.
        group: scientific name of the taxon whose descendants are examined.
        rank: rank name (as stored on the tree nodes) that defines a group.

    Returns:
        dict: node taxid mapped to a ``Group`` holding all descendant taxids
        of that node (as strings; union of the results with and without
        subspecies collapsing) and the subset present in ``input_tree``.
    """
    # Leaf names of the input tree, as a set for constant-time membership.
    tree_taxa = {leaf.name for leaf in input_tree}

    ncbi = NCBITaxa()

    # convert group name to id
    group_id = ncbi.get_name_translator([group])[group][0]

    # get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id, collapse_subspecies=True,
                                    return_tree=True)

    groups = {}
    for node in tree.traverse("levelorder"):
        if node.rank != rank:
            continue

        uncollapsed = ncbi.get_descendant_taxa(node.taxid,
                                               collapse_subspecies=False,
                                               return_tree=False)
        collapsed = ncbi.get_descendant_taxa(node.taxid,
                                             collapse_subspecies=True,
                                             return_tree=False)
        species = [str(taxid)
                   for taxid in set(uncollapsed).union(collapsed)]

        # remove species not found in the input tree
        species_intree = [s for s in species if s in tree_taxa]

        groups[node.taxid] = Group(taxid=node.taxid,
                                   species=species,
                                   species_intree=species_intree)
    return groups
def AddmyID(modelIDList, ID, filepath):
    """Append a taxon to the model id list and dump the resulting tree.

    Args:
        modelIDList: list of taxids; modified in place.
        ID: a taxid given as a digit string, or a scientific name that is
            translated to its first NCBI taxid.
        filepath: accepted but not used by this function.

    Side effects:
        Writes the ASCII rendering (``sci_name`` and ``rank``) of
        ``model2Tree(modelIDList)`` to ``outTree.tmp``.

    Returns:
        list: ``modelIDList`` with the new taxid appended.
    """
    ncbi = NCBITaxa()
    if ID.isdigit():
        modelIDList.append(int(ID))
    else:
        # The translator expects a list of names; a bare string would be
        # iterated character by character.
        name2taxID = ncbi.get_name_translator([ID])
        modelIDList.append(int(name2taxID[ID][0]))

    tree = model2Tree(modelIDList)

    outfile = "outTree.tmp"
    with open(outfile, "w") as out:
        out.write(tree.get_ascii(attributes=["sci_name", "rank"]))
    return modelIDList
def __init__(self, id='', name='', ncbi_id=None, cross_references=None):
    """
    Args:
        id (:obj:`str`, optional): identifier
        name (:obj:`str`, optional): name
        ncbi_id (:obj:`int`, optional): NCBI identifier
        cross_references (:obj:`list` of :obj:`CrossReference`, optional): list of cross references

    Raises:
        :obj:`ValueError`: if `ncbi_id` is given and translating it to a
            name returns the id itself
    """
    self.id = id
    self.name = name
    self.id_of_nearest_ncbi_taxon = None
    self.distance_from_nearest_ncbi_taxon = None
    self.additional_name_beyond_nearest_ncbi_taxon = None
    self.cross_references = cross_references or []

    ncbi_taxa = NCBITaxa()

    if ncbi_id:
        # An explicit NCBI id is the nearest taxon itself.
        self.id_of_nearest_ncbi_taxon = ncbi_id
        self.distance_from_nearest_ncbi_taxon = 0
        self.additional_name_beyond_nearest_ncbi_taxon = ''
        self.name = ncbi_taxa.translate_to_names([ncbi_id])[0]
        if self.name == ncbi_id:
            raise ValueError(
                'The NCBI taxonomy database does not contain a taxon with id {}'
                .format(ncbi_id))
        return

    # Search by name: drop trailing words one at a time until a prefix of
    # the name is known to the taxonomy.
    rank_names = name.split(' ')
    n_total = len(rank_names)
    for n_kept in range(n_total, 0, -1):
        partial_name = ' '.join(rank_names[:n_kept])
        result = ncbi_taxa.get_name_translator([partial_name])
        if not result:
            continue

        nearest_id = result[partial_name][0]
        extra_name = ''.join(' ' + word for word in rank_names[n_kept:])
        self.id_of_nearest_ncbi_taxon = nearest_id
        self.distance_from_nearest_ncbi_taxon = n_total - n_kept
        self.additional_name_beyond_nearest_ncbi_taxon = extra_name
        self.name = ncbi_taxa.translate_to_names([nearest_id])[0] + extra_name
        return

    # No prefix matched: keep the name exactly as given.
    self.name = name
def get_taxonomic_lineage(base_species):
    """ Get the lineage of a species

    Args:
        base_species (:obj:`str`): a species name (e.g. escherichia coli)

    Returns:
        :obj:`list` of :obj:`str`: names of the lineage members, in the
        reverse of the order returned by ``NCBITaxa.get_lineage``
    """
    ncbi = NCBITaxa()
    species_taxid = ncbi.get_name_translator([base_species])[base_species][0]
    lineage = ncbi.get_lineage(species_taxid)
    names = ncbi.get_taxid_translator(lineage)
    return [names[taxid] for taxid in reversed(lineage)]
def findclade(namelist, ranks='family|genus'):
    """Yield, for each name, the clade names it belongs to at given ranks.

    Args:
        namelist: scientific names; every name must be known to the NCBI
            taxonomy (``KeyError`` otherwise).
        ranks: ``|``-separated rank names.

    Yields:
        tuple: ``(name, clades)`` where ``clades`` lists one clade name per
        requested rank, ``''`` when the lineage has no member of that rank.
    """
    ncbi = NCBITaxa()
    name2taxid = ncbi.get_name_translator(namelist)
    first_taxids = [taxids[0] for taxids in name2taxid.values()]
    lineages = ncbi.get_lineage_translator(first_taxids)
    wanted_ranks = ranks.split('|')

    cladetaxids = []
    for name in namelist:
        lineage = lineages[name2taxid[name][0]]
        rank2clade = {}
        for taxid, rk in ncbi.get_rank(lineage).items():
            rank2clade[rk] = taxid
        # 0 stands for "no member of this rank in the lineage".
        cladetaxids.append([rank2clade.get(rk, 0) for rk in wanted_ranks])

    taxid2clade = ncbi.get_taxid_translator(chain(*cladetaxids))

    for name, taxidlist in zip(namelist, cladetaxids):
        clade_names = [taxid2clade.get(taxid, '') for taxid in taxidlist]
        yield name, clade_names
def normalize_target_taxa(target_taxa):
    """
    Receives a list of taxa IDs and/or taxa names and returns a set of
    expanded taxid numbers: the descendant taxa (subspecies not collapsed)
    of every input taxon.
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    expanded_taxa = set()
    for taxon in target_taxa:
        try:
            taxid = int(taxon)
        except ValueError:
            # Not numeric: resolve the name to its first taxid.
            taxid = ncbi.get_name_translator([taxon])[taxon][0]
        else:
            # Numeric input: the lookup result is not used further, but it
            # raises KeyError when the taxid is absent from the translation.
            _known_name = ncbi.get_taxid_translator([taxid])[taxid]

        descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=False)
        expanded_taxa.update(descendants)

    return expanded_taxa
def load_taxonomy_tree(otu_list):
    """Build a taxonomy tree for a list of OTU names from NCBI lineages.

    Args:
        otu_list: iterable of taxon names.

    Returns:
        tuple: ``(tree, notfound)`` where ``tree`` is the
        ``TreeNode.from_taxonomy`` tree built from the lineage names of every
        resolved OTU (all branch lengths set to 1) and ``notfound`` is the
        set of OTUs that are not tips of that tree.

    Exits the program with status 1 when a name has more than one, or no,
    NCBI taxonomy match in the translation result.
    """
    # Uses the local copy of the NCBI taxonomy.
    # TODO: updating the database should maybe be done as a separate rule
    ncbi = NCBITaxa()
    sp2taxid = ncbi.get_name_translator(otu_list)

    # The ETE docs warn about missing or ambiguous matches, so check both.
    for sp, taxids in sp2taxid.items():
        if len(taxids) > 1:
            print("More than two NCBI taxonomy matches for:\n", sp, taxids)
            sys.exit(1)
        if len(taxids) < 1:
            print("No NCBI taxonomy match for:\n", sp, taxids)
            sys.exit(1)

    lineages = {}
    for sp, taxids in sp2taxid.items():
        lineage = ncbi.get_lineage(taxids[0])
        names = ncbi.get_taxid_translator(lineage)
        lineages[sp] = [names[taxid] for taxid in lineage]

    tree = TreeNode.from_taxonomy(lineages.items())

    # Branches are required to have a length
    # TODO: Length should be parametrized
    for node in tree.postorder():
        node.length = 1

    # Lineages for species that are not found are silently not reported.
    # They break the unifraq and have to be discarded from the dataframe.
    # TODO: The notfound list should be reported!
    tips = {tip.name for tip in tree.tips()}
    notfound = {otu for otu in otu_list if otu not in tips}
    return tree, notfound
def create_taxonomic_data_ete(species_name):
    """
    Query the ete taxonomy with the species name to create a dictionary
    containing the taxon id and some descriptive fields. Useful when no
    internet connection is available and the NCBITaxa database has already
    been downloaded.

    Args:
        species_name (str): species name (must be with genus for example "Escherichia coli")

    Returns:
        species_informations (dict): keys ``description``, ``organism``,
        ``keywords`` and ``db_xref``; ``None`` (after logging critical
        messages) when the name is not found in the database.
    """
    # '/' in the name is replaced by '_' for the descriptive fields only;
    # the lookup below uses the name as given.
    compatible_species_name = species_name.replace('/', '_')
    species_informations = {
        'description': compatible_species_name + ' genome',
        'organism': compatible_species_name,
        'keywords': [compatible_species_name],
    }

    ncbi = NCBITaxa()
    species_taxids = ncbi.get_name_translator([species_name])

    if species_name not in species_taxids:
        logger.critical(
            '/!\\ Error with {} this taxa has not been found in ete3 NCBITaxa Database'
            .format(species_name))
        logger.critical(
            '/!\\ Check the name of the taxa and its presence in the NCBITaxa database.'
        )
        logger.critical(
            '/!\\ No genbank will be created for {}.'.format(species_name))
        return None

    # The last taxid reported for the name is the one recorded.
    last_taxid = species_taxids[species_name][-1]
    species_informations['db_xref'] = 'taxon:' + str(last_taxid)
    return species_informations
def model_organisms(inputfile):
    """Load a list of model organisms and return their taxids.

    The file holds one entry per line, either all taxids or all scientific
    names; the first entry decides which. Empty lines are ignored.

    Args:
        inputfile: path of the list file.

    Returns:
        list: for a name list, the first NCBI taxid of every name; for an id
        list, the ids exactly as read (strings). Empty list for an empty
        file.
    """
    ncbi = NCBITaxa()

    with open(inputfile, "r") as infile:
        # Strip only the line terminator so a final line without a newline
        # keeps its last character.
        stripped = (line.rstrip('\r\n') for line in infile)
        modelList = [entry for entry in stripped if entry]

    if not modelList:
        return []

    if modelList[0].isdigit():
        print("List of model IDs Loaded")
        return modelList

    print("List of model names Loaded")
    name2taxID = ncbi.get_name_translator(modelList)
    return [name2taxID[model][0] for model in modelList]
import sys

options, args = parser.parse_args()

if options.database == "yes":
    # Updating is best effort: a failure must not stop the run, but it is
    # reported, and interrupts are no longer swallowed.
    try:
        ncbi.update_taxonomy_database()
    except Exception as update_error:
        sys.stderr.write(
            "Warning: could not update the NCBI taxonomy database: %s\n"
            % update_error)

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

# One species per line; underscores stand for spaces, blank lines are skipped.
with open(options.input_species_filename) as f:
    species_name = [line.strip().replace('_', ' ')
                    for line in f if line.strip()]

name2taxid = ncbi.get_name_translator(species_name)
taxid = [name2taxid[name][0] for name in species_name]
tree = ncbi.get_topology(taxid)

# Map each taxid (as a string, like the leaf names) back to its species name.
if options.treebest == "yes":
    inv_map = {str(v[0]): k.replace(" ", "") + "*"
               for k, v in name2taxid.items()}
else:
    inv_map = {str(v[0]): k for k, v in name2taxid.items()}

for leaf in tree:
    leaf.name = inv_map[leaf.name]

newickTree = tree.write(format=int(options.format))
# begin iterating through the file and getting GenBank records while 1: # get a SeqFeature object for the next GenBank record. When we run # out of records in the file, cur_entry will be None cur_entry = iterator.next() if cur_entry is None: break nid = cur_entry.id organism = cur_entry.annotations['organism'] # name exception.. try: taxid = ncbi.get_name_translator([organism])[organism][0] except KeyError: if organism in CorrectDict: taxid = ncbi.get_name_translator([CorrectDict[organism]])[CorrectDict[organism]][0] else: print cur_entry.annotations try: correct_organism = raw_input("name for %s\n" % organism) taxid = ncbi.get_name_translator([correct_organism])[correct_organism][0] except KeyError: taxid = int(raw_input("taxid")) #print "Printing cDNA info for %s" % nid # loop through all of the features for the entry for feature in cur_entry.features: # when we've got CDS features, parse the info out of them if feature.type == "CDS":
# Optional NCBI annotation step, switched off by default. When enabled, each
# gene gets the first taxid of its organism and the taxids of its lineage.
NCBI = False
if NCBI:
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    taxIDlist = []
    for gene in geneList:
        name2taxID = ncbi.get_name_translator([gene.organism])
        gene.taxID = name2taxID[gene.organism][0]
        for i in ncbi.get_lineage(gene.taxID):
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)

    # Second switch: print the topology of the collected taxids.
    tree = False
    if tree:
        tree = ncbi.get_topology(taxIDlist)
        print(tree.get_ascii(attributes=["sci_name", "rank"]))
def run(args):
    """Run an NCBI taxonomy query and print the result.

    ``args.search`` holds taxids and/or names. Names are translated to
    taxids (optionally with fuzzy matching when ``args.fuzzy`` is set).
    One of three outputs is produced:

    * ``args.tree`` (the default when no mode is selected): dump the
      descendant tree of a single taxid, or the topology of several;
    * ``args.descendants``: one tab-separated line per taxid with its
      descendant taxids and names;
    * ``args.info``: one tab-separated line per known taxid with its named
      and numeric lineage.

    Exits with status -1 when no search term is given.
    """
    # add lineage profiles/stats
    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}  # used as an insertion-ordered set of taxids
    all_names = set()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names; each name maps to a LIST of taxids, so register the
    # individual taxids (a list cannot be a dict key).
    name2tax = ncbi.get_name_translator(all_names)
    for taxids in name2tax.values():
        for taxid in taxids:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax.keys())

    if args.fuzzy and not_found_names:
        log.warning("%s unknown names", len(not_found_names))
        # Iterate over a copy: resolved names are removed from the set.
        for name in list(not_found_names):
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = [tax]
                not_found_names.discard(name)

    if not_found_names:
        log.warning("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])

    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print('\t'.join([str(taxid),
                             # the fallback is the numeric taxid, so force str
                             str(translator.get(taxid, taxid)),
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))