def __init__(self, filename=None, verbose=True, online=True, source="ncbi"): """.. rubric:: constructor :param offline: if you do not have internet, the connction to Ensembl may hang for a while and fail. If so, set **offline** to True :param from: download taxonomy databases from ncbi """ assert source in ['ncbi', 'ena'] self.source = source if online: from bioservices import Ensembl, EUtils self.ensembl = Ensembl(verbose=False) self.records = {} # empty to start with. self.verbose = verbose if filename is None: self._dbname = "taxonomy.dat" self.database = sequana_config_path + os.sep + self._dbname else: self.database = filename self._custom_db = sequana_config_path self._custom_db += "/taxonomy/taxonomy_custom.dat"
def __init__(self, verbose=True, online=True): """.. rubric:: constructor :param offline: if you do not have internet, the connction to Ensembl may hang for a while and fail. If so, set **offline** to True """ if online: from bioservices import Ensembl self.ensembl = Ensembl(verbose=False) self.records = {} # empty to start with. self.verbose = verbose
def get_ensembl_metadata_online(ensembl_ids): ensembl_ids = list(set(ensembl_ids)) print('get_ensembl_metadata', len(ensembl_ids)) BATCH_SIZE = 1000 ens = Ensembl() ensembl_lookup = {} cumulative_total = 0 for x in batch(ensembl_ids, BATCH_SIZE): batch_ids = [i for i in x] cumulative_total += len(batch_ids) print(cumulative_total, '/', len(ensembl_ids)) lookup = ens.post_lookup_by_id(identifiers=batch_ids) ensembl_lookup.update(lookup) return ensembl_lookup
def ensembl(): return Ensembl(verbose=False)
def get_single_ensembl_metadata_online(ensembl_id): ens = Ensembl() res = ens.get_lookup_by_id(ensembl_id, expand=True) return res
def setup_class(klass): klass.s = Ensembl(verbose=False)
class Taxon(object): """Utility to search for information related to a taxon Uses HGNC service to fetch information about a taxon. :: >>> from bioservices.apps.taxonomy import Taxon >>> t = Taxon() >>> t.search_by_taxon("9606") {'Scientific Name': 'H**o sapiens', 'taxon': '9606'} You can also pop up the Uniprot page using:: t.uniprot_onweb("9606") A full list of taxons is available here:: http://www.uniprot.org/taxonomy/?query=*&format=* .. versionadded:: 1.2.0 """ def __init__(self): super(Taxon, self).__init__() # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"]) self._eutils_service = EUtils() self._ensembl_service = Ensembl() # there is a search by name, easier to use than EUtils def search_by_name(self, name): """using ensembl, tries to get the taxon identifier from the given name :: >>> s.search_by_name('mouse') 10090 """ res = self._ensembl_service.get_taxonomy_name("mouse")[0] try: return res["id"] except: return res def search_by_taxon(self, taxon): """ should be a string without comma (only one entry accepted") """ assert isinstance(taxon, str) assert "," not in taxon ret = self._eutils_service.taxonomy(taxon) if ret == "\n": # nothing found pass else: res = {"taxon": taxon, "Scientific Name": ret.Taxon[0].ScientificName} # self.df.append(res) return res def info(self, taxon, lineage=False): """Prints information about a Taxon :param str taxon: taxon identifier :param bool lineage: prints lineage is set to True """ ret = self._eutils_service.taxonomy(taxon) print("Display Name: %s" % ret.Taxon[0].OtherNames.Name.DispName) print("GenBank Common name: %s" % ret.Taxon[0].OtherNames.GenbankCommonName) print("Taxon Id: %s " % ret.Taxon[0].TaxId) if lineage: print("Lineage:") for i, x in enumerate(ret.Taxon[0].Lineage.split(";")): print(i * " " + x) def uniprot_onweb(self, taxon): """Open Uniprot taxonomy page for a given taxon :param str taxon: taxon identifier """ import webbrowser try: from urllib.request import urlopen from urllib.error import HTTPError, URLError except: from urllib2 import urlopen, HTTPError, URLError try: urlopen("http://www.uniprot.org/taxonomy/%s" % taxon) webbrowser.open("http://www.uniprot.org/taxonomy/%s" % taxon) except HTTPError as err: print("Invalid taxon") except URLError as err: print(err.args)
def __init__(self): super(Taxon, self).__init__() # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"]) self._eutils_service = EUtils() self._ensembl_service = Ensembl() # there is a search by name, easier to use than EUtils
class Taxonomy(object): """This class should ease the retrieval and manipulation of Taxons There are many resources to retrieve information about a Taxon. For instance, from BioServices, one can use UniProt, Ensembl, or EUtils. This is convenient to retrieve a Taxon (see :meth:`fetch_by_name` and :meth:`fetch_by_id` that rely on Ensembl). However, you can also download a flat file from EBI ftp server, which stores a set or records (1.3M at the time of the implementation). Note that the Ensembl database does not seem to be as up to date as the flat files but entries contain more information. for instance taxon 2 is in the flat file but not available through the :meth:`fetch_by_id`, which uses ensembl. So, you may access to a taxon in 2 different ways getting differnt dictionary. However, 3 keys are common (id, parent, scientific_name) :: >>> t = taxonomy.Taxonomy() >>> t.fetch_by_id(9606) # Get a dictionary from Ensembl >>> t.records[9606] # or just try with the get >>> t[9606] >>> t.get_lineage(9606) """ def __init__(self, verbose=True, online=True): """.. rubric:: constructor :param offline: if you do not have internet, the connction to Ensembl may hang for a while and fail. If so, set **offline** to True """ if online: from bioservices import Ensembl self.ensembl = Ensembl(verbose=False) self.records = {} # empty to start with. self.verbose = verbose def _load_flat_file(self, overwrite=False): """Loads entire flat file from EBI Do not overwrite the file by default. """ import ftplib output_filename = 'taxonomy.dat' self.name = output_filename self.filename = biokitPATH + os.sep + self.name if os.path.exists(self.filename) and overwrite is False: return url = 'ftp.ebi.ac.uk' # /pub/databases/taxonomy/' self.ftp = ftplib.FTP(url) self.ftp.login() self.ftp.cwd('pub') self.ftp.cwd('databases') self.ftp.cwd('taxonomy') print('Downloading and saving in %s' % self.filename) self.ftp.retrbinary('RETR taxonomy.dat', open(self.filename, 'wb').write) def load_records(self, overwrite=False): """Load a flat file and store records in :attr:`records` """ self._load_flat_file(overwrite=overwrite) self.records = {} # TODO: check if it exists otherwise, load it ? if os.path.exists(self.filename) is False: self.load() with open(self.filename) as f: data = f.read().strip() data = data.split("//\n") # the sep is //\n self._child_match = re.compile('ID\s+\:\s*(\d+)\s*') self._parent_match = re.compile('PARENT ID\s+\:\s*(\d+)\s*') self._rank_match = re.compile('RANK\s+\:\s*([^\n]+)\s*') self._name_match = re.compile('SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*') from easydev import Progress pb = Progress(len(data)) if self.verbose: print('Loading all taxon records.') for i, record in enumerate(data[0:]): # try/except increase comput. time by 5% try: dico = self._interpret_record(record) identifier = int(dico['id']) self.records[identifier] = dico except Exception as err: print(err) print('Could not parse the following record ' + \ 'Please fill bug report on http://github.com/biokit') print(record) if self.verbose: pb.animate(i + 1) if self.verbose: print() def _interpret_record(self, record): data = {'raw': record} # in principle, we should check for the existence of a match # but this takes time. All entries must have an ID so no # need to check for it. Same for parent and scientific name. # Does not save that much time though. m = self._child_match.search(record) if m: data['id'] = m.group(1) m = self._parent_match.search(record) if m: data['parent'] = m.group(1) m = self._name_match.search(record) if m: data['scientific_name'] = m.group(1) m = self._rank_match.search(record) if m: data['rank'] = m.group(1) return data def fetch_by_id(self, taxon): """Search for a taxon by identifier :return; a dictionary. :: >>> ret = s.search_by_id('10090') >>> ret['name'] 'Mus Musculus' """ res = self.ensembl.get_taxonomy_by_id(taxon) return res def fetch_by_name(self, name): """Search a taxon by its name. :param str name: name of an organism. SQL cards possible e.g., _ and % characters. :return: a list of possible matches. Each item being a dictionary. :: >>> ret = s.search_by_name('Mus Musculus') >>> ret[0]['id'] 10090 """ res = self.ensembl.get_taxonomy_by_name(name) return res def on_web(self, taxon): """Open UniProt page for a given taxon""" # Should work for python2 and 3 import webbrowser try: from urllib.request import urlopen from urllib.error import HTTPError, URLError except: from urllib2 import urlopen, HTTPError, URLError try: urlopen('http://www.uniprot.org/taxonomy/%s' % taxon) webbrowser.open("http://www.uniprot.org/taxonomy/%s" % taxon) except HTTPError as err: print("Invalid taxon") except URLError as err: print(err.args) def get_lineage(self, taxon): """Get lineage of a taxon :param int taxon: a known taxon :return: list containing the lineage """ # important to reinit the second argument to [] taxon = int(taxon) lineage = self._gen_lineage_and_rank(taxon, []) lineage = [x[0] for x in lineage] return lineage def _gen_lineage_and_rank(self, taxon, lineage_rank=[]): # recursively filling the lineage argument if len(self.records) == 0: self.load_records() try: record = self.records[taxon] except: return [('unknown', 'no rank')] parent = int(record['parent']) if parent not in [0]: lineage_rank.append((record['scientific_name'], record['rank'])) return self._gen_lineage_and_rank(parent, lineage_rank) else: lineage_rank.reverse() return lineage_rank def get_lineage_and_rank(self, taxon): """Get lineage and rank of a taxon :param int taxon: :return: a list of tuples. Each tuple is a pair of taxon name/rank The list is the lineage for to the input taxon. """ taxon = int(taxon) lineage = self._gen_lineage_and_rank(taxon, []) return lineage def get_children(self, taxon): if len(self.records) == 0: self.load_records() taxon = str(taxon) children = [ self.records[k] for k in self.records.keys() if self.records[k]['parent'] == taxon ] children = [child['id'] for child in children] return children def get_family_tree(self, taxon): """root is taxon and we return the corresponding tree""" # should limit the tree size # uniprot flat files has no record about children, so we would # need to reconstruct the tree tree = {} children = self.get_children(taxon) if len(children) == 0: return tree else: return [self.get_family_tree(child) for child in children] def __getitem__(self, iden): if len(self.records) == 0: self.load_records() return self.records[iden]
class Taxonomy(object): """This class should ease the retrieval and manipulation of Taxons There are many resources to retrieve information about a Taxon. For instance, from BioServices, one can use UniProt, Ensembl, or EUtils. This is convenient to retrieve a Taxon (see :meth:`fetch_by_name` and :meth:`fetch_by_id` that rely on Ensembl). However, you can also download a flat file from EBI ftp server, which stores a set or records (2.8M (april 2020). Note that the Ensembl database does not seem to be as up to date as the flat files but entries contain more information. for instance taxon 2 is in the flat file but not available through the :meth:`fetch_by_id`, which uses ensembl. So, you may access to a taxon in 2 different ways getting differnt dictionary. However, 3 keys are common (id, parent, scientific_name) :: >>> t = taxonomy.Taxonomy() >>> t.fetch_by_id(9606) # Get a dictionary from Ensembl >>> t.records[9606] # or just try with the get >>> t[9606] >>> t.get_lineage(9606) """ def __init__(self, filename=None, verbose=True, online=True, source="ncbi"): """.. rubric:: constructor :param offline: if you do not have internet, the connction to Ensembl may hang for a while and fail. If so, set **offline** to True :param from: download taxonomy databases from ncbi """ assert source in ['ncbi', 'ena'] self.source = source if online: from bioservices import Ensembl, EUtils self.ensembl = Ensembl(verbose=False) self.records = {} # empty to start with. self.verbose = verbose if filename is None: self._dbname = "taxonomy.dat" self.database = sequana_config_path + os.sep + self._dbname else: self.database = filename self._custom_db = sequana_config_path self._custom_db += "/taxonomy/taxonomy_custom.dat" def _update_custom_taxonomy_bases(self, taxid): """ """ taxid = str(taxid) self.eutils = EUtils(verbose=False) res = self.eutils.taxonomy_summary(taxid) if "error" in res[taxid]: print("not found in NCBI (EUtils)") else: print("found in NCBI (EUtils) and added to local databases") with open(self.custom_db, "w") as fout: data = res[taxid] fout.write("ID : {}\n".format(taxid)) #fout.write("PARENT ID : {}\n".format(taxid)) fout.write("RANK : {}\n".format(data['rank'])) #fout.write("GC ID : {}\n".format(data[''])) fout.write("SCIENTIFIC NAME : {}\n".format( data['scientificname'])) def download_taxonomic_file(self, overwrite=False): """Loads entire flat file from EBI Do not overwrite the file by default. """ import ftplib from sequana import sequana_config_path if os.path.exists(self.database) and overwrite is False: logger.info( "Found taxonomy.dat file in sequana your path {}".format( sequana_config_path)) return else: logger.info( "Downloading and extracting the taxonomy file from the web. Please be patient." ) if self.source == "ena": url = 'ftp.ebi.ac.uk' else: url = 'ftp.ncbi.nlm.nih.gov' self.ftp = ftplib.FTP(url) self.ftp.login() if self.source == "ena": # for the EBI ftp only: self.ftp.cwd('databases') self.ftp.cwd('pub') self.ftp.cwd('databases') self.ftp.cwd('taxonomy') logger.warning( 'Downloading and saving in %s. This is from ebi and may be behind the NCBI taxonomy' % self.database) self.ftp.retrbinary('RETR taxonomy.dat', open(self.database, 'wb').write) ftp.close() else: self.ftp.cwd('pub') self.ftp.cwd('taxonomy') logger.warning('Downloading and saving in %s from ncbi ftp' % self.database) import tempfile import shutil with tempfile.TemporaryDirectory() as tmpdir: filename = tmpdir + os.sep + "taxdump.tar.gz" self.ftp.retrbinary('RETR taxdump.tar.gz', open(filename, "wb").write) import tarfile tf = tarfile.open(filename) assert "nodes.dmp" in tf.getnames() assert "names.dmp" in tf.getnames() tf.extract("nodes.dmp", tmpdir) tf.extract("names.dmp", tmpdir) ncbi = NCBITaxonomy(tmpdir + os.sep + "names.dmp", tmpdir + os.sep + "nodes.dmp") ncbi.create_taxonomy_file(tmpdir + os.sep + "taxonomy.dat") shutil.move(tmpdir + os.sep + "taxonomy.dat", self.database) self.ftp.close() def load_records(self, overwrite=False): """Load a flat file and store records in :attr:`records` Since version 0.8.3 we use NCBI that is updated more often than the ebi ftp according to their README. ftp://ncbi.nlm.nih.gov/pub/taxonomy/ """ self.download_taxonomic_file(overwrite=overwrite) self.records = {} # TODO: check if it exists otherwise, load it ? if os.path.exists(self.database) is False: self.load() with open(self.database) as f: data = f.read().strip() # This is fast. tried parse package, much slower. cost of progress bar # is not important. data = data.split("//\n") # the sep is //\n self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*') self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*') self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*') self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*') from easydev import Progress pb = Progress(len(data)) logger.info('Loading all taxon records.') for i, record in enumerate(data[0:]): dd = {'raw': record} dd['id'] = int(self._child_match.search(record).group(1)) dd['parent'] = int(self._parent_match.search(record).group(1)) dd['scientific_name'] = self._name_match.search(record).group(1) dd['rank'] = self._rank_match.search(record).group(1) self.records[dd["id"]] = dd if self.verbose: pb.animate(i + 1) if self.verbose: print() def find_taxon(self, taxid, mode="ncbi"): taxid = str(taxid) if mode == "ncbi": from bioservices import EUtils self.eutils = EUtils(verbose=False) res = self.eutils.taxonomy_summary(taxid) else: res = self.ensembl.get_taxonomy_by_id(taxid) return res """if "error" in res[taxid]: print("not found in NCBI (EUtils)") else: data = res[taxid] fout.write("ID : {}\n".format(taxid)) #fout.write("PARENT ID : {}\n".format(taxid)) fout.write("RANK : {}\n".format(data['rank'])) #fout.write("GC ID : {}\n".format(data[''])) fout.write("SCIENTIFIC NAME : {}\n".format(data['scientificname'])) """ @load_taxons def fetch_by_id(self, taxon): """Search for a taxon by identifier :return; a dictionary. :: >>> ret = s.search_by_id('10090') >>> ret['name'] 'Mus Musculus' """ res = self.ensembl.get_taxonomy_by_id(taxon) return res @load_taxons def fetch_by_name(self, name): """Search a taxon by its name. :param str name: name of an organism. SQL cards possible e.g., _ and % characters. :return: a list of possible matches. Each item being a dictionary. :: >>> ret = s.search_by_name('Mus Musculus') >>> ret[0]['id'] 10090 """ res = self.ensembl.get_taxonomy_by_name(name) return res def on_web(self, taxon): """Open UniProt page for a given taxon""" # Should work for python2 and 3 import webbrowser try: from urllib.request import urlopen from urllib.error import HTTPError, URLError except: from urllib2 import urlopen, HTTPError, URLError try: urlopen('http://www.uniprot.org/taxonomy/%s' % taxon) webbrowser.open("http://www.uniprot.org/taxonomy/%s" % taxon) except HTTPError as err: print("Invalid taxon") except URLError as err: print(err.args) @load_taxons def get_lineage(self, taxon): """Get lineage of a taxon :param int taxon: a known taxon :return: list containing the lineage """ # important to reinit the second argument to [] taxon = int(taxon) lineage = self._gen_lineage_and_rank(taxon, []) lineage = [x[0] for x in lineage] return lineage @load_taxons def _gen_lineage_and_rank(self, taxon, lineage_rank=[]): # recursively filling the lineage argument try: record = self.records[taxon] except: return [('unknown_taxon:{}'.format(taxon), 'no rank')] parent = int(record['parent']) if taxon == 1: lineage_rank.append((record['scientific_name'], record['rank'])) lineage_rank.reverse() return lineage_rank else: lineage_rank.append((record['scientific_name'], record['rank'])) return self._gen_lineage_and_rank(parent, lineage_rank) @load_taxons def get_parent_taxon(self, taxon): return self.records[taxon]['parent'] @load_taxons def get_parent_name(self, taxon): taxid = self.get_parent_taxon(taxon) return self.records[taxid]['scientific_name'] @load_taxons def get_lineage_and_rank(self, taxon): """Get lineage and rank of a taxon :param int taxon: :return: a list of tuples. Each tuple is a pair of taxon name/rank The list is the lineage for to the input taxon. """ taxon = int(taxon) lineage = self._gen_lineage_and_rank(taxon, []) return lineage @load_taxons def get_ranks(self): return Counter([x['rank'] for x in self.records.values()]) @load_taxons def get_record_for_given_rank(self, rank): return [x for x in self.records.values() if x['rank'] == rank] @load_taxons def get_names_for_given_rank(self, rank): data = [x for x in self.records.values() if x['rank'] == rank] return [x['scientific_name'] for x in data] @load_taxons def get_children(self, taxon): taxon = str(taxon) children = [ self.records[k] for k in self.records.keys() if self.records[k]['parent'] == taxon ] children = [child['id'] for child in children] return children @load_taxons def get_family_tree(self, taxon): """root is taxon and we return the corresponding tree""" # should limit the tree size # uniprot flat files has no record about children, so we would # need to reconstruct the tree tree = {} children = self.get_children(taxon) if len(children) == 0: return tree else: return [self.get_family_tree(child) for child in children] @load_taxons def __getitem__(self, iden): return self.records[iden] @load_taxons def __getitem__(self, iden): return len(self.records) def append_existing_database(self, filename): """ Taxonomy DB looks like:: ID : 2731450 PARENT ID : 1914233 RANK : genus SCIENTIFIC NAME : Limnoglobus // a = NCBITaxonomy("names.dmp", "nodes.dmp") a.create_taxonomy_file("taxonomy.dat") tax = Taxonomy() tax.append_existing_database("taxonomy.dat") """ tax = Taxonomy(filename) tax.load_records() self.load_records() toadd = [] for record in tax.records.keys(): if record not in self.records: toadd.append(record) with open(self.database, "a") as fout: for record in toadd: fout.write(tax.records[record]['raw'] + "//\n")
class Taxonomy(object): """This class should ease the retrieval and manipulation of Taxons There are many resources to retrieve information about a Taxon. For instance, from BioServices, one can use UniProt, Ensembl, or EUtils. This is convenient to retrieve a Taxon (see :meth:`fetch_by_name` and :meth:`fetch_by_id` that rely on Ensembl). However, you can also download a flat file from EBI ftp server, which stores a set or records (1.3M at the time of the implementation). Note that the Ensembl database does not seem to be as up to date as the flat files but entries contain more information. for instance taxon 2 is in the flat file but not available through the :meth:`fetch_by_id`, which uses ensembl. So, you may access to a taxon in 2 different ways getting differnt dictionary. However, 3 keys are common (id, parent, scientific_name) :: >>> t = taxonomy.Taxonomy() >>> t.fetch_by_id(9606) # Get a dictionary from Ensembl >>> t.records[9606] # or just try with the get >>> t[9606] >>> t.get_lineage(9606) """ def __init__(self, verbose=True): from bioservices import Ensembl self.ensembl = Ensembl(verbose=False) self.records = {} # empty to start with. self.verbose = verbose def _load_flat_file(self, overwrite=False): """Loads entire flat file from EBI Do not overwrite the file by default. """ import ftplib output_filename='taxonomy.dat' self.name = output_filename self.filename = biokitPATH + os.sep + self.name if os.path.exists(self.filename) and overwrite is False: return url = 'ftp.ebi.ac.uk' # /pub/databases/taxonomy/' self.ftp = ftplib.FTP(url) self.ftp.login() self.ftp.cwd('pub') self.ftp.cwd('databases') self.ftp.cwd('taxonomy') print('Downloading and saving in %s' % self.filename) self.ftp.retrbinary('RETR taxonomy.dat', open(self.filename, 'wb').write) def load_records(self, overwrite=False): """Load a flat file and store records in :attr:`records` """ self._load_flat_file(overwrite=overwrite) self.records = {} # TODO: check if it exists otherwise, load it ? if os.path.exists(self.filename) is False: self.load() with open(self.filename) as f: data = f.read().strip() data = data.split("//\n") # the sep is //\n self._child_match = re.compile('ID\s+\:\s*(\d+)\s*') self._parent_match = re.compile('PARENT ID\s+\:\s*(\d+)\s*') self._rank_match = re.compile('RANK\s+\:\s*([^\n]+)\s*') self._name_match = re.compile('SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*') from easydev import Progress pb = Progress(len(data)) if self.verbose: print('Loading all taxon records.') for i, record in enumerate(data[0:]): # try/except increase comput. time by 5% try: dico = self._interpret_record(record) identifier = int(dico['id']) self.records[identifier] = dico except Exception as err: print(err) print('Could not parse the following record ' + \ 'Please fill bug report on http://github.com/biokit') print(record) if self.verbose: pb.animate(i+1) def _interpret_record(self, record): data = {'raw': record} # in principle, we should check for the existence of a match # but this takes time. All entries must have an ID so no # need to check for it. Same for parent and scientific name. # Does not save that much time though. m = self._child_match.search(record) if m: data['id'] = m.group(1) m = self._parent_match.search(record) if m: data['parent'] = m.group(1) m = self._name_match.search(record) if m: data['scientific_name'] = m.group(1) m = self._rank_match.search(record) if m: data['rank'] = m.group(1) return data def fetch_by_id(self, taxon): """Search for a taxon by identifier :return; a dictionary. :: >>> ret = s.search_by_id('10090') >>> ret['name'] 'Mus Musculus' """ res = self.ensembl.get_taxonomy_by_id(taxon) return res def fetch_by_name(self, name): """Search a taxon by its name. :param str name: name of an organism. SQL cards possible e.g., _ and % characters. :return: a list of possible matches. Each item being a dictionary. :: >>> ret = s.search_by_name('Mus Musculus') >>> ret[0]['id'] 10090 """ res = self.ensembl.get_taxonomy_by_name(name) return res def on_web(self, taxon): """Open UniProt page for a given taxon""" # Should work for python2 and 3 import webbrowser try: from urllib.request import urlopen from urllib.error import HTTPError, URLError except: from urllib2 import urlopen, HTTPError, URLError try: urlopen('http://www.uniprot.org/taxonomy/%s' % taxon) webbrowser.open("http://www.uniprot.org/taxonomy/%s" % taxon) except HTTPError as err: print("Invalid taxon") except URLError as err: print(err.args) def get_lineage(self, taxon): """Get lineage of a taxon :param int taxon: a known taxon :return: list containing the lineage """ # important to reinit the second argument to [] taxon = int(taxon) lineage = self._gen_lineage_and_rank(taxon, []) lineage = [x[0] for x in lineage] return lineage def _gen_lineage_and_rank(self, taxon, lineage_rank=[]): # recursively filling the lineage argument if len(self.records) == 0: self.load_records() record = self.records[taxon] parent = int(record['parent']) if parent not in [0]: lineage_rank.append((record['scientific_name'], record['rank'])) return self._gen_lineage_and_rank(parent, lineage_rank) else: lineage_rank.reverse() return lineage_rank def get_lineage_and_rank(self, taxon): """Get lineage and rank of a taxon :param int taxon: :return: a list of tuples. Each tuple is a pair of taxon name/rank The list is the lineage for to the input taxon. """ taxon = int(taxon) lineage = self._gen_lineage_and_rank(taxon, []) return lineage def get_children(self, taxon): if len(self.records) == 0: self.load_records() taxon = str(taxon) children = [self.records[k] for k in self.records.keys() if self.records[k]['parent'] == taxon] children = [child['id'] for child in children] return children def get_family_tree(self, taxon): """root is taxon and we return the corresponding tree""" # should limit the tree size # uniprot flat files has no record about childrent, so we would # need to reconstruct the tree tree = {} children = self.get_children(taxon) if len(children) == 0: return tree else: return [self.get_family_tree(child) for child in children] def __getitem__(self, iden): if len(self.records) == 0: self.load_records() return self.records[iden]
def __init__(self, verbose=True): from bioservices import Ensembl self.ensembl = Ensembl(verbose=False) self.records = {} # empty to start with. self.verbose = verbose
class Taxon(object): """Utility to search for information related to a taxon Uses HGNC service to fetch information about a taxon. :: >>> from bioservices.apps.taxonomy import Taxon >>> t = Taxon() >>> t.search_by_taxon("9606") {'Scientific Name': 'H**o sapiens', 'taxon': '9606'} You can also pop up the Uniprot page using:: t.uniprot_onweb("9606") A full list of taxons is available here:: http://www.uniprot.org/taxonomy/?query=*&format=* .. versionadded:: 1.2.0 """ def __init__(self): super(Taxon, self).__init__() # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"]) self._eutils_service = EUtils() self._ensembl_service = Ensembl() # there is a search by name, easier to use than EUtils def search_by_name(self, name): """using ensembl, tries to get the taxon identifier from the given name :: >>> s.search_by_name('mouse') 10090 """ res = self._ensembl_service.get_taxonomy_name("mouse")[0] try: return res['id'] except: return res def search_by_taxon(self, taxon): """ should be a string without comma (only one entry accepted") """ assert isinstance(taxon, str) assert "," not in taxon ret = self._eutils_service.taxonomy(taxon) if ret == "\n": # nothing found pass else: res = {'taxon': taxon, 'Scientific Name': ret.Taxon[0].ScientificName} # self.df.append(res) return res def info(self, taxon, lineage=False): """Prints information about a Taxon :param str taxon: taxon identifier :param bool lineage: prints lineage is set to True """ ret = self._eutils_service.taxonomy(taxon) print("Display Name: %s" % ret.Taxon[0].OtherNames.Name.DispName) print("GenBank Common name: %s" % ret.Taxon[0].OtherNames.GenbankCommonName) print("Taxon Id: %s " % ret.Taxon[0].TaxId) if lineage: print("Lineage:") for i, x in enumerate(ret.Taxon[0].Lineage.split(";")): print(i*" "+x) def uniprot_onweb(self, taxon): """Open Uniprot taxonomy page for a given taxon :param str taxon: taxon identifier """ import webbrowser try: from urllib.request import urlopen from urllib.error import HTTPError, URLError except: from urllib2 import urlopen, HTTPError, URLError try: urlopen('http://www.uniprot.org/taxonomy/%s' % taxon) webbrowser.open("http://www.uniprot.org/taxonomy/%s" % taxon) except HTTPError as err: print("Invalid taxon") except URLError as err: print(err.args)