def __init__(self, filename=None):
    """Create the HGNC service handle and load the mapping data.

    :param filename: value passed on to ``self.read_csv``. When it is
        ``None`` (the default), the data is instead obtained through
        ``self.load_all_hgnc()`` and tabulated by ``self.build_dataframe()``.
    """
    self._hgnc_service = HGNC()
    # Identity test: None is a singleton, and ``==`` can be overridden by a
    # custom ``__eq__`` on whatever object is passed as *filename*.
    if filename is None:
        self.alldata = self.load_all_hgnc()
        self.df = self.build_dataframe()
    else:
        self.read_csv(filename)
def _lookup_hgnc_id(self):
    """Fill in this object from the HGNC record matching ``self.external_id``.

    HGNC is queried on the ``hgnc_id`` field.

    * Exactly one hit: ``self.name`` is set to the record's ``symbol`` and
      ``self.description`` to its ``name``. If ``self.get_synonyms`` is
      truthy, the record's ``alias_symbol`` entries are added to
      ``self.synonyms``.
    * No hit: ``self.error`` is set to an explanatory message.
    * Any other hit count: nothing is modified.

    :return: ``self``.
    """
    response = HGNC().fetch('hgnc_id', self.external_id)['response']
    hits = response['numFound']

    if hits == 1:
        record = response['docs'][0]
        self.name = record['symbol']
        self.description = record['name']
        # Get synonyms if requested.
        if self.get_synonyms:
            # A record without aliases presumably carries no 'alias_symbol'
            # key at all (TODO confirm against the HGNC REST docs); fall back
            # to an empty list instead of raising KeyError.
            self.synonyms.extend(record.get('alias_symbol', []))
    elif hits == 0:
        self.error = "No results found when querying HGNC for {}".format(
            self.external_id)

    return self
class HGNCMapper(object):
    """Load HGNC mapping data and expose it as a pandas DataFrame.

    After construction without a filename, ``self.alldata`` holds the raw
    result of the HGNC service's ``mapping_all()`` call and ``self.df`` the
    table built from it by :meth:`build_dataframe`.
    """

    # Presumably the database labels that HGNC cross-references; the list is
    # not used by any method of this class.
    hgnc_dblink = [
        'EC', 'Ensembl', 'EntrezGene', 'GDB', 'GENATLAS', 'GeneCards',
        'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM', 'HCOP', 'HGNC', 'HORDE',
        'IMGT_GENE_DB', 'INTERFIL', 'IUPHAR', 'KZNF', 'MEROPS', 'Nucleotide',
        'OMIM', 'PubMed', 'RefSeq', 'Rfam', 'Treefam', 'UniProt', 'Vega',
        'miRNA', 'snoRNABase',
    ]

    def __init__(self, filename=None):
        """Create the HGNC service handle and load the data.

        :param filename: value passed on to ``self.read_csv``; when ``None``
            the data is fetched from the HGNC service instead.
        """
        self._hgnc_service = HGNC()
        # Identity test: None is a singleton.
        if filename is None:
            self.alldata = self.load_all_hgnc()
            self.df = self.build_dataframe()
        else:
            self.read_csv(filename)

    def load_all_hgnc(self):
        """keys are unique Gene names

        :return: whatever ``self._hgnc_service.mapping_all()`` returns.
        """
        # The stray trailing comma (a Python 2 print-statement leftover that
        # is a no-op tuple on Python 3) has been removed.
        print("Fetching the data from HGNC first. May take a few minutes")
        alldata = self._hgnc_service.mapping_all()
        print("done")
        return alldata

    def build_dataframe(self):
        """Tabulate ``self.alldata`` with one row per top-level key.

        Every value of ``self.alldata`` must itself be a mapping whose values
        are dicts containing an ``'xkey'`` entry; only that entry is kept.
        Column names are the inner keys suffixed with ``"__HGNC_mapping"``.

        :return: the resulting ``pandas.DataFrame``.
        """
        # simplify to get a dictionary of dictionary
        # ``items()`` works on Python 2 and 3; ``iteritems()`` only on 2.
        data = {
            gene: {db: xref['xkey'] for db, xref in xrefs.items()}
            for gene, xrefs in self.alldata.items()
        }
        dfdata = pd.DataFrame(data).transpose()
        # rename to tag with "HGNC"
        dfdata.columns = [this + "__HGNC_mapping" for this in dfdata.columns]
        # NOTE(review): the message names self._df_hgnc, but __init__ stores
        # the returned frame in self.df.
        print("a dataframe was built using HGNC data set and saved in attributes self._df_hgnc")
        return dfdata
def add_sequence_to_nodes(n: str, d: Dict[str, Any]):
    """
    Attach UniProt accessions and their sequences to a node's attributes.

    The node's ``protein_id`` is sent to HGNC as a ``symbol`` query. The
    ``uniprot_ids`` of the first returned record are stored under
    ``d["uniprot_ids"]``; for each of them the result of UniProt's
    ``get_fasta_sequence`` is stored under ``d["sequence_<accession>"]``.

    :param n: Graph node (not used in the body).
    :type n: str
    :param d: Graph attribute dictionary; modified in place.
    :type d: Dict[str, Any]
    """
    hgnc_client = HGNC(verbose=False)
    uniprot_client = UniProt(verbose=False)

    hgnc_reply = hgnc_client.fetch("symbol", d["protein_id"])
    first_record = hgnc_reply["response"]["docs"][0]
    d["uniprot_ids"] = first_record["uniprot_ids"]

    # Todo these API calls should probably be batched
    # Todo mapping with bioservices to support other protein IDs?
    for accession in d["uniprot_ids"]:
        sequence_key = f"sequence_{accession}"
        d[sequence_key] = uniprot_client.get_fasta_sequence(accession)
def kegg_to_hugo(genes, species='hsa'):
    """
    Converts all KEGG names to HGNC

    Parameters
    ----------
    genes : list
        KEGG gene identifiers, optionally prefixed with ``"<species>:"``.
    species : str
        Organism code; ``species + ':'`` is the prefix removed from each
        identifier before it is searched in HGNC.

    Returns
    -------
    tuple
        ``(hugo_dict, not_found)``: a dict mapping each resolved input
        identifier to an HGNC symbol, and the set of identifiers for which
        HGNC returned no usable symbol. An identifier whose reply lacks a
        ``'response'`` or ``'numFound'`` entry ends up in neither.
    """
    prefix = species + ':'
    hugo = HGNC(verbose=True)
    hugo_dict = {}
    not_found = set()

    for gene in genes:
        # str.lstrip(prefix) would strip any leading run of the *characters*
        # in prefix (e.g. 'hsa:sash1' -> '1'), so cut the literal prefix.
        query = gene[len(prefix):] if gene.startswith(prefix) else gene
        mapping = hugo.search(query)

        if 'response' not in mapping:
            continue
        response = mapping['response']
        if 'numFound' not in response:
            continue

        if response['numFound'] == 0:
            not_found.add(gene)
            continue

        # One hit: the symbol is taken unconditionally. Several hits: the
        # first document is used, but only if it actually carries a symbol.
        first_doc = response['docs'][0]
        if response['numFound'] == 1 or 'symbol' in first_doc:
            hugo_dict[gene] = first_doc['symbol']
        else:
            not_found.add(gene)

    # Truthiness test: comparing a set with 0 is always "not equal", which
    # made the report print even when nothing was missing.
    if not_found:
        print("{} not found after HGNC mapping".format(len(not_found)))
        print("{} ".format(not_found))
    return hugo_dict, not_found
def __init__(self, verbosity="INFO"):
    """Initialise the parent with the given log level, then create each service.

    :param verbosity: forwarded to the parent constructor as ``level``.

    An info message is logged before each service object is created; the
    services are stored on private attributes of the instance.
    """
    super(Mapper, self).__init__(level=verbosity)

    announce = self.logging.info
    announce("Initialising the services")

    announce("... uniprots")
    self._uniprot_service = UniProt()

    announce("... KEGG")
    self._kegg_service = KeggParser(verbose=False)

    announce("... HGNC")
    self._hgnc_service = HGNC()

    announce("... UniChem")
    self._unichem_service = UniChem()

    announce("...BioDBNet")
    self._biodbnet = BioDBNet()
class HGNCMapper(object):
    """Load HGNC mapping data and expose it as a pandas DataFrame.

    When constructed without a filename, ``self.alldata`` receives the result
    of the HGNC service's ``mapping_all()`` call and ``self.df`` the table
    produced from it by :meth:`build_dataframe`.
    """

    # Presumably the database labels that HGNC cross-references; no method of
    # this class reads the list.
    hgnc_dblink = [
        'EC', 'Ensembl', 'EntrezGene', 'GDB', 'GENATLAS', 'GeneCards',
        'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM', 'HCOP', 'HGNC', 'HORDE',
        'IMGT_GENE_DB', 'INTERFIL', 'IUPHAR', 'KZNF', 'MEROPS', 'Nucleotide',
        'OMIM', 'PubMed', 'RefSeq', 'Rfam', 'Treefam', 'UniProt', 'Vega',
        'miRNA', 'snoRNABase'
    ]

    def __init__(self, filename=None):
        """Create the HGNC service handle and load the data.

        :param filename: value passed on to ``self.read_csv``; when ``None``
            the data is fetched from the HGNC service instead.
        """
        self._hgnc_service = HGNC()
        # ``is None`` rather than ``== None``: None is a singleton.
        if filename is None:
            self.alldata = self.load_all_hgnc()
            self.df = self.build_dataframe()
        else:
            self.read_csv(filename)

    def load_all_hgnc(self):
        """keys are unique Gene names

        :return: whatever ``self._hgnc_service.mapping_all()`` returns.
        """
        # Trailing comma after print(...) removed: it was a Python 2
        # print-statement leftover and a no-op tuple on Python 3.
        print("Fetching the data from HGNC first. May take a few minutes")
        alldata = self._hgnc_service.mapping_all()
        print("done")
        return alldata

    def build_dataframe(self):
        """Tabulate ``self.alldata`` with one row per top-level key.

        Each value of ``self.alldata`` must be a mapping whose values are
        dicts holding an ``'xkey'`` entry; only that entry is retained.
        Column names are the inner keys suffixed with ``"__HGNC_mapping"``.

        :return: the resulting ``pandas.DataFrame``.
        """
        # simplify to get a dictionary of dictionary
        # ``iteritems()`` exists only on Python 2 dicts; ``items()`` is
        # available on both major versions.
        data = {
            gene: {db: xref['xkey'] for db, xref in xrefs.items()}
            for gene, xrefs in self.alldata.items()
        }
        dfdata = pd.DataFrame(data)
        dfdata = dfdata.transpose()
        # rename to tag with "HGNC"
        dfdata.columns = [this + "__HGNC_mapping" for this in dfdata.columns]
        # NOTE(review): the message refers to self._df_hgnc, whereas __init__
        # keeps the returned frame in self.df.
        print(
            "a dataframe was built using HGNC data set and saved in attributes self._df_hgnc"
        )
        return dfdata
def test_hgnc():
    """Smoke test: run ``get_info``, two ``fetch`` and several ``search`` calls.

    Nothing is asserted; the test only checks that none of the calls raises.
    """
    client = HGNC()
    client.get_info()

    fetch_queries = (
        ('symbol', 'ZNF3'),
        ('alias_name', 'A-kinase anchor protein, 350kDa'),
    )
    for field, term in fetch_queries:
        client.fetch(field, term)

    # The first search passes a single positional argument, the others two.
    search_queries = (
        ('BRAF',),
        ('symbol', 'ZNF*'),
        ('symbol', 'ZNF?'),
        ('symbol', 'ZNF*+AND+status:Approved'),
        ('symbol', 'ZNF3+OR+ZNF12'),
        ('symbol', 'ZNF*+NOT+status:Approved'),
    )
    for arguments in search_queries:
        client.search(*arguments)
def __init__(self):
    """Create an ``HGNC`` client with ``verbose=False`` and keep it as ``self.s``."""
    hgnc_client = HGNC(verbose=False)
    self.s = hgnc_client
class test_hgnc():
    """Tests exercising an ``HGNC`` client; each test is tagged ``attr('skip')``."""

    def __init__(self):
        # One non-verbose client shared by all test methods.
        self.s = HGNC(verbose=False)

    @attr('skip')
    def test_get_xml(self):
        """A two-symbol query must yield two ``gene`` elements."""
        # Single-symbol and unknown-symbol calls are only checked for not
        # raising; the unused ``xml =`` binding on the first call was dropped.
        self.s.get_xml("ZAP70")
        xml = self.s.get_xml("ZAP70;INSR")
        assert len(xml.findAll("gene")) == 2
        self.s.get_xml("wrong")

    @attr('skip')
    def test_aliases(self):
        """Check the aliases of ZAP70 and call the other per-gene getters."""
        assert self.s.get_aliases("ZAP70") == [u'ZAP-70', u'STD']
        self.s.get_name("ZAP70")
        self.s.get_chromosome("ZAP70")
        self.s.get_previous_symbols("ZAP70")
        self.s.get_withdrawn_symbols("ZAP70")
        self.s.get_previous_names("ZAP70")

    @attr('skip')
    def test_xref(self):
        """Check the UniProt cross-reference key and XML link of ZAP70."""
        assert self.s.get_xrefs("ZAP70")['UniProt']['xkey'] == 'P43403'
        assert self.s.get_xrefs("ZAP70", "xml")['UniProt']['link'] == [
            'http://www.uniprot.org/uniprot/P43403.xml'
        ]

    @attr('skip')
    def test_lookfor(self):
        """``lookfor`` must not raise for a known symbol."""
        self.s.lookfor("ZAP70")

    @attr('skip')
    def test_mapping(self):
        """Mapping a UniProt accession must lead back to ZAP70."""
        value = "UniProt:P43403"
        res = self.s.mapping(value)
        # The comparison was a bare expression with no effect; assert it.
        assert res[0]['xlink:title'] == "ZAP70"
def test_hgnc():
    """Smoke test of the HGNC client: info, two fetches, then six searches.

    No return value is inspected; the calls only have to complete.
    """
    service = HGNC()
    service.get_info()

    for field, term in [
        ("symbol", "ZNF3"),
        ("alias_name", "A-kinase anchor protein, 350kDa"),
    ]:
        service.fetch(field, term)

    # Bare-term search first, then field-qualified searches.
    service.search("BRAF")
    symbol_patterns = [
        "ZNF*",
        "ZNF?",
        "ZNF*+AND+status:Approved",
        "ZNF3+OR+ZNF12",
        "ZNF*+NOT+status:Approved",
    ]
    for pattern in symbol_patterns:
        service.search("symbol", pattern)
class test_hgnc():
    """Tests exercising an ``HGNC`` client stored on ``self.s``."""

    def __init__(self):
        # One non-verbose client shared by all test methods.
        self.s = HGNC(verbose=False)

    def test_get_xml(self):
        """A two-symbol query must yield two ``gene`` elements."""
        # The single-symbol and unknown-symbol calls are only checked for not
        # raising; the unused ``xml =`` binding on the first call was dropped.
        self.s.get_xml("ZAP70")
        xml = self.s.get_xml("ZAP70;INSR")
        assert len(xml.findAll("gene")) == 2
        self.s.get_xml("wrong")

    def test_aliases(self):
        """Check the aliases of ZAP70 and call the other per-gene getters."""
        assert self.s.get_aliases("ZAP70") == [u'ZAP-70', u'STD']
        self.s.get_name("ZAP70")
        self.s.get_chromosome("ZAP70")
        self.s.get_previous_symbols("ZAP70")
        self.s.get_withdrawn_symbols("ZAP70")
        self.s.get_previous_names("ZAP70")

    def test_xref(self):
        """Check the UniProt cross-reference key and XML link of ZAP70."""
        assert self.s.get_xrefs("ZAP70")['UniProt']['xkey'] == 'P43403'
        assert self.s.get_xrefs("ZAP70", "xml")['UniProt']['link'] == [
            'http://www.uniprot.org/uniprot/P43403.xml'
        ]

    def test_lookfor(self):
        """``lookfor`` must not raise for a known symbol."""
        self.s.lookfor("ZAP70")

    def test_mapping(self):
        """Mapping a UniProt accession must lead back to ZAP70."""
        value = "UniProt:P43403"
        res = self.s.mapping(value)
        # The comparison was a bare expression with no effect; assert it.
        assert res['xlink:title'] == "ZAP70"