class KEGGMapper(object):
    """Gather KEGG gene entries for the ``hsa`` organism code into a table.

    Construction performs three steps:

    1. list all ``hsa`` gene identifiers through ``KeggParser.list``,
    2. load the matching entries into :attr:`alldata` (from a local pickle
       cache when available, otherwise through ``test_func``),
    3. build a :class:`pandas.DataFrame` stored in :attr:`df`.

    Attributes:
        alldata: dict mapping an entry identifier (``"hsa:<id>"``) to the
            parsed KEGG entry (a dict).
        df: the dataframe built by :meth:`build_dataframe`, or ``None`` when
            building failed.
        mcresults: raw results of the last download (only set when the
            download path of :meth:`load_all_kegg_entries` ran).
    """

    # Database names accepted in an entry's ``dblinks`` field; each one
    # becomes a ``<name>_kegg`` column of the dataframe.
    kegg_dblinks = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM",
                    "NCBI-GeneID", "UniProt", "Vega"]

    def __init__(self, verbose=True):
        """Load identifiers and entries, then build the dataframe.

        :param verbose: forwarded to ``KeggParser``.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [line.split("\t")[0]
                       for line in listing.strip().split("\n")]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        # Defined up front so the attribute exists even if building fails.
        self.df = None
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Best effort: keep the instance usable (alldata is still
            # available) but report why the table could not be built.
            print("error in build_dataframe")
            print(err)

    def _get_names(self):
        return self._names
    names = property(_get_names,
                     doc="Gene identifiers returned by the KEGG listing.")

    def _get_entries(self):
        return sorted(self.alldata.keys())
    entries = property(_get_entries,
                       doc="Sorted identifiers of the loaded entries.")

    def build_dataframe(self):
        """Return a dataframe with one row per loaded entry.

        Columns are ``<db>_kegg`` for every name in :attr:`kegg_dblinks`
        followed by a fixed set of entry fields. A field or database link
        missing from an entry is stored as ``None``.

        :raises NotImplementedError: if an entry's ``dblinks`` contains a
            database name that is not listed in :attr:`kegg_dblinks`.
        """
        fields = ['class', 'definition', 'disease', 'drug_target', 'module',
                  'motif', 'name', 'orthology', 'pathway', 'position',
                  'structure']

        # The property sorts on every access, so evaluate it only once.
        entries = self.entries
        nrows = len(entries)

        data = {}
        for dbname in self.kegg_dblinks:
            data["%s_kegg" % dbname] = [None] * nrows
        for field in fields:
            data[field] = [self.alldata[entry].get(field)
                           for entry in entries]

        # Fill the dblink columns before creating the frame: assigning
        # through ``df.ix[row][col]`` is chained indexing (it may write to a
        # temporary copy) and ``.ix`` no longer exists in current pandas.
        for row, entry in enumerate(entries):
            dblinks = self.alldata[entry].get('dblinks') or {}
            for key, value in dblinks.items():
                if key not in self.kegg_dblinks:
                    raise NotImplementedError(
                        "Found an unknown key in KEGG dblink:%s" % key)
                data[key + "_kegg"][row] = value

        return pd.DataFrame(data, index=entries)

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Populate :attr:`alldata` from a pickle cache or by downloading.

        :param filename: path of the pickle cache to look for.
        :return: the unpickled object when the cache file exists, otherwise
            ``None`` (downloaded entries are stored in :attr:`alldata`).
        """
        if os.path.isfile(filename):
            import pickle
            # SECURITY: unpickling can execute arbitrary code; only point
            # this at cache files you created yourself.
            # Binary mode is required by pickle on Python 3, and the
            # context manager closes the handle.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # The cache used to be returned and then discarded by __init__,
            # leaving alldata empty.
            # NOTE(review): assumes the cache holds a dict of
            # entry id -> parsed entry, like alldata -- confirm against
            # whatever writes the file.
            if isinstance(results, dict):
                self.alldata.update(results)
            return results

        # TODO:
        # download from a URL
        print("could not find kegg data. \nfetching data from website if possible")

        # Per the original note, test_func fetches the KEGG results using
        # multicore to send several requests at the same time.
        found = set(self.alldata)
        names = [x for x in self.names if x not in found]
        print("Fetching %s enties" % len(names))
        mc = test_func(names)
        self.mcresults = mc.results.copy()

        # Derive each key from the result it belongs to. Filtering the keys
        # but zipping against the unfiltered results shifted every entry
        # that followed an empty result onto the wrong record.
        try:
            fetched = {}
            for result in self.mcresults:
                if result:
                    fetched["hsa:" + result['entry'].split()[0]] = result
        except Exception:
            # Best effort, all-or-nothing: nothing is stored on failure.
            print("something wrng happened while scaning mcresults")
        else:
            self.alldata.update(fetched)
class KEGGMapper(object):
    """Gather KEGG gene entries for the ``hsa`` organism code into a table.

    Construction performs three steps:

    1. list all ``hsa`` gene identifiers through ``KeggParser.list``,
    2. load the matching entries into :attr:`alldata` (from a local pickle
       cache when available, otherwise through ``test_func``),
    3. build a :class:`pandas.DataFrame` stored in :attr:`df`.

    Attributes:
        alldata: dict mapping an entry identifier (``"hsa:<id>"``) to the
            parsed KEGG entry (a dict).
        df: the dataframe built by :meth:`build_dataframe`, or ``None`` when
            building failed.
        mcresults: raw results of the last download (only set when the
            download path of :meth:`load_all_kegg_entries` ran).
    """

    # Database names accepted in an entry's ``dblinks`` field; each one
    # becomes a ``<name>_kegg`` column of the dataframe.
    kegg_dblinks = [
        "IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID",
        "UniProt", "Vega"
    ]

    def __init__(self, verbose=True):
        """Load identifiers and entries, then build the dataframe.

        :param verbose: forwarded to ``KeggParser``.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [
            line.split("\t")[0] for line in listing.strip().split("\n")
        ]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        # Defined up front so the attribute exists even if building fails.
        self.df = None
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Best effort: keep the instance usable (alldata is still
            # available) but report why the table could not be built.
            print("error in build_dataframe")
            print(err)

    def _get_names(self):
        return self._names

    names = property(_get_names,
                     doc="Gene identifiers returned by the KEGG listing.")

    def _get_entries(self):
        return sorted(self.alldata.keys())

    entries = property(_get_entries,
                       doc="Sorted identifiers of the loaded entries.")

    def build_dataframe(self):
        """Return a dataframe with one row per loaded entry.

        Columns are ``<db>_kegg`` for every name in :attr:`kegg_dblinks`
        followed by a fixed set of entry fields. A field or database link
        missing from an entry is stored as ``None``.

        :raises NotImplementedError: if an entry's ``dblinks`` contains a
            database name that is not listed in :attr:`kegg_dblinks`.
        """
        fields = [
            'class', 'definition', 'disease', 'drug_target', 'module',
            'motif', 'name', 'orthology', 'pathway', 'position', 'structure'
        ]

        # The property sorts on every access, so evaluate it only once.
        entries = self.entries
        nrows = len(entries)

        data = {}
        for dbname in self.kegg_dblinks:
            data["%s_kegg" % dbname] = [None] * nrows
        for field in fields:
            data[field] = [
                self.alldata[entry].get(field) for entry in entries
            ]

        # Fill the dblink columns before creating the frame: assigning
        # through ``df.ix[row][col]`` is chained indexing (it may write to a
        # temporary copy) and ``.ix`` no longer exists in current pandas.
        for row, entry in enumerate(entries):
            dblinks = self.alldata[entry].get('dblinks') or {}
            for key, value in dblinks.items():
                if key not in self.kegg_dblinks:
                    raise NotImplementedError(
                        "Found an unknown key in KEGG dblink:%s" % key)
                data[key + "_kegg"][row] = value

        return pd.DataFrame(data, index=entries)

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Populate :attr:`alldata` from a pickle cache or by downloading.

        :param filename: path of the pickle cache to look for.
        :return: the unpickled object when the cache file exists, otherwise
            ``None`` (downloaded entries are stored in :attr:`alldata`).
        """
        if os.path.isfile(filename):
            import pickle
            # SECURITY: unpickling can execute arbitrary code; only point
            # this at cache files you created yourself.
            # Binary mode is required by pickle on Python 3, and the
            # context manager closes the handle.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # The cache used to be returned and then discarded by __init__,
            # leaving alldata empty.
            # NOTE(review): assumes the cache holds a dict of
            # entry id -> parsed entry, like alldata -- confirm against
            # whatever writes the file.
            if isinstance(results, dict):
                self.alldata.update(results)
            return results

        # TODO:
        # download from a URL
        print(
            "could not find kegg data. \nfetching data from website if possible")

        # Per the original note, test_func fetches the KEGG results using
        # multicore to send several requests at the same time.
        found = set(self.alldata)
        names = [x for x in self.names if x not in found]
        print("Fetching %s enties" % len(names))
        mc = test_func(names)
        self.mcresults = mc.results.copy()

        # Derive each key from the result it belongs to. Filtering the keys
        # but zipping against the unfiltered results shifted every entry
        # that followed an empty result onto the wrong record.
        try:
            fetched = {}
            for result in self.mcresults:
                if result:
                    fetched["hsa:" + result['entry'].split()[0]] = result
        except Exception:
            # Best effort, all-or-nothing: nothing is stored on failure.
            print("something wrng happened while scaning mcresults")
        else:
            self.alldata.update(fetched)