def load_ensembl2pfam(self):
    """Load the Pfam annotation file and return it keyed by Entrez gene id.

    Reads 'gene_ensembl__prot_pfam__dm.txt' from DATA_FOLDER, wraps each
    gene's value under a 'pfam' key, and passes the result through
    ``self.convert2entrez``.
    """
    pfam_path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
    load_start(pfam_path)

    # Columns (1, 4) are selected with key index 0 -- presumably column 1 is
    # the Ensembl gene id and column 4 the Pfam id (TODO confirm against the
    # dump layout).
    raw_table = tab2dict(pfam_path, (1, 4), 0)
    gene2pfam = dict_nodup(raw_table)

    # traverse_list=False: the converter receives each value as a whole, so
    # a gene with several hits gets a single {'pfam': ...} wrapper.
    gene2pfam = value_convert(
        gene2pfam,
        lambda hits: {'pfam': hits},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2pfam))
    return self.convert2entrez(gene2pfam)
def load_ensembl2pfam(self):
    """Load the Pfam annotation file and return it keyed by Entrez gene id.

    Reads 'gene_ensembl__prot_pfam__dm.txt' from DATA_FOLDER, wraps each
    gene's value under a 'pfam' key, and passes the result through
    ``self.convert2entrez``.
    """
    pfam_path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
    load_start(pfam_path)

    # Columns (1, 4) are selected with key index 0 -- presumably column 1 is
    # the Ensembl gene id and column 4 the Pfam id (TODO confirm against the
    # dump layout).
    raw_table = tab2dict(pfam_path, (1, 4), 0)
    gene2pfam = dict_nodup(raw_table)

    # traverse_list=False: the converter receives each value as a whole, so
    # a gene with several hits gets a single {'pfam': ...} wrapper.
    gene2pfam = value_convert(
        gene2pfam,
        lambda hits: {'pfam': hits},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2pfam))
    return self.convert2entrez(gene2pfam)
def _load_ensembl_2taxid(self):
    """Build and return the ensembl2taxid mapping with integer taxids.

    Reads 'gene_ensembl__translation__main.txt' from DATA_FOLDER. The result
    is returned directly; it is not converted to Entrez gene ids.
    """
    translation_path = os.path.join(DATA_FOLDER,
                                    'gene_ensembl__translation__main.txt')
    load_start(translation_path)

    # Columns (0, 1) with key index 1 -- presumably column 1 holds the
    # Ensembl id and column 0 the taxid (TODO confirm). Rows are filtered
    # through _not_LRG.
    raw_table = tab2dict(translation_path, (0, 1), 1, includefn=_not_LRG)
    gene2taxid = dict_nodup(raw_table)

    # The file yields taxids as text; downstream code needs integers.
    gene2taxid = value_convert(gene2taxid, int)

    load_done('[%d]' % len(gene2taxid))
    return gene2taxid
def load_ensembl2interpro(self):
    """Load the InterPro annotation file and return it keyed by Entrez gene id.

    Reads 'gene_ensembl__prot_interpro__dm.txt' from DATA_FOLDER. Each hit
    becomes a dict with 'id', 'short_desc' and 'desc'; each gene's hits are
    wrapped under an 'interpro' key before ``self.convert2entrez`` is applied.
    """
    interpro_path = os.path.join(DATA_FOLDER,
                                 'gene_ensembl__prot_interpro__dm.txt')
    load_start(interpro_path)

    def _as_record(fields):
        # fields holds the three non-key columns, in file order.
        return {'id': fields[0], 'short_desc': fields[1], 'desc': fields[2]}

    # Columns (1, 4, 5, 6) with key index 0 -- presumably column 1 is the
    # Ensembl gene id (TODO confirm against the dump layout).
    raw_table = tab2dict(interpro_path, (1, 4, 5, 6), 0)
    gene2interpro = dict_nodup(raw_table)

    # First pass converts every individual hit; the second pass
    # (traverse_list=False) wraps the whole per-gene value once.
    gene2interpro = value_convert(gene2interpro, _as_record)
    gene2interpro = value_convert(
        gene2interpro,
        lambda hits: {'interpro': hits},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2interpro))
    return self.convert2entrez(gene2interpro)
def load_ensembl2pos(self):
    """Load genomic positions and return them keyed by Entrez gene id.

    Reads 'gene_ensembl__gene__main.txt' from DATA_FOLDER. Each position is
    a dict with 'chr', 'start', 'end' and 'strand' (the last three as ints),
    wrapped under a 'genomic_pos' key before ``self.convert2entrez`` runs.
    """
    gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(gene_main_path)

    def _as_position(fields):
        # Non-key columns arrive as (start, end, chr, strand).
        return {
            'chr': fields[2],
            'start': int(fields[0]),
            'end': int(fields[1]),
            'strand': int(fields[3]),
        }

    # Columns (1, 3, 4, 5, 6) with key index 0 -- presumably column 1 is the
    # Ensembl gene id (TODO confirm). Rows are filtered through _not_LRG.
    raw_table = tab2dict(gene_main_path, (1, 3, 4, 5, 6), 0,
                         includefn=_not_LRG)
    gene2pos = dict_nodup(raw_table)

    gene2pos = value_convert(gene2pos, _as_position)
    # traverse_list=False: wrap the whole per-gene value exactly once.
    gene2pos = value_convert(
        gene2pos,
        lambda pos: {'genomic_pos': pos},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2pos))
    return self.convert2entrez(gene2pos)
def _load_ensembl_2taxid(self):
    """Build and return the ensembl2taxid mapping with integer taxids.

    Reads 'gene_ensembl__translation__main.txt' from DATA_FOLDER. The result
    is returned directly; it is not converted to Entrez gene ids.
    """
    translation_path = os.path.join(DATA_FOLDER,
                                    'gene_ensembl__translation__main.txt')
    load_start(translation_path)

    # Columns (0, 1) with key index 1 -- presumably column 1 holds the
    # Ensembl id and column 0 the taxid (TODO confirm). Rows are filtered
    # through _not_LRG.
    raw_table = tab2dict(translation_path, (0, 1), 1, includefn=_not_LRG)
    gene2taxid = dict_nodup(raw_table)

    # The file yields taxids as text; downstream code needs integers.
    gene2taxid = value_convert(gene2taxid, int)

    load_done('[%d]' % len(gene2taxid))
    return gene2taxid
def load_ensembl2interpro(self):
    """Load the InterPro annotation file and return it keyed by Entrez gene id.

    Reads 'gene_ensembl__prot_interpro__dm.txt' from DATA_FOLDER. Each hit
    becomes a dict with 'id', 'short_desc' and 'desc'; each gene's hits are
    wrapped under an 'interpro' key before ``self.convert2entrez`` is applied.
    """
    interpro_path = os.path.join(DATA_FOLDER,
                                 'gene_ensembl__prot_interpro__dm.txt')
    load_start(interpro_path)

    def _as_record(fields):
        # fields holds the three non-key columns, in file order.
        return {'id': fields[0], 'short_desc': fields[1], 'desc': fields[2]}

    # Columns (1, 4, 5, 6) with key index 0 -- presumably column 1 is the
    # Ensembl gene id (TODO confirm against the dump layout).
    raw_table = tab2dict(interpro_path, (1, 4, 5, 6), 0)
    gene2interpro = dict_nodup(raw_table)

    # First pass converts every individual hit; the second pass
    # (traverse_list=False) wraps the whole per-gene value once.
    gene2interpro = value_convert(gene2interpro, _as_record)
    gene2interpro = value_convert(
        gene2interpro,
        lambda hits: {'interpro': hits},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2interpro))
    return self.convert2entrez(gene2interpro)
def load_ensembl2pos(self):
    """Load genomic positions and return them keyed by Entrez gene id.

    Reads 'gene_ensembl__gene__main.txt' from DATA_FOLDER. Each position is
    a dict with 'chr', 'start', 'end' and 'strand' (the last three as ints),
    wrapped under a 'genomic_pos' key before ``self.convert2entrez`` runs.
    """
    gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(gene_main_path)

    def _as_position(fields):
        # Non-key columns arrive as (start, end, chr, strand).
        return {
            'chr': fields[2],
            'start': int(fields[0]),
            'end': int(fields[1]),
            'strand': int(fields[3]),
        }

    # Columns (1, 3, 4, 5, 6) with key index 0 -- presumably column 1 is the
    # Ensembl gene id (TODO confirm). Rows are filtered through _not_LRG.
    raw_table = tab2dict(gene_main_path, (1, 3, 4, 5, 6), 0,
                         includefn=_not_LRG)
    gene2pos = dict_nodup(raw_table)

    gene2pos = value_convert(gene2pos, _as_position)
    # traverse_list=False: wrap the whole per-gene value exactly once.
    gene2pos = value_convert(
        gene2pos,
        lambda pos: {'genomic_pos': pos},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2pos))
    return self.convert2entrez(gene2pos)
def convert2entrez(self, ensembl2x):
    """Re-key a dict of Ensembl gene ids onto matching Entrez gene ids.

    Args:
        ensembl2x: dict whose keys are Ensembl gene ids.

    Returns:
        A dict keyed by Entrez gene id for every entry in the Ensembl/Entrez
        pair list, plus one entry keyed by the Ensembl id itself for each
        input gene that has no Entrez match. Every value is passed through
        ``dict_nodup`` or ``dict_attrmerge``.

    Side effects:
        Populates ``self.ensembl2entrez_li`` and ``self.ensembl_main`` when
        they are empty, and prints three coverage counts to stdout.
    """
    # Lazily fill the cached lookup data on first use.
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()

    # The same pair list indexed by each of its two positions.
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

    source_ids = set(ensembl2x)
    mapped_ids = set(ensembl2entrez)
    unmapped_ids = source_ids - mapped_ids

    print('# of ensembl IDs in total: %d' % len(source_ids | mapped_ids))
    print('# of ensembl IDs match entrez Gene IDs: %d'
          % len(source_ids & mapped_ids))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d'
          % len(unmapped_ids))

    def _lookup(eid, taxid=None):
        # Hand out a copy rather than the stored value: several Entrez ids
        # can point at the same Ensembl gene (for example
        # ENSMUSG00000027104 --> (11909, 100047997)), and sharing one
        # object between them causes problems later.
        return copy.copy(ensembl2x.get(eid, {}))

    # Genes that do have an Entrez match.
    data = value_convert(entrez2ensembl, _lookup)

    # Genes without an Entrez match keep their Ensembl id as the key.
    for ensembl_id in unmapped_ids:
        data[ensembl_id] = ensembl2x[ensembl_id]

    for key in data:
        entry = data[key]
        if isinstance(entry, dict):
            data[key] = dict_nodup(entry, sort=True)
        else:
            # One Entrez gene matched several Ensembl genes: merge them.
            data[key] = dict_attrmerge(entry, removedup=True, sort=True)

    return data
def load_ensembl2pos(self):
    """Load genomic positions and return them keyed by Entrez gene id.

    Reads 'gene_ensembl__gene__main.txt' from DATA_FOLDER. Each position is
    a dict with 'ensemblgene', 'chr', 'start', 'end' and 'strand' (the last
    three as ints), wrapped under a 'genomic_pos' key before
    ``self.convert2entrez`` runs.
    """
    gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(gene_main_path)

    def _as_position(fields):
        # Non-key columns arrive as (gene id, start, end, chr, strand).
        return {
            'ensemblgene': fields[0],
            'chr': fields[3],
            'start': int(fields[1]),
            'end': int(fields[2]),
            'strand': int(fields[4]),
        }

    # Column 1 is requested twice on purpose: the first occurrence becomes
    # the dict key, the second carries the gene id into the genomic_pos
    # record itself. Rows are filtered through _not_LRG.
    selected_columns = (1, 1, 3, 4, 5, 6)
    raw_table = tab2dict(gene_main_path, selected_columns, 0,
                         includefn=_not_LRG)
    gene2pos = dict_nodup(raw_table)

    gene2pos = value_convert(gene2pos, _as_position)
    # traverse_list=False: wrap the whole per-gene value exactly once.
    gene2pos = value_convert(
        gene2pos,
        lambda pos: {'genomic_pos': pos},
        traverse_list=False,
    )

    load_done('[%d]' % len(gene2pos))
    return self.convert2entrez(gene2pos)
def convert2entrez(self, ensembl2x):
    """Convert a dict keyed by Ensembl gene ids to one keyed by Entrez gene ids.

    Args:
        ensembl2x: dict whose keys are Ensembl gene ids.

    Returns:
        A dict keyed by Entrez gene id for every entry in the Ensembl/Entrez
        pair list, plus one entry keyed by the Ensembl id itself for each
        input gene that has no Entrez match. Every value is passed through
        ``dict_nodup`` or ``dict_attrmerge``.

    Side effects:
        Populates ``self.ensembl2entrez_li`` and ``self.ensembl_main`` when
        they are empty, and prints three coverage counts to stdout.
    """
    # Lazily fill the cached lookup data on first use.
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()

    # The same pair list indexed by each of its two positions.
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

    # Build each id set once instead of once per use.
    source_ids = set(ensembl2x)
    mapped_ids = set(ensembl2entrez)
    unmapped_ids = source_ids - mapped_ids

    # print(...) with a single pre-formatted string produces the same
    # output under both Python 2 and Python 3.
    print('# of ensembl IDs in total: %d' % len(source_ids | mapped_ids))
    print('# of ensembl IDs match entrez Gene IDs: %d'
          % len(source_ids & mapped_ids))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d'
          % len(unmapped_ids))

    def _fn(eid, taxid=None):
        # Return a copy of the value: several Entrez ids can match the same
        # Ensembl gene (for example ENSMUSG00000027104 -->
        # (11909, 100047997)), and sharing one object between them causes
        # problems later.
        return copy.copy(ensembl2x.get(eid, {}))

    # All genes with a matched Entrez id.
    data = value_convert(entrez2ensembl, _fn)

    # Genes without a matched Entrez id keep their Ensembl id as the key.
    for eid in unmapped_ids:
        data[eid] = ensembl2x[eid]

    for gene_id in data:
        value = data[gene_id]
        # isinstance() works on Python 2 and 3; types.DictType exists only
        # on Python 2.
        if isinstance(value, dict):
            data[gene_id] = dict_nodup(value, sort=True)
        else:
            # One Entrez gene matched several Ensembl genes: merge them.
            data[gene_id] = dict_attrmerge(value, removedup=True, sort=True)

    return data