def load_ensembl2interpro(self):
    """Yield InterPro documents built from the Ensembl InterPro dump.

    Streams ``gene_ensembl__prot_interpro__dm.txt`` (found in
    ``self.data_folder``) chunk by chunk through ``tab2dict_iter``. Every
    record is reshaped into a dict with ``id``, ``short_desc`` and ``desc``
    keys, wrapped under the ``interpro`` key, and each chunk is then handed
    to ``map_id`` together with ``self.ensembl2entrez``; whatever ``map_id``
    produces is yielded unchanged.
    """
    def _as_record(fields):
        # fields holds the selected columns of one row, in selection order
        return {
            'id': fields[0],
            'short_desc': fields[1],
            'desc': fields[2],
        }

    def _wrap(records):
        # '__aslistofdict__' is a merging instruction for the merge_struct
        # call that happens later: values stored under 'interpro' must be
        # merged as a list of dicts instead of merging each key into a list.
        #   'interpro': {'a': 1, 'b': 2} and 'interpro': {'a': 3, 'b': 4}
        # should become
        #   'interpro': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
        # and not
        #   'interpro': {'a': [1, 3], 'b': [2, 4]}
        return {
            'interpro': records,
            '__aslistofdict__': 'interpro',
        }

    interpro_file = os.path.join(
        self.data_folder, 'gene_ensembl__prot_interpro__dm.txt')
    for chunk in tab2dict_iter(interpro_file, (1, 4, 5, 6), 0):
        chunk = dict_nodup(chunk)
        # TODO (original author's note): optimize with one call/convert
        chunk = value_convert(chunk, _as_record)
        chunk = value_convert(chunk, _wrap, traverse_list=False)
        for doc in map_id(chunk, self.ensembl2entrez):
            yield doc
def _load_ensembl_2taxid(self):
    """Return the ensembl2taxid mapping with integer taxid values.

    The mapping is read from ``gene_ensembl__translation__main.txt`` in
    ``self.data_folder`` using columns 0 and 1, keyed on the second of
    them, with rows filtered by ``_not_LRG``.
    """
    translation_file = os.path.join(
        self.data_folder, 'gene_ensembl__translation__main.txt')
    raw_mapping = tab2dict(
        translation_file, (0, 1), 1, includefn=_not_LRG)
    taxid_by_ensembl = dict_nodup(raw_mapping)
    # the taxid has to be converted to an integer at this point
    return value_convert(taxid_by_ensembl, int)
def load_ensembl2pos(self):
    """Yield genomic-position documents built from the Ensembl gene dump.

    Streams ``gene_ensembl__gene__main.txt`` (found in ``self.data_folder``)
    chunk by chunk through ``tab2dict_iter``, skipping rows rejected by
    ``_not_LRG``. Each record becomes a dict with ``ensemblgene``, ``chr``,
    ``start``, ``end`` and ``strand`` keys (the last three as integers),
    wrapped under ``genomic_pos``. Every chunk is then passed to ``map_id``
    with ``self.ensembl2entrez`` and the resulting documents are yielded.

    The file is only read in streaming mode: an earlier version also loaded
    the whole file into an unused in-memory dict before iterating, which
    cost a second full parse and the memory for it without affecting the
    output.

    Raises:
        ValueError: if a start, end or strand field is not a valid integer
            (raised by ``int`` while the offending chunk is converted).
    """
    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    # Column 1 appears twice: the first occurrence is the dict key, the
    # second is needed so the gene id is available inside genomic_pos.
    columns = (1, 1, 3, 4, 5, 6)

    def _to_position(fields):
        # fields holds the non-key columns of one row, in selection order
        return {
            'ensemblgene': fields[0],
            'chr': fields[3],
            'start': int(fields[1]),
            'end': int(fields[2]),
            'strand': int(fields[4]),
        }

    def _wrap(positions):
        # '__aslistofdict__' tells the later merge step to keep multiple
        # genomic_pos values as a list of dicts rather than merging keys.
        return {
            'genomic_pos': positions,
            '__aslistofdict__': 'genomic_pos',
        }

    for datadict in tab2dict_iter(datafile, columns, 0, includefn=_not_LRG):
        datadict = dict_nodup(datadict)
        datadict = value_convert(datadict, _to_position)
        datadict = value_convert(datadict, _wrap, traverse_list=False)
        for doc in map_id(datadict, self.ensembl2entrez):
            yield doc
def load_ensembl2pfam(self):
    """Yield Pfam documents built from the Ensembl Pfam dump.

    Streams ``gene_ensembl__prot_pfam__dm.txt`` (found in
    ``self.data_folder``) chunk by chunk through ``tab2dict_iter``, stores
    each value under the ``pfam`` key, and yields the documents that
    ``map_id`` produces for the chunk with ``self.ensembl2entrez``.
    """
    def _wrap(pfam_value):
        return {'pfam': pfam_value}

    pfam_file = os.path.join(
        self.data_folder, 'gene_ensembl__prot_pfam__dm.txt')
    for chunk in tab2dict_iter(pfam_file, (1, 4), 0):
        chunk = dict_nodup(chunk)
        # traverse_list=False: wrap the whole value, not each list element
        chunk = value_convert(chunk, _wrap, traverse_list=False)
        for doc in map_id(chunk, self.ensembl2entrez):
            yield doc
def convert2entrez(self, ensembl2x):
    '''Convert a dict with ensembl gene ids as the keys to matching
    entrezgene ids as the keys.

    Ensembl ids that have no matching entrez gene id are kept in the result
    under their own ensembl id.

    Args:
        ensembl2x: dict keyed by ensembl gene id.

    Returns:
        dict keyed by entrez gene id (or by ensembl gene id for unmatched
        genes). Each value has been passed through ``dict_nodup`` when it
        is a dict, or through ``dict_attrmerge`` otherwise.

    Side effects:
        Loads ``self.ensembl2entrez_li`` and ``self.ensembl_main`` when they
        are empty, and prints three matching statistics.
    '''
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()

    # presumably list2dict indexes the pairs by the given column
    # (0 -> ensembl id, 1 -> entrez id) -- TODO confirm against list2dict
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

    # Build both key sets once; they feed the statistics below as well as
    # the unmatched-gene loop further down.
    ensembl_ids = set(ensembl2x)
    mapped_ids = set(ensembl2entrez)
    unmatched_ids = ensembl_ids - mapped_ids

    # Now make a dictionary indexed by entrez gene id
    print('# of ensembl IDs in total: %d' % len(ensembl_ids | mapped_ids))
    print('# of ensembl IDs match entrez Gene IDs: %d' %
          len(ensembl_ids & mapped_ids))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
          len(unmatched_ids))

    # all genes with matched entrez
    def _copy_annotation(eid, taxid=None):
        # A copy of the value is needed here; otherwise it will cause
        # issues when multiple entrezgene ids match the same ensembl gene,
        # for example ENSMUSG00000027104 --> (11909, 100047997).
        # `taxid` is unused and only kept so the callback signature stays
        # the same.
        return copy.copy(ensembl2x.get(eid, {}))

    data = value_convert(entrez2ensembl, _copy_annotation)

    # add those that have no matched entrez geneid, using the ensembl id as
    # the key
    for eid in unmatched_ids:
        data[eid] = ensembl2x[eid]

    # Only values of existing keys are replaced, so the dict size does not
    # change while it is being iterated.
    for gene_id, value in data.items():
        if isinstance(value, dict):
            data[gene_id] = dict_nodup(value, sort=True)
        else:
            # if one entrez gene matches multiple ensembl genes
            data[gene_id] = dict_attrmerge(value, removedup=True, sort=True)
    return data