Exemple #1
0
 def load_ensembl2interpro(self):
     """Generate InterPro annotation documents from the Ensembl dump.

     Streams ``gene_ensembl__prot_interpro__dm.txt`` (located under
     ``self.data_folder``) chunk by chunk via ``tab2dict_iter``. Each
     record of a chunk is turned into an ``id`` / ``short_desc`` / ``desc``
     dict, the per-key result is wrapped under ``'interpro'``, and the chunk
     is then passed to ``map_id`` together with ``self.ensembl2entrez``.

     Yields:
         Every item produced by ``map_id`` for each chunk, in order.
     """
     def _as_interpro_record(fields):
         # `fields` holds the value columns requested from the file below.
         return {
             'id': fields[0],
             'short_desc': fields[1],
             'desc': fields[2],
         }

     def _wrap_under_interpro(records):
         # '__aslistofdict__' is a merging instruction for the later
         # merge_struct step: values stored under 'interpro' must be kept as
         # a list of dicts, e.g.
         #     {'a': 1, 'b': 2} + {'a': 3, 'b': 4}
         #     => [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
         # rather than merged key by key into {'a': [1, 3], 'b': [2, 4]}.
         return {
             'interpro': records,
             '__aslistofdict__': 'interpro',
         }

     interpro_path = os.path.join(
         self.data_folder, 'gene_ensembl__prot_interpro__dm.txt')
     for chunk in tab2dict_iter(interpro_path, (1, 4, 5, 6), 0):
         records = value_convert(dict_nodup(chunk), _as_interpro_record)
         # traverse_list=False: wrap the whole value (even when it is a
         # list) instead of wrapping each list element separately.
         wrapped = value_convert(
             records, _wrap_under_interpro, traverse_list=False)
         for doc in map_id(wrapped, self.ensembl2entrez):
             yield doc
Exemple #2
0
 def _load_ensembl_2taxid(self):
     """Return the ensembl2taxid mapping with integer taxids.

     Reads columns 0 and 1 of ``gene_ensembl__translation__main.txt``
     (under ``self.data_folder``) through ``tab2dict``, filtered by
     ``_not_LRG``, de-duplicates the result with ``dict_nodup`` and converts
     every value with ``int``.
     """
     translation_path = os.path.join(
         self.data_folder, 'gene_ensembl__translation__main.txt')
     raw_mapping = tab2dict(
         translation_path, (0, 1), 1, includefn=_not_LRG)
     # The taxids are read from the file as text; callers need integers.
     return value_convert(dict_nodup(raw_mapping), int)
Exemple #3
0
 def load_ensembl2pos(self):
     """Generate genomic-position documents from the Ensembl gene dump.

     Streams ``gene_ensembl__gene__main.txt`` (under ``self.data_folder``)
     chunk by chunk via ``tab2dict_iter``, skipping rows rejected by
     ``_not_LRG``. Each record becomes a dict with ``ensemblgene``, ``chr``,
     ``start``, ``end`` and ``strand`` (the last three converted to ``int``),
     is wrapped under ``'genomic_pos'``, and the chunk is passed to
     ``map_id`` together with ``self.ensembl2entrez``.

     The file is read only once: an earlier version also loaded the whole
     file eagerly into a dict that was never used afterwards.

     Yields:
         Every item produced by ``map_id`` for each chunk, in order.

     Raises:
         ValueError: if a start, end or strand field is not a valid integer.
     """
     def _to_position(fields):
         # fields[0] is the gene id (see the column note below); the other
         # entries follow the order of the remaining requested columns.
         return {
             'ensemblgene': fields[0],
             'chr': fields[3],
             'start': int(fields[1]),
             'end': int(fields[2]),
             'strand': int(fields[4]),
         }

     def _wrap_under_genomic_pos(position):
         # '__aslistofdict__' tells the later merge step to keep
         # 'genomic_pos' values as a list of dicts instead of merging keys.
         return {
             'genomic_pos': position,
             '__aslistofdict__': 'genomic_pos',
         }

     datafile = os.path.join(
         self.data_folder, 'gene_ensembl__gene__main.txt')
     # Column 1 is requested twice: the first occurrence is the dict key,
     # the second because the gene id is also needed within genomic_pos.
     columns = (1, 1, 3, 4, 5, 6)
     for datadict in tab2dict_iter(datafile, columns, 0, includefn=_not_LRG):
         datadict = value_convert(dict_nodup(datadict), _to_position)
         # traverse_list=False: wrap the whole value, not each list element.
         datadict = value_convert(
             datadict, _wrap_under_genomic_pos, traverse_list=False)
         for doc in map_id(datadict, self.ensembl2entrez):
             yield doc
Exemple #4
0
 def load_ensembl2pfam(self):
     """Generate Pfam annotation documents from the Ensembl dump.

     Streams ``gene_ensembl__prot_pfam__dm.txt`` (under
     ``self.data_folder``) chunk by chunk via ``tab2dict_iter``, wraps each
     key's value under ``'pfam'`` and passes the chunk to ``map_id``
     together with ``self.ensembl2entrez``.

     Yields:
         Every item produced by ``map_id`` for each chunk, in order.
     """
     def _wrap_under_pfam(value):
         return {'pfam': value}

     pfam_path = os.path.join(
         self.data_folder, 'gene_ensembl__prot_pfam__dm.txt')
     for chunk in tab2dict_iter(pfam_path, (1, 4), 0):
         # traverse_list=False: a key with several Pfam values keeps them
         # together under a single 'pfam' entry.
         wrapped = value_convert(
             dict_nodup(chunk), _wrap_under_pfam, traverse_list=False)
         for doc in map_id(wrapped, self.ensembl2entrez):
             yield doc
Exemple #5
0
    def convert2entrez(self, ensembl2x):
        '''Re-key a dict of ensembl gene ids onto matching entrezgene ids.

        Entries of ``ensembl2x`` whose ensembl id has a match in
        ``self.ensembl2entrez_li`` are stored under the matching entrezgene
        id(s); entries without a match keep their ensembl id as the key.

        Matched values are shallow-copied before being stored. Without the
        copy, several entrezgene ids matching the same ensembl gene, e.g.
            ENSMUSG00000027104 --> (11909, 100047997)
        would end up sharing one value object.

        ``self.ensembl2entrez_li`` and ``self.ensembl_main`` are loaded first
        when they are empty. Three summary counts are printed to stdout.

        Returns the resulting dict; every value has been passed through
        ``dict_nodup`` (dict values) or ``dict_attrmerge`` (anything else).
        '''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        # ensembl_main is not read below; it is still loaded here so the
        # attribute is populated exactly as before.
        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        input_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = input_ids - mapped_ids

        print('# of ensembl IDs in total: %d' %
              len(input_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' %
              len(input_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
              len(unmatched_ids))

        def _copy_value(ensembl_id, taxid=None):
            # taxid is accepted but unused.
            return copy.copy(ensembl2x.get(ensembl_id, {}))

        # All genes with a matched entrezgene id, indexed by that id.
        data = value_convert(entrez2ensembl, _copy_value)

        # Genes without a matched entrezgene id keep the ensembl id as key.
        for ensembl_id in unmatched_ids:
            data[ensembl_id] = ensembl2x[ensembl_id]

        for key in data:
            entry = data[key]
            if isinstance(entry, dict):
                data[key] = dict_nodup(entry, sort=True)
            else:
                # One entrez gene matches multiple ensembl genes.
                data[key] = dict_attrmerge(entry, removedup=True, sort=True)

        return data