def load_ensembl2pfam(self):
     """Load the Pfam annotation table and re-key it by Entrez gene id.

     Reads ``gene_ensembl__prot_pfam__dm.txt`` from ``DATA_FOLDER`` through
     ``tab2dict`` (columns 1 and 4), nests each entry's value under a
     ``'pfam'`` key, and returns whatever ``self.convert2entrez`` produces
     for that mapping.
     """
     pfam_path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
     load_start(pfam_path)

     # NOTE(review): the trailing 0 presumably selects the first listed
     # column as the dictionary key -- confirm against tab2dict.
     raw_table = tab2dict(pfam_path, (1, 4), 0)
     pfam_by_gene = dict_nodup(raw_table)

     def _wrap_pfam(value):
         return {'pfam': value}

     # traverse_list=False is kept so the wrapper receives the whole value
     # (presumably even when that value is a list -- confirm in value_convert).
     pfam_by_gene = value_convert(pfam_by_gene, _wrap_pfam, traverse_list=False)

     load_done('[%d]' % len(pfam_by_gene))
     return self.convert2entrez(pfam_by_gene)
Exemple #2
0
 def load_ensembl2pfam(self):
     """Build the Pfam mapping and convert its keys via ``convert2entrez``.

     The source file is ``gene_ensembl__prot_pfam__dm.txt`` under
     ``DATA_FOLDER``; columns 1 and 4 are handed to ``tab2dict``. Each
     resulting value is placed under a ``'pfam'`` key before conversion.
     """
     source_file = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
     load_start(source_file)

     # NOTE(review): key index 0 presumably means "first selected column
     # is the key" -- verify against tab2dict.
     parsed = tab2dict(source_file, (1, 4), 0)
     mapping = dict_nodup(parsed)

     def _under_pfam_key(value):
         return {'pfam': value}

     # Wrap each entry as a whole; traverse_list stays False as in the
     # other loaders that add a top-level key.
     mapping = value_convert(mapping, _under_pfam_key, traverse_list=False)

     load_done('[%d]' % len(mapping))
     return self.convert2entrez(mapping)
 def _load_ensembl_2taxid(self):
     """Return the ensembl2taxid mapping with values converted to ``int``.

     Reads ``gene_ensembl__translation__main.txt`` from ``DATA_FOLDER``
     through ``tab2dict`` (columns 0 and 1, key index 1), with
     ``_not_LRG`` passed as the row filter. Unlike the other loaders the
     result is returned directly, without going through ``convert2entrez``.
     """
     translation_path = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
     load_start(translation_path)

     rows = tab2dict(translation_path, (0, 1), 1, includefn=_not_LRG)
     taxid_by_ensembl = dict_nodup(rows)

     # Values are parsed from text, so cast each one to an integer.
     taxid_by_ensembl = value_convert(taxid_by_ensembl, int)

     load_done('[%d]' % len(taxid_by_ensembl))
     return taxid_by_ensembl
 def load_ensembl2interpro(self):
     """Load InterPro annotations and re-key them via ``convert2entrez``.

     Reads ``gene_ensembl__prot_interpro__dm.txt`` from ``DATA_FOLDER``
     through ``tab2dict`` (columns 1, 4, 5 and 6). Each record becomes a
     dict with ``'id'``, ``'short_desc'`` and ``'desc'``, and every
     entry's value is then nested under an ``'interpro'`` key.
     """
     interpro_path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
     load_start(interpro_path)

     def _as_record(fields):
         # Positions 0..2 of the value tuple map to id / short_desc / desc.
         return {'id': fields[0], 'short_desc': fields[1], 'desc': fields[2]}

     def _wrap_interpro(value):
         return {'interpro': value}

     parsed = tab2dict(interpro_path, (1, 4, 5, 6), 0)
     interpro_by_gene = dict_nodup(parsed)
     # First pass uses value_convert's default list handling; the second
     # pass passes traverse_list=False so the wrapper gets the whole value.
     interpro_by_gene = value_convert(interpro_by_gene, _as_record)
     interpro_by_gene = value_convert(interpro_by_gene, _wrap_interpro, traverse_list=False)

     load_done('[%d]' % len(interpro_by_gene))
     return self.convert2entrez(interpro_by_gene)
 def load_ensembl2pos(self):
     """Load genomic positions and re-key them via ``convert2entrez``.

     Reads ``gene_ensembl__gene__main.txt`` from ``DATA_FOLDER`` through
     ``tab2dict`` (columns 1, 3, 4, 5 and 6) with ``_not_LRG`` as the row
     filter. Each record becomes ``{'chr', 'start', 'end', 'strand'}``
     with the three numeric fields cast to ``int``, and the result is
     nested under a ``'genomic_pos'`` key.
     """
     gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(gene_main_path)

     def _as_position(fields):
         # Value tuple order is (start, end, chr, strand).
         return {
             'chr': fields[2],
             'start': int(fields[0]),
             'end': int(fields[1]),
             'strand': int(fields[3]),
         }

     def _wrap_position(value):
         return {'genomic_pos': value}

     parsed = tab2dict(gene_main_path, (1, 3, 4, 5, 6), 0, includefn=_not_LRG)
     pos_by_gene = dict_nodup(parsed)
     pos_by_gene = value_convert(pos_by_gene, _as_position)
     pos_by_gene = value_convert(pos_by_gene, _wrap_position, traverse_list=False)

     load_done('[%d]' % len(pos_by_gene))
     return self.convert2entrez(pos_by_gene)
Exemple #6
0
 def _load_ensembl_2taxid(self):
     """Build and return the ensembl2taxid dictionary (integer values).

     The table comes from ``gene_ensembl__translation__main.txt`` in
     ``DATA_FOLDER``; ``tab2dict`` is given columns 0 and 1 with key
     index 1 and ``_not_LRG`` as ``includefn``. The mapping is returned
     as-is, without conversion to Entrez keys.
     """
     source_file = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
     load_start(source_file)

     parsed = tab2dict(source_file, (0, 1), 1, includefn=_not_LRG)
     ensembl_to_taxid = dict_nodup(parsed)

     # The taxid arrives as text and must be an integer from here on.
     ensembl_to_taxid = value_convert(ensembl_to_taxid, int)

     load_done('[%d]' % len(ensembl_to_taxid))
     return ensembl_to_taxid
Exemple #7
0
 def load_ensembl2interpro(self):
     """Build the InterPro mapping and convert its keys via ``convert2entrez``.

     Source: ``gene_ensembl__prot_interpro__dm.txt`` under ``DATA_FOLDER``,
     columns 1, 4, 5 and 6. Records are reshaped into dicts keyed
     ``'id'`` / ``'short_desc'`` / ``'desc'`` and then placed under a
     top-level ``'interpro'`` key.
     """
     source_file = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
     load_start(source_file)

     def _to_interpro_doc(row):
         return {'id': row[0], 'short_desc': row[1], 'desc': row[2]}

     def _under_interpro_key(value):
         return {'interpro': value}

     mapping = dict_nodup(tab2dict(source_file, (1, 4, 5, 6), 0))
     # Reshape records first, then wrap each entry's whole value
     # (traverse_list=False on the second pass only).
     mapping = value_convert(mapping, _to_interpro_doc)
     mapping = value_convert(mapping, _under_interpro_key, traverse_list=False)

     load_done('[%d]' % len(mapping))
     return self.convert2entrez(mapping)
Exemple #8
0
 def load_ensembl2pos(self):
     """Build the genomic-position mapping and convert keys via ``convert2entrez``.

     Source: ``gene_ensembl__gene__main.txt`` under ``DATA_FOLDER``,
     columns 1, 3, 4, 5 and 6, filtered with ``_not_LRG``. Records are
     reshaped into ``'chr'`` / ``'start'`` / ``'end'`` / ``'strand'``
     dicts (the last three as ``int``) under a ``'genomic_pos'`` key.
     """
     source_file = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(source_file)

     def _to_position_doc(row):
         # row is (start, end, chr, strand) in that order.
         return {
             'chr': row[2],
             'start': int(row[0]),
             'end': int(row[1]),
             'strand': int(row[3]),
         }

     def _under_genomic_pos_key(value):
         return {'genomic_pos': value}

     mapping = dict_nodup(
         tab2dict(source_file, (1, 3, 4, 5, 6), 0, includefn=_not_LRG))
     mapping = value_convert(mapping, _to_position_doc)
     mapping = value_convert(mapping, _under_genomic_pos_key, traverse_list=False)

     load_done('[%d]' % len(mapping))
     return self.convert2entrez(mapping)
Exemple #9
0
    def convert2entrez(self, ensembl2x):
        """Re-key a dict of Ensembl gene ids so matching Entrez gene ids are the keys.

        Args:
            ensembl2x: dict keyed by Ensembl gene id. Values are expected to
                be dicts (they are shallow-copied and de-duplicated below).

        Returns:
            A dict containing one entry per Entrez gene id found in
            ``self.ensembl2entrez_li`` plus one entry, keyed by the Ensembl
            id itself, for every key of ``ensembl2x`` that has no Entrez
            match. Every value is passed through ``dict_nodup`` (when it is
            a dict) or ``dict_attrmerge`` (otherwise) before being returned.

        Side effects:
            Lazily populates ``self.ensembl2entrez_li`` and
            ``self.ensembl_main`` when they are empty, and prints three
            summary counts to stdout.
        """
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # Build each id set once; they feed the three counts and the
        # unmatched-id loop below.
        input_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = input_ids - mapped_ids

        # Now make a dictionary indexed by entrez gene id
        print('# of ensembl IDs in total: %d' % len(input_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' %
              len(input_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
              len(unmatched_ids))

        # All genes with a matched entrez id.
        def _fn(eid):
            # A copy is required: several entrezgene ids can match the same
            # ensembl gene, for example
            #      ENSMUSG00000027104 --> (11909, 100047997)
            # and they must not share one value object.
            return copy.copy(ensembl2x.get(eid, {}))

        data = value_convert(entrez2ensembl, _fn)

        # Add those with no matched entrez geneid, using the ensembl id as
        # the key.
        for eid in unmatched_ids:
            data[eid] = ensembl2x[eid]

        # Only existing keys are reassigned, so iterating over ``data``
        # while updating it is safe.
        for key in data:
            value = data[key]
            if isinstance(value, dict):
                data[key] = dict_nodup(value, sort=True)
            else:
                # One entrez gene matches multiple ensembl genes.
                data[key] = dict_attrmerge(value, removedup=True, sort=True)

        return data
 def load_ensembl2pos(self):
     """Load genomic positions, including the gene id, keyed via ``convert2entrez``.

     Reads ``gene_ensembl__gene__main.txt`` from ``DATA_FOLDER`` with
     ``_not_LRG`` as the row filter. Each record becomes a dict with
     ``'ensemblgene'``, ``'chr'``, ``'start'``, ``'end'`` and ``'strand'``
     (the last three cast to ``int``) nested under ``'genomic_pos'``.
     """
     gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(gene_main_path)

     # Column 1 is listed twice on purpose: the first occurrence is the
     # dict key, the second keeps the gene id inside the genomic_pos value.
     wanted_columns = (1, 1, 3, 4, 5, 6)

     def _as_position(fields):
         # Value tuple order is (gene id, start, end, chr, strand).
         return {
             'ensemblgene': fields[0],
             'chr': fields[3],
             'start': int(fields[1]),
             'end': int(fields[2]),
             'strand': int(fields[4]),
         }

     def _wrap_position(value):
         return {'genomic_pos': value}

     parsed = tab2dict(gene_main_path, wanted_columns, 0, includefn=_not_LRG)
     pos_by_gene = dict_nodup(parsed)
     pos_by_gene = value_convert(pos_by_gene, _as_position)
     pos_by_gene = value_convert(pos_by_gene, _wrap_position, traverse_list=False)

     load_done('[%d]' % len(pos_by_gene))
     return self.convert2entrez(pos_by_gene)
Exemple #11
0
    def convert2entrez(self, ensembl2x):
        """Re-key a dict of Ensembl gene ids so matching Entrez gene ids are the keys.

        Args:
            ensembl2x: dict keyed by Ensembl gene id. Values are expected to
                be dicts (they are shallow-copied and de-duplicated below).

        Returns:
            A dict containing one entry per Entrez gene id found in
            ``self.ensembl2entrez_li`` plus one entry, keyed by the Ensembl
            id itself, for every key of ``ensembl2x`` that has no Entrez
            match. Every value is passed through ``dict_nodup`` (when it is
            a dict) or ``dict_attrmerge`` (otherwise) before being returned.

        Side effects:
            Lazily populates ``self.ensembl2entrez_li`` and
            ``self.ensembl_main`` when they are empty, and prints three
            summary counts to stdout.
        """
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # Build each id set once; they feed the three counts and the
        # unmatched-id loop below.
        input_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = input_ids - mapped_ids

        # Now make a dictionary indexed by entrez gene id.
        # print(...) with a single parenthesised argument produces the same
        # output under both Python 2 and Python 3.
        print('# of ensembl IDs in total: %d' % len(input_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' % len(input_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' % len(unmatched_ids))

        # All genes with a matched entrez id.
        def _fn(eid):
            # A copy is required: several entrezgene ids can match the same
            # ensembl gene, for example
            #      ENSMUSG00000027104 --> (11909, 100047997)
            # and they must not share one value object.
            return copy.copy(ensembl2x.get(eid, {}))

        data = value_convert(entrez2ensembl, _fn)

        # Add those with no matched entrez geneid, using the ensembl id as
        # the key.
        for eid in unmatched_ids:
            data[eid] = ensembl2x[eid]

        # Only existing keys are reassigned, so iterating over ``data``
        # while updating it is safe.
        for key in data:
            value = data[key]
            # isinstance replaces the Python-2-only types.DictType check.
            if isinstance(value, dict):
                data[key] = dict_nodup(value, sort=True)
            else:
                # One entrez gene matches multiple ensembl genes.
                data[key] = dict_attrmerge(value, removedup=True, sort=True)

        return data