Exemple #1
0
 def load_ensembl2interpro(self):
     """Yield InterPro annotation documents built from the Ensembl dump.

     Reads ``gene_ensembl__prot_interpro__dm.txt`` from ``self.data_folder``
     batch by batch, reshapes every value into an ``id`` / ``short_desc`` /
     ``desc`` record, wraps the result under the ``interpro`` key and yields
     whatever ``map_id`` produces for the batch with ``self.ensembl2entrez``.
     """
     def _to_record(fields):
         # positions 0, 1, 2 of each value become the three record fields
         return {
             'id': fields[0],
             'short_desc': fields[1],
             'desc': fields[2]
         }

     def _wrap(records):
         # '__aslistofdict__' is a merging instruction for the merge_struct
         # step that is called later: values stored under 'interpro' are to
         # be merged as a list of dicts,
         #   'interpro': {'a': 1, 'b': 2} + 'interpro': {'a': 3, 'b': 4}
         #   => 'interpro': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
         # rather than key by key,
         #   => 'interpro': {'a': [1, 3], 'b': [2, 4]}
         return {
             'interpro': records,
             '__aslistofdict__': 'interpro'
         }

     datafile = os.path.join(self.data_folder,
                             'gene_ensembl__prot_interpro__dm.txt')
     for batch in tab2dict_iter(datafile, (1, 4, 5, 6), 0):
         batch = dict_nodup(batch)
         batch = value_convert(batch, _to_record)
         # the wrapper is applied with traverse_list=False
         batch = value_convert(batch, _wrap, traverse_list=False)
         yield from map_id(batch, self.ensembl2entrez)
Exemple #2
0
    def load_ensembl_main(self):
        """Yield one document per Ensembl gene with taxid, symbol and name.

        Reads ``gene_ensembl__gene__main.txt`` from ``self.data_folder``
        (rows filtered by ``_not_LRG``) and yields dicts holding ``taxid``,
        optionally ``symbol`` and ``name``, plus ``_id`` set to the row key.

        Keys made only of digits are skipped with a warning; up to
        ``ERR_THRESHOLD`` such skips are tolerated.

        Raises:
            ValueError: when an all-digit id is met after ``ERR_THRESHOLD``
                of them have already been skipped.
        """
        # values that are empty or the literal '\N' are treated as missing
        missing = ('', '\\N')

        def _fn(x):
            out = {'taxid': int(x[0])}
            symbol = x[1].strip()
            if symbol not in missing:
                out['symbol'] = symbol
            raw_name = x[2].strip()
            if raw_name not in missing:
                # SubStr is called with an empty start marker and
                # ' [Source:' as end marker -- presumably this drops the
                # trailing source annotation; verify against SubStr
                _name = SubStr(raw_name, '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out

        skip_count = 0
        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__gene__main.txt')
        for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1,
                                      includefn=_not_LRG):
            datadict = value_convert(datadict, _fn)
            for gene_id, doc in datadict.items():
                if gene_id.isdigit():
                    if skip_count >= ERR_THRESHOLD:
                        raise ValueError(
                            'Too many ensembl ids are entirely numeric')
                    skip_count += 1
                    self.logger.warning(
                        "Document Skipped: All-digit id {}".format(gene_id))
                    continue
                doc['_id'] = gene_id
                yield doc
Exemple #3
0
    def load(self):
        """Yield one ``generif`` document per key found in ``self.datafile``.

        Each batch from ``tab2dict_iter`` is converted so that every value
        becomes ``{'generif': [{'pubmed': ..., 'text': ...}, ...]}``, where
        ``pubmed`` is the first element of a row passed through
        ``self._cvt_pubmed`` and ``text`` is the second element. The key is
        stored in the document as ``_id``.
        """
        def _to_generif(rows):
            return {
                'generif': [
                    dict(pubmed=self._cvt_pubmed(row[0]), text=row[1])
                    for row in rows
                ]
            }

        for datadict in tab2dict_iter(self.datafile, (1, 2, 4), 0,
                                      alwayslist=1):
            datadict = dict_convert(datadict, valuefn=_to_generif)
            for gene_id, doc in datadict.items():
                doc['_id'] = gene_id
                yield doc
Exemple #4
0
 def load_ensembl2pfam(self):
     """Yield Pfam annotation documents built from the Ensembl dump.

     Reads ``gene_ensembl__prot_pfam__dm.txt`` from ``self.data_folder``
     batch by batch, removes duplicates with ``dict_nodup``, stores each
     value under the ``pfam`` key and yields whatever ``map_id`` produces
     for the batch with ``self.ensembl2entrez``.
     """
     def _wrap(value):
         return {'pfam': value}

     datafile = os.path.join(
         self.data_folder, 'gene_ensembl__prot_pfam__dm.txt')
     for batch in tab2dict_iter(datafile, (1, 4), 0):
         batch = dict_nodup(batch)
         # the wrapper is applied with traverse_list=False
         batch = value_convert(batch, _wrap, traverse_list=False)
         yield from map_id(batch, self.ensembl2entrez)
Exemple #5
0
    def load(self, aslist=False):
        """Yield one ``{"_id": <key>, "go": {...}}`` document per batch.

        ``self.datafile`` is read through ``tab2dict_iter`` (rows filtered by
        ``self.species_filter``). The rows of each key are grouped under
        ``'MF'``, ``'BP'`` or ``'CC'``; a group holding a single record is
        stored as that record instead of a one-element list.

        Args:
            aslist: accepted for signature compatibility; not used here.

        Raises:
            AssertionError: if a converted batch does not hold exactly one key.
            KeyError: if a row carries a category other than 'Function',
                'Process' or 'Component'.
        """
        gene2go = tab2dict_iter(self.datafile, (1, 2, 3, 4, 5, 6, 7),
                                0,
                                alwayslist=1,
                                includefn=self.species_filter)
        category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

        def _ff(d):
            out = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                _gocategory = category_d[gocategory]
                _rec = dict(id=goid, term=goterm)
                # NOTE(review): 'Function' records store the code under
                # 'category' while the other two use 'gocategory'. This looks
                # inconsistent but is kept as is, since consumers of the
                # documents may rely on these key names -- confirm before
                # unifying.
                if gocategory == 'Function':
                    _rec['category'] = _gocategory
                else:
                    _rec['gocategory'] = _gocategory
                if evidence != '-':
                    _rec['evidence'] = evidence
                if qualifier != '-':
                    # normalize inconsistent capitalization in the input:
                    # Colocalizes_with -> colocalizes_with
                    # Contributes_with -> contributes_with
                    # Not -> NOT
                    _rec['qualifier'] = qualifier.replace('Co', 'co').replace(
                        'Not', 'NOT')
                if pubmed != '-':
                    # several ids are '|'-separated; a single id stays an int
                    pmids = [int(pid) for pid in pubmed.split('|')]
                    _rec['pubmed'] = pmids if len(pmids) > 1 else pmids[0]
                out.setdefault(_gocategory, []).append(_rec)
            for k in out:
                if len(out[k]) == 1:
                    out[k] = out[k][0]
            return out

        for gd in gene2go:
            items = list(dict_convert(gd, valuefn=_ff).items())
            if len(items) != 1:
                # raised explicitly so the check survives ``python -O``
                raise AssertionError("nope: %s" % items)
            gid, go = items[0]
            yield {"_id": gid, "go": go}
Exemple #6
0
 def load_ensembl2pos(self):
     """Yield genomic position documents built from the Ensembl gene dump.

     Reads ``gene_ensembl__gene__main.txt`` from ``self.data_folder`` batch
     by batch (rows filtered by ``_not_LRG``), converts each value into a
     position record stored under ``genomic_pos`` and yields whatever
     ``map_id`` produces for the batch with ``self.ensembl2entrez``.

     The file is only streamed: the former full in-memory pre-load of the
     same file was never used and has been removed.

     Raises:
         ValueError: if start, end or strand of a row cannot be parsed by
             ``int``.
     """
     def _to_position(x):
         return {
             'ensemblgene': x[0],
             'chr': x[3],
             'start': int(x[1]),
             'end': int(x[2]),
             'strand': int(x[4])
         }

     def _wrap(position):
         # '__aslistofdict__' is passed along with the value and names the
         # 'genomic_pos' key -- presumably a merge instruction for a later
         # step; verify against the consumer
         return {'genomic_pos': position, '__aslistofdict__': 'genomic_pos'}

     datafile = os.path.join(
         self.data_folder, 'gene_ensembl__gene__main.txt')
     # Column 1 is listed twice: the first occurrence is the dict key, the
     # second one is needed because the gene id is also stored within
     # genomic_pos
     for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0,
                                   includefn=_not_LRG):
         datadict = dict_nodup(datadict)
         datadict = value_convert(datadict, _to_position)
         datadict = value_convert(datadict, _wrap, traverse_list=False)
         for doc in map_id(datadict, self.ensembl2entrez):
             yield doc
Exemple #7
0
    def load(self, aslist=False):
        """Yield formatted accession documents read from ``self.datafile``.

        Each batch from ``tab2dict_iter`` (rows filtered by
        ``self.species_filter``) is converted with ``dict_convert`` and passed
        through ``self.format`` before being yielded. Every value becomes
        ``{self.fieldname: {...}}`` with the non-empty ones among ``rna``,
        ``protein``, ``genomic`` and ``translation``, or an empty dict when
        nothing is left.

        Args:
            aslist: accepted for signature compatibility; not used. This is a
                generator, so the former trailing ``return`` statements could
                not hand a value to callers iterating over it and have been
                removed.
        """
        gene2acc = tab2dict_iter(self.datafile, (1, 3, 5, 7),
                                 0,
                                 alwayslist=1,
                                 includefn=self.species_filter)

        def _ff(d):
            out = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
            for rna, prot, dna in d:
                # '-' is treated as "no accession"
                rna = None if rna == '-' else rna
                prot = None if prot == '-' else prot
                dna = None if dna == '-' else dna
                if rna is not None:
                    out['rna'].append(rna)
                if prot is not None:
                    out['protein'].append(prot)
                if dna is not None:
                    out['genomic'].append(dna)
                if rna and prot:
                    out['translation'].append({'rna': rna, 'protein': prot})
            # normalize every list (removes duplicates) and keep only the
            # fields that still hold something
            _out = {}
            for k, v in out.items():
                v = normalized_value(v)
                if v:
                    _out[k] = v
            if _out:
                _out = {self.fieldname: _out}
            return _out

        for gd in gene2acc:
            yield self.format(dict_convert(gd, valuefn=_ff))
Exemple #8
0
def load_data(data_folder):
    """Return a generator of Reactome pathway documents.

    ``NCBI2Reactome_All_Levels.txt`` is read from ``data_folder`` through
    ``tab2dict_iter``. Every batch must hold exactly one key; it is turned
    into ``{"_id": key, "pathway": {"reactome": ...}}`` where the value is a
    list of ``{"id", "name"}`` dicts, or a single such dict when there is
    only one entry.
    """
    datafile = os.path.join(data_folder, 'NCBI2Reactome_All_Levels.txt')
    batches = tab2dict_iter(datafile, (0, 1, 3), 0, header=0, alwayslist=True)

    def convert(data):
        for dvalue in data:
            assert len(dvalue) == 1
            _id = list(dvalue)[0]
            pathways = [{"id": row[0], "name": row[1]} for row in dvalue[_id]]
            # a lone pathway is stored as a dict rather than a 1-item list
            reactome = pathways[0] if len(pathways) == 1 else pathways
            yield {"_id": _id, "pathway": {"reactome": reactome}}

    return convert(batches)
Exemple #9
0
    def load_ensembl_main(self):
        """Yield one document per Ensembl gene with taxid, symbol and name.

        Reads ``gene_ensembl__gene__main.txt`` from ``self.data_folder``
        (rows filtered by ``_not_LRG``) and yields dicts holding ``taxid``,
        optionally ``symbol`` and ``name``, plus ``_id`` set to the row key.

        Raises:
            ValueError: if the taxid of a row cannot be parsed by ``int``.
        """
        # values that are empty or the literal '\N' are treated as missing
        missing = ('', '\\N')

        def _fn(x):
            out = {'taxid': int(x[0])}
            symbol = x[1].strip()
            if symbol not in missing:
                out['symbol'] = symbol
            raw_name = x[2].strip()
            if raw_name not in missing:
                # SubStr is called with an empty start marker and
                # ' [Source:' as end marker -- presumably this drops the
                # trailing source annotation; verify against SubStr
                _name = SubStr(raw_name, '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out

        datafile = os.path.join(self.data_folder,
                                'gene_ensembl__gene__main.txt')
        for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8),
                                      1,
                                      includefn=_not_LRG):
            datadict = value_convert(datadict, _fn)
            for gene_id, doc in datadict.items():
                doc['_id'] = gene_id
                yield doc
Exemple #10
0
 def load(self, aslist=False):
     """Yield formatted documents from ``self.datafile`` for known gene ids.

     Steps:
       1. load ``self.datafile`` fully into ``uni_d``;
       2. add an entry for every key that ``gene_history.gz`` maps to
          another id (rows whose second selected column is '-' are ignored);
       3. ask ``get_geneid_d`` for the gene ids, restricted to ``uni_d``;
       4. stream ``self.datafile`` again, keeping rows whose first column,
          as an int, is in that id collection, and yield ``self.format``
          of each batch.

     Args:
         aslist: accepted for signature compatibility; not used here.
     """
     uni_d = tab2dict(self.datafile, (0, 1), 0, alwayslist=0)
     history_file = os.path.join(self.data_folder, 'gene_history.gz')
     retired2gene = tab2dict(history_file, (1, 2),
                             1,
                             alwayslist=0,
                             includefn=lambda ld: ld[1] != '-')
     # iterate over a snapshot of the keys: the loop adds entries to uni_d
     for gene_id in list(uni_d):
         uni_d[retired2gene.get(gene_id, gene_id)] = uni_d[gene_id]
     geneid_d = get_geneid_d(self.data_folder,
                             self.species_li,
                             load_cache=False,
                             save_cache=False,
                             only_for=uni_d)
     gene2unigene = tab2dict_iter(
         self.datafile, (0, 1),
         0,
         alwayslist=0,
         includefn=lambda ld: int(ld[0]) in geneid_d)
     for doc in gene2unigene:
         yield self.format(doc)
Exemple #11
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file.

        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, pound
        sign marks the start of a comment):

            tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
            map_location description type_of_gene
            Symbol_from_nomenclature_authority
            Full_name_from_nomenclature_authority
            Nomenclature_status Other_designations Modification_date

        Columns 0-5, 7-9, 13 and 14 are read; column 1 is the key.

        Args:
            aslist: accepted for signature compatibility; not used here.

        Yields:
            ``self.format`` of each converted batch.

        Raises:
            ValueError: if a dbXrefs entry is not of the form ``db:id``
                (after the duplicated-prefix fix below), or if the tax id
                cannot be parsed by ``int``.
        '''
        gene_d = tab2dict_iter(self.datafile,
                               (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                               key=1,
                               alwayslist=0,
                               includefn=self.species_filter)

        def _ff(d):
            # modification_date is unpacked but not stored: as of 2017/12/10
            # its format is not uniform (e.g. "%m/%d/%Y %H:%M:%S" vs
            # "%Y%m%d"), so no "_timestamp" is derived from it here
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = d
            out = dict(taxid=int(taxid), symbol=symbol, name=description)
            # '-' marks a missing value in every optional column
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(
                    other_designations.split('|'))

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ['VGNC', 'HGNC', 'MGI']:
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                if len(xd) != 2:
                    # name the offending entry in the error instead of
                    # printing it and re-raising a bare unpacking error
                    raise ValueError(
                        "unexpected dbXrefs entry: %r" % x)
                _db, _id = xd
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly
                if _db.lower() in ['ensembl', 'imgt/gene-db']:
                    # we don't need 'IMGT/GENE-DB" xref either, because they
                    # are mostly the same as gene symbol
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out

        for d in gene_d:
            yield self.format(value_convert(d, _ff))