Ejemplo n.º 1
0
    def load(self, aslist=False):
        '''
        Build the gene -> unigene mapping from ``self.datafile``.

        Only rows whose first column, read as an integer, is contained in
        the result of ``get_geneid_d(self.species_li)`` are kept.

        :param aslist: if true, the result is passed through
                       ``dict_to_list`` before being returned.
        :return: ``{gene_id: {'unigene': value}}``, or the
                 ``dict_to_list`` conversion of that dictionary.
        '''
        load_start(self.datafile)
        print()
        valid_geneids = get_geneid_d(self.species_li)

        def _is_known_gene(ld):
            # ld[0] is compared as an int against the valid gene ids
            return int(ld[0]) in valid_geneids

        unigene_by_gene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                                   includefn=_is_known_gene)
        # wrap every value under the 'unigene' field name
        gene_d = {gid: {'unigene': unigene}
                  for gid, unigene in unigene_by_gene.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Ejemplo n.º 2
0
    def load(self, aslist=False):
        '''
        Read ``self.datafile`` and return one ``{'unigene': ...}`` document
        per gene id.

        A row is included only if ``int(row[0])`` is found in the
        collection returned by ``get_geneid_d(self.species_li)``.

        :param aslist: when true, return ``dict_to_list(gene_d)`` rather
                       than the dictionary.
        :return: dictionary keyed by gene id (or its list conversion).
        '''
        load_start(self.datafile)
        print()
        geneid_d = get_geneid_d(self.species_li)

        def _keep_row(ld):
            return int(ld[0]) in geneid_d

        gene2unigene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                                includefn=_keep_row)
        gene_d = dict(
            (gid, {'unigene': unigene})
            for gid, unigene in gene2unigene.items()
        )
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
Ejemplo n.º 3
0
    def load(self, aslist=False):
        '''
        Load GO annotations from ``self.datafile`` into per-gene documents.

        Each gene gets a ``'go'`` dictionary keyed by the short category
        code ('MF', 'BP' or 'CC').  A category holding exactly one record
        is stored as that record itself, otherwise as a list of records.
        Every record has ``id`` and ``term`` and, when the source value is
        not '-', ``evidence``, ``qualifier`` and ``pubmed``.

        :param aslist: if true, return ``dict_to_list(gene_d)``.
        :return: ``{gene_id: {'go': {...}}}`` or its list conversion.
        '''
        load_start(self.datafile)
        go_rows_by_gene = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7),
                                   0,
                                   alwayslist=1,
                                   includefn=self.species_filter)
        category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

        def _ff(d):
            grouped = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                short_category = category_d[gocategory]
                record = {'id': goid, 'term': goterm}
                if evidence != '-':
                    record['evidence'] = evidence
                if qualifier != '-':
                    # also fix some inconsistencies in the NCBI data:
                    #   Colocalizes_with -> colocalizes_with
                    #   Contributes_with -> contributes_with
                    #   Not -> NOT
                    lowered = qualifier.replace('Co', 'co')
                    record['qualifier'] = lowered.replace('Not', 'NOT')
                if pubmed != '-':
                    if '|' in pubmed:
                        # several ids separated by '|' -> list of ints
                        record['pubmed'] = [
                            int(pid) for pid in pubmed.split('|')]
                    else:
                        record['pubmed'] = int(pubmed)
                grouped.setdefault(short_category, []).append(record)
            # a single record is stored bare instead of in a 1-item list
            return {
                category: (records[0] if len(records) == 1 else records)
                for category, records in grouped.items()
            }

        gene2go = dict_convert(go_rows_by_gene, valuefn=_ff)
        gene_d = {gid: {'go': go} for gid, go in gene2go.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Ejemplo n.º 4
0
    def load(self, aslist=False):
        '''
        Parse ``self.datafile`` and return the GO annotations of each gene.

        The rows of one gene are grouped by category ('Function' -> 'MF',
        'Process' -> 'BP', 'Component' -> 'CC').  Fields whose source
        value is '-' are left out of a record; a category with only one
        record holds that record directly rather than a list.

        :param aslist: when true, the result goes through ``dict_to_list``.
        :return: ``{gene_id: {'go': {...}}}`` or its list conversion.
        '''
        load_start(self.datafile)
        gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                           alwayslist=1, includefn=self.species_filter)
        category_d = dict(Function='MF', Process='BP', Component='CC')

        def _parse_pubmed(pubmed):
            # '|'-separated ids become a list of ints, a lone id one int
            if '|' in pubmed:
                return [int(pid) for pid in pubmed.split('|')]
            return int(pubmed)

        def _ff(d):
            by_category = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                _gocategory = category_d[gocategory]
                _rec = dict(id=goid, term=goterm)
                if evidence != '-':
                    _rec['evidence'] = evidence
                if qualifier != '-':
                    # normalise inconsistent capitalisation in NCBI data:
                    # Colocalizes_with -> colocalizes_with
                    # Contributes_with -> contributes_with
                    # Not -> NOT
                    _rec['qualifier'] = qualifier.replace('Co', 'co').replace('Not', 'NOT')
                if pubmed != '-':
                    _rec['pubmed'] = _parse_pubmed(pubmed)
                by_category.setdefault(_gocategory, []).append(_rec)

            out = {}
            for category, records in by_category.items():
                # unwrap one-element lists
                out[category] = records[0] if len(records) == 1 else records
            return out

        gene2go = dict_convert(gene2go, valuefn=_ff)
        gene_d = dict((gid, {'go': go}) for gid, go in gene2go.items())
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
Ejemplo n.º 5
0
    def load(self, aslist=False):
        '''
        Load the gene -> retired ids mapping from ``self.datafile``.

        Rows whose second column is '-' are always skipped.  When
        ``self.species_li`` is truthy, a row must additionally have its
        first column (as an int) in ``self.taxid_set``.

        :param aslist: if true, return ``dict_to_list(gene_d)``.
        :return: ``{gene_id: {'retired': value}}`` where ``value`` is the
                 ``normalized_value`` of the integer ids, or the list
                 conversion of that dictionary.
        '''
        load_start(self.datafile)

        def _has_current_id(ld):
            return ld[1] != '-'

        def _has_current_id_in_species(ld):
            return int(ld[0]) in self.taxid_set and ld[1] != '-'

        if self.species_li:
            _includefn = _has_current_id_in_species
        else:
            _includefn = _has_current_id

        def _to_int_ids(values):
            # ids are read as strings; store them as integers
            return normalized_value([int(value) for value in values])

        retired_by_gene = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                                   includefn=_includefn)
        retired_by_gene = dict_convert(retired_by_gene, valuefn=_to_int_ids)

        gene_d = {gid: {'retired': retired}
                  for gid, retired in retired_by_gene.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Ejemplo n.º 6
0
    def load(self, aslist=False):
        '''
        Read ``self.datafile`` and collect the retired ids of every gene.

        A row is used only if its second column is not '-'; if
        ``self.species_li`` is set, ``int(row[0])`` must also be a member
        of ``self.taxid_set``.

        :param aslist: when true, the result goes through ``dict_to_list``.
        :return: dictionary ``{gene_id: {'retired': ...}}`` or its list
                 conversion.
        '''
        load_start(self.datafile)
        restrict_to_species = bool(self.species_li)

        def _includefn(ld):
            # the species test runs first, exactly when a species list is set
            if restrict_to_species and int(ld[0]) not in self.taxid_set:
                return False
            return ld[1] != '-'

        def _as_normalized_ints(x):
            return normalized_value([int(xx) for xx in x])

        gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                                includefn=_includefn)
        gene2retired = dict_convert(gene2retired, valuefn=_as_normalized_ints)

        gene_d = dict(
            (gid, {'retired': retired})
            for gid, retired in gene2retired.items()
        )
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
Ejemplo n.º 7
0
    def load(self, aslist=False):
        '''
        Load per-gene accession numbers from ``self.datafile``.

        For each gene the rna / protein / genomic accessions that are not
        '-' are collected, plus a ``translation`` entry
        ``{'rna': ..., 'protein': ...}`` for every row that has both an
        rna and a protein accession.  Each collection is passed through
        ``normalized_value`` and empty results are dropped.  A non-empty
        result is wrapped as ``{self.fieldname: {...}}``.

        :param aslist: if true, return ``dict_to_list(gene2acc)``.
        :return: the converted dictionary keyed by gene id, or its list
                 conversion.
        '''
        load_start(self.datafile)
        acc_rows_by_gene = tab2dict(self.datafile, (1, 3, 5, 7), 0,
                                    alwayslist=1,
                                    includefn=self.species_filter)

        def _ff(d):
            rnas, proteins, genomics, translations = [], [], [], []
            for rna, prot, dna in d:
                # '-' marks a missing accession
                rna = None if rna == '-' else rna
                prot = None if prot == '-' else prot
                dna = None if dna == '-' else dna
                if rna is not None:
                    rnas.append(rna)
                if prot is not None:
                    proteins.append(prot)
                if dna is not None:
                    genomics.append(dna)
                if rna and prot:
                    translations.append({'rna' : rna, 'protein' : prot})

            collected = (
                ('rna', rnas),
                ('protein', proteins),
                ('genomic', genomics),
                ('translation', translations),
            )
            _out = {}
            for key, values in collected:
                # remove dup, then skip fields that end up empty
                cleaned = normalized_value(values)
                if cleaned:
                    _out[key] = cleaned
            return {self.fieldname: _out} if _out else _out

        gene2acc = dict_convert(acc_rows_by_gene, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        return dict_to_list(gene2acc) if aslist else gene2acc
Ejemplo n.º 8
0
    def load(self, aslist=False):
        '''
        Load per-gene accession numbers (without version suffix) from
        ``self.datafile``.

        For each gene the rna / protein / genomic accessions that are not
        '-' are collected, keeping only the text before the first '.'.
        Each collection is passed through ``normalized_value`` and empty
        results are dropped.  A non-empty result is wrapped as
        ``{self.fieldname: {...}}``.

        :param aslist: when true, the result goes through ``dict_to_list``.
        :return: the converted dictionary keyed by gene id, or its list
                 conversion.
        '''
        load_start(self.datafile)
        gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                            includefn=self.species_filter)
        acc_fields = ('rna', 'protein', 'genomic')

        def _ff(d):
            collected = dict((name, []) for name in acc_fields)
            for rna_acc, prot_acc, dna_acc in d:
                for name, acc in zip(acc_fields, (rna_acc, prot_acc, dna_acc)):
                    if acc != '-':
                        # trim version number after dot
                        collected[name].append(acc.split('.')[0])

            _out = {}
            for name in acc_fields:
                # remove dup, and leave out fields that end up empty
                cleaned = normalized_value(collected[name])
                if cleaned:
                    _out[name] = cleaned
            if not _out:
                return _out
            return {self.fieldname: _out}

        gene2acc = dict_convert(gene2acc, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        if not aslist:
            return gene2acc
        return dict_to_list(gene2acc)
Ejemplo n.º 9
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file.
        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, a pound
        sign starts a comment).  Columns, with their 0-based index:
             0 tax_id
             1 GeneID
             2 Symbol
             3 LocusTag
             4 Synonyms
             5 dbXrefs
             6 chromosome
             7 map_location
             8 description
             9 type_of_gene
            10 Symbol_from_nomenclature_authority
            11 Full_name_from_nomenclature_authority
            12 Nomenclature_status
            13 Other_designations
            14 Modification_date

        Columns 6 and 10-12 are not read.

        :param aslist: if true, return ``dict_to_list(gene_d)`` instead of
                       the dictionary.
        :return: ``{gene_id: document}`` (or its list conversion).  Each
                 document holds ``taxid``, ``symbol``, ``name``,
                 ``_timestamp``, ``entrezgene``, the optional fields whose
                 source value is not '-', and one key per kept dbXref.
        :raises ValueError: if a dbXrefs entry does not split into exactly
                            a database name and an id (the offending entry
                            is printed before re-raising).
        '''
        load_start(self.datafile)
        gene_d = tab2dict(self.datafile, (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                          key=1,
                          alwayslist=0,
                          includefn=self.species_filter)

        def _ff(d):
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = d
            out = dict(taxid=int(taxid), symbol=symbol, name=description)
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(
                    other_designations.split('|'))

            # when merged, this will become the default timestamp
            out["_timestamp"] = datetime.datetime.strptime(
                modification_date, "%Y%m%d")

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ('VGNC', 'HGNC', 'MGI'):
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # unexpected xref shape: show the raw entry, then fail
                    print(repr(x))
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly
                if _db.lower() in ('ensembl', 'imgt/gene-db'):
                    # we don't need 'IMGT/GENE-DB" xref either, because they
                    # are mostly the same as gene symbol
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out

        gene_d = value_convert(gene_d, _ff)

        # add entrezgene field
        for geneid in gene_d:
            d = gene_d[geneid]
            d['entrezgene'] = int(geneid)
            # explicit write-back kept from the original code
            gene_d[geneid] = d

        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        return gene_d
Ejemplo n.º 10
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file.
        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, a pound
        sign starts a comment).  Columns, with their 0-based index:
             0 tax_id
             1 GeneID
             2 Symbol
             3 LocusTag
             4 Synonyms
             5 dbXrefs
             6 chromosome
             7 map_location
             8 description
             9 type_of_gene
            10 Symbol_from_nomenclature_authority
            11 Full_name_from_nomenclature_authority
            12 Nomenclature_status
            13 Other_designations
            14 Modification_date

        Only columns 0, 1, 2, 4, 5, 7, 8 and 9 are read.

        :param aslist: if true, return ``dict_to_list(gene_d)`` instead of
                       the dictionary.
        :return: ``{gene_id: document}`` (or its list conversion).  Each
                 document holds ``taxid``, ``symbol``, ``name``,
                 ``entrezgene``, the optional fields whose source value is
                 not '-', and one key per kept dbXref.
        :raises ValueError: if a dbXrefs entry does not split into exactly
                            a database name and an id (the offending entry
                            is printed before re-raising).
        '''
        load_start(self.datafile)
        gene_d = tab2dict(self.datafile, (0, 1, 2, 4, 5, 7, 8, 9), key=1,
                          alwayslist=0, includefn=self.species_filter)

        def _ff(d):
            (
                taxid, symbol, synonyms,
                dbxrefs, map_location,
                description, type_of_gene
            ) = d
            out = dict(taxid=int(taxid),
                       symbol=symbol,
                       name=description)
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and xd[0] in ('HGNC', 'MGI'):
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # unexpected xref shape: show the raw entry, then fail
                    print(x)
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly; we don't need 'IMGT/GENE-DB" xref either,
                # because they are mostly the same as gene symbol
                if _db.lower() in ('ensembl', 'imgt/gene-db'):
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out

        gene_d = value_convert(gene_d, _ff)

        # add entrezgene field
        for geneid in gene_d:
            d = gene_d[geneid]
            d['entrezgene'] = int(geneid)
            # explicit write-back kept from the original code
            gene_d[geneid] = d

        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        return gene_d