Example #1
0
        def _fn(x, eid):
            """Assemble the transcript/protein record stored for gene `eid`.

            `x` is either one (transcript_id, protein_id) pair or a list of
            such pairs.  An id is ignored when it is empty or equal to the
            null marker (`null_marker` below).  Pairs in which both ids are
            usable are additionally recorded under 'translation'.

            For a list input the collected ids are passed through
            normalized_value; for a single pair they are stored as-is.
            """
            null_marker = '\\N'
            out = {'gene': eid, 'translation': []}

            def usable(value):
                # falsy ids and the null marker both mean "no id here"
                return bool(value) and value != null_marker

            if isinstance(x, list):
                transcripts = []
                proteins = []
                for pair in x:
                    trid, pid = pair[0], pair[1]
                    has_transcript = usable(trid)
                    has_protein = usable(pid)
                    if has_transcript:
                        transcripts.append(trid)
                    if has_protein:
                        proteins.append(pid)
                    if has_transcript and has_protein:
                        out['translation'].append({"rna": trid, "protein": pid})

                if transcripts:
                    out['transcript'] = normalized_value(transcripts)
                if proteins:
                    out['protein'] = normalized_value(proteins)
                return out

            # single pair: ids are stored directly, without normalized_value
            trid, pid = x[0], x[1]
            has_transcript = usable(trid)
            has_protein = usable(pid)
            if has_transcript:
                out['transcript'] = trid
            if has_protein:
                out['protein'] = pid
            if has_transcript and has_protein:
                out['translation'].append({"rna": trid, "protein": pid})
            return out
Example #2
0
        def _fn(x, eid):
            """Return {'gene': eid, 'translation': [...], ...} built from `x`.

            `x` may be a single (transcript_id, protein_id) pair or a list of
            pairs.  Empty ids and ids equal to the null marker are treated as
            absent.  'transcript' / 'protein' are only set when at least one
            usable id exists; 'translation' is always present and holds one
            {"rna", "protein"} dict per pair whose two ids are both usable.
            """
            out = {'gene': eid, 'translation': []}

            def _valid(an_id):
                # hand back the id itself when usable, otherwise None
                return an_id if an_id and an_id != '\\N' else None

            def _link(transcript_id, protein_id):
                rna = _valid(transcript_id)
                protein = _valid(protein_id)
                if rna and protein:
                    out['translation'].append({"rna": rna, "protein": protein})

            if not isinstance(x, list):
                # single pair: values are stored unchanged
                if _valid(x[0]):
                    out['transcript'] = x[0]
                if _valid(x[1]):
                    out['protein'] = x[1]
                _link(x[0], x[1])
                return out

            transcript_ids = [row[0] for row in x if _valid(row[0])]
            protein_ids = [row[1] for row in x if _valid(row[1])]
            for row in x:
                _link(row[0], row[1])

            if transcript_ids:
                out['transcript'] = normalized_value(transcript_ids)
            if protein_ids:
                out['protein'] = normalized_value(protein_ids)
            return out
Example #3
0
        def _ff(d):
            """Turn one 10-field record into a gene document dict.

            `d` must unpack into (taxid, symbol, locus_tag, synonyms, dbxrefs,
            map_location, description, type_of_gene, other_designations,
            modification_date).  A field equal to '-' is treated as missing
            and its key is left out of the result.

            The result always has 'taxid' (int), 'symbol', 'name' and
            '_timestamp' (modification_date parsed as %Y%m%d).  Each
            '|'-separated entry of `dbxrefs` is expected to look like
            'DB:ID' and is stored as out[DB] = ID.

            Raises ValueError when a dbxref entry does not split into exactly
            two parts (the offending entry is printed first), or when taxid /
            modification_date cannot be parsed.
            """
            (
                taxid, symbol, locus_tag, synonyms,
                dbxrefs, map_location,
                description, type_of_gene, other_designations,
                modification_date
            ) = d
            out = dict(taxid=int(taxid),
                       symbol=symbol,
                       name=description)
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(other_designations.split('|'))

            # when merged, this will become the default timestamp
            out["_timestamp"] = datetime.datetime.strptime(modification_date, "%Y%m%d")

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ('VGNC', 'HGNC', 'MGI'):
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # only a wrong number of parts can fail here; show the
                    # raw entry to make the bad record easy to locate
                    print(repr(x))
                    raise
                db_lower = _db.lower()
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly
                if db_lower in ('ensembl', 'imgt/gene-db'):
                    # we don't need 'IMGT/GENE-DB" xref either, because they
                    # are mostly the same as gene symbol
                    continue
                # add "MGI:" prefix for MGI ids.
                if db_lower == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out
Example #4
0
 def _ff(d):
     """Group (rna, protein, genomic) triples into one nested dict.

     `d` is an iterable of 3-item rows; a value of '-' means "missing".
     Rows carrying both an rna and a protein value are also recorded as
     {'rna': ..., 'protein': ...} entries under 'translation'.

     Every collected list is passed through normalized_value, keys whose
     result is empty are dropped, and a non-empty result is wrapped as
     {self.fieldname: result} (`self` comes from the enclosing scope).
     An empty dict is returned when nothing was collected.
     """
     missing = '-'
     rnas, proteins, genomics, translations = [], [], [], []
     for rna, prot, dna in d:
         rna = None if rna == missing else rna
         prot = None if prot == missing else prot
         dna = None if dna == missing else dna
         if rna is not None:
             rnas.append(rna)
         if prot is not None:
             proteins.append(prot)
         if dna is not None:
             genomics.append(dna)
         if rna and prot:
             translations.append({'rna': rna, 'protein': prot})

     collected = {
         'rna': rnas,
         'protein': proteins,
         'genomic': genomics,
         'translation': translations,
     }
     # normalise every list first (original comment: "remove dup")
     normalised = {key: normalized_value(values)
                   for key, values in collected.items()}
     # keep only the fields that still hold something
     kept = {key: values for key, values in normalised.items() if values}
     if not kept:
         return kept
     return {self.fieldname: kept}
Example #5
0
        def _ff(d):
            """Turn one 7-field record into a gene document dict.

            `d` must unpack into (taxid, symbol, synonyms, dbxrefs,
            map_location, description, type_of_gene).  A field equal to '-'
            is treated as missing and its key is left out of the result.

            The result always has 'taxid' (int), 'symbol' and 'name'.  Each
            '|'-separated entry of `dbxrefs` is expected to look like
            'DB:ID' and is stored as out[DB] = ID.

            Raises ValueError when a dbxref entry does not split into exactly
            two parts (the offending entry is printed first), or when taxid
            is not an integer string.
            """
            (
                taxid, symbol, synonyms,
                dbxrefs, map_location,
                description, type_of_gene
            ) = d
            out = dict(taxid=int(taxid),
                       symbol=symbol,
                       name=description)
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and xd[0] in ('HGNC', 'MGI'):
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # only a wrong number of parts can fail here; show the
                    # raw entry to make the bad record easy to locate
                    print(x)
                    raise
                db_lower = _db.lower()
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly; we don't need 'IMGT/GENE-DB" xref either,
                # because they are mostly the same as gene symbol
                if db_lower in ('ensembl', 'imgt/gene-db'):
                    continue
                # add "MGI:" prefix for MGI ids.
                if db_lower == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out
Example #6
0
 def _ff(d):
     """Collect accessions from (rna, protein, genomic) rows.

     A column value of '-' is treated as absent.  Present values are
     appended to the 'rna' / 'protein' / 'genomic' lists, and rows with
     both an rna and a protein value also add an {'rna', 'protein'} dict
     to 'translation'.

     Each list then goes through normalized_value; empty fields are
     removed.  Returns {self.fieldname: fields} when anything is left
     (`self` is taken from the enclosing scope), otherwise an empty dict.
     """
     def _present(value):
         # '-' is the placeholder for "no value in this column"
         return None if value == '-' else value

     fields = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
     for raw_rna, raw_prot, raw_dna in d:
         rna = _present(raw_rna)
         prot = _present(raw_prot)
         dna = _present(raw_dna)
         for key, value in (('rna', rna), ('protein', prot), ('genomic', dna)):
             if value is not None:
                 fields[key].append(value)
         if rna and prot:
             fields['translation'].append({'rna' : rna, 'protein' : prot})

     # remove dup
     for key in list(fields):
         fields[key] = normalized_value(fields[key])

     # remove empty rna/protein/genomic field
     non_empty = {}
     for key in fields:
         if fields[key]:
             non_empty[key] = fields[key]
     return {self.fieldname: non_empty} if non_empty else non_empty
Example #7
0
        def _fn(x, eid):
            """Build the transcript/protein record stored for gene `eid`.

            `x` is either one (transcript_id, protein_id) pair or a list of
            such pairs.  An id is skipped when it is empty or equal to the
            null marker (`null_marker` below).  For a list input the
            collected ids are passed through normalized_value; for a single
            pair they are stored as-is.  'transcript' / 'protein' are only
            set when at least one usable id was found.
            """
            null_marker = '\\N'
            out = {'gene': eid}
            if isinstance(x, list):
                transcript_li = []
                protein_li = []
                for _x in x:
                    if _x[0] and _x[0] != null_marker:
                        transcript_li.append(_x[0])
                    # Fixed: this used to test _x[0], so a protein id was
                    # dropped whenever its transcript id was empty, and an
                    # empty protein id was kept whenever the transcript id
                    # was set.  The protein id must be checked on its own,
                    # exactly as the single-pair branch below does.
                    if _x[1] and _x[1] != null_marker:
                        protein_li.append(_x[1])

                if transcript_li:
                    out['transcript'] = normalized_value(transcript_li)
                if protein_li:
                    out['protein'] = normalized_value(protein_li)
            else:
                if x[0] and x[0] != null_marker:
                    out['transcript'] = x[0]
                if x[1] and x[1] != null_marker:
                    out['protein'] = x[1]
            return out
Example #8
0
    def load(self, aslist=False):
        """Load the 'retired' values per gene from self.datafile.

        Rows are read via tab2dict (columns (1, 2), key index 0,
        alwayslist=1 -- exact semantics are defined by tab2dict).  A row is
        kept only when its second field is not '-', and, when
        self.species_li is non-empty, only when int(first field) is in
        self.taxid_set.  The values of each key are converted to int and
        passed through normalized_value.

        Returns a dict {key: {'retired': value}}, or dict_to_list of that
        dict when `aslist` is true.
        """
        load_start(self.datafile)

        restrict_species = bool(self.species_li)

        def _keep_row(ld):
            # optional species filter first, then the '-' placeholder check
            if restrict_species and int(ld[0]) not in self.taxid_set:
                return False
            return ld[1] != '-'

        def _as_int_value(ids):
            return normalized_value([int(one_id) for one_id in ids])

        raw_mapping = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                               includefn=_keep_row)
        gene2retired = dict_convert(raw_mapping, valuefn=_as_int_value)

        gene_d = {gid: {'retired': retired}
                  for gid, retired in gene2retired.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Example #9
0
    def load(self, aslist=False):
        """Read self.datafile and return its 'retired' values per key.

        The file is parsed with tab2dict (columns (1, 2), key index 0,
        alwayslist=1; see tab2dict for the exact meaning).  Rows whose
        second field is '-' are skipped; when self.species_li is set, rows
        whose int(first field) is not in self.taxid_set are skipped too.
        Values are converted to int and run through normalized_value.

        With `aslist` false (default) the result is a dict of
        {key: {'retired': value}}; otherwise dict_to_list of that dict.
        """
        load_start(self.datafile)

        def _has_value(ld):
            return ld[1] != '-'

        def _in_species_with_value(ld):
            return int(ld[0]) in self.taxid_set and ld[1] != '-'

        row_filter = _in_species_with_value if self.species_li else _has_value

        gene2retired = dict_convert(
            tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                     includefn=row_filter),
            valuefn=lambda ids: normalized_value([int(i) for i in ids]),
        )

        gene_d = {}
        for gid in gene2retired:
            gene_d[gid] = {'retired': gene2retired[gid]}
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
Example #10
0
 def _ff(d):
     """Collect version-less accessions from (rna, protein, genomic) rows.

     `d` is an iterable of 3-item rows; '-' marks a missing value.  Each
     present value is cut at its first '.' (dropping the version suffix)
     and appended to the matching 'rna' / 'protein' / 'genomic' list.

     The lists are passed through normalized_value, empty fields are
     dropped, and a non-empty result is returned as
     {self.fieldname: fields} (`self` comes from the enclosing scope);
     otherwise an empty dict is returned.
     """
     columns = ('rna', 'protein', 'genomic')
     collected = {name: [] for name in columns}
     for row in d:
         # rows must hold exactly three values, in column order
         rna_acc, prot_acc, dna_acc = row
         for name, accession in zip(columns, (rna_acc, prot_acc, dna_acc)):
             if accession != '-':
                 # trim version number after dot
                 collected[name].append(accession.partition('.')[0])

     # remove dup
     normalised = {name: normalized_value(collected[name]) for name in columns}

     # remove empty rna/protein/genomic field
     fields = {name: values for name, values in normalised.items() if values}
     if fields:
         return {self.fieldname: fields}
     return fields