def _fn(x, eid):
    """Build the transcript/protein record for one gene.

    Args:
        x: either a single ``(transcript_id, protein_id)`` pair or a list of
            such pairs.  Only indexes 0 and 1 of each pair are read.
        eid: value stored under the ``'gene'`` key.

    Returns:
        A dict that always carries ``'gene'`` and ``'translation'`` (a list
        of ``{"rna": ..., "protein": ...}`` dicts, one per pair in which both
        ids are usable).  ``'transcript'`` and ``'protein'`` are added only
        when at least one usable id was seen: for list input they hold
        ``normalized_value(<list of ids>)``, for a single pair they hold the
        id itself.
    """
    def usable(value):
        # Empty values and the literal '\\N' placeholder both mean "no id".
        return bool(value) and value != '\\N'

    result = {'gene': eid, 'translation': []}

    def record_translation(rna_id, prot_id):
        # A translation entry needs both sides of the pair.
        if usable(rna_id) and usable(prot_id):
            result['translation'].append({"rna": rna_id, "protein": prot_id})

    if not isinstance(x, list):
        # Single pair: ids are stored directly, without normalization.
        rna_id, prot_id = x[0], x[1]
        if usable(rna_id):
            result['transcript'] = rna_id
        if usable(prot_id):
            result['protein'] = prot_id
        record_translation(rna_id, prot_id)
        return result

    transcripts = []
    proteins = []
    for pair in x:
        rna_id, prot_id = pair[0], pair[1]
        if usable(rna_id):
            transcripts.append(rna_id)
        if usable(prot_id):
            proteins.append(prot_id)
        record_translation(rna_id, prot_id)

    if transcripts:
        result['transcript'] = normalized_value(transcripts)
    if proteins:
        result['protein'] = normalized_value(proteins)
    return result
def _ff(d):
    """Convert one ten-column gene record into a document dict.

    Args:
        d: a 10-item sequence unpacked, in order, as taxid, symbol,
            locus_tag, synonyms, dbxrefs, map_location, description,
            type_of_gene, other_designations and modification_date.
            ``'-'`` marks an empty column; ``synonyms``, ``dbxrefs`` and
            ``other_designations`` are ``'|'``-separated.

    Returns:
        A dict with ``taxid`` (int), ``symbol``, ``name`` and ``_timestamp``
        (``modification_date`` parsed as ``%Y%m%d``), the optional fields
        that were not ``'-'``, and one ``<db>: <id>`` entry per kept xref.

    Raises:
        ValueError: if an xref does not split into exactly ``db:id`` (the
            offending xref is printed first), or if ``taxid`` /
            ``modification_date`` cannot be parsed.
    """
    (
        taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
        description, type_of_gene, other_designations, modification_date
    ) = d

    out = dict(taxid=int(taxid), symbol=symbol, name=description)
    if map_location != '-':
        out['map_location'] = map_location
    if type_of_gene != '-':
        out['type_of_gene'] = type_of_gene
    if synonyms != '-':
        out['alias'] = normalized_value(synonyms.split('|'))
    if locus_tag != '-':
        out['locus_tag'] = locus_tag
    if other_designations != "-":
        out['other_names'] = normalized_value(other_designations.split('|'))

    # when merged, this will become the default timestamp
    out["_timestamp"] = datetime.datetime.strptime(modification_date, "%Y%m%d")

    for x in dbxrefs.split('|'):
        if x == '-':
            continue
        xd = x.split(':')
        if len(xd) == 3 and xd[0] == xd[1] and \
                xd[0] in ['VGNC', 'HGNC', 'MGI']:
            # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
            xd = xd[1:]
        try:
            _db, _id = xd
        except ValueError:
            # Unpacking a list can only fail with ValueError; show the raw
            # xref to help locate the bad record, then propagate.
            print(repr(x))
            raise
        # we don't need ensembl xref from here, we will get it from
        # Ensembl directly
        # we don't need 'IMGT/GENE-DB" xref either, because they
        # are mostly the same as gene symbol
        if _db.lower() in ['ensembl', 'imgt/gene-db']:
            continue
        # add "MGI:" prefix for MGI ids.
        if _db.lower() == 'mgi':
            _id = "MGI:" + _id
        out[_db] = _id
    return out
def _ff(d):
    """Group (rna, protein, genomic) accession triples into one sub-document.

    Args:
        d: iterable of 3-item rows ``(rna, protein, genomic)``; the value
            ``'-'`` marks a missing accession.

    Returns:
        ``{self.fieldname: {...}}`` holding the non-empty ``'rna'``,
        ``'protein'``, ``'genomic'`` and ``'translation'`` entries, each
        passed through ``normalized_value``; an empty dict when nothing is
        left.  ``self`` is not a parameter here, it comes from the enclosing
        scope.
    """
    collected = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}

    for rna, prot, dna in d:
        # Map the '-' placeholder to None so the checks below skip it.
        rna, prot, dna = [None if value == '-' else value
                          for value in (rna, prot, dna)]
        for key, value in (('rna', rna), ('protein', prot), ('genomic', dna)):
            if value is not None:
                collected[key].append(value)
        # A translation pair is only recorded when both sides are present.
        if rna and prot:
            collected['translation'].append({'rna': rna, 'protein': prot})

    # remove dup
    normalized = {key: normalized_value(values)
                  for key, values in collected.items()}
    # remove empty rna/protein/genomic field
    kept = {key: value for key, value in normalized.items() if value}
    return {self.fieldname: kept} if kept else kept
def _ff(d):
    """Convert one seven-column gene record into a document dict.

    Args:
        d: a 7-item sequence unpacked, in order, as taxid, symbol, synonyms,
            dbxrefs, map_location, description and type_of_gene.  ``'-'``
            marks an empty column; ``synonyms`` and ``dbxrefs`` are
            ``'|'``-separated.

    Returns:
        A dict with ``taxid`` (int), ``symbol`` and ``name``, the optional
        fields that were not ``'-'``, and one ``<db>: <id>`` entry per kept
        xref.

    Raises:
        ValueError: if an xref does not split into exactly ``db:id`` (the
            offending xref is printed first), or if ``taxid`` is not an
            integer string.
    """
    (
        taxid, symbol, synonyms, dbxrefs,
        map_location, description, type_of_gene
    ) = d

    out = dict(taxid=int(taxid), symbol=symbol, name=description)
    if map_location != '-':
        out['map_location'] = map_location
    if type_of_gene != '-':
        out['type_of_gene'] = type_of_gene
    if synonyms != '-':
        out['alias'] = normalized_value(synonyms.split('|'))

    for x in dbxrefs.split('|'):
        if x == '-':
            continue
        xd = x.split(':')
        if len(xd) == 3 and xd[0] == xd[1] and \
                xd[0] in ['VGNC', 'HGNC', 'MGI']:
            # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'.
            # 'VGNC' is handled too, so 'VGNC:VGNC:<id>' no longer aborts
            # the record with an unpacking error.
            xd = xd[1:]
        try:
            _db, _id = xd
        except ValueError:
            # Unpacking a list can only fail with ValueError; show the raw
            # xref to help locate the bad record, then propagate.
            print(x)
            raise
        # we don't need ensembl xref from here, we will get it from Ensembl
        # directly; we don't need 'IMGT/GENE-DB" xref either, because they
        # are mostly the same as gene symbol
        if _db.lower() in ['ensembl', 'imgt/gene-db']:
            continue
        # add "MGI:" prefix for MGI ids.
        if _db.lower() == 'mgi':
            _id = "MGI:" + _id
        out[_db] = _id
    return out
def _ff(d):
    """Collect accession triples into per-type lists plus translation pairs.

    Args:
        d: iterable of 3-item rows ``(rna, protein, genomic)``, where
            ``'-'`` stands for "no accession".

    Returns:
        An empty dict when every resulting field is empty; otherwise
        ``{self.fieldname: fields}`` where ``fields`` keeps only the
        non-empty results of ``normalized_value`` for ``'rna'``,
        ``'protein'``, ``'genomic'`` and ``'translation'``.  ``self`` is
        resolved from the enclosing scope, not passed in.
    """
    def present(value):
        # '-' is the placeholder for a missing column; None is skipped too.
        return value is not None and value != '-'

    rnas, proteins, genomics, translations = [], [], [], []
    for rna, prot, dna in d:
        rna_ok = present(rna)
        prot_ok = present(prot)
        if rna_ok:
            rnas.append(rna)
        if prot_ok:
            proteins.append(prot)
        if present(dna):
            genomics.append(dna)
        # Pair rna with protein only when both are present and non-empty.
        if rna_ok and prot_ok and rna and prot:
            translations.append({'rna': rna, 'protein': prot})

    # remove dup
    fields = {
        'rna': normalized_value(rnas),
        'protein': normalized_value(proteins),
        'genomic': normalized_value(genomics),
        'translation': normalized_value(translations),
    }

    # remove empty rna/protein/genomic field
    non_empty = {}
    for name, value in fields.items():
        if value:
            non_empty[name] = value

    if not non_empty:
        return non_empty
    return {self.fieldname: non_empty}
def _fn(x, eid):
    """Build the transcript/protein record for one gene.

    Args:
        x: either a single ``(transcript_id, protein_id)`` pair or a list of
            such pairs.  Only indexes 0 and 1 of each pair are read.
        eid: value stored under the ``'gene'`` key.

    Returns:
        A dict with ``'gene'`` and, when at least one usable id exists,
        ``'transcript'`` and/or ``'protein'``.  For list input those hold
        ``normalized_value(<list of ids>)``; for a single pair they hold the
        id itself.  An id is usable when it is truthy and is not the
        ``'\\N'`` placeholder.
    """
    def usable(value):
        return bool(value) and value != '\\N'

    out = {'gene': eid}

    if isinstance(x, list):
        transcript_li = []
        protein_li = []
        for _x in x:
            if usable(_x[0]):
                transcript_li.append(_x[0])
            # The protein id must be judged on its own column.  The previous
            # guard tested _x[0] here, which let empty/None protein ids in
            # and dropped valid ones whenever the transcript id was empty.
            if usable(_x[1]):
                protein_li.append(_x[1])

        if transcript_li:
            out['transcript'] = normalized_value(transcript_li)
        if protein_li:
            out['protein'] = normalized_value(protein_li)
    else:
        if usable(x[0]):
            out['transcript'] = x[0]
        if usable(x[1]):
            out['protein'] = x[1]

    return out
def load(self, aslist=False):
    """Load the gene -> retired-ids mapping from ``self.datafile``.

    Rows are read with ``tab2dict`` using columns ``(1, 2)`` keyed on the
    first of them; rows whose column 1 is ``'-'`` are skipped, and when
    ``self.species_li`` is set only rows whose integer column 0 is in
    ``self.taxid_set`` are kept.  Each value list is converted to ints and
    passed through ``normalized_value``.

    Args:
        aslist: when true, return ``dict_to_list(gene_d)`` instead of the
            dict itself.

    Returns:
        ``{gene_id: {'retired': <normalized ids>}}``, or its
        ``dict_to_list`` form when ``aslist`` is true.
    """
    load_start(self.datafile)

    if self.species_li:
        def _includefn(ld):
            # Restrict to the configured taxonomy ids.
            return int(ld[0]) in self.taxid_set and ld[1] != '-'
    else:
        def _includefn(ld):
            return ld[1] != '-'

    def _to_int_ids(values):
        return normalized_value([int(value) for value in values])

    gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                            includefn=_includefn)
    gene2retired = dict_convert(gene2retired, valuefn=_to_int_ids)

    gene_d = {gid: {'retired': retired}
              for gid, retired in gene2retired.items()}
    load_done('[%d]' % len(gene_d))

    if aslist:
        return dict_to_list(gene_d)
    return gene_d
def _ff(d):
    """Group version-less accessions from (rna, protein, genomic) triples.

    Args:
        d: iterable of 3-item rows ``(rna, protein, genomic)``; ``'-'``
            marks a missing accession.

    Returns:
        ``{self.fieldname: {...}}`` with the non-empty ``'rna'``,
        ``'protein'`` and ``'genomic'`` entries, each passed through
        ``normalized_value``; an empty dict when nothing is left.  ``self``
        comes from the enclosing scope.
    """
    field_names = ('rna', 'protein', 'genomic')
    collected = {name: [] for name in field_names}

    for x1, x2, x3 in d:
        for name, accession in zip(field_names, (x1, x2, x3)):
            if accession != '-':
                # trim version number after dot
                collected[name].append(accession.split('.')[0])

    # remove dup
    for name in collected:
        collected[name] = normalized_value(collected[name])

    # remove empty rna/protein/genomic field
    kept = {name: value for name, value in collected.items() if value}
    if kept:
        kept = {self.fieldname: kept}
    return kept