def load_x(idx, fieldname, cvt_fn=None):
    """Generic loader for one column of idmapping_selected.tab.gz.

    idx is the 0-based column number of the target value; fieldname is the
    key used in the returned per-gene dict; cvt_fn, if given, converts each
    raw value. Returns a dict: gene id -> {fieldname: value-or-sorted-list}.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    # NOTE(review): unlike the sibling loader, no assert_column_no check
    # here — confirm that is intentional for this data snapshot.
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2, 19, idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)
    # ensembl gene id -> list of entrez gene ids (only where both exist)
    ensembl2geneid = list2dict(
        list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']),
        0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    # no entrez match: key by the ensembl id itself
                    xli2.append((ensembl_id, x_value))
    gene2x = list2dict(list_nondup(xli2), 0)

    def _wrap(value):
        # fix: `type(value) is types.ListType` is Python-2-only; types.ListType
        # no longer exists in Python 3 — use isinstance instead.
        return {fieldname: sorted(value) if isinstance(value, list) else value}

    gene2x = value_convert(gene2x, _wrap, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def load_broadinstitute_exac():
    """Load all ExAC datasets, merge the nontcga/nonpsych subsets into the
    "all" documents, then re-key transcript ids to entrez gene ids."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    # fold each subset loader's "exac.<subset>" payload into the documents
    for subset, loader in (("nontcga", load_broadinstitute_exac_nontcga),
                           ("nonpsych", load_broadinstitute_exac_nonpsych)):
        for key, doc in loader().items():
            try:
                exacs[key]["exac"][subset] = doc["exac"][subset]
            except KeyError:
                # no existing document for this key: take the subset doc as-is
                exacs[key] = doc
    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0,
                               alwayslist=True)
    translation_file = os.path.join(get_data_folder("ensembl"),
                                    "gene_ensembl__translation__main.txt")
    for row in tabfile_feeder(translation_file):
        _, ensid, transid, _ = row
        if transid in exacs:
            # pop so no-match means no data in the end
            data = exacs.pop(transid)
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data
    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))
    return exacs
def load_broadinstitute_exac():
    """Combine the three broadinstitute ExAC datasets and re-key the result
    from ensembl transcript ids to entrez gene ids where possible."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    # merge the non-TCGA subset into the existing documents
    for key, doc in load_broadinstitute_exac_nontcga().items():
        try:
            exacs[key]["exac"]["nontcga"] = doc["exac"]["nontcga"]
        except KeyError:
            exacs[key] = doc
    # merge the non-psych subset into the existing documents
    for key, doc in load_broadinstitute_exac_nonpsych().items():
        try:
            exacs[key]["exac"]["nonpsych"] = doc["exac"]["nonpsych"]
        except KeyError:
            exacs[key] = doc
    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    parser = ensembl_base.EnsemblParser()
    parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(parser.ensembl2entrez_li, 0, alwayslist=True)
    ensembl_dir = get_data_folder("ensembl")
    main_file = os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")
    for row in tabfile_feeder(main_file):
        _, ensid, transid, _ = row
        if transid in exacs:
            # pop so no-match means no data in the end
            data = exacs.pop(transid)
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data
    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))
    return exacs
def _cvt(pli):
    """Group (gene, name, id, source) pathway tuples by source and wrap them
    under a 'pathway' key, with per-source lists deterministically sorted."""
    _d = list2dict(pli, 2)
    _d = value_convert(_d, _inner_cvt)
    for p_source in _d:
        if isinstance(_d[p_source], list):
            # fix: the values are dicts, and plain .sort() on dicts raises
            # TypeError on Python 3 (dicts are not orderable). Sort by 'id',
            # falling back to 'name' because _inner_cvt omits 'id' when the
            # source reports it as 'None'.
            _d[p_source].sort(key=lambda e: e.get('id', e.get('name', '')))
    return {'pathway': _d}
def _cvt(pli):
    """Group (gene, name, id, source) pathway tuples by source and wrap them
    under a 'pathway' key, with per-source lists sorted by pathway id."""
    _d = list2dict(pli, 2)
    _d = value_convert(_d, _inner_cvt)
    for p_source in _d:
        if isinstance(_d[p_source], list):
            # fix: e["id"] raises KeyError for pathways whose id was 'None'
            # (then _inner_cvt omits the 'id' key) — fall back to 'name'.
            _d[p_source].sort(key=lambda e: e.get('id', e.get('name', '')))
    return {'pathway': _d}
def load_uniprot():
    """Build a gene-id -> uniprot mapping from idmapping_selected.tab.gz,
    keyed by entrez gene id where possible, otherwise by ensembl gene id."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(datafile)
    t0 = time.time()
    records = []
    for row in tabfile_feeder(datafile, header=1,
                              assert_column_no=VALID_COLUMN_NO):
        # UniProtKB-AC  UniProtKB-ID  GeneID  Ensembl(Gene)
        row = listitems(row, *(0, 1, 2, 18))
        # GeneID and EnsemblID columns may have duplicates
        for rec in dupline_seperator(dupline=row, dup_idx=[2, 3],
                                     dup_sep='; '):
            rec = list(rec)
            rec[1] = get_uniprot_section(rec[1])
            records.append(tuple(rec))
    ensembl2geneid = list2dict(
        [(r[3], r[2]) for r in records if r[2] != '' and r[3] != ''],
        0, alwayslist=True)
    mapped = []
    for uniprot_acc, section, entrez_id, ensembl_id in records:
        if entrez_id:
            mapped.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            eids = ensembl2geneid.get(ensembl_id, None)
            if eids:
                # ensembl_id can be mapped to one or more entrez ids
                for eid in eids:
                    mapped.append((uniprot_acc, section, eid))
            else:
                # otherwise, just use ensembl_id as the key
                mapped.append((uniprot_acc, section, ensembl_id))
    gene2uniprot = list2dict(list_nondup(mapped), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                 traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(t0)))
    return gene2uniprot
def _load_ensembl2entrez_li(self):
    """Load the ensembl-gene -> entrezgene pair list from gridfs, drop
    deprecated entrez ids, and cache the mapping on this instance."""
    pairs = loadobj(("ensembl_gene__2entrezgene_list.pyobj", self.src),
                    mode='gridfs')
    logging.info(len(pairs))
    # filter out those deprecated entrez gene ids
    valid = self._entrez_geneid_d
    pairs = [(ensembl_id, valid[int(entrez_id)])
             for ensembl_id, entrez_id in pairs
             if int(entrez_id) in valid]
    logging.info(len(pairs))
    self._idmapping_d_cache['ensembl_gene'] = list2dict(pairs, 0)
def convert2entrez(self, ensembl2x):
    '''convert a dict with ensembl gene ids as the keys to matching
       entrezgene ids as the keys.'''
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)
    # Now make a dictionary indexed by entrez gene id
    input_ids = set(ensembl2x)
    mapped_ids = set(ensembl2entrez)
    print('# of ensembl IDs in total: %d' % len(input_ids | mapped_ids))
    print('# of ensembl IDs match entrez Gene IDs: %d' %
          len(input_ids & mapped_ids))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
          len(input_ids - mapped_ids))

    def _fn(eid, taxid=None):
        # Need to make a copy of the value here; otherwise it causes issues
        # when multiple entrezgene ids match the same ensembl gene, e.g.
        # ENSMUSG00000027104 --> (11909, 100047997)
        return copy.copy(ensembl2x.get(eid, {}))

    # all genes with a matched entrez id
    data = value_convert(entrez2ensembl, _fn)
    # add those without a matched entrez geneid, using the ensembl id as key
    for eid in input_ids - mapped_ids:
        #_g.update(self.ensembl_main.get(eid, {}))
        data[eid] = ensembl2x[eid]
    for gid in data:
        if isinstance(data[gid], dict):
            data[gid] = dict_nodup(data[gid], sort=True)
        else:
            # one entrez gene matches multiple ensembl genes
            data[gid] = dict_attrmerge(data[gid], removedup=True, sort=True)
    return data
def load_x(idx, fieldname, cvt_fn=None):
    '''idx is 0-based column number'''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(datafile)
    t0 = time.time()
    rows = []
    for ld in tabfile_feeder(datafile, header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # GeneID  Ensembl(Gene)  target_value
        ld = listitems(ld, *(2, 19, idx))
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            rows.append(value)
    # ensembl gene id -> list of entrez gene ids (only where both exist)
    ensembl2geneid = list2dict(
        list_nondup([(r[1], r[0]) for r in rows if r[0] != '' and r[1] != '']),
        0, alwayslist=True)
    pairs = []
    for entrez_id, ensembl_id, x_value in rows:
        if not x_value:
            continue
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            pairs.append((entrez_id, x_value))
        elif ensembl_id:
            eids = ensembl2geneid.get(ensembl_id, None)
            if eids:
                for eid in eids:
                    pairs.append((eid, x_value))
            else:
                # no entrez match: key by the ensembl id itself
                pairs.append((ensembl_id, x_value))
    gene2x = list2dict(list_nondup(pairs), 0)

    def _wrap(value):
        return {fieldname: sorted(value) if isinstance(value, list) else value}

    gene2x = value_convert(gene2x, _wrap, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def convert2entrez(self, ensembl2x):
    '''convert a dict with ensembl gene ids as the keys to matching
       entrezgene ids as the keys.'''
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)
    # Now make a dictionary indexed by entrez gene id
    # fix: converted Python-2 print statements to print() calls
    print('# of ensembl IDs in total: %d' %
          len(set(ensembl2x) | set(ensembl2entrez)))
    print('# of ensembl IDs match entrez Gene IDs: %d' %
          len(set(ensembl2x) & set(ensembl2entrez)))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
          len(set(ensembl2x) - set(ensembl2entrez)))
    # all genes with matched entrez

    def _fn(eid, taxid=None):
        # need to make a copy of the value here; otherwise it causes issues
        # when multiple entrezgene ids match the same ensembl gene, e.g.
        # ENSMUSG00000027104 --> (11909, 100047997)
        return copy.copy(ensembl2x.get(eid, {}))

    data = value_convert(entrez2ensembl, _fn)
    # add those has no matched entrez geneid, using ensembl id as the key
    for eid in (set(ensembl2x) - set(ensembl2entrez)):
        _g = ensembl2x[eid]
        #_g.update(self.ensembl_main.get(eid, {}))
        data[eid] = _g
    # (removed unused `doc_li = []` local)
    for id in data:
        # fix: `type(...) is types.DictType` is Python-2-only; use isinstance
        if isinstance(data[id], dict):
            _doc = dict_nodup(data[id], sort=True)
        else:
            # if one entrez gene matches multiple ensembl genes
            _doc = dict_attrmerge(data[id], removedup=True, sort=True)
        data[id] = _doc
    return data
def _dict_convert(uniprot_li):
    '''convert
        [(u'E7ESI2', 'TrEMBL'), (u'P24941', 'Swiss-Prot'),
         (u'G3V5T9', 'TrEMBL'), (u'G3V317', 'TrEMBL')]
       into
        {'Swiss-Prot': u'P24941',
         'TrEMBL': [u'E7ESI2', u'G3V5T9', u'G3V317']}
    '''
    by_section = list2dict(uniprot_li, 1)
    for section in by_section:
        accessions = by_section[section]
        if isinstance(accessions, list):
            by_section[section] = sorted(accessions)
    return {'uniprot': by_section}
def _load_ensembl2entrez_li(self):
    """Populate self._idmapping_d_cache['ensembl_gene'] with the
    ensembl-gene -> entrezgene mapping loaded from gridfs."""
    mapping_li = loadobj(
        ("ensembl_gene__2entrezgene_list.pyobj", self.src), mode='gridfs')
    logging.info(len(mapping_li))
    # filter out those deprecated entrez gene ids
    mapping_li = [
        (ensembl_id, self._entrez_geneid_d[int(entrez_id)])
        for ensembl_id, entrez_id in mapping_li
        if int(entrez_id) in self._entrez_geneid_d
    ]
    logging.info(len(mapping_li))
    ensembl2entrez = list2dict(mapping_li, 0)
    self._idmapping_d_cache['ensembl_gene'] = ensembl2entrez
def _dict_convert(uniprot_li):
    '''convert
        [(u'E7ESI2', 'TrEMBL'), (u'P24941', 'Swiss-Prot'),
         (u'G3V5T9', 'TrEMBL'), (u'G3V317', 'TrEMBL')]
       into
        {'Swiss-Prot': u'P24941',
         'TrEMBL': [u'E7ESI2', u'G3V5T9', u'G3V317']}
    '''
    _dict = list2dict(uniprot_li, 1)
    for k, v in _dict.items():
        # fix: `type(v) is types.ListType` is Python-2-only (types.ListType
        # was removed in Python 3) — use isinstance instead.
        if isinstance(v, list):
            _dict[k] = sorted(v)
    return {'uniprot': _dict}
def load_uniprot():
    """Build a gene-id -> uniprot mapping from idmapping_selected.tab.gz,
    keyed by entrez gene id where possible, otherwise by ensembl gene id."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(datafile)
    t0 = time.time()
    rows = []
    # NOTE(review): this variant reads Ensembl from index 19 (the sibling
    # loader uses 18) — confirm against the data file's column layout.
    for ld in tabfile_feeder(datafile, header=1):
        # UniProtKB-AC  UniProtKB-ID  GeneID  Ensembl(Gene)
        ld = listitems(ld, *(0, 1, 2, 19))
        # GeneID and EnsemblID columns may have duplicates
        for rec in dupline_seperator(dupline=ld, dup_idx=[2, 3],
                                     dup_sep='; '):
            rec = list(rec)
            rec[1] = get_uniprot_section(rec[1])
            rows.append(tuple(rec))
    ensembl2geneid = list2dict(
        [(r[3], r[2]) for r in rows if r[2] != '' and r[3] != ''],
        0, alwayslist=True)
    mapped = []
    for uniprot_acc, section, entrez_id, ensembl_id in rows:
        if entrez_id:
            mapped.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            eids = ensembl2geneid.get(ensembl_id, None)
            if eids:
                # if ensembl_id can be mapped to entrez_id(s)
                for eid in eids:
                    mapped.append((uniprot_acc, section, eid))
            else:
                # otherwise, just use ensembl_id
                mapped.append((uniprot_acc, section, ensembl_id))
    gene2uniprot = list2dict(list_nondup(mapped), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                 traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(t0)))
    return gene2uniprot
def load_cpdb(__metadata__):
    """Load ConsensusPathDB pathway files (mouse/yeast/human) and return a
    dict: gene -> {'pathway': {source: pathway-dict-or-list}}. Only sources
    listed in __metadata__['pathway_sources_included'] are kept."""
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4
    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))
    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1,
                                 assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            # strip KEGG's 'path:' prefix from its pathway ids
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        # (name, id) tuple -> pathway dict; 'id' omitted when reported 'None'
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # fix: e["id"] raises KeyError for pathways whose id was
                # 'None' (no 'id' key then) — fall back to 'name'.
                _d[p_source].sort(key=lambda e: e.get('id', e.get('name', '')))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))
    return _out
def load_cpdb(__metadata__):
    """Load ConsensusPathDB pathway files (mouse/yeast/human) and return a
    dict: gene -> {'pathway': {source: pathway-dict-or-list}}. Only sources
    listed in __metadata__['pathway_sources_included'] are kept."""
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4
    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))
    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1,
                                 assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            # strip KEGG's 'path:' prefix from its pathway ids
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        # (name, id) tuple -> pathway dict; 'id' omitted when reported 'None'
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # fix: the values are dicts, and plain .sort() on a list of
                # dicts raises TypeError on Python 3 — sort by 'id', falling
                # back to 'name' since 'id' can be absent.
                _d[p_source].sort(key=lambda e: e.get('id', e.get('name', '')))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))
    return _out
def _db_upload(self, doc_li, step=10000, verbose=True):
    """Upload doc_li to the target db in batches of `step`, then print a
    summary of OK/error counts and the top-10 error messages."""
    import time
    from biothings.utils.common import timesofar
    from utils.dataload import list2dict, list_itemcnt, listsort
    results = []
    t0 = time.time()
    total = len(doc_li)
    for start in range(0, total, step):
        results.extend(self.target_db.update(doc_li[start:start + step]))
        if verbose:
            print('\t%d-%d Done [%s]...' %
                  (start + 1, min(start + step, total), timesofar(t0)))
    # tally successes/failures by the boolean status flag
    status_cnt = list2dict(list_itemcnt([r[0] for r in results]), 0)
    print("Done![%s, %d OK, %d Error]" %
          (timesofar(t0), status_cnt.get(True, 0), status_cnt.get(False, 0)))
    # most common error messages, most frequent first
    errors = listsort(
        list_itemcnt([r[2].args[0] for r in results if r[0] is False]),
        1, reverse=True)
    print('\n'.join(['\t%s\t%d' % e for e in errors[:10]]))
    if len(errors) > 10:
        print("\t%d lines omitted..." % (len(errors) - 10))
def _db_upload(self, doc_li, step=10000, verbose=True):
    """Upload doc_li to the target db in batches of `step`, then print a
    summary of OK/error counts and the top-10 error messages."""
    import time
    from utils.common import timesofar
    from utils.dataload import list2dict, list_itemcnt, listsort
    results = []
    t0 = time.time()
    n_docs = len(doc_li)
    for offset in range(0, n_docs, step):
        batch = doc_li[offset:offset + step]
        results.extend(self.target_db.update(batch))
        if verbose:
            print('\t%d-%d Done [%s]...' %
                  (offset + 1, min(offset + step, n_docs), timesofar(t0)))
    # tally successes/failures by the boolean status flag
    status_cnt = list2dict(list_itemcnt([r[0] for r in results]), 0)
    print("Done![%s, %d OK, %d Error]" %
          (timesofar(t0), status_cnt.get(True, 0), status_cnt.get(False, 0)))
    # most common error messages, most frequent first
    errors = listsort(
        list_itemcnt([r[2].args[0] for r in results if r[0] is False]),
        1, reverse=True)
    print('\n'.join(['\t%s\t%d' % e for e in errors[:10]]))
    if len(errors) > 10:
        print("\t%d lines omitted..." % (len(errors) - 10))
def load_exons_for_species(species, exons_key='exons'):
    """Parse UCSC refFlat/refLink files for one species and return a dict:
    gene id -> {exons_key: {refseq: {chr/strand/tx/cds coords, exons list}}}.
    """
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')
    load_start(refflat_file)
    t0 = time.time()
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]   # renamed from `chr`: don't shadow the builtin
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # fix: zip() is lazy on Python 3, so len(exons) would raise TypeError
        # and a one-shot iterator would be stored — materialize as a list.
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                         [int(x) for x in ld[10].split(',') if x]))
        # exon count column must agree with the parsed start/end lists
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'exons': exons,
        }))
    ref2exons = list2dict(ref2exons, 0)
    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        # '0' is refLink's placeholder for "no gene id"
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: {refseq: ref2exons[refseq]}}
            else:
                gene2exons[geneid][exons_key][refseq] = ref2exons[refseq]
    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep last <kepp_last>
       number of archive.
    '''
    from utils.dataload import list2dict
    from biothings.utils.common import ask
    src = src or get_src_db()
    archive_li = sorted((coll.split('_archive_')[0], coll)
                        for coll in src.collection_names()
                        if coll.find('archive') != -1)
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for base, archives in archive_d.items():
        print(base, end='')
        # check current collection exists
        if src[base].count() > 0:
            # keep only the newest `keep_last` archives for this collection
            marked = sorted(archives)[:-keep_last]
            coll_to_remove.extend(marked)
            print("\t\t%s archived collections marked to remove." % len(marked))
        else:
            print('skipped. Missing current "%s" collection!' % base)
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep last <keep_last>
       number of archive.
    '''
    from utils.dataload import list2dict
    from utils.common import ask
    src = src or get_src_db()
    archive_li = sorted([(coll.split('_archive_')[0], coll)
                         for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        # fix: converted Python-2 `print k,` / print statements to print()
        print(k, end='')
        # check current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print("\t\t%s archived collections marked to remove." % cnt)
        else:
            print('skipped. Missing current "%s" collection!' % k)
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")