def load_broadinstitute_exac(data_folder):
    t0 = time.time()
    exacs = load_broadinstitute_exac_all(data_folder)
    # merge the non-TCGA and non-psych subsets into the main ExAC documents,
    # which are keyed by Ensembl transcript id at this point
    for k, v in load_broadinstitute_exac_nontcga(data_folder).items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k, v in load_broadinstitute_exac_nonpsych(data_folder).items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    from ..ensembl.parser import EnsemblParser
    from biothings.utils.hub_db import get_src_dump
    ensembl_doc = get_src_dump().find_one({"_id": "ensembl"}) or {}
    ensembl_dir = ensembl_doc.get("data_folder")
    assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
    ensembl_parser = EnsemblParser(ensembl_dir)
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    for line in tabfile_feeder(os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")):
        _, ensid, transid, _ = line
        if transid in exacs:
            data = exacs.pop(transid)   # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data
    return exacs

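# For illustration only: a rough sketch of a merged document returned by
# load_broadinstitute_exac(). The "nontcga"/"nonpsych" sub-keys come from the merge
# above; any other sub-fields depend on load_broadinstitute_exac_all() and are not
# shown. Documents start out keyed by Ensembl transcript id and are re-keyed by
# Entrez gene id, falling back to the Ensembl gene id when no mapping exists:
#
#   exacs["1017"] == {"exac": {..., "nontcga": {...}, "nonpsych": {...}}}
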
def convert2entrez(self, ensembl2x):
    '''convert a dict with ensembl gene ids as the keys to matching
       entrezgene ids as the keys.'''
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)
    # Now make a dictionary indexed by entrez gene id
    print('# of ensembl IDs in total: %d' % len(set(ensembl2x) | set(ensembl2entrez)))
    print('# of ensembl IDs match entrez Gene IDs: %d' % len(set(ensembl2x) & set(ensembl2entrez)))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' % len(set(ensembl2x) - set(ensembl2entrez)))

    # all genes with matched entrez
    def _fn(eid, taxid=None):
        # need to make a copy of the value here,
        # otherwise it will cause issues when multiple entrezgene ids
        # match the same ensembl gene, for example,
        # ENSMUSG00000027104 --> (11909, 100047997)
        d = copy.copy(ensembl2x.get(eid, {}))
        return d

    data = value_convert(entrez2ensembl, _fn)
    # add those that have no matched entrez gene id, using ensembl id as the key
    for eid in (set(ensembl2x) - set(ensembl2entrez)):
        _g = ensembl2x[eid]
        #_g.update(self.ensembl_main.get(eid, {}))
        data[eid] = _g
    for id in data:
        if isinstance(data[id], dict):
            _doc = dict_nodup(data[id], sort=True)
        else:
            # if one entrez gene matches multiple ensembl genes
            _doc = dict_attrmerge(data[id], removedup=True, sort=True)
        data[id] = _doc
    return data

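# For example (the case noted in _fn above): if ensembl2x carries annotations under
# 'ENSMUSG00000027104' and that Ensembl gene maps to Entrez genes 11909 and 100047997,
# the returned dict holds an independent copy of those annotations under each Entrez
# id; Ensembl ids with no Entrez match are kept under their original Ensembl id.
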
def __init__(self, src_name, data_folder, load_ensembl2entrez=True):
    self.data_folder = data_folder
    self.ensembl2entrez_li = None
    self.ensembl_main = None
    if load_ensembl2entrez:
        self._load_ensembl2entrez_li(src_name)
        self.ensembl2entrez = list2dict(self.ensembl2entrez_li, 0, alwayslist=True)

def transform(xli2):
    gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
    docs = []
    for gid, uniprot in gene2uniprot.items():
        doc = {"_id": gid}
        doc.update(uniprot)
        docs.append(doc)
    return docs

def _dict_convert(uniprot_li):
    '''
    convert [(u'E7ESI2', 'TrEMBL'), (u'P24941', 'Swiss-Prot'),
             (u'G3V5T9', 'TrEMBL'), (u'G3V317', 'TrEMBL')]
    into {'Swiss-Prot': u'P24941',
          'TrEMBL': [u'E7ESI2', u'G3V5T9', u'G3V317']}
    '''
    _dict = list2dict(uniprot_li, 1)
    for k, v in _dict.items():
        if isinstance(v, list):
            _dict[k] = sorted(v)
    return {'uniprot': _dict}

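# A minimal usage sketch for transform() above (hypothetical input; assumes each xli2
# row is (uniprot_accession, database, entrezgene_id), so the gene id sits at index 2
# as expected by list2dict(..., 2)):
#
#   xli2 = [('P24941', 'Swiss-Prot', '1017'),
#           ('E7ESI2', 'TrEMBL', '1017'),
#           ('G3V5T9', 'TrEMBL', '1017')]
#   transform(xli2)
#   # -> [{'_id': '1017',
#   #      'uniprot': {'Swiss-Prot': 'P24941',
#   #                  'TrEMBL': ['E7ESI2', 'G3V5T9']}}]
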
def load_cpdb(data_folder, pathways):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = pathways
    VALID_COLUMN_NO = 4
    t0 = time.time()
    DATA_FILES = []
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort(key=lambda e: e["id"])
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    return _out

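# For illustration, the document value produced by _cvt() for one gene (hypothetical
# rows; assumes 'kegg' and 'pid' are included in the pathways argument):
#
#   given the grouped rows [('Cell cycle', 'hsa04110', 'kegg'),
#                           ('Direct p53 effectors', 'None', 'pid')]
#   the resulting value is
#   {'pathway': {'kegg': {'name': 'Cell cycle', 'id': 'hsa04110'},
#                'pid':  {'name': 'Direct p53 effectors'}}}
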
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep last <keep_last>
       number of archive.
    '''
    from biothings.utils.dataload import list2dict
    from biothings.utils.common import ask
    src = src or get_src_db()
    archive_li = sorted([(coll.split('_archive_')[0], coll)
                         for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print(k, end='')
        # check current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print("\t\t%s archived collections marked to remove." % cnt)
        else:
            print('skipped. Missing current "%s" collection!' % k)
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")

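# Example invocation (hypothetical): keep the two most recent archive collections per
# source and drop the rest without prompting:
#
#   src_clean_archives(keep_last=2, noconfirm=True)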