def __init__(self, **kwargs):
    """Initialize working paths for marshalling operations.

    Keyword Args:
        workPath (str, optional): base working directory path. Defaults to ".".
        workDirSuffix (str, optional): suffix for temporary work directories. Defaults to "marshall_".
        workDirPrefix (str, optional): prefix for temporary work directories. Defaults to "_tempdir".
    """
    self.__workPath = kwargs.get("workPath", ".")
    self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
    # Fix: the prefix was mistakenly read from the "workDirPrefix" value under the
    # "workDirSuffix" key, so a caller-supplied workDirPrefix was silently ignored.
    self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
    # self.__fileU = FileUtil(workPath=self.__workPath)
    self.__ioU = IoUtil()
def __rebuildCache(self, targetUrl, mapNameL, outDirPath, rawDirPath, fmt="pickle", useCache=True):
    """Fetch the UniProt selected id mapping resource file and extract
    UniProt Acc to 'mapIndex' mapping. Serialize the mapping as required.

    Args:
        targetUrl (str): source URL of the remote index file
        mapNameL (list): list of key mapping names to extract from the index
        outDirPath (str): directory path for processed mapping files
        rawDirPath (str): directory path for the downloaded raw index file
        fmt (str, optional): output format (pickle|json|tdd). Defaults to "pickle".
        useCache (bool, optional): use cached files. Defaults to True.

    Returns:
        (list, dict): key mapping names, od[uniprotId] = mapped value(s)

        idmapping_selected.tab column order:
            1. UniProtKB-AC
            2. UniProtKB-ID
            3. GeneID (EntrezGene)
            4. RefSeq
            5. GI
            6. PDB
            7. GO
            8. UniRef100
            9. UniRef90
            10. UniRef50
            11. UniParc
            12. PIR
            13. NCBI-taxon
            14. MIM
            15. UniGene
            16. PubMed
            17. EMBL
            18. EMBL-CDS
            19. Ensembl
            20. Ensembl_TRS
            21. Ensembl_PRO
            22. Additional PubMed
    """
    startTime = time.time()
    nL = mapNameL
    oD = {}
    try:
        fileU = FileUtil()
        # Processed-map file extension tracks the serialization format.
        fExt = "pic" if fmt == "pickle" else "json"
        fExt = "tdd" if fmt == "tdd" else fExt
        fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
        mapFileName = fN + "-map." + fExt
        idMapPath = os.path.join(outDirPath, mapFileName)
        mU = MarshalUtil()
        if useCache and mU.exists(idMapPath):
            logger.info("Reading cached serialized file %r", idMapPath)
            if fmt in ["pickle", "json"]:
                tD = mU.doImport(idMapPath, fmt=fmt)
                nL = list(set(tD["idNameList"]))
                oD = tD["uniprotMapD"]
                logger.info("keys %r", list(oD.keys())[:10])
                logger.info("nL %r", nL)
                ok = True
            elif fmt == "tdd":
                # Tab-delimited cache: first row is the header (UniProtId + map names).
                ioU = IoUtil()
                it = ioU.deserializeCsvIter(idMapPath, delimiter="\t", rowFormat="list", encodingErrors="ignore")
                tL = next(it, [])
                nL = tL[1:]
                if len(nL) == 1:
                    # Single mapping -> scalar values
                    for row in it:
                        oD[row[0]] = row[1]
                else:
                    # Multiple mappings -> list of values per accession
                    for row in it:
                        oD[row[0]] = row[1:]
                ok = True
        else:
            idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
            if not fileU.exists(idPath):
                logger.info("Fetching selected UniProt idmapping data from %r in %r", targetUrl, outDirPath)
                ok = fileU.get(targetUrl, idPath)
                if not ok:
                    # Fix: typo "downlowd" and the bare `return oD` which broke the
                    # two-value (nL, oD) return contract used everywhere else.
                    logger.error("Failed to download %r", targetUrl)
                    return nL, oD
            else:
                logger.info("Using cached mapping file %r", idPath)
            # ---
            ioU = IoUtil()
            if fmt in ["pickle", "json"]:
                # __mapRecordD maps names to 1-based column numbers in idmapping_selected.tab
                if len(mapNameL) == 1:
                    for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                        oD[row[0]] = str(row[self.__mapRecordD[mapNameL[0]] - 1])
                else:
                    for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                        for mapName in mapNameL:
                            oD.setdefault(row[0], []).append(str(row[self.__mapRecordD[mapName] - 1]))
                logger.info("Writing serialized mapping file %r", idMapPath)
                ok = mU.doExport(idMapPath, {"idNameList": mapNameL, "uniprotMapD": oD}, fmt=fmt)
            elif fmt == "tdd":
                #
                logger.info("Writing serialized mapping file %r", idMapPath)
                fU = FileUtil()
                fU.mkdirForFile(idMapPath)
                colNameL = []
                colNameL.append("UniProtId")
                colNameL.extend(mapNameL)
                with open(idMapPath, "w", encoding="utf-8") as ofh:
                    ofh.write("%s\n" % "\t".join(colNameL))
                    if len(mapNameL) == 1:
                        idx = self.__mapRecordD[mapNameL[0]] - 1
                        for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                            ofh.write("%s\t%s\n" % (row[0], row[idx]))
                    else:
                        idxL = [0]
                        idxL.extend([self.__mapRecordD[mapName] - 1 for mapName in mapNameL])
                        for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                            ofh.write("%s\n" % "\t".join([str(row[idx]) for idx in idxL]))
                #
                # Re-enter with useCache=True to load the mapping just written to disk.
                nL, oD = self.__rebuildCache(targetUrl, mapNameL, outDirPath, rawDirPath, fmt=fmt, useCache=True)
            ok = True if nL and oD else False
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    return nL, oD
def setUp(self):
    """Set up shared mock-data input paths, per-test output paths, and timing."""
    self.__verbose = True
    # Common source-data roots (hoisted so each path is stated once).
    mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
    sandboxPath = os.path.join(mockDataPath, "MOCK_EXCHANGE_SANDBOX")
    # Input fixtures
    self.__pathPdbxDictionaryFile = os.path.join(mockDataPath, "dictionaries", "mmcif_pdbx_v5_next.dic")
    self.__pathJsonTestFile = os.path.join(mockDataPath, "dictionaries", "vrpt_dictmap.json")
    self.__pathIndexFile = os.path.join(sandboxPath, "update-lists", "all-pdb-list")
    self.__pathCifFile = os.path.join(mockDataPath, "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
    # Output locations under the test-output work path
    self.__workPath = os.path.join(HERE, "test-output")
    self.__pathSaveDictionaryFile = os.path.join(self.__workPath, "mmcif_pdbx_v5_next.dic")
    self.__pathSaveJsonTestFile = os.path.join(self.__workPath, "json-content.json")
    self.__pathSaveIndexFile = os.path.join(self.__workPath, "all-pdb-list")
    self.__pathSaveCifFile = os.path.join(self.__workPath, "cif-content.cif")
    self.__pathSavePickleFile = os.path.join(self.__workPath, "json-content.pic")
    self.__pathSaveTextFile = os.path.join(self.__workPath, "json-content.txt")
    #
    self.__pathInsilicoFile = os.path.join(sandboxPath, "status", "theoretical_model.tsv")
    self.__pathSaveInsilicoFile = os.path.join(self.__workPath, "saved-theoretical_model.tsv")
    # self.__pathVariantFastaFile = os.path.join(self.__mockTopPath, 'UniProt', 'uniprot_sprot_varsplic.fasta.gz')
    self.__pathFastaFile = os.path.join(sandboxPath, "sequence", "pdb_seq_prerelease.fasta")
    self.__pathSaveFastaFile = os.path.join(self.__workPath, "test-pre-release.fasta")
    #
    self.__pathTaxonomyFile = os.path.join(mockDataPath, "NCBI", "names.dmp.gz")
    self.__pathSaveTaxonomyFilePic = os.path.join(self.__workPath, "taxonomy_names.pic")
    self.__pathSaveTaxonomyFileCsv = os.path.join(self.__workPath, "taxonomy_names.csv")
    #
    self.__pathSiftsFile = os.path.join(mockDataPath, "sifts-summary", "pdb_chain_go.csv.gz")
    #
    self.__ioU = IoUtil()
    self.__startTime = time.time()
    logger.debug("Running tests on version %s", __version__)
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))