def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath, useCache):
    pfamD = {}
    fmt = "json"
    ext = fmt if fmt == "json" else "pic"
    pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
    #
    logger.debug("Using cache data path %s", dirPath)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(pfamDataPath):
        pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
        logger.debug("Pfam data length %d", len(pfamD))
    elif not useCache:
        # ------
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetPfam, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
        ok = fU.get(urlTargetPfam, fp)
        if not ok:
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
            ok = fU.get(urlTargetPfamFB, fp)
            logger.info("Fetch data fallback fetch status is %r", ok)
        pfamD = self.__getPfamIndex(fp)
        ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
        logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath, ok)
        # ------
    #
    return pfamD
def __reloadEntryIds(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
    idD = {}
    tdL = []  # initialize to guard against a failed fetch leaving tdL undefined
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    fp = os.path.join(dirPath, fn)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(fp):
        tdL = self.__mU.doImport(fp, fmt="json")
        logger.debug("Reading cached IDs list (%d)", len(tdL))
    else:
        logger.info("Fetch ID list from %s", urlTarget)
        ok = fU.get(urlTarget, fp)
        if not ok:
            ok = fU.get(urlFallbackTarget, fp)
        #
        if ok:
            tdL = self.__mU.doImport(fp, fmt="json")
    #
    for td in tdL:
        for k, v in td.items():
            try:
                idD[k] = datetime.datetime.fromisoformat(v)
            except Exception as e:
                logger.error("Date processing failing for %r %r with %s", k, v, str(e))
    #
    sTupL = sorted(idD.items(), key=lambda item: item[1])
    return {k: v for k, v in sTupL}
def __reloadGlycans(self, baseUrl, fallbackUrl, dirPath, useCache=True):
    gD = {}
    logger.debug("Using dirPath %r", dirPath)
    self.__mU.mkdir(dirPath)
    #
    myDataPath = os.path.join(dirPath, "glygen-glycan-list.json")
    if useCache and self.__mU.exists(myDataPath):
        gD = self.__mU.doImport(myDataPath, fmt="json")
        logger.debug("GlyGen glycan data length %d", len(gD))
    elif not useCache:
        logger.debug("Fetch GlyGen glycan data from primary data source %s", baseUrl)
        endPoint = os.path.join(baseUrl, "glycan_masterlist.csv")
        #
        logger.info("Fetch GlyGen glycan data from primary data source %s", endPoint)
        rawPath = os.path.join(dirPath, "glycan_masterlist.csv")
        fU = FileUtil()
        ok = fU.get(endPoint, rawPath)
        logger.debug("Fetch GlyGen glycan data status %r", ok)
        if not ok:
            endPoint = os.path.join(fallbackUrl, "glycan_masterlist.csv")
            ok = fU.get(endPoint, rawPath)
            logger.info("Fetch fallback GlyGen glycan data status %r", ok)
        #
        if ok:
            gD = self.__parseGlycanList(rawPath)
            ok = self.__mU.doExport(myDataPath, gD, fmt="json")
            logger.info("Exported GlyGen glycan list (%d) (%r) %s", len(gD), ok, myDataPath)
    #
    return gD
def __reload(self, dirPath, baseVersion, useCache, **kwargs):
    startTime = time.time()
    mU = MarshalUtil(workPath=dirPath)
    chemblDbUrl = kwargs.get("ChEMBLDbUrl", "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
    ok = False
    fU = FileUtil()
    fU.mkdir(dirPath)
    #
    # ChEMBL current version <baseVersion>,...
    # template: chembl_<baseVersion>.fa.gz
    #
    targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
    mappingFileName = "chembl_uniprot_mapping.txt"
    #
    chemblTargetPath = os.path.join(dirPath, targetFileName)
    chemblMappingPath = os.path.join(dirPath, mappingFileName)
    mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
    #
    mapD = {}
    if useCache and fU.exists(mappingFilePath):
        logger.info("useCache %r using %r and %r and %r", useCache, chemblTargetPath, chemblMappingPath, mappingFilePath)
        mapD = mU.doImport(mappingFilePath, fmt="json")
    else:
        # Get the ChEMBL UniProt mapping file
        url = os.path.join(chemblDbUrl, mappingFileName)
        ok = fU.get(url, chemblMappingPath)
        logger.info("Fetched %r url %s path %s", ok, url, chemblMappingPath)
        logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
        rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
        for row in rowL:
            mapD[row[0]] = (row[1], row[2], row[3])
        ok = mU.doExport(mappingFilePath, mapD, fmt="json")
        logger.info("Processed mapping path %s (%d) %r", mappingFilePath, len(mapD), ok)
        #
        # Get the target FASTA files --
        for vers in range(baseVersion, baseVersion + 10):
            logger.info("Now fetching version %r", vers)
            self.__version = vers
            targetFileName = "chembl_" + str(vers) + ".fa.gz"
            chemblTargetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
            url = os.path.join(chemblDbUrl, targetFileName)
            ok = fU.get(url, chemblTargetPath)
            logger.info("Fetched %r url %s path %s", ok, url, chemblTargetPath)
            if ok:
                break
    #
    logger.info("Completed reload at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return mapD
def __reloadGlycoproteins(self, baseUrl, fallbackUrl, dirPath, useCache=True):
    gD = {}
    logger.debug("Using dirPath %r", dirPath)
    self.__mU.mkdir(dirPath)
    #
    myDataPath = os.path.join(dirPath, "glygen-glycoprotein-list.json")
    if useCache and self.__mU.exists(myDataPath):
        gD = self.__mU.doImport(myDataPath, fmt="json")
        logger.debug("GlyGen glycoprotein data length %d", len(gD))
    else:
        for fn in [
            "sarscov1_protein_masterlist.csv",
            "sarscov2_protein_masterlist.csv",
            "hcv1b_protein_masterlist.csv",
            "hcv1a_protein_masterlist.csv",
            "human_protein_masterlist.csv",
            "mouse_protein_masterlist.csv",
            "rat_protein_masterlist.csv",
        ]:
            logger.debug("Fetch GlyGen glycoprotein data from primary data source %s", baseUrl)
            endPoint = os.path.join(baseUrl, fn)
            #
            logger.debug("Fetch GlyGen glycoprotein data from primary data source %s", endPoint)
            rawPath = os.path.join(dirPath, fn)
            fU = FileUtil()
            ok = fU.get(endPoint, rawPath)
            logger.debug("Fetch GlyGen glycoprotein data status %r", ok)
            if not ok:
                endPoint = os.path.join(fallbackUrl, fn)
                ok = fU.get(endPoint, rawPath)
                logger.info("Fetch fallback GlyGen data status %r", ok)
            #
            if ok:
                tD = self.__parseGlycoproteinList(rawPath)
                gD.update(tD)
        #
        ok = self.__mU.doExport(myDataPath, gD, fmt="json")
        logger.info("Exported GlyGen glycoprotein list (%d) (%r) %s", len(gD), ok, myDataPath)
    #
    return gD
def restoreDependencies(self, url, dirPath, bundleLabel="A", userName=None, pw=None):
    """Restore bundled dependencies from remote storage and unbundle these in the current local cache directory.

    Args:
        url (str): remote URL
        dirPath (str): remote directory path on the remote resource
        bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        userName (str, optional): optional access information. Defaults to None.
        pw (str, optional): optional access information. Defaults to None.
    """
    try:
        ok = False
        fileU = FileUtil()
        fn = self.__makeBundleFileName(self.__dependFileName, bundleLabel=bundleLabel)
        if not url:
            remotePath = os.path.join(dirPath, fn)
            ok = fileU.get(remotePath, self.__dependTarFilePath)
        elif url and url.startswith("http://"):
            remotePath = url + os.path.join("/", dirPath, fn)
            ok = fileU.get(remotePath, self.__dependTarFilePath)
        elif url and url.startswith("sftp://"):
            sftpU = SftpUtil()
            ok = sftpU.connect(url[7:], userName, pw=pw, port=22)
            if ok:
                remotePath = os.path.join(dirPath, fn)
                ok = sftpU.get(remotePath, self.__dependTarFilePath)
        else:
            logger.error("Unsupported protocol %r", url)
        if ok:
            ok = fileU.unbundleTarfile(self.__dependTarFilePath, dirPath=self.__cachePath)
        return ok
    except Exception as e:
        logger.exception("For %r %r Failing with %s", url, dirPath, str(e))
        ok = False
    return ok
def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
    """Restore bundled dependencies from remote storage and unbundle these in the current local cache directory.

    Args:
        localRestoreDirPath (str): local restore path
        url (str): remote URL
        remoteDirPath (str): remote directory path on the remote resource
        remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        userName (str, optional): optional access information. Defaults to None.
        password (str, optional): optional access information. Defaults to None.
    """
    try:
        ok = False
        fileU = FileUtil()
        fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
        if not url:
            remotePath = os.path.join(remoteDirPath, fn)
            if fileU.exists(remotePath):
                ok = fileU.get(remotePath, self.__localStashTarFilePath)
            else:
                ok = False
                logger.warning("Missing bundle file %r", remotePath)
        elif url and (url.startswith("http://") or url.startswith("https://")):
            remotePath = url + os.path.join("/", remoteDirPath, fn)
            ok = fileU.get(remotePath, self.__localStashTarFilePath)
        elif url and url.startswith("sftp://"):
            sftpU = SftpUtil()
            ok = sftpU.connect(url[7:], userName, pw=password, port=22)
            if ok:
                remotePath = os.path.join(remoteDirPath, fn)
                ok = sftpU.get(remotePath, self.__localStashTarFilePath)
        else:
            logger.error("Unsupported protocol %r", url)
        if ok:
            ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
        return ok
    except Exception as e:
        logger.exception("For %r %r Failing with %s", url, remoteDirPath, str(e))
        ok = False
    return ok
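
# The two restore methods above dispatch on the URL scheme (local path, http(s), or sftp).
# A minimal usage sketch, assuming an owning stash object named stashUtil and illustrative
# locations -- none of these values come from the source above:
#
# ok = stashUtil.fetchBundle(
#     localRestoreDirPath="./CACHE/example",   # where the bundle is unpacked
#     url="https://remote.example.org",        # selects the http(s) branch above
#     remoteDirPath="stash/example",           # joined with the generated bundle file name
#     remoteStashPrefix="A",
# )
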
def __fetchFromSource(self, urlTarget):
    """Fetch the classification names and domain assignments from the ECOD repo."""
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    fp = os.path.join(self.__dirPath, fn)
    if not fU.exists(fp):
        fU.get(urlTarget, fp)
    #
    with open(fp, "r", encoding="utf-8") as ifh:
        line = ifh.readline()
        line = ifh.readline()
        line = ifh.readline()
        ff = line[:-1].split()
        self.__version = ff[-1]
    #
    nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
    fU.remove(fp)
    #
    return nmL
def __fetchFromBackup(self, urlBackupPath, cathDirPath):
    fn = self.__getCathDomainFileName()
    cathDomainPath = os.path.join(cathDirPath, fn)
    self.__mU.mkdir(cathDirPath)
    #
    backupUrl = urlBackupPath + "/" + fn
    logger.info("Using backup URL %r", backupUrl)
    fU = FileUtil()
    ok = fU.get(backupUrl, cathDomainPath)
    return ok
def __pharosFixture(self):
    try:
        ok = False
        fU = FileUtil()
        srcPath = os.path.join(self.__dataPath, "Pharos")
        dstPath = os.path.join(self.__cachePath, "Pharos-targets")
        for fn in ["drug_activity", "cmpd_activity", "protein"]:
            inpPath = os.path.join(srcPath, fn + ".tdd.gz")
            outPath = os.path.join(dstPath, fn + ".tdd.gz")
            fU.get(inpPath, outPath)
            fU.uncompress(outPath, outputDir=dstPath)
            fU.remove(outPath)
        fU.put(os.path.join(srcPath, "pharos-readme.txt"), os.path.join(dstPath, "pharos-readme.txt"))
        ok = True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        ok = False
    return ok
def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
    invD = {}
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    fp = os.path.join(dirPath, fn)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(fp):
        invD = self.__mU.doImport(fp, fmt="json")
        logger.debug("Reading cached inventory (%d)", len(invD))
    else:
        logger.info("Fetch inventory from %s", urlTarget)
        ok = fU.get(urlTarget, fp)
        if not ok:
            ok = fU.get(urlFallbackTarget, fp)
        #
        if ok:
            invD = self.__mU.doImport(fp, fmt="json")
    #
    return invD
def __fetchFromBackup(self, urlBackupPath, scopDirPath):
    pyVersion = sys.version_info[0]
    fn = "scop_domains-py%s.pic" % str(pyVersion)
    scopDomainPath = os.path.join(scopDirPath, fn)
    self.__mU.mkdir(scopDirPath)
    #
    backupUrl = urlBackupPath + "/" + fn
    logger.info("Using backup URL %r", backupUrl)
    fU = FileUtil()
    ok = fU.get(backupUrl, scopDomainPath)
    return ok
def __fetchFromBackup(self, fmt="json"):
    urlTarget = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP2"
    #
    fn = self.__getAssignmentFileName(fmt=fmt)
    assignmentPath = os.path.join(self.__dirPath, fn)
    urlPath = os.path.join(urlTarget, fn)
    # Create the cache directory (not the target file path) before fetching
    self.__mU.mkdir(self.__dirPath)
    #
    logger.info("Using backup URL %r", urlPath)
    fU = FileUtil()
    ok = fU.get(urlPath, assignmentPath)
    return ok
def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
    """Rebuild the cache of ISO abbreviation term data

    Args:
        urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
        dirPath (str): cache path
        useCache (bool): flag to use cached files

    Returns:
        tuple: (dict) title word abbreviations
               (dict) language conflict dictionary
               (list) multi-word abbreviation targets

    Notes:
        ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
        https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
    """
    aD = {}
    mU = MarshalUtil(workPath=dirPath)
    fmt = "json"
    ext = fmt if fmt == "json" else "pic"
    isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
    logger.debug("Using cache data path %s", dirPath)
    mU.mkdir(dirPath)
    if not useCache:
        for fp in [isoLtwaNamePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and mU.exists(isoLtwaNamePath):
        aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
        logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
    elif not useCache:
        # ------
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
        ok = fU.get(urlTargetIsoLtwa, fp)
        aD = self.__getLtwaTerms(dirPath, fp)
        ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
        logger.debug("abbrevD keys %r", list(aD.keys()))
        logger.debug("Caching %d ISO LTWA in %s status %r", len(aD["abbrev"]), isoLtwaNamePath, ok)
    #
    abbrevD = aD["abbrev"] if "abbrev" in aD else {}
    conflictD = aD["conflicts"] if "conflicts" in aD else {}
    multiWordTermL = aD["multi_word_abbrev"] if "multi_word_abbrev" in aD else []
    #
    return abbrevD, conflictD, multiWordTermL
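
# The Notes above say the LTWA source is a tab-delimited UTF-16LE file.  A self-contained
# sketch of reading such a file with the standard csv module; the local file name is an
# assumption, and the real parsing is done by the provider's own __getLtwaTerms() helper.
import csv

with open("LTWA_20160915.txt", "r", encoding="utf-16-le", newline="") as ifh:
    for row in csv.reader(ifh, delimiter="\t"):
        # Each row typically carries a title word (or stem), its abbreviation, and language codes.
        print(row)
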
def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
    fmt = "json"
    ext = fmt if fmt == "json" else "pic"
    interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
    #
    logger.debug("Using cache data path %s", dirPath)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(interProDataPath):
        rD = self.__mU.doImport(interProDataPath, fmt=fmt)
        interProD = rD["index"]
        interProParentD = rD["parents"]
        logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
    else:
        # ------
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
        ok = fU.get(urlTargetInterPro, fp)
        if not ok:
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
            ok = fU.get(urlTargetInterProFB, fp)
            logger.info("Fetch data fallback fetch status is %r", ok)
        interProD = self.__getInterProIndex(fp)
        logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
        # ------
        logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
        ok = fU.get(urlTargetInterProParent, fp)
        if not ok:
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
            ok = fU.get(urlTargetInterProParentFB, fp)
            logger.info("Fetch data fallback fetch status is %r", ok)
        interProParentD = self.__getInterProParents(fp)
        #
        ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
    #
    return interProD, interProParentD
def __fetchUrl(self, urlTarget, dirPath, useCache=False):
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    filePath = os.path.join(dirPath, fn)
    if not (useCache and fU.exists(filePath)):
        startTime = time.time()
        ok2 = fU.get(urlTarget, filePath)
        endTime = time.time()
        if ok2:
            logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
        else:
            logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
    #
    return filePath
def __reload(self, dirPath, **kwargs):
    oD = None
    version = None
    startTime = time.time()
    useCache = kwargs.get("useCache", True)
    #
    # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2")
    cardDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data")
    ok = False
    fU = FileUtil()
    cardDumpFileName = "card-data.tar.bz2"
    cardDumpPath = os.path.join(dirPath, cardDumpFileName)
    cardDumpDirPath = os.path.join(dirPath, "dump")
    #
    fU.mkdir(dirPath)
    cardDataPath = os.path.join(dirPath, "card-select-data.json")
    #
    logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath)
    if useCache and self.__mU.exists(cardDataPath):
        qD = self.__mU.doImport(cardDataPath, fmt="json")
        version = qD["version"]
        oD = qD["data"]
    else:
        logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath)
        ok = fU.get(cardDumpUrl, cardDumpPath)
        fU.mkdir(cardDumpDirPath)
        fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath)
        fU.unbundleTarfile(os.path.join(cardDumpDirPath, cardDumpFileName[:-4]), dirPath=cardDumpDirPath)
        logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        oD, version = self.__parseCardData(os.path.join(cardDumpDirPath, "card.json"))
        tS = datetime.datetime.now().isoformat()
        qD = {"version": version, "created": tS, "data": oD}
        oD = qD["data"]
        ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3)
        logger.info("Export CARD data (%d) status %r", len(oD), ok)
    # ---
    return oD, version
def __processAppendedSections(self, appendConfigOption, cachePath, useCache=True):
    """Fetch and append configuration assets assigned to input configuration option.

    Args:
        appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
        cachePath (str): path to store cached copies of configuration assets
        useCache (bool, optional): use existing cached configuration assets. Defaults to True.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ret = True
        appendLocL = self.getList(appendConfigOption, sectionName=self.__defaultSectionName)
        logger.debug("appendLocL is %r", appendLocL)
        if appendLocL:
            cP = os.path.join(cachePath, "config")
            fU = FileUtil(workPath=cP)
            logger.debug("Fetching append sections from %r", appendLocL)
            for appendLoc in appendLocL:
                fn = fU.getFileName(appendLoc)
                fp = os.path.join(cP, fn)
                okF = True
                if not (useCache and fU.exists(fp)):
                    # get a fresh copy from source
                    okF = fU.get(appendLoc, fp)
                    logger.debug("Fetched %r to %r", appendLoc, fp)
                ok = self.appendConfig(fp)
                ret = ret and ok and okF
    except Exception as e:
        logger.exception("Failing for option %r cachePath %r with %s", appendConfigOption, cachePath, str(e))
        ret = False
    #
    if not ret:
        logger.error("Fetching appended sections failing %r", appendLocL)
    return ret
def __reload(self, urlTarget, dirPath, useCache=True): """Reload local cache of mapping resources to support validation report reader and translator. Args: urlTarget (list, str): URL for schema mapping file dirPath (str): path to the directory containing cache files useCache (bool, optional): flag to use cached files. Defaults to True. Returns: (object): instance of ValidationReportReader() """ mapD = {} # mU = MarshalUtil() fU = FileUtil() fn = fU.getFileName(urlTarget) mappingFilePath = os.path.join(dirPath, fn) mU.mkdir(dirPath) # # if not useCache: # for fp in [mappingFilePath]: # try: # os.remove(fp) # except Exception: # pass # # logger.debug("Loading validation mapping data in %s (useCache %r)", fn, useCache) if useCache and fU.exists(mappingFilePath): mapD = mU.doImport(mappingFilePath, fmt="json") else: logger.info("Fetching url %s to resource file %s", urlTarget, mappingFilePath) tS = uuid.uuid4().hex tP = os.path.join(dirPath, "._" + tS) ok = fU.get(urlTarget, tP) if ok: mapD = mU.doImport(tP, fmt="json") os.replace(tP, mappingFilePath) return mapD
def __reload(self, urlTarget, dirPath, useCache=True): """ Reload input GO OBO ontology file and return a nx graph object. ' Returns: dictionary[goId] = {'name_list': ... , 'id_list': ... 'depth_list': ... } """ goGraph = None # # mU = MarshalUtil() fU = FileUtil() fn = fU.getFileName(urlTarget) oboFilePath = os.path.join(dirPath, fn) fU.mkdir(dirPath) # if not useCache: for fp in [oboFilePath]: try: os.remove(fp) except Exception: pass # if useCache and fU.exists(oboFilePath): goGraph = obonet.read_obo(oboFilePath) else: logger.info("Fetching url %s to resource file %s", urlTarget, oboFilePath) ok = fU.get(urlTarget, oboFilePath) if ok: goGraph = obonet.read_obo(oboFilePath) if goGraph: logger.info("Reading %d nodes and %d edges", len(goGraph), goGraph.number_of_edges()) else: logger.info("Go graph construction failing") # return goGraph
def __rebuildCache(self, targetUrl, mapNameL, outDirPath, rawDirPath, fmt="pickle", useCache=True):
    """Fetch the UniProt selected id mapping resource file and extract UniProt Acc to 'mapIndex' mapping.
    Serialize the mapping as required.

    Args:
        targetUrl (str): source URL of the remote index file
        mapNameL (list): list of key mapping names to extract from the index
        outDirPath (str): directory path for processed mapping files
        rawDirPath (str): directory path for the raw downloaded mapping file
        fmt (str, optional): output format (pickle|json). Defaults to "pickle".
        useCache (bool, optional): use cached files. Defaults to True.

    Returns:
        dict: od[uniprotId] = mapped value

        idmapping_selected.tab

        1.  UniProtKB-AC
        2.  UniProtKB-ID
        3.  GeneID (EntrezGene)
        4.  RefSeq
        5.  GI
        6.  PDB
        7.  GO
        8.  UniRef100
        9.  UniRef90
        10. UniRef50
        11. UniParc
        12. PIR
        13. NCBI-taxon
        14. MIM
        15. UniGene
        16. PubMed
        17. EMBL
        18. EMBL-CDS
        19. Ensembl
        20. Ensembl_TRS
        21. Ensembl_PRO
        22. Additional PubMed
    """
    startTime = time.time()
    nL = mapNameL
    oD = {}
    try:
        fileU = FileUtil()
        fExt = "pic" if fmt == "pickle" else "json"
        fExt = "tdd" if fmt == "tdd" else fExt
        fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
        mapFileName = fN + "-map." + fExt
        idMapPath = os.path.join(outDirPath, mapFileName)
        mU = MarshalUtil()
        if useCache and mU.exists(idMapPath):
            logger.info("Reading cached serialized file %r", idMapPath)
            if fmt in ["pickle", "json"]:
                tD = mU.doImport(idMapPath, fmt=fmt)
                nL = list(set(tD["idNameList"]))
                oD = tD["uniprotMapD"]
                logger.info("keys %r", list(oD.keys())[:10])
                logger.info("nL %r", nL)
                ok = True
            elif fmt == "tdd":
                ioU = IoUtil()
                it = ioU.deserializeCsvIter(idMapPath, delimiter="\t", rowFormat="list", encodingErrors="ignore")
                tL = next(it, [])
                nL = tL[1:]
                if len(nL) == 1:
                    for row in it:
                        oD[row[0]] = row[1]
                else:
                    for row in it:
                        oD[row[0]] = row[1:]
                ok = True
        else:
            idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
            if not fileU.exists(idPath):
                logger.info("Fetching selected UniProt idmapping data from %r in %r", targetUrl, outDirPath)
                ok = fileU.get(targetUrl, idPath)
                if not ok:
                    logger.error("Failed to download %r", targetUrl)
                    return oD
            else:
                logger.info("Using cached mapping file %r", idPath)
            # ---
            ioU = IoUtil()
            if fmt in ["pickle", "json"]:
                if len(mapNameL) == 1:
                    for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                        oD[row[0]] = str(row[self.__mapRecordD[mapNameL[0]] - 1])
                else:
                    for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                        for mapName in mapNameL:
                            oD.setdefault(row[0], []).append(str(row[self.__mapRecordD[mapName] - 1]))
                logger.info("Writing serialized mapping file %r", idMapPath)
                ok = mU.doExport(idMapPath, {"idNameList": mapNameL, "uniprotMapD": oD}, fmt=fmt)
            elif fmt == "tdd":
                #
                logger.info("Writing serialized mapping file %r", idMapPath)
                fU = FileUtil()
                fU.mkdirForFile(idMapPath)
                colNameL = []
                colNameL.append("UniProtId")
                colNameL.extend(mapNameL)
                with open(idMapPath, "w", encoding="utf-8") as ofh:
                    ofh.write("%s\n" % "\t".join(colNameL))
                    if len(mapNameL) == 1:
                        idx = self.__mapRecordD[mapNameL[0]] - 1
                        for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                            ofh.write("%s\t%s\n" % (row[0], row[idx]))
                    else:
                        idxL = [0]
                        idxL.extend([self.__mapRecordD[mapName] - 1 for mapName in mapNameL])
                        for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                            ofh.write("%s\n" % "\t".join([str(row[idx]) for idx in idxL]))
                #
                nL, oD = self.__rebuildCache(targetUrl, mapNameL, outDirPath, rawDirPath, fmt=fmt, useCache=True)
            ok = True if nL and oD else False
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    return nL, oD
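
# The provider indexes rows through self.__mapRecordD using the 1-based positions in the
# docstring above.  A hypothetical reconstruction of part of that map (contents inferred
# from the column list, not taken from the provider source):
mapRecordD = {
    "UniProtKB-AC": 1,
    "UniProtKB-ID": 2,
    "GeneID (EntrezGene)": 3,
    "RefSeq": 4,
    "PDB": 6,
    "GO": 7,
    "NCBI-taxon": 13,
    "Ensembl": 19,
}
# The code subtracts 1 to convert a position to a row index, e.g. value = row[mapRecordD["PDB"] - 1].
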
def __reload(self, dirPath, useCache=False, imgtDumpUrl=None, testList=None, maxCount=None):
    imgtD = {}
    startTime = time.time()
    fU = FileUtil()
    fU.mkdir(dirPath)
    #
    imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json")
    #
    logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath)
    if useCache and self.__mU.exists(imgtDataPath):
        imgtD = self.__mU.doImport(imgtDataPath, fmt="json")
        self.__version = imgtD["version"]
    else:
        imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz"
        imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE"
        imgtDumpFileName = fU.getFileName(imgtDumpUrl)
        imgtDumpPath = os.path.join(dirPath, imgtDumpFileName)
        imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt")
        _, fn = os.path.split(imgtDumpUrl)
        imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4])
        #
        logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath)
        ok1 = fU.get(imgtDumpUrl, imgtDumpPath)
        ok2 = fU.get(imgtReadmeUrl, imgtReleasePath)
        fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath)
        logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        # ---
        readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list")
        self.__version = readmeLines[0].strip() if readmeLines else None
        logger.info("IMGT version %r", self.__version)
        # ---
        chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath, maxCount=maxCount, testList=testList)
        # ---
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        if testList:
            imgtD = {"version": self.__version, "date": tS, "chains": chainD, "raw": rawD}
        else:
            imgtD = {"version": self.__version, "date": tS, "chains": chainD}
        ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3)
        logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return imgtD
def exportFasta(self, withGaps=False):
    """
    Example:
        The IMGT/GENE-DB FASTA header contains 15 fields separated by '|':

        1. IMGT/LIGM-DB accession number(s)
        2. IMGT gene and allele name
        3. species (may be followed by an "_" and the name of the strain, breed or isolate, if defined)
        4. IMGT gene and allele functionality
        5. exon(s), region name(s), or extracted label(s)
        6. start and end positions in the IMGT/LIGM-DB accession number(s)
        7. number of nucleotides in the IMGT/LIGM-DB accession number(s)
        8. codon start, or 'NR' (not relevant) for non coding labels
        9. +n: number of nucleotides (nt) added in 5' compared to the corresponding label extracted from IMGT/LIGM-DB
        10. +n or -n: number of nucleotides (nt) added or removed in 3' compared to the corresponding label extracted from IMGT/LIGM-DB
        11. +n, -n, and/or nS: number of added, deleted, and/or substituted nucleotides to correct sequencing errors,
            or 'not corrected' if non corrected sequencing errors
        12. number of amino acids (AA): this field indicates that the sequence is in amino acids
        13. number of characters in the sequence: nt (or AA)+IMGT gaps=total
        14. partial (if it is)
        15. reverse complementary (if it is)
    """
    # --
    fU = FileUtil()
    fU.mkdir(self.__dirPath)
    if withGaps:
        imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithGaps-F+ORF+inframeP"
    else:
        imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithoutGaps-F+ORF+inframeP"
    imgtTargetFileName = fU.getFileName(imgtTargetUrl)
    rawFastaPath = os.path.join(self.__dirPath, imgtTargetFileName)
    # --
    logger.debug("Fetching url %s path %s", imgtTargetUrl, rawFastaPath)
    ok = fU.get(imgtTargetUrl, rawFastaPath)
    logger.info("Fetch status (%r) url %s path %s", ok, imgtTargetUrl, rawFastaPath)
    # --
    fastaPath = os.path.join(self.__dirPath, "imgt-reference.fa")
    taxonPath = os.path.join(self.__dirPath, "imgt-reference-taxon.tdd")
    tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=True)
    ok = tP.testCache()
    if not ok:
        tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=False)
    rawQD = self.__mU.doImport(rawFastaPath, fmt="fasta", commentStyle="default")
    oD = {}
    taxonL = []
    for queryId, sD in rawQD.items():
        qL = queryId.split("|")
        tL = qL[2].split("_")
        taxName = tL[0]
        taxVar = tL[1].replace(" ", "_") if len(tL) > 1 else None
        taxId = tP.getTaxId(taxName)
        if taxId:
            tD = {"seqId": qL[0], "imgtGene": qL[1], "functionality": qL[3], "labels": qL[4], "taxId": taxId}
            if taxVar:
                tD["taxVar"] = taxVar
            sD.update(tD)
        else:
            logger.info("Unknown taxonomy %r (taxName=%r)", queryId, taxName)
        # strings are immutable -- assign the gap-translated sequence back
        sD["sequence"] = sD["sequence"].replace(".", "-")
        seqId = ""
        cL = []
        for k, v in sD.items():
            if k in ["sequence"]:
                continue
            cL.append(str(v))
            cL.append(str(k))
        seqId = "|".join(cL)
        oD[seqId] = sD
        taxonL.append("%s\t%s" % (seqId, taxId))
    #
    ok1 = self.__mU.doExport(taxonPath, taxonL, fmt="list")
    ok2 = self.__mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
    return ok1 and ok2
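
# exportFasta() splits each FASTA description line on "|" and keeps fields 1-5 of the
# 15-field convention documented above.  A self-contained helper illustrating that split;
# the function name is hypothetical and the field mapping mirrors the indexing used above.
def splitImgtHeader(headerLine):
    """Map the leading '|'-separated IMGT/GENE-DB header fields to named values."""
    fL = headerLine.split("|")
    return {
        "accession": fL[0],      # field 1: IMGT/LIGM-DB accession number(s)
        "geneAllele": fL[1],     # field 2: IMGT gene and allele name
        "species": fL[2],        # field 3: species (optionally species_strain)
        "functionality": fL[3],  # field 4: IMGT gene and allele functionality
        "labels": fL[4],         # field 5: exon(s), region name(s), or extracted label(s)
    }
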
class DataTypeApiProvider(SingletonClass):
    """Data type application and instance information provider."""

    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Data type application and instance information provider.

        Args:
            cfgOb (object): ConfigInfo() object instance
            cachePath (str): path to hold the cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.
        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__useCache = useCache
        self.__cachePath = cachePath
        #
        self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__fileU = FileUtil()
        self.__contentDefHelper = self.__cfgOb.getHelper("CONTENT_DEF_HELPER_MODULE", sectionName=self.__configName, cfgOb=self.__cfgOb)
        self.__dirPath = os.path.join(cachePath, self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR", sectionName=self.__configName))
        self.__kwargs = kwargs
        #
        logger.debug("Leaving constructor")

    def getDataTypeInstanceApi(self, databaseName, **kwargs):
        """Return instance of DataTypeInstanceInfo().

        Args:
            databaseName (str): database name

        Returns:
            (object): Instance of DataTypeInstanceInfo()
        """
        _ = kwargs
        dataTypeInstanceLocatorPath = self.__cfgOb.getPath("INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH", sectionName=self.__configName)
        dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile(databaseName) if self.__contentDefHelper else None
        if dataTypeInstanceLocatorPath and dataTypeInstanceFile:
            loc = os.path.join(dataTypeInstanceLocatorPath, dataTypeInstanceFile)
            filePath = self.__reload(loc, self.__dirPath, useCache=self.__useCache)
            dtApi = DataTypeInstanceInfo(filePath)
        else:
            # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available.
            dtApi = DataTypeInstanceInfo(None)
            logger.debug("No data coverage available for database %s", databaseName)
        return dtApi

    def getDataTypeApplicationApi(self, appName, **kwargs):
        """Return instance of DataTypeApplicationInfo.

        Args:
            appName (str): application name (e.g., SQL, ANY)

        Returns:
            (object): Instance of DataTypeApplicationInfo()
        """
        _ = kwargs
        dataTypeApplicationLocator = self.__cfgOb.getPath("APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName)
        filePath = self.__reload(dataTypeApplicationLocator, self.__dirPath, useCache=self.__useCache)
        dtApi = DataTypeApplicationInfo(filePath, dataTyping=appName, workPath=self.__dirPath) if filePath else None
        return dtApi

    def __reload(self, urlTarget, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", urlTarget)
            ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn))
        return filePath if ok else None
class DictionaryApiProvider(SingletonClass):
    """Resource provider for dictionary APIs."""

    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.
        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload the local cache of dictionary resources.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (bool): True for success or False otherwise
        """
        #
        # verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath, self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator, cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object for the input dictionaries.

        Arguments:
            dictLocators {list str} -- list of dictionary locator paths

        Returns:
            [object] -- returns DictionaryApi() object for input dictionaries
        """
        dictFileNames = [self.__fileU.getFileName(dictLocator) for dictLocator in dictLocators]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[dictTup] if dictTup in self.__apiMap else self.__getApi(dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """Return an instance of a dictionary API for the input dictionary locator list."""
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators, self.__dirPath, useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList, consolidate=consolidate, replaceDefinition=replaceDefinition, verbose=verbose)
        return dApi
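
# A brief usage sketch for the provider above.  The locator URL and cache directory are
# placeholders; getApi() fetches (or reuses) the cached dictionary text and builds a
# DictionaryApi() instance, memoized per set of dictionary file names.
#
# dP = DictionaryApiProvider(dirPath="./CACHE/dictionaries", useCache=True)
# dApi = dP.getApi(["https://example.org/dictionaries/mmcif_pdbx_v5_next.dic"])
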
def fetchPartitionedBundle(self, localRestoreDirPath, gitRepositoryPath, gitRawHost="raw.githubusercontent.com", gitBranch="master", remoteStashPrefix="A"):
    """Fetch bundle from a remote stash public git repository via http.

    Args:
        localRestoreDirPath (str): local restore path
        gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
        gitRawHost (str, optional): git raw content host name. Defaults to raw.githubusercontent.com.
        gitBranch (str, optional): git branch name. Defaults to master.
        remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')

    Returns:
        bool: True for success or False otherwise

        https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets_stash/master/stash/<file_or_dir>
    """
    try:
        ok = False
        fileU = FileUtil()
        bundleFileName = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
        urlBase = "https://" + gitRawHost
        rp = gitRepositoryPath[:-4] if gitRepositoryPath.endswith(".git") else gitRepositoryPath
        repoDirPath = os.path.join(urlBase, rp, gitBranch, "stash")
        # First fetch the manifest file
        remoteDirPath = os.path.join(repoDirPath, bundleFileName[:-7])
        remotePath = os.path.join(remoteDirPath, "MANIFEST")
        logger.debug("Manifest remote %r", remotePath)
        #
        localDirPath = os.path.join(self.__localBundlePath, bundleFileName[:-7])
        manifestPath = os.path.join(localDirPath, "MANIFEST")
        ok = fileU.get(remotePath, manifestPath)
        if not ok:
            logger.error("No manifest file at %r", remotePath)
            return ok
        # ---
        partFileName = "part_1"
        remotePartPath = os.path.join(repoDirPath, bundleFileName[:-7], partFileName)
        logger.debug("remotePartPath %r", remotePartPath)
        # ---
        partList = []
        with open(manifestPath, "r") as mfh:
            line = mfh.readline()
            tf, myHash = line[:-1].split("\t")
            logger.debug("Fetched manifest for %s hash %r", tf, myHash)
            for line in mfh:
                partList.append(line[:-1])
        #
        logger.debug("Parts (%d) %r", len(partList), partList)
        for part in partList:
            localPath = os.path.join(localDirPath, part)
            remotePath = os.path.join(repoDirPath, bundleFileName[:-7], part)
            logger.debug("%r %r", remotePath, localPath)
            fileU.get(remotePath, localPath)
        #
        sj = SplitJoin()
        ok = sj.join(self.__localStashTarFilePath, localDirPath)
        if ok:
            ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
        return ok
    except Exception as e:
        logger.exception("Failing for %r with %s", bundleFileName, str(e))
        ok = False
    return ok
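
# The raw-content URL assembled above follows the template shown in the docstring.  A
# self-contained sketch of the same construction; repository, branch, and bundle names
# are examples only.
import os

gitRawHost = "raw.githubusercontent.com"
gitRepositoryPath = "rcsb/py-rcsb_exdb_assets_stash"  # example repository
gitBranch = "master"
bundleFileName = "A-example-bundle.tar.gz"  # example bundle artifact name ('.tar.gz' suffix assumed)

rp = gitRepositoryPath[:-4] if gitRepositoryPath.endswith(".git") else gitRepositoryPath
repoDirPath = os.path.join("https://" + gitRawHost, rp, gitBranch, "stash")
manifestUrl = os.path.join(repoDirPath, bundleFileName[:-7], "MANIFEST")
# -> https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets_stash/master/stash/A-example-bundle/MANIFEST
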
class FileUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic")
        self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data", "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_MODBASE_MODELS", "NP_001030614.1_1.pdb.xz")
        #
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testTarBundling(self):
        """Test case for tarfile bundling and unbundling"""
        try:
            tP = os.path.join(self.__workPath, "t0.tar.gz")
            dirPath = os.path.join(self.__inpDirPath, "topdir")
            ok = self.__fileU.bundleTarfile(tP, [dirPath], mode="w:gz", recursive=True)
            self.assertTrue(ok)
            numBytes = self.__fileU.size(tP)
            self.assertGreaterEqual(numBytes, 250)
            #
            md5 = self.__fileU.hash(tP, hashType="md5")
            self.assertTrue(md5 is not None)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)
            #
            tP = os.path.join(self.__workPath, "t1.tar.gz")
            dirPathList = [os.path.join(self.__inpDirPath, "topdir", "subdirA"), os.path.join(self.__inpDirPath, "topdir", "subdirB")]
            ok = self.__fileU.bundleTarfile(tP, dirPathList, mode="w:gz", recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

            tP = os.path.join(self.__workPath, "t2.tar")
            dirPathList = [os.path.join(self.__inpDirPath, "topdir", "subdirA"), os.path.join(self.__inpDirPath, "topdir", "subdirB")]
            ok = self.__fileU.bundleTarfile(tP, dirPathList, mode="w", recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testGetFile(self):
        """Test case for local files and directories"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            dPath = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(";lakdjf")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMoveAndCopyFile(self):
        """Test case for copying ("put") and moving ("replace") local files"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            # Test copy file
            dPath2 = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath2)
            self.assertTrue(ok)
            lPath2 = os.path.join(dPath2, fn)
            ok = self.__fileU.put(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Remove copied file (to test moving file next)
            ok = self.__fileU.remove(lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertFalse(ok)
            # Test move file
            ok = self.__fileU.replace(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertFalse(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Now clean up files and dirs
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath2)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testZipUrl(self):
        """Test case for downloading remote zip file and extracting contents."""
        try:
            remoteLocator = self.__zipFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath, self.__fileU.getFileName(self.__zipFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith("Food_Display_Table.xlsx")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFtpUrl(self):
        """Test case for downloading remote file via ftp protocol and extracting contents."""
        try:
            remoteLocator = self.__ftpFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            dirPath = os.path.join(self.__workPath, "chem_comp_models")
            lPath = os.path.join(dirPath, self.__fileU.getFileName(self.__ftpFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=dirPath)
            ok = fp.endswith("chem_comp_model.cif")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRemote(self):
        """Test case for remote status"""
        try:
            remoteLocator = self.__httpsFileUrl
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            ok = self.__fileU.exists(remoteLocator)
            self.assertTrue(ok)
            size = self.__fileU.size(remoteLocator)
            self.assertGreaterEqual(size, 1000)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("DrugBank example -- skipping")
    def testGetDrugBankUrl(self):
        """Test case for downloading drugbank master xml file"""
        try:
            remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database"
            un = "username"
            pw = "password"
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath, "db-download.zip")
            ok = self.__fileU.get(remoteLocator, lPath, username=un, password=pw)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            self.__fileU.uncompress(lPath, outputDir=self.__workPath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testXzFile(self):
        """Test case for extracting contents from xz file"""
        try:
            remoteLocator = self.__xzFile
            fn = self.__fileU.getFileName(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith(".pdb")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class CODModelSearch(object):
    def __init__(self, cachePath, **kwargs):
        self.__cachePath = cachePath
        #
        self.__useCache = kwargs.get("useCache", True)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__descriptorUrlTarget = kwargs.get("descriptorUrlTarget", "http://www.crystallography.net/cod/smi/allcod.smi")
        self.__prefix = kwargs.get("prefix", None)
        self.__numProc = kwargs.get("numProc", 4)
        self.__chunkSize = kwargs.get("chunkSize", 50)
        self.__ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        self.__fU = FileUtil()
        # self.__ccmG = ChemCompModelGen(self.__cachePath, self.__prefix)

    def getResultIndex(self):
        mU = MarshalUtil(workPath=self.__cachePath)
        cD = mU.doImport(self.getResultFilePath(), fmt="json")
        return cD

    def getResultDetails(self, codId):
        mU = MarshalUtil(workPath=self.__cachePath)
        dD = mU.doImport(self.__getCodDetailsFilePath(codId), fmt="json")
        return dD

    def storeResultIndex(self, cD):
        mU = MarshalUtil(workPath=self.__cachePath)
        ok = mU.doExport(self.getResultFilePath(), cD, fmt="json", indent=3)
        return ok

    def getResultDirFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files"
        return os.path.join(self.__cachePath, dN)

    def getRawResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files"
        return os.path.join(self.__cachePath, dN, "cod-raw-result-file-index.json")

    def getResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files"
        return os.path.join(self.__cachePath, dN, "cod-result-file-index.json")

    def getDescriptorPath(self):
        fn = self.__fU.getFileName(self.__descriptorUrlTarget)
        dirPath = self.getResultDirFilePath()
        filePath = os.path.join(dirPath, fn)
        return filePath

    def updateDescriptors(self):
        self.__fetchUrl(self.__descriptorUrlTarget, filePath=self.getDescriptorPath(), useCache=False)

    def __fetchUrl(self, urlTarget, filePath, useCache=False, noRetry=False):
        ok = False
        try:
            if not (useCache and self.__fU.exists(filePath)):
                startTime = time.time()
                ok = self.__fU.get(urlTarget, filePath, noRetry=noRetry)
                endTime = time.time()
                if ok:
                    logger.debug("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok, endTime - startTime)
                else:
                    logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok, endTime - startTime)
            else:
                ok = True
                logger.debug("Using cached data for %s", urlTarget)
            #
        except Exception as e:
            logger.exception("Failing for %r with %s", urlTarget, str(e))
        return ok

    def search(self, molLimit=None):
        try:
            bsw = BatchChemSearch(
                useCache=self.__useCache,
                ccUrlTarget=self.__ccUrlTarget,
                birdUrlTarget=self.__birdUrlTarget,
                ccFileNamePrefix=self.__ccFileNamePrefix,
                cachePath=self.__cachePath,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
            )
            smiPath = self.getDescriptorPath()
            smiL = bsw.fetchDescriptorList(smiPath, swap=True)
            logger.info("Query length (%d)", len(smiL))
            #
            smiL = bsw.splitSmiles(smiL)
            retL = bsw.doQuery(smiL[:molLimit], "SMILES", matchOpts="graph-exact")
            logger.info("Result length (%d)", len(retL))
            #
            for ii, ret in enumerate(retL, 1):
                logger.debug("%5d %8s %4s (%.3f) %s: %s", ii, ret.queryId, ret.ccId, ret.fpScore, ret.queryType, ret.query)
            #
            fp = self.getRawResultFilePath()
            ok = bsw.storeMatchList(fp, retL)
            return len(retL) if ok else 0
        except Exception as e:
            logger.exception("Failing with %s", str(e))

    def __getSearchResults(self):
        """Read search results and convert to a chemical component dictionary."""
        fp = self.getRawResultFilePath()
        mU = MarshalUtil(workPath=self.__cachePath)
        rawL = mU.doImport(fp, fmt="json")
        rD = {}
        for cD in rawL:
            rD.setdefault(cD["ccId"], []).append(cD)
        return rD

    def __getCodEntryUrl(self, codId):
        # Template Examples:
        # https://molecules.crystallography.net/cod/sdf/1/00/00/1000098.sdf
        # https://molecules.crystallography.net/cod/sdf/6/00/05/6000557.sdf
        #
        baseUrl = "https://molecules.crystallography.net/cod/sdf"
        url = os.path.join(baseUrl, codId[0:1], codId[1:3], codId[3:5], codId + ".sdf")
        return url

    def __getCodDetailsUrl(self, codId):
        baseUrl = "http://www.crystallography.net/cod/optimade/structures"
        url = os.path.join(baseUrl, codId)
        return url

    def __getCodDetailsFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3], codId[3:5], codId + ".json")
        return fp

    def __getCodEntryFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3], codId[3:5], codId + ".sdf")
        return fp

    def fetchMatchedData(self, useCache=True):
        """Fetch COD matched entries and metadata and update the raw search index with essential COD data attributes.

        Args:
            useCache (bool, optional): use any cached COD data. Defaults to True.

        Returns:
            int: search result count
        """
        eCount = 0
        eSkip = 0
        rcD = {}
        cD = self.__getSearchResults()
        #
        for ccId, qDL in cD.items():
            # cifPath = self.__ccmG.getChemCompPath(ccId)
            # if not cifPath:
            #     logger.info("No CIF for %s skipping", ccId)
            #     continue
            parentId = ccId.split("|")[0]
            rqDL = []
            for qD in qDL:
                codId = qD["queryId"]
                codEntryFilePath = self.__getCodEntryFilePath(codId)
                codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId), self.__getCodEntryFilePath(codId), useCache=useCache, noRetry=True)
                ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId), self.__getCodDetailsFilePath(codId), useCache=useCache, noRetry=True)
                tD = self.getResultDetails(codId)
                dD = tD["data"]["attributes"] if "data" in tD and "attributes" in tD["data"] else {}
                mD = tD["meta"]["implementation"] if "meta" in tD and "implementation" in tD["meta"] else {}
                if ok1 & ok2:
                    logger.info("Fetched COD entry and details for %s (%r)", codId, ok1 & ok2)
                    eCount += 1
                    qD["codEntryFilePath"] = codEntryFilePath
                    qD["codDetailsFilePath"] = codDetailsFilePath
                    # qD["cifPath"] = cifPath
                    qD["parentId"] = parentId
                    qD["chemicalName"] = dD["_cod_commonname"] if "_cod_commonname" in dD else None
                    qD["chemicalName"] = dD["_cod_chemname"] if "_cod_chemname" in dD else qD["chemicalName"]
                    qD["rValue"] = dD["_cod_Robs"] if "_cod_Robs" in dD else None
                    qD["diffrnTemp"] = dD["_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                    qD["radiationSource"] = dD["_cod_radType"] if "_cod_radType" in dD else None
                    qD["publicationDOI"] = dD["_cod_doi"] if "_cod_doi" in dD else None
                    qD["version"] = mD["version"] if "version" in mD else None
                    qD["hasDisorder"] = "N"
                    rqDL.append(qD)
                else:
                    logger.info("Skipping entry missing data for %r at %r", codId, self.__getCodEntryUrl(codId))
                    eSkip += 1
            if rqDL:
                rcD[ccId] = rqDL
        #
        ok = self.storeResultIndex(rcD)
        logger.info("Final match result (w/sdf and metadata) (%d/%d) cod hits (%d) skipped (%d)", len(rcD), len(cD), eCount, eSkip)
        return eCount if ok else 0

    def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True):
        rcD = {}
        cD = self.__getSearchResults()
        idList = list(cD.keys())
        # ---
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(self.__cachePath)
        mpu.setOptions(optionsD={"resultPath": self.__cachePath, "cD": cD, "useCache": useCache})
        mpu.set(workerObj=self, workerMethod="fetchDataWorker")
        ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
        for rTup in resultList[0]:
            rcD[rTup[0]] = rTup[1]
        # ---
        ok = self.storeResultIndex(rcD)
        logger.info("Final match result (w/sdf and metadata) (%d/%d)", len(rcD), len(cD))
        return True

    def fetchDataWorker(self, dataList, procName, optionsD, workingDir):
        """Worker method to fetch COD data for matched entries

        Args:
            dataList (list): list of mol2 file paths to be searched
            procName (str): processName
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of mol2 paths with CCDC matches
        """
        resultPath = optionsD["resultPath"]
        cD = optionsD["cD"]
        useCache = optionsD["useCache"]
        _ = workingDir
        resultList = []
        successList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName, time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        eCount = 0
        eSkip = 0
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s starting search data length %d", procName, len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return resultList, resultList, []
            #
            # for ccId, qDL in cD.items():
            for ccId in dataList:
                if ccId in cD:
                    qDL = cD[ccId]
                    #
                    parentId = ccId.split("|")[0]
                    rqDL = []
                    for qD in qDL:
                        codId = qD["queryId"]
                        codEntryFilePath = self.__getCodEntryFilePath(codId)
                        codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                        ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId), self.__getCodEntryFilePath(codId), useCache=useCache, noRetry=True)
                        ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId), self.__getCodDetailsFilePath(codId), useCache=useCache, noRetry=True)
                        tD = self.getResultDetails(codId)
                        dD = tD["data"]["attributes"] if "data" in tD and "attributes" in tD["data"] else {}
                        mD = tD["meta"]["implementation"] if "meta" in tD and "implementation" in tD["meta"] else {}
                        if ok1 & ok2:
                            logger.info("Fetched COD entry and details for %s (%r)", codId, ok1 & ok2)
                            eCount += 1
                            qD["codEntryFilePath"] = codEntryFilePath
                            qD["codDetailsFilePath"] = codDetailsFilePath
                            # qD["cifPath"] = cifPath
                            qD["parentId"] = parentId
                            qD["chemicalName"] = dD["_cod_commonname"] if "_cod_commonname" in dD else None
                            qD["chemicalName"] = dD["_cod_chemname"] if "_cod_chemname" in dD else qD["chemicalName"]
                            qD["rValue"] = dD["_cod_Robs"] if "_cod_Robs" in dD else None
                            qD["diffrnTemp"] = dD["_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                            qD["radiationSource"] = dD["_cod_radType"] if "_cod_radType" in dD else None
                            qD["publicationDOI"] = dD["_cod_doi"] if "_cod_doi" in dD else None
                            qD["version"] = mD["version"] if "version" in mD else None
                            qD["hasDisorder"] = "N"
                            rqDL.append(qD)
                        else:
                            logger.info("Skipping entry missing data for %r at %r", codId, self.__getCodEntryUrl(codId))
                            eSkip += 1
                    if rqDL:
                        resultList.append((ccId, rqDL))
                        successList.append(ccId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info(
            "%s (entries %d skipped %d) (ccId result length %d) completed at %s (%.2f seconds)",
            procName,
            eCount,
            eSkip,
            len(successList),
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            endTime - startTime,
        )
        return successList, resultList, []

    def __checkStop(self, path):
        try:
            if os.access(path, os.F_OK):
                return True
        except Exception:
            pass
        return False
class SchemaProvider(SingletonClass): """ A collection of schema build and caching methods. Static cache worflow: <authorative source> <-- <cache dir> <- client API Compute workflow: <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <Json schema> """ def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs): """A collection of schema build and caching methods. Args: cfgOb (object): ConfigInfo() instance cachePath (str): path to directory containing schema useCache (bool, optional): use cached schema. Defaults to True. rebuildFlag (bool, optional): on-the-fly rebuild and cache schema """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__cachePath = os.path.abspath(cachePath) self.__useCache = useCache self.__rebuildFlag = rebuildFlag self.__useCache = rebuildFlag if rebuildFlag else useCache # self.__workPath = os.path.join(self.__cachePath, "work") self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work")) self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName)) self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName)) self.__fileU.mkdir(self.__schemaCachePath) self.__fileU.mkdir(self.__jsonSchemaCachePath) self.__kwargs = kwargs def getSchemaOptions(self, schemaLevel, extraOpts=None): opts = extraOpts + "|" if extraOpts else "" if schemaLevel == "full": return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb" elif schemaLevel in ["min", "minimum"]: return opts + "mandatoryKeys|enums|rcsb" else: return opts def getSchemaInfo(self, databaseName, dataTyping="ANY"): """Convenience method to return essential schema details for the input repository content type. Args: databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...) dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...) 
    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list
        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)

            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)
        except Exception as e:
            logger.exception("Retrieving schema %s for %s failing with %s", databaseName, dataTyping, str(e))
        return sd, dbName, collectionNameList, docIndexD

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare the computed schema definition with the current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for the schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath
    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare the computed JSON schema definition with the current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        """Extract the schema version from the "$comment" element of the input JSON schema."""
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning the schema definition path for the input content type and application.
        Defines the schema definition naming convention -

        Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            str: schema definition file locator
        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retrieving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator
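    # --- Illustrative sketch (kept as comments inside the class) ---
    # __getSchemaDefLocator() composes "schema_def-<databaseName>-<DATATYPING>.json" under the
    # configured SCHEMA_DEFINITION_LOCATOR_PATH.  With a hypothetical database name:
    #
    #   # self.__getSchemaDefLocator("pdbx_core", dataTyping="ANY")
    #   # -> os.path.join(<SCHEMA_DEFINITION_LOCATOR_PATH>, "schema_def-pdbx_core-ANY.json")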
    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning the JSON schema path for the input collection, data type convention and level.
        Defines the JSON/BSON schema naming convention -

        Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in the document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): completeness of the schema (e.g. min or full)

        Returns:
            str: schema file locator
        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimum"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options: %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retrieving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)
        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return the JSON schema (w/ BSON types) object for the input collection and level.

        Args:
            databaseName (str): database name
            collectionName (str): collection name in the document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): completeness of the schema (e.g. min or full)

        Returns:
            dict: schema object
        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        cD = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            #
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD
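    # --- Illustrative sketch (kept as comments inside the class) ---
    # __getJsonSchemaLocator() composes "<type>-<level>-db-<databaseName>-col-<collectionName>.json"
    # under the configured JSON_SCHEMA_DEFINITION_LOCATOR_PATH, and getJsonSchema() resolves and
    # imports that file.  With hypothetical database/collection names:
    #
    #   # self.__getJsonSchemaLocator("pdbx_core", "pdbx_core_entry", encodingType="BSON", level="full")
    #   # -> os.path.join(<JSON_SCHEMA_DEFINITION_LOCATOR_PATH>, "bson-full-db-pdbx_core-col-pdbx_core_entry.json")
    #   # sObj = self.getJsonSchema("pdbx_core", "pdbx_core_entry", encodingType="BSON", level="full")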
    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """Compute the difference of nested dictionaries."""
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD

    def __flatten(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict
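    # --- Illustrative sketch (kept as comments inside the class) ---
    # __flatten() converts a nested dictionary into a single-level dictionary keyed by tuples
    # of path components, with list members indexed from 1.  For a small hypothetical input:
    #
    #   inpD = {"a": {"b": 1, "c": [2, 3]}, "d": "x"}
    #   # self.__flatten(inpD) ->
    #   #   {("a", "b"): 1,
    #   #    ("a", "c", "1"): [2, 3],
    #   #    ("a", "c", "2"): [2, 3],
    #   #    ("d",): "x"}
    #   # (note: for non-dict list members the whole list is stored at each indexed key,
    #   #  which is what the code above does)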
    def __flattenX(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        # separator = "."
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, (list, tuple)):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict
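# --- Illustrative sketch (not part of the source) ---
# A standalone rendering of the set-algebra classification used by schemaCompare() above,
# applied to two already-flattened dictionaries.  The function name and sample data are
# hypothetical; the real method additionally flattens its inputs and dot-joins the tuple keys.


def _classifyDiff(orgD, newD):
    """Return (addedKeys, removedKeys, changedKeys) for two flat dictionaries."""
    added = sorted(set(newD) - set(orgD))
    removed = sorted(set(orgD) - set(newD))
    changed = sorted(k for k in set(orgD) & set(newD) if orgD[k] != newD[k])
    return added, removed, changed


# _classifyDiff({"a": 1, "b": 2}, {"b": 3, "c": 4}) -> (["c"], ["a"], ["b"])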
class ProvenanceProvider(SingletonClass):
    """Utilities to access and update provenance details."""

    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Utilities to access and update provenance details.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing the provenance cache
            useCache (bool, optional): use cached provenance data. Defaults to True.
        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__useCache = useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__provenanceCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR", sectionName=self.__configName))
        self.__provenanceLocator = self.__cfgOb.getPath("PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__fileU.mkdir(self.__provenanceCachePath)
        self.__kwargs = kwargs

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", locator)
            ok = self.__fileU.get(locator, filePath)
        return filePath if ok else None

    def fetch(self):
        """Return the cached provenance dictionary."""
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            return mU.doImport(provenanceFileCachePath, fmt="json")
        except Exception as e:
            logger.exception("Failed retrieving provenance with %s", str(e))
        return {}

    def update(self, provD):
        """Update the cached provenance dictionary with the input dictionary."""
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            tD = mU.doImport(provenanceFileCachePath, fmt="json")
            tD.update(provD)
            ok = mU.doExport(provenanceFileCachePath, tD, fmt="json")
        except Exception as e:
            logger.exception("Failed updating provenance with %s", str(e))
        return ok

    def store(self, provD):
        """Replace the cached provenance dictionary with the input dictionary."""
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            ok = mU.doExport(provenanceFileCachePath, provD, fmt="json")
        except Exception as e:
            logger.exception("Failed storing provenance with %s", str(e))
        return ok
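# --- Illustrative sketch (not part of the source) ---
# The update() method above follows a simple read-merge-write pattern over a JSON cache file.
# A minimal standalone equivalent using the standard json module (MarshalUtil and the
# configured locator are replaced here by a plain file path; the helper name is hypothetical).


def _updateJsonCache(cacheFilePath, updateD):
    """Merge updateD into the JSON dictionary stored at cacheFilePath and rewrite the file."""
    import json  # local import keeps this illustrative helper self-contained

    try:
        with open(cacheFilePath, "r", encoding="utf-8") as ifh:
            curD = json.load(ifh)
    except FileNotFoundError:
        curD = {}
    curD.update(updateD)
    with open(cacheFilePath, "w", encoding="utf-8") as ofh:
        json.dump(curD, ofh, indent=3)
    return True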