def __reload(self, dirPath, baseVersion, useCache, **kwargs):
    startTime = time.time()
    mU = MarshalUtil(workPath=dirPath)
    chemblDbUrl = kwargs.get("ChEMBLDbUrl", "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
    ok = False
    fU = FileUtil()
    fU.mkdir(dirPath)
    #
    # ChEMBL current version <baseVersion>,...
    # template: chembl_<baseVersion>.fa.gz
    #
    targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
    mappingFileName = "chembl_uniprot_mapping.txt"
    #
    chemblTargetPath = os.path.join(dirPath, targetFileName)
    chemblMappingPath = os.path.join(dirPath, mappingFileName)
    mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
    #
    mapD = {}
    if useCache and fU.exists(mappingFilePath):
        logger.info("useCache %r using %r and %r and %r", useCache, chemblTargetPath, chemblMappingPath, mappingFilePath)
        mapD = mU.doImport(mappingFilePath, fmt="json")
    else:
        # Get the ChEMBL UniProt mapping file
        url = os.path.join(chemblDbUrl, mappingFileName)
        ok = fU.get(url, chemblMappingPath)
        logger.info("Fetched %r url %s path %s", ok, url, chemblMappingPath)
        logger.info("Reading ChEMBL mapping file path %s", chemblMappingPath)
        rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
        for row in rowL:
            mapD[row[0]] = (row[1], row[2], row[3])
        ok = mU.doExport(mappingFilePath, mapD, fmt="json")
        logger.info("Processed mapping path %s (%d) %r", mappingFilePath, len(mapD), ok)
        #
        # Get the target FASTA files --
        for vers in range(baseVersion, baseVersion + 10):
            logger.info("Now fetching version %r", vers)
            self.__version = vers
            targetFileName = "chembl_" + str(vers) + ".fa.gz"
            chemblTargetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
            url = os.path.join(chemblDbUrl, targetFileName)
            ok = fU.get(url, chemblTargetPath)
            logger.info("Fetched %r url %s path %s", ok, url, chemblTargetPath)
            if ok:
                break
    #
    logger.info("Completed reload at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return mapD
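# A standalone sketch of the release-probing pattern used above: try successive ChEMBL release
# numbers until a target FASTA archive can be fetched. FileUtil is the same helper used in this
# module; the import path, base version, and local file name are illustrative assumptions.
import os

from rcsb.utils.io.FileUtil import FileUtil


def probeLatestChemblFasta(dirPath, baseVersion=30, maxTries=10):
    """Return (version, localPath) for the first fetchable ChEMBL target FASTA, or (None, None)."""
    chemblDbUrl = "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/"
    fU = FileUtil()
    fU.mkdir(dirPath)
    for vers in range(baseVersion, baseVersion + maxTries):
        localPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
        if fU.get(os.path.join(chemblDbUrl, "chembl_%d.fa.gz" % vers), localPath):
            return vers, localPath
    return None, None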
def pushBundle(self, gitRepositoryPath, accessToken, gitHost="github.com", gitBranch="master", remoteStashPrefix="A", maxSizeMB=95):
    """Push bundle to remote stash git repository.

    Args:
        gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
        accessToken (str): git repository access token
        gitHost (str, optional): git repository host name. Defaults to github.com.
        gitBranch (str, optional): git branch name. Defaults to master.
        remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        maxSizeMB (int, optional): maximum stash bundle file size that will be committed. Defaults to 95MB.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ok = False
        gU = GitUtil(token=accessToken, repositoryHost=gitHost)
        fU = FileUtil()
        localRepositoryPath = os.path.join(self.__localBundlePath, "stash_repository")
        fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
        #
        # Update existing local repository, otherwise clone a new copy
        if fU.exists(localRepositoryPath):
            ok = gU.pull(localRepositoryPath, branch=gitBranch)
            logger.debug("After pull status %r", gU.status(localRepositoryPath))
        else:
            ok = gU.clone(gitRepositoryPath, localRepositoryPath, branch=gitBranch)
        #
        # Split all bundles
        mbSize = float(fU.size(self.__localStashTarFilePath)) / 1000000.0
        logger.info("Splitting bundle %r (%.3f MB/Max %d MB)", fn, mbSize, maxSizeMB)
        sj = SplitJoin()
        splitDirPath = os.path.join(localRepositoryPath, "stash", fn[:-7])
        sj.split(self.__localStashTarFilePath, splitDirPath, maxSizeMB=maxSizeMB)
        fU.remove(self.__localStashTarFilePath)
        # else:
        #     fU.put(self.__localStashTarFilePath, os.path.join(localRepositoryPath, "stash", fn))
        ok = gU.addAll(localRepositoryPath, branch=gitBranch)
        ok = gU.commit(localRepositoryPath, branch=gitBranch)
        logger.debug("After commit status %r", gU.status(localRepositoryPath))
        #
        if accessToken:
            ok = gU.push(localRepositoryPath, branch=gitBranch)
            logger.info("After push status %r", gU.status(localRepositoryPath))
        #
        return ok
    except Exception as e:
        logger.exception("For %r %r failing with %s", gitHost, gitRepositoryPath, str(e))
        return False
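# A small sketch of the size rule that motivates the split step above: hosted git services
# commonly reject files approaching 100 MB, so an oversized bundle is split into parts of at
# most maxSizeMB before committing. This is the conditional variant of the commented-out put()
# branch above; the helper name and paths are illustrative, and the SplitJoin import path is assumed.
import os

from rcsb.utils.io.FileUtil import FileUtil
from rcsb.utils.io.SplitJoin import SplitJoin


def stageBundleForCommit(bundleFilePath, repoStashDirPath, maxSizeMB=95):
    """Copy the bundle into the repository, splitting it first when it exceeds maxSizeMB."""
    fU = FileUtil()
    fn = os.path.basename(bundleFilePath)
    mbSize = float(fU.size(bundleFilePath)) / 1000000.0
    if mbSize > maxSizeMB:
        splitDirPath = os.path.join(repoStashDirPath, fn[:-7])  # strip ".tar.gz" as above
        SplitJoin().split(bundleFilePath, splitDirPath, maxSizeMB=maxSizeMB)
        return splitDirPath
    targetPath = os.path.join(repoStashDirPath, fn)
    fU.put(bundleFilePath, targetPath)
    return targetPath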
def __fetchUrl(self, urlTarget, dirPath, useCache=False):
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    filePath = os.path.join(dirPath, fn)
    if not (useCache and fU.exists(filePath)):
        startTime = time.time()
        ok2 = fU.get(urlTarget, filePath)
        endTime = time.time()
        if ok2:
            logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
        else:
            logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
    #
    return filePath
def __reload(self, dirPath, useCache):
    startTime = time.time()
    aD = {}
    fU = FileUtil()
    fU.mkdir(dirPath)
    targetMechanismFilePath = self.getTargetMechanismDataPath()
    #
    if useCache and fU.exists(targetMechanismFilePath):
        logger.info("useCache %r using %r", useCache, targetMechanismFilePath)
        qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
        aD = qD["mechanism"] if "mechanism" in qD else {}
    #
    logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return aD
def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
    """Restore bundled dependencies from remote storage and unbundle these in the current local cache directory.

    Args:
        localRestoreDirPath (str): local restore path
        url (str): remote URL
        remoteDirPath (str): remote directory path on the remote resource
        remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        userName (str, optional): optional access information. Defaults to None.
        password (str, optional): optional access information. Defaults to None.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ok = False
        fileU = FileUtil()
        fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
        if not url:
            remotePath = os.path.join(remoteDirPath, fn)
            if fileU.exists(remotePath):
                ok = fileU.get(remotePath, self.__localStashTarFilePath)
            else:
                ok = False
                logger.warning("Missing bundle file %r", remotePath)
        elif url and (url.startswith("http://") or url.startswith("https://")):
            remotePath = url + os.path.join("/", remoteDirPath, fn)
            ok = fileU.get(remotePath, self.__localStashTarFilePath)
        elif url and url.startswith("sftp://"):
            sftpU = SftpUtil()
            ok = sftpU.connect(url[7:], userName, pw=password, port=22)
            if ok:
                remotePath = os.path.join(remoteDirPath, fn)
                ok = sftpU.get(remotePath, self.__localStashTarFilePath)
        else:
            logger.error("Unsupported protocol %r", url)
        if ok:
            ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
        return ok
    except Exception as e:
        logger.exception("For %r %r Failing with %s", url, remoteDirPath, str(e))
        ok = False
    return ok
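# A hedged sketch of the protocol dispatch used by fetchBundle() above: HTTP(S) locators go
# through FileUtil.get(), "sftp://" locators through SftpUtil. Only the connect()/get()
# signatures appearing above are used; the SftpUtil import path, host, credentials, and paths
# are placeholders or assumptions.
import os

from rcsb.utils.io.FileUtil import FileUtil
from rcsb.utils.io.SftpUtil import SftpUtil


def fetchRemoteArtifact(url, remoteDirPath, fileName, localPath, userName=None, password=None):
    """Fetch remoteDirPath/fileName from url into localPath, returning True on success."""
    ok = False
    if url.startswith("http://") or url.startswith("https://"):
        ok = FileUtil().get(url + os.path.join("/", remoteDirPath, fileName), localPath)
    elif url.startswith("sftp://"):
        sftpU = SftpUtil()
        if sftpU.connect(url[7:], userName, pw=password, port=22):
            ok = sftpU.get(os.path.join(remoteDirPath, fileName), localPath)
    return ok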
def __fetchFromSource(self, urlTarget):
    """Fetch the classification names and domain assignments from the ECOD repo."""
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    fp = os.path.join(self.__dirPath, fn)
    if not fU.exists(fp):
        fU.get(urlTarget, fp)
    #
    # The version string is the last token on the third header line of the file
    with open(fp, "r", encoding="utf-8") as ifh:
        line = ifh.readline()
        line = ifh.readline()
        line = ifh.readline()
        ff = line[:-1].split()
        self.__version = ff[-1]
    #
    nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
    fU.remove(fp)
    #
    return nmL
def __processAppendedSections(self, appendConfigOption, cachePath, useCache=True):
    """Fetch and append configuration assets assigned to the input configuration option.

    Args:
        appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
        cachePath (str): path to store cached copies of configuration assets
        useCache (bool, optional): use existing cached configuration assets. Defaults to True.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ret = True
        appendLocL = self.getList(appendConfigOption, sectionName=self.__defaultSectionName)
        logger.debug("appendLocL is %r", appendLocL)
        if appendLocL:
            cP = os.path.join(cachePath, "config")
            fU = FileUtil(workPath=cP)
            logger.debug("Fetching append sections from %r", appendLocL)
            for appendLoc in appendLocL:
                fn = fU.getFileName(appendLoc)
                fp = os.path.join(cP, fn)
                okF = True
                if not (useCache and fU.exists(fp)):
                    # get a fresh copy from source
                    okF = fU.get(appendLoc, fp)
                    logger.debug("Fetched %r to %r", appendLoc, fp)
                ok = self.appendConfig(fp)
                ret = ret and ok and okF
    except Exception as e:
        logger.exception("Failing for option %r cachePath %r with %s", appendConfigOption, cachePath, str(e))
        ret = False
    #
    if not ret:
        logger.error("Fetching appended sections failing %r", appendLocL)
    return ret
def __reload(self, urlTarget, dirPath, useCache=True):
    """Reload local cache of mapping resources to support validation report reader and translator.

    Args:
        urlTarget (list, str): URL for schema mapping file
        dirPath (str): path to the directory containing cache files
        useCache (bool, optional): flag to use cached files. Defaults to True.

    Returns:
        (dict): schema mapping dictionary
    """
    mapD = {}
    #
    mU = MarshalUtil()
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    mappingFilePath = os.path.join(dirPath, fn)
    mU.mkdir(dirPath)
    #
    # if not useCache:
    #     for fp in [mappingFilePath]:
    #         try:
    #             os.remove(fp)
    #         except Exception:
    #             pass
    #
    logger.debug("Loading validation mapping data in %s (useCache %r)", fn, useCache)
    if useCache and fU.exists(mappingFilePath):
        mapD = mU.doImport(mappingFilePath, fmt="json")
    else:
        logger.info("Fetching url %s to resource file %s", urlTarget, mappingFilePath)
        tS = uuid.uuid4().hex
        tP = os.path.join(dirPath, "._" + tS)
        ok = fU.get(urlTarget, tP)
        if ok:
            mapD = mU.doImport(tP, fmt="json")
            os.replace(tP, mappingFilePath)
    return mapD
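# The temporary-file-plus-os.replace() idiom above keeps readers from ever observing a partially
# written cache file. A minimal standalone version of the same idiom, using only the FileUtil
# call shown above and the standard library; the helper name and paths are illustrative.
import os
import uuid

from rcsb.utils.io.FileUtil import FileUtil


def fetchAtomically(urlTarget, cacheFilePath):
    """Download urlTarget and expose it at cacheFilePath only after the fetch completes."""
    tmpPath = os.path.join(os.path.dirname(cacheFilePath), "._" + uuid.uuid4().hex)
    ok = FileUtil().get(urlTarget, tmpPath)
    if ok:
        os.replace(tmpPath, cacheFilePath)  # atomic when source and target share a filesystem
    return ok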
def get(self, remotePath, localPath):
    """Get a file from a remote FTP server.

    Arguments:
        remotePath (str): remote file path
        localPath (str): local file path

    Returns:
        bool: True for success or False otherwise
    """
    try:
        fileU = FileUtil()
        fileU.mkdirForFile(localPath)
        # If the provided localPath already exists and is a directory, retrieve the file using the name on the remote server
        # to avoid unintentionally overwriting an entire local directory with a single retrieved file
        if os.path.exists(localPath) and os.path.isdir(localPath):
            remoteFileName = FileUtil().getFileName(remotePath)
            localFilePath = os.path.join(localPath, remoteFileName)
        else:
            localFilePath = localPath
        with open(localFilePath, 'wb') as lFP:
            self.__ftpClient.retrbinary('RETR %s' % remotePath, lFP.write)
        ok = fileU.exists(localFilePath)
        if ok:
            return True
        else:
            logger.error("get failing for remotePath %s localFilePath %s", remotePath, localFilePath)
            return False
    except Exception as e:
        if self.__raiseExceptions:
            raise e
        else:
            logger.error("get failing for remotePath %s localPath %s with %s", remotePath, localPath, str(e))
            return False
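# For reference, a minimal standard-library sketch of the RETR transfer wrapped above, using an
# ftplib connection directly; the host, credentials, and paths are placeholders.
import ftplib


def ftpRetrieve(host, remotePath, localFilePath, user="anonymous", passwd=""):
    """Download remotePath from host into localFilePath over FTP."""
    with ftplib.FTP(host) as ftp:
        ftp.login(user=user, passwd=passwd)
        with open(localFilePath, "wb") as lFP:
            ftp.retrbinary("RETR %s" % remotePath, lFP.write)
    return True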
def __reload(self, dirPath, useCache):
    startTime = time.time()
    aD = {}
    allIdD = {}
    fU = FileUtil()
    fU.mkdir(dirPath)
    targetActivityFilePath = self.getTargetActivityDataPath()
    #
    if useCache and fU.exists(targetActivityFilePath):
        logger.info("useCache %r using %r", useCache, targetActivityFilePath)
        qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
        aD = qD["activity"] if "activity" in qD else {}
        idL = qD["all_ids"] if "all_ids" in qD else []
        allIdD = {k: k in aD for k in idL}
    #
    logger.info(
        "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
        len(aD),
        len(allIdD),
        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        time.time() - startTime,
    )
    #
    return aD, allIdD
def __reload(self, urlTarget, dirPath, useCache=True):
    """Reload the input GO OBO ontology file and return a networkx graph object.

    Returns:
        (object): networkx graph object for the GO ontology, or None on failure
    """
    goGraph = None
    #
    # mU = MarshalUtil()
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    oboFilePath = os.path.join(dirPath, fn)
    fU.mkdir(dirPath)
    #
    if not useCache:
        for fp in [oboFilePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and fU.exists(oboFilePath):
        goGraph = obonet.read_obo(oboFilePath)
    else:
        logger.info("Fetching url %s to resource file %s", urlTarget, oboFilePath)
        ok = fU.get(urlTarget, oboFilePath)
        if ok:
            goGraph = obonet.read_obo(oboFilePath)
    if goGraph:
        logger.info("Reading %d nodes and %d edges", len(goGraph), goGraph.number_of_edges())
    else:
        logger.info("Go graph construction failing")
    #
    return goGraph
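# A short usage sketch for the graph returned above: obonet.read_obo() yields a networkx graph
# keyed by GO identifiers, so node attributes such as "name" are directly accessible. networkx
# is assumed to be available alongside obonet; the helper name is illustrative.
import obonet


def getGoNames(oboLocator, goIdL):
    """Return {goId: name} for the requested GO identifiers."""
    goGraph = obonet.read_obo(oboLocator)
    return {goId: goGraph.nodes[goId].get("name") for goId in goIdL if goId in goGraph}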
class IoUtil(object): def __init__(self, **kwargs): self.__fileU = FileUtil(**kwargs) def serialize(self, filePath, myObj, fmt="pickle", **kwargs): """Public method to serialize format appropriate objects Args: filePath (str): local file path' myObj (object): format appropriate object to be serialized format (str, optional): one of ['mmcif', mmcif-dict', json', 'list', 'text-dump', pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: bool: status of serialization operation; true for success or false otherwise """ ret = False fmt = str(fmt).lower() ret = self.__fileU.mkdirForFile(filePath) if not ret: return ret if fmt in ["mmcif"]: ret = self.__serializeMmCif(filePath, myObj, **kwargs) elif fmt in ["json"]: ret = self.__serializeJson(filePath, myObj, **kwargs) elif fmt in ["pickle"]: ret = self.__serializePickle(filePath, myObj, **kwargs) elif fmt in ["list"]: ret = self.__serializeList(filePath, myObj, enforceAscii=True, **kwargs) elif fmt in ["mmcif-dict"]: ret = self.__serializeMmCifDict(filePath, myObj, **kwargs) elif fmt in ["text-dump"]: ret = self.__textDump(filePath, myObj, **kwargs) elif fmt in ["fasta"]: ret = self.__serializeFasta(filePath, myObj, **kwargs) elif fmt in ["csv"]: ret = self.__serializeCsv(filePath, myObj, **kwargs) else: pass return ret def deserialize(self, filePath, fmt="pickle", **kwargs): """Public method to deserialize objects in supported formats. Args: filePath (str): local file path format (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ fmt = str(fmt).lower() if fmt in ["mmcif"]: ret = self.__deserializeMmCif(filePath, **kwargs) # type: ignore elif fmt in ["json"]: ret = self.__deserializeJson(filePath, **kwargs) # type: ignore elif fmt in ["pickle"]: ret = self.__deserializePickle(filePath, **kwargs) # type: ignore elif fmt in ["list"]: ret = self.__deserializeList(filePath, enforceAscii=True, **kwargs) # type: ignore elif fmt in ["mmcif-dict"]: ret = self.__deserializeMmCifDict(filePath, **kwargs) # type: ignore elif fmt in ["fasta"]: ret = self.__deserializeFasta(filePath, **kwargs) # type: ignore # elif fmt in ["vrpt-xml-to-cif"]: # ret = self.__deserializeVrptToCif(filePath, **kwargs) # type: ignore elif fmt in ["csv", "tdd"]: delimiter = kwargs.get("csvDelimiter", "," if fmt == "csv" else "\t") ret = self.__deserializeCsv(filePath, delimiter=delimiter, **kwargs) # type: ignore elif fmt in ["xml"]: ret = self.__deserializeXml(filePath, **kwargs) # type: ignore else: ret = None # type: ignore return ret def __sliceInChunks(self, myList, numChunks): mc = min(len(myList), numChunks) chunkSize = int(len(myList) / mc) if len(myList) % mc: chunkSize += 1 for i in range(0, len(myList), chunkSize): yield myList[i:i + chunkSize] def serializeInParts(self, filePath, myObj, numParts, fmt="json", **kwargs): """Public method to serialize format appropriate (json, pickle) objects in multiple parts Args: filePath (str): local file path myObj (object): format appropriate object to be serialized numParts (int): divide the data into numParts segments format (str, optional): one of ['json' or 'pickle']. 
Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: bool: True for success or False otherwise """ if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return False pth, fn = os.path.split(filePath) self.__fileU.mkdirForFile(pth) bn, ext = os.path.splitext(fn) ret = True if isinstance(myObj, list): for ii, subList in enumerate(self.__sliceInChunks(myObj, numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, subList, fmt=fmt, **kwargs) ret = ret and ok elif isinstance(myObj, dict): for ii, keyList in enumerate( self.__sliceInChunks(list(myObj.keys()), numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, OrderedDict([(k, myObj[k]) for k in keyList]), fmt=fmt, **kwargs) ret = ret and ok else: logger.error("Unsupported data type for serialization in parts") ret = False # return ret def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs): """Public method to deserialize objects in supported formats from multiple parts Args: filePath (str): local file path numParts (int): reconstruct the data object from numParts segments format (str, optional): one of ['json' or 'pickle']. Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ rObj = None if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return rObj # pth, fn = os.path.split(filePath) bn, ext = os.path.splitext(fn) if not numParts: fp = os.path.join(pth, bn + "_part_*" + ext) numParts = len(glob.glob(fp)) # for ii in range(numParts): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) tObj = self.deserialize(fp, fmt=fmt, **kwargs) if isinstance(tObj, list): if not rObj: rObj = [] rObj.extend(tObj) elif isinstance(tObj, dict): if not rObj: rObj = OrderedDict() rObj.update(tObj) else: logger.error( "Unsupported data type for deserialization in parts") return rObj def exists(self, filePath, mode=os.R_OK): return self.__fileU.exists(filePath, mode=mode) def mkdir(self, dirPath, mode=0o755): return self.__fileU.mkdir(dirPath, mode=mode) def remove(self, pth): return self.__fileU.remove(pth) def __deserializeFasta(self, filePath, **kwargs): try: commentStyle = kwargs.get("commentStyle", "uniprot") fau = FastaUtil() return fau.readFasta(filePath, commentStyle=commentStyle) except Exception as e: logger.error("Unable to deserialize %r %r ", filePath, str(e)) return {} def __serializeFasta(self, filePath, myObj, **kwargs): try: maxLineLength = int(kwargs.get("maxLineLength", 70)) makeComment = kwargs.get("makeComment", False) fau = FastaUtil() ok = fau.writeFasta(filePath, myObj, maxLineLength=maxLineLength, makeComment=makeComment) return ok except Exception as e: logger.error("Unable to serialize FASTA file %r %r", filePath, str(e)) return False def __textDump(self, filePath, myObj, **kwargs): try: indent = kwargs.get("indent", 1) width = kwargs.get("width", 120) sOut = pprint.pformat(myObj, indent=indent, width=width) with open(filePath, "w") as ofh: ofh.write("\n%s\n" % sOut) return True except Exception as e: logger.error("Unable to dump to %r %r", filePath, str(e)) return False def __serializePickle(self, filePath, myObj, **kwargs): try: pickleProtocol = kwargs.get("pickleProtocol", pickle.DEFAULT_PROTOCOL) with open(filePath, "wb") as outfile: pickle.dump(myObj, outfile, pickleProtocol) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) 
return False def __deserializePickle(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) try: if sys.version_info[0] > 2: encoding = kwargs.get("encoding", "ASCII") errors = kwargs.get("errors", "strict") with open(filePath, "rb") as outfile: return pickle.load(outfile, encoding=encoding, errors=errors) else: with open(filePath, "rb") as outfile: return pickle.load(outfile) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __serializeJson(self, filePath, myObj, **kwargs): """Internal method to serialize the input object as JSON. An encoding helper class is included to handle selected python data types (e.g., datetime) """ indent = kwargs.get("indent", 0) enforceAscii = kwargs.get("enforceAscii", True) try: if enforceAscii: with open(filePath, "w") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) else: with io.open(filePath, "w", encoding="utf-8") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __deserializeJson(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: with open(filePath, "r") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __hasMinSize(self, pth, minSize): try: return os.path.getsize(pth) >= minSize except Exception: return False def __deserializeMmCif(self, locator, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) minSize = kwargs.get("minSize", 5) # if self.__fileU.isLocal(locator): if minSize >= 0 and not self.__hasMinSize(locator, minSize): logger.warning("Minimum file size not satisfied for: %r", locator) myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile( locator, enforceAscii=enforceAscii, outDirPath=workPath) # type: ignore else: # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) containerList = self.__deserializeMmCifRemote( locator, useCharRefs, enforceAscii, workPath) except Exception as e: logger.error("Failing for %s with %s", locator, str(e)) return containerList @retry((requests.exceptions.RequestException), maxAttempts=3, delaySeconds=1, multiplier=2, defaultValue=[], logger=logger) def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii, workPath): containerList = [] try: myIo = IoAdapterPy(raiseExceptions=True, 
useCharRefs=useCharRefs) containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: raise e return containerList def __serializeMmCif(self, filePath, containerList, **kwargs): """ """ try: ret = False workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) if filePath.endswith(".gz") and workPath: rfn = "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) tPath = os.path.join(workPath, rfn) ret = myIo.writeFile(tPath, containerList=containerList, enforceAscii=enforceAscii) ret = self.__fileU.compress(tPath, filePath, compressType="gzip") else: ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __deserializeMmCifDict(self, filePath, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile(filePath, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return containerList def __serializeMmCifDict(self, filePath, containerList, **kwargs): """ """ try: ret = False # workPath = kwargs.get('workPath', None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs): """ """ try: _ = kwargs if enforceAscii: encoding = "ascii" else: encoding = "utf-8" # if sys.version_info[0] > 2: with open(filePath, "w") as ofh: if enforceAscii: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: for st in aList: ofh.write("%s\n" % st) else: if enforceAscii: with io.open(filePath, "w", encoding=encoding) as ofh: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: with open(filePath, "wb") as ofh: for st in aList: ofh.write("%s\n" % st) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __processList(self, ifh, enforceAscii=True, **kwargs): uncomment = kwargs.get("uncomment", True) aList = [] for line in ifh: if enforceAscii: pth = line[:-1].encode("ascii", "xmlcharrefreplace").decode("ascii") else: pth = line[:-1] if not pth or (uncomment and pth.startswith("#")): continue aList.append(pth) return aList def __deserializeList(self, filePath, enforceAscii=True, encodingErrors="ignore", **kwargs): aList = [] _ = kwargs try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding="utf-8-sig", errors=encodingErrors) as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) else: tPath = self.__fileU.uncompress(filePath, 
outputDir=None) # for py2 this commented code is problematic for non-ascii data # with gzip.open(filePath, "rb") as ifh: # aList = self.__processList(ifh, enforceAscii=enforceAscii) with io.open(tPath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii) else: with io.open(filePath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(aList)) return aList def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True): oL = [] maxInt = sys.maxsize csv.field_size_limit(maxInt) if rowFormat == "dict": if uncomment: reader = csv.DictReader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.DictReader(csvFile, delimiter=delimiter) for rowD in reader: oL.append(rowD) elif rowFormat == "list": if uncomment: reader = csv.reader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.reader(csvFile, delimiter=delimiter) for rowL in reader: oL.append(rowL) return oL def deserializeCsvIter(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): """Return an iterator to input CSV format file. Args: filePath (str): input file path delimiter (str, optional): CSV delimiter. Defaults to ",". rowFormat (str, optional): format for each process row (list or dict). Defaults to "dict". encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore". uncomment (bool, optional): flag to ignore leading comments. Defaults to True. Returns: (iterator): iterator for rowwise access to processed CSV data """ encoding = kwargs.get("encoding", "utf-8-sig") maxInt = sys.maxsize csv.field_size_limit(maxInt) try: if filePath[-3:] == ".gz": with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: yield row else: with io.open(filePath, newline="", encoding=encoding, errors="ignore") as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: # if uncomment and row.startswith("#"): # continue yield row except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) def __deserializeCsv(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): oL = [] encoding = kwargs.get("encoding", "utf-8-sig") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: with io.open(filePath, newline="", 
encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) return oL except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(oL)) return oL def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs): """ """ _ = kwargs try: wD = {} ret = False fNames = fieldNames if fieldNames else list(rowDictList[0].keys()) # with io.open(filePath, 'w', newline='') as csvFile: with open(filePath, "w") as csvFile: writer = csv.DictWriter(csvFile, fieldnames=fNames) writer.writeheader() for ii, rowDict in enumerate(rowDictList): try: wD = {k: v for k, v in rowDict.items() if k in fNames} writer.writerow(wD) except Exception as e: logger.error( "Skipping bad CSV record %d wD %r rowDict %r with %s", ii + 1, wD, rowDict, str(e)) continue ret = True except Exception as e: logger.error("Failing for %s : %r with %s", filePath, wD, str(e)) return ret def __csvEncoder(self, csvData, encoding="utf-8-sig", encodingErrors="ignore"): """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars) Args: csvData (text lines): uncompressed data from gzip open encoding (str, optional): character encoding. Defaults to "utf-8-sig". encodingErrors (str, optional): error treatment. Defaults to "ignore". """ for line in csvData: yield line.decode("utf-8-sig", errors=encodingErrors).encode( encoding, errors=encodingErrors) def __deserializeXmlPrev(self, filePath, **kwargs): """Read the input XML file path and return an ElementTree data object instance. Args: filePath (sting): input XML file path Returns: object: instance of an ElementTree tree object """ _ = kwargs tree = None try: logger.debug("Parsing XML path %s", filePath) if filePath[-3:] == ".gz": with gzip.open(filePath, mode="rb") as ifh: tV = time.time() tree = ET.parse(ifh) else: with open(filePath, mode="rb") as ifh: tV = time.time() tree = ET.parse(ifh) logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # return tree def __testGzip(self, filePath): ok = True with gzip.open(filePath, "r") as fh: try: fh.read(1) except gzip.BadGzipFile: ok = False except Exception: ok = False logger.debug("Gzip file check %r", ok) return ok def __deserializeXml(self, filePath, **kwargs): """Read the input XML file path and return an ElementTree data object instance. Args: filePath (sting): input XML file path Returns: object: instance of an ElementTree tree object """ _ = kwargs tree = None encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") # try: logger.debug("Parsing XML path %s", filePath) if filePath[-3:] == ".gz" and self.__testGzip(filePath): if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) else: tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) else: with io.open(filePath, encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # return tree
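# A brief usage sketch for the IoUtil class above, exercising the JSON round trip and the
# multi-part serialization helpers defined in this module. The output paths are placeholders.
def _ioUtilRoundTripExample(workDirPath="./test-output"):
    ioU = IoUtil()
    data = {"alpha": 1, "beta": [2, 3, 4]}
    ok1 = ioU.serialize(os.path.join(workDirPath, "data.json"), data, fmt="json", indent=2)
    recovered = ioU.deserialize(os.path.join(workDirPath, "data.json"), fmt="json")
    # Split the dictionary into two part files (parts_part_1.json, parts_part_2.json) and reassemble it -
    ok2 = ioU.serializeInParts(os.path.join(workDirPath, "parts.json"), data, numParts=2, fmt="json")
    rebuilt = ioU.deserializeInParts(os.path.join(workDirPath, "parts.json"), numParts=2, fmt="json")
    return ok1 and ok2 and recovered == data and rebuilt == data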
class DataTypeApiProvider(SingletonClass): """ Data type application and instance information provider. """ def __init__(self, cfgOb, cachePath, useCache=True, **kwargs): """Data type application and instance information provider. Args: cfgOb (object): ConfigInfo() object instance cachePath (str): path to hold the cache directory useCache (bool, optional): flag to use cached files. Defaults to True. """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__useCache = useCache self.__cachePath = cachePath # self.__contentInfoConfigName = "content_info_helper_configuration" self.__fileU = FileUtil() self.__contentDefHelper = self.__cfgOb.getHelper( "CONTENT_DEF_HELPER_MODULE", sectionName=self.__configName, cfgOb=self.__cfgOb) self.__dirPath = os.path.join( cachePath, self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR", sectionName=self.__configName)) self.__kwargs = kwargs # logger.debug("Leaving constructor") def getDataTypeInstanceApi(self, databaseName, **kwargs): """Return instance of DataTypeInstanceInfo(). Args: databaseName (str): database name Returns: (object): Instance of DataTypeInstanceInfo() """ _ = kwargs dataTypeInstanceLocatorPath = self.__cfgOb.getPath( "INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH", sectionName=self.__configName) dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile( databaseName) if self.__contentDefHelper else None if dataTypeInstanceLocatorPath and dataTypeInstanceFile: loc = os.path.join(dataTypeInstanceLocatorPath, dataTypeInstanceFile) filePath = self.__reload(loc, self.__dirPath, useCache=self.__useCache) dtApi = DataTypeInstanceInfo(filePath) else: # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available. dtApi = DataTypeInstanceInfo(None) logger.debug("No data coverage available for database %s", databaseName) return dtApi def getDataTypeApplicationApi(self, appName, **kwargs): """Return instance of DataTypeApplicationInfo. Args: appName (str): application name (e.g., SQL, ANY) Returns: (object): Instance of DataTypeApplicationInfo() """ _ = kwargs dataTypeApplicationLocator = self.__cfgOb.getPath( "APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName) filePath = self.__reload(dataTypeApplicationLocator, self.__dirPath, useCache=self.__useCache) dtApi = DataTypeApplicationInfo( filePath, dataTyping=appName, workPath=self.__dirPath) if filePath else None return dtApi def __reload(self, urlTarget, dirPath, useCache=True): # fn = self.__fileU.getFileName(urlTarget) filePath = os.path.join(dirPath, fn) logger.debug("Using cache path %s", dirPath) self.__fileU.mkdir(dirPath) if not useCache: try: os.remove(filePath) except Exception: pass # if useCache and self.__fileU.exists(filePath): ok = True else: logger.debug("Fetch data from source %s", urlTarget) ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn)) return filePath if ok else None
class DictionaryApiProvider(SingletonClass):
    """Resource provider for dictionary APIs."""

    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.
        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload the local cache of dictionary resources.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (bool): True for success or False otherwise
        """
        #
        # Verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath, self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator, cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object for the input dictionaries.

        Arguments:
            dictLocators {list str} -- list of dictionary locator paths

        Returns:
            [object] -- returns DictionaryApi() object for input dictionaries
        """
        dictFileNames = [self.__fileU.getFileName(dictLocator) for dictLocator in dictLocators]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[dictTup] if dictTup in self.__apiMap else self.__getApi(dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """Return an instance of a dictionary API for the input dictionary locator list."""
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators, self.__dirPath, useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList, consolidate=consolidate, replaceDefinition=replaceDefinition, verbose=verbose)
        return dApi
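# A short usage sketch for DictionaryApiProvider above: fetch (or reuse cached copies of) the
# listed mmCIF dictionaries and return a DictionaryApi instance. The dictionary URL and cache
# directory are illustrative placeholders.
def _dictionaryApiProviderExample(cacheDirPath="./CACHE/dictionaries"):
    dictLocators = ["https://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic"]
    dP = DictionaryApiProvider(dirPath=cacheDirPath, useCache=True)
    return dP.getApi(dictLocators)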
class FileUtilTests(unittest.TestCase): def setUp(self): self.__verbose = True self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic") self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data", "NCBI", "names.dmp.gz") self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip" self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_MODBASE_MODELS", "NP_001030614.1_1.pdb.xz") # self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz" self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz" # self.__workPath = os.path.join(HERE, "test-output") self.__inpDirPath = os.path.join(HERE, "test-data") self.__fileU = FileUtil() self.__startTime = time.time() logger.debug("Running tests on version %s", __version__) logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testTarBundling(self): """Test case for tarfile bundling and unbundling""" try: tP = os.path.join(self.__workPath, "t0.tar.gz") dirPath = os.path.join(self.__inpDirPath, "topdir") ok = self.__fileU.bundleTarfile(tP, [dirPath], mode="w:gz", recursive=True) self.assertTrue(ok) numBytes = self.__fileU.size(tP) self.assertGreaterEqual(numBytes, 250) # md5 = self.__fileU.hash(tP, hashType="md5") self.assertTrue(md5 is not None) # ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath) self.assertTrue(ok) # tP = os.path.join(self.__workPath, "t1.tar.gz") dirPathList = [ os.path.join(self.__inpDirPath, "topdir", "subdirA"), os.path.join(self.__inpDirPath, "topdir", "subdirB") ] ok = self.__fileU.bundleTarfile(tP, dirPathList, mode="w:gz", recursive=True) self.assertTrue(ok) # ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath) self.assertTrue(ok) tP = os.path.join(self.__workPath, "t2.tar") dirPathList = [ os.path.join(self.__inpDirPath, "topdir", "subdirA"), os.path.join(self.__inpDirPath, "topdir", "subdirB") ] ok = self.__fileU.bundleTarfile(tP, dirPathList, mode="w", recursive=True) self.assertTrue(ok) # ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testGetFile(self): """Test case for a local files and directories""" try: remoteLocator = self.__pathPdbxDictionaryFile fn = self.__fileU.getFileName(remoteLocator) # _, fn = os.path.split(remoteLocator) lPath = os.path.join(self.__workPath, fn) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) ok = self.__fileU.remove(lPath) self.assertTrue(ok) dPath = os.path.join(self.__workPath, "tdir") ok = self.__fileU.mkdir(dPath) self.assertTrue(ok) ok = self.__fileU.remove(dPath) self.assertTrue(ok) ok = self.__fileU.remove(";lakdjf") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testMoveAndCopyFile(self): """Test case for copying ("put") and moving ("replace") local files""" try: remoteLocator = 
self.__pathPdbxDictionaryFile fn = self.__fileU.getFileName(remoteLocator) # _, fn = os.path.split(remoteLocator) lPath = os.path.join(self.__workPath, fn) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) # Test copy file dPath2 = os.path.join(self.__workPath, "tdir") ok = self.__fileU.mkdir(dPath2) self.assertTrue(ok) lPath2 = os.path.join(dPath2, fn) ok = self.__fileU.put(lPath, lPath2) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath2) self.assertTrue(ok) # Remove copied file (to test moving file next) ok = self.__fileU.remove(lPath2) self.assertTrue(ok) ok = self.__fileU.exists(lPath2) self.assertFalse(ok) # Test move file ok = self.__fileU.replace(lPath, lPath2) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertFalse(ok) ok = self.__fileU.exists(lPath2) self.assertTrue(ok) # Now clean up files and dirs ok = self.__fileU.remove(lPath) self.assertTrue(ok) ok = self.__fileU.remove(dPath2) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testZipUrl(self): """Test case for downloading remote zip file and extracting contents.""" try: remoteLocator = self.__zipFileUrl # fn = self.__fileU.getFileName(remoteLocator) ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # lPath = os.path.join(self.__workPath, self.__fileU.getFileName(self.__zipFileUrl)) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath) ok = fp.endswith("Food_Display_Table.xlsx") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testFtpUrl(self): """Test case for downloading remote file ftp protocol and extracting contents.""" try: remoteLocator = self.__ftpFileUrl # fn = self.__fileU.getFileName(remoteLocator) ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # dirPath = os.path.join(self.__workPath, "chem_comp_models") lPath = os.path.join(dirPath, self.__fileU.getFileName(self.__ftpFileUrl)) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) fp = self.__fileU.uncompress(lPath, outputDir=dirPath) ok = fp.endswith("chem_comp_model.cif") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testRemote(self): """Test case remote status""" try: remoteLocator = self.__httpsFileUrl ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # ok = self.__fileU.exists(remoteLocator) self.assertTrue(ok) size = self.__fileU.size(remoteLocator) self.assertGreaterEqual(size, 1000) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() @unittest.skip("DrugBank example -- skipping") def testGetDrugBankUrl(self): """Test case for downloading drugbank master xml file""" try: remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database" un = "username" pw = "password" # fn = self.__fileU.getFileName(remoteLocator) ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # lPath = os.path.join(self.__workPath, "db-download.zip") ok = self.__fileU.get(remoteLocator, lPath, username=un, password=pw) 
self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) self.__fileU.uncompress(lPath, outputDir=self.__workPath) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testXzFile(self): """Test case for extracting contents from xz file""" try: remoteLocator = self.__xzFile fn = self.__fileU.getFileName(remoteLocator) lPath = os.path.join(self.__workPath, fn) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath) ok = fp.endswith(".pdb") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class CODModelSearch(object): def __init__(self, cachePath, **kwargs): self.__cachePath = cachePath # self.__useCache = kwargs.get("useCache", True) self.__ccUrlTarget = kwargs.get("ccUrlTarget", None) self.__birdUrlTarget = kwargs.get("birdUrlTarget", None) self.__descriptorUrlTarget = kwargs.get( "descriptorUrlTarget", "http://www.crystallography.net/cod/smi/allcod.smi") self.__prefix = kwargs.get("prefix", None) self.__numProc = kwargs.get("numProc", 4) self.__chunkSize = kwargs.get("chunkSize", 50) self.__ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full" self.__fU = FileUtil() # self.__ccmG = ChemCompModelGen(self.__cachePath, self.__prefix) def getResultIndex(self): mU = MarshalUtil(workPath=self.__cachePath) cD = mU.doImport(self.getResultFilePath(), fmt="json") return cD def getResultDetails(self, codId): mU = MarshalUtil(workPath=self.__cachePath) dD = mU.doImport(self.__getCodDetailsFilePath(codId), fmt="json") return dD def storeResultIndex(self, cD): mU = MarshalUtil(workPath=self.__cachePath) ok = mU.doExport(self.getResultFilePath(), cD, fmt="json", indent=3) return ok def getResultDirFilePath(self): dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files" return os.path.join(self.__cachePath, dN) def getRawResultFilePath(self): dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files" return os.path.join(self.__cachePath, dN, "cod-raw-result-file-index.json") def getResultFilePath(self): dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files" return os.path.join(self.__cachePath, dN, "cod-result-file-index.json") def getDescriptorPath(self): fn = self.__fU.getFileName(self.__descriptorUrlTarget) dirPath = self.getResultDirFilePath() filePath = os.path.join(dirPath, fn) return filePath def updateDescriptors(self): self.__fetchUrl(self.__descriptorUrlTarget, filePath=self.getDescriptorPath(), useCache=False) def __fetchUrl(self, urlTarget, filePath, useCache=False, noRetry=False): ok = False try: if not (useCache and self.__fU.exists(filePath)): startTime = time.time() ok = self.__fU.get(urlTarget, filePath, noRetry=noRetry) endTime = time.time() if ok: logger.debug( "Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok, endTime - startTime) else: logger.error( "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok, endTime - startTime) else: ok = True logger.debug("Using cached data for %s", urlTarget) # except Exception as e: logger.exception("Failing for %r with %s", urlTarget, str(e)) return ok def search(self, molLimit=None): try: bsw = BatchChemSearch( useCache=self.__useCache, ccUrlTarget=self.__ccUrlTarget, birdUrlTarget=self.__birdUrlTarget, ccFileNamePrefix=self.__ccFileNamePrefix, cachePath=self.__cachePath, numProc=self.__numProc, chunkSize=self.__chunkSize, ) smiPath = self.getDescriptorPath() smiL = bsw.fetchDescriptorList(smiPath, swap=True) logger.info("Query length (%d)", len(smiL)) # smiL = bsw.splitSmiles(smiL) retL = bsw.doQuery(smiL[:molLimit], "SMILES", matchOpts="graph-exact") logger.info("Result length (%d)", len(retL)) # for ii, ret in enumerate(retL, 1): logger.debug("%5d %8s %4s (%.3f) %s: %s", ii, ret.queryId, ret.ccId, ret.fpScore, ret.queryType, ret.query) # fp = self.getRawResultFilePath() ok = bsw.storeMatchList(fp, retL) return len(retL) if ok else 0 except Exception as e: logger.exception("Failing with %s", str(e)) def __getSearchResults(self): """Read search 
results and convert to a chemical component dictionary.""" fp = self.getRawResultFilePath() mU = MarshalUtil(workPath=self.__cachePath) rawL = mU.doImport(fp, fmt="json") rD = {} for cD in rawL: rD.setdefault(cD["ccId"], []).append(cD) return rD def __getCodEntryUrl(self, codId): # Template Examples: # https://molecules.crystallography.net/cod/sdf/1/00/00/1000098.sdf # https://molecules.crystallography.net/cod/sdf/6/00/05/6000557.sdf # baseUrl = "https://molecules.crystallography.net/cod/sdf" url = os.path.join(baseUrl, codId[0:1], codId[1:3], codId[3:5], codId + ".sdf") return url def __getCodDetailsUrl(self, codId): baseUrl = "http://www.crystallography.net/cod/optimade/structures" url = os.path.join(baseUrl, codId) return url def __getCodDetailsFilePath(self, codId): dirPath = self.getResultDirFilePath() fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3], codId[3:5], codId + ".json") return fp def __getCodEntryFilePath(self, codId): dirPath = self.getResultDirFilePath() fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3], codId[3:5], codId + ".sdf") return fp def fetchMatchedData(self, useCache=True): """Fetch COD matched entries and metadata and update the raw search index with essential COD data attrbutes. Args: useCache (bool, optional): use any cached COD data. Defaults to True. Returns: int: search result count """ eCount = 0 eSkip = 0 rcD = {} cD = self.__getSearchResults() # for ccId, qDL in cD.items(): # cifPath = self.__ccmG.getChemCompPath(ccId) # if not cifPath: # logger.info("No CIF for %s skipping", ccId) # continue parentId = ccId.split("|")[0] rqDL = [] for qD in qDL: codId = qD["queryId"] codEntryFilePath = self.__getCodEntryFilePath(codId) codDetailsFilePath = self.__getCodDetailsFilePath(codId) ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId), self.__getCodEntryFilePath(codId), useCache=useCache, noRetry=True) ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId), self.__getCodDetailsFilePath(codId), useCache=useCache, noRetry=True) tD = self.getResultDetails(codId) dD = tD["data"][ "attributes"] if "data" in tD and "attributes" in tD[ "data"] else {} mD = tD["meta"][ "implementation"] if "meta" in tD and "implementation" in tD[ "meta"] else {} if ok1 & ok2: logger.info("Fetched COD entry and details for %s (%r)", codId, ok1 & ok2) eCount += 1 qD["codEntryFilePath"] = codEntryFilePath qD["codDetailsFilePath"] = codDetailsFilePath # qD["cifPath"] = cifPath qD["parentId"] = parentId qD["chemicalName"] = dD[ "_cod_commonname"] if "_cod_commonname" in dD else None qD["chemicalName"] = dD[ "_cod_chemname"] if "_cod_chemname" in dD else qD[ "chemicalName"] qD["rValue"] = dD[ "_cod_Robs"] if "_cod_Robs" in dD else None qD["diffrnTemp"] = dD[ "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None qD["radiationSource"] = dD[ "_cod_radType"] if "_cod_radType" in dD else None qD["publicationDOI"] = dD[ "_cod_doi"] if "_cod_doi" in dD else None qD["version"] = mD["version"] if "version" in mD else None qD["hasDisorder"] = "N" rqDL.append(qD) else: logger.info("Skipping entry missing data for %r at %r", codId, self.__getCodEntryUrl(codId)) eSkip += 1 if rqDL: rcD[ccId] = rqDL # ok = self.storeResultIndex(rcD) logger.info( "Final match result (w/sdf and metadata) (%d/%d) cod hits (%d) skipped (%d)", len(rcD), len(cD), eCount, eSkip) return eCount if ok else 0 def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True): rcD = {} cD = self.__getSearchResults() idList = list(cD.keys()) # --- mpu = MultiProcUtil(verbose=True) 
mpu.setWorkingDir(self.__cachePath) mpu.setOptions(optionsD={ "resultPath": self.__cachePath, "cD": cD, "useCache": useCache }) mpu.set(workerObj=self, workerMethod="fetchDataWorker") ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize) logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList)) for rTup in resultList[0]: rcD[rTup[0]] = rTup[1] # --- ok = self.storeResultIndex(rcD) logger.info("Final match result (w/sdf and metadata) (%d/%d)", len(rcD), len(cD)) return True def fetchDataWorker(self, dataList, procName, optionsD, workingDir): """Worker method to fetch COD data for matched entries Args: dataList (list): list of mol2 file paths to be searched procName (str): processName optionsD (dict): dictionary of options workingDir (str): path to working directory (not used) Returns: (successList, resultList, []): success and result lists of mol2 paths with CCDC matches """ resultPath = optionsD["resultPath"] cD = optionsD["cD"] useCache = optionsD["useCache"] _ = workingDir resultList = [] successList = [] startTime = time.time() logger.info("starting %s at %s", procName, time.strftime("%Y %m %d %H:%M:%S", time.localtime())) # eCount = 0 eSkip = 0 try: stopPath = os.path.join(resultPath, "STOP") logger.info("%s starting search data length %d", procName, len(dataList)) if self.__checkStop(stopPath): logger.info("%s stopping", procName) return resultList, resultList, [] # # for ccId, qDL in cD.items(): for ccId in dataList: if ccId in cD: qDL = cD[ccId] # parentId = ccId.split("|")[0] rqDL = [] for qD in qDL: codId = qD["queryId"] codEntryFilePath = self.__getCodEntryFilePath(codId) codDetailsFilePath = self.__getCodDetailsFilePath(codId) ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId), self.__getCodEntryFilePath(codId), useCache=useCache, noRetry=True) ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId), self.__getCodDetailsFilePath(codId), useCache=useCache, noRetry=True) tD = self.getResultDetails(codId) dD = tD["data"][ "attributes"] if "data" in tD and "attributes" in tD[ "data"] else {} mD = tD["meta"][ "implementation"] if "meta" in tD and "implementation" in tD[ "meta"] else {} if ok1 & ok2: logger.info( "Fetched COD entry and details for %s (%r)", codId, ok1 & ok2) eCount += 1 qD["codEntryFilePath"] = codEntryFilePath qD["codDetailsFilePath"] = codDetailsFilePath # qD["cifPath"] = cifPath qD["parentId"] = parentId qD["chemicalName"] = dD[ "_cod_commonname"] if "_cod_commonname" in dD else None qD["chemicalName"] = dD[ "_cod_chemname"] if "_cod_chemname" in dD else qD[ "chemicalName"] qD["rValue"] = dD[ "_cod_Robs"] if "_cod_Robs" in dD else None qD["diffrnTemp"] = dD[ "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None qD["radiationSource"] = dD[ "_cod_radType"] if "_cod_radType" in dD else None qD["publicationDOI"] = dD[ "_cod_doi"] if "_cod_doi" in dD else None qD["version"] = mD[ "version"] if "version" in mD else None qD["hasDisorder"] = "N" rqDL.append(qD) else: logger.info("Skipping entry missing data for %r at %r", codId, self.__getCodEntryUrl(codId)) eSkip += 1 if rqDL: resultList.append((ccId, rqDL)) successList.append(ccId) except Exception as e: logger.exception("Failing with %s", str(e)) endTime = time.time() logger.info( "%s (entries %d skipped %d) (ccId result length %d) completed at %s (%.2f seconds)", procName, eCount, eSkip, len(successList), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime, ) return successList, resultList, [] 
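# A condensed sketch of the MultiProcUtil contract used by fetchMatchedDataMp() and
# fetchDataWorker() above: a worker method receives (dataList, procName, optionsD, workingDir)
# and returns (successList, resultList, diagList), and runMulti() aggregates per-process
# results. Only the set()/runMulti() calls shown above are used; the import path and the toy
# worker are assumptions for illustration.
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil


class _SquareWorker(object):
    def work(self, dataList, procName, optionsD, workingDir):
        _ = procName, optionsD, workingDir
        resultList = [(i, i * i) for i in dataList]
        return list(dataList), resultList, []


def _multiProcExample():
    mpu = MultiProcUtil(verbose=True)
    mpu.set(workerObj=_SquareWorker(), workerMethod="work")
    ok, failList, resultList, _ = mpu.runMulti(dataList=list(range(20)), numProc=2, numResults=1, chunkSize=5)
    return ok, dict(resultList[0]), failList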
def __checkStop(self, path): try: if os.access(path, os.F_OK): return True except Exception: pass return False
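# A minimal sketch of how the STOP sentinel tested by __checkStop() can be set and
# cleared to interrupt long-running worker loops between entries. The resultPath
# argument is a hypothetical cache directory.
import os

def requestStop(resultPath):
    # Drop an empty STOP file that workers poll via __checkStop()
    stopPath = os.path.join(resultPath, "STOP")
    with open(stopPath, "w", encoding="utf-8"):
        pass
    return stopPath

def clearStop(resultPath):
    # Remove the sentinel so subsequent runs are not short-circuited
    stopPath = os.path.join(resultPath, "STOP")
    if os.access(stopPath, os.F_OK):
        os.remove(stopPath)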
def __reloadFasta(self, dirPath, **kwargs): """Reload DrugBank target FASTA data files. Args: dirPath (str, optional): path to DrugBank cache directory useCache (bool, optional): flag to use cached files. Defaults to True. Returns: """ startTime = time.time() logger.info("Starting db reload at %s", time.strftime("%Y %m %d %H:%M:%S", time.localtime())) retFilePathList = [] urlTargetL = [ "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences", "https://go.drugbank.com/releases/latest/downloads/enzyme-all-polypeptide-sequences", "https://go.drugbank.com/releases/latest/downloads/carrier-all-polypeptide-sequences", "https://go.drugbank.com/releases/latest/downloads/transporter-all-polypeptide-sequences", ] useCache = kwargs.get("useCache", True) username = kwargs.get("username", None) password = kwargs.get("password", None) # if not username or not password: return retFilePathList # fU = FileUtil() fU.mkdir(dirPath) # if not useCache: # Clear any cached files for urlTarget in urlTargetL: baseFileName = fU.getFileName(urlTarget) zipFileName = baseFileName + ".fasta.zip" retFileName = baseFileName + ".fa" for fn in [baseFileName, zipFileName, retFileName]: try: fp = os.path.join(dirPath, fn) os.remove(fp) except Exception: pass # ok = False if useCache: ok = True for urlTarget in urlTargetL: baseFileName = fU.getFileName(urlTarget) retFileName = baseFileName + ".fa" retFilePath = os.path.join(dirPath, retFileName) ok = fU.exists(retFilePath) if not ok: break retFilePathList.append(retFilePath) # logger.info("Using cached files %r", ok) if not useCache or not ok: if not username or not password: logger.warning( "Missing credentials for DrugBank file download...") for urlTarget in urlTargetL: baseFileName = fU.getFileName(urlTarget) zipFileName = baseFileName + ".fasta.zip" retFileName = baseFileName + ".fa" zipFilePath = os.path.join(dirPath, zipFileName) retFilePath = os.path.join(dirPath, retFileName) basePath = os.path.join(dirPath, baseFileName) logger.info("Fetching url %s for FASTA target file %s", urlTarget, baseFileName) ok = fU.get(urlTarget, zipFilePath, username=username, password=password) endTime = time.time() logger.info( "Completed db fetch at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) # ok = fU.unbundleZipfile(zipFilePath, dirPath=basePath) fU.put(os.path.join(basePath, "protein.fasta"), retFilePath) endTime = time.time() logger.info( "Completed unzip at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) retFilePathList.append(retFilePath) return retFilePathList
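# A standard-library sketch of the unzip step in __reloadFasta(): each downloaded
# DrugBank archive is assumed to contain a "protein.fasta" member, which is
# extracted and copied to the cached "<baseFileName>.fa" target. Paths are
# hypothetical placeholders.
import os
import shutil
import zipfile

def unbundleFasta(zipFilePath, dirPath, baseFileName):
    basePath = os.path.join(dirPath, baseFileName)
    retFilePath = os.path.join(dirPath, baseFileName + ".fa")
    with zipfile.ZipFile(zipFilePath) as zf:
        zf.extractall(basePath)
    shutil.copyfile(os.path.join(basePath, "protein.fasta"), retFilePath)
    return retFilePath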
class MarshalUtil(object): """Wrapper for serialization and deserialization methods.""" def __init__(self, **kwargs): self.__workPath = kwargs.get("workPath", ".") self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_") self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir") # self.__fileU = FileUtil(workPath=self.__workPath) self.__ioU = IoUtil() def doExport(self, locator, obj, fmt="list", marshalHelper=None, numParts=None, **kwargs): """Serialize the input object at locator path in specified format. The input object is optionally preprocessed by the helper method. Args: locator (str): target path or URI obj (object): data to be serialized fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list". marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None. numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats) Returns: bool: True for success or False otherwise """ try: ret = False localFlag = self.__fileU.isLocal(locator) if marshalHelper: myObj = marshalHelper(obj, **kwargs) else: myObj = obj # if localFlag and numParts and fmt in ["json", "pickle"]: localFilePath = self.__fileU.getFilePath(locator) ret = self.__ioU.serializeInParts(localFilePath, myObj, numParts, fmt=fmt, **kwargs) elif localFlag: localFilePath = self.__fileU.getFilePath(locator) ret = self.__ioU.serialize(localFilePath, myObj, fmt=fmt, workPath=self.__workPath, **kwargs) else: with tempfile.TemporaryDirectory(suffix=self.__workDirSuffix, prefix=self.__workDirPrefix, dir=self.__workPath) as tmpDirName: # write a local copy then copy to destination - # localFilePath = os.path.join(self.__workPath, tmpDirName, self.__fileU.getFileName(locator)) ok1 = self.__ioU.serialize(localFilePath, myObj, fmt=fmt, workPath=self.__workPath, **kwargs) ok2 = True if ok1: ok2 = self.__fileU.put(localFilePath, locator, **kwargs) ret = ok1 and ok2 except Exception as e: logger.exception("Exporting locator %r failing with %s", locator, str(e)) return ret def doImport(self, locator, fmt="list", marshalHelper=None, numParts=None, **kwargs): """Deserialize data at the target locator in specified format. The deserialized data is optionally post-processed by the input helper method. Args: locator (str): path or URI to input data fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list". marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None. numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats) tarMember (str, optional): name of a member of tar file bundle. Defaults to None.
(tar file format) Returns: Any: format specific return type """ try: tarMember = kwargs.get("tarMember", None) localFlag = self.__fileU.isLocal(locator) and not tarMember # if localFlag and numParts and fmt in ["json", "pickle"]: filePath = self.__fileU.getFilePath(locator) ret = self.__ioU.deserializeInParts(filePath, numParts, fmt=fmt, **kwargs) elif localFlag: filePath = self.__fileU.getFilePath(locator) ret = self.__ioU.deserialize(filePath, fmt=fmt, workPath=self.__workPath, **kwargs) else: # if fmt == "mmcif": ret = self.__ioU.deserialize(locator, fmt=fmt, workPath=self.__workPath, **kwargs) else: with tempfile.TemporaryDirectory( suffix=self.__workDirSuffix, prefix=self.__workDirPrefix, dir=self.__workPath) as tmpDirName: # # Fetch first then read a local copy - # if tarMember: localFilePath = os.path.join( self.__workPath, tmpDirName, tarMember) else: localFilePath = os.path.join( self.__workPath, tmpDirName, self.__fileU.getFileName(locator)) # --- Local copy approach --- self.__fileU.get(locator, localFilePath, **kwargs) ret = self.__ioU.deserialize(localFilePath, fmt=fmt, workPath=self.__workPath, **kwargs) if marshalHelper: ret = marshalHelper(ret, **kwargs) except Exception as e: logger.exception("Importing locator %r failing with %s", locator, str(e)) ret = None return ret def exists(self, filePath, mode=os.R_OK): return self.__fileU.exists(filePath, mode=mode) def mkdir(self, dirPath, mode=0o755): return self.__fileU.mkdir(dirPath, mode=mode) def remove(self, pth): return self.__fileU.remove(pth)
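# A short usage sketch for the MarshalUtil round trip above. The import path
# assumes the class is packaged as rcsb.utils.io.MarshalUtil (adjust to the local
# layout); the marshalHelper below illustrates the optional post-processing hook.
from rcsb.utils.io.MarshalUtil import MarshalUtil

def upperCaseKeys(obj, **kwargs):
    # post-processor applied by doImport() after deserialization
    return {k.upper(): v for k, v in obj.items()}

mU = MarshalUtil(workPath=".")
ok = mU.doExport("example-map.json", {"p53": "P04637"}, fmt="json", indent=2)
dObj = mU.doImport("example-map.json", fmt="json", marshalHelper=upperCaseKeys)
print(ok, dObj)  # expected: True {'P53': 'P04637'}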
def __rebuildCache(self, targetUrl, mapNameL, outDirPath, rawDirPath, fmt="pickle", useCache=True): """Fetch the UniProt selected id mapping resource file and extract UniProt Acc to 'mapIndex' mapping. Serialize the mapping as required. Args: targetUrl (str): source URL of the remote index file mapNameL (list): list of key mapping names to extract from the index outDirPath (str): directory path for raw and processed mapping files fmt (str, optional): output format (pickle|json) . Defaults to "pickle". useCache (bool, optional): use cached files. Defaults to True. Returns: dict: od[uniprotId] = mapped value idmapping_selected.tab 1. UniProtKB-AC 2. UniProtKB-ID 3. GeneID (EntrezGene) 4. RefSeq 5. GI 6. PDB 7. GO 8. UniRef100 9. UniRef90 10. UniRef50 11. UniParc 12. PIR 13. NCBI-taxon 14. MIM 15. UniGene 16. PubMed 17. EMBL 18. EMBL-CDS 19. Ensembl 20. Ensembl_TRS 21. Ensembl_PRO 22. Additional PubMed """ startTime = time.time() nL = mapNameL oD = {} try: fileU = FileUtil() fExt = "pic" if fmt == "pickle" else "json" fExt = "tdd" if fmt == "tdd" else fExt fN, _ = os.path.splitext(fileU.getFileName(targetUrl)) mapFileName = fN + "-map." + fExt idMapPath = os.path.join(outDirPath, mapFileName) mU = MarshalUtil() if useCache and mU.exists(idMapPath): logger.info("Reading cached serialized file %r", idMapPath) if fmt in ["pickle", "json"]: tD = mU.doImport(idMapPath, fmt=fmt) nL = list(set(tD["idNameList"])) oD = tD["uniprotMapD"] logger.info("keys %r", list(oD.keys())[:10]) logger.info("nL %r", nL) ok = True elif fmt == "tdd": ioU = IoUtil() it = ioU.deserializeCsvIter(idMapPath, delimiter="\t", rowFormat="list", encodingErrors="ignore") tL = next(it, []) nL = tL[1:] if len(nL) == 1: for row in it: oD[row[0]] = row[1] else: for row in it: oD[row[0]] = row[1:] ok = True else: idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl)) if not fileU.exists(idPath): logger.info( "Fetching selected UniProt idmapping data from %r in %r", targetUrl, outDirPath) ok = fileU.get(targetUrl, idPath) if not ok: logger.error("Failed to downlowd %r", targetUrl) return oD else: logger.info("Using cached mapping file %r", idPath) # --- ioU = IoUtil() if fmt in ["pickle", "json"]: if len(mapNameL) == 1: for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): oD[row[0]] = str( row[self.__mapRecordD[mapNameL[0]] - 1]) else: for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): for mapName in mapNameL: oD.setdefault(row[0], []).append( str(row[self.__mapRecordD[mapName] - 1])) logger.info("Writing serialized mapping file %r", idMapPath) ok = mU.doExport(idMapPath, { "idNameList": mapNameL, "uniprotMapD": oD }, fmt=fmt) elif fmt == "tdd": # logger.info("Writing serialized mapping file %r", idMapPath) fU = FileUtil() fU.mkdirForFile(idMapPath) colNameL = [] colNameL.append("UniProtId") colNameL.extend(mapNameL) with open(idMapPath, "w", encoding="utf-8") as ofh: ofh.write("%s\n" % "\t".join(colNameL)) if len(mapNameL) == 1: idx = self.__mapRecordD[mapNameL[0]] - 1 for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): ofh.write("%s\t%s\n" % (row[0], row[idx])) else: idxL = [0] idxL.extend([ self.__mapRecordD[mapName] - 1 for mapName in mapNameL ]) for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): ofh.write( "%s\n" % "\t".join([str(row[idx]) for idx in idxL])) # nL, oD = self.__rebuildCache(targetUrl, mapNameL, 
outDirPath, rawDirPath, fmt=fmt, useCache=True) ok = True if nL and oD else False logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) # return nL, oD
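# A minimal standard-library sketch of the extraction performed by __rebuildCache():
# stream idmapping_selected.tab and key one selected column (for example column 6,
# PDB, per the numbered listing in the docstring above) by UniProtKB-AC (column 1).
# The input path is a hypothetical local copy of the (very large) mapping file.
def buildUniProtColumnMap(idMappingPath, columnIndex=6):
    oD = {}
    with open(idMappingPath, encoding="utf-8", errors="ignore") as ifh:
        for line in ifh:
            row = line.rstrip("\n").split("\t")
            # convert the 1-based column number to a 0-based list index
            if len(row) >= columnIndex and row[columnIndex - 1]:
                oD[row[0]] = row[columnIndex - 1]
    return oD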
class SchemaProvider(SingletonClass): """ A collection of schema build and caching methods. Static cache worflow: <authorative source> <-- <cache dir> <- client API Compute workflow: <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <Json schema> """ def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs): """A collection of schema build and caching methods. Args: cfgOb (object): ConfigInfo() instance cachePath (str): path to directory containing schema useCache (bool, optional): use cached schema. Defaults to True. rebuildFlag (bool, optional): on-the-fly rebuild and cache schema """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__cachePath = os.path.abspath(cachePath) self.__useCache = useCache self.__rebuildFlag = rebuildFlag self.__useCache = rebuildFlag if rebuildFlag else useCache # self.__workPath = os.path.join(self.__cachePath, "work") self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work")) self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName)) self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName)) self.__fileU.mkdir(self.__schemaCachePath) self.__fileU.mkdir(self.__jsonSchemaCachePath) self.__kwargs = kwargs def getSchemaOptions(self, schemaLevel, extraOpts=None): opts = extraOpts + "|" if extraOpts else "" if schemaLevel == "full": return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb" elif schemaLevel in ["min", "minimum"]: return opts + "mandatoryKeys|enums|rcsb" else: return opts def getSchemaInfo(self, databaseName, dataTyping="ANY"): """Convenience method to return essential schema details for the input repository content type. Args: databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...) dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...) 
Returns: tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list """ sd = None dbName = None collectionNameList = [] docIndexD = {} try: mU = MarshalUtil(workPath=self.__workPath) schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping) if self.__rebuildFlag: filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator)) self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True) else: filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache) if not filePath: logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping) logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator) schemaDef = mU.doImport(filePath, fmt="json") if schemaDef: logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping) sd = SchemaDefAccess(schemaDef) if sd: dbName = sd.getDatabaseName() collectionInfoList = sd.getCollectionInfo() logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList) for cd in collectionInfoList: collectionName = cd["NAME"] collectionNameList.append(collectionName) docIndexD[collectionName] = sd.getDocumentIndices(collectionName) except Exception as e: logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e)) return sd, dbName, collectionNameList, docIndexD def schemaDefCompare(self, databaseName, dataTyping="ANY"): """Compare computed schema defintion with current source/cached version. Args: databaseName (str): schema definition name for comparison dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY". Returns: (str): file path for schema difference or None """ mU = MarshalUtil(workPath=self.__workPath) schemaDiffPath = os.path.join(self.__cachePath, "schema_diff") mU.mkdir(schemaDiffPath) schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping) fn = self.__fileU.getFileName(schemaPath) sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping) v2 = sD["DATABASE_VERSION"] # ---- # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test") # logger.info("Exporting schema def to %s", tPath) # mU.doExport(tPath, sD, fmt="json", indent=3) # sD = mU.doImport(tPath, fmt="json") # ---- cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath)) sDCache = mU.doImport(cPath, fmt="json") v1 = sDCache["DATABASE_VERSION"] # numDiff, difD = self.schemaCompare(sDCache, sD) # # jD = diff(sDCache, sD, syntax="explicit", marshal=True) diffPath = None if numDiff: bn, _ = os.path.splitext(fn) diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json") # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100)) mU.doExport(diffPath, difD, fmt="json", indent=3) # return diffPath def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None): """Compare computed JSON schema defintion with current source/cached version. 
Args: databaseName (str): schema name collectionName (str): collection name encodingType (str): schema data type conventions (JSON|BSON) level (str): metadata level (min|full) extraOpts (str): extra schema construction options Returns: (str): path to the difference file or None """ mU = MarshalUtil(workPath=self.__workPath) schemaDiffPath = os.path.join(self.__cachePath, "schema_diff") mU.mkdir(schemaDiffPath) schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level) fn = self.__fileU.getFileName(schemaLocator) schemaPath = os.path.join(self.__jsonSchemaCachePath, fn) # sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts) v2 = self.__getSchemaVersion(sD) # ---- # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test") # logger.info("Exporting json schema to %s", tPath) # mU.doExport(tPath, sD, fmt="json", indent=3) # ---- # sDCache = mU.doImport(schemaPath, fmt="json") v1 = self.__getSchemaVersion(sDCache) if not v1: logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName) # numDiff, difD = self.schemaCompare(sDCache, sD) # jD = diff(sDCache, sD, marshal=True, syntax="explicit") diffPath = None if numDiff: logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100)) bn, _ = os.path.splitext(fn) diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json") mU.doExport(diffPath, difD, fmt="json", indent=3) return diffPath def __getSchemaVersion(self, jsonSchema): try: comment = jsonSchema["$comment"] if "$comment" in jsonSchema else "" ff = comment.split(":") version = ff[1].strip() return version except Exception as e: logger.exception("Failing for with %s", str(e)) return "" def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"): """Internal method returning schema definition path for the input content type and application. Defines schema definition naming convention - Args: databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...) dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...) Returns: str: schema definition file locator """ schemaLocator = None try: locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName) fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper()) schemaLocator = os.path.join(locPath, fn) except Exception as e: logger.exception("Retreiving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e)) return schemaLocator def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"): """Internal method returning JSON schema path for the input collection data type convention and level. Defines the JSON/BSON schema naming convention - Args: databaseName (str): database name in the document store collectionName (str): collection name in document store encodingType (str, optional): data type convention (BSON|JSON) level (str, optional): Completeness of the schema (e.g. 
min or full) Returns: str: schema file locator """ schemaLocator = None try: sdType = None sLevel = None schemaLocator = None if encodingType.upper() in ["JSON", "BSON"]: sdType = encodingType.lower() if level.lower() in ["min", "minimum"]: sLevel = "min" elif level.lower() in ["full"]: sLevel = level.lower() # if sdType and sLevel: locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName) fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName) schemaLocator = os.path.join(locPath, fn) else: logger.error("Unsupported schema options: %s level %r type %r", collectionName, level, encodingType) schemaLocator = None except Exception as e: logger.debug("Retrieving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e)) # return schemaLocator def __reload(self, locator, dirPath, useCache=True): # fn = self.__fileU.getFileName(locator) filePath = os.path.join(dirPath, fn) logger.debug("Target cache filePath %s", filePath) self.__fileU.mkdir(dirPath) if not useCache: try: os.remove(filePath) except Exception: pass # if useCache and self.__fileU.exists(filePath): ok = True else: logger.info("Fetch data from source %s to %s", locator, filePath) ok = self.__fileU.get(locator, filePath) return filePath if ok else None def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None): """Return JSON schema (w/ BSON types) object for the input collection and level. Args: databaseName (str): database name collectionName (str): collection name in document store encodingType (str, optional): data type convention (BSON|JSON) level (str, optional): Completeness of the schema (e.g. min or full) Returns: dict: Schema object """ sObj = None schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level) # if self.__rebuildFlag: filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator)) self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts) else: filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) if filePath and mU.exists(filePath): sObj = mU.doImport(filePath, fmt="json") else: logger.debug("Failed to read schema for %s %r", collectionName, level) return sObj def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None): cD = None try: smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath) # stU = encodingType.upper() cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts)) if cD and saveSchema: schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level) localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator)) mU = MarshalUtil(workPath=self.__workPath) mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False) except Exception as e: logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e)) return cD def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False): schemaDef = None try: smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath) schemaDef = smb.build(dataTyping=dataTyping,
encodingType="rcsb") if schemaDef and saveSchema: schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping) localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator)) mU = MarshalUtil(workPath=self.__workPath) mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False) except Exception as e: logger.exception("Building schema %s failing with %s", databaseName, str(e)) return schemaDef def schemaCompare(self, orgD, newD): """ Compute the difference of nested dictionaries. """ fOrgD = self.__flatten(orgD) fNewD = self.__flatten(newD) if len(fOrgD) != len(fNewD): logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD)) # addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)} removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)} changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]} changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]} chD = {} for ky in changedOrgD: kyS = ".".join(ky) vOrg = changedOrgD[ky] vNew = changedNewD[ky] if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)): # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew) dV = list(set(vNew) - set(vOrg)) if dV: chD[kyS] = {"diff": dV} else: chD[kyS] = {"from": vOrg, "to": vNew} # nT = len(addedD) + len(removedD) + len(chD) diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD} return nT, diffD def __flatten(self, inpDict, prefix=None): prefix = prefix[:] if prefix else [] outDict = {} for key, value in inpDict.items(): if isinstance(value, dict) and value: deeper = self.__flatten(value, prefix + [key]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) elif isinstance(value, (list, tuple)) and value: for index, sublist in enumerate(value, start=1): if isinstance(sublist, dict) and sublist: deeper = self.__flatten(sublist, prefix + [key] + [str(index)]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) else: outDict[tuple(prefix + [key] + [str(index)])] = value else: outDict[tuple(prefix + [key])] = value return outDict def __flattenX(self, inpDict, prefix=None): prefix = prefix[:] if prefix else [] # separator = "." 
outDict = {} for key, value in inpDict.items(): if isinstance(value, dict) and value: deeper = self.__flatten(value, prefix + [key]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) elif isinstance(value, list) and value: for index, sublist in enumerate(value, start=1): if isinstance(sublist, dict) and sublist: deeper = self.__flatten(sublist, prefix + [key] + [str(index)]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) else: outDict[tuple(prefix + [key] + [str(index)])] = value else: outDict[tuple(prefix + [key])] = value return outDict def __flattenOrg(self, inpDict, separator=".", prefix=""): outDict = {} for key, value in inpDict.items(): if isinstance(value, dict) and value: deeper = self.__flattenOrg(value, separator, prefix + key + separator) outDict.update({key2: val2 for key2, val2 in deeper.items()}) elif isinstance(value, list) and value: for index, sublist in enumerate(value, start=1): if isinstance(sublist, dict) and sublist: deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator) outDict.update({key2: val2 for key2, val2 in deeper.items()}) else: outDict[prefix + key + separator + str(index)] = value else: outDict[prefix + key] = value return outDict def __dictGen(self, indict, pre=None): pre = pre[:] if pre else [] if isinstance(indict, dict): for key, value in indict.items(): if isinstance(value, dict): for dD in self.__dictGen(value, pre + [key]): yield dD elif isinstance(value, list) or isinstance(value, tuple): for v in value: for dD in self.__dictGen(v, pre + [key]): yield dD else: yield pre + [key, value] else: yield indict
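# A self-contained illustration of the flatten-and-compare approach used by
# schemaCompare(): nested dictionaries are flattened to tuple keys and the
# added/removed/changed sets fall out of plain set arithmetic. The toy schema
# fragments below are illustrative only.
def flattenDict(d, prefix=()):
    out = {}
    for k, v in d.items():
        if isinstance(v, dict) and v:
            out.update(flattenDict(v, prefix + (k,)))
        else:
            out[prefix + (k,)] = v
    return out

orgD = {"ATTRIBUTES": {"id": "str", "score": "float"}, "DATABASE_VERSION": "1.0"}
newD = {"ATTRIBUTES": {"id": "str", "score": "double"}, "DATABASE_VERSION": "1.1", "NAME": "core"}
fOrgD, fNewD = flattenDict(orgD), flattenDict(newD)
added = set(fNewD) - set(fOrgD)
removed = set(fOrgD) - set(fNewD)
changed = {k for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
print(sorted(added), sorted(removed), sorted(changed))
# [('NAME',)] [] [('ATTRIBUTES', 'score'), ('DATABASE_VERSION',)]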
class ProvenanceProvider(SingletonClass): """Utilities to access and update provenance details.""" def __init__(self, cfgOb, cachePath, useCache=True, **kwargs): """Utilities to access and update provenance details. Args: cfgOb ([type]): ConfigInfo() instance cachePath ([type]): path to directory containing schema useCache (bool, optional): use cached schema. Defaults to True. """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__cachePath = cachePath self.__useCache = useCache # self.__workPath = os.path.join(self.__cachePath, "work") self.__provenanceCachePath = os.path.join( self.__cachePath, self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR", sectionName=self.__configName)) self.__provenanceLocator = self.__cfgOb.getPath( "PROVENANCE_INFO_LOCATOR", sectionName=self.__configName) # self.__fileU = FileUtil(workPath=self.__workPath) self.__fileU.mkdir(self.__provenanceCachePath) self.__kwargs = kwargs # def __reload(self, locator, dirPath, useCache=True): # fn = self.__fileU.getFileName(locator) filePath = os.path.join(dirPath, fn) logger.debug("Using cache path %s", dirPath) self.__fileU.mkdir(dirPath) if not useCache: try: os.remove(filePath) except Exception: pass # if useCache and self.__fileU.exists(filePath): ok = True else: logger.debug("Fetch data from source %s", locator) ok = self.__fileU.get(locator, filePath) return filePath if ok else None def fetch(self): try: provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) return mU.doImport(provenanceFileCachePath, fmt="json") except Exception as e: logger.exception("Failed retreiving provenance with %s", str(e)) return {} def update(self, provD): ok = False try: provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) tD = mU.doImport(provenanceFileCachePath, fmt="json") tD.update(provD) ok = mU.doExport(provenanceFileCachePath, tD, fmt="json") except Exception as e: logger.exception("Failed updating provenance with %s", str(e)) return ok def store(self, provD): ok = False try: provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) ok = mU.doExport(provenanceFileCachePath, provD, fmt="json") except Exception as e: logger.exception("Failed storing provenance with %s", str(e)) return ok
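# A standard-library sketch of the cache-or-fetch pattern implemented by the
# __reload() methods above: reuse a cached copy when present, otherwise download
# a fresh one. The locator URL and cache directory are hypothetical placeholders.
import os
import urllib.request

def reloadFile(locator, dirPath, useCache=True):
    os.makedirs(dirPath, exist_ok=True)
    filePath = os.path.join(dirPath, os.path.basename(locator))
    if not useCache and os.path.exists(filePath):
        # clear any stale cached copy before refetching
        os.remove(filePath)
    if useCache and os.path.exists(filePath):
        return filePath
    try:
        urllib.request.urlretrieve(locator, filePath)
        return filePath
    except Exception:
        return None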