def split(self, inputFilePath, splitDirPath, prefixName="part_", maxSizeMB=50):
    chunkSize = maxSizeMB * 1000000
    partNumber = 0
    fU = FileUtil()
    fU.mkdir(splitDirPath)
    manifestPath = os.path.join(splitDirPath, "MANIFEST")
    myHash = fU.hash(inputFilePath, hashType="md5")
    with open(manifestPath, "w") as mfh:
        mfh.write("%s\t%s\n" % (inputFilePath, myHash))
        with open(inputFilePath, "rb") as ifh:
            chunk = ifh.read(chunkSize)
            while chunk:
                partNumber += 1
                partName = prefixName + str(partNumber)
                fp = os.path.join(splitDirPath, partName)
                with open(fp, "wb") as ofh:
                    ofh.write(chunk)
                mfh.write("%s\n" % partName)
                # Advance to the next chunk -- without this read the loop never terminates.
                chunk = ifh.read(chunkSize)
    return partNumber
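# A minimal companion sketch (not in the source): reassemble the parts written by
# split() using the MANIFEST it produces.  The first MANIFEST line records the
# original path and its md5 digest; each following line names one part file.
# The standalone function name joinParts() and the verification step are
# assumptions added here for illustration only.
import hashlib
import os

def joinParts(splitDirPath, outputFilePath):
    manifestPath = os.path.join(splitDirPath, "MANIFEST")
    with open(manifestPath, "r") as mfh:
        header = mfh.readline().rstrip("\n")
        _, expectedMd5 = header.split("\t")
        partNames = [line.strip() for line in mfh if line.strip()]
    md5 = hashlib.md5()
    with open(outputFilePath, "wb") as ofh:
        for partName in partNames:
            with open(os.path.join(splitDirPath, partName), "rb") as pfh:
                chunk = pfh.read()
                ofh.write(chunk)
                md5.update(chunk)
    # Compare the digest of the reassembled file with the value recorded by split().
    return md5.hexdigest() == expectedMd5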
def __reload(self, dirPath, baseVersion, useCache, **kwargs): startTime = time.time() mU = MarshalUtil(workPath=dirPath) chemblDbUrl = kwargs.get( "ChEMBLDbUrl", "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/") ok = False fU = FileUtil() fU.mkdir(dirPath) # # ChEMBL current version <baseVersion>,... # template: chembl_<baseVersion>.fa.gz # targetFileName = "chembl_" + str(baseVersion) + ".fa.gz" mappingFileName = "chembl_uniprot_mapping.txt" # chemblTargetPath = os.path.join(dirPath, targetFileName) chemblMappingPath = os.path.join(dirPath, mappingFileName) mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json") # mapD = {} if useCache and fU.exists(mappingFilePath): logger.info("useCache %r using %r and %r and %r", useCache, chemblTargetPath, chemblMappingPath, mappingFilePath) mapD = mU.doImport(mappingFilePath, fmt="json") else: # Get the ChEMBL UniProt mapping file url = os.path.join(chemblDbUrl, mappingFileName) ok = fU.get(url, chemblMappingPath) logger.info("Fetched %r url %s path %s", ok, url, chemblMappingPath) logger.info("Reading ChEMBL mapping file path %s", mappingFilePath) rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list") for row in rowL: mapD[row[0]] = (row[1], row[2], row[3]) ok = mU.doExport(mappingFilePath, mapD, fmt="json") logger.info("Processed mapping path %s (%d) %r", mappingFilePath, len(mapD), ok) # # Get the target FASTA files -- for vers in range(baseVersion, baseVersion + 10): logger.info("Now fetching version %r", vers) self.__version = vers targetFileName = "chembl_" + str(vers) + ".fa.gz" chemblTargetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz") url = os.path.join(chemblDbUrl, targetFileName) ok = fU.get(url, chemblTargetPath) logger.info("Fetched %r url %s path %s", ok, url, chemblTargetPath) if ok: break # logger.info("Completed reload at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) # return mapD
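# A small usage sketch (not in the source) for the mapping dictionary returned by
# the ChEMBL __reload() above: keys are the first column of
# chembl_uniprot_mapping.txt and values are the remaining three columns stored as a
# tuple.  The column meanings (UniProt accession -> ChEMBL target identifier, name,
# target type) and the placeholder entry below are assumptions, not asserted facts.
mapD = {"PXXXXX": ("CHEMBL0000", "example target name", "SINGLE PROTEIN")}  # illustrative only

def lookupChemblTarget(mapD, uniprotId):
    # Return the ChEMBL target identifier for a UniProt accession, or None when unmapped.
    row = mapD.get(uniprotId)
    return row[0] if row else None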
def __doAquireLock(self):
    fU = FileUtil()
    mode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | os.O_TRUNC
    try:
        fU.mkdir(os.path.dirname(self.__lockFilePath))
        fd = os.open(self.__lockFilePath, mode)
    except (IOError, OSError):
        pass
    else:
        self.__lockFileFileDescriptor = fd
    return None
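# A hedged companion sketch (not in the source): the acquire step above relies on
# os.O_CREAT | os.O_EXCL so that creating the lock file is atomic -- a second
# process attempting the same open() fails rather than clobbering the lock.  The
# matching release shown here (the method name __doReleaseLock and the attribute
# handling are assumptions) closes the descriptor and removes the lock file.
def __doReleaseLock(self):
    try:
        if self.__lockFileFileDescriptor is not None:
            os.close(self.__lockFileFileDescriptor)
            self.__lockFileFileDescriptor = None
        os.remove(self.__lockFilePath)
    except (IOError, OSError):
        pass
    return None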
def makeBundle(self, localParentPath, subDirList):
    """Bundle the subdirectories of the input parent directory path.

    Args:
        localParentPath (str): local parent directory path containing the bundling targets
        subDirList (list, str): list of subdirectories of the parent path to be bundled

    Returns:
        (bool): True for success or False otherwise
    """
    fileU = FileUtil()
    fileU.mkdir(self.__localBundlePath)
    dirPathList = [os.path.join(localParentPath, subDir) for subDir in subDirList]
    okT = fileU.bundleTarfile(self.__localStashTarFilePath, dirPathList, mode="w:gz", recursive=True)
    return okT
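# A minimal restore sketch (not in the source), assuming the same FileUtil API used
# above and in the unit tests later in this collection: unpack the bundle produced
# by makeBundle() beneath a destination directory.  The method name restoreBundle()
# and the localRestorePath argument are illustrative assumptions.
def restoreBundle(self, localRestorePath):
    fileU = FileUtil()
    fileU.mkdir(localRestorePath)
    # unbundleTarfile() expands the tarball contents beneath dirPath.
    ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestorePath)
    return ok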
def __reload(self, dirPath, useCache):
    startTime = time.time()
    fD = {}
    ok = False
    featurePath = self.__getFeatureDataPath()
    #
    logger.info("useCache %r featurePath %r", useCache, featurePath)
    if useCache and self.__mU.exists(featurePath):
        fD = self.__mU.doImport(featurePath, fmt="json")
    else:
        fU = FileUtil()
        fU.mkdir(dirPath)
    # ---
    logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return fD
def __reload(self, dirPath, **kwargs):
    startTime = time.time()
    fD = {}
    useCache = kwargs.get("useCache", True)
    ok = False
    cofactorPath = self.__getCofactorDataPath()
    #
    logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
    if useCache and self.__mU.exists(cofactorPath):
        fD = self.__mU.doImport(cofactorPath, fmt="json")
        ok = True
    else:
        fU = FileUtil()
        fU.mkdir(dirPath)
    # ---
    logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return fD
def __reload(self, dirPath, useCache):
    startTime = time.time()
    retD = {}
    ok = False
    mappingPath = self.__getMappingDataPath()
    #
    logger.info("useCache %r mappingPath %r", useCache, mappingPath)
    if useCache and self.__mU.exists(mappingPath):
        retD = self.__mU.doImport(mappingPath, fmt="json")
        ok = True
    else:
        fU = FileUtil()
        fU.mkdir(dirPath)
    # ---
    num = len(retD["mapping"]) if "mapping" in retD else 0
    logger.info("Completed ligand mapping reload (%d) with status (%r) at %s (%.4f seconds)", num, ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return retD
def __reload(self, dirPath, useCache):
    startTime = time.time()
    aD = {}
    fU = FileUtil()
    fU.mkdir(dirPath)
    targetMechanismFilePath = self.getTargetMechanismDataPath()
    #
    if useCache and fU.exists(targetMechanismFilePath):
        logger.info("useCache %r using %r", useCache, targetMechanismFilePath)
        qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
        aD = qD["mechanism"] if "mechanism" in qD else {}
    #
    logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return aD
def __reload(self, dirPath, **kwargs): oD = None version = None startTime = time.time() useCache = kwargs.get("useCache", True) # # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2") cardDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data") ok = False fU = FileUtil() cardDumpFileName = "card-data.tar.bz2" cardDumpPath = os.path.join(dirPath, cardDumpFileName) cardDumpDirPath = os.path.join(dirPath, "dump") # fU.mkdir(dirPath) cardDataPath = os.path.join(dirPath, "card-select-data.json") # logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath) if useCache and self.__mU.exists(cardDataPath): qD = self.__mU.doImport(cardDataPath, fmt="json") version = qD["version"] oD = qD["data"] else: logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath) ok = fU.get(cardDumpUrl, cardDumpPath) fU.mkdir(cardDumpDirPath) fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath) fU.unbundleTarfile(os.path.join(cardDumpDirPath, cardDumpFileName[:-4]), dirPath=cardDumpDirPath) logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) oD, version = self.__parseCardData( os.path.join(cardDumpDirPath, "card.json")) tS = datetime.datetime.now().isoformat() qD = {"version": version, "created": tS, "data": oD} oD = qD["data"] ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3) logger.info("Export CARD data (%d) status %r", len(oD), ok) # --- return oD, version
def __reload(self, dirPath, useCache):
    startTime = time.time()
    aD = {}
    allIdD = {}
    fU = FileUtil()
    fU.mkdir(dirPath)
    targetActivityFilePath = self.getTargetActivityDataPath()
    #
    if useCache and fU.exists(targetActivityFilePath):
        logger.info("useCache %r using %r", useCache, targetActivityFilePath)
        qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
        aD = qD["activity"] if "activity" in qD else {}
        idL = qD["all_ids"] if "all_ids" in qD else []
        allIdD = {k: k in aD for k in idL}
    #
    logger.info(
        "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
        len(aD),
        len(allIdD),
        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        time.time() - startTime,
    )
    #
    return aD, allIdD
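# A hedged sketch (not in the source) of the export side implied by the cache layout
# read above: the JSON payload carries the activity dictionary under "activity" and
# the list of attempted identifiers under "all_ids".  The method name, the added
# "created" timestamp, and the datetime import (used elsewhere in this collection)
# are assumptions for illustration.
def __exportTargetActivity(self, aD, allIdD):
    targetActivityFilePath = self.getTargetActivityDataPath()
    qD = {
        "created": datetime.datetime.now().isoformat(),
        "activity": aD,
        "all_ids": list(allIdD.keys()),
    }
    return self.__mU.doExport(targetActivityFilePath, qD, fmt="json", indent=3)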
def __reload(self, urlTarget, dirPath, useCache=True):
    """Reload the input GO OBO ontology file and return an nx graph object.

    Returns:
        (object): networkx graph object for the GO ontology, or None on failure
    """
    goGraph = None
    #
    # mU = MarshalUtil()
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    oboFilePath = os.path.join(dirPath, fn)
    fU.mkdir(dirPath)
    #
    if not useCache:
        for fp in [oboFilePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and fU.exists(oboFilePath):
        goGraph = obonet.read_obo(oboFilePath)
    else:
        logger.info("Fetching url %s to resource file %s", urlTarget, oboFilePath)
        ok = fU.get(urlTarget, oboFilePath)
        if ok:
            goGraph = obonet.read_obo(oboFilePath)
    if goGraph:
        logger.info("Read %d nodes and %d edges", len(goGraph), goGraph.number_of_edges())
    else:
        logger.info("GO graph construction failed")
    #
    return goGraph
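# A small usage sketch (not in the source), assuming the obonet/networkx stack already
# used by this module: obonet.read_obo() returns a networkx graph whose edges point
# from child term to parent term, so the ancestors of a GO id are its graph
# descendants.  The goId value passed in would be a placeholder such as "GO:0008150".
import networkx as nx

def getAncestorGoIds(goGraph, goId):
    # Return the set of ancestor GO identifiers (parent terms, transitively).
    return nx.descendants(goGraph, goId) if goId in goGraph else set()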
def exportFasta(self, withGaps=False):
    """
    Example: The IMGT/GENE-DB FASTA header contains 15 fields separated by '|':

    1. IMGT/LIGM-DB accession number(s)
    2. IMGT gene and allele name
    3. species (may be followed by an "_" and the name of the strain, breed or isolate, if defined)
    4. IMGT gene and allele functionality
    5. exon(s), region name(s), or extracted label(s)
    6. start and end positions in the IMGT/LIGM-DB accession number(s)
    7. number of nucleotides in the IMGT/LIGM-DB accession number(s)
    8. codon start, or 'NR' (not relevant) for non-coding labels
    9. +n: number of nucleotides (nt) added in 5' compared to the corresponding label extracted from IMGT/LIGM-DB
    10. +n or -n: number of nucleotides (nt) added or removed in 3' compared to the corresponding label extracted from IMGT/LIGM-DB
    11. +n, -n, and/or nS: number of added, deleted, and/or substituted nucleotides to correct sequencing errors, or 'not corrected' if non-corrected sequencing errors
    12. number of amino acids (AA): this field indicates that the sequence is in amino acids
    13. number of characters in the sequence: nt (or AA)+IMGT gaps=total
    14. partial (if it is)
    15. reverse complementary (if it is)
    """
    # --
    fU = FileUtil()
    fU.mkdir(self.__dirPath)
    if withGaps:
        imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithGaps-F+ORF+inframeP"
    else:
        imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithoutGaps-F+ORF+inframeP"
    imgtTargetFileName = fU.getFileName(imgtTargetUrl)
    rawFastaPath = os.path.join(self.__dirPath, imgtTargetFileName)
    # --
    logger.debug("Fetching url %s path %s", imgtTargetUrl, rawFastaPath)
    ok = fU.get(imgtTargetUrl, rawFastaPath)
    logger.info("Fetch status (%r) url %s path %s", ok, imgtTargetUrl, rawFastaPath)
    # --
    fastaPath = os.path.join(self.__dirPath, "imgt-reference.fa")
    taxonPath = os.path.join(self.__dirPath, "imgt-reference-taxon.tdd")
    tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=True)
    ok = tP.testCache()
    if not ok:
        tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=False)
    rawQD = self.__mU.doImport(rawFastaPath, fmt="fasta", commentStyle="default")
    oD = {}
    taxonL = []
    for queryId, sD in rawQD.items():
        qL = queryId.split("|")
        tL = qL[2].split("_")
        taxName = tL[0]
        taxVar = tL[1].replace(" ", "_") if len(tL) > 1 else None
        taxId = tP.getTaxId(taxName)
        if taxId:
            tD = {"seqId": qL[0], "imgtGene": qL[1], "functionality": qL[3], "labels": qL[4], "taxId": taxId}
            if taxVar:
                tD["taxVar"] = taxVar
            sD.update(tD)
        else:
            logger.info("Unknown taxonomy %r (taxName=%r)", queryId, taxName)
        # Convert IMGT '.' gap characters to '-' (assign the result -- str.replace() returns a new string).
        sD["sequence"] = sD["sequence"].replace(".", "-")
        seqId = ""
        cL = []
        for k, v in sD.items():
            if k in ["sequence"]:
                continue
            cL.append(str(v))
            cL.append(str(k))
        seqId = "|".join(cL)
        oD[seqId] = sD
        taxonL.append("%s\t%s" % (seqId, taxId))
    #
    ok1 = self.__mU.doExport(taxonPath, taxonL, fmt="list")
    ok2 = self.__mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
    return ok1 and ok2
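# A tiny parsing sketch (not in the source) for the 15-field IMGT/GENE-DB header
# described in the docstring above.  The sample header is entirely made up; only the
# '|' field layout mirrors the documented format and the qL[0..4] usage in the code.
sampleHeader = "X00000|IGHV1-1*01|Homo sapiens_example|F|V-REGION|1..300|300 nt|1| | | |100 AA|100+0=100| | "
fields = sampleHeader.split("|")
accession, geneAllele, species, functionality, label = fields[0], fields[1], fields[2], fields[3], fields[4]
taxName = species.split("_")[0]  # species name precedes an optional "_strain" suffix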
class DataTypeApiProvider(SingletonClass): """ Data type application and instance information provider. """ def __init__(self, cfgOb, cachePath, useCache=True, **kwargs): """Data type application and instance information provider. Args: cfgOb (object): ConfigInfo() object instance cachePath (str): path to hold the cache directory useCache (bool, optional): flag to use cached files. Defaults to True. """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__useCache = useCache self.__cachePath = cachePath # self.__contentInfoConfigName = "content_info_helper_configuration" self.__fileU = FileUtil() self.__contentDefHelper = self.__cfgOb.getHelper( "CONTENT_DEF_HELPER_MODULE", sectionName=self.__configName, cfgOb=self.__cfgOb) self.__dirPath = os.path.join( cachePath, self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR", sectionName=self.__configName)) self.__kwargs = kwargs # logger.debug("Leaving constructor") def getDataTypeInstanceApi(self, databaseName, **kwargs): """Return instance of DataTypeInstanceInfo(). Args: databaseName (str): database name Returns: (object): Instance of DataTypeInstanceInfo() """ _ = kwargs dataTypeInstanceLocatorPath = self.__cfgOb.getPath( "INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH", sectionName=self.__configName) dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile( databaseName) if self.__contentDefHelper else None if dataTypeInstanceLocatorPath and dataTypeInstanceFile: loc = os.path.join(dataTypeInstanceLocatorPath, dataTypeInstanceFile) filePath = self.__reload(loc, self.__dirPath, useCache=self.__useCache) dtApi = DataTypeInstanceInfo(filePath) else: # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available. dtApi = DataTypeInstanceInfo(None) logger.debug("No data coverage available for database %s", databaseName) return dtApi def getDataTypeApplicationApi(self, appName, **kwargs): """Return instance of DataTypeApplicationInfo. Args: appName (str): application name (e.g., SQL, ANY) Returns: (object): Instance of DataTypeApplicationInfo() """ _ = kwargs dataTypeApplicationLocator = self.__cfgOb.getPath( "APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName) filePath = self.__reload(dataTypeApplicationLocator, self.__dirPath, useCache=self.__useCache) dtApi = DataTypeApplicationInfo( filePath, dataTyping=appName, workPath=self.__dirPath) if filePath else None return dtApi def __reload(self, urlTarget, dirPath, useCache=True): # fn = self.__fileU.getFileName(urlTarget) filePath = os.path.join(dirPath, fn) logger.debug("Using cache path %s", dirPath) self.__fileU.mkdir(dirPath) if not useCache: try: os.remove(filePath) except Exception: pass # if useCache and self.__fileU.exists(filePath): ok = True else: logger.debug("Fetch data from source %s", urlTarget) ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn)) return filePath if ok else None
class DictionaryApiProvider(SingletonClass):
    """Resource provider for dictionary APIs."""

    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.
        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload the local cache of dictionary resources.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (bool): True for success or False otherwise
        """
        #
        # Verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath, self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator, cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object for the input dictionaries.

        Args:
            dictLocators (list, str): list of dictionary locator paths

        Returns:
            (object): DictionaryApi() object for the input dictionaries
        """
        dictFileNames = [self.__fileU.getFileName(dictLocator) for dictLocator in dictLocators]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[dictTup] if dictTup in self.__apiMap else self.__getApi(dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """Return a dictionary API instance for the input dictionary locator list."""
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators, self.__dirPath, useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList, consolidate=consolidate, replaceDefinition=replaceDefinition, verbose=verbose)
        return dApi
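# A short usage sketch (not in the source).  The locator value is a placeholder --
# any local path or URL to an mmCIF dictionary accepted by FileUtil.get() works; the
# provider caches the fetched file under dirPath and memoizes the resulting
# DictionaryApi object per tuple of dictionary file names.
dictionaryLocators = ["https://example.org/dictionaries/mmcif_pdbx_v5_next.dic"]  # placeholder locator
dictProvider = DictionaryApiProvider(dirPath="./CACHE/dictionaries", useCache=True)
dApi = dictProvider.getApi(dictionaryLocators)
# Repeat calls with the same locators return the memoized DictionaryApi instance.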
def __reloadFasta(self, dirPath, **kwargs): """Reload DrugBank target FASTA data files. Args: dirPath (str, optional): path to DrugBank cache directory useCache (bool, optional): flag to use cached files. Defaults to True. Returns: """ startTime = time.time() logger.info("Starting db reload at %s", time.strftime("%Y %m %d %H:%M:%S", time.localtime())) retFilePathList = [] urlTargetL = [ "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences", "https://go.drugbank.com/releases/latest/downloads/enzyme-all-polypeptide-sequences", "https://go.drugbank.com/releases/latest/downloads/carrier-all-polypeptide-sequences", "https://go.drugbank.com/releases/latest/downloads/transporter-all-polypeptide-sequences", ] useCache = kwargs.get("useCache", True) username = kwargs.get("username", None) password = kwargs.get("password", None) # if not username or not password: return retFilePathList # fU = FileUtil() fU.mkdir(dirPath) # if not useCache: # Clear any cached files for urlTarget in urlTargetL: baseFileName = fU.getFileName(urlTarget) zipFileName = baseFileName + ".fasta.zip" retFileName = baseFileName + ".fa" for fn in [baseFileName, zipFileName, retFileName]: try: fp = os.path.join(dirPath, fn) os.remove(fp) except Exception: pass # ok = False if useCache: ok = True for urlTarget in urlTargetL: baseFileName = fU.getFileName(urlTarget) retFileName = baseFileName + ".fa" retFilePath = os.path.join(dirPath, retFileName) ok = fU.exists(retFilePath) if not ok: break retFilePathList.append(retFilePath) # logger.info("Using cached files %r", ok) if not useCache or not ok: if not username or not password: logger.warning( "Missing credentials for DrugBank file download...") for urlTarget in urlTargetL: baseFileName = fU.getFileName(urlTarget) zipFileName = baseFileName + ".fasta.zip" retFileName = baseFileName + ".fa" zipFilePath = os.path.join(dirPath, zipFileName) retFilePath = os.path.join(dirPath, retFileName) basePath = os.path.join(dirPath, baseFileName) logger.info("Fetching url %s for FASTA target file %s", urlTarget, baseFileName) ok = fU.get(urlTarget, zipFilePath, username=username, password=password) endTime = time.time() logger.info( "Completed db fetch at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) # ok = fU.unbundleZipfile(zipFilePath, dirPath=basePath) fU.put(os.path.join(basePath, "protein.fasta"), retFilePath) endTime = time.time() logger.info( "Completed unzip at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) retFilePathList.append(retFilePath) return retFilePathList
def __reloadAssignments(self, dirPath, **kwargs): """Fetch and read Args: dirPath ([type]): [description] Returns: [type]: [description] """ startTime = time.time() aD = {} try: targetUrl = kwargs.get( "assignmentUrl", "http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/summary/all" ) fU = FileUtil() dumpFileName = "sabdab_summary_all.tsv" # fU.mkdir(dirPath) dumpPath = os.path.join(dirPath, dumpFileName) logger.info("Fetching url %s path %s", targetUrl, dumpPath) ok = fU.get(targetUrl, dumpPath) rDL = self.__mU.doImport(dumpPath, fmt="tdd", rowFormat="dict") logger.info("SAbDab raw records (%d)", len(rDL)) logger.debug("rD keys %r", list(rDL[0].keys())) kyHL = [ "pdb", "Hchain", "model", "antigen_chain", "antigen_type", "antigen_het_name", "antigen_name", "heavy_subclass" ] kyLL = [ "pdb", "Lchain", "model", "antigen_chain", "antigen_type", "antigen_het_name", "antigen_name", "light_subclass", "light_ctype" ] # for rD in rDL: pdbId = rD["pdb"] if rD["pdb"] and rD["pdb"] != "NA" else None authAsymIdH = rD["Hchain"] if rD[ "Hchain"] and rD["Hchain"] != "NA" else None authAsymIdL = rD["Lchain"] if rD[ "Lchain"] and rD["Lchain"] != "NA" else None if pdbId and authAsymIdH: aD[pdbId + "." + authAsymIdH] = { k: v for k, v in rD.items() if v and v != "NA" and k in kyHL } if pdbId and authAsymIdL: aD[pdbId + "." + authAsymIdL] = { k: v for k, v in rD.items() if v and v != "NA" and k in kyLL } logger.info("Fetched (%d) SAbDab assignment records.", len(aD)) # except Exception as e: logger.exception("Failing with %s", str(e)) logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return aD
class FileUtilTests(unittest.TestCase): def setUp(self): self.__verbose = True self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic") self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data", "NCBI", "names.dmp.gz") self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip" self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_MODBASE_MODELS", "NP_001030614.1_1.pdb.xz") # self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz" self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz" # self.__workPath = os.path.join(HERE, "test-output") self.__inpDirPath = os.path.join(HERE, "test-data") self.__fileU = FileUtil() self.__startTime = time.time() logger.debug("Running tests on version %s", __version__) logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testTarBundling(self): """Test case for tarfile bundling and unbundling""" try: tP = os.path.join(self.__workPath, "t0.tar.gz") dirPath = os.path.join(self.__inpDirPath, "topdir") ok = self.__fileU.bundleTarfile(tP, [dirPath], mode="w:gz", recursive=True) self.assertTrue(ok) numBytes = self.__fileU.size(tP) self.assertGreaterEqual(numBytes, 250) # md5 = self.__fileU.hash(tP, hashType="md5") self.assertTrue(md5 is not None) # ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath) self.assertTrue(ok) # tP = os.path.join(self.__workPath, "t1.tar.gz") dirPathList = [ os.path.join(self.__inpDirPath, "topdir", "subdirA"), os.path.join(self.__inpDirPath, "topdir", "subdirB") ] ok = self.__fileU.bundleTarfile(tP, dirPathList, mode="w:gz", recursive=True) self.assertTrue(ok) # ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath) self.assertTrue(ok) tP = os.path.join(self.__workPath, "t2.tar") dirPathList = [ os.path.join(self.__inpDirPath, "topdir", "subdirA"), os.path.join(self.__inpDirPath, "topdir", "subdirB") ] ok = self.__fileU.bundleTarfile(tP, dirPathList, mode="w", recursive=True) self.assertTrue(ok) # ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testGetFile(self): """Test case for a local files and directories""" try: remoteLocator = self.__pathPdbxDictionaryFile fn = self.__fileU.getFileName(remoteLocator) # _, fn = os.path.split(remoteLocator) lPath = os.path.join(self.__workPath, fn) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) ok = self.__fileU.remove(lPath) self.assertTrue(ok) dPath = os.path.join(self.__workPath, "tdir") ok = self.__fileU.mkdir(dPath) self.assertTrue(ok) ok = self.__fileU.remove(dPath) self.assertTrue(ok) ok = self.__fileU.remove(";lakdjf") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testMoveAndCopyFile(self): """Test case for copying ("put") and moving ("replace") local files""" try: remoteLocator = 
self.__pathPdbxDictionaryFile fn = self.__fileU.getFileName(remoteLocator) # _, fn = os.path.split(remoteLocator) lPath = os.path.join(self.__workPath, fn) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) # Test copy file dPath2 = os.path.join(self.__workPath, "tdir") ok = self.__fileU.mkdir(dPath2) self.assertTrue(ok) lPath2 = os.path.join(dPath2, fn) ok = self.__fileU.put(lPath, lPath2) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath2) self.assertTrue(ok) # Remove copied file (to test moving file next) ok = self.__fileU.remove(lPath2) self.assertTrue(ok) ok = self.__fileU.exists(lPath2) self.assertFalse(ok) # Test move file ok = self.__fileU.replace(lPath, lPath2) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertFalse(ok) ok = self.__fileU.exists(lPath2) self.assertTrue(ok) # Now clean up files and dirs ok = self.__fileU.remove(lPath) self.assertTrue(ok) ok = self.__fileU.remove(dPath2) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testZipUrl(self): """Test case for downloading remote zip file and extracting contents.""" try: remoteLocator = self.__zipFileUrl # fn = self.__fileU.getFileName(remoteLocator) ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # lPath = os.path.join(self.__workPath, self.__fileU.getFileName(self.__zipFileUrl)) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath) ok = fp.endswith("Food_Display_Table.xlsx") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testFtpUrl(self): """Test case for downloading remote file ftp protocol and extracting contents.""" try: remoteLocator = self.__ftpFileUrl # fn = self.__fileU.getFileName(remoteLocator) ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # dirPath = os.path.join(self.__workPath, "chem_comp_models") lPath = os.path.join(dirPath, self.__fileU.getFileName(self.__ftpFileUrl)) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) fp = self.__fileU.uncompress(lPath, outputDir=dirPath) ok = fp.endswith("chem_comp_model.cif") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testRemote(self): """Test case remote status""" try: remoteLocator = self.__httpsFileUrl ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # ok = self.__fileU.exists(remoteLocator) self.assertTrue(ok) size = self.__fileU.size(remoteLocator) self.assertGreaterEqual(size, 1000) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() @unittest.skip("DrugBank example -- skipping") def testGetDrugBankUrl(self): """Test case for downloading drugbank master xml file""" try: remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database" un = "username" pw = "password" # fn = self.__fileU.getFileName(remoteLocator) ok = self.__fileU.isLocal(remoteLocator) self.assertFalse(ok) # lPath = os.path.join(self.__workPath, "db-download.zip") ok = self.__fileU.get(remoteLocator, lPath, username=un, password=pw) 
self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) self.__fileU.uncompress(lPath, outputDir=self.__workPath) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testXzFile(self): """Test case for extracting contents from xz file""" try: remoteLocator = self.__xzFile fn = self.__fileU.getFileName(remoteLocator) lPath = os.path.join(self.__workPath, fn) ok = self.__fileU.get(remoteLocator, lPath) self.assertTrue(ok) ok = self.__fileU.exists(lPath) self.assertTrue(ok) ok = self.__fileU.isLocal(lPath) self.assertTrue(ok) tPath = self.__fileU.getFilePath(lPath) self.assertEqual(lPath, tPath) fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath) ok = fp.endswith(".pdb") self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
def __reload(self, dirPath, **kwargs): startTime = time.time() oD = {} useCache = kwargs.get("useCache", True) targetUrl = kwargs.get( "targetUrl", "http://opig.stats.ox.ac.uk/webapps/newsabdab/static/downloads/TheraSAbDab_SeqStruc_OnlineDownload.csv" ) # ok = False fU = FileUtil() _, dumpFileName = os.path.split(targetUrl) # fU.mkdir(dirPath) dumpPath = os.path.join(dirPath, dumpFileName) dataPath = os.path.join(dirPath, "sabdab-data.json") # logger.info("useCache %r sabdabDumpPath %r", useCache, dumpPath) if useCache and self.__mU.exists(dataPath): oD = self.__mU.doImport(dataPath, fmt="json") else: logger.info("Fetching url %s path %s", targetUrl, dumpPath) ok = fU.get(targetUrl, dumpPath) # rDL = self.__mU.doImport(dumpPath, fmt="csv", rowFormat="dict") logger.debug("rD keys %r", list(rDL[0].keys())) tD = {} for rD in rDL: qD = {} for kTup in [ ("Therapeutic", "antibodyName"), ("Format", "antiBodyFormat"), ("CH1 Isotype", "ch1Isotype"), ("VD LC", "VD_LC"), ("Highest_Clin_Trial (Oct '21)", "maxClinicalPhase"), ("Est. Status", "status"), ("Target", "target"), ("Conditions Approved", "conditionsApproved"), ("Conditions Active", "conditionsActive"), ]: if kTup[0] in rD and rD[kTup[0]] not in ["na", "na;na"]: qD[kTup[1]] = rD[kTup[0]] else: qD[kTup[1]] = None if kTup[0] not in rD: logger.error( "SabDab key %r missing in input dataset %r", kTup[0], list(rD.keys())) tD[rD["Therapeutic"]] = qD aD = self.__reloadAssignments(dirPath, **kwargs) # tS = datetime.datetime.now().isoformat() vS = datetime.datetime.now().strftime("%Y-%m-%d") oD = { "version": vS, "created": tS, "identifiers": tD, "assignments": aD } ok = self.__mU.doExport(dataPath, oD, fmt="json", indent=3) logger.info( "Exporting (%d) Thera-SAbDab data records and (%d) SAbDab assignments in %r status %r", len(oD["identifiers"]), len(oD["assignments"]), dataPath, ok) # --- logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return oD["identifiers"], oD["assignments"], dumpPath, oD["version"]
class MarshalUtil(object):
    """Wrapper for serialization and deserialization methods."""

    def __init__(self, **kwargs):
        self.__workPath = kwargs.get("workPath", ".")
        self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
        self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__ioU = IoUtil()

    def doExport(self, locator, obj, fmt="list", marshalHelper=None, numParts=None, **kwargs):
        """Serialize the input object at locator path in specified format.  The input object is
        optionally preprocessed by the helper method.

        Args:
            locator (str): target path or URI
            obj (object): data to be serialized
            fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None.
            numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats)

        Returns:
            bool: True for success or False otherwise
        """
        try:
            ret = False
            localFlag = self.__fileU.isLocal(locator)
            if marshalHelper:
                myObj = marshalHelper(obj, **kwargs)
            else:
                myObj = obj
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serializeInParts(localFilePath, myObj, numParts, fmt=fmt, **kwargs)
            elif localFlag:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serialize(localFilePath, myObj, fmt=fmt, workPath=self.__workPath, **kwargs)
            else:
                with tempfile.TemporaryDirectory(suffix=self.__workDirSuffix, prefix=self.__workDirPrefix, dir=self.__workPath) as tmpDirName:
                    # write a local copy then copy to destination -
                    #
                    localFilePath = os.path.join(self.__workPath, tmpDirName, self.__fileU.getFileName(locator))
                    ok1 = self.__ioU.serialize(localFilePath, myObj, fmt=fmt, workPath=self.__workPath, **kwargs)
                    ok2 = True
                    if ok1:
                        ok2 = self.__fileU.put(localFilePath, locator, **kwargs)
                    ret = ok1 and ok2
        except Exception as e:
            logger.exception("Exporting locator %r failing with %s", locator, str(e))
        return ret

    def doImport(self, locator, fmt="list", marshalHelper=None, numParts=None, **kwargs):
        """Deserialize data at the target locator in specified format.  The deserialized data is
        optionally post-processed by the input helper method.

        Args:
            locator (str): path or URI to input data
            fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None.
            numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats)
            tarMember (str, optional): name of a member of tar file bundle. Defaults to None. (tar file format)

        Returns:
            Any: format specific return type
        """
        try:
            tarMember = kwargs.get("tarMember", None)
            localFlag = self.__fileU.isLocal(locator) and not tarMember
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserializeInParts(filePath, numParts, fmt=fmt, **kwargs)
            elif localFlag:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserialize(filePath, fmt=fmt, workPath=self.__workPath, **kwargs)
            else:
                #
                if fmt == "mmcif":
                    ret = self.__ioU.deserialize(locator, fmt=fmt, workPath=self.__workPath, **kwargs)
                else:
                    with tempfile.TemporaryDirectory(suffix=self.__workDirSuffix, prefix=self.__workDirPrefix, dir=self.__workPath) as tmpDirName:
                        #
                        # Fetch first then read a local copy -
                        #
                        if tarMember:
                            localFilePath = os.path.join(self.__workPath, tmpDirName, tarMember)
                        else:
                            localFilePath = os.path.join(self.__workPath, tmpDirName, self.__fileU.getFileName(locator))
                        # --- Local copy approach ---
                        self.__fileU.get(locator, localFilePath, **kwargs)
                        ret = self.__ioU.deserialize(localFilePath, fmt=fmt, workPath=self.__workPath, **kwargs)
            if marshalHelper:
                ret = marshalHelper(ret, **kwargs)
        except Exception as e:
            logger.exception("Importing locator %r failing with %s", locator, str(e))
            ret = None
        return ret

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)
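# A brief usage sketch (not in the source): round-trip a Python dictionary through a
# local JSON file with MarshalUtil.  The paths below are placeholders.
mU = MarshalUtil(workPath="./CACHE")
payload = {"version": "1.0", "data": {"A": 1, "B": 2}}
ok = mU.doExport("./CACHE/example-data.json", payload, fmt="json", indent=3)
recovered = mU.doImport("./CACHE/example-data.json", fmt="json")
assert ok and recovered["data"]["B"] == 2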
def __reload(self, dirPath, useCache=False, imgtDumpUrl=None, testList=None, maxCount=None): imgtD = {} startTime = time.time() fU = FileUtil() fU.mkdir(dirPath) # imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json") # logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath) if useCache and self.__mU.exists(imgtDataPath): imgtD = self.__mU.doImport(imgtDataPath, fmt="json") self.__version = imgtD["version"] else: imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz" imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE" imgtDumpFileName = fU.getFileName(imgtDumpUrl) imgtDumpPath = os.path.join(dirPath, imgtDumpFileName) imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt") _, fn = os.path.split(imgtDumpUrl) imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4]) # logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath) ok1 = fU.get(imgtDumpUrl, imgtDumpPath) ok2 = fU.get(imgtReadmeUrl, imgtReleasePath) fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath) logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) # --- readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list") self.__version = readmeLines[0].strip() if readmeLines else None logger.info("IMGT version %r", self.__version) # --- chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath, maxCount=maxCount, testList=testList) # --- tS = datetime.datetime.now().isoformat() # vS = datetime.datetime.now().strftime("%Y-%m-%d") if testList: imgtD = { "version": self.__version, "date": tS, "chains": chainD, "raw": rawD } else: imgtD = { "version": self.__version, "date": tS, "chains": chainD } ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3) logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return imgtD
class IoUtil(object): def __init__(self, **kwargs): self.__fileU = FileUtil(**kwargs) def serialize(self, filePath, myObj, fmt="pickle", **kwargs): """Public method to serialize format appropriate objects Args: filePath (str): local file path' myObj (object): format appropriate object to be serialized format (str, optional): one of ['mmcif', mmcif-dict', json', 'list', 'text-dump', pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: bool: status of serialization operation; true for success or false otherwise """ ret = False fmt = str(fmt).lower() ret = self.__fileU.mkdirForFile(filePath) if not ret: return ret if fmt in ["mmcif"]: ret = self.__serializeMmCif(filePath, myObj, **kwargs) elif fmt in ["json"]: ret = self.__serializeJson(filePath, myObj, **kwargs) elif fmt in ["pickle"]: ret = self.__serializePickle(filePath, myObj, **kwargs) elif fmt in ["list"]: ret = self.__serializeList(filePath, myObj, enforceAscii=True, **kwargs) elif fmt in ["mmcif-dict"]: ret = self.__serializeMmCifDict(filePath, myObj, **kwargs) elif fmt in ["text-dump"]: ret = self.__textDump(filePath, myObj, **kwargs) elif fmt in ["fasta"]: ret = self.__serializeFasta(filePath, myObj, **kwargs) elif fmt in ["csv"]: ret = self.__serializeCsv(filePath, myObj, **kwargs) else: pass return ret def deserialize(self, filePath, fmt="pickle", **kwargs): """Public method to deserialize objects in supported formats. Args: filePath (str): local file path format (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ fmt = str(fmt).lower() if fmt in ["mmcif"]: ret = self.__deserializeMmCif(filePath, **kwargs) # type: ignore elif fmt in ["json"]: ret = self.__deserializeJson(filePath, **kwargs) # type: ignore elif fmt in ["pickle"]: ret = self.__deserializePickle(filePath, **kwargs) # type: ignore elif fmt in ["list"]: ret = self.__deserializeList(filePath, enforceAscii=True, **kwargs) # type: ignore elif fmt in ["mmcif-dict"]: ret = self.__deserializeMmCifDict(filePath, **kwargs) # type: ignore elif fmt in ["fasta"]: ret = self.__deserializeFasta(filePath, **kwargs) # type: ignore # elif fmt in ["vrpt-xml-to-cif"]: # ret = self.__deserializeVrptToCif(filePath, **kwargs) # type: ignore elif fmt in ["csv", "tdd"]: delimiter = kwargs.get("csvDelimiter", "," if fmt == "csv" else "\t") ret = self.__deserializeCsv(filePath, delimiter=delimiter, **kwargs) # type: ignore elif fmt in ["xml"]: ret = self.__deserializeXml(filePath, **kwargs) # type: ignore else: ret = None # type: ignore return ret def __sliceInChunks(self, myList, numChunks): mc = min(len(myList), numChunks) chunkSize = int(len(myList) / mc) if len(myList) % mc: chunkSize += 1 for i in range(0, len(myList), chunkSize): yield myList[i:i + chunkSize] def serializeInParts(self, filePath, myObj, numParts, fmt="json", **kwargs): """Public method to serialize format appropriate (json, pickle) objects in multiple parts Args: filePath (str): local file path myObj (object): format appropriate object to be serialized numParts (int): divide the data into numParts segments format (str, optional): one of ['json' or 'pickle']. 
Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: bool: True for success or False otherwise """ if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return False pth, fn = os.path.split(filePath) self.__fileU.mkdirForFile(pth) bn, ext = os.path.splitext(fn) ret = True if isinstance(myObj, list): for ii, subList in enumerate(self.__sliceInChunks(myObj, numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, subList, fmt=fmt, **kwargs) ret = ret and ok elif isinstance(myObj, dict): for ii, keyList in enumerate( self.__sliceInChunks(list(myObj.keys()), numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, OrderedDict([(k, myObj[k]) for k in keyList]), fmt=fmt, **kwargs) ret = ret and ok else: logger.error("Unsupported data type for serialization in parts") ret = False # return ret def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs): """Public method to deserialize objects in supported formats from multiple parts Args: filePath (str): local file path numParts (int): reconstruct the data object from numParts segments format (str, optional): one of ['json' or 'pickle']. Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ rObj = None if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return rObj # pth, fn = os.path.split(filePath) bn, ext = os.path.splitext(fn) if not numParts: fp = os.path.join(pth, bn + "_part_*" + ext) numParts = len(glob.glob(fp)) # for ii in range(numParts): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) tObj = self.deserialize(fp, fmt=fmt, **kwargs) if isinstance(tObj, list): if not rObj: rObj = [] rObj.extend(tObj) elif isinstance(tObj, dict): if not rObj: rObj = OrderedDict() rObj.update(tObj) else: logger.error( "Unsupported data type for deserialization in parts") return rObj def exists(self, filePath, mode=os.R_OK): return self.__fileU.exists(filePath, mode=mode) def mkdir(self, dirPath, mode=0o755): return self.__fileU.mkdir(dirPath, mode=mode) def remove(self, pth): return self.__fileU.remove(pth) def __deserializeFasta(self, filePath, **kwargs): try: commentStyle = kwargs.get("commentStyle", "uniprot") fau = FastaUtil() return fau.readFasta(filePath, commentStyle=commentStyle) except Exception as e: logger.error("Unable to deserialize %r %r ", filePath, str(e)) return {} def __serializeFasta(self, filePath, myObj, **kwargs): try: maxLineLength = int(kwargs.get("maxLineLength", 70)) makeComment = kwargs.get("makeComment", False) fau = FastaUtil() ok = fau.writeFasta(filePath, myObj, maxLineLength=maxLineLength, makeComment=makeComment) return ok except Exception as e: logger.error("Unable to serialize FASTA file %r %r", filePath, str(e)) return False def __textDump(self, filePath, myObj, **kwargs): try: indent = kwargs.get("indent", 1) width = kwargs.get("width", 120) sOut = pprint.pformat(myObj, indent=indent, width=width) with open(filePath, "w") as ofh: ofh.write("\n%s\n" % sOut) return True except Exception as e: logger.error("Unable to dump to %r %r", filePath, str(e)) return False def __serializePickle(self, filePath, myObj, **kwargs): try: pickleProtocol = kwargs.get("pickleProtocol", pickle.DEFAULT_PROTOCOL) with open(filePath, "wb") as outfile: pickle.dump(myObj, outfile, pickleProtocol) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) 
return False def __deserializePickle(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) try: if sys.version_info[0] > 2: encoding = kwargs.get("encoding", "ASCII") errors = kwargs.get("errors", "strict") with open(filePath, "rb") as outfile: return pickle.load(outfile, encoding=encoding, errors=errors) else: with open(filePath, "rb") as outfile: return pickle.load(outfile) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __serializeJson(self, filePath, myObj, **kwargs): """Internal method to serialize the input object as JSON. An encoding helper class is included to handle selected python data types (e.g., datetime) """ indent = kwargs.get("indent", 0) enforceAscii = kwargs.get("enforceAscii", True) try: if enforceAscii: with open(filePath, "w") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) else: with io.open(filePath, "w", encoding="utf-8") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __deserializeJson(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: with open(filePath, "r") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __hasMinSize(self, pth, minSize): try: return os.path.getsize(pth) >= minSize except Exception: return False def __deserializeMmCif(self, locator, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) minSize = kwargs.get("minSize", 5) # if self.__fileU.isLocal(locator): if minSize >= 0 and not self.__hasMinSize(locator, minSize): logger.warning("Minimum file size not satisfied for: %r", locator) myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile( locator, enforceAscii=enforceAscii, outDirPath=workPath) # type: ignore else: # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) containerList = self.__deserializeMmCifRemote( locator, useCharRefs, enforceAscii, workPath) except Exception as e: logger.error("Failing for %s with %s", locator, str(e)) return containerList @retry((requests.exceptions.RequestException), maxAttempts=3, delaySeconds=1, multiplier=2, defaultValue=[], logger=logger) def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii, workPath): containerList = [] try: myIo = IoAdapterPy(raiseExceptions=True, 
useCharRefs=useCharRefs) containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: raise e return containerList def __serializeMmCif(self, filePath, containerList, **kwargs): """ """ try: ret = False workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) if filePath.endswith(".gz") and workPath: rfn = "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) tPath = os.path.join(workPath, rfn) ret = myIo.writeFile(tPath, containerList=containerList, enforceAscii=enforceAscii) ret = self.__fileU.compress(tPath, filePath, compressType="gzip") else: ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __deserializeMmCifDict(self, filePath, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile(filePath, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return containerList def __serializeMmCifDict(self, filePath, containerList, **kwargs): """ """ try: ret = False # workPath = kwargs.get('workPath', None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs): """ """ try: _ = kwargs if enforceAscii: encoding = "ascii" else: encoding = "utf-8" # if sys.version_info[0] > 2: with open(filePath, "w") as ofh: if enforceAscii: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: for st in aList: ofh.write("%s\n" % st) else: if enforceAscii: with io.open(filePath, "w", encoding=encoding) as ofh: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: with open(filePath, "wb") as ofh: for st in aList: ofh.write("%s\n" % st) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __processList(self, ifh, enforceAscii=True, **kwargs): uncomment = kwargs.get("uncomment", True) aList = [] for line in ifh: if enforceAscii: pth = line[:-1].encode("ascii", "xmlcharrefreplace").decode("ascii") else: pth = line[:-1] if not pth or (uncomment and pth.startswith("#")): continue aList.append(pth) return aList def __deserializeList(self, filePath, enforceAscii=True, encodingErrors="ignore", **kwargs): aList = [] _ = kwargs try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding="utf-8-sig", errors=encodingErrors) as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) else: tPath = self.__fileU.uncompress(filePath, 
outputDir=None) # for py2 this commented code is problematic for non-ascii data # with gzip.open(filePath, "rb") as ifh: # aList = self.__processList(ifh, enforceAscii=enforceAscii) with io.open(tPath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii) else: with io.open(filePath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(aList)) return aList def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True): oL = [] maxInt = sys.maxsize csv.field_size_limit(maxInt) if rowFormat == "dict": if uncomment: reader = csv.DictReader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.DictReader(csvFile, delimiter=delimiter) for rowD in reader: oL.append(rowD) elif rowFormat == "list": if uncomment: reader = csv.reader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.reader(csvFile, delimiter=delimiter) for rowL in reader: oL.append(rowL) return oL def deserializeCsvIter(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): """Return an iterator to input CSV format file. Args: filePath (str): input file path delimiter (str, optional): CSV delimiter. Defaults to ",". rowFormat (str, optional): format for each process row (list or dict). Defaults to "dict". encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore". uncomment (bool, optional): flag to ignore leading comments. Defaults to True. Returns: (iterator): iterator for rowwise access to processed CSV data """ encoding = kwargs.get("encoding", "utf-8-sig") maxInt = sys.maxsize csv.field_size_limit(maxInt) try: if filePath[-3:] == ".gz": with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: yield row else: with io.open(filePath, newline="", encoding=encoding, errors="ignore") as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: # if uncomment and row.startswith("#"): # continue yield row except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) def __deserializeCsv(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): oL = [] encoding = kwargs.get("encoding", "utf-8-sig") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: with io.open(filePath, newline="", 
encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) return oL except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(oL)) return oL def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs): """ """ _ = kwargs try: wD = {} ret = False fNames = fieldNames if fieldNames else list(rowDictList[0].keys()) # with io.open(filePath, 'w', newline='') as csvFile: with open(filePath, "w") as csvFile: writer = csv.DictWriter(csvFile, fieldnames=fNames) writer.writeheader() for ii, rowDict in enumerate(rowDictList): try: wD = {k: v for k, v in rowDict.items() if k in fNames} writer.writerow(wD) except Exception as e: logger.error( "Skipping bad CSV record %d wD %r rowDict %r with %s", ii + 1, wD, rowDict, str(e)) continue ret = True except Exception as e: logger.error("Failing for %s : %r with %s", filePath, wD, str(e)) return ret def __csvEncoder(self, csvData, encoding="utf-8-sig", encodingErrors="ignore"): """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars) Args: csvData (text lines): uncompressed data from gzip open encoding (str, optional): character encoding. Defaults to "utf-8-sig". encodingErrors (str, optional): error treatment. Defaults to "ignore". """ for line in csvData: yield line.decode("utf-8-sig", errors=encodingErrors).encode( encoding, errors=encodingErrors) def __deserializeXmlPrev(self, filePath, **kwargs): """Read the input XML file path and return an ElementTree data object instance. Args: filePath (sting): input XML file path Returns: object: instance of an ElementTree tree object """ _ = kwargs tree = None try: logger.debug("Parsing XML path %s", filePath) if filePath[-3:] == ".gz": with gzip.open(filePath, mode="rb") as ifh: tV = time.time() tree = ET.parse(ifh) else: with open(filePath, mode="rb") as ifh: tV = time.time() tree = ET.parse(ifh) logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # return tree def __testGzip(self, filePath): ok = True with gzip.open(filePath, "r") as fh: try: fh.read(1) except gzip.BadGzipFile: ok = False except Exception: ok = False logger.debug("Gzip file check %r", ok) return ok def __deserializeXml(self, filePath, **kwargs): """Read the input XML file path and return an ElementTree data object instance. Args: filePath (sting): input XML file path Returns: object: instance of an ElementTree tree object """ _ = kwargs tree = None encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") # try: logger.debug("Parsing XML path %s", filePath) if filePath[-3:] == ".gz" and self.__testGzip(filePath): if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) else: tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) else: with io.open(filePath, encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # return tree
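# A short sketch (not in the source) exercising the multi-part JSON path of IoUtil
# shown above: a large dictionary is written as numbered "_part_N" files and then
# reassembled.  The file names and part count are illustrative.
ioU = IoUtil()
ioU.mkdir("./CACHE")
bigD = {"key_%d" % i: i for i in range(1000)}
ok = ioU.serializeInParts("./CACHE/big-data.json", bigD, numParts=4, fmt="json")
# deserializeInParts() infers the part count from the "_part_*" glob when numParts is None.
restored = ioU.deserializeInParts("./CACHE/big-data.json", numParts=None, fmt="json")
assert ok and len(restored) == len(bigD)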
class ProvenanceProvider(SingletonClass): """Utilities to access and update provenance details.""" def __init__(self, cfgOb, cachePath, useCache=True, **kwargs): """Utilities to access and update provenance details. Args: cfgOb (object): ConfigInfo() instance cachePath (str): top-level cache directory path useCache (bool, optional): use cached provenance data. Defaults to True. """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__cachePath = cachePath self.__useCache = useCache # self.__workPath = os.path.join(self.__cachePath, "work") self.__provenanceCachePath = os.path.join( self.__cachePath, self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR", sectionName=self.__configName)) self.__provenanceLocator = self.__cfgOb.getPath( "PROVENANCE_INFO_LOCATOR", sectionName=self.__configName) # self.__fileU = FileUtil(workPath=self.__workPath) self.__fileU.mkdir(self.__provenanceCachePath) self.__kwargs = kwargs # def __reload(self, locator, dirPath, useCache=True): # fn = self.__fileU.getFileName(locator) filePath = os.path.join(dirPath, fn) logger.debug("Using cache path %s", dirPath) self.__fileU.mkdir(dirPath) if not useCache: try: os.remove(filePath) except Exception: pass # if useCache and self.__fileU.exists(filePath): ok = True else: logger.debug("Fetch data from source %s", locator) ok = self.__fileU.get(locator, filePath) return filePath if ok else None def fetch(self): try: provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) return mU.doImport(provenanceFileCachePath, fmt="json") except Exception as e: logger.exception("Failed retrieving provenance with %s", str(e)) return {} def update(self, provD): ok = False try: provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) tD = mU.doImport(provenanceFileCachePath, fmt="json") tD.update(provD) ok = mU.doExport(provenanceFileCachePath, tD, fmt="json") except Exception as e: logger.exception("Failed updating provenance with %s", str(e)) return ok def store(self, provD): ok = False try: provenanceFileCachePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) ok = mU.doExport(provenanceFileCachePath, provD, fmt="json") except Exception as e: logger.exception("Failed storing provenance with %s", str(e)) return ok
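# --- Illustrative sketch (not part of the library code above) ---
# ProvenanceProvider.update() follows a simple read-merge-write cycle: import
# the cached provenance JSON, update it with the new items, and export it
# again. The stand-alone sketch below shows the same cycle with only the
# standard library and without the config/locator plumbing; the path
# "provenance-cache.json" is hypothetical.
import json
import os


def updateProvenance(cacheFilePath, provD):
    # Merge provD into the cached provenance document and rewrite the cache file.
    tD = {}
    if os.path.exists(cacheFilePath):
        with open(cacheFilePath, "r", encoding="utf-8") as ifh:
            tD = json.load(ifh)
    tD.update(provD)
    with open(cacheFilePath, "w", encoding="utf-8") as ofh:
        json.dump(tD, ofh, indent=3)
    return True


if __name__ == "__main__":
    ok = updateProvenance("provenance-cache.json", {"source": "example", "version": "1.0"})
    print("update status %r" % ok)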
class SchemaProvider(SingletonClass): """ A collection of schema build and caching methods. Static cache workflow: <authoritative source> <-- <cache dir> <- client API Compute workflow: <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <Json schema> """ def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs): """A collection of schema build and caching methods. Args: cfgOb (object): ConfigInfo() instance cachePath (str): path to directory containing schema useCache (bool, optional): use cached schema. Defaults to True. rebuildFlag (bool, optional): on-the-fly rebuild and cache schema """ self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__cachePath = os.path.abspath(cachePath) self.__useCache = useCache self.__rebuildFlag = rebuildFlag self.__useCache = rebuildFlag if rebuildFlag else useCache # self.__workPath = os.path.join(self.__cachePath, "work") self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work")) self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName)) self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName)) self.__fileU.mkdir(self.__schemaCachePath) self.__fileU.mkdir(self.__jsonSchemaCachePath) self.__kwargs = kwargs def getSchemaOptions(self, schemaLevel, extraOpts=None): opts = extraOpts + "|" if extraOpts else "" if schemaLevel == "full": return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb" elif schemaLevel in ["min", "minimum"]: return opts + "mandatoryKeys|enums|rcsb" else: return opts def getSchemaInfo(self, databaseName, dataTyping="ANY"): """Convenience method to return essential schema details for the input repository content type. Args: databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...) dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)
Returns: tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list """ sd = None dbName = None collectionNameList = [] docIndexD = {} try: mU = MarshalUtil(workPath=self.__workPath) schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping) if self.__rebuildFlag: filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator)) self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True) else: filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache) if not filePath: logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping) logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator) schemaDef = mU.doImport(filePath, fmt="json") if schemaDef: logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping) sd = SchemaDefAccess(schemaDef) if sd: dbName = sd.getDatabaseName() collectionInfoList = sd.getCollectionInfo() logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList) for cd in collectionInfoList: collectionName = cd["NAME"] collectionNameList.append(collectionName) docIndexD[collectionName] = sd.getDocumentIndices(collectionName) except Exception as e: logger.exception("Retrieving schema %s for %s failing with %s", databaseName, dataTyping, str(e)) return sd, dbName, collectionNameList, docIndexD def schemaDefCompare(self, databaseName, dataTyping="ANY"): """Compare computed schema definition with current source/cached version. Args: databaseName (str): schema definition name for comparison dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY". Returns: (str): file path for schema difference or None """ mU = MarshalUtil(workPath=self.__workPath) schemaDiffPath = os.path.join(self.__cachePath, "schema_diff") mU.mkdir(schemaDiffPath) schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping) fn = self.__fileU.getFileName(schemaPath) sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping) v2 = sD["DATABASE_VERSION"] # ---- # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test") # logger.info("Exporting schema def to %s", tPath) # mU.doExport(tPath, sD, fmt="json", indent=3) # sD = mU.doImport(tPath, fmt="json") # ---- cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath)) sDCache = mU.doImport(cPath, fmt="json") v1 = sDCache["DATABASE_VERSION"] # numDiff, difD = self.schemaCompare(sDCache, sD) # # jD = diff(sDCache, sD, syntax="explicit", marshal=True) diffPath = None if numDiff: bn, _ = os.path.splitext(fn) diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json") # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100)) mU.doExport(diffPath, difD, fmt="json", indent=3) # return diffPath def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None): """Compare computed JSON schema definition with current source/cached version.
Args: databaseName (str): schema name collectionName (str): collection name encodingType (str): schema data type conventions (JSON|BSON) level (str): metadata level (min|full) extraOpts (str): extra schema construction options Returns: (str): path to the difference file or None """ mU = MarshalUtil(workPath=self.__workPath) schemaDiffPath = os.path.join(self.__cachePath, "schema_diff") mU.mkdir(schemaDiffPath) schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level) fn = self.__fileU.getFileName(schemaLocator) schemaPath = os.path.join(self.__jsonSchemaCachePath, fn) # sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts) v2 = self.__getSchemaVersion(sD) # ---- # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test") # logger.info("Exporting json schema to %s", tPath) # mU.doExport(tPath, sD, fmt="json", indent=3) # ---- # sDCache = mU.doImport(schemaPath, fmt="json") v1 = self.__getSchemaVersion(sDCache) if not v1: logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName) # numDiff, difD = self.schemaCompare(sDCache, sD) # jD = diff(sDCache, sD, marshal=True, syntax="explicit") diffPath = None if numDiff: logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100)) bn, _ = os.path.splitext(fn) diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json") mU.doExport(diffPath, difD, fmt="json", indent=3) return diffPath def __getSchemaVersion(self, jsonSchema): try: comment = jsonSchema["$comment"] if "$comment" in jsonSchema else "" ff = comment.split(":") version = ff[1].strip() return version except Exception as e: logger.exception("Failing with %s", str(e)) return "" def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"): """Internal method returning schema definition path for the input content type and application. Defines schema definition naming convention - Args: databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...) dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...) Returns: str: schema definition file locator """ schemaLocator = None try: locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName) fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper()) schemaLocator = os.path.join(locPath, fn) except Exception as e: logger.exception("Retrieving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e)) return schemaLocator def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"): """Internal method returning JSON schema path for the input collection data type convention and level. Defines the JSON/BSON schema naming convention - Args: databaseName (str): database name in the document store collectionName (str): collection name in document store encodingType (str, optional): data type convention (BSON|JSON) level (str, optional): Completeness of the schema (e.g.
min or full) Returns: str: schema file locator """ schemaLocator = None try: sdType = None sLevel = None schemaLocator = None if encodingType.upper() in ["JSON", "BSON"]: sdType = encodingType.lower() if level.lower() in ["min", "minimum"]: sLevel = "min" elif level.lower() in ["full"]: sLevel = level.lower() # if sdType and sLevel: locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName) fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName) schemaLocator = os.path.join(locPath, fn) else: logger.error("Unsupported schema options: %s level %r type %r", collectionName, level, encodingType) schemaLocator = None except Exception as e: logger.debug("Retrieving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e)) # return schemaLocator def __reload(self, locator, dirPath, useCache=True): # fn = self.__fileU.getFileName(locator) filePath = os.path.join(dirPath, fn) logger.debug("Target cache filePath %s", filePath) self.__fileU.mkdir(dirPath) if not useCache: try: os.remove(filePath) except Exception: pass # if useCache and self.__fileU.exists(filePath): ok = True else: logger.info("Fetch data from source %s to %s", locator, filePath) ok = self.__fileU.get(locator, filePath) return filePath if ok else None def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None): """Return JSON schema (w/ BSON types) object for the input collection and level. Args: databaseName (str): database name collectionName (str): collection name in document store encodingType (str, optional): data type convention (BSON|JSON) level (str, optional): Completeness of the schema (e.g. min or full) Returns: dict: Schema object """ sObj = None schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level) # if self.__rebuildFlag: filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator)) self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts) else: filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache) mU = MarshalUtil(workPath=self.__workPath) if filePath and mU.exists(filePath): mU = MarshalUtil(workPath=self.__workPath) sObj = mU.doImport(filePath, fmt="json") else: logger.debug("Failed to read schema for %s %r", collectionName, level) return sObj def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None): try: smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath) # cD = None stU = encodingType.upper() cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts)) if cD and saveSchema: schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level) localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator)) mU = MarshalUtil(workPath=self.__workPath) mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False) except Exception as e: logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e)) return cD def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False): schemaDef = None try: smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath) schemaDef = smb.build(dataTyping=dataTyping,
encodingType="rcsb") if schemaDef and saveSchema: schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping) localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator)) mU = MarshalUtil(workPath=self.__workPath) mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False) except Exception as e: logger.exception("Building schema %s failing with %s", databaseName, str(e)) return schemaDef def schemaCompare(self, orgD, newD): """ Compute the difference of nested dictionaries. """ fOrgD = self.__flatten(orgD) fNewD = self.__flatten(newD) if len(fOrgD) != len(fNewD): logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD)) # addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)} removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)} changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]} changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]} chD = {} for ky in changedOrgD: kyS = ".".join(ky) vOrg = changedOrgD[ky] vNew = changedNewD[ky] if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)): # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew) dV = list(set(vNew) - set(vOrg)) if dV: chD[kyS] = {"diff": dV} else: chD[kyS] = {"from": vOrg, "to": vNew} # nT = len(addedD) + len(removedD) + len(chD) diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD} return nT, diffD def __flatten(self, inpDict, prefix=None): prefix = prefix[:] if prefix else [] outDict = {} for key, value in inpDict.items(): if isinstance(value, dict) and value: deeper = self.__flatten(value, prefix + [key]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) elif isinstance(value, (list, tuple)) and value: for index, sublist in enumerate(value, start=1): if isinstance(sublist, dict) and sublist: deeper = self.__flatten(sublist, prefix + [key] + [str(index)]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) else: outDict[tuple(prefix + [key] + [str(index)])] = value else: outDict[tuple(prefix + [key])] = value return outDict def __flattenX(self, inpDict, prefix=None): prefix = prefix[:] if prefix else [] # separator = "." 
outDict = {} for key, value in inpDict.items(): if isinstance(value, dict) and value: deeper = self.__flatten(value, prefix + [key]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) elif isinstance(value, list) and value: for index, sublist in enumerate(value, start=1): if isinstance(sublist, dict) and sublist: deeper = self.__flatten(sublist, prefix + [key] + [str(index)]) outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()}) else: outDict[tuple(prefix + [key] + [str(index)])] = value else: outDict[tuple(prefix + [key])] = value return outDict def __flattenOrg(self, inpDict, separator=".", prefix=""): outDict = {} for key, value in inpDict.items(): if isinstance(value, dict) and value: deeper = self.__flattenOrg(value, separator, prefix + key + separator) outDict.update({key2: val2 for key2, val2 in deeper.items()}) elif isinstance(value, list) and value: for index, sublist in enumerate(value, start=1): if isinstance(sublist, dict) and sublist: deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator) outDict.update({key2: val2 for key2, val2 in deeper.items()}) else: outDict[prefix + key + separator + str(index)] = value else: outDict[prefix + key] = value return outDict def __dictGen(self, indict, pre=None): pre = pre[:] if pre else [] if isinstance(indict, dict): for key, value in indict.items(): if isinstance(value, dict): for dD in self.__dictGen(value, pre + [key]): yield dD elif isinstance(value, list) or isinstance(value, tuple): for v in value: for dD in self.__dictGen(v, pre + [key]): yield dD else: yield pre + [key, value] else: yield indict
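# --- Illustrative sketch (not part of the library code above) ---
# schemaCompare() flattens both schema dictionaries into {(path, ...): value}
# maps and then uses set arithmetic on the flattened keys to classify entries
# as added, removed, or changed. The stand-alone version below shows the same
# idea on two toy dictionaries; it is simplified to nested dicts only (the
# __flatten() method above also walks lists and tuples).
def flatten(inpDict, prefix=None):
    # Flatten a nested dict into a {key-path-tuple: value} map.
    prefix = prefix[:] if prefix else []
    outDict = {}
    for key, value in inpDict.items():
        if isinstance(value, dict) and value:
            outDict.update(flatten(value, prefix + [key]))
        else:
            outDict[tuple(prefix + [key])] = value
    return outDict


def compare(orgD, newD):
    # Classify flattened keys as added, removed, or changed between two dicts.
    fOrgD = flatten(orgD)
    fNewD = flatten(newD)
    added = [".".join(k) for k in set(fNewD) - set(fOrgD)]
    removed = [".".join(k) for k in set(fOrgD) - set(fNewD)]
    changed = {".".join(k): {"from": fOrgD[k], "to": fNewD[k]} for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
    return {"added": added, "removed": removed, "changed": changed}


if __name__ == "__main__":
    orgD = {"a": 1, "b": {"c": 2}}
    newD = {"a": 1, "b": {"c": 3, "d": 4}}
    print(compare(orgD, newD))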
def __reload(self, dirPath, reloadDb=False, fromDb=False, useCache=False, pharosDumpUrl=None, mysqlUser=None, mysqlPassword=None): startTime = time.time() pharosSelectedTables = ["drug_activity", "cmpd_activity", "target", "protein", "t2tc"] pharosDumpUrl = pharosDumpUrl if pharosDumpUrl else "http://juniper.health.unm.edu/tcrd/download/latest.sql.gz" pharosReadmeUrl = "http://juniper.health.unm.edu/tcrd/download/latest.README" ok = False fU = FileUtil() pharosDumpFileName = fU.getFileName(pharosDumpUrl) pharosDumpPath = os.path.join(dirPath, pharosDumpFileName) pharosUpdatePath = os.path.join(dirPath, "pharos-update.sql") pharosReadmePath = os.path.join(dirPath, "pharos-readme.txt") logPath = os.path.join(dirPath, "pharosLoad.log") # fU.mkdir(dirPath) # exU = ExecUtils() # if reloadDb: logger.info("useCache %r pharosDumpPath %r", useCache, pharosDumpPath) if useCache and self.__mU.exists(pharosDumpPath): ok = True else: logger.info("Fetching url %s path %s", pharosDumpUrl, pharosDumpPath) ok1 = fU.get(pharosDumpUrl, pharosDumpPath) ok2 = fU.get(pharosReadmeUrl, pharosReadmePath) logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) # --- readmeLines = self.__mU.doImport(pharosReadmePath, fmt="list") self.__version = readmeLines[0].split(" ")[1][1:] if readmeLines else "6" # --- logger.info("Filtering SQL dump %r for selected tables %r", pharosDumpFileName, pharosSelectedTables) doWrite = True # Note: the pharos dump file latest.sql.gz is not gzipped with open(pharosDumpPath, "r", encoding="utf-8") as ifh, open(pharosUpdatePath, "w", encoding="utf-8") as ofh: for line in ifh: if line.startswith("-- Table structure for table"): tN = line.split(" ")[-1][1:-2] doWrite = True if tN in pharosSelectedTables else False if doWrite: ofh.write(line) # --- ok = exU.run( "mysql", execArgList=["-v", "-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "create database if not exists tcrd6;"], outPath=logPath, outAppend=False, timeOut=None, ) # ok = exU.run( # "mysql", # execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "tcrd6"], # outPath=logPath, # inpPath=pharosDumpPath, # outAppend=True, # timeOut=None, # ) shellCmd = 'trap "" SIGHUP SIGINT SIGTERM; nohup mysql -u %s --password=%s tcrd6 < %s >& %s' % (mysqlUser, mysqlPassword, pharosUpdatePath, logPath) ok = exU.runShell( shellCmd, outPath=None, inpPath=None, outAppend=True, timeOut=None, ) logger.info("SQL dump restore status %r", ok) # -- if fromDb: for tbl in pharosSelectedTables: outPath = os.path.join(dirPath, "%s.tdd" % tbl) # if useCache and self.__mU.exists(outPath): # continue ok = exU.run( "mysql", execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "use tcrd6; select * from %s;" % tbl], outPath=outPath, outAppend=False, timeOut=None, suppressStderr=True, ) logger.info("SQL table %s export status %r", tbl, ok) return ok
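# --- Illustrative sketch (not part of the library code above) ---
# The Pharos __reload() method trims the large TCRD SQL dump down to a handful
# of tables before restoring it into MySQL. It scans for the
# "-- Table structure for table `name`" section markers written by mysqldump
# and copies lines only while the current section belongs to a selected table.
# The stand-alone sketch below shows that filtering step; the file names are
# hypothetical.
def filterSqlDump(inpPath, outPath, selectedTables):
    # Copy only the dump sections for the selected tables.
    doWrite = True
    with open(inpPath, "r", encoding="utf-8") as ifh, open(outPath, "w", encoding="utf-8") as ofh:
        for line in ifh:
            if line.startswith("-- Table structure for table"):
                # Marker format: -- Table structure for table `target`
                tN = line.split(" ")[-1][1:-2]
                doWrite = tN in selectedTables
            if doWrite:
                ofh.write(line)


if __name__ == "__main__":
    filterSqlDump("latest.sql", "pharos-update.sql", ["target", "protein", "t2tc"])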