class FileUtilTests(unittest.TestCase):
    """Tests of FileUtil operations on local files, archives and remote locators."""

    def setUp(self):
        """Define fixture paths and URLs, create the FileUtil under test and note the start time."""
        mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(mockDataPath, "dictionaries", "mmcif_pdbx_v5_next.dic")
        self.__pathTaxonomyFile = os.path.join(mockDataPath, "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        self.__xzFile = os.path.join(mockDataPath, "MOCK_MODBASE_MODELS", "NP_001030614.1_1.pdb.xz")
        # The same remote resource addressed through two protocols
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        # Output and input directories located next to this test module
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), timeStamp)

    def tearDown(self):
        """Log the completion time and the elapsed time of the test."""
        endTime = time.time()
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        elapsed = endTime - self.__startTime
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), timeStamp, elapsed)

    def __fetchAndVerify(self, locator, localPath, **kwargs):
        """Fetch locator to localPath and assert the copy exists, is local and maps to its own path."""
        self.assertTrue(self.__fileU.get(locator, localPath, **kwargs))
        self.assertTrue(self.__fileU.exists(localPath))
        self.assertTrue(self.__fileU.isLocal(localPath))
        self.assertEqual(localPath, self.__fileU.getFilePath(localPath))

    def __bundleAndUnbundle(self, tarPath, pathList, mode):
        """Bundle pathList into tarPath with the given mode, then unbundle it into the work path."""
        self.assertTrue(self.__fileU.bundleTarfile(tarPath, pathList, mode=mode, recursive=True))
        self.assertTrue(self.__fileU.unbundleTarfile(tarPath, dirPath=self.__workPath))

    def testTarBundling(self):
        """Test case for tarfile bundling and unbundling"""
        try:
            topDirPath = os.path.join(self.__inpDirPath, "topdir")
            subDirPathList = [os.path.join(topDirPath, "subdirA"), os.path.join(topDirPath, "subdirB")]
            # Compressed bundle of the whole tree: also check its size and checksum
            tarPath = os.path.join(self.__workPath, "t0.tar.gz")
            self.assertTrue(self.__fileU.bundleTarfile(tarPath, [topDirPath], mode="w:gz", recursive=True))
            self.assertGreaterEqual(self.__fileU.size(tarPath), 250)
            self.assertIsNotNone(self.__fileU.hash(tarPath, hashType="md5"))
            self.assertTrue(self.__fileU.unbundleTarfile(tarPath, dirPath=self.__workPath))
            # Compressed and then uncompressed bundles of the two subdirectories
            self.__bundleAndUnbundle(os.path.join(self.__workPath, "t1.tar.gz"), subDirPathList, "w:gz")
            self.__bundleAndUnbundle(os.path.join(self.__workPath, "t2.tar"), subDirPathList, "w")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testGetFile(self):
        """Test case for a local files and directories"""
        try:
            srcPath = self.__pathPdbxDictionaryFile
            localPath = os.path.join(self.__workPath, self.__fileU.getFileName(srcPath))
            self.__fetchAndVerify(srcPath, localPath)
            self.assertTrue(self.__fileU.remove(localPath))
            # Directory creation and removal
            tmpDirPath = os.path.join(self.__workPath, "tdir")
            self.assertTrue(self.__fileU.mkdir(tmpDirPath))
            self.assertTrue(self.__fileU.remove(tmpDirPath))
            # Removing a path that is not expected to exist is still reported as success
            self.assertTrue(self.__fileU.remove(";lakdjf"))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMoveAndCopyFile(self):
        """Test case for copying ("put") and moving ("replace") local files"""
        try:
            srcPath = self.__pathPdbxDictionaryFile
            fileName = self.__fileU.getFileName(srcPath)
            firstPath = os.path.join(self.__workPath, fileName)
            self.assertTrue(self.__fileU.get(srcPath, firstPath))
            # Test copy file
            targetDirPath = os.path.join(self.__workPath, "tdir")
            self.assertTrue(self.__fileU.mkdir(targetDirPath))
            secondPath = os.path.join(targetDirPath, fileName)
            self.assertTrue(self.__fileU.put(firstPath, secondPath))
            self.assertTrue(self.__fileU.exists(firstPath))
            self.assertTrue(self.__fileU.exists(secondPath))
            # Remove copied file (to test moving file next)
            self.assertTrue(self.__fileU.remove(secondPath))
            self.assertFalse(self.__fileU.exists(secondPath))
            # Test move file
            self.assertTrue(self.__fileU.replace(firstPath, secondPath))
            self.assertFalse(self.__fileU.exists(firstPath))
            self.assertTrue(self.__fileU.exists(secondPath))
            # Now clean up files and dirs
            self.assertTrue(self.__fileU.remove(firstPath))
            self.assertTrue(self.__fileU.remove(targetDirPath))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testZipUrl(self):
        """Test case for downloading remote zip file and extracting contents."""
        try:
            zipUrl = self.__zipFileUrl
            self.assertFalse(self.__fileU.isLocal(zipUrl))
            localPath = os.path.join(self.__workPath, self.__fileU.getFileName(self.__zipFileUrl))
            self.__fetchAndVerify(zipUrl, localPath)
            extractedPath = self.__fileU.uncompress(localPath, outputDir=self.__workPath)
            self.assertTrue(extractedPath.endswith("Food_Display_Table.xlsx"))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFtpUrl(self):
        """Test case for downloading remote file ftp protocol and extracting contents."""
        try:
            ftpUrl = self.__ftpFileUrl
            self.assertFalse(self.__fileU.isLocal(ftpUrl))
            modelDirPath = os.path.join(self.__workPath, "chem_comp_models")
            localPath = os.path.join(modelDirPath, self.__fileU.getFileName(self.__ftpFileUrl))
            self.__fetchAndVerify(ftpUrl, localPath)
            extractedPath = self.__fileU.uncompress(localPath, outputDir=modelDirPath)
            self.assertTrue(extractedPath.endswith("chem_comp_model.cif"))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRemote(self):
        """Test case remote status"""
        try:
            httpsUrl = self.__httpsFileUrl
            self.assertFalse(self.__fileU.isLocal(httpsUrl))
            self.assertTrue(self.__fileU.exists(httpsUrl))
            self.assertGreaterEqual(self.__fileU.size(httpsUrl), 1000)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("DrugBank example -- skipping")
    def testGetDrugBankUrl(self):
        """Test case for downloading drugbank master xml file"""
        try:
            drugBankUrl = "https://www.drugbank.ca/releases/latest/downloads/all-full-database"
            # Placeholder credentials
            userName = "username"
            passWord = "password"
            self.assertFalse(self.__fileU.isLocal(drugBankUrl))
            localPath = os.path.join(self.__workPath, "db-download.zip")
            self.__fetchAndVerify(drugBankUrl, localPath, username=userName, password=passWord)
            self.__fileU.uncompress(localPath, outputDir=self.__workPath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testXzFile(self):
        """Test case for extracting contents from xz file"""
        try:
            xzPath = self.__xzFile
            localPath = os.path.join(self.__workPath, self.__fileU.getFileName(xzPath))
            self.__fetchAndVerify(xzPath, localPath)
            extractedPath = self.__fileU.uncompress(localPath, outputDir=self.__workPath)
            self.assertTrue(extractedPath.endswith(".pdb"))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class IoUtil(object):
    """Format-aware serialization and deserialization of Python objects to and from local files.

    Supported formats are dispatched by name in serialize() and deserialize(); the worker
    methods log failures and return a failure status or default value rather than raising.
    """

    def __init__(self, **kwargs):
        """Create the FileUtil helper used for directory creation, compression and removal.

        Args:
            **kwargs: passed unchanged to the FileUtil constructor
        """
        self.__fileU = FileUtil(**kwargs)

    def serialize(self, filePath, myObj, fmt="pickle", **kwargs):
        """Public method to serialize format appropriate objects

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'text-dump',
                                 'fasta', 'csv', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            bool: status of serialization operation; true for success or false otherwise
                  (including an unsupported format)
        """
        fmt = str(fmt).lower()
        ret = self.__fileU.mkdirForFile(filePath)
        if not ret:
            return ret
        if fmt in ["mmcif"]:
            ret = self.__serializeMmCif(filePath, myObj, **kwargs)
        elif fmt in ["json"]:
            ret = self.__serializeJson(filePath, myObj, **kwargs)
        elif fmt in ["pickle"]:
            ret = self.__serializePickle(filePath, myObj, **kwargs)
        elif fmt in ["list"]:
            # ASCII output remains the default, but a caller supplied enforceAscii is honored
            # (passing the keyword both explicitly and in kwargs would raise TypeError).
            kwargs.setdefault("enforceAscii", True)
            ret = self.__serializeList(filePath, myObj, **kwargs)
        elif fmt in ["mmcif-dict"]:
            ret = self.__serializeMmCifDict(filePath, myObj, **kwargs)
        elif fmt in ["text-dump"]:
            ret = self.__textDump(filePath, myObj, **kwargs)
        elif fmt in ["fasta"]:
            ret = self.__serializeFasta(filePath, myObj, **kwargs)
        elif fmt in ["csv"]:
            ret = self.__serializeCsv(filePath, myObj, **kwargs)
        else:
            # Nothing was written, so do not report the directory creation status as success
            logger.error("Unsupported serialization format %r for %r", fmt, filePath)
            ret = False
        return ret

    def deserialize(self, filePath, fmt="pickle", **kwargs):
        """Public method to deserialize objects in supported formats.

        Args:
            filePath (str): local file path
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'fasta', 'csv',
                                 'tdd', 'xml', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            object: deserialized object data (None for an unsupported format)
        """
        fmt = str(fmt).lower()
        if fmt in ["mmcif"]:
            ret = self.__deserializeMmCif(filePath, **kwargs)
        elif fmt in ["json"]:
            ret = self.__deserializeJson(filePath, **kwargs)
        elif fmt in ["pickle"]:
            ret = self.__deserializePickle(filePath, **kwargs)
        elif fmt in ["list"]:
            # See serialize(): default to ASCII but allow the caller to override
            kwargs.setdefault("enforceAscii", True)
            ret = self.__deserializeList(filePath, **kwargs)
        elif fmt in ["mmcif-dict"]:
            ret = self.__deserializeMmCifDict(filePath, **kwargs)
        elif fmt in ["fasta"]:
            ret = self.__deserializeFasta(filePath, **kwargs)
        elif fmt in ["csv", "tdd"]:
            # "tdd" is tab delimited data; an explicit csvDelimiter overrides either default
            delimiter = kwargs.get("csvDelimiter", "," if fmt == "csv" else "\t")
            ret = self.__deserializeCsv(filePath, delimiter=delimiter, **kwargs)
        elif fmt in ["xml"]:
            ret = self.__deserializeXml(filePath, **kwargs)
        else:
            ret = None
        return ret

    def __sliceInChunks(self, myList, numChunks):
        """Yield consecutive slices of myList, at most numChunks of them.

        The slice length is ceil(len(myList) / min(len(myList), numChunks)), so fewer than
        numChunks slices may be produced (e.g. 9 items in 4 chunks gives 3 slices of 3).
        An empty input yields a single empty list so that an empty container still produces
        one part; a missing or non-positive numChunks is treated as one chunk.
        """
        if not myList:
            yield []
            return
        numChunks = numChunks if numChunks and numChunks > 0 else 1
        mc = min(len(myList), numChunks)
        chunkSize, remainder = divmod(len(myList), mc)
        if remainder:
            chunkSize += 1
        for i in range(0, len(myList), chunkSize):
            yield myList[i:i + chunkSize]

    def serializeInParts(self, filePath, myObj, numParts, fmt="json", **kwargs):
        """Public method to serialize format appropriate (json, pickle) objects in multiple parts

        Part files are named <base>_part_<n><ext> (n starting at 1) in the directory of filePath.

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized (list or dict)
            numParts (int): divide the data into at most numParts segments
            fmt (str, optional): one of ['json' or 'pickle']. Defaults to json
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            bool: True for success or False otherwise
        """
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return False
        pth, fn = os.path.split(filePath)
        # NOTE(review): this passes the directory rather than the file path; serialize() below
        # calls mkdirForFile() on every part path in any case.
        self.__fileU.mkdirForFile(pth)
        bn, ext = os.path.splitext(fn)
        ret = True
        if isinstance(myObj, list):
            for ii, subList in enumerate(self.__sliceInChunks(myObj, numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp, subList, fmt=fmt, **kwargs)
                ret = ret and ok
        elif isinstance(myObj, dict):
            for ii, keyList in enumerate(self.__sliceInChunks(list(myObj.keys()), numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp, OrderedDict([(k, myObj[k]) for k in keyList]), fmt=fmt, **kwargs)
                ret = ret and ok
        else:
            logger.error("Unsupported data type for serialization in parts")
            ret = False
        return ret

    def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs):
        """Public method to deserialize objects in supported formats from multiple parts

        Args:
            filePath (str): local file path
            numParts (int): reconstruct the data object from numParts segments; when falsy
                            the number of parts is taken from the matching part files found
            fmt (str, optional): one of ['json' or 'pickle']. Defaults to json
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            object: deserialized object data (list or OrderedDict) or None if nothing was read
        """
        rObj = None
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return rObj
        pth, fn = os.path.split(filePath)
        bn, ext = os.path.splitext(fn)
        if not numParts:
            fp = os.path.join(pth, bn + "_part_*" + ext)
            numParts = len(glob.glob(fp))
        for ii in range(numParts):
            fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
            # serializeInParts() can write fewer than numParts files; reading a missing part
            # would return a default dictionary and break the reassembly of list data.
            if not self.__fileU.exists(fp):
                logger.warning("Skipping missing part file %r", fp)
                continue
            tObj = self.deserialize(fp, fmt=fmt, **kwargs)
            if isinstance(tObj, list):
                if not rObj:
                    rObj = []
                if isinstance(rObj, list):
                    rObj.extend(tObj)
                else:
                    logger.error("Part %r holds a list but earlier parts hold a dictionary", fp)
            elif isinstance(tObj, dict):
                if not rObj:
                    rObj = OrderedDict()
                if isinstance(rObj, dict):
                    rObj.update(tObj)
                else:
                    logger.error("Part %r holds a dictionary but earlier parts hold a list", fp)
            else:
                logger.error("Unsupported data type for deserialization in parts")
        return rObj

    def exists(self, filePath, mode=os.R_OK):
        """Return the FileUtil existence/access check for filePath with the given access mode."""
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        """Create dirPath with the given permission mode via FileUtil and return its status."""
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        """Remove the input path via FileUtil and return its status."""
        return self.__fileU.remove(pth)

    def __deserializeFasta(self, filePath, **kwargs):
        """Read a FASTA file with FastaUtil (kwarg commentStyle, default "uniprot"); {} on failure."""
        try:
            commentStyle = kwargs.get("commentStyle", "uniprot")
            fau = FastaUtil()
            return fau.readFasta(filePath, commentStyle=commentStyle)
        except Exception as e:
            logger.error("Unable to deserialize %r %r ", filePath, str(e))
        return {}

    def __serializeFasta(self, filePath, myObj, **kwargs):
        """Write a FASTA file with FastaUtil (kwargs maxLineLength=70, makeComment=False)."""
        try:
            maxLineLength = int(kwargs.get("maxLineLength", 70))
            makeComment = kwargs.get("makeComment", False)
            fau = FastaUtil()
            ok = fau.writeFasta(filePath, myObj, maxLineLength=maxLineLength, makeComment=makeComment)
            return ok
        except Exception as e:
            logger.error("Unable to serialize FASTA file %r %r", filePath, str(e))
        return False

    def __textDump(self, filePath, myObj, **kwargs):
        """Write a pprint rendering of myObj (kwargs indent=1, width=120) to filePath."""
        try:
            indent = kwargs.get("indent", 1)
            width = kwargs.get("width", 120)
            sOut = pprint.pformat(myObj, indent=indent, width=width)
            with open(filePath, "w") as ofh:
                ofh.write("\n%s\n" % sOut)
            return True
        except Exception as e:
            logger.error("Unable to dump to %r %r", filePath, str(e))
        return False

    def __serializePickle(self, filePath, myObj, **kwargs):
        """Pickle myObj to filePath using kwarg pickleProtocol (default pickle.DEFAULT_PROTOCOL)."""
        try:
            pickleProtocol = kwargs.get("pickleProtocol", pickle.DEFAULT_PROTOCOL)
            with open(filePath, "wb") as outfile:
                pickle.dump(myObj, outfile, pickleProtocol)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __deserializePickle(self, filePath, **kwargs):
        """Load a pickled object from filePath; return kwarg default ({}) on failure.

        NOTE: unpickling can execute arbitrary code - only use this on trusted files.
        """
        myDefault = kwargs.get("default", {})
        try:
            if sys.version_info[0] > 2:
                encoding = kwargs.get("encoding", "ASCII")
                errors = kwargs.get("errors", "strict")
                with open(filePath, "rb") as infile:
                    return pickle.load(infile, encoding=encoding, errors=errors)
            else:
                with open(filePath, "rb") as infile:
                    return pickle.load(infile)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __serializeJson(self, filePath, myObj, **kwargs):
        """Internal method to serialize the input object as JSON.  An encoding
        helper class is included to handle selected python data types (e.g., datetime)

        Recognized kwargs: indent (default 0) and enforceAscii (default True).  Non-ASCII
        output is written with UTF-8 encoding.
        """
        indent = kwargs.get("indent", 0)
        enforceAscii = kwargs.get("enforceAscii", True)
        try:
            if enforceAscii:
                with open(filePath, "w") as outfile:
                    json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii)
            else:
                with io.open(filePath, "w", encoding="utf-8") as outfile:
                    json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __deserializeJson(self, filePath, **kwargs):
        """Load JSON (optionally gzip compressed) preserving key order; kwarg default ({}) on failure.

        Recognized kwargs: default, encoding (default "utf-8-sig") and encodingErrors
        (default "ignore").  The encoding settings apply to compressed and uncompressed input.
        """
        myDefault = kwargs.get("default", {})
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as inpFile:
                        return json.load(inpFile, object_pairs_hook=OrderedDict)
                else:
                    # Py2 situation non-ascii encodings is problematic - read an uncompressed copy
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath, newline="", encoding=encoding, errors="ignore") as inpFile:
                        return json.load(inpFile, object_pairs_hook=OrderedDict)
            else:
                # Decode explicitly so that a leading BOM is consumed by the default utf-8-sig
                # codec instead of being rejected by the JSON parser.
                with io.open(filePath, "r", encoding=encoding, errors=encodingErrors) as inpFile:
                    return json.load(inpFile, object_pairs_hook=OrderedDict)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __hasMinSize(self, pth, minSize):
        """Return True if the file at pth has at least minSize bytes (False if its size cannot be read)."""
        try:
            return os.path.getsize(pth) >= minSize
        except Exception:
            return False

    def __deserializeMmCif(self, locator, **kwargs):
        """Read mmCIF data from a local or remote locator and return the list of containers.

        Recognized kwargs: workPath, enforceAscii (True), raiseExceptions (True),
        useCharRefs (True) and minSize (5).  A local file below minSize only triggers a warning.
        An empty list is returned on failure.
        """
        containerList = []
        try:
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            minSize = kwargs.get("minSize", 5)
            #
            if self.__fileU.isLocal(locator):
                if minSize >= 0 and not self.__hasMinSize(locator, minSize):
                    logger.warning("Minimum file size not satisfied for: %r", locator)
                myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
                containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath)
            else:
                containerList = self.__deserializeMmCifRemote(locator, useCharRefs, enforceAscii, workPath)
        except Exception as e:
            logger.error("Failing for %s with %s", locator, str(e))
        return containerList

    @retry((requests.exceptions.RequestException), maxAttempts=3, delaySeconds=1, multiplier=2, defaultValue=[], logger=logger)
    def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii, workPath):
        """Read mmCIF data from a remote locator with IoAdapterPy.

        Exceptions are always raised by the adapter (raiseExceptions=True) and propagate
        to the retry decorator applied to this method.
        """
        myIo = IoAdapterPy(raiseExceptions=True, useCharRefs=useCharRefs)
        return myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath)

    def __serializeMmCif(self, filePath, containerList, **kwargs):
        """Write the container list as mmCIF, gzip compressed when filePath ends with .gz and a workPath is given.

        Recognized kwargs: workPath, enforceAscii (True), raiseExceptions (True), useCharRefs (True).
        Returns the status of the write (and of the compression step when it applies).
        """
        ret = False
        try:
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
            if filePath.endswith(".gz") and workPath:
                # Write an uncompressed temporary file under a random name, then compress it
                rfn = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
                tPath = os.path.join(workPath, rfn)
                try:
                    ret = myIo.writeFile(tPath, containerList=containerList, enforceAscii=enforceAscii)
                    if ret:
                        # Only compress a successfully written file so a write failure is not masked
                        ret = self.__fileU.compress(tPath, filePath, compressType="gzip")
                finally:
                    # Do not leave the uncompressed temporary copy behind
                    self.__fileU.remove(tPath)
            else:
                ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __deserializeMmCifDict(self, filePath, **kwargs):
        """Read an mmCIF dictionary file with IoAdapterPy and return the list of containers ([] on failure)."""
        containerList = []
        try:
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
            containerList = myIo.readFile(filePath, enforceAscii=enforceAscii, outDirPath=workPath)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return containerList

    def __serializeMmCifDict(self, filePath, containerList, **kwargs):
        """Write the container list as an mmCIF dictionary file with IoAdapterPy; return the write status."""
        ret = False
        try:
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
            ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs):
        """Write each string of aList on its own line.

        With enforceAscii non-ASCII characters are written as XML character references and
        the file is ASCII encoded; otherwise the file is UTF-8 encoded.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            _ = kwargs
            encoding = "ascii" if enforceAscii else "utf-8"
            #
            if sys.version_info[0] > 2:
                # Use the explicit encoding rather than the platform default
                with io.open(filePath, "w", encoding=encoding) as ofh:
                    if enforceAscii:
                        for st in aList:
                            ofh.write("%s\n" % st.encode("ascii", "xmlcharrefreplace").decode("ascii"))
                    else:
                        for st in aList:
                            ofh.write("%s\n" % st)
            else:
                if enforceAscii:
                    with io.open(filePath, "w", encoding=encoding) as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st.encode("ascii", "xmlcharrefreplace").decode("ascii"))
                else:
                    with open(filePath, "wb") as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __processList(self, ifh, enforceAscii=True, **kwargs):
        """Return the non-empty lines of the open text handle without their line terminator.

        Lines starting with "#" are skipped unless kwarg uncomment is False.  With
        enforceAscii non-ASCII characters are replaced by XML character references.
        """
        uncomment = kwargs.get("uncomment", True)
        aList = []
        for line in ifh:
            # Only drop a terminator that is present - a final line without a trailing
            # newline must keep its last character.
            text = line[:-1] if line.endswith("\n") else line
            if enforceAscii:
                pth = text.encode("ascii", "xmlcharrefreplace").decode("ascii")
            else:
                pth = text
            if not pth or (uncomment and pth.startswith("#")):
                continue
            aList.append(pth)
        return aList

    def __deserializeList(self, filePath, enforceAscii=True, encodingErrors="ignore", **kwargs):
        """Read a (optionally gzip compressed) text file as a list of lines; [] on failure.

        Input is decoded as utf-8-sig with decoding errors handled per encodingErrors.
        """
        aList = []
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath, "rt", encoding="utf-8-sig", errors=encodingErrors) as ifh:
                        aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs)
                else:
                    # for py2 reading the gzip stream directly is problematic for non-ascii data
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath, encoding="utf-8-sig", errors=encodingErrors) as ifh:
                        aList = self.__processList(ifh, enforceAscii=enforceAscii)
            else:
                with io.open(filePath, encoding="utf-8-sig", errors=encodingErrors) as ifh:
                    aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(aList))
        return aList

    def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True):
        """Return all rows of the open CSV handle as dictionaries ("dict") or lists ("list").

        With uncomment the handle is wrapped in uncommentFilter() before parsing.  Any other
        rowFormat gives an empty list.
        """
        oL = []
        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        if rowFormat == "dict":
            if uncomment:
                reader = csv.DictReader(uncommentFilter(csvFile), delimiter=delimiter)
            else:
                reader = csv.DictReader(csvFile, delimiter=delimiter)
            for rowD in reader:
                oL.append(rowD)
        elif rowFormat == "list":
            if uncomment:
                reader = csv.reader(uncommentFilter(csvFile), delimiter=delimiter)
            else:
                reader = csv.reader(csvFile, delimiter=delimiter)
            for rowL in reader:
                oL.append(rowL)
        return oL

    def deserializeCsvIter(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs):
        """Return an iterator to input CSV format file.

        Args:
            filePath (str): input file path
            delimiter (str, optional): CSV delimiter. Defaults to ",".
            rowFormat (str, optional): format for each process row (list or dict). Defaults to "dict".
            encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore".
            uncomment (bool, optional): flag to ignore leading comments. Defaults to True.
            encoding (str, optional): character encoding (keyword argument). Defaults to "utf-8-sig".

        Returns:
            (iterator): iterator for rowwise access to processed CSV data

        Failures (including an unsupported rowFormat) are logged and end the iteration.
        """
        encoding = kwargs.get("encoding", "utf-8-sig")
        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        try:
            if filePath[-3:] == ".gz":
                csvFile = gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors)
            else:
                csvFile = io.open(filePath, newline="", encoding=encoding, errors=encodingErrors)
            with csvFile:
                # Only comment lines at the head of the file are skipped
                startIt = itertools.dropwhile(lambda x: x.startswith("#"), csvFile) if uncomment else csvFile
                if rowFormat == "dict":
                    reader = csv.DictReader(startIt, delimiter=delimiter)
                elif rowFormat == "list":
                    reader = csv.reader(startIt, delimiter=delimiter)
                else:
                    raise ValueError("Unsupported rowFormat %r" % rowFormat)
                for row in reader:
                    yield row
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))

    def __deserializeCsv(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs):
        """Read a (optionally gzip compressed) CSV file and return its rows as a list; [] on failure.

        Recognized kwarg: encoding (default "utf-8-sig").
        """
        oL = []
        encoding = kwargs.get("encoding", "utf-8-sig")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile:
                        oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment)
                else:
                    # Py2 situation non-ascii encodings is problematic - read an uncompressed copy
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath, newline="", encoding=encoding, errors=encodingErrors) as csvFile:
                        oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment)
            else:
                with io.open(filePath, newline="", encoding=encoding, errors=encodingErrors) as csvFile:
                    oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment)
            return oL
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(oL))
        return oL

    def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs):
        """Write a list of row dictionaries as CSV with a header row.

        Args:
            filePath (str): output file path
            rowDictList (list): list of dictionaries, one per row
            fieldNames (list, optional): column names; defaults to the keys of the first row

        Keys outside the field names are dropped; a row that cannot be written is logged
        and skipped.

        Returns:
            bool: True for success or False otherwise
        """
        _ = kwargs
        ret = False
        wD = {}
        try:
            fNames = fieldNames if fieldNames else list(rowDictList[0].keys())
            # The csv module expects newline="" on Python 3 so that its own line terminator
            # is not translated again by the file object.
            openKwargs = {"newline": ""} if sys.version_info[0] > 2 else {}
            with open(filePath, "w", **openKwargs) as csvFile:
                writer = csv.DictWriter(csvFile, fieldnames=fNames)
                writer.writeheader()
                for ii, rowDict in enumerate(rowDictList):
                    try:
                        wD = {k: v for k, v in rowDict.items() if k in fNames}
                        writer.writerow(wD)
                    except Exception as e:
                        logger.error("Skipping bad CSV record %d wD %r rowDict %r with %s", ii + 1, wD, rowDict, str(e))
                        continue
            ret = True
        except Exception as e:
            logger.error("Failing for %s : %r with %s", filePath, wD, str(e))
        return ret

    def __csvEncoder(self, csvData, encoding="utf-8-sig", encodingErrors="ignore"):
        """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars)

        Args:
            csvData (text lines): uncompressed data from gzip open
            encoding (str, optional): character encoding. Defaults to "utf-8-sig".
            encodingErrors (str, optional): error treatment. Defaults to "ignore".
        """
        for line in csvData:
            yield line.decode("utf-8-sig", errors=encodingErrors).encode(encoding, errors=encodingErrors)

    def __deserializeXmlPrev(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        This variant parses the file (gzip compressed or not) from a binary handle.

        Args:
            filePath (string): input XML file path

        Returns:
            object: instance of an ElementTree tree object (None on failure)
        """
        _ = kwargs
        tree = None
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz":
                with gzip.open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            else:
                with open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree

    def __testGzip(self, filePath):
        """Return True if one byte can be read from filePath as gzip data, False otherwise."""
        ok = True
        try:
            with gzip.open(filePath, "r") as fh:
                fh.read(1)
        except Exception:
            # Covers gzip.BadGzipFile without naming it (it is not defined before Python 3.8)
            ok = False
        logger.debug("Gzip file check %r", ok)
        return ok

    def __deserializeXml(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        A file named *.gz that does not contain gzip data is read as plain text.
        Recognized kwargs: encoding (default "utf-8-sig") and encodingErrors (default "ignore").

        Args:
            filePath (string): input XML file path

        Returns:
            object: instance of an ElementTree tree object (None on failure)
        """
        tree = None
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        #
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz" and self.__testGzip(filePath):
                if sys.version_info[0] > 2:
                    with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath, encoding=encoding, errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
            else:
                with io.open(filePath, encoding=encoding, errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree
class MarshalUtil(object):
    """Wrapper for serialization and deserialization methods."""

    def __init__(self, **kwargs):
        """Configure the working directory and the helper objects.

        Args:
            workPath (str, optional): directory for temporary working files. Defaults to ".".
            workDirSuffix (str, optional): suffix for temporary directory names. Defaults to "marshall_".
            workDirPrefix (str, optional): prefix for temporary directory names. Defaults to "_tempdir".
        """
        self.__workPath = kwargs.get("workPath", ".")
        self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
        # Read the prefix from its own key (it was previously taken from "workDirSuffix")
        self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__ioU = IoUtil()

    def doExport(self, locator, obj, fmt="list", marshalHelper=None, numParts=None, **kwargs):
        """Serialize the input object at locator path in specified format.  The
        input object is optionally preprocessed by the helper method.

        Args:
            locator (str): target path or URI
            obj (object): data to be serialized
            fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None.
            numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats)
            **kwargs: passed to the helper, the serializer and (for non-local targets) the put operation

        Returns:
            bool: True for success or False otherwise
        """
        ret = False
        try:
            localFlag = self.__fileU.isLocal(locator)
            if marshalHelper:
                myObj = marshalHelper(obj, **kwargs)
            else:
                myObj = obj
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serializeInParts(localFilePath, myObj, numParts, fmt=fmt, **kwargs)
            elif localFlag:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serialize(localFilePath, myObj, fmt=fmt, workPath=self.__workPath, **kwargs)
            else:
                with tempfile.TemporaryDirectory(suffix=self.__workDirSuffix, prefix=self.__workDirPrefix, dir=self.__workPath) as tmpDirName:
                    # write a local copy then copy to destination -
                    localFilePath = os.path.join(self.__workPath, tmpDirName, self.__fileU.getFileName(locator))
                    ok1 = self.__ioU.serialize(localFilePath, myObj, fmt=fmt, workPath=self.__workPath, **kwargs)
                    ok2 = True
                    if ok1:
                        ok2 = self.__fileU.put(localFilePath, locator, **kwargs)
                ret = ok1 and ok2
        except Exception as e:
            logger.exception("Exporting locator %r failing with %s", locator, str(e))
        return ret

    def doImport(self, locator, fmt="list", marshalHelper=None, numParts=None, **kwargs):
        """Deserialize data at the target locator in specified format. The deserialized
        data is optionally post-processed by the input helper method.

        Args:
            locator (str): path or URI to input data
            fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None.
            numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats)
            tarMember (str, optional): name of a member of tar file bundle. Defaults to None. (tar file format)

        Returns:
            Any: format specific return type (None if the import raises an exception)
        """
        try:
            tarMember = kwargs.get("tarMember", None)
            # A tar member is always fetched into a temporary directory first
            localFlag = self.__fileU.isLocal(locator) and not tarMember
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserializeInParts(filePath, numParts, fmt=fmt, **kwargs)
            elif localFlag:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserialize(filePath, fmt=fmt, workPath=self.__workPath, **kwargs)
            else:
                #
                if fmt == "mmcif":
                    # mmCIF locators are handed to the deserializer directly
                    ret = self.__ioU.deserialize(locator, fmt=fmt, workPath=self.__workPath, **kwargs)
                else:
                    with tempfile.TemporaryDirectory(suffix=self.__workDirSuffix, prefix=self.__workDirPrefix, dir=self.__workPath) as tmpDirName:
                        #
                        # Fetch first then read a local copy -
                        #
                        if tarMember:
                            localFilePath = os.path.join(self.__workPath, tmpDirName, tarMember)
                        else:
                            localFilePath = os.path.join(self.__workPath, tmpDirName, self.__fileU.getFileName(locator))
                        # ---  Local copy approach ---
                        self.__fileU.get(locator, localFilePath, **kwargs)
                        ret = self.__ioU.deserialize(localFilePath, fmt=fmt, workPath=self.__workPath, **kwargs)
            if marshalHelper:
                ret = marshalHelper(ret, **kwargs)
        except Exception as e:
            logger.exception("Importing locator %r failing with %s", locator, str(e))
            ret = None
        return ret

    def exists(self, filePath, mode=os.R_OK):
        """Return the FileUtil existence/access check for filePath with the given access mode."""
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        """Create dirPath with the given permission mode via FileUtil and return its status."""
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        """Remove the input path via FileUtil and return its status."""
        return self.__fileU.remove(pth)