def test(integration_case: _IntegrationCase, data_pattern: bytes, tmp_path: Path) -> None:
    """Re-create an integration archive through the write API and verify it is byte-identical.

    The metadata describes every stream (with its check type) and every block
    (with its filters and decompressed length); the test replays that layout
    while writing `data_pattern` and then compares SHA-256 digests of the
    reference archive and the generated one.
    """
    xz_path, metadata = integration_case
    # memoryview slicing keeps the per-block writes zero-copy.
    remaining = memoryview(data_pattern)

    if "padding" in xz_path.name:
        pytest.skip("Write mode does not support stream padding yet")

    generated_path = tmp_path / "archive.xz"
    with xz.open(generated_path, "w") as xzfile:
        for stream in metadata["streams"]:
            xzfile.check = stream["check"]
            xzfile.change_stream()
            for block in stream["blocks"]:
                block_length = block["length"]
                xzfile.filters = block.get("filters")
                xzfile.change_block()
                xzfile.write(remaining[:block_length])
                remaining = remaining[block_length:]

    # Every byte of the pattern must have been consumed by the declared blocks.
    assert not remaining

    expected_digest = sha256(xz_path.read_bytes()).hexdigest()
    generated_digest = sha256(generated_path.read_bytes()).hexdigest()
    assert generated_digest == expected_digest
def _tryOpenGlobalFile(filename):
    """Open *filename* once per worker process and cache the handle in a module global.

    This is not thread-safe! But it will be executed in a process pool, in
    which each worker has its own copy of the module globals, so using a
    global is safe: there is one process pool per BlockParallelReader,
    meaning the filename is a constant for each worker.
    """
    global _parallelXzReaderFile
    # Already opened by an earlier task in this worker — nothing to do.
    if _parallelXzReaderFile is not None:
        return
    _parallelXzReaderFile = xz.open(filename, 'rb')
def __init__(self, filename: str, parallelization: Optional[int] = None):
    """Open *filename* as a seekable xz file and initialize the parallel reader base.

    The block boundaries reported by python-xz, extended with the total
    decompressed size, delimit the half-open range of every block.
    """
    fileObject = xz.open(filename, 'rb')
    # Copy so that appending the sentinel end offset cannot mutate the
    # list owned by the xz file object.
    boundaries = fileObject.block_boundaries.copy()
    boundaries.append(len(fileObject))
    super().__init__(
        filename, fileObject, boundaries, parallelization, ParallelXZReader._initWorker2, (filename,)
    )
    self._openFiles()
def _testSequentialReading(archivePath: str, bufferSize: int, parallelization: int):
    """Compare sequential reads of the parallel reader against the serial reference.

    Reads the archive in `bufferSize` chunks through both the serial xz
    reader and either lzma (parallelization == 1) or ParallelXZReader, and
    asserts the streams are identical chunk by chunk.
    """
    with xz.open(archivePath, 'rb') as serialFile, lzma.open(
        archivePath) if parallelization == 1 else ParallelXZReader(
        archivePath, parallelization) as parallelFile:
        totalBytes = 0
        while True:
            expected = serialFile.read(bufferSize)
            actual = parallelFile.read(bufferSize)
            assert len(expected) == len(actual)
            assert expected == actual
            totalBytes += len(expected)
            # A short read signals end of file.
            if len(expected) < bufferSize:
                break
        # The last block boundary is the total decompressed size.
        if hasattr(parallelFile, 'blockBoundaries'):
            assert totalBytes == parallelFile.blockBoundaries[-1]
def benchmark_python_xz():
    """Decompress the xz file given as argv[1] and report throughput and RSS over time.

    Reads the whole archive in 32 MiB chunks, printing the process's peak RSS
    at several points (during the loop, after closing, after deleting the file
    object, after a final garbage collection) to observe memory behavior of
    python-xz.  Statement order is significant: each RSS print measures the
    effect of the statement(s) immediately before it.
    """
    print("== Benchmark xz file decompression ==")
    # guppy heap snapshot support; the actual snapshot calls are commented out
    # below because taking one keeps an extra reference to the file object.
    h = guppy.hpy()
    result = None
    size = 0  # total decompressed bytes read
    t0 = time.time()
    with xz.open(sys.argv[1], 'rb') as file:
        t1 = time.time()  # reused as "last progress print" timestamp inside the loop
        while True:
            readSize = len(file.read(32*1024*1024))
            if readSize == 0:
                break
            size += readSize
            # Print elapsed time and peak RSS at most every 5 seconds.
            if time.time() - t1 > 5:
                t1 = time.time()
                print(f"{t1 - t0:.2f}s {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
            gc.collect()
            # Note that this heap call would add a reference count to the file object...
            #if result is None:
            #    result = h.heap()
            #else:
            #    print((h.heap() - result))
        print(f"After finishing loop: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
        # Close explicitly while still inside the with block so the RSS effect
        # of closing (vs. merely finishing the read loop) can be observed.
        file.close()
        print(f"After closing file: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
        print("file closed?", file.closed)
        t1 = time.time()  # end timestamp for the final throughput report below
    # The name `file` stays bound after the with block; inspect what still
    # references the closed file object before deleting it.
    print("File type:", file)
    print("File type:", type(file))
    print("File referred by:", gc.get_referrers(file))
    print(f"After closing file: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
    # Drop the last reference, then collect, to see whether RSS shrinks.
    del file
    print(f"After deleting file: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
    gc.collect()
    print(f"After garbage collection: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
    if result is not None:
        print((h.heap() - result))
    print(f"Reading {size} B took: {t1-t0}s")
def _testRandomReads(archivePath: str, samples: int, parallelization: int):
    """Compare random-access reads of the parallel reader against the serial reference.

    Performs `samples` random (offset, length) reads through both the serial
    xz reader and either lzma (parallelization == 1) or ParallelXZReader and
    asserts the results are identical.  Offsets may be at or past EOF and
    lengths may extend past EOF, exercising short-read behavior.
    """
    with xz.open(archivePath, 'rb') as serialFile, lzma.open(
        archivePath) if parallelization == 1 else ParallelXZReader(
        archivePath, parallelization) as parallelFile:
        if hasattr(parallelFile, 'blockBoundaries'):
            size = parallelFile.blockBoundaries[-1]
        else:
            # Bug fix: the original called seek(io.SEEK_END), which seeks to
            # absolute offset 2 (the integer value of SEEK_END) instead of the
            # end of the file; the whence argument must be passed explicitly.
            parallelFile.seek(0, io.SEEK_END)
            size = parallelFile.tell()
            parallelFile.seek(0)

        for _ in range(samples):
            offset = random.randint(0, size + 1)
            # Bug fix: the original assigned the read length back into `size`,
            # clobbering the file size for every subsequent iteration.
            readSize = random.randint(0, (size + 1 - offset) * 2)  # half the time read past the end

            serialFile.seek(offset)
            serialData = serialFile.read(readSize)

            parallelFile.seek(offset)
            parallelData = parallelFile.read(readSize)

            assert len(serialData) == len(parallelData)
            assert serialData == parallelData
# Fragment of a format-name -> CompressionInfo mapping.  Each entry supplies:
# primary file suffixes, tar-combined suffixes, the name of the backing module,
# a magic-bytes sniffer taking a readable file object, and an opener factory.
'rar': CompressionInfo(
    ['rar'],
    [],
    'rarfile',
    # RAR 4.x signature.  NOTE(review): RAR5 archives start with
    # b'Rar!\x1A\x07\x01\x00' — confirm they are handled elsewhere if needed.
    lambda x: x.read(6) == b'Rar!\x1A\x07',
    lambda x: rarfile.RarFile(x),
),
'xz': CompressionInfo(
    ['xz'],
    ['txz'],
    # Prefer lzmaffi when it is importable; otherwise fall back to python-xz.
    'lzmaffi' if 'lzmaffi' in sys.modules else 'xz',
    # Six-byte xz stream header magic.
    lambda x: x.read(6) == b"\xFD7zXZ\x00",
    (lambda x: lzmaffi.open(x)) if 'lzmaffi' in sys.modules else (lambda x: xz.open(x)),
),
'zip': CompressionInfo(
    ['zip'],
    [],
    'zipfile',
    # 'PK' local-file-header prefix.
    lambda x: x.read(2) == b'PK',
    lambda x: zipfile.ZipFile(x),
),
'zst': CompressionInfo(
    ['zst', 'zstd'],
    ['tzst'],
    'indexed_zstd',
    # Zstandard frame magic number, stored little-endian.
    lambda x: x.read(4) == (0xFD2FB528).to_bytes(4, 'little'),
if __name__ == '__main__':
    # Report versions of the relevant already-imported modules.
    for module in ('zstandard', 'indexed_zstd', 'ratarmountcore'):
        moduleObject = sys.modules[module]
        if hasattr(moduleObject, '__version__'):
            print(module, "version:", getattr(moduleObject, '__version__'))
    print()

    # Strip a known compression suffix so both the .xz and .zst siblings of
    # the given archive can be benchmarked.
    filename = sys.argv[1]
    for suffix in ('.xz', '.zst'):
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
            break

    if os.path.isfile(filename + '.xz'):
        compareReading(xz.open(filename + '.xz', 'rb'), ParallelXZReader(filename + '.xz', os.cpu_count()))
        benchmarkReading(xz.open(filename + '.xz', 'rb'))
        benchmarkReading(lzma.open(filename + '.xz', 'rb'))
        benchmarkReading(ParallelXZReader(filename + '.xz', os.cpu_count()))
        print()

    if os.path.isfile(filename + '.zst'):
        #simpleParallelZstdReading(filename + '.zst')
        #testZstdSeeking(filename + '.zst')
        compareReading(
            zstandard.open(filename + '.zst', 'rb'), ParallelZstdReader(filename + '.zst', os.cpu_count())
        )
        benchmarkReading(zstandard.open(filename + '.zst', 'rb'))
        benchmarkReading(indexed_zstd.IndexedZstdFile(filename + '.zst'))
        benchmarkReading(ParallelZstdReader(filename + '.zst', os.cpu_count()))