def _testRandomReads(archivePath: str, samples: int, parallelization: int):
    """
    Cross-checks random-offset reads of a Zstandard archive between two readers.

    The reference reader is always indexed_zstd.IndexedZstdFile. The reader under test is SeekableZstd
    when parallelization == 1 and ParallelZstdReader otherwise. For each sample, a random offset and a
    random read length are drawn, both readers are positioned at the offset, and the returned data
    is asserted to be identical.

    Args:
        archivePath: Path to the Zstandard-compressed file to read.
        samples: Number of random (offset, length) pairs to test.
        parallelization: 1 selects SeekableZstd, anything else is forwarded to ParallelZstdReader.

    Raises:
        AssertionError: If the two readers return different data for any sample.
    """
    with indexed_zstd.IndexedZstdFile(archivePath) as serialFile, (
        SeekableZstd(archivePath) if parallelization == 1 else ParallelZstdReader(archivePath, parallelization)
    ) as openedParallelFile:
        # The reader may get replaced below, so keep the object owned by the 'with' statement separately.
        parallelFile = openedParallelFile

        if hasattr(parallelFile, 'blockBoundaries'):
            fileSize = parallelFile.blockBoundaries[-1]
        else:
            # Take the decoded size from the reference reader's block offsets instead of seeking the
            # reader under test. The previous 'seek(io.SEEK_END)' passed the whence constant as the
            # offset and therefore only moved to byte 2.
            # NOTE(review): assumes the last block offset value equals the decoded size -- confirm.
            fileSize = list(serialFile.block_offsets().values())[-1]

        try:
            for _ in range(samples):
                offset = random.randint(0, fileSize + 1)
                # Up to twice the remaining size so that about half of the reads go past the end.
                # This must not overwrite fileSize, or all following samples use a wrong bound.
                readSize = random.randint(0, (fileSize + 1 - offset) * 2)

                serialFile.seek(offset)
                serialData = serialFile.read(readSize)

                # Files opened with the zstandard module cannot seek back, not even in an emulated manner.
                if parallelization == 1 and offset < parallelFile.tell():
                    parallelFile.close()
                    parallelFile = SeekableZstd(archivePath)
                parallelFile.seek(offset)
                parallelData = parallelFile.read(readSize)

                assert len(serialData) == len(parallelData)
                assert serialData == parallelData
        finally:
            # The 'with' statement only exits the originally opened reader, so close any replacement here.
            if parallelFile is not openedParallelFile:
                parallelFile.close()
def _decodeBlock(filename, offset, size):
    """
    Reads size bytes starting at offset from the module-wide cached IndexedZstdFile.

    The reader is created from filename on first use and stored in the module-level variable
    _parallelZstdReaderFile, so later calls reuse it and ignore their filename argument.
    """
    # Not thread-safe! This is intended to run inside a process pool where every worker has its own
    # copy of the global. That is safe because there is one process pool per BlockParallelReader,
    # meaning the filename is a constant for each worker.
    global _parallelZstdReaderFile

    reader = _parallelZstdReaderFile
    if reader is None:
        reader = indexed_zstd.IndexedZstdFile(filename)
        _parallelZstdReaderFile = reader

    reader.seek(offset)
    return reader.read(size)
def testZstdSeeking(filename):
    """
    Prints how long it takes to seek from the file start to each block offset.

    Before every measurement the file is rewound to offset 0 and one byte is read, so each timing
    covers one seek from the start plus a one-byte read at the target offset.

    Args:
        filename: Path to the Zstandard-compressed file to open with indexed_zstd.
    """
    # Use a context manager so that the file is closed even if a seek or read raises.
    with indexed_zstd.IndexedZstdFile(filename) as file:
        # block_offsets() is a mapping. Its values are what the other readers in this file pass to
        # seek(), so iterate those instead of the keys.
        for offset in file.block_offsets().values():
            file.seek(0)
            file.read(1)

            t0 = time.time()
            file.seek(offset)
            file.read(1)
            t1 = time.time()
            print(f"Seeking to {offset} took {t1-t0:.3f}s")
def simpleParallelZstdReading(filename):
    """
    Reads all blocks of a Zstandard file with a thread pool and prints the elapsed time.

    The block offsets are queried once with indexed_zstd. Each block is then submitted to readBlock,
    which opens its own file object. At most 'parallelization' reads are kept in flight.

    Args:
        filename: Path to the Zstandard-compressed file.
    """
    # os.cpu_count() may return None, which cannot be compared with len(futures) below.
    parallelization = os.cpu_count() or 1
    with concurrent.futures.ThreadPoolExecutor(parallelization) as pool:
        futures = []
        with indexed_zstd.IndexedZstdFile(filename) as file:
            offsets = np.array(list(file.block_offsets().values()))
        # Consecutive offsets delimit one block each, so their differences are the block sizes.
        sizes = offsets[1:] - offsets[:-1]

        t0 = time.time()
        for offset, size in zip(offsets[:-1], sizes):
            futures.append(pool.submit(readBlock, filename, offset, size))
            # Throttle: wait for the oldest read whenever the in-flight limit is reached.
            while len(futures) >= parallelization:
                futures.pop(0).result()
        # Wait for the remaining reads so that the timing covers every block and worker exceptions
        # are not silently dropped.
        for future in futures:
            future.result()
        t1 = time.time()
        print(f"Reading in parallel with a thread pool took {t1-t0:.3f}s")
def _testSequentialReading(archivePath: str, bufferSize: int, parallelization: int):
    """
    Reads an archive front to back in bufferSize chunks with two readers and compares every chunk.

    The reference reader is indexed_zstd.IndexedZstdFile. The reader under test is SeekableZstd for
    parallelization == 1 and ParallelZstdReader otherwise. Reading stops after the first chunk that
    is shorter than bufferSize. If the reader under test exposes blockBoundaries, the total number
    of bytes read must equal its last entry.
    """
    with indexed_zstd.IndexedZstdFile(archivePath) as serialFile, (
        SeekableZstd(archivePath) if parallelization == 1 else ParallelZstdReader(archivePath, parallelization)
    ) as parallelFile:
        totalBytes = 0
        # Start with a "full" chunk length so that the loop body runs at least once.
        lastChunkLength = bufferSize
        while lastChunkLength >= bufferSize:
            expected = serialFile.read(bufferSize)
            actual = parallelFile.read(bufferSize)

            assert len(expected) == len(actual)
            assert expected == actual

            lastChunkLength = len(expected)
            totalBytes += lastChunkLength

        if hasattr(parallelFile, 'blockBoundaries'):
            assert totalBytes == parallelFile.blockBoundaries[-1]
def __init__(self, filename: str, parallelization: Optional[int] = None):
    """
    Opens filename with indexed_zstd and hands the file plus its block boundaries to the base class.

    Args:
        filename: Path to the Zstandard-compressed file.
        parallelization: Forwarded unchanged to the base class constructor.
    """
    zstdFile = indexed_zstd.IndexedZstdFile(filename)
    # Only the values of the block offset mapping are used as block boundaries.
    boundaries = [*zstdFile.block_offsets().values()]
    super().__init__(filename, zstdFile, boundaries, parallelization)
), 'zip': CompressionInfo( ['zip'], [], 'zipfile', lambda x: x.read(2) == b'PK', lambda x: zipfile.ZipFile(x), ), 'zst': CompressionInfo( ['zst', 'zstd'], ['tzst'], 'indexed_zstd', lambda x: x.read(4) == (0xFD2FB528).to_bytes(4, 'little'), lambda x: indexed_zstd.IndexedZstdFile(x.fileno()), ), } def stripSuffixFromCompressedFile(path: str) -> str: """Strips compression suffixes like .bz2, .gz, ...""" for compression in supportedCompressions.values(): for suffix in compression.suffixes: if path.lower().endswith('.' + suffix.lower()): return path[:-(len(suffix) + 1)] return path def stripSuffixFromTarFile(path: str) -> str:
def readBlock(filename, offset, size):
    """
    Opens filename with indexed_zstd, seeks to offset, and returns the result of reading size bytes.

    A new file object is opened for every call and closed again before returning.
    """
    with indexed_zstd.IndexedZstdFile(filename) as zstdFile:
        zstdFile.seek(offset)
        data = zstdFile.read(size)
    return data
if __name__ == '__main__':
    # Report the versions of the relevant modules. Use .get() so that a module that was never
    # imported is skipped instead of aborting the script with a KeyError.
    for module in ('zstandard', 'indexed_zstd', 'ratarmountcore'):
        loadedModule = sys.modules.get(module)
        if loadedModule is not None and hasattr(loadedModule, '__version__'):
            print(module, "version:", getattr(loadedModule, '__version__'))
    print()

    if len(sys.argv) < 2:
        # Give a readable message instead of an IndexError traceback.
        sys.exit(f"Usage: {sys.argv[0]} <path to file, optionally ending in .xz or .zst>")

    # Strip a known compression suffix so that both the .xz and the .zst sibling can be looked up.
    filename = sys.argv[1]
    if filename.endswith('.xz'):
        filename = filename[:-3]
    elif filename.endswith('.zst'):
        filename = filename[:-4]

    xzPath = filename + '.xz'
    if os.path.isfile(xzPath):
        compareReading(xz.open(xzPath, 'rb'), ParallelXZReader(xzPath, os.cpu_count()))
        benchmarkReading(xz.open(xzPath, 'rb'))
        benchmarkReading(lzma.open(xzPath, 'rb'))
        benchmarkReading(ParallelXZReader(xzPath, os.cpu_count()))

    print()

    zstPath = filename + '.zst'
    if os.path.isfile(zstPath):
        # Optional extra benchmarks, disabled by default:
        #simpleParallelZstdReading(zstPath)
        #testZstdSeeking(zstPath)

        compareReading(zstandard.open(zstPath, 'rb'), ParallelZstdReader(zstPath, os.cpu_count()))
        benchmarkReading(zstandard.open(zstPath, 'rb'))
        benchmarkReading(indexed_zstd.IndexedZstdFile(zstPath))
        benchmarkReading(ParallelZstdReader(zstPath, os.cpu_count()))