Beispiel #1
0
    def _testRandomReads(archivePath: str, samples: int, parallelization: int):
        """
        Compare random-access reads of the reader under test against IndexedZstdFile.

        For each of `samples` iterations a random offset and a random read length are drawn,
        both readers are positioned at that offset, and the data they return is asserted to be
        identical.

        archivePath: Path of the compressed file that both readers open.
        samples: Number of random (offset, length) pairs to check.
        parallelization: 1 selects SeekableZstd as the reader under test; any other value selects
                         ParallelZstdReader with that parallelization.
        """
        with indexed_zstd.IndexedZstdFile(
                archivePath) as serialFile, SeekableZstd(
                    archivePath
                ) if parallelization == 1 else ParallelZstdReader(
                    archivePath, parallelization) as openedFile:
            # The with statement only closes the object it opened. The reader may be replaced by a
            # freshly opened one below, so the replacement has to be closed explicitly.
            parallelFile = openedFile
            try:
                if hasattr(parallelFile, 'blockBoundaries'):
                    fileSize = parallelFile.blockBoundaries[-1]
                else:
                    # seek takes (offset, whence). Passing io.SEEK_END as the only argument would
                    # seek to absolute offset 2 instead of to the end of the file.
                    # NOTE(review): assumes the reader accepts a whence argument — confirm for SeekableZstd.
                    parallelFile.seek(0, io.SEEK_END)
                    fileSize = parallelFile.tell()
                    if parallelization == 1:
                        parallelFile.close()
                        parallelFile = SeekableZstd(archivePath)
                    else:
                        parallelFile.seek(0)

                for _ in range(samples):
                    # Keep the file size and the per-sample read length in separate variables so
                    # that every sample is drawn from the full file range.
                    offset = random.randint(0, fileSize + 1)
                    readSize = random.randint(0, (fileSize + 1 - offset) *
                                              2)  # half the time read past the end

                    serialFile.seek(offset)
                    serialData = serialFile.read(readSize)

                    # Files opened with the zstandard module cannot seek back not even in an emulated manner.
                    if parallelization == 1 and offset < parallelFile.tell():
                        parallelFile.close()
                        parallelFile = SeekableZstd(archivePath)
                    parallelFile.seek(offset)
                    parallelData = parallelFile.read(readSize)

                    assert len(serialData) == len(parallelData)
                    assert serialData == parallelData
            finally:
                if parallelFile is not openedFile:
                    parallelFile.close()
Beispiel #2
0
    def _decodeBlock(filename, offset, size):
        """
        Return `size` bytes read at `offset` from the IndexedZstdFile cached for this worker.

        The reader is created from `filename` on the first call and stored in the module-level
        variable _parallelZstdReaderFile; later calls reuse it and ignore `filename`.
        """
        # Not thread-safe. The function is meant to run in a process pool, where every worker has
        # its own copy of the global. There is one pool per BlockParallelReader, so the filename is
        # constant for each worker and caching a single reader is sufficient.
        global _parallelZstdReaderFile
        reader = _parallelZstdReaderFile
        if reader is None:
            reader = indexed_zstd.IndexedZstdFile(filename)
            _parallelZstdReaderFile = reader

        reader.seek(offset)
        return reader.read(size)
def testZstdSeeking(filename):
    """
    Print how long it takes to seek to each block offset of a Zstandard file.

    Before every measurement the file is rewound to offset 0 and one byte is read, so each
    timed seek starts from the beginning of the file. One byte is read after the seek inside
    the timed section so that the measurement includes producing data at the new position.

    filename: Path of the file to open with indexed_zstd.IndexedZstdFile.
    """
    # Use the context manager so that the file is closed even if a seek or read raises.
    with indexed_zstd.IndexedZstdFile(filename) as file:
        # block_offsets() returns a mapping. Iterating it directly would yield its keys, while
        # its values are the offsets that are passed to seek().
        for offset in file.block_offsets().values():
            file.seek(0)
            file.read(1)
            t0 = time.time()
            file.seek(offset)
            file.read(1)
            t1 = time.time()
            print(f"Seeking to {offset} took {t1-t0:.3f}s")
def simpleParallelZstdReading(filename):
    """
    Read all blocks of a Zstandard file through a thread pool and print the elapsed time.

    The block offsets are taken from indexed_zstd.IndexedZstdFile.block_offsets(). One readBlock
    task is submitted per pair of consecutive offsets, and at most `parallelization` tasks are
    kept in flight at any time.

    filename: Path of the file to read.
    """
    # os.cpu_count() may return None, which cannot be compared against len(futures) below.
    parallelization = os.cpu_count() or 1
    with concurrent.futures.ThreadPoolExecutor(parallelization) as pool:
        futures = []
        with indexed_zstd.IndexedZstdFile(filename) as file:
            offsets = np.array(list(file.block_offsets().values()))
        sizes = offsets[1:] - offsets[:-1]
        t0 = time.time()
        for offset, size in zip(offsets[:-1], sizes):
            futures.append(pool.submit(readBlock, filename, offset, size))
            # Throttle submission by waiting for the oldest task once the pool is saturated.
            while len(futures) >= parallelization:
                futures.pop(0).result()
        # Wait for the tasks that are still in flight before stopping the clock. Otherwise the
        # measurement would miss the last blocks and their exceptions would never be raised.
        for future in futures:
            future.result()
        t1 = time.time()
        print(f"Reading in parallel with a thread pool took {t1-t0:.3f}s")
Beispiel #5
0
    def _testSequentialReading(archivePath: str, bufferSize: int,
                               parallelization: int):
        """
        Read the file front to back with both readers and assert that every chunk matches.

        archivePath: Path of the compressed file that both readers open.
        bufferSize: Number of bytes requested per read call.
        parallelization: 1 selects SeekableZstd as the reader under test; any other value selects
                         ParallelZstdReader with that parallelization.
        """
        with indexed_zstd.IndexedZstdFile(archivePath) as serialFile:
            with (SeekableZstd(archivePath) if parallelization == 1 else
                  ParallelZstdReader(archivePath, parallelization)) as parallelFile:
                totalLength = 0
                # Start with a "full" chunk length so that the loop body runs at least once.
                lastChunkLength = bufferSize
                # A chunk shorter than requested signals the end of the file.
                while lastChunkLength >= bufferSize:
                    expected = serialFile.read(bufferSize)
                    actual = parallelFile.read(bufferSize)
                    assert len(expected) == len(actual)
                    assert expected == actual
                    lastChunkLength = len(expected)
                    totalLength += lastChunkLength

                # When the reader exposes its block boundaries, the last one must equal the
                # number of bytes that were read in total.
                if hasattr(parallelFile, 'blockBoundaries'):
                    assert totalLength == parallelFile.blockBoundaries[-1]
Beispiel #6
0
 def __init__(self, filename: str, parallelization: Optional[int] = None):
     """
     Open `filename` with IndexedZstdFile and initialize the base class with it.

     The base class receives the file name, the opened file object, the list of values
     returned by the file's block_offsets() as block boundaries, and `parallelization`.
     """
     zstdFile = indexed_zstd.IndexedZstdFile(filename)
     boundaries = [*zstdFile.block_offsets().values()]
     super().__init__(
         filename,
         zstdFile,
         boundaries,
         parallelization,
     )
Beispiel #7
0
    ),
    'zip':
    CompressionInfo(
        ['zip'],
        [],
        'zipfile',
        lambda x: x.read(2) == b'PK',
        lambda x: zipfile.ZipFile(x),
    ),
    'zst':
    CompressionInfo(
        ['zst', 'zstd'],
        ['tzst'],
        'indexed_zstd',
        lambda x: x.read(4) == (0xFD2FB528).to_bytes(4, 'little'),
        lambda x: indexed_zstd.IndexedZstdFile(x.fileno()),
    ),
}


def stripSuffixFromCompressedFile(path: str) -> str:
    """
    Return `path` without its trailing compression suffix such as .bz2 or .gz.

    The suffixes of all entries in supportedCompressions are tried in order and compared
    case-insensitively. The first match is removed together with its leading dot. If no
    suffix matches, `path` is returned unchanged.
    """
    loweredPath = path.lower()
    candidates = (
        extension
        for info in supportedCompressions.values()
        for extension in info.suffixes
        if loweredPath.endswith('.' + extension.lower())
    )
    matched = next(candidates, None)
    if matched is None:
        return path
    # Also drop the dot that separates the suffix from the rest of the name.
    return path[:-(len(matched) + 1)]


def stripSuffixFromTarFile(path: str) -> str:
def readBlock(filename, offset, size):
    """
    Open `filename` with IndexedZstdFile, seek to `offset`, and return the result of read(size).

    A new file object is opened for every call and closed again before returning.
    """
    with indexed_zstd.IndexedZstdFile(filename) as zstdFile:
        zstdFile.seek(offset)
        data = zstdFile.read(size)
    return data

if __name__ == '__main__':
    # Report the version of each library involved, for those that expose one.
    for moduleName in ('zstandard', 'indexed_zstd', 'ratarmountcore'):
        loadedModule = sys.modules[moduleName]
        if hasattr(loadedModule, '__version__'):
            print(moduleName, "version:", loadedModule.__version__)
    print()

    # The path may be given with or without a compression suffix. Strip it so that both the
    # .xz and the .zst variant of the same base name can be probed below.
    filename = sys.argv[1]
    for knownSuffix in ('.xz', '.zst'):
        if filename.endswith(knownSuffix):
            filename = filename[:-len(knownSuffix)]
            break

    xzPath = filename + '.xz'
    if os.path.isfile(xzPath):
        compareReading(xz.open(xzPath, 'rb'), ParallelXZReader(xzPath, os.cpu_count()))
        benchmarkReading(xz.open(xzPath, 'rb'))
        benchmarkReading(lzma.open(xzPath, 'rb'))
        benchmarkReading(ParallelXZReader(xzPath, os.cpu_count()))

    print()

    zstPath = filename + '.zst'
    if os.path.isfile(zstPath):
        # Optional diagnostics, disabled by default:
        #simpleParallelZstdReading(filename + '.zst')
        #testZstdSeeking(filename + '.zst')

        compareReading(zstandard.open(zstPath, 'rb'), ParallelZstdReader(zstPath, os.cpu_count()))
        benchmarkReading(zstandard.open(zstPath, 'rb'))
        benchmarkReading(indexed_zstd.IndexedZstdFile(zstPath))
        benchmarkReading(ParallelZstdReader(zstPath, os.cpu_count()))