def pack_unpack_fp(repeats, chunk_size=DEFAULT_CHUNK_SIZE,
                   progress=False, metadata=None):
    """ Round-trip a test array through file-pointer based pack/unpack. """
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    if progress:
        print("Creating test array")
    create_array_fp(repeats, in_fp, progress=progress)
    in_fp_size = in_fp.tell()
    if progress:
        print("Compressing")
    in_fp.seek(0)
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(in_fp_size, chunk_size)
    source = PlainFPSource(in_fp)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    out_fp.seek(0)
    if progress:
        print("Decompressing")
    source = CompressedFPSource(out_fp)
    sink = PlainFPSink(dcmp_fp)
    unpack(source, sink)
    if progress:
        print("Verifying")
    cmp_fp(in_fp, dcmp_fp)
    return source.metadata
def test_append_into_last_chunk():
    # first create an array with a single chunk
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking)
    orig.seek(0)
    new.seek(0)
    # append a few bytes, creating a new, smaller, last_chunk
    new_content = new.read()
    new.seek(0)
    nchunks = reset_append_fp(orig, StringIO(new_content[:1023]), 1023)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(nchunks, 1)
    nt.assert_equal(bloscpack_header['last_chunk'], 1023)
    # now append into that last chunk
    nchunks = reset_append_fp(orig, StringIO(new_content[:1023]), 1023)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(nchunks, 0)
    nt.assert_equal(bloscpack_header['last_chunk'], 2046)
    # now check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) + 2046)
    nt.assert_equal(dcmp_str, new_str + new_str[:1023] * 2)
def pack_unpack_mem(repeats, chunk_size=DEFAULT_CHUNK_SIZE,
                    progress=False, metadata=None):
    """ Round-trip a test array through the in-memory sources and sinks. """
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    if progress:
        print("Creating test array")
    create_array_fp(repeats, in_fp, progress=progress)
    in_fp_size = in_fp.tell()
    if progress:
        print("Compressing")
    in_fp.seek(0)
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(in_fp_size, chunk_size)
    # let's play merry-go-round
    source = PlainFPSource(in_fp)
    sink = CompressedMemorySink()
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    source = CompressedMemorySource(sink)
    sink = PlainMemorySink()
    unpack(source, sink)
    nt.assert_equal(metadata, source.metadata)
    source = PlainMemorySource(sink.chunks)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    out_fp.seek(0)
    source = CompressedFPSource(out_fp)
    sink = PlainFPSink(dcmp_fp)
    unpack(source, sink)
    nt.assert_equal(metadata, source.metadata)
    in_fp.seek(0)
    dcmp_fp.seek(0)
    cmp_fp(in_fp, dcmp_fp)
    return source.metadata
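# A minimal sketch of the in-memory round trip exercised by pack_unpack_mem
# above, using only the sources, sinks and helpers already imported in this
# module. The name roundtrip_mem_sketch is illustrative, not part of the
# library API.
def roundtrip_mem_sketch(in_fp, in_fp_size, chunk_size=DEFAULT_CHUNK_SIZE,
                         metadata=None):
    # compress from a plain file pointer into memory ...
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(in_fp_size, chunk_size)
    source = PlainFPSource(in_fp)
    sink = CompressedMemorySink()
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    # ... then decompress straight back into memory
    source = CompressedMemorySource(sink)
    sink = PlainMemorySink()
    unpack(source, sink)
    # return the plain chunks and the (possibly None) metadata
    return sink.chunks, source.metadata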
def test_double_append():
    orig, new, new_size, dcmp = prep_array_for_append()
    reset_append_fp(orig, new, new_size)
    reset_append_fp(orig, new, new_size)
    new_str = new.read()
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 3)
    nt.assert_equal(dcmp_str, new_str * 3)
def test_append_mix_shuffle():
    orig, new, new_size, dcmp = prep_array_for_append()
    # use the typesize from the file
    # deactivate shuffle
    # crank up the clevel to ensure compression happens, otherwise the flags
    # will be screwed later on
    blosc_args = BloscArgs(typesize=None, shuffle=False, clevel=9)
    # need to create something that will be compressible even without shuffle,
    # the linspace used in 'new' doesn't work anymore as of python-blosc 1.6.1
    to_append = np.zeros(int(2e6))
    to_append_fp = StringIO()
    to_append_fp.write(to_append.tostring())
    to_append_fp_size = to_append_fp.tell()
    to_append_fp.seek(0)
    # now do the append
    reset_append_fp(orig, to_append_fp, to_append_fp_size,
                    blosc_args=blosc_args)
    # decompress 'orig' so that we can examine it
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    orig.seek(0)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    # now sanity check the length and content of the decompressed
    nt.assert_equal(len(dcmp_str), len(new_str) + to_append_fp_size)
    nt.assert_equal(dcmp_str, new_str + to_append.tostring())
    # now get the first and the last chunk and check that the shuffle doesn't
    # match
    bloscpack_header, offsets = reset_read_beginning(orig)[0:4:3]
    orig.seek(offsets[0])
    checksum_impl = CHECKSUMS_LOOKUP[bloscpack_header['checksum']]
    compressed_zero, blosc_header_zero, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_zero = blosc.decompress(compressed_zero)
    orig.seek(offsets[-1])
    compressed_last, blosc_header_last, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_last = blosc.decompress(compressed_last)
    # first chunk has shuffle active
    nt.assert_equal(blosc_header_zero['flags'], 1)
    # last chunk doesn't
    nt.assert_equal(blosc_header_last['flags'], 0)
def test_mixing_clevel():
    # the first set of chunks has max compression
    blosc_args = BloscArgs(clevel=9)
    orig, new, new_size, dcmp = prep_array_for_append()
    # get the original size
    orig.seek(0, 2)
    orig_size = orig.tell()
    orig.seek(0)
    # get a backup of the settings
    bloscpack_header, metadata, metadata_header, offsets = \
        reset_read_beginning(orig)
    # compressed size of the last chunk, including checksum
    last_chunk_compressed_size = orig_size - offsets[-1]

    # do append
    # use the typesize from the file and
    # make the second set of chunks have no compression
    blosc_args = BloscArgs(typesize=None, clevel=0)
    nchunks = append_fp(orig, new, new_size, blosc_args=blosc_args)

    # get the final size
    orig.seek(0, 2)
    final_size = orig.tell()
    orig.seek(0)

    # the original file minus the compressed size of the last chunk
    discounted_orig_size = orig_size - last_chunk_compressed_size
    # size of the appended data
    #  * raw new size, since we have no compression
    #  * uncompressed size of the last chunk
    #  * nchunks + 1 times the blosc and checksum overhead
    appended_size = new_size + bloscpack_header['last_chunk'] + \
        (nchunks + 1) * (16 + 4)
    # final size should be original plus appended data
    nt.assert_equal(final_size, appended_size + discounted_orig_size)

    # check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str * 2))
    nt.assert_equal(dcmp_str, new_str * 2)
def test_append_metadata():
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)

    metadata = {"dtype": "float64", "shape": [1024], "others": []}
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking, metadata=metadata)
    orig.seek(0)
    new.seek(0)

    reset_append_fp(orig, new, new_size)
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    ans = unpack(source, sink)
    print(ans)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)
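# A minimal sketch of the metadata round trip checked above: pack with a
# metadata dict and read it back from the compressed source after unpacking.
# It reuses only the calls that appear in the surrounding tests; the name
# pack_with_metadata_sketch is illustrative, not a library helper.
def pack_with_metadata_sketch(new, new_size, metadata):
    orig, dcmp = StringIO(), StringIO()
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking, metadata=metadata)
    orig.seek(0)
    # unpacking exposes the stored metadata on the compressed source
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    return source.metadata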
def test_append_fp():
    orig, new, new_size, dcmp = prep_array_for_append()

    # check that the header and offsets are as we expected them to be
    orig_bloscpack_header, orig_offsets = reset_read_beginning(orig)[0:4:3]
    expected_orig_bloscpack_header = BloscpackHeader(
        format_version=3,
        offsets=True,
        metadata=False,
        checksum='adler32',
        typesize=8,
        chunk_size=1048576,
        last_chunk=271360,
        nchunks=16,
        max_app_chunks=160,
    )
    expected_orig_offsets = [1440, 195299, 368931, 497746, 634063, 767529,
                             903070, 1038157, 1174555, 1297424, 1420339,
                             1544469, 1667805, 1791142, 1914839, 2038360]
    nt.assert_equal(expected_orig_bloscpack_header, orig_bloscpack_header)
    nt.assert_equal(expected_orig_offsets, orig_offsets)

    # perform the append
    reset_append_fp(orig, new, new_size)

    # check that the header and offsets are as we expected them to be after
    # appending
    app_bloscpack_header, app_offsets = reset_read_beginning(orig)[0:4:3]
    expected_app_bloscpack_header = {
        'chunk_size': 1048576,
        'nchunks': 31,
        'last_chunk': 542720,
        'max_app_chunks': 145,
        'format_version': 3,
        'offsets': True,
        'checksum': 'adler32',
        'typesize': 8,
        'metadata': False
    }
    expected_app_offsets = [1440, 195299, 368931, 497746, 634063, 767529,
                            903070, 1038157, 1174555, 1297424, 1420339,
                            1544469, 1667805, 1791142, 1914839, 2038360,
                            2221798, 2390194, 2533644, 2663010, 2803431,
                            2936406, 3071130, 3209565, 3333390, 3457344,
                            3581581, 3705533, 3829188, 3952136, 4075509]
    nt.assert_equal(expected_app_bloscpack_header, app_bloscpack_header)
    nt.assert_equal(expected_app_offsets, app_offsets)

    # now check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str * 2))
    nt.assert_equal(dcmp_str, new_str * 2)