def test_append_single_chunk():
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking)
    orig.seek(0)
    new.seek(0)

    # append a single chunk
    reset_append_fp(orig, new, new_size)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(bloscpack_header['nchunks'], 2)

    # append content large enough to amount to two chunks
    new_content = new.read()
    new.seek(0)
    reset_append_fp(orig, StringIO(new_content * 2), new_size * 2)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(bloscpack_header['nchunks'], 4)

    # append half a chunk
    reset_append_fp(orig, StringIO(new_content[:len(new_content) // 2]),
                    new_size // 2)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(bloscpack_header['nchunks'], 5)

    # append a few bytes
    reset_append_fp(orig, StringIO(new_content[:1023]), 1024)
    # make sure it is squashed into the last chunk
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(bloscpack_header['nchunks'], 5)


def pack_unpack_fp(repeats, chunk_size=DEFAULT_CHUNK_SIZE,
                   progress=False, metadata=None):
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    if progress:
        print("Creating test array")
    create_array_fp(repeats, in_fp, progress=progress)
    in_fp_size = in_fp.tell()
    if progress:
        print("Compressing")
    in_fp.seek(0)
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(in_fp_size, chunk_size)
    source = PlainFPSource(in_fp)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    out_fp.seek(0)
    if progress:
        print("Decompressing")
    source = CompressedFPSource(out_fp)
    sink = PlainFPSink(dcmp_fp)
    unpack(source, sink)
    if progress:
        print("Verifying")
    cmp_fp(in_fp, dcmp_fp)
    return source.metadata


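# Illustrative sketch only: a hypothetical test that drives the round-trip
# helper above. The test name is an assumption and not part of the original
# suite; the chunk size mirrors the '2M' value used elsewhere in these tests.
def test_pack_unpack_fp_example():
    pack_unpack_fp(1, chunk_size='2M')

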
def test_append_into_last_chunk():
    # first create an array with a single chunk
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking)
    orig.seek(0)
    new.seek(0)

    # append a few bytes, creating a new, smaller, last_chunk
    new_content = new.read()
    new.seek(0)
    nchunks = reset_append_fp(orig, StringIO(new_content[:1023]), 1023)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(nchunks, 1)
    nt.assert_equal(bloscpack_header['last_chunk'], 1023)

    # now append into that last chunk
    nchunks = reset_append_fp(orig, StringIO(new_content[:1023]), 1023)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(nchunks, 0)
    nt.assert_equal(bloscpack_header['last_chunk'], 2046)

    # now check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) + 2046)
    nt.assert_equal(dcmp_str, new_str + new_str[:1023] * 2)


def test_unpack_exception():
    a = np.arange(50)
    sio = StringIO()
    a_str = a.tostring()
    source = PlainFPSource(StringIO(a_str))
    sink = CompressedFPSink(sio)
    pack(source, sink, *calculate_nchunks(len(a_str)))
    nt.assert_raises(NotANumpyArray, unpack_ndarray_str, sio.getvalue())


def roundtrip_numpy_file_pointers(ndarray):
    sio = StringIO()
    sink = CompressedFPSink(sio)
    pack_ndarray(ndarray, sink)
    sio.seek(0)
    source = CompressedFPSource(sio)
    b = unpack_ndarray(source)
    return npt.assert_array_equal, ndarray, b


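# Illustrative sketch only: the helper above returns an
# (assertion, expected, actual) triple, which is the shape that nose-style
# yield tests consume. The test name and array lengths below are assumptions,
# not part of the original suite.
def test_roundtrip_numpy_file_pointers_example():
    for length in (1, 1024, 4096):
        yield roundtrip_numpy_file_pointers(np.arange(length))

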
def pack_unpack_mem(repeats, chunk_size=DEFAULT_CHUNK_SIZE,
                    progress=False, metadata=None):
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    if progress:
        print("Creating test array")
    create_array_fp(repeats, in_fp, progress=progress)
    in_fp_size = in_fp.tell()
    if progress:
        print("Compressing")
    in_fp.seek(0)
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(in_fp_size, chunk_size)
    # let us play merry go round
    source = PlainFPSource(in_fp)
    sink = CompressedMemorySink()
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    source = CompressedMemorySource(sink)
    sink = PlainMemorySink()
    unpack(source, sink)
    nt.assert_equal(metadata, source.metadata)
    source = PlainMemorySource(sink.chunks)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=metadata)
    out_fp.seek(0)
    source = CompressedFPSource(out_fp)
    sink = PlainFPSink(dcmp_fp)
    unpack(source, sink)
    nt.assert_equal(metadata, source.metadata)
    in_fp.seek(0)
    dcmp_fp.seek(0)
    cmp_fp(in_fp, dcmp_fp)
    return source.metadata


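# Illustrative sketch only: a hypothetical driver for the memory round-trip
# helper above, checking that metadata passed in comes back out unchanged.
# The test name and metadata values are assumptions, not part of the
# original suite.
def test_pack_unpack_mem_with_metadata_example():
    metadata = {'dtype': 'float64', 'shape': [1024], 'others': []}
    received = pack_unpack_mem(1, metadata=metadata)
    nt.assert_equal(metadata, received)

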
def test_disable_offsets():
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, in_fp)
    in_fp_size = in_fp.tell()
    in_fp.seek(0)
    bloscpack_args = BloscpackArgs(offsets=False)
    source = PlainFPSource(in_fp)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, *calculate_nchunks(in_fp_size),
         bloscpack_args=bloscpack_args)
    out_fp.seek(0)
    bloscpack_header, metadata, metadata_header, offsets = \
        _read_beginning(out_fp)
    nt.assert_true(len(offsets) == 0)


def prep_array_for_append(blosc_args=BloscArgs(),
                          bloscpack_args=BloscpackArgs()):
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)
    chunking = calculate_nchunks(new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking,
         blosc_args=blosc_args,
         bloscpack_args=bloscpack_args)
    orig.seek(0)
    new.seek(0)
    return orig, new, new_size, dcmp


def test_offsets():
    with create_tmp_files() as (tdir, in_file, out_file, dcmp_file):
        create_array(1, in_file)
        pack_file(in_file, out_file, chunk_size='2M')
        with open(out_file, 'r+b') as input_fp:
            bloscpack_header = _read_bloscpack_header(input_fp)
            total_entries = bloscpack_header.total_prospective_chunks
            offsets = _read_offsets(input_fp, bloscpack_header)
            # the first chunk should start after header and offsets
            first = BLOSCPACK_HEADER_LENGTH + 8 * total_entries
            # we assume that the others are correct
            nt.assert_equal(offsets[0], first)
            nt.assert_equal([736, 368207, 633319, 902306,
                             1173771, 1419535, 1666981, 1913995],
                            offsets)
            # try to read the second header
            input_fp.seek(offsets[1], 0)
            blosc_header_raw = input_fp.read(BLOSC_HEADER_LENGTH)
            expected = {'versionlz': 1,
                        'blocksize': 262144,
                        'ctbytes': 265108,
                        'version': 2,
                        'flags': 1,
                        'nbytes': 2097152,
                        'typesize': 8}
            blosc_header = decode_blosc_header(blosc_header_raw)
            nt.assert_equal(expected, blosc_header)

    # now check the same thing again, but w/o any max_app_chunks
    input_fp, output_fp = StringIO(), StringIO()
    create_array_fp(1, input_fp)
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(input_fp.tell(), chunk_size='2M')
    input_fp.seek(0, 0)
    bloscpack_args = BloscpackArgs(max_app_chunks=0)
    source = PlainFPSource(input_fp)
    sink = CompressedFPSink(output_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         bloscpack_args=bloscpack_args)
    output_fp.seek(0, 0)
    bloscpack_header = _read_bloscpack_header(output_fp)
    nt.assert_equal(0, bloscpack_header.max_app_chunks)
    offsets = _read_offsets(output_fp, bloscpack_header)
    nt.assert_equal([96, 367567, 632679, 901666,
                     1173131, 1418895, 1666341, 1913355],
                    offsets)


def test_append_mix_shuffle():
    orig, new, new_size, dcmp = prep_array_for_append()
    # use the typesize from the file and deactivate shuffle;
    # crank up the clevel to ensure compression happens, otherwise the flags
    # will be screwed later on
    blosc_args = BloscArgs(typesize=None, shuffle=False, clevel=9)
    # need to create something that will be compressible even without shuffle,
    # the linspace used in 'new' doesn't work anymore as of python-blosc 1.6.1
    to_append = np.zeros(int(2e6))
    to_append_fp = StringIO()
    to_append_fp.write(to_append.tostring())
    to_append_fp_size = to_append_fp.tell()
    to_append_fp.seek(0)

    # now do the append
    reset_append_fp(orig, to_append_fp, to_append_fp_size,
                    blosc_args=blosc_args)

    # decompress 'orig' so that we can examine it
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    orig.seek(0)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    # now sanity check the length and content of the decompressed data
    nt.assert_equal(len(dcmp_str), len(new_str) + to_append_fp_size)
    nt.assert_equal(dcmp_str, new_str + to_append.tostring())

    # now get the first and the last chunk and check that their shuffle flags
    # differ
    bloscpack_header, offsets = reset_read_beginning(orig)[0:4:3]
    orig.seek(offsets[0])
    checksum_impl = CHECKSUMS_LOOKUP[bloscpack_header['checksum']]
    compressed_zero, blosc_header_zero, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_zero = blosc.decompress(compressed_zero)
    orig.seek(offsets[-1])
    compressed_last, blosc_header_last, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_last = blosc.decompress(compressed_last)
    # first chunk has shuffle active
    nt.assert_equal(blosc_header_zero['flags'], 1)
    # last chunk doesn't
    nt.assert_equal(blosc_header_last['flags'], 0)


def test_append_metadata():
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)

    metadata = {"dtype": "float64", "shape": [1024], "others": []}
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking, metadata=metadata)

    orig.seek(0)
    new.seek(0)
    reset_append_fp(orig, new, new_size)
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    ans = unpack(source, sink)
    print(ans)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)


def test_metadata_opportunisitic_compression():
    # make up some metadata that can be compressed with benefit
    test_metadata = ("{'dtype': 'float64', 'shape': [1024], 'others': [],"
                     "'original_container': 'carray'}")
    target_fp = StringIO()
    _write_metadata(target_fp, test_metadata, MetadataArgs())
    target_fp.seek(0, 0)
    metadata, header = _read_metadata(target_fp)
    nt.assert_equal('zlib', header['meta_codec'])

    # now do the same thing, but use badly compressible metadata
    test_metadata = "abc"
    target_fp = StringIO()
    # default args say: do compression...
    _write_metadata(target_fp, test_metadata, MetadataArgs())
    target_fp.seek(0, 0)
    metadata, header = _read_metadata(target_fp)
    # but it wasn't of any use
    nt.assert_equal('None', header['meta_codec'])


def test_rewrite_metadata():
    test_metadata = {'dtype': 'float64',
                     'shape': [1024],
                     'others': [],
                     }
    # assemble the metadata args from the default
    metadata_args = MetadataArgs()
    # avoid checksum and codec
    metadata_args.meta_checksum = 'None'
    metadata_args.meta_codec = 'None'
    # preallocate a fixed size
    metadata_args.max_meta_size = 1000  # fixed preallocation
    target_fp = StringIO()
    # write the metadata section
    _write_metadata(target_fp, test_metadata, metadata_args)
    # check that the length is correct
    nt.assert_equal(METADATA_HEADER_LENGTH + metadata_args.max_meta_size,
                    len(target_fp.getvalue()))

    # now add stuff to the metadata
    test_metadata['container'] = 'numpy'
    test_metadata['data_origin'] = 'LHC'
    # compute the new length
    new_metadata_length = len(SERIALIZERS[0].dumps(test_metadata))
    # jam the new metadata into the StringIO
    target_fp.seek(0, 0)
    _rewrite_metadata_fp(target_fp, test_metadata,
                         codec=None, level=None)
    # now seek back, read the metadata and make sure it has been updated
    # correctly
    target_fp.seek(0, 0)
    result_metadata, result_header = _read_metadata(target_fp)
    nt.assert_equal(test_metadata, result_metadata)
    nt.assert_equal(new_metadata_length, result_header.meta_comp_size)

    # make sure that NoChangeInMetadata is raised
    target_fp.seek(0, 0)
    nt.assert_raises(NoChangeInMetadata, _rewrite_metadata_fp,
                     target_fp, test_metadata, codec=None, level=None)

    # make sure that ChecksumLengthMismatch is raised, needs modified metadata
    target_fp.seek(0, 0)
    test_metadata['fluxcompensator'] = 'back to the future'
    nt.assert_raises(ChecksumLengthMismatch, _rewrite_metadata_fp,
                     target_fp, test_metadata,
                     codec=None, level=None, checksum='sha512')

    # make sure that this works when level is not None
    target_fp.seek(0, 0)
    test_metadata['hoverboard'] = 'back to the future 2'
    _rewrite_metadata_fp(target_fp, test_metadata, codec=None)

    # len of metadata when dumped to json should be around 1105
    for i in range(100):
        test_metadata[str(i)] = str(i)
    target_fp.seek(0, 0)
    nt.assert_raises(MetadataSectionTooSmall, _rewrite_metadata_fp,
                     target_fp, test_metadata, codec=None, level=None)

