Example #1
def pack_unpack_fp(repeats,
                   chunk_size=DEFAULT_CHUNK_SIZE,
                   progress=False,
                   metadata=None):
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    if progress:
        print("Creating test array")
    create_array_fp(repeats, in_fp, progress=progress)
    in_fp_size = in_fp.tell()
    if progress:
        print("Compressing")
    in_fp.seek(0)
    nchunks, chunk_size, last_chunk_size = \
            calculate_nchunks(in_fp_size, chunk_size)
    source = PlainFPSource(in_fp)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size, metadata=metadata)
    out_fp.seek(0)
    if progress:
        print("Decompressing")
    source = CompressedFPSource(out_fp)
    sink = PlainFPSink(dcmp_fp)
    unpack(source, sink)
    if progress:
        print("Verifying")
    # rewind both fps so the byte-wise comparison starts at the beginning
    in_fp.seek(0)
    dcmp_fp.seek(0)
    cmp_fp(in_fp, dcmp_fp)
    return source.metadata
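These snippets are drawn from bloscpack's test suite and omit their imports. A preamble along the following lines is assumed throughout; the bloscpack-internal module paths are assumptions and vary between versions. The helpers create_array_fp, cmp_fp, prep_array_for_append, reset_append_fp and reset_read_beginning are test-suite utilities, and judging by how the tests use them, the reset_ variants rewind the file pointers after operating on them.

import blosc
import numpy as np
import nose.tools as nt
from cStringIO import StringIO  # Python 2 era: StringIO buffers hold raw bytes

# the module paths below are assumptions and may differ between versions
from bloscpack.args import BloscArgs, DEFAULT_CHUNK_SIZE
from bloscpack.checksums import CHECKSUMS_LOOKUP
from bloscpack.headers import BloscpackHeader
from bloscpack.abstract_io import pack, unpack, calculate_nchunks
from bloscpack.append import append_fp
from bloscpack.file_io import (PlainFPSource, PlainFPSink,
                               CompressedFPSource, CompressedFPSink,
                               _read_compressed_chunk_fp)
from bloscpack.memory_io import (PlainMemorySource, PlainMemorySink,
                                 CompressedMemorySource, CompressedMemorySink)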
Example #2
def test_append_into_last_chunk():
    # first create an array with a single chunk
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking)
    orig.seek(0)
    new.seek(0)
    # append 1023 bytes, creating a new, smaller last chunk
    new_content = new.read()
    new.seek(0)
    nchunks = reset_append_fp(orig, StringIO(new_content[:1023]), 1023)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(nchunks, 1)
    nt.assert_equal(bloscpack_header['last_chunk'], 1023)
    # now append into that last chunk
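    # (1023 + 1023 = 2046 bytes still fit into one chunk, since chunk_size
    # was set to the full array size, so no additional chunk is created)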
    nchunks = reset_append_fp(orig, StringIO(new_content[:1023]), 1023)
    bloscpack_header = reset_read_beginning(orig)[0]
    nt.assert_equal(nchunks, 0)
    nt.assert_equal(bloscpack_header['last_chunk'], 2046)

    # now check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) + 2046)
    nt.assert_equal(dcmp_str, new_str + new_str[:1023] * 2)
Example #3
def pack_unpack_mem(repeats,
                    chunk_size=DEFAULT_CHUNK_SIZE,
                    progress=False,
                    metadata=None):
    in_fp, out_fp, dcmp_fp = StringIO(), StringIO(), StringIO()
    if progress:
        print("Creating test array")
    create_array_fp(repeats, in_fp, progress=progress)
    in_fp_size = in_fp.tell()
    if progress:
        print("Compressing")
    in_fp.seek(0)
    nchunks, chunk_size, last_chunk_size = \
            calculate_nchunks(in_fp_size, chunk_size)
    # let us play merry-go-round: file pointer -> memory -> file pointer
    source = PlainFPSource(in_fp)
    sink = CompressedMemorySink()
    pack(source, sink, nchunks, chunk_size, last_chunk_size, metadata=metadata)
    source = CompressedMemorySource(sink)
    sink = PlainMemorySink()
    unpack(source, sink)
    nt.assert_equal(metadata, source.metadata)
    source = PlainMemorySource(sink.chunks)
    sink = CompressedFPSink(out_fp)
    pack(source, sink, nchunks, chunk_size, last_chunk_size, metadata=metadata)
    out_fp.seek(0)
    source = CompressedFPSource(out_fp)
    sink = PlainFPSink(dcmp_fp)
    unpack(source, sink)
    nt.assert_equal(metadata, source.metadata)
    in_fp.seek(0)
    dcmp_fp.seek(0)
    cmp_fp(in_fp, dcmp_fp)
    return source.metadata
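A usage sketch (hypothetical invocation): pack_unpack_mem returns the metadata that survived the round trip, so a caller can assert on it directly.

meta = {'dtype': 'float64', 'shape': [1024], 'others': []}
received = pack_unpack_mem(1, metadata=meta)
nt.assert_equal(received, meta)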
Example #4
def test_append_mix_shuffle():
    orig, new, new_size, dcmp = prep_array_for_append()
    # use the typesize from the file
    # deactivate shuffle
    # crank up the clevel to ensure compression happens, otherwise the flags
    # will be screwed later on
    blosc_args = BloscArgs(typesize=None, shuffle=False, clevel=9)
    reset_append_fp(orig, new, new_size, blosc_args=blosc_args)
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    orig.seek(0)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)

    # now get the first and the last chunk and check that their shuffle flags
    # differ ([0:4:3] picks the bloscpack header and the offsets out of the
    # 4-tuple returned by reset_read_beginning)
    bloscpack_header, offsets = reset_read_beginning(orig)[0:4:3]
    orig.seek(offsets[0])
    checksum_impl = CHECKSUMS_LOOKUP[bloscpack_header['checksum']]
    compressed_zero, blosc_header_zero, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_zero = blosc.decompress(compressed_zero)
    orig.seek(offsets[-1])
    compressed_last, blosc_header_last, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_last = blosc.decompress(compressed_last)
    # first chunk has shuffle active
    nt.assert_equal(blosc_header_zero['flags'], 1)
    # last chunk doesn't
    nt.assert_equal(blosc_header_last['flags'], 0)
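For context on the two flags assertions: in the 16-byte blosc chunk header, bit 0x1 of the flags byte records whether byte-shuffle was applied (BLOSC_DOSHUFFLE in blosc.h). A sketch that masks just that bit instead of comparing the whole byte:

BLOSC_DOSHUFFLE = 0x1  # bit 0 of the blosc header flags byte
nt.assert_true(blosc_header_zero['flags'] & BLOSC_DOSHUFFLE)
nt.assert_false(blosc_header_last['flags'] & BLOSC_DOSHUFFLE)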
Example #5
def test_double_append():
    orig, new, new_size, dcmp = prep_array_for_append()
    reset_append_fp(orig, new, new_size)
    reset_append_fp(orig, new, new_size)
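    # reset_append_fp evidently rewinds its file pointers, so this read
    # returns the full original content of 'new'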
    new_str = new.read()
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 3)
    nt.assert_equal(dcmp_str, new_str * 3)
Example #6
def test_append_mix_shuffle():
    orig, new, new_size, dcmp = prep_array_for_append()
    # use the typesize from the file
    # deactivate shuffle
    # crank up the clevel to ensure compression happens, otherwise the flags
    # will be screwed later on
    blosc_args = BloscArgs(typesize=None, shuffle=False, clevel=9)

    # need data that compresses even without shuffle; the linspace content of
    # 'new' no longer does as of python-blosc 1.6.1
    to_append = np.zeros(int(2e6))
    to_append_fp = StringIO()
    to_append_fp.write(to_append.tostring())
    to_append_fp_size = to_append_fp.tell()
    to_append_fp.seek(0)

    # now do the append
    reset_append_fp(orig,
                    to_append_fp,
                    to_append_fp_size,
                    blosc_args=blosc_args)

    # decompress 'orig' so that we can examine it
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    orig.seek(0)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()

    # now sanity check the length and content of the decompressed
    nt.assert_equal(len(dcmp_str), len(new_str) + to_append_fp_size)
    nt.assert_equal(dcmp_str, new_str + to_append.tostring())

    # now get the first and the last chunk and check that their shuffle flags
    # differ
    bloscpack_header, offsets = reset_read_beginning(orig)[0:4:3]
    orig.seek(offsets[0])
    checksum_impl = CHECKSUMS_LOOKUP[bloscpack_header['checksum']]
    compressed_zero, blosc_header_zero, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_zero = blosc.decompress(compressed_zero)
    orig.seek(offsets[-1])
    compressed_last, blosc_header_last, digest = \
        _read_compressed_chunk_fp(orig, checksum_impl)
    decompressed_last = blosc.decompress(compressed_last)
    # first chunk has shuffle active
    nt.assert_equal(blosc_header_zero['flags'], 1)
    # last chunk doesn't
    nt.assert_equal(blosc_header_last['flags'], 0)
Example #7
def test_append_fp():
    orig, new, new_size, dcmp = prep_array_for_append()

    # check that the header and offsets are as we expected them to be
    orig_bloscpack_header, orig_offsets = reset_read_beginning(orig)[0:4:3]
    expected_orig_bloscpack_header = BloscpackHeader(
        format_version=3,
        offsets=True,
        metadata=False,
        checksum='adler32',
        typesize=8,
        chunk_size=1048576,
        last_chunk=271360,
        nchunks=16,
        max_app_chunks=160,
    )
    expected_orig_offsets = [1440]
    nt.assert_equal(expected_orig_bloscpack_header, orig_bloscpack_header)
    nt.assert_equal(expected_orig_offsets[0], orig_offsets[0])

    # perform the append
    reset_append_fp(orig, new, new_size)

    # check that the header and offsets are as we expected them to be after
    # appending
    app_bloscpack_header, app_offsets = reset_read_beginning(orig)[0:4:3]
    expected_app_bloscpack_header = {
        'chunk_size': 1048576,
        'nchunks': 31,
        'last_chunk': 542720,
        'max_app_chunks': 145,
        'format_version': 3,
        'offsets': True,
        'checksum': 'adler32',
        'typesize': 8,
        'metadata': False
    }
    expected_app_offsets = [1440]
    nt.assert_equal(expected_app_bloscpack_header, app_bloscpack_header)
    nt.assert_equal(expected_app_offsets[0], app_offsets[0])
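    # note that nchunks + max_app_chunks stays constant across the append
    # (16 + 160 == 31 + 145 == 176): appending consumes pre-allocated offset
    # slots rather than growing the offsets section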

    # now check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)
Example #8
def test_mixing_clevel():
    # the first set of chunks has max compression
    blosc_args = BloscArgs(clevel=9)
    orig, new, new_size, dcmp = prep_array_for_append()
    # get the original size
    orig.seek(0, 2)
    orig_size = orig.tell()
    orig.seek(0)
    # get a backup of the settings
    bloscpack_header, metadata, metadata_header, offsets = \
            reset_read_beginning(orig)
    # compressed size of the last chunk, including checksum
    last_chunk_compressed_size = orig_size - offsets[-1]

    # do append
    # use the typesize from the file and
    # make the second set of chunks have no compression
    blosc_args = BloscArgs(typesize=None, clevel=0)
    nchunks = append_fp(orig, new, new_size, blosc_args=blosc_args)

    # get the final size
    orig.seek(0, 2)
    final_size = orig.tell()
    orig.seek(0)

    # the original file minus the compressed size of the last chunk
    discounted_orig_size = orig_size - last_chunk_compressed_size
    # size of the appended data:
    #  * the raw new size, since clevel=0 means no compression
    #  * the uncompressed size of the recompressed last chunk
    #  * nchunks + 1 times the per-chunk overhead: a 16-byte blosc header
    #    plus a 4-byte adler32 digest
    appended_size = (new_size + bloscpack_header['last_chunk'] +
                     (nchunks + 1) * (16 + 4))
    # final size should be original plus appended data
    nt.assert_equal(final_size, appended_size + discounted_orig_size)

    # check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)
Example #9
def test_append_metadata():
    orig, new, dcmp = StringIO(), StringIO(), StringIO()
    create_array_fp(1, new)
    new_size = new.tell()
    new.seek(0)

    metadata = {"dtype": "float64", "shape": [1024], "others": []}
    chunking = calculate_nchunks(new_size, chunk_size=new_size)
    source = PlainFPSource(new)
    sink = CompressedFPSink(orig)
    pack(source, sink, *chunking, metadata=metadata)
    orig.seek(0)
    new.seek(0)
    reset_append_fp(orig, new, new_size)
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    ans = unpack(source, sink)
    print(ans)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)
Example #10
def test_append_fp():
    orig, new, new_size, dcmp = prep_array_for_append()

    # check that the header and offsets are as we expected them to be
    orig_bloscpack_header, orig_offsets = reset_read_beginning(orig)[0:4:3]
    expected_orig_bloscpack_header = BloscpackHeader(
            format_version=3,
            offsets=True,
            metadata=False,
            checksum='adler32',
            typesize=8,
            chunk_size=1048576,
            last_chunk=271360,
            nchunks=16,
            max_app_chunks=160,
            )
    expected_orig_offsets = [1440,
                             195299,
                             368931,
                             497746,
                             634063,
                             767529,
                             903070,
                             1038157,
                             1174555,
                             1297424,
                             1420339,
                             1544469,
                             1667805,
                             1791142,
                             1914839,
                             2038360]
    nt.assert_equal(expected_orig_bloscpack_header, orig_bloscpack_header)
    nt.assert_equal(expected_orig_offsets, orig_offsets)

    # perform the append
    reset_append_fp(orig, new, new_size)

    # check that the header and offsets are as we expected them to be after
    # appending
    app_bloscpack_header, app_offsets = reset_read_beginning(orig)[0:4:3]
    expected_app_bloscpack_header = {
            'chunk_size': 1048576,
            'nchunks': 31,
            'last_chunk': 542720,
            'max_app_chunks': 145,
            'format_version': 3,
            'offsets': True,
            'checksum': 'adler32',
            'typesize': 8,
            'metadata': False
    }
    expected_app_offsets = [1440,
                            195299,
                            368931,
                            497746,
                            634063,
                            767529,
                            903070,
                            1038157,
                            1174555,
                            1297424,
                            1420339,
                            1544469,
                            1667805,
                            1791142,
                            1914839,
                            2038360,
                            2221798,
                            2390194,
                            2533644,
                            2663010,
                            2803431,
                            2936406,
                            3071130,
                            3209565,
                            3333390,
                            3457344,
                            3581581,
                            3705533,
                            3829188,
                            3952136,
                            4075509]
    nt.assert_equal(expected_app_bloscpack_header, app_bloscpack_header)
    nt.assert_equal(expected_app_offsets, app_offsets)

    # now check by unpacking
    source = CompressedFPSource(orig)
    sink = PlainFPSink(dcmp)
    unpack(source, sink)
    dcmp.seek(0)
    new.seek(0)
    new_str = new.read()
    dcmp_str = dcmp.read()
    nt.assert_equal(len(dcmp_str), len(new_str) * 2)
    nt.assert_equal(dcmp_str, new_str * 2)