def test_get_cbuffer_sizes(self):
    s = b'0123456789' * 100000
    blosc.set_blocksize(2**16)
    c = blosc.compress(s, typesize=1)
    t = blosc.get_cbuffer_sizes(c)
    self.assertEqual(t[0], 1000000)
    # One cannot be sure of the exact compressed bytes, so round to KB
    self.assertEqual(t[1] // 2**10, 4354 // 2**10)
    self.assertEqual(t[2], 2**16)
def compress(self, data, **kwargs):
    '''Useful compression kwargs:
    nthreads
    compression_block_size
    blosc_block_size
    shuffle
    typesize
    cname
    clevel
    '''
    # Blosc code probably assumes contiguous buffer
    assert data.contiguous
    nthreads = kwargs.pop('nthreads', 1)
    compression_block_size = kwargs.pop('compression_block_size', 1 << 22)
    blosc_block_size = kwargs.pop('blosc_block_size', 512 * 1024)
    typesize = kwargs.pop('typesize', 'auto')  # dtype size in bytes, e.g. 8 for int64
    clevel = kwargs.pop('clevel', 1)  # compression level, usually only need lowest for zstd
    cname = kwargs.pop('cname', 'zstd')  # compressor name, default zstd, good performance/compression tradeoff
    shuffle = kwargs.pop('shuffle', 'shuffle')
    if shuffle == 'shuffle':
        shuffle = blosc.SHUFFLE
    elif shuffle == 'bitshuffle':
        shuffle = blosc.BITSHUFFLE
    elif shuffle is None:
        shuffle = blosc.NOSHUFFLE
    else:
        raise ValueError(shuffle)

    blosc.set_nthreads(nthreads)
    blosc.set_blocksize(blosc_block_size)

    if typesize == 'auto':
        this_typesize = data.itemsize
    else:
        this_typesize = typesize
    #assert this_typesize != 1
    nelem = compression_block_size // data.itemsize
    for i in range(0, len(data), nelem):
        compressed = blosc.compress(data[i:i + nelem],
                                    typesize=this_typesize,
                                    clevel=clevel,
                                    shuffle=shuffle,
                                    cname=cname,
                                    **kwargs)
        header = struct.pack('!I', len(compressed))
        # TODO: this probably triggers a data copy, feels inefficient.
        # Probably have to add an output array arg to blosc to fix.
        yield header + compressed
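A minimal sketch of the matching read path for the stream produced above, assuming the yielded chunks were concatenated into a file-like object; `decompress_chunks` and its `stream` argument are illustrative names, not part of the original API:

def decompress_chunks(stream):
    # Each chunk was written as a 4-byte big-endian length prefix followed by
    # one independent blosc frame; walk the prefixes and decompress each frame.
    while True:
        header = stream.read(4)
        if len(header) < 4:
            return
        (length,) = struct.unpack('!I', header)
        yield blosc.decompress(stream.read(length))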
def doCompression(dataStack, compressor='zstd', blocksize=2**20, n_threads=16,
                  shuffle=blosc.BITSHUFFLE, clevel=5):
    blosc.set_blocksize(blocksize)
    blosc.set_nthreads(n_threads)
    typeSize = dataStack.dtype.itemsize
    packedDataList = [None] * dataStack.shape[0]
    for J in np.arange(dataStack.shape[0]):
        packedDataList[J] = blosc.compress(dataStack[J, :, :],
                                           typesize=typeSize,
                                           clevel=clevel,
                                           shuffle=shuffle,
                                           cname=compressor)
    return packedDataList
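A possible inverse of `doCompression` for round-tripping, assuming the original slice shape and dtype are known to the caller; `doDecompression` is an illustrative name and not from the source:

def doDecompression(packedDataList, sliceShape, dtype, n_threads=16):
    # Rebuild the 3-D stack slice by slice; shape and dtype must be supplied
    # because blosc only stores raw bytes.
    blosc.set_nthreads(n_threads)
    dataStack = np.empty([len(packedDataList)] + list(sliceShape), dtype=dtype)
    for J, packed in enumerate(packedDataList):
        dataStack[J, :, :] = np.frombuffer(blosc.decompress(packed),
                                           dtype=dtype).reshape(sliceShape)
    return dataStack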
def test_get_blocksize(self):
    s = b'0123456789' * 1000
    blosc.set_blocksize(2**14)
    blosc.compress(s, typesize=1)
    d = blosc.get_blocksize()
    self.assertEqual(d, 2**14)
def __MRCExport(input_image, header, MRCfilename, slices, endchar='<',
                offset=0, idxnewfile=True):
    '''
    MRCExport private interface with a dictionary rather than a mess of
    function arguments.
    '''
    if idxnewfile:
        # If forcing a new file we truncate it even if it already exists:
        fmode = 'wb'
    else:
        # Otherwise we'll just update its header and append images as required:
        fmode = 'rb+'

    with open(MRCfilename, fmode, buffering=BUFFERSIZE) as f:
        extendedBytes = writeMRCHeader(f, header, slices, endchar=endchar)
        f.seek(DEFAULT_HEADER_LEN + extendedBytes + offset)

        dtype = header['dtype']
        if ('compressor' in header) \
                and (header['compressor'] in REVERSE_COMPRESSOR_ENUM) \
                and (REVERSE_COMPRESSOR_ENUM[header['compressor']] > 0):
            # compressed MRCZ
            logger.debug('Compressing %s with compressor %s%d' %
                         (MRCfilename, header['compressor'], header['clevel']))

            applyCast = False
            if slices > 0:
                chunkSize = input_image[0].size
                typeSize = input_image[0].dtype.itemsize
                if dtype != 'uint4' and input_image[0].dtype != dtype:
                    applyCast = True
            else:
                chunkSize = input_image[0, :, :].size
                typeSize = input_image.dtype.itemsize
                if dtype != 'uint4' and input_image.dtype != dtype:
                    applyCast = True

            blosc.set_nthreads(header['n_threads'])
            # For small image dimensions we need to scale the blocksize
            # appropriately so that we use the available cores.
            block_size = np.minimum(BLOSC_BLOCK, chunkSize // header['n_threads'])
            blosc.set_blocksize(block_size)

            header['packedBytes'] = 0
            clevel = header['clevel']
            cname = header['compressor']

            # For 3D frames in lists, we need to further sub-divide each frame
            # into slices so that each channel is compressed separately by
            # blosc.
            if slices > 1:
                deep_image = input_image  # grab a reference
                input_image = []
                for frame in deep_image:
                    for I in range(slices):
                        input_image.append(frame[I, :, :])

            for J, frame in enumerate(input_image):
                if applyCast:
                    frame = frame.astype(dtype)

                if frame.flags['C_CONTIGUOUS'] and frame.flags['ALIGNED']:
                    # Use pointer
                    compressedData = blosc.compress_ptr(
                        frame.__array_interface__['data'][0],
                        frame.size,
                        typeSize,
                        clevel=header['clevel'],
                        shuffle=blosc.BITSHUFFLE,
                        cname=header['compressor'])
                else:
                    # Use tobytes, which is slower in benchmarking
                    compressedData = blosc.compress(frame.tobytes(),
                                                    typeSize,
                                                    clevel=clevel,
                                                    shuffle=blosc.BITSHUFFLE,
                                                    cname=cname)

                f.write(compressedData)
                header['packedBytes'] += len(compressedData)

            # Rewind and write out the total compressed size
            f.seek(144)
            np.int64(header['packedBytes']).astype(endchar + 'i8').tofile(f)

        else:  # vanilla MRC
            if slices > 0:
                if dtype != 'uint4' and dtype != input_image[0].dtype:
                    for z_slice in input_image:
                        z_slice.astype(dtype).tofile(f)
                else:
                    for z_slice in input_image:
                        z_slice.tofile(f)
            else:
                if dtype != 'uint4' and dtype != input_image.dtype:
                    input_image = input_image.astype(dtype)
                input_image.tofile(f)
    return
tPool.join()


def decompressStack(imageShape, imageDtype, blosc_threads=1,
                    pool_threads=maxThreads):
    blosc.set_nthreads(blosc_threads)
    tPool = ThreadPool(pool_threads)
    num_slices = imageShape[0]
    imageStack = np.full(imageShape, fill_value=0)


blosc.print_versions()
blosc.set_blocksize(BLOCKSIZE)

print("Creating NumPy stack with %d float32 elements:" % (m * N * N))
stack = np.zeros([m, N, N], dtype=dtype)
xmesh, ymesh = np.meshgrid(np.arange(-N / 2, N / 2), np.arange(-N / 2, N / 2))
compress_mesh = (np.cos(xmesh) + np.exp(-ymesh**2 / N)).astype(dtype)
for J in np.arange(m):
    stack[J, :, :] = compress_mesh

### Determine arrangement of pool threads and blosc threads
testCases = int(np.floor(np.log2(maxThreads)) + 1)
powProduct = 2**np.arange(0, testCases)
poolThreads = np.hstack([1, powProduct])
bloscThreads = np.hstack([1, powProduct[::-1]])
# Let's try instead just pool threads...
#poolThreads = np.arange( 1, maxThreads+1 )
from effects import SnpEff

# native Python imports
import os.path
import time
import sys
import sqlite3
import itertools as it

import toml  # toml.py

# third-party imports
import cyvcf2 as vcf

import blosc
blosc.set_nthreads(1)
blosc.set_blocksize(8192)

import zlib
import cPickle


def opack_blob(obj, _none=buffer(zlib.compress(cPickle.dumps(None,
                                 cPickle.HIGHEST_PROTOCOL)))):
    if obj is None:
        return _none
    return buffer(zlib.compress(cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL), 1))


def pack_blob(obj):
    if obj is None:
        return ''
    return buffer(blosc.compress(obj.tostring(), obj.dtype.itemsize,
                                 clevel=5, shuffle=True))
    #return buffer(blosc.pack_array(obj))


def is_number(op, field):
    return field.endswith("_float") or op in ("mean", "median", "min", "max")
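A hedged sketch of the reverse of `pack_blob`; since blosc stores only raw bytes, the original dtype has to be passed back in. `unpack_blob` is an assumed helper name, not part of the original module, and it assumes numpy is available:

import numpy as np

def unpack_blob(blob, dtype):
    # Hypothetical inverse of pack_blob(): decompress the blosc frame and
    # reinterpret the bytes with the caller-supplied dtype.
    if not blob:
        return None
    return np.frombuffer(blosc.decompress(blob), dtype=dtype)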
def __MRCExport( input_image, header, MRCfilename, endchar = '<' ):
    """
    MRCExport private interface with a dictionary rather than a mess of
    function arguments.
    """
    with open( MRCfilename, 'wb', buffering=BUFFERSIZE ) as f:
        writeMRCHeader( f, header, endchar )
        f.seek(1024)

        if ('compressor' in header) \
                and (header['compressor'] in REVERSE_COMPRESSOR_ENUM) \
                and (REVERSE_COMPRESSOR_ENUM[header['compressor']] > 0):
            # compressed MRCZ
            print( "Compressing %s with compressor %s%d" %
                   (MRCfilename, header['compressor'], header['clevel']) )

            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                # This correctly works for text to dtype comparison
                input_image = input_image.astype( header['dtype'] )

            if input_image.ndim == 3:
                chunkSize = input_image[0,:,:].size
            else:
                chunkSize = input_image.size
                input_image = np.reshape( input_image,
                                          [1, input_image.shape[0], input_image.shape[1]] )

            blosc.set_nthreads( header['n_threads'] )
            blosc.set_blocksize( 65536 )

            header['packedBytes'] = 0
            typeSize = input_image.dtype.itemsize
            print( input_image.shape )
            for J in np.arange( input_image.shape[0] ):
                # print( "Slice %d: Compressing address at: %d of %d:" % (J, int(J*typeSize*blockSize), input_image.nbytes) )

                # Looks like I have problem for typesize > 1?
                if int(J*typeSize*chunkSize) >= input_image.nbytes:
                    raise MemoryError( "MRCExport: Tried to reference past end of ndarray %d > %d" %
                                       (int(J*typeSize*chunkSize), input_image.nbytes) )

                compressedData = blosc.compress( input_image[J,:,:].tobytes(),
                                                 typeSize,
                                                 clevel=header['clevel'],
                                                 shuffle=blosc.BITSHUFFLE,
                                                 cname=header['compressor'] )
                f.write( compressedData )
                header['packedBytes'] += len(compressedData)
                # print( "packedBytes = %d" % header['packedBytes'] )

            # print( "Finished writing out compressedData" )
            # Rewind and write out the total compressed size
            f.seek(144)
            np.int64( header['packedBytes'] ).astype( endchar + "i8" ).tofile(f)

        else: # vanilla MRC
            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                input_image = input_image.astype( header['dtype'] )
            input_image.tofile(f)
    return
def test_get_cbuffer_sizes(self):
    s = b'0123456789' * 100000
    blosc.set_blocksize(2**16)
    c = blosc.compress(s, typesize=1)
    t = blosc.get_cbuffer_sizes(c)
    self.assertEqual(t, (1000000, 4354, 2**16))
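Outside the test, `get_cbuffer_sizes()` is handy for pre-allocating the destination of `decompress_ptr()`; a small self-contained sketch (the `out` array and the sample payload are illustrative):

import blosc
import numpy as np

c = blosc.compress(b'0123456789' * 100000, typesize=1)
nbytes, cbytes, blocksize = blosc.get_cbuffer_sizes(c)
out = np.empty(nbytes, dtype=np.uint8)   # uncompressed size read from the blosc header
blosc.decompress_ptr(c, out.__array_interface__['data'][0])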
def __MRCExport(input_image, header, MRCfilename, endchar='<', offset=0,
                idxnewfile=True):
    '''
    MRCExport private interface with a dictionary rather than a mess of
    function arguments.
    '''
    if idxnewfile:
        # If forcing a new file we truncate it even if it already exists:
        fmode = 'wb'
    else:
        # Otherwise we'll just update its header and append images as required:
        fmode = 'rb+'

    with open(MRCfilename, fmode, buffering=BUFFERSIZE) as f:
        extendedBytes = writeMRCHeader(f, header, endchar)
        f.seek(DEFAULT_HEADER_LEN + extendedBytes + offset)

        if ('compressor' in header) \
                and (header['compressor'] in REVERSE_COMPRESSOR_ENUM) \
                and (REVERSE_COMPRESSOR_ENUM[header['compressor']] > 0):
            # compressed MRCZ
            logger.info('Compressing %s with compressor %s%d' %
                        (MRCfilename, header['compressor'], header['clevel']))

            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                # This correctly works for text to dtype comparison
                input_image = input_image.astype(header['dtype'])

            if input_image.ndim == 3:
                chunkSize = input_image[0, :, :].size
            else:
                chunkSize = input_image.size
                input_image = np.reshape(
                    input_image, [1, input_image.shape[0], input_image.shape[1]])

            blosc.set_nthreads(header['n_threads'])
            blosc.set_blocksize(BLOSC_BLOCK)

            header['packedBytes'] = 0
            typeSize = input_image.dtype.itemsize
            for J in np.arange(input_image.shape[0]):
                # print( 'Slice %d: Compressing address at: %d of %d:' % (J, int(J*typeSize*blockSize), input_image.nbytes) )

                # Looks like I have problem for typesize > 1?
                if int(J * typeSize * chunkSize) >= input_image.nbytes:
                    raise MemoryError(
                        'MRCExport: Tried to reference past end of ndarray %d > %d'
                        % (int(J * typeSize * chunkSize), input_image.nbytes))

                compressedData = blosc.compress(input_image[J, :, :].tobytes(),
                                                typeSize,
                                                clevel=header['clevel'],
                                                shuffle=blosc.BITSHUFFLE,
                                                cname=header['compressor'])
                f.write(compressedData)
                header['packedBytes'] += len(compressedData)

            # Rewind and write out the total compressed size
            f.seek(144)
            np.int64(header['packedBytes']).astype(endchar + 'i8').tofile(f)

        else:  # vanilla MRC
            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                input_image = input_image.astype(header['dtype'])
            input_image.tofile(f)
    return
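For reading the slices back, the per-slice compressed size is not stored in the MRC header, but it can be recovered from each chunk's 16-byte blosc header. A rough sketch of that loop, assuming `get_cbuffer_sizes()` accepts just the header bytes; `_read_compressed_slices` and its arguments are illustrative names, not the original reader:

def _read_compressed_slices(f, n_slices, slice_shape, dtype):
    # Each slice was written as an independent blosc frame whose 16-byte header
    # records both the compressed (cbytes) and uncompressed (nbytes) sizes.
    BLOSC_HEADER_LEN = 16
    image = np.empty([n_slices] + list(slice_shape), dtype=dtype)
    for J in range(n_slices):
        blosc_header = f.read(BLOSC_HEADER_LEN)
        nbytes, cbytes, _ = blosc.get_cbuffer_sizes(blosc_header)
        chunk = blosc_header + f.read(cbytes - BLOSC_HEADER_LEN)
        image[J, :, :] = np.frombuffer(blosc.decompress(chunk),
                                       dtype=dtype).reshape(slice_shape)
    return image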
# All operations are done 'in-place'
tPool.map( compressSlice, tArgs )
tPool.close()
tPool.join()


def decompressStack( imageShape, imageDtype, blosc_threads = 1, pool_threads=maxThreads ):
    blosc.set_nthreads( blosc_threads )
    tPool = ThreadPool( pool_threads )

    num_slices = imageShape[0]
    imageStack = np.empty( imageShape )


blosc.print_versions()
blosc.set_blocksize( BLOCKSIZE )

print("Creating NumPy stack with %d float32 elements:" % (m*N*N))
stack = np.zeros( [m,N,N], dtype=dtype )
xmesh, ymesh = np.meshgrid( np.arange(-N/2,N/2), np.arange(-N/2,N/2) )
compress_mesh = (np.cos( xmesh ) + np.exp( -ymesh**2 / N )).astype(dtype)
for J in np.arange(m):
    stack[J,:,:] = compress_mesh

### Determine arrangement of pool threads and blosc threads
testCases = int( np.floor( np.log2( maxThreads )) + 1 )
powProduct = 2**np.arange(0,testCases)
poolThreads = np.hstack( [1, powProduct] )
bloscThreads = np.hstack( [1, powProduct[::-1]] )
# Let's try instead just pool threads...
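The `compressSlice` worker mapped over the pool is not shown in this excerpt. A plausible in-place version, assuming `tArgs` is a list of `(index, stack, packedList)` tuples, might look like the following sketch (parameter choices are illustrative):

def compressSlice(args):
    # Worker for ThreadPool.map: compresses one 2-D slice 'in-place' into a
    # shared list, so no results need to travel back through the pool.
    J, stack, packedList = args
    packedList[J] = blosc.compress( stack[J,:,:].tobytes(),
                                    typesize=stack.dtype.itemsize,
                                    clevel=5,
                                    shuffle=blosc.BITSHUFFLE,
                                    cname='zstd' )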
if buffer.dtype.kind in ('S', 'U'):
    is_string = True
    filters = (blosc.NOSHUFFLE, )
else:
    is_string = False
    filters = (blosc.NOSHUFFLE, blosc.SHUFFLE, blosc.BITSHUFFLE)

for i, chunk in enumerate(chunk_generator(buffer)):
    if is_string:
        chunk_features = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    else:
        chunk_features = extract_chunk_features(chunk)
        chunk_features += (calculate_streaks(chunk, chunk_features[1]), )
    df = pd.DataFrame()
    for block_size in BLOCK_SIZES:
        blosc.set_blocksize(block_size)
        for codec in blosc.compressor_list():
            for filter in filters:
                for clevel in C_LEVELS:
                    row_data = (filename, path, table, d_type, i + 1,
                                chunk.size * chunk.dtype.itemsize / MB) \
                               + chunk_features \
                               + (block_size / 2**10, codec, blosc.filters[filter], clevel) \
                               + test_codec(chunk, codec, filter, clevel)
                    df = df.append(dict(zip(COLS, row_data)), ignore_index=True)
    print("%5.2f%% %-s %-s t%-s chunk %d completed" %
          ((i + 1) / n_chunks * 100, filename, path, table, (i + 1)))
    with open('blosc_test_data.csv', 'a') as f:
        df = df[COLS]
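`test_codec()` above is not defined in this excerpt. A minimal stand-in that returns the kind of figures it appears to provide (compression ratio plus compression and decompression times) might look like this; the name, signature, and column order are assumptions:

import time

def test_codec(chunk, codec, filter, clevel):
    # Hypothetical helper: round-trip one chunk with the given codec/filter/clevel
    # and report (ratio, compression time, decompression time).
    t0 = time.time()
    compressed = blosc.compress_ptr(chunk.__array_interface__['data'][0],
                                    chunk.size, chunk.dtype.itemsize,
                                    clevel=clevel, shuffle=filter, cname=codec)
    t1 = time.time()
    blosc.decompress(compressed)
    t2 = time.time()
    ratio = chunk.size * chunk.dtype.itemsize / len(compressed)
    return ratio, t1 - t0, t2 - t1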