Example #1
def test_get_cbuffer_sizes(self):
    s = b'0123456789' * 100000
    blosc.set_blocksize(2**16)
    c = blosc.compress(s, typesize=1)
    t = blosc.get_cbuffer_sizes(c)
    self.assertEqual(t[0], 1000000)
    # One cannot be sure of the exact compressed bytes, so round to KB
    self.assertEqual(t[1] // 2**10, 4354 // 2**10)
    self.assertEqual(t[2], 2**16)
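blosc.get_cbuffer_sizes returns a tuple of (uncompressed bytes, compressed bytes, block size). A minimal round-trip sketch along the lines of the test above (module-level, variable names illustrative):

import blosc

data = b'0123456789' * 100000
blosc.set_blocksize(2**16)
packed = blosc.compress(data, typesize=1)
nbytes, cbytes, blocksize = blosc.get_cbuffer_sizes(packed)
assert nbytes == len(data)                # uncompressed size is exact
assert blocksize == 2**16                 # block size chosen above
assert blosc.decompress(packed) == data   # cbytes varies across blosc versions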
Example #2
    def compress(self, data, **kwargs):
        '''Useful compression kwargs:
        nthreads
        compression_block_size
        blosc_block_size
        shuffle
        typesize
        cname
        clevel
        '''
        # Blosc code probably assumes contiguous buffer
        assert data.contiguous

        nthreads = kwargs.pop('nthreads', 1)
        compression_block_size = kwargs.pop('compression_block_size', 1 << 22)
        blosc_block_size = kwargs.pop('blosc_block_size', 512 * 1024)
        # dtype size in bytes, e.g. 8 for int64
        typesize = kwargs.pop('typesize', 'auto')
        # compression level; usually only the lowest level is needed for zstd
        clevel = kwargs.pop('clevel', 1)
        # compressor name; zstd gives a good performance/compression tradeoff
        cname = kwargs.pop('cname', 'zstd')

        shuffle = kwargs.pop('shuffle', 'shuffle')
        if shuffle == 'shuffle':
            shuffle = blosc.SHUFFLE
        elif shuffle == 'bitshuffle':
            shuffle = blosc.BITSHUFFLE
        elif shuffle is None:
            shuffle = blosc.NOSHUFFLE
        else:
            raise ValueError(shuffle)

        blosc.set_nthreads(nthreads)
        blosc.set_blocksize(blosc_block_size)

        if typesize == 'auto':
            this_typesize = data.itemsize
        else:
            this_typesize = typesize
        #assert this_typesize != 1

        nelem = compression_block_size // data.itemsize
        for i in range(0, len(data), nelem):
            compressed = blosc.compress(data[i:i + nelem],
                                        typesize=this_typesize,
                                        clevel=clevel,
                                        shuffle=shuffle,
                                        cname=cname,
                                        **kwargs)
            header = struct.pack('!I', len(compressed))
            # TODO: this probably triggers a data copy, feels inefficient. Probably have to add output array arg to blosc to fix
            yield header + compressed
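Each chunk yielded above is prefixed with a 4-byte big-endian length ('!I'). A matching reader therefore has to peel off that header before calling blosc.decompress; a minimal sketch under that assumption (decompress_chunks and its stream argument are illustrative, not part of the original class):

import struct
import blosc

def decompress_chunks(stream):
    # Read the length-prefixed chunks produced by compress() above.
    while True:
        header = stream.read(4)
        if not header:
            break
        (length,) = struct.unpack('!I', header)
        yield blosc.decompress(stream.read(length))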
Example #3
def doCompression(dataStack,
                  compressor='zstd',
                  blocksize=2**20,
                  n_threads=16,
                  shuffle=blosc.BITSHUFFLE,
                  clevel=5):

    blosc.set_blocksize(blocksize)
    blosc.set_nthreads(n_threads)
    typeSize = dataStack.dtype.itemsize
    packedDataList = [None] * dataStack.shape[0]
    for J in np.arange(dataStack.shape[0]):
        packedDataList[J] = blosc.compress(dataStack[J, :, :],
                                           typesize=typeSize,
                                           clevel=clevel,
                                           shuffle=shuffle,
                                           cname=compressor)

    return packedDataList
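doCompression returns one compressed blob per slice, so restoring the stack means decompressing each blob and reinterpreting the bytes with the original dtype and slice shape. A hedged round-trip sketch (not part of the original source):

import numpy as np
import blosc

stack = np.random.rand(4, 64, 64).astype(np.float32)
packed = doCompression(stack, compressor='zstd', clevel=5)
restored = np.stack([
    np.frombuffer(blosc.decompress(p), dtype=stack.dtype).reshape(stack.shape[1:])
    for p in packed
])
assert np.array_equal(stack, restored)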
Example #4
def test_get_blocksize(self):
    s = b'0123456789' * 1000
    blosc.set_blocksize(2**14)
    blosc.compress(s, typesize=1)
    d = blosc.get_blocksize()
    self.assertEqual(d, 2**14)
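set_blocksize is a process-wide setting that applies to every subsequent compress call; besides get_blocksize, the block size actually used can be read back from a compressed buffer. A small sketch using the same API as the test:

import blosc

data = b'0123456789' * 100000
blosc.set_blocksize(2**14)
small = blosc.compress(data, typesize=1)
blosc.set_blocksize(2**16)
large = blosc.compress(data, typesize=1)
print(blosc.get_cbuffer_sizes(small)[2])  # expected: 16384
print(blosc.get_cbuffer_sizes(large)[2])  # expected: 65536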
Example #5
def __MRCExport(input_image, header, MRCfilename, slices, 
                endchar='<', offset=0, idxnewfile=True):
    '''
    MRCExport private interface with a dictionary rather than a mess of function 
    arguments.
    '''

    if idxnewfile: # If forcing a new file we truncate it even if it already exists:
        fmode = 'wb'
    else: # Otherwise we'll just update its header and append images as required:
        fmode = 'rb+'

    with open(MRCfilename, fmode, buffering=BUFFERSIZE) as f:
        extendedBytes = writeMRCHeader(f, header, slices, endchar=endchar)
        f.seek(DEFAULT_HEADER_LEN + extendedBytes + offset)

        dtype = header['dtype']
        if ('compressor' in header) \
                and (header['compressor'] in REVERSE_COMPRESSOR_ENUM) \
                and (REVERSE_COMPRESSOR_ENUM[header['compressor']]) > 0:
            # compressed MRCZ
            logger.debug('Compressing %s with compressor %s%d' %
                    (MRCfilename, header['compressor'], header['clevel']))
            
            applyCast = False
            if slices > 0:
                chunkSize = input_image[0].size
                typeSize = input_image[0].dtype.itemsize
                if dtype != 'uint4' and input_image[0].dtype != dtype: 
                    applyCast = True
            else:
                chunkSize = input_image[0,:,:].size
                typeSize = input_image.dtype.itemsize
                if dtype != 'uint4' and input_image.dtype != dtype: 
                    applyCast = True
                
            blosc.set_nthreads(header['n_threads'])
            # for small image dimensions we need to scale blocksize appropriately
            # so we use the available cores
            block_size = np.minimum(BLOSC_BLOCK, chunkSize//header['n_threads'])
            blosc.set_blocksize(block_size)
            
            header['packedBytes'] = 0
            
            clevel = header['clevel']
            cname = header['compressor']

            # For 3D frames in lists, we need to further sub-divide each frame
            # into slices so that each channel is compressed separately by
            # blosc.
            if slices > 1:
                deep_image = input_image # grab a reference
                input_image = []
                for frame in deep_image:
                    for I in range(slices):
                        input_image.append(frame[I,:,:])

            for J, frame in enumerate(input_image):
                if applyCast:
                    frame = frame.astype(dtype)

                if frame.flags['C_CONTIGUOUS'] and frame.flags['ALIGNED']:
                    # Use pointer
                    compressedData = blosc.compress_ptr(frame.__array_interface__['data'][0], 
                                    frame.size,
                                    typeSize, 
                                    clevel=header['clevel'], 
                                    shuffle=blosc.BITSHUFFLE,
                                    cname=header['compressor'])
                else: 
                    # Use tobytes, which is slower in benchmarking
                    compressedData = blosc.compress(frame.tobytes(),
                                    typeSize, 
                                    clevel=clevel, 
                                    shuffle=blosc.BITSHUFFLE,
                                    cname=cname)
                f.write(compressedData)
                header['packedBytes'] += len(compressedData)

            # Rewind and write out the total compressed size
            f.seek(144)
            np.int64(header['packedBytes']).astype(endchar + 'i8').tofile(f)
            
        else: # vanilla MRC
            if slices > 0: 
                if dtype != 'uint4' and dtype != input_image[0].dtype:
                    for z_slice in input_image:
                        z_slice.astype(dtype).tofile(f)
                else:
                    for z_slice in input_image:
                        z_slice.tofile(f)
            else:
                if dtype != 'uint4' and dtype != input_image.dtype:
                    input_image = input_image.astype(dtype)
                input_image.tofile(f)
            
            
    return 
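The compress_ptr branch above skips the tobytes() copy by handing blosc the array's raw address. python-blosc offers decompress_ptr as the read-side counterpart, which decompresses straight into preallocated memory; a minimal sketch (decompressFrame is illustrative, and shape/dtype are assumed to be known from the MRC header):

import numpy as np
import blosc

def decompressFrame(compressedData, shape, dtype):
    # Decompress directly into a preallocated array, mirroring compress_ptr.
    frame = np.empty(shape, dtype=dtype)
    blosc.decompress_ptr(compressedData, frame.__array_interface__['data'][0])
    return frame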
Example #6
    tPool.join()


def decompressStack(imageShape,
                    imageDtype,
                    blosc_threads=1,
                    pool_threads=maxThreads):
    blosc.set_nthreads(blosc_threads)
    tPool = ThreadPool(pool_threads)

    num_slices = imageShape[0]
    imageStack = np.full(imageShape, fill_value=0, dtype=imageDtype)


blosc.print_versions()
blosc.set_blocksize(BLOCKSIZE)
print("Creating NumPy stack with %d float32 elements:" % (m * N * N))

stack = np.zeros([m, N, N], dtype=dtype)
xmesh, ymesh = np.meshgrid(np.arange(-N / 2, N / 2), np.arange(-N / 2, N / 2))
compress_mesh = (np.cos(xmesh) + np.exp(-ymesh**2 / N)).astype(dtype)
for J in np.arange(m):
    stack[J, :, :] = compress_mesh

### Determine arrangement of pool threads and blosc threads
testCases = int(np.floor(np.log2(maxThreads)) + 1)
powProduct = 2**np.arange(0, testCases)
poolThreads = np.hstack([1, powProduct])
bloscThreads = np.hstack([1, powProduct[::-1]])
# Let's try instead just pool threads...
#poolThreads = np.arange( 1, maxThreads+1 )
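poolThreads and bloscThreads are built as complementary powers of two, so each pairing keeps the total thread count roughly constant. A sketch of how the arrangements might be timed (compressStack and its signature are an assumption modelled on decompressStack above):

import time

for pool_t, blosc_t in zip(poolThreads, bloscThreads):
    t0 = time.perf_counter()
    compressStack(stack, blosc_threads=blosc_t, pool_threads=pool_t)  # hypothetical helper
    print('pool=%d, blosc=%d: %.3f s' % (pool_t, blosc_t, time.perf_counter() - t0))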
Example #7
from effects import SnpEff

# native Python imports
import os.path
import time
import sys
import sqlite3
import itertools as it

import toml  # toml.py

# third-party imports
import cyvcf2 as vcf
import blosc
blosc.set_nthreads(1)
blosc.set_blocksize(8192)

import zlib
import cPickle

def opack_blob(obj, _none=buffer(zlib.compress(cPickle.dumps(None, cPickle.HIGHEST_PROTOCOL)))):
    if obj is None: return _none
    return buffer(zlib.compress(cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL), 1))

def pack_blob(obj):
    if obj is None: return ''
    return buffer(blosc.compress(obj.tostring(), obj.dtype.itemsize, clevel=5, shuffle=True))
    #return buffer(blosc.pack_array(obj))

def is_number(op, field):
    return field.endswith("_float") or op in ("mean", "median", "min", "max")
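pack_blob stores only the raw compressed bytes (note this is Python 2 code: buffer and cPickle), so the array's dtype and shape have to be recorded elsewhere in the schema. A hedged sketch of the matching reader (unpack_blob is hypothetical):

import numpy as np

def unpack_blob(blob, dtype, shape):
    # Inverse of pack_blob; dtype/shape must come from the surrounding schema.
    if not blob:
        return None
    return np.frombuffer(blosc.decompress(bytes(blob)), dtype=dtype).reshape(shape)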
Example #8
def __MRCExport( input_image, header, MRCfilename, endchar = '<' ):
    """
    MRCExport private interface with a dictionary rather than a mess of function 
    arguments.
    """
    with open( MRCfilename, 'wb', buffering=BUFFERSIZE ) as f:
    
        writeMRCHeader( f, header, endchar )
        f.seek(1024)
        
        if ('compressor' in header) \
                and (header['compressor'] in REVERSE_COMPRESSOR_ENUM) \
                and (REVERSE_COMPRESSOR_ENUM[header['compressor']]) > 0:
            # compressed MRCZ
            print( "Compressing %s with compressor %s%d" %
                    (MRCfilename, header['compressor'], header['clevel'] ) )
            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                # This correctly works for text to dtype comparison
                input_image = input_image.astype(header['dtype']) 
                
            if input_image.ndim == 3:
                chunkSize = input_image[0,:,:].size
            else:
                chunkSize = input_image.size
                input_image = np.reshape( input_image, [1,input_image.shape[0],input_image.shape[1] ])
                
            blosc.set_nthreads( header['n_threads'] )
            blosc.set_blocksize( 65536 )
            
            header['packedBytes'] = 0
            typeSize = input_image.dtype.itemsize
            
            print( input_image.shape )
            for J in np.arange( input_image.shape[0] ):
                # print( "Slice %d: Compressing address at: %d of %d:" % (J, int(J*typeSize*blockSize), input_image.nbytes) )
                
                # Looks like I have problem for typesize > 1?
                if int(J*typeSize*chunkSize) >= input_image.nbytes:
                    raise MemoryError( "MRCExport: Tried to reference past end of ndarray %d > %d" % (int(J*typeSize*chunkSize), input_image.nbytes ) )
                    

                compressedData = blosc.compress( input_image[J,:,:].tobytes(),
                            typeSize, 
                            clevel=header['clevel'], 
                            shuffle=blosc.BITSHUFFLE,
                            cname=header['compressor'] )
                f.write( compressedData )
                    
                header['packedBytes'] += len(compressedData)
                # print( "packedBytes = %d" % header['packedBytes'] )
                
            # print( "Finished writing out compressedData" )
            # Rewind and write out the total compressed size
            f.seek(144)
            np.int64( header['packedBytes'] ).astype( endchar + "i8" ).tofile(f)

            
        else: # vanilla MRC
            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                input_image = input_image.astype( header['dtype'] )
            
            input_image.tofile(f)
            
            
    return 
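The rewind to byte 144 targets the header field where MRCZ records the total compressed size. Reading it back follows the same recipe; a small sketch (read_packed_bytes is illustrative, assuming the '<' endian character used above):

import numpy as np

def read_packed_bytes(MRCfilename, endchar='<'):
    # packedBytes lives at byte offset 144 of the MRC header.
    with open(MRCfilename, 'rb') as f:
        f.seek(144)
        return int(np.fromfile(f, dtype=endchar + 'i8', count=1)[0])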
Example #9
def test_get_cbuffer_sizes(self):
    s = b'0123456789' * 100000
    blosc.set_blocksize(2**16)
    c = blosc.compress(s, typesize=1)
    t = blosc.get_cbuffer_sizes(c)
    self.assertEqual(t, (1000000, 4354, 2**16))
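Note that this earlier revision of the test pins the exact compressed size (4354 bytes); the same test in Example #1 rounds to KB instead, since the exact figure varies across blosc versions.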
Example #10
def __MRCExport(input_image,
                header,
                MRCfilename,
                endchar='<',
                offset=0,
                idxnewfile=True):
    '''
    MRCExport private interface with a dictionary rather than a mess of function 
    arguments.
    '''

    if idxnewfile:
        # If forcing a new file we truncate it even if it already exists:
        fmode = 'wb'

    else:
        # Otherwise we'll just update its header and append images as required:
        fmode = 'rb+'

    with open(MRCfilename, fmode, buffering=BUFFERSIZE) as f:
        extendedBytes = writeMRCHeader(f, header, endchar)
        f.seek(DEFAULT_HEADER_LEN + extendedBytes + offset)

        if ('compressor' in header) \
                and (header['compressor'] in REVERSE_COMPRESSOR_ENUM) \
                and (REVERSE_COMPRESSOR_ENUM[header['compressor']]) > 0:
            # compressed MRCZ
            logger.info('Compressing %s with compressor %s%d' %
                        (MRCfilename, header['compressor'], header['clevel']))

            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                # This correctly works for text to dtype comparison
                input_image = input_image.astype(header['dtype'])

            if input_image.ndim == 3:
                chunkSize = input_image[0, :, :].size
            else:
                chunkSize = input_image.size
                input_image = np.reshape(
                    input_image,
                    [1, input_image.shape[0], input_image.shape[1]])

            blosc.set_nthreads(header['n_threads'])
            blosc.set_blocksize(BLOSC_BLOCK)

            header['packedBytes'] = 0
            typeSize = input_image.dtype.itemsize

            for J in np.arange(input_image.shape[0]):
                # print( 'Slice %d: Compressing address at: %d of %d:' % (J, int(J*typeSize*blockSize), input_image.nbytes) )

                # Looks like I have problem for typesize > 1?
                if int(J * typeSize * chunkSize) >= input_image.nbytes:
                    raise MemoryError(
                        'MRCExport: Tried to reference past end of ndarray %d > %d'
                        % (int(J * typeSize * chunkSize), input_image.nbytes))

                compressedData = blosc.compress(input_image[J, :, :].tobytes(),
                                                typeSize,
                                                clevel=header['clevel'],
                                                shuffle=blosc.BITSHUFFLE,
                                                cname=header['compressor'])
                f.write(compressedData)

                header['packedBytes'] += len(compressedData)

            # Rewind and write out the total compressed size
            f.seek(144)
            np.int64(header['packedBytes']).astype(endchar + 'i8').tofile(f)

        else:  # vanilla MRC
            if header['dtype'] != 'uint4' and input_image.dtype != header['dtype']:
                input_image = input_image.astype(header['dtype'])
            input_image.tofile(f)

    return
Example #11
    # All operations are done 'in-place' 
    tPool.map( compressSlice, tArgs )
    tPool.close()
    tPool.join()
    
def decompressStack( imageShape, imageDtype, blosc_threads = 1, pool_threads=maxThreads ):
    blosc.set_nthreads( blosc_threads )
    tPool = ThreadPool( pool_threads )
    
    num_slices = imageShape[0]
    imageStack = np.empty( imageShape, dtype=imageDtype )


blosc.print_versions()
blosc.set_blocksize( BLOCKSIZE )
print("Creating NumPy stack with %d float32 elements:" %(m*N*N) )

stack = np.zeros( [m,N,N], dtype=dtype )
xmesh, ymesh = np.meshgrid( np.arange(-N/2,N/2), np.arange(-N/2,N/2) )
compress_mesh = (np.cos( xmesh ) + np.exp( -ymesh**2 / N )).astype(dtype)
for J in np.arange(m):
    stack[J,:,:] = compress_mesh


### Determine arrangement of pool threads and blosc threads
testCases = int( np.floor( np.log2( maxThreads )) + 1 )
powProduct = 2**np.arange(0,testCases)
poolThreads = np.hstack( [1, powProduct] )
bloscThreads = np.hstack( [1, powProduct[::-1]] )
# Let's try instead just pool threads...
Example #12
if buffer.dtype.kind in ('S', 'U'):
    is_string = True
    filters = (blosc.NOSHUFFLE, )
else:
    is_string = False
    filters = (blosc.NOSHUFFLE, blosc.SHUFFLE, blosc.BITSHUFFLE)
for i, chunk in enumerate(chunk_generator(buffer)):
    if is_string:
        chunk_features = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    else:
        chunk_features = extract_chunk_features(chunk)
        chunk_features += (calculate_streaks(chunk,
                                             chunk_features[1]), )
    df = pd.DataFrame()
    for block_size in BLOCK_SIZES:
        blosc.set_blocksize(block_size)
        for codec in blosc.compressor_list():
            for filter in filters:
                for clevel in C_LEVELS:
                    row_data = (filename, path, table, d_type, i + 1,
                                chunk.size * chunk.dtype.itemsize / MB) \
                        + chunk_features \
                        + (block_size / 2**10, codec, blosc.filters[filter], clevel) \
                        + test_codec(chunk, codec,
                                     filter, clevel)
                    df = df.append(dict(zip(COLS, row_data)),
                                   ignore_index=True)
    print("%5.2f%% %-s %-s t%-s chunk %d completed" %
          ((i + 1) / n_chunks * 100, filename, path, table, (i + 1)))
    with open('blosc_test_data.csv', 'a') as f:
        df = df[COLS]
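A note on the df.append loop above: DataFrame.append was removed in pandas 2.0, so on current pandas the innermost loop would need to collect rows and build the frame once, e.g. (a sketch reusing the names from the snippet):

rows = []
for clevel in C_LEVELS:
    row_data = (filename, path, table, d_type, i + 1,
                chunk.size * chunk.dtype.itemsize / MB) \
        + chunk_features \
        + (block_size / 2**10, codec, blosc.filters[filter], clevel) \
        + test_codec(chunk, codec, filter, clevel)
    rows.append(dict(zip(COLS, row_data)))
df = pd.DataFrame(rows, columns=COLS)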