def load_to_buffer(self, file, buffer):
    """Decompress data from a file into a provided buffer."""
    _ = dill.load(file)  # skip the pickled (shape, dtype) header; the caller's buffer must already match
    compressed = file.read()
    blosc.decompress_ptr(compressed, buffer.__array_interface__['data'][0])
    return buffer
def decompress_ndarray(binary: bytes, output_array=None) -> numpy.ndarray:
    """
    decompress_ndarray(binary[, output_array=None])

    Decompress an array from a buffer. If output_array is provided, the data
    is decompressed into it, avoiding a fresh allocation (and possible
    out-of-memory errors) on every call.
    """
    assert type(binary) is bytes
    assert type(output_array) in [numpy.ndarray, numpy.memmap, type(None)]
    buffer, dtype, shape = pickle.loads(binary)
    if output_array is None:
        output_array = numpy.empty(shape, dtype)
    else:
        assert dtype == output_array.dtype
        if type(output_array) is numpy.ndarray:
            output_array.resize(shape, refcheck=False)
        else:
            # memmaps cannot be resized in place; remap the backing file
            output_array = numpy.memmap(output_array.filename, shape=shape, dtype=dtype)
    assert output_array.shape == shape, "Output array's shape doesn't match the compressed array."
    blosc.decompress_ptr(buffer, output_array.__array_interface__['data'][0])
    return output_array
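For context, a minimal sketch of the compress-side counterpart this function implies: the binary payload is a pickled (compressed, dtype, shape) tuple, with the raw array memory compressed via blosc.compress_ptr. The function name compress_ndarray and the clevel default are assumptions.

import pickle

import blosc
import numpy

def compress_ndarray(array: numpy.ndarray, clevel: int = 5) -> bytes:
    # Hypothetical counterpart to decompress_ndarray above: compress the raw
    # array memory (assumed C-contiguous), then pickle it together with the
    # metadata that the decompressor unpacks as (buffer, dtype, shape).
    compressed = blosc.compress_ptr(array.__array_interface__['data'][0],
                                    array.size, array.dtype.itemsize,
                                    clevel=clevel)
    return pickle.dumps((compressed, array.dtype, array.shape))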
def decompress_pre(path, arr):
    with open(path, "rb") as f:
        shape, dtype = pickle.load(f)  # read (and skip) the header; arr must already match
        c = f.read()
    # array allocation takes most of the time, hence the preallocated arr
    blosc.decompress_ptr(c, arr.__array_interface__['data'][0])
    return arr
def test_decompress_ptr_input_types(self):
    import numpy as np
    # assume the expected answer was compressed from bytes
    expected = b'0123456789'
    out = np.zeros(len(expected), dtype=np.byte)
    compressed = blosc.compress(expected, typesize=1)

    # now for all the things that support the buffer interface
    out[:] = 0  # reset the output array
    nout = blosc.decompress_ptr(compressed, out.ctypes.data)
    self.assertEqual(expected, out.tobytes())
    self.assertEqual(len(expected), nout)  # check that we didn't write too many bytes

    out[:] = 0
    nout = blosc.decompress_ptr(memoryview(compressed), out.ctypes.data)
    self.assertEqual(expected, out.tobytes())
    self.assertEqual(len(expected), nout)

    out[:] = 0
    nout = blosc.decompress_ptr(bytearray(compressed), out.ctypes.data)
    self.assertEqual(expected, out.tobytes())
    self.assertEqual(len(expected), nout)

    out[:] = 0
    nout = blosc.decompress_ptr(np.frombuffer(compressed, dtype=np.byte),
                                out.ctypes.data)
    self.assertEqual(expected, out.tobytes())
    self.assertEqual(len(expected), nout)
def load(self, file):
    """Read the shape and dtype from a file, create a buffer, decompress data into it."""
    shape, dtype = dill.load(file)
    compressed = file.read()
    array = np.empty(shape=shape, dtype=dtype)
    blosc.decompress_ptr(compressed, array.__array_interface__['data'][0])
    return array
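A minimal sketch of the save routine these loaders imply, assuming a dill-pickled (shape, dtype) header followed by the blosc payload; the function name save is hypothetical.

import blosc
import dill

def save(file, array):
    # Write the (shape, dtype) header that load() / load_to_buffer() read
    # back, then the blosc-compressed raw bytes of the (contiguous) array.
    dill.dump((array.shape, array.dtype), file)
    file.write(blosc.compress_ptr(array.__array_interface__['data'][0],
                                  array.size, array.dtype.itemsize))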
def decompression(self):
    """
    Data decompression.

    Returns
    -------
    uncompressed numpy array
    """
    c = empty(self.size, dtype=self.dtype)
    decompress_ptr(self.data, c.__array_interface__['data'][0])
    return c
def __getitem__(self, index):
    fname = os.path.join(self.dataname, self.filenames[index])
    if self.is_compressed:
        with open(fname, 'rb') as f:
            arr = numpy.empty(self.ndim, dtype=numpy.float64)
            blosc.decompress_ptr(f.read(), arr.__array_interface__['data'][0])
        return torch.from_numpy(arr)
    else:
        return torch.from_numpy(numpy.load(fname))
def test_codec(chunk, codec, filter_name, clevel):
    """
    Compress and decompress the chunk and return the measured performance.

    Parameters
    ----------
    chunk : numpy.ndarray
        The data to be compressed.
    codec : string
        The name of the compressor used internally in Blosc. It can be any
        of those supported by Blosc ('blosclz', 'lz4', 'lz4hc', 'snappy',
        'zlib', 'zstd' and maybe others too).
    filter_name : int
        The shuffle filter to be activated. Allowed values are
        blosc.NOSHUFFLE, blosc.SHUFFLE and blosc.BITSHUFFLE.
    clevel : int
        The compression level from 0 (no compression) to 9 (maximum
        compression).

    Returns
    -------
    out : tuple
        The associated compression rate, compression speed and
        decompression speed (in GB/s).

    Raises
    ------
    TypeError
        If chunk doesn't support the buffer interface.
    ValueError
        If chunk is too long.
        If typesize is not within the allowed range.
        If clevel is not within the allowed range.
        If codec is not a valid codec.
    """
    t0 = time()
    c = blosc.compress_ptr(chunk.__array_interface__['data'][0],
                           chunk.size, chunk.dtype.itemsize,
                           clevel=clevel, shuffle=filter_name, cname=codec)
    tc = time() - t0

    out = np.empty(chunk.size, dtype=chunk.dtype)
    times = []
    for i in range(3):
        t0 = time()
        blosc.decompress_ptr(c, out.__array_interface__['data'][0])
        times.append(time() - t0)

    chunk_byte_size = chunk.size * chunk.dtype.itemsize
    rate = chunk_byte_size / len(c)
    c_speed = chunk_byte_size / tc / SPEED_UNIT
    d_speed = chunk_byte_size / min(times) / SPEED_UNIT
    # print(" *** %-8s, %-10s, CL%d *** %6.4f s / %5.4f s " %
    #       (codec, blosc.filters[filter_name], clevel, tc, td), end='')
    # print("\tCompr. ratio: %5.1fx" % rate)
    return rate, c_speed, d_speed
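A hedged usage sketch for this benchmark helper, assuming SPEED_UNIT = 2 ** 30 so that speeds come out in GB/s, plus the imports the function body relies on; the sample chunk is arbitrary.

from time import time

import blosc
import numpy as np

SPEED_UNIT = 2 ** 30  # assumed: bytes per GB, so speeds are reported in GB/s

chunk = np.arange(10_000_000, dtype=np.float64)  # hypothetical test data
rate, c_speed, d_speed = test_codec(chunk, 'lz4', blosc.SHUFFLE, 5)
print("ratio %.1fx, compress %.2f GB/s, decompress %.2f GB/s"
      % (rate, c_speed, d_speed))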
def decompress_ndarray(binary: bytes, output_array=None) -> numpy.ndarray:
    assert type(binary) is bytes
    # output_array may be omitted; note that `if not output_array` would raise
    # "truth value is ambiguous" for non-empty arrays, so test against None
    assert output_array is None or type(output_array) is numpy.ndarray
    buffer, dtype, shape = pickle.loads(binary)
    if output_array is None:
        output_array = numpy.empty(shape, dtype)
    assert output_array.shape == shape, "Output array's shape doesn't match the compressed array."
    blosc.decompress_ptr(buffer, output_array.__array_interface__['data'][0])
    return output_array
def test_codec(chunk, codec, filter_name, clevel):
    """
    Compress the array chunk with the given codec, filter and clevel and
    return the compression rate and the compression/decompression times.

    Parameters
    ----------
    chunk : numpy.ndarray
        The data to be compressed.
    codec : string
        The name of the compressor used internally in Blosc. It can be any
        of those supported by Blosc ('blosclz', 'lz4', 'lz4hc', 'snappy',
        'zlib', 'zstd' and maybe others too).
    filter_name : int
        The shuffle filter to be activated. Allowed values are
        blosc.NOSHUFFLE, blosc.SHUFFLE and blosc.BITSHUFFLE.
    clevel : int
        The compression level from 0 (no compression) to 9 (maximum
        compression).

    Returns
    -------
    out : tuple
        The associated compression rate, compression time and
        decompression time.

    Raises
    ------
    TypeError
        If chunk doesn't support the buffer interface.
    ValueError
        If chunk is too long.
        If typesize is not within the allowed range.
        If clevel is not within the allowed range.
        If codec is not a valid codec.
    """
    t0 = time()
    c = blosc.compress_ptr(chunk.__array_interface__['data'][0],
                           chunk.size, chunk.dtype.itemsize,
                           clevel=clevel, shuffle=filter_name, cname=codec)
    tc = time() - t0

    out = np.empty(chunk.size, dtype=chunk.dtype)
    t0 = time()
    blosc.decompress_ptr(c, out.__array_interface__['data'][0])
    td = time() - t0

    rate = chunk.size * chunk.dtype.itemsize / len(c)
    assert (chunk == out).all()
    # print(" *** %-8s, %-10s, CL%d *** %6.4f s / %5.4f s " %
    #       (codec, blosc.filters[filter_name], clevel, tc, td), end='')
    # print("\tCompr. ratio: %5.1fx" % rate)
    return (rate, tc, td)
def _expanded_copy(obj):
    """Expand arrays within dicts, tuples and lists; do not dig into other
    objects for now.
    """
    if isinstance(obj, _SqueezedArray):
        shape, dtype, comp = obj
        array = np.empty(shape, dtype=dtype)
        blosc.decompress_ptr(comp, array.__array_interface__['data'][0])
        return array
    tpe = type(obj)
    if tpe is tuple or tpe is list:
        return tpe(_expanded_copy(el) for el in obj)
    if tpe is dict:
        return tpe((k, _expanded_copy(v)) for k, v in obj.items())
    return obj
def read_blosc(stream, out=None):
    meta = read_json(stream)
    shape = tuple(meta['shape'])
    dtype = restore_dtype(meta['dtype'])
    if out is None:
        out = np.empty(shape, dtype)
    elif not isinstance(out, np.ndarray):
        raise TypeError('expected ndarray, got {}'.format(type(out).__name__))
    elif out.shape != shape:
        raise ValueError('incompatible shape: expected {}, got {}'.format(shape, out.shape))
    elif out.dtype != dtype:
        raise ValueError('incompatible dtype: expected {}, got {}'.format(dtype, out.dtype))
    elif not out.flags.contiguous:
        raise ValueError('expected contiguous array')
    blosc.decompress_ptr(
        stream.read(meta['length']),
        out.__array_interface__['data'][0]
    )
    if out.dtype.type is np.record:
        out = out.view(np.recarray)
    return out
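A sketch of the writer read_blosc implies, under loud assumptions: the header is a JSON object with 'shape', 'dtype' and 'length' keys, write_json is a guessed counterpart of the read_json helper (its framing here is an assumption), and dtype.str is a guess at the encoding that restore_dtype undoes.

import json

import blosc
import numpy as np

def write_json(stream, obj):
    # assumed framing: a newline-terminated JSON header; the real read_json
    # helper may frame its header differently
    stream.write(json.dumps(obj).encode() + b'\n')

def write_blosc(stream, arr):
    arr = np.ascontiguousarray(arr)
    compressed = blosc.compress_ptr(arr.__array_interface__['data'][0],
                                    arr.size, arr.dtype.itemsize)
    write_json(stream, {'shape': arr.shape,
                        'dtype': arr.dtype.str,
                        'length': len(compressed)})
    stream.write(compressed)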
def receive_msg(self):
    # receive (blocking) messages
    self.log.debug("Waiting for multipart message")
    frames = self.zmq_socket.recv_multipart(flags=0, copy=False, track=False)
    self.log.debug(" Multipart message received. Length: %d", len(frames))
    arr_desc = msgpack.unpackb(frames[0].bytes)
    self.log.debug('Array description: %s', str(arr_desc))

    self.log.debug("Unpacking numpy array from bytes")
    # Create an empty numpy array placeholder to unpack the compressed array into
    arr = np.empty(arr_desc['shape'], dtype=arr_desc['dtype'])
    dest_arr_ptr = arr.__array_interface__['data'][0]

    # Unfortunately the access to Frame.bytes makes a copy of the compressed
    # data. As we only read the compressed data it is not strictly necessary
    # to make a copy, however it seems impossible to get a string object out
    # without making a copy... We would have to modify the blosc python
    # bindings to add a decompress_ptr function which would work when given
    # a python memoryview object (i.e. a pointer)
    compressed_bytes = frames[1].bytes
    blosc.decompress_ptr(compressed_bytes, dest_arr_ptr)
    self.log.debug(" unpacked array: shape: %s", str(arr.shape))

    attr = msgpack.unpackb(frames[2].bytes)
    self.log.debug(" unpacked attributes: %s", str(attr))
    return arr, attr
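A hedged sketch of the matching sender this receiver implies: three frames carrying the msgpack'd array description, the blosc-compressed payload, and the msgpack'd attributes. The function name and the exact description keys beyond 'shape' and 'dtype' are assumptions.

import blosc
import msgpack

def send_msg(zmq_socket, arr, attr):
    # Frame 0: array description, frame 1: compressed payload, frame 2: attributes
    desc = msgpack.packb({'shape': list(arr.shape), 'dtype': str(arr.dtype)})
    compressed = blosc.compress_ptr(arr.__array_interface__['data'][0],
                                    arr.size, arr.dtype.itemsize)
    zmq_socket.send_multipart([desc, compressed, msgpack.packb(attr)])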
def decompress_ptr():
    # The original snippet referenced undefined names (address, num_elements,
    # typesize); here they are derived from a sample array, which also serves
    # as the destination, so decompress_ptr overwrites its buffer in place.
    arr = numpy.arange(10, dtype=numpy.int64)
    address = arr.__array_interface__['data'][0]
    num_elements, typesize = arr.size, arr.dtype.itemsize
    cx = blosc.compress_ptr(address, num_elements, typesize, clevel=0)
    blosc.decompress_ptr(cx, address)
print("Using *** %s *** compressor::" % cname) ctic = time.time() c = blosc.pack_array(in_, clevel=clevel, shuffle=True, cname=cname) ctoc = time.time() dtic = time.time() out = blosc.unpack_array(c) dtoc = time.time() assert ((in_ == out).all()) print(" Time for pack_array/unpack_array: %.3f/%.3f s." % \ (ctoc-ctic, dtoc-dtic), end='') print("\tCompr ratio: %.2f" % (in_.size * in_.dtype.itemsize * 1. / len(c))) ctic = time.time() c = blosc.compress_ptr(in_.__array_interface__['data'][0], in_.size, in_.dtype.itemsize, clevel=clevel, shuffle=True, cname=cname) ctoc = time.time() out = np.empty(in_.size, dtype=in_.dtype) dtic = time.time() blosc.decompress_ptr(c, out.__array_interface__['data'][0]) dtoc = time.time() assert ((in_ == out).all()) print(" Time for compress_ptr/decompress_ptr: %.3f/%.3f s." % \ (ctoc-ctic, dtoc-dtic), end='') print("\tCompr ratio: %.2f" % (in_.size * in_.dtype.itemsize * 1. / len(c)))
def put(self, compressed):
    """Decompress one chunk at the current write pointer and advance it."""
    bwritten = blosc.decompress_ptr(compressed, self.ptr)
    self.ptr += bwritten
    return bwritten
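A sketch of how such a pointer-advancing writer might be set up, decompressing consecutive chunks into one preallocated buffer; the class name and constructor are assumptions.

import blosc
import numpy as np

class ChunkWriter:
    def __init__(self, nbytes):
        # Preallocate the destination buffer and point at its start
        self.buf = np.empty(nbytes, dtype=np.uint8)
        self.ptr = self.buf.__array_interface__['data'][0]

    def put(self, compressed):
        bwritten = blosc.decompress_ptr(compressed, self.ptr)
        self.ptr += bwritten
        return bwritten

# usage: writer = ChunkWriter(total_size), then writer.put(c) per compressed chunk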
def decompress_ndarray(binary, output_array=None) -> numpy.ndarray:
    """
    decompress_ndarray(binary[, output_array=None])

    Decompress an array from a buffer. If output_array is provided, the data
    is decompressed into it, avoiding a fresh allocation (and possible
    out-of-memory errors) on every call.

    Parameters
    ----------
    binary: bytes
        Bytes of a compressed numpy array.
    output_array: numpy.ndarray
        If given, the decompressed data is written into output_array.
    """
    if not isinstance(binary, bytes):
        raise TypeError("Require byte type of input data.")
    if output_array is not None and not isinstance(
            output_array, (numpy.ndarray, numpy.memmap)):
        raise TypeError("Require numpy.ndarray type of output array.")

    cursor = 0
    # get header_size
    header_size = cvt_hex2dec(binary[cursor:cursor + 1])
    cursor += 1
    # get dtype_size
    dtype_size = cvt_hex2dec(binary[cursor:cursor + 1])
    cursor += 1
    # get dtype
    dtype = numpy.dtype(cvt_hex2str(binary[cursor:cursor + dtype_size]))
    cursor += dtype_size
    # get shape (two bytes per dimension, up to the end of the header)
    shape = []
    while cursor < header_size + 1:
        shape.append(cvt_hex2dec(binary[cursor:cursor + 2]))
        cursor += 2

    if output_array is None:
        output_array = numpy.empty(shape, dtype)
    else:
        if dtype != output_array.dtype:
            raise TypeError("Type of output array and data aren't the same!")
        if tuple(shape) != output_array.shape:
            if isinstance(output_array, numpy.memmap):
                output_array = numpy.memmap(output_array.filename,
                                            shape=shape, dtype=dtype)
            else:
                output_array.resize(shape, refcheck=False)

    blosc.decompress_ptr(binary[cursor:],
                         output_array.__array_interface__['data'][0])
    return output_array
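A sketch of the compress side this header format implies, under stated assumptions: cvt_hex2dec is read as big-endian int.from_bytes, cvt_hex2str as ASCII decoding, the dtype is encoded with numpy's dtype.str, and each dimension must fit in two bytes. Since the parser consumes header_size + 1 total header bytes, header_size here is 1 + dtype_size + 2 * ndim.

import blosc
import numpy

def compress_ndarray(array: numpy.ndarray, clevel: int = 5) -> bytes:
    # Header layout mirrored from the parser above: 1 byte header_size,
    # 1 byte dtype_size, the dtype string, then 2 bytes per dimension.
    dtype_bytes = array.dtype.str.encode('ascii')
    header_size = 1 + len(dtype_bytes) + 2 * array.ndim
    header = bytes([header_size, len(dtype_bytes)]) + dtype_bytes
    for dim in array.shape:
        header += dim.to_bytes(2, 'big')  # assumes each dim < 65536
    payload = blosc.compress_ptr(array.__array_interface__['data'][0],
                                 array.size, array.dtype.itemsize,
                                 clevel=clevel)
    return header + payload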
def compressed_bytes_to_arraydata(cbytes, size, dtype):
    arraydata = np.empty(size, dtype=dtype)
    blosc.decompress_ptr(cbytes, arraydata.__array_interface__['data'][0])
    return arraydata
arrays = [None] * 3
labels = [None] * 3
arrays[0] = np.arange(N, dtype=np.int64)
labels[0] = "the arange linear distribution"
arrays[1] = np.linspace(0, 1000, N)
labels[1] = "the linspace linear distribution"
# np.random.random_integers is deprecated; randint's upper bound is exclusive
arrays[2] = np.random.randint(0, 1001, N)
labels[2] = "the random distribution"

tic = time.time()
out_ = np.copy(arrays[0])
toc = time.time()
print(" *** np.copy() **** Time for memcpy(): %.3f s" % (toc - tic,))

for (in_, label) in zip(arrays, labels):
    print("\n*** %s ***" % label)
    for cname in blosc.compressor_list():
        ctic = time.time()
        c = blosc.compress_ptr(in_.__array_interface__['data'][0],
                               in_.size, in_.dtype.itemsize,
                               clevel=clevel, shuffle=True, cname=cname)
        ctoc = time.time()
        out = np.empty(in_.size, dtype=in_.dtype)
        dtic = time.time()
        blosc.decompress_ptr(c, out.__array_interface__['data'][0])
        dtoc = time.time()
        assert (in_ == out).all()
        print(" *** %-8s *** Time for comp/decomp: %.3f/%.3f s." %
              (cname, ctoc - ctic, dtoc - dtic), end='')
        print("\tCompr ratio: %6.2f" % (in_.size * in_.dtype.itemsize * 1. / len(c)))
def unpack(col):
    a = np.empty(col['size'], dtype=col['dtype'])
    blosc.decompress_ptr(bytes(col['data']), a.__array_interface__['data'][0])
    return a
def decompress(size, dtype, data):
    out = np.empty(size, dtype)
    blosc.decompress_ptr(data, out.__array_interface__['data'][0])
    return out
def decompress(self, blocks, out, **kwargs):
    '''Useful decompression kwargs: nthreads'''
    # TODO: controlled globally for now
    #nthreads = kwargs.pop('nthreads', 1)
    #blosc.set_nthreads(nthreads)
    _size = 0
    _pos = 0
    _buffer = None
    _partial_len = b''
    decompression_time = 0.
    bytesout = 0

    # Blosc code probably assumes contiguous buffer
    if not out.contiguous:
        raise ValueError(out.contiguous)

    # get the out address
    out = np.frombuffer(out, dtype=np.uint8).ctypes.data

    for block in blocks:
        block = memoryview(block).cast('c')
        try:
            block = block.toreadonly()  # python>=3.8 only
        except AttributeError:
            pass
        if not block.contiguous:
            raise ValueError(block.contiguous)
        while len(block):
            if not _size:
                # Don't know the (compressed) length of this block yet
                if len(_partial_len) + len(block) < 4:
                    _partial_len += block
                    break  # we've exhausted the data
                if _partial_len:
                    # If we started to fill a len key, finish filling it
                    remaining = 4 - len(_partial_len)
                    if remaining:
                        _partial_len += block[:remaining]
                        block = block[remaining:]
                    _size = struct.unpack('!I', _partial_len)[0]
                    _partial_len = b''
                else:
                    # Otherwise just read the len key directly
                    _size = struct.unpack('!I', block[:4])[0]
                    block = block[4:]

            if len(block) < _size or _buffer is not None:
                # If we have a partial block, or we're already filling a
                # buffer, use the buffer
                if _buffer is None:
                    # use numpy instead of bytearray so we can avoid zero
                    # initialization
                    _buffer = np.empty(_size, dtype=np.byte)
                    _pos = 0
                newbytes = min(_size - _pos, len(block))  # don't fill past the buffer len!
                _buffer[_pos:_pos + newbytes] = np.frombuffer(block[:newbytes], dtype=np.byte)
                _pos += newbytes
                block = block[newbytes:]
                if _pos == _size:
                    start = time.perf_counter()
                    n_thisout = blosc.decompress_ptr(memoryview(_buffer),
                                                     out + bytesout, **kwargs)
                    decompression_time += time.perf_counter() - start
                    bytesout += n_thisout
                    _buffer = None
                    _size = 0
            else:
                # We have at least one full block
                start = time.perf_counter()
                n_thisout = blosc.decompress_ptr(memoryview(block[:_size]),
                                                 out + bytesout, **kwargs)
                decompression_time += time.perf_counter() - start
                bytesout += n_thisout
                block = block[_size:]
                _size = 0
    return bytesout
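A hedged sketch of the framing this reader expects, as implied by the parsing above: each compressed block is preceded by its length as a 4-byte big-endian unsigned int ('!I'). The chunking policy and function name are assumptions.

import struct

import blosc
import numpy as np

def compress_blocks(array, block_items=1_000_000, clevel=5):
    # Split a 1-D array into chunks and emit length-prefixed compressed blocks
    # that the streaming decompress() above can write into one contiguous buffer.
    for start in range(0, array.size, block_items):
        chunk = np.ascontiguousarray(array[start:start + block_items])
        c = blosc.compress_ptr(chunk.__array_interface__['data'][0],
                               chunk.size, chunk.dtype.itemsize, clevel=clevel)
        yield struct.pack('!I', len(c)) + c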
def decompress(shape, dtype, compressed):
    array = np.empty(shape, dtype=dtype)
    blosc.decompress_ptr(compressed, array.__array_interface__['data'][0])
    return BloscItem(array)