import numpy as np
import bitshuffle


def test1():
    data = np.ones(1000, dtype=np.uint32)

    compressed = bitshuffle.compress_lz4(data)
    # compression ratio
    print(data.nbytes / compressed.nbytes)

    decompressed = bitshuffle.decompress_lz4(compressed, data.shape, data.dtype)
    assert np.array_equal(data, decompressed)
def __setitem__(self, index, value):
    assert index is Ellipsis
    assert self.dtype is not None

    data = numpy.empty(self.shape, self.dtype)
    data[...] = value

    # Write the bitshuffle/LZ4-compressed array to the backing file.
    # Open in binary mode; the Python 2 built-in `file` is gone in Python 3.
    with open(self.datafilename, 'wb') as ff:
        compressed = bitshuffle.compress_lz4(data)
        compressed.tofile(ff)
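# A minimal sketch of the matching read path (a hypothetical counterpart, not part
# of the original class), assuming the same object exposes `datafilename`, `shape`
# and `dtype`: the compressed bytes are read back as uint8 and handed to
# bitshuffle.decompress_lz4 together with the stored shape and dtype.
def __getitem__(self, index):
    assert index is Ellipsis
    assert self.dtype is not None

    with open(self.datafilename, 'rb') as ff:
        compressed = numpy.fromfile(ff, dtype=numpy.uint8)

    return bitshuffle.decompress_lz4(compressed, self.shape, self.dtype)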
import os

import h5py
import numpy as np
import bitshuffle


def test_hdf5(tmp_path):
    data = np.ones((100, 100), dtype=np.uint32)

    with h5py.File(os.path.join(tmp_path, 'test.h5'), 'w') as fh:
        dset = fh.create_dataset(
            'data', (5, 100, 100),
            maxshape=(None, 100, 100),
            chunks=(1, 100, 100),
            compression=bitshuffle.BSHUF_H5FILTER,
            compression_opts=(0, bitshuffle.BSHUF_H5_COMPRESS_LZ4),
            dtype=np.uint32)

        compressed = bitshuffle.compress_lz4(data)
        for i in range(5):
            dset.id.write_direct_chunk((i, 0, 0), compressed.tobytes())

    with h5py.File(os.path.join(tmp_path, 'test.h5'), 'r') as fh:
        for img in fh['data']:
            assert np.array_equal(data, img)
def pack_data(numpy_array, dtype):
    """
    Compress the provided numpy array.
    :param numpy_array: Array to compress.
    :param dtype: Data type (Numpy).
    :return: Header (unpacked length, compression block size) + Compressed data
    """
    # Uncompressed data length in bytes, big endian, int64 (long long)
    unpacked_length_bytes = struct.pack(">q", numpy_array.nbytes)

    n_bytes_per_element = numpy.dtype(dtype).itemsize
    compression_block_size = BitshuffleLZ4.get_compression_block_size(n_bytes_per_element)

    # We multiply the compression block size by n_bytes_per_element, because the HDF5 filter does so.
    # https://github.com/kiyo-masui/bitshuffle/blob/04e58bd553304ec26e222654f1d9b6ff64e97d10/src/bshuf_h5filter.c#L167
    header_compression_block_size = compression_block_size * n_bytes_per_element

    # Compression block size, big endian, int32 (int).
    block_size_bytes = struct.pack(">i", header_compression_block_size)

    compressed_bytes = bitshuffle.compress_lz4(numpy_array, compression_block_size).tobytes()

    return unpacked_length_bytes + block_size_bytes + compressed_bytes
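# A possible inverse of pack_data, for reference (a sketch, not taken from the
# original source): parse the 12-byte header that pack_data prepends, then hand
# the LZ4 payload to bitshuffle.decompress_lz4. The element count is recovered
# from the unpacked length and the dtype item size; the header does not store
# the array shape, so a flat array is returned.
import struct

import numpy
import bitshuffle


def unpack_data(raw_bytes, dtype):
    # First 8 bytes: uncompressed length in bytes, big endian.
    unpacked_length = struct.unpack(">q", raw_bytes[:8])[0]
    # Next 4 bytes: block size in bytes, big endian; convert back to elements.
    header_block_size = struct.unpack(">i", raw_bytes[8:12])[0]

    n_bytes_per_element = numpy.dtype(dtype).itemsize
    block_size = header_block_size // n_bytes_per_element
    n_elements = unpacked_length // n_bytes_per_element

    compressed = numpy.frombuffer(raw_bytes[12:], dtype=numpy.uint8)
    return bitshuffle.decompress_lz4(compressed, (n_elements,), numpy.dtype(dtype), block_size)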
import bitshuffle


def bitshuffle_lz4_encode(data, level=1, blocksize=0, out=None):
    """Compress LZ4 with Bitshuffle."""
    # `level` and `out` are accepted for API compatibility but are not used.
    return bitshuffle.compress_lz4(data, blocksize)
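# A matching decode wrapper in the same style (a sketch; the argument names are
# assumptions, not taken from the original source): bitshuffle.decompress_lz4
# needs the output shape and dtype, since the raw LZ4 stream does not carry them.
def bitshuffle_lz4_decode(data, shape, dtype, blocksize=0, out=None):
    """Decompress LZ4 with Bitshuffle."""
    return bitshuffle.decompress_lz4(data, shape, dtype, blocksize)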
def postprocess_raw(source, dest, disabled_modules=(), index=None, compression=False, batch_size=100):
    # a function for 'visititems' should have the args (name, object)
    def _visititems(name, obj):
        if isinstance(obj, h5py.Group):
            h5_dest.create_group(name)

        elif isinstance(obj, h5py.Dataset):
            dset_source = h5_source[name]

            # process all but the raw data
            if name != data_dset:
                if name.startswith("data"):
                    # datasets with data per image, so indexing should be applied
                    if index is None:
                        data = dset_source[:]
                    else:
                        data = dset_source[index, :]

                    args = {"shape": data.shape}
                    h5_dest.create_dataset_like(name, dset_source, data=data, **args)
                else:
                    h5_dest.create_dataset_like(name, dset_source, data=dset_source)

        else:
            raise TypeError(f"Unknown h5py object type {obj}")

        # copy group/dataset attributes if it's not a dataset with the actual data
        if name != data_dset:
            for key, value in h5_source[name].attrs.items():
                h5_dest[name].attrs[key] = value

    with h5py.File(source, "r") as h5_source, h5py.File(dest, "w") as h5_dest:
        detector_name = h5_source["general/detector_name"][()].decode()
        data_dset = f"data/{detector_name}/data"

        # traverse the source file and copy/index all datasets, except the raw data
        h5_source.visititems(_visititems)

        # now process the raw data
        dset = h5_source[data_dset]

        args = dict()
        if index is None:
            n_images = dset.shape[0]
        else:
            index = np.array(index)
            n_images = len(index)

        n_modules = dset.shape[1] // MODULE_SIZE_Y
        out_shape = (MODULE_SIZE_Y * (n_modules - len(disabled_modules)), MODULE_SIZE_X)

        args["shape"] = (n_images, *out_shape)
        args["maxshape"] = (n_images, *out_shape)
        args["chunks"] = (1, *out_shape)

        if compression:
            args.update(compargs)

        h5_dest.create_dataset_like(data_dset, dset, **args)

        # calculate and save module_map
        module_map = []
        tmp = 0
        for ind in range(n_modules):
            if ind in disabled_modules:
                module_map.append(-1)
            else:
                module_map.append(tmp)
                tmp += 1

        h5_dest[f"data/{detector_name}/module_map"] = np.tile(module_map, (n_images, 1))

        # prepare buffers to be reused for every batch
        read_buffer = np.empty((batch_size, *dset.shape[1:]), dtype=DTYPE)
        out_buffer = np.zeros((batch_size, *out_shape), dtype=DTYPE)

        # process and write data in batches
        for batch_start_ind in range(0, n_images, batch_size):
            batch_range = range(batch_start_ind, min(batch_start_ind + batch_size, n_images))

            if index is None:
                batch_ind = np.array(batch_range)
            else:
                batch_ind = index[batch_range]

            # TODO: avoid unnecessary buffers
            read_buffer_view = read_buffer[:len(batch_ind)]
            out_buffer_view = out_buffer[:len(batch_ind)]

            # Avoid a stride-bottleneck, see https://github.com/h5py/h5py/issues/977
            if np.sum(np.diff(batch_ind)) == len(batch_ind) - 1:
                # consecutive index values
                dset.read_direct(read_buffer_view, source_sel=np.s_[batch_ind])
            else:
                for i, j in enumerate(batch_ind):
                    dset.read_direct(read_buffer_view, source_sel=np.s_[j], dest_sel=np.s_[i])

            for i, m in enumerate(module_map):
                if m == -1:
                    continue

                read_slice = read_buffer_view[:, i * MODULE_SIZE_Y:(i + 1) * MODULE_SIZE_Y, :]
                out_slice = out_buffer_view[:, m * MODULE_SIZE_Y:(m + 1) * MODULE_SIZE_Y, :]
                out_slice[:] = read_slice

            bytes_num_elem = struct.pack(">q", out_shape[0] * out_shape[1] * DTYPE_SIZE)
            bytes_block_size = struct.pack(">i", BLOCK_SIZE * DTYPE_SIZE)
            header = bytes_num_elem + bytes_block_size

            for pos, im in zip(batch_range, out_buffer_view):
                if compression:
                    byte_array = header + bitshuffle.compress_lz4(im, BLOCK_SIZE).tobytes()
                else:
                    byte_array = im.tobytes()

                h5_dest[data_dset].id.write_direct_chunk((pos, 0, 0), byte_array)
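# A sketch of decoding one of the chunks written above by hand (assumes the same
# module-level constants DTYPE/DTYPE_SIZE and a file produced by postprocess_raw;
# when the bitshuffle HDF5 filter plugin is registered, plain slicing of the
# dataset achieves the same result).
import struct

import h5py
import numpy as np
import bitshuffle


def read_chunk(filename, data_dset, pos, out_shape):
    with h5py.File(filename, "r") as h5f:
        # read_direct_chunk returns the filter mask and the raw (still compressed) chunk bytes
        _filter_mask, chunk = h5f[data_dset].id.read_direct_chunk((pos, 0, 0))

    # 12-byte header written by postprocess_raw: uncompressed size (">q") + block size in bytes (">i")
    n_bytes = struct.unpack(">q", chunk[:8])[0]
    block_size_bytes = struct.unpack(">i", chunk[8:12])[0]
    assert n_bytes == np.prod(out_shape) * DTYPE_SIZE

    payload = np.frombuffer(chunk[12:], dtype=np.uint8)
    return bitshuffle.decompress_lz4(
        payload, out_shape, np.dtype(DTYPE), block_size_bytes // DTYPE_SIZE
    )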
def _process_data(self, h5_dest, index, roi, compression, dtype, batch_size):
    args = dict()

    data_dset = self.file[self._data_dset_name]

    if index is None:
        n_images = data_dset.shape[0]
    else:
        n_images = len(index)

    h5_dest[f"data/{self.detector_name}/conversion_factor"] = self.handler.factor or np.NaN

    pixel_mask = self.get_pixel_mask()

    if roi is None:
        # save a pixel mask
        h5_dest[f"data/{self.detector_name}/pixel_mask"] = pixel_mask

        image_shape = self.get_shape_out()

        args["shape"] = (n_images, *image_shape)
        args["maxshape"] = (n_images, *image_shape)
        args["chunks"] = (1, *image_shape)

        if dtype is None:
            args["dtype"] = self.get_dtype_out()
        else:
            args["dtype"] = dtype

        if compression:
            args.update(compargs)

        h5_dest.create_dataset_like(data_dset.name, data_dset, **args)

    else:
        if len(roi) == 4 and all(isinstance(v, int) for v in roi):
            # this is a single tuple with coordinates, so wrap it in another tuple
            roi = (roi, )

        h5_dest.create_dataset(f"data/{self.detector_name}/n_roi", data=len(roi))
        for i, (roi_y1, roi_y2, roi_x1, roi_x2) in enumerate(roi):
            h5_dest.create_dataset(
                f"data/{self.detector_name}/roi_{i}",
                data=[(roi_y1, roi_y2), (roi_x1, roi_x2)],
            )

            # save a pixel mask for ROI
            h5_dest.create_dataset(
                f"data/{self.detector_name}/pixel_mask_roi_{i}",
                data=pixel_mask[slice(roi_y1, roi_y2), slice(roi_x1, roi_x2)],
            )

            # prepare ROI datasets
            roi_shape = (roi_y2 - roi_y1, roi_x2 - roi_x1)

            args["shape"] = (n_images, *roi_shape)
            args["maxshape"] = (n_images, *roi_shape)
            args["chunks"] = (1, *roi_shape)

            if dtype is None:
                args["dtype"] = self.get_dtype_out()
            else:
                args["dtype"] = dtype

            if compression:
                args.update(compargs)

            h5_dest.create_dataset(f"{data_dset.name}_roi_{i}", **args)

    # prepare buffers to be reused for every batch
    read_buffer = np.empty((batch_size, *data_dset.shape[1:]), dtype=data_dset.dtype)

    out_shape = self.get_shape_out()
    out_dtype = self.get_dtype_out()
    out_buffer = np.zeros((batch_size, *out_shape), dtype=out_dtype)

    # process and write data in batches
    for batch_start_ind in range(0, n_images, batch_size):
        batch_range = np.arange(batch_start_ind, min(batch_start_ind + batch_size, n_images))

        if index is None:
            batch_ind = batch_range
        else:
            batch_ind = index[batch_range]

        read_buffer_view = read_buffer[:len(batch_ind)]
        out_buffer_view = out_buffer[:len(batch_ind)]

        # Avoid a stride-bottleneck, see https://github.com/h5py/h5py/issues/977
        if np.sum(np.diff(batch_ind)) == len(batch_ind) - 1:
            # consecutive index values
            data_dset.read_direct(read_buffer_view, source_sel=np.s_[batch_ind])
        else:
            for i, j in enumerate(batch_ind):
                data_dset.read_direct(read_buffer_view, source_sel=np.s_[j], dest_sel=np.s_[i])

        # Process data
        out_buffer_view = self.handler.process(
            read_buffer_view,
            conversion=self.conversion,
            mask=self.mask,
            gap_pixels=self.gap_pixels,
            double_pixels=self.double_pixels,
            geometry=self.geometry,
            parallel=self.parallel,
            out=out_buffer_view,
        )

        out_buffer_view = np.ascontiguousarray(out_buffer_view)

        if roi is None:
            dtype_size = out_dtype.itemsize
            bytes_num_elem = struct.pack(">q", image_shape[0] * image_shape[1] * dtype_size)
            bytes_block_size = struct.pack(">i", BLOCK_SIZE * dtype_size)
            header = bytes_num_elem + bytes_block_size

            for pos, im in zip(batch_range, out_buffer_view):
                if compression:
                    byte_array = header + bitshuffle.compress_lz4(im, BLOCK_SIZE).tobytes()
                else:
                    byte_array = im.tobytes()

                h5_dest[data_dset.name].id.write_direct_chunk((pos, 0, 0), byte_array)

        else:
            for i, (roi_y1, roi_y2, roi_x1, roi_x2) in enumerate(roi):
                roi_data = out_buffer_view[:, slice(roi_y1, roi_y2), slice(roi_x1, roi_x2)]
                h5_dest[f"{data_dset.name}_roi_{i}"][batch_range] = roi_data