Example #1
import numpy as np
import bitshuffle


def test1():
    # Round trip: compress a small array, report the compression ratio, and
    # verify lossless decompression.
    data = np.ones(1000, dtype=np.uint32)
    compressed = bitshuffle.compress_lz4(data)
    print(data.nbytes / compressed.nbytes)
    decompressed = bitshuffle.decompress_lz4(compressed, data.shape,
                                             data.dtype)
    assert np.array_equal(data, decompressed)
Example #2
    def __setitem__(self, index, value):
        # Only whole-array assignment is supported.
        assert index is Ellipsis
        assert self.dtype is not None
        data = numpy.empty(self.shape, self.dtype)
        data[...] = value
        # Write the compressed bytes in binary mode (the Python 2 builtin
        # file() was replaced with open(..., 'wb')).
        with open(self.datafilename, 'wb') as ff:
            compressed = bitshuffle.compress_lz4(data)
            compressed.tofile(ff)
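
The read path is not shown in the original snippet. A minimal __getitem__ sketch (hypothetical, assuming the same self.datafilename, self.shape and self.dtype attributes and the default bitshuffle block size) could restore the array with bitshuffle.decompress_lz4:

    def __getitem__(self, index):
        # Hypothetical counterpart to __setitem__ above, not from the
        # original source: read the compressed bytes back and decompress.
        assert index is Ellipsis
        with open(self.datafilename, 'rb') as ff:
            compressed = numpy.fromfile(ff, dtype=numpy.uint8)
        return bitshuffle.decompress_lz4(compressed, self.shape, self.dtype)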
Example #3
import os

import numpy as np
import h5py
import bitshuffle


def test_hdf5(tmp_path):
    data = np.ones((100, 100), dtype=np.uint32)
    with h5py.File(os.path.join(tmp_path, 'test.h5'), 'w') as fh:
        dset = fh.create_dataset(
            'data', (5, 100, 100),
            maxshape=(None, 100, 100),
            chunks=(1, 100, 100),
            compression=bitshuffle.BSHUF_H5FILTER,
            compression_opts=(0, bitshuffle.BSHUF_H5_COMPRESS_LZ4),
            dtype=np.uint32)

        # write_direct_chunk bypasses the filter pipeline, so each chunk is
        # stored exactly as the pre-compressed bytes provided here.
        compressed = bitshuffle.compress_lz4(data)
        for i in range(5):
            dset.id.write_direct_chunk((i, 0, 0), compressed.tobytes())

    with h5py.File(os.path.join(tmp_path, 'test.h5'), 'r') as fh:
        for img in fh['data']:
            assert np.array_equal(data, img)
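
Going the other way, h5py can hand back the stored chunk bytes without running the filter pipeline. The helper below is a hypothetical sketch (read_chunk_manually is not part of the original test); it reuses the imports from this example and undoes the bitshuffle/LZ4 compression in Python:

def read_chunk_manually(dset, chunk_index, shape, dtype):
    # Hypothetical helper: fetch the raw chunk bytes, skipping the HDF5
    # filter pipeline, then decompress them with bitshuffle in Python.
    filter_mask, chunk_bytes = dset.id.read_direct_chunk((chunk_index, 0, 0))
    # np.frombuffer returns a read-only view; copy to get a writable array
    # for bitshuffle.decompress_lz4.
    compressed = np.frombuffer(chunk_bytes, dtype=np.uint8).copy()
    return bitshuffle.decompress_lz4(compressed, shape, dtype)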
Example #4
    def pack_data(numpy_array, dtype):
        """
        Compress the provided numpy array.
        :param numpy_array: Array to compress.
        :param dtype: Data type (Numpy).
        :return: Header (unpacked length, compression block size) + Compressed data
        """
        # Uncompressed data length in bytes, big endian, int64 (long long)
        unpacked_length_bytes = struct.pack(">q", numpy_array.nbytes)

        n_bytes_per_element = numpy.dtype(dtype).itemsize
        compression_block_size = BitshuffleLZ4.get_compression_block_size(n_bytes_per_element)

        # We multiply the compression block size by the n_bytes_per_element, because the HDF5 filter does so.
        # https://github.com/kiyo-masui/bitshuffle/blob/04e58bd553304ec26e222654f1d9b6ff64e97d10/src/bshuf_h5filter.c#L167
        header_compression_block_size = compression_block_size * n_bytes_per_element
        # Compression block size, big endian, int32 (int).
        block_size_bytes = struct.pack(">i", header_compression_block_size)

        compressed_bytes = bitshuffle.compress_lz4(numpy_array, compression_block_size).tobytes()

        return unpacked_length_bytes + block_size_bytes + compressed_bytes
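
The matching unpack routine is not part of this example. The sketch below is hypothetical and reverses pack_data under the assumption that the caller also knows the original array shape (the shape parameter is an addition, not from the original code):

    def unpack_data(packed_bytes, shape, dtype):
        """
        Hypothetical inverse of pack_data: parse the header and decompress.
        :param packed_bytes: Header + compressed data, as produced by pack_data.
        :param shape: Shape of the original array (assumed to be known).
        :param dtype: Data type (Numpy).
        :return: Decompressed numpy array.
        """
        # Header layout: uncompressed length (int64, big endian) followed by
        # the compression block size in bytes (int32, big endian).
        unpacked_length = struct.unpack(">q", packed_bytes[:8])[0]
        header_compression_block_size = struct.unpack(">i", packed_bytes[8:12])[0]

        n_bytes_per_element = numpy.dtype(dtype).itemsize
        assert unpacked_length == int(numpy.prod(shape)) * n_bytes_per_element

        # pack_data multiplied the block size by n_bytes_per_element, so divide
        # it back out to recover the block size in elements.
        compression_block_size = header_compression_block_size // n_bytes_per_element

        # numpy.frombuffer returns a read-only view; copy to get a writable array.
        compressed = numpy.frombuffer(packed_bytes[12:], dtype=numpy.uint8).copy()
        return bitshuffle.decompress_lz4(compressed, shape, numpy.dtype(dtype),
                                         compression_block_size)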
Example #5
def bitshuffle_lz4_encode(data, level=1, blocksize=0, out=None):
    """Compress LZ4 with Bitshuffle."""
    # 'level' and 'out' are accepted for interface compatibility only; they
    # are not used by bitshuffle.compress_lz4.
    return bitshuffle.compress_lz4(data, blocksize)
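
A decoding counterpart in the same style could wrap bitshuffle.decompress_lz4. The sketch below is not part of the original snippet; its shape and dtype parameters are assumptions about how the caller tracks the uncompressed layout:

def bitshuffle_lz4_decode(data, shape, dtype, blocksize=0, out=None):
    """Decompress LZ4 with Bitshuffle (hypothetical counterpart)."""
    # 'out' is accepted for interface compatibility only; it is not used.
    return bitshuffle.decompress_lz4(data, shape, dtype, blocksize)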
Example #6
def postprocess_raw(source,
                    dest,
                    disabled_modules=(),
                    index=None,
                    compression=False,
                    batch_size=100):
    # a function for 'visititems' should have the args (name, object)
    def _visititems(name, obj):
        if isinstance(obj, h5py.Group):
            h5_dest.create_group(name)

        elif isinstance(obj, h5py.Dataset):
            dset_source = h5_source[name]

            # process all but the raw data
            if name != data_dset:
                if name.startswith("data"):
                    # datasets with data per image, so indexing should be applied
                    if index is None:
                        data = dset_source[:]
                    else:
                        data = dset_source[index, :]

                    args = {"shape": data.shape}
                    h5_dest.create_dataset_like(name,
                                                dset_source,
                                                data=data,
                                                **args)
                else:
                    h5_dest.create_dataset_like(name,
                                                dset_source,
                                                data=dset_source)

        else:
            raise TypeError(f"Unknown h5py object type {obj}")

        # copy group/dataset attributes if it's not a dataset with the actual data
        if name != data_dset:
            for key, value in h5_source[name].attrs.items():
                h5_dest[name].attrs[key] = value

    with h5py.File(source, "r") as h5_source, h5py.File(dest, "w") as h5_dest:
        detector_name = h5_source["general/detector_name"][()].decode()
        data_dset = f"data/{detector_name}/data"

        # traverse the source file and copy/index all datasets, except the raw data
        h5_source.visititems(_visititems)

        # now process the raw data
        dset = h5_source[data_dset]

        args = dict()
        if index is None:
            n_images = dset.shape[0]
        else:
            index = np.array(index)
            n_images = len(index)

        n_modules = dset.shape[1] // MODULE_SIZE_Y
        out_shape = (MODULE_SIZE_Y * (n_modules - len(disabled_modules)),
                     MODULE_SIZE_X)

        args["shape"] = (n_images, *out_shape)
        args["maxshape"] = (n_images, *out_shape)
        args["chunks"] = (1, *out_shape)

        if compression:
            args.update(compargs)

        h5_dest.create_dataset_like(data_dset, dset, **args)

        # calculate and save module_map
        module_map = []
        tmp = 0
        for ind in range(n_modules):
            if ind in disabled_modules:
                module_map.append(-1)
            else:
                module_map.append(tmp)
                tmp += 1

        h5_dest[f"data/{detector_name}/module_map"] = np.tile(
            module_map, (n_images, 1))

        # prepare buffers to be reused for every batch
        read_buffer = np.empty((batch_size, *dset.shape[1:]), dtype=DTYPE)
        out_buffer = np.zeros((batch_size, *out_shape), dtype=DTYPE)

        # process and write data in batches
        for batch_start_ind in range(0, n_images, batch_size):
            batch_range = range(batch_start_ind,
                                min(batch_start_ind + batch_size, n_images))

            if index is None:
                batch_ind = np.array(batch_range)
            else:
                batch_ind = index[batch_range]

            # TODO: avoid unnecessary buffers
            read_buffer_view = read_buffer[:len(batch_ind)]
            out_buffer_view = out_buffer[:len(batch_ind)]

            # Avoid a stride-bottleneck, see https://github.com/h5py/h5py/issues/977
            if np.sum(np.diff(batch_ind)) == len(batch_ind) - 1:
                # consecutive index values
                dset.read_direct(read_buffer_view, source_sel=np.s_[batch_ind])
            else:
                for i, j in enumerate(batch_ind):
                    dset.read_direct(read_buffer_view,
                                     source_sel=np.s_[j],
                                     dest_sel=np.s_[i])

            for i, m in enumerate(module_map):
                if m == -1:
                    continue

                read_slice = read_buffer_view[:, i * MODULE_SIZE_Y:(i + 1) * MODULE_SIZE_Y, :]
                out_slice = out_buffer_view[:, m * MODULE_SIZE_Y:(m + 1) * MODULE_SIZE_Y, :]
                out_slice[:] = read_slice

            # Bitshuffle HDF5 chunk header: uncompressed size in bytes (int64,
            # big endian) followed by the compression block size in bytes
            # (int32, big endian).
            bytes_num_elem = struct.pack(
                ">q", out_shape[0] * out_shape[1] * DTYPE_SIZE)
            bytes_block_size = struct.pack(">i", BLOCK_SIZE * DTYPE_SIZE)
            header = bytes_num_elem + bytes_block_size

            for pos, im in zip(batch_range, out_buffer_view):
                if compression:
                    byte_array = header + bitshuffle.compress_lz4(
                        im, BLOCK_SIZE).tobytes()
                else:
                    byte_array = im.tobytes()

                h5_dest[data_dset].id.write_direct_chunk((pos, 0, 0),
                                                         byte_array)
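
postprocess_raw relies on module-level names that are not shown in the snippet (MODULE_SIZE_Y, MODULE_SIZE_X, DTYPE, DTYPE_SIZE, BLOCK_SIZE, compargs). The definitions below are an illustrative assumption, not values taken from the original project; only the registered bitshuffle HDF5 filter id (32008) and the LZ4 option value (2) are fixed by the filter itself:

import numpy as np

# Assumed module-level constants for postprocess_raw (illustrative values).
MODULE_SIZE_Y = 512           # detector module height in pixels (assumption)
MODULE_SIZE_X = 1024          # detector module width in pixels (assumption)
DTYPE = np.dtype(np.uint16)   # raw pixel data type (assumption)
DTYPE_SIZE = DTYPE.itemsize
BLOCK_SIZE = 2048             # bitshuffle block size in elements (assumption)

# Dataset creation keywords enabling the registered bitshuffle HDF5 filter;
# the second option value, 2, selects LZ4 compression inside the filter.
compargs = {
    "compression": 32008,
    "compression_opts": (BLOCK_SIZE, 2),
}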
Example #7
    def _process_data(self, h5_dest, index, roi, compression, dtype,
                      batch_size):
        args = dict()

        data_dset = self.file[self._data_dset_name]
        if index is None:
            n_images = data_dset.shape[0]
        else:
            n_images = len(index)

        h5_dest[f"data/{self.detector_name}/conversion_factor"] = self.handler.factor or np.nan

        pixel_mask = self.get_pixel_mask()

        if roi is None:
            # save a pixel mask
            h5_dest[f"data/{self.detector_name}/pixel_mask"] = pixel_mask

            image_shape = self.get_shape_out()

            args["shape"] = (n_images, *image_shape)
            args["maxshape"] = (n_images, *image_shape)
            args["chunks"] = (1, *image_shape)

            if dtype is None:
                args["dtype"] = self.get_dtype_out()
            else:
                args["dtype"] = dtype

            if compression:
                args.update(compargs)

            h5_dest.create_dataset_like(data_dset.name, data_dset, **args)

        else:
            if len(roi) == 4 and all(isinstance(v, int) for v in roi):
                # this is a single tuple with coordinates, so wrap it in another tuple
                roi = (roi, )

            h5_dest.create_dataset(f"data/{self.detector_name}/n_roi",
                                   data=len(roi))
            for i, (roi_y1, roi_y2, roi_x1, roi_x2) in enumerate(roi):
                h5_dest.create_dataset(f"data/{self.detector_name}/roi_{i}",
                                       data=[(roi_y1, roi_y2),
                                             (roi_x1, roi_x2)])

                # save a pixel mask for ROI
                h5_dest.create_dataset(
                    f"data/{self.detector_name}/pixel_mask_roi_{i}",
                    data=pixel_mask[slice(roi_y1, roi_y2),
                                    slice(roi_x1, roi_x2)],
                )

                # prepare ROI datasets
                roi_shape = (roi_y2 - roi_y1, roi_x2 - roi_x1)

                args["shape"] = (n_images, *roi_shape)
                args["maxshape"] = (n_images, *roi_shape)
                args["chunks"] = (1, *roi_shape)

                if dtype is None:
                    args["dtype"] = self.get_dtype_out()
                else:
                    args["dtype"] = dtype

                if compression:
                    args.update(compargs)

                h5_dest.create_dataset(f"{data_dset.name}_roi_{i}", **args)

        # prepare buffers to be reused for every batch
        read_buffer = np.empty((batch_size, *data_dset.shape[1:]),
                               dtype=data_dset.dtype)

        out_shape = self.get_shape_out()
        out_dtype = self.get_dtype_out()
        out_buffer = np.zeros((batch_size, *out_shape), dtype=out_dtype)

        # process and write data in batches
        for batch_start_ind in range(0, n_images, batch_size):
            batch_range = np.arange(
                batch_start_ind, min(batch_start_ind + batch_size, n_images))

            if index is None:
                batch_ind = batch_range
            else:
                batch_ind = index[batch_range]

            read_buffer_view = read_buffer[:len(batch_ind)]
            out_buffer_view = out_buffer[:len(batch_ind)]

            # Avoid a stride-bottleneck, see https://github.com/h5py/h5py/issues/977
            if np.sum(np.diff(batch_ind)) == len(batch_ind) - 1:
                # consecutive index values
                data_dset.read_direct(read_buffer_view,
                                      source_sel=np.s_[batch_ind])
            else:
                for i, j in enumerate(batch_ind):
                    data_dset.read_direct(read_buffer_view,
                                          source_sel=np.s_[j],
                                          dest_sel=np.s_[i])

            # Process data
            out_buffer_view = self.handler.process(
                read_buffer_view,
                conversion=self.conversion,
                mask=self.mask,
                gap_pixels=self.gap_pixels,
                double_pixels=self.double_pixels,
                geometry=self.geometry,
                parallel=self.parallel,
                out=out_buffer_view,
            )

            out_buffer_view = np.ascontiguousarray(out_buffer_view)

            if roi is None:
                dtype_size = out_dtype.itemsize
                # Bitshuffle HDF5 chunk header: uncompressed size in bytes
                # (int64, big endian) followed by the compression block size
                # in bytes (int32, big endian).
                bytes_num_elem = struct.pack(
                    ">q", image_shape[0] * image_shape[1] * dtype_size)
                bytes_block_size = struct.pack(">i", BLOCK_SIZE * dtype_size)
                header = bytes_num_elem + bytes_block_size

                for pos, im in zip(batch_range, out_buffer_view):
                    if compression:
                        byte_array = header + bitshuffle.compress_lz4(
                            im, BLOCK_SIZE).tobytes()
                    else:
                        byte_array = im.tobytes()

                    h5_dest[data_dset.name].id.write_direct_chunk((pos, 0, 0),
                                                                  byte_array)

            else:
                for i, (roi_y1, roi_y2, roi_x1, roi_x2) in enumerate(roi):
                    roi_data = out_buffer_view[:, roi_y1:roi_y2, roi_x1:roi_x2]
                    h5_dest[f"{data_dset.name}_roi_{i}"][batch_range] = roi_data