Example #1
def test_vcf_to_zarr__parallel_compressor_and_filters(shared_datadir, is_path,
                                                      tmp_path):
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
                         is_path)
    output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
    regions = ["20", "21"]

    default_compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE)
    variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
    encoding = dict(
        variant_id=dict(compressor=variant_id_compressor),
        variant_id_mask=dict(filters=None),
    )
    vcf_to_zarr(
        path,
        output,
        regions=regions,
        chunk_length=5_000,
        compressor=default_compressor,
        encoding=encoding,
    )

    # look at actual Zarr store to check compressor and filters
    z = zarr.open(output)
    assert z["call_genotype"].compressor == default_compressor
    assert z["call_genotype"].filters is None
    assert z["call_genotype_mask"].filters == [PackBits()]

    assert z["variant_id"].compressor == variant_id_compressor
    assert z["variant_id_mask"].filters is None
Example #2
def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path,
                                             tmp_path):
    path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
    output = tmp_path.joinpath("vcf.zarr").as_posix()

    default_compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE)
    variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
    encoding = dict(
        variant_id=dict(compressor=variant_id_compressor),
        variant_id_mask=dict(filters=None),
    )
    vcf_to_zarr(
        path,
        output,
        chunk_length=5,
        chunk_width=2,
        compressor=default_compressor,
        encoding=encoding,
    )

    # look at actual Zarr store to check compressor and filters
    z = zarr.open(output)
    assert z["call_genotype"].compressor == default_compressor
    assert z["call_genotype"].filters is None
    assert z["call_genotype_mask"].filters == [PackBits()]

    assert z["variant_id"].compressor == variant_id_compressor
    assert z["variant_id_mask"].filters is None
Example #3
def reencode_usgs(src_fld, target):

    usgs_data_raw = []

    for fn in tqdm.tqdm(os.listdir(src_fld)):
        if not fn.endswith('.csv'):
            continue
        usgs_data_raw.extend(_read_usgs_csv(os.path.join(src_fld, fn)))

    usgs_data = [quake for quake in usgs_data_raw if _quake_ok(quake)]
    print(
        f'Quakes ok={len(usgs_data):d} broken={len(usgs_data_raw)-len(usgs_data):d}'
    )

    usgs_data.sort(key=lambda x: x['time'])

    usgs_zarr = zarr.open(
        target,
        'w',
    )

    usgs_zarr.zeros(
        'time',
        shape=(len(usgs_data), ),
        chunks=(10000, ),
        dtype='u8',
        compressor=Blosc(cname='lz4'),
    )
    usgs_zarr['time'][:] = [
        int(quake['time'].timestamp() * 1000) for quake in usgs_data
    ]

    fields = [
        'lon', 'lat', 'depth', 'mag', 'horizontalError', 'depthError',
        'magError'
    ]
    usgs_zarr.zeros(
        'data',
        shape=(len(fields), len(usgs_data)),
        chunks=(
            len(fields),
            10000,
        ),
        dtype='f4',
        compressor=Blosc(cname='lz4'),
    )
    usgs_zarr.attrs['fields'] = fields
    for field_index, field in enumerate(fields):
        usgs_zarr['data'][field_index, :] = [
            quake[field] for quake in usgs_data
        ]
Example #4
def execute(array, write_path=None, **kwargs):
    """
    """

    with ClusterWrap.cluster(**kwargs) as cluster:

        # if user wants to write to disk
        if write_path is not None:
            compressor = Blosc(
                cname='zstd',
                clevel=4,
                shuffle=Blosc.BITSHUFFLE,
            )
            zarr_disk = zarr.open(
                write_path,
                'w',
                shape=array.shape,
                chunks=array.chunksize,
                dtype=array.dtype,
                compressor=compressor,
            )
            to_zarr(array, zarr_disk)
            return zarr_disk

        # otherwise user wants result returned to local process
        return array.compute()
Example #5
def write_dense(zarr_cache_dir, key, dense_name, chunk_factors):
    compressor = Blosc(cname='blosclz', clevel=3, shuffle=Blosc.SHUFFLE)
    store = zarr.open(zarr_cache_dir, mode='a')

    if (len(store[key]) == 3):
        # assume csr sparse matrix - parse as such
        array_keys = list(store[key].array_keys())
        X = sp.csr_matrix((store[key + "/" + array_keys[0]],
                           store[key + "/" + array_keys[1]],
                           store[key + "/" + array_keys[2]]))
    else:
        # assume dense matrix
        # TODO: checking for other cases of sparse matrices/mixed groups
        X = store[key]
    if (dense_name not in store) or (X.shape != store[dense_name].shape):
        store.create_dataset(dense_name,
                             shape=X.shape,
                             dtype=X.dtype,
                             fill_value=0,
                             chunks=(int(X.shape[0] / chunk_factors[0]),
                                     int(X.shape[1] / chunk_factors[1])),
                             compressor=compressor,
                             overwrite=True)
    if sp.issparse(X):
        X = X.tocoo()
        store[dense_name].set_coordinate_selection((X.row, X.col), X.data)
    else:
        store[dense_name] = X
    return None


### end celery queue task function definitions
Example #6
def hdf2zarr(hdf_file,
             zarr_file=None,
             hdf_key=[],
             zarr_key=[],
             chunksize=None):

    hf = h5py.File(hdf_file, 'r')

    if zarr_file is None:
        zarr_file = hdf_file[:-4] + '.zarr'

    zf = zarr.open(zarr_file, mode='a')
    compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)

    for i, hk in enumerate(hdf_key):

        array = np.asarray(hf[hk])

        if len(zarr_key) > i:
            zk = zarr_key[i]
        else:
            zk = hk

        zf.create_dataset(zk,
                          data=array,
                          shape=array.shape,
                          compressor=compressor,
                          dtype=array.dtype,
                          chunks=chunksize)

    hf.close()
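
A possible invocation of the converter above, assuming an HDF5 file named data.hdf with a 3-D dataset at 'raw/frames' (both names are invented for illustration); note that the default output name simply strips a four-character extension, giving data.zarr here:

hdf2zarr('data.hdf',
         hdf_key=['raw/frames'],
         zarr_key=['frames'],
         chunksize=(64, 256, 256))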
Example #7
def _check_storage_parms(storage_parms, default_outfile, graph_name):
    from numcodecs import Blosc
    parms_passed = True

    if not (_check_parms(storage_parms, 'to_disk', [bool], default=False)):
        parms_passed = False
    if not (_check_parms(
            storage_parms, 'graph_name', [str], default=graph_name)):
        parms_passed = False

    if storage_parms['to_disk']:
        if not (_check_parms(
                storage_parms, 'outfile', [str], default=default_outfile)):
            parms_passed = False
        if not (_check_parms(storage_parms, 'append', [bool], default=False)):
            parms_passed = False
        if not (_check_parms(storage_parms,
                             'compressor', [Blosc],
                             default=Blosc(cname='zstd', clevel=2,
                                           shuffle=0))):
            parms_passed = False
        if not (_check_parms(
                storage_parms, 'chunks_on_disk', [dict], default={})):
            parms_passed = False
        if not (_check_parms(
                storage_parms, 'chunks_return', [dict], default={})):
            parms_passed = False

    return parms_passed
Example #8
    def __init__(self, src, dest, compressor=None, chunk_size=(64, 64, 64)):
        """

        :param src: glob for tiffs or a zarr store
        :param dest: the destination folder for zarr arrays
        :param compressor: numcodecs compressor to use on each chunk. Default
        is Zstd level 1 with bitshuffle
        """
        self.files = None
        self.z_arr = None
        self.chunksize = chunk_size
        if isinstance(src, str):  # Assume it's a glob if src is a string
            self.files = sorted(glob.glob(src))
            self.z_extent = len(self.files)
            img0 = tifffile.imread(self.files[0])
            self.y_extent, self.x_extent = img0.shape
            self.dtype = img0.dtype
        elif isinstance(src, zarr.NestedDirectoryStore):
            self.z_arr = zarr.open(src, mode='r')
            self.z_extent, self.y_extent, self.x_extent = self.z_arr.shape
            self.dtype = self.z_arr.dtype
        else:
            raise ValueError('Unrecognized data source for ZarrStack')
        self.dest = dest
        if compressor is None:
            self.compressor = Blosc(cname='zstd',
                                    clevel=1,
                                    shuffle=Blosc.BITSHUFFLE)
        else:
            self.compressor = compressor
Example #9
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     
     out_name = self.base_path.name.replace(''.join(self.base_path.suffixes),'')
     self.base_path = self.base_path.with_name(out_name)
     self.base_path.mkdir(exist_ok=True)
     self.root = zarr.open(str(self.base_path.joinpath("data.zarr").resolve()),
                           mode='a')
     if "0" in self.root.group_keys():
         self.root = self.root["0"]
     else:
         self.root = self.root.create_group("0")
     
     
     self.writers = {}
     max_scale = int(self.scale_info(-1)['key'])
     compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
     for S in range(10,len(self.info['scales'])):
         scale_info = self.scale_info(S)
         key = str(max_scale - int(scale_info['key']))
         if key not in self.root.array_keys():
             self.writers[key] = self.root.zeros(key,
                                                 shape=(1,self.max_output_depth,1) + (scale_info['size'][1],scale_info['size'][0]),
                                                 chunks=(1,1,1,CHUNK_SIZE,CHUNK_SIZE),
                                                 dtype=self.dtype,
                                                 compressor=compressor)
         else:
             self.root[key].resize((1,self.max_output_depth,1) + (scale_info['size'][1],scale_info['size'][0]))
             self.writers[key] = self.root[key]
Example #10
    def __init__(
                self,max_strokes,max_stroke_length,
                batch_size,max_per_class=1000,
                root_dir=os.environ["QUICKDRAW_DATA_ROOT"],
                arr_dir="processed_data",transform=None):
        self.arr_dir = arr_dir
        self.root_dir = root_dir
        self.max_strokes = max_strokes
        self.max_stroke_length = max_stroke_length
        self.max_per_class = max_per_class
        self.batch_size = batch_size
        self.transform = transform

        self.zarr_kwargs = dict(
            compressor=Blosc(),
            chunks=(512,),
            dtype=object,
            object_codec=numcodecs.Pickle()
            )

        if not os.path.exists(self.get_arr_dir()):
            self.preprocess(root_dir)
       
        self.drawings, self.classes = (
            zarr.open(self.get_arr_path("drawings"),"r"),
            zarr.open(self.get_arr_path("classes"),"r")[:]
            )

        with open(self.get_json_path()) as f:
            d = json.load(f)
            self.class2label, self.country2label = d["class2label"], d["country2label"]
Example #11
def create_zarr_dataset(g: zarr.Group,
                        name: str,
                        chunks: tuple,
                        dtype: Any,
                        shape: Tuple,
                        overwrite: bool = True) -> zarr.hierarchy:
    """
    Creates and returns a Zarr array.

    Args:
        g (zarr.Group): Zarr group in which the array is created.
        name (str): Name of the new array.
        chunks (tuple): Chunk shape.
        dtype (Any): Data type of the array elements.
        shape (Tuple): Shape of the array.
        overwrite (bool): Whether to overwrite an existing array with the same name.

    Returns:
        A Zarr Array.
    """
    from numcodecs import Blosc

    compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name,
                            chunks=chunks,
                            dtype=dtype,
                            shape=shape,
                            compressor=compressor,
                            overwrite=overwrite)
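
For reference, a minimal usage sketch of the helper above against an in-memory group (array name, chunking and shape are made up):

import numpy as np
import zarr

g = zarr.group()  # in-memory store
counts = create_zarr_dataset(g, 'counts', chunks=(1000, 50),
                             dtype='uint32', shape=(10000, 500))
counts[:1000, :50] = np.ones((1000, 50), dtype='uint32')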
Example #12
def create_Ye_output(out_path, nproxies, nens, nyears, recon_yr_range):
    nbytes_in_nens = nens * 8

    # number of nens chunks in 5mb
    n_units_in_mb = 5 / (nbytes_in_nens / 1024**2)

    base_chunk_size = np.sqrt(n_units_in_mb / 2)
    nyr_chunk = int(base_chunk_size)
    nproxy_chunk = 2 * nyr_chunk

    if nyr_chunk > nyears:
        nyr_chunk = nyears
    if nproxy_chunk > nproxies:
        nproxy_chunk = nproxies

    chunk_shape = (nproxy_chunk, nyr_chunk, nens)

    compressor = Blosc(cname='zstd', clevel=4, shuffle=Blosc.BITSHUFFLE)

    ye_arr = zarr.open(out_path,
                       mode='w',
                       shape=(nproxies, nyears, nens),
                       chunks=chunk_shape,
                       compressor=compressor,
                       dtype=np.float64)
    ye_arr.attrs['recon_time_range'] = recon_yr_range

    return ye_arr
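
To make the chunking heuristic concrete with illustrative numbers (not taken from the source): for nens = 100, one (proxy, year) slot holds 100 float64 values = 800 bytes, so n_units_in_mb = 5 / (800 / 1024**2) ≈ 6554 slots fit in roughly 5 MB; base_chunk_size = sqrt(6554 / 2) ≈ 57 gives nyr_chunk = 57 and nproxy_chunk = 114, i.e. chunks of about 114 × 57 × 800 B ≈ 5 MB of uncompressed data before Blosc compression.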
Example #13
    def test_03_write(self):
        a = np.random.randint(0, 65535, (64, 64, 64), np.uint16)
        with make_files(1) as (dir_file, block_files):
            directory = Directory(1024, 1024, 1024, np.uint16, dir_file,
                                  compression=Compression.zstd,
                                  block_filenames=block_files)
            directory.create()
            directory.write_block(a, 64, 128, 192)
            directory.close()

            with open(block_files[0], "rb") as fd:
                block = fd.read()
            blosc = Blosc("zstd")
            a_out = np.frombuffer(blosc.decode(block), np.uint16)\
               .reshape(64, 64, 64)
            np.testing.assert_array_equal(a, a_out)
Example #14
def write_image(xds, outfile='image.zarr'):
    """
    Write image dataset to xarray zarr format on disk
    
    Parameters
    ----------
    xds : xarray.core.dataset.Dataset
        image Dataset to write to disk
    outfile : str
        output filename, generally ends in .zarr
    
    Returns
    -------
    """
    import os
    from numcodecs import Blosc
    from itertools import cycle

    outfile = os.path.expanduser(outfile)
    compressor = Blosc(cname='zstd', clevel=2, shuffle=0)
    encoding = dict(
        zip(list(xds.data_vars), cycle([{
            'compressor': compressor
        }])))

    xds.to_zarr(outfile, mode='w', encoding=encoding)
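
A minimal call of the writer above, using an invented single-variable dataset (variable and dimension names are illustrative):

import numpy as np
import xarray as xr

xds = xr.Dataset({'IMAGE': (('l', 'm'), np.zeros((512, 512), dtype='float32'))})
write_image(xds, outfile='image.zarr')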
Example #15
 def test_compression_opts(self, tmp_path):
     self.filename = tmp_path / 'testfile.zspy'
     from numcodecs import Blosc
     comp = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE)
     BaseSignal([1, 2, 3]).save(self.filename, compressor=comp)
     f = zarr.open(self.filename.__str__(), mode='r+')
     d = f['Experiments/__unnamed__/data']
     assert d.compressor == comp
Example #16
def block_writer_process(
        path:str,
        compression:str,
        compression_level:int,
        q_in:multiprocessing.Queue,
        q_out:multiprocessing.Queue):
    """
    The process function for a writer process

    :param path: the path to the file that the writer writes to
    :param compression: the compression method used for Blosc
    :param compression_level: the compression level to be used
    :param q_in: WriterMessages come down this queue. The process ends when
                 this queue is closed and nothing remains to be read.
    :param q_out: We send the offset and size down this queue to indicate that
                  the message has been processed
    """
    pid = os.getpid()
    logger.info("%d: Starting block writer process for %s" % (pid, path))
    with open(path, "r+b") as fd:
        fd.seek(0, os.SEEK_END)
        position = fd.tell()
        blosc = Blosc(cname=compression, clevel=compression_level)
        while True:
            try:
                msg = q_in.get()
                logger.debug("%d: Got message from queue" % pid)
            except IOError:
                logger.exception("%d: Queue failed with I/O error" % pid)
                break
            if msg == EOT:
                logger.info("%d: Got end-of-process message" % pid)
                break
            logger.debug("%d: Position = %d" % (pid, position))
            a = msg.get()
            block = blosc.encode(a)
            count = len(block)
            logger.debug("%d: Writing block of length %d" % (pid, count))
            fd.write(block)
            q_out.put((msg.directory_offset, position, count))
            position += len(block)
            logger.debug("%d: Task done: %d" % (pid, position))
    logger.info("Making sure q_out is empty: %d" % pid)
    while not q_out.empty():
        time.sleep(.25)
    logger.info("Exiting process. PID=%d" % os.getpid())
Example #17
def compose_position_fields(fields,
                            spacing,
                            output,
                            blocksize=[
                                256,
                            ] * 3,
                            displacement=None):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(fields[0].shape[:-1]) /
                             blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap fields as dask arrays
        fields_da = da.stack(
            [da.from_array(f, chunks=blocksize + [
                3,
            ]) for f in fields])

        # accumulate
        composed = da.sum(fields_da, axis=0)

        # modify for multiple position fields
        if displacement is not None:
            raise NotImplementedError(
                "composing displacement fields not implemented yet")
        else:
            grid = position_grid_dask(composed.shape[:3],
                                      blocksize) * spacing.astype(np.float32)
            composed = composed - (len(fields) - 1) * grid

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        composed_disk = zarr.open(
            output,
            'w',
            shape=composed.shape,
            chunks=composed.chunksize,
            dtype=composed.dtype,
            compressor=compressor,
        )
        da.to_zarr(composed, composed_disk)

        # return pointer to zarr file
        return composed_disk
Example #18
def parallel_write(filename: str, darray: dask.array) -> None:
    """Distribute Zarr writing task to workers using dask.
    Input filename should have extension .zarr"""
    client = Client()
    out = darray.to_zarr(filename, compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE),
        compute=False)
    try:
        fut = client.compute(out)
        progress(fut)  # show a progress bar while the distributed write runs
    except BrokenPipeError:
        print('Process complete (likely)...')
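
A sketch of how this might be driven, assuming a chunked dask array (shape and chunking are illustrative); dask.array passes extra keyword arguments such as compressor through to zarr when creating the target array:

import dask.array as da

darr = da.random.random((8192, 8192), chunks=(1024, 1024))
parallel_write('output.zarr', darr)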
Example #19
def convert_processed_to_zarr(filename, outname, chunk_size, axis_transpose):
    tiff_f = tifffile.TiffFile(filename)
    d_mmap = tiff_f.pages[0].asarray(out='memmap')
    tiff_f.close()
    d_transposed = d_mmap.transpose(axis_transpose)

    compressor = Blosc(cname='zstd',
                       clevel=9,
                       shuffle=Blosc.SHUFFLE,
                       blocksize=0)

    z_arr = zarr.open(outname,
                      mode='a',
                      shape=(d_transposed.shape[0], d_transposed.shape[1],
                             d_transposed.shape[2]),
                      dtype=d_transposed.dtype,
                      chunks=(1, None, None),
                      compressor=compressor)

    start = 0
    end = 0
    num_chunks = z_arr.shape[0] // chunk_size

    global_start = time.time()
    #TODO: tqdm for progress bar?
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size

        copy_start = time.time()
        print("Start: {}\tEnd: {}".format(start, end))
        print("Copying d_transposed[{}:{}, :, :]".format(start, end))
        current_slice = np.copy(d_transposed[start:end, :, :])
        copy_end = time.time()

        print("Copying complete: {} minutes.".format(
            (copy_end - copy_start) / 60))
        print("Assigning slice into zarr...")
        z_arr[start:end, :, :] = current_slice
        assign_end = time.time()
        print("Assigned: {} minutes".format((assign_end - copy_end) / 60))

        del (current_slice)
        print("{} chunks remaining...".format(num_chunks - i - 1))
        print("#*#" * 20)

    if z_arr.shape[0] % chunk_size != 0:
        print("Copying remainder...")
        final_slice = np.copy(d_transposed[end:, :, :])
        print("Assigning remainder...")
        z_arr[end:, :, :] = final_slice

    global_end = time.time()
    print("TOTAL TIME: {}".format(global_end - global_start))
Example #20
 def test_compression(self, compressor, tmp_path):
     if compressor == "blosc":
         from numcodecs import Blosc
         compressor = Blosc(cname='zstd',
                            clevel=3,
                            shuffle=Blosc.BITSHUFFLE)
     s = Signal1D(np.ones((3, 3)))
     s.save(tmp_path / 'test_compression.zspy',
            overwrite=True,
            compressor=compressor)
     load(tmp_path / 'test_compression.zspy')
Example #21
def write_xarray(ds: xr.Dataset, out_file: str) -> None:
    if out_file.endswith('.nc'):
        comp = dict(zlib=True, complevel=5)
        encoding = {var: comp for var in ds.data_vars}
        ds.to_netcdf(out_file, mode='w', encoding=encoding)

    elif out_file.endswith('.zarr'):
        compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
        encoding = {var: {'compressor': compressor} for var in ds.data_vars}
        ds.to_zarr(out_file, mode='w', encoding=encoding)

    else:
        raise ValueError('Unknown file format: ' + out_file)
Example #22
 def read_block(self, x, y, z):
     offset = self.offsetof(x, y, z)
     shape = self.get_block_size(x, y, z)
     idx = offset % len(self.block_filenames)
     directory_offset = self.directory_offset + \
                        offset * self.directory_entry_size
     if  self.filesize(self.directory_filename) <\
         directory_offset + self.directory_entry_size:
         return np.zeros(shape, self.dtype)
     fd = self.file_handle(self.directory_filename)
     fd.seek(directory_offset, os.SEEK_SET)
     data = fd.read(self.directory_entry_size)
     m = np.frombuffer(data, dtype=np.uint8)
     offset, size = self.decode_directory_entry(m)
     if size == 0:
         return np.zeros(shape, self.dtype)
     fd = self.file_handle(self.block_filenames[idx])
     fd.seek(offset)
     compressed = fd.read(size)
     blosc = Blosc(self.compression, self.compression_level)
     data = blosc.decode(compressed)
     return np.frombuffer(data, self.dtype).reshape(shape)
Example #23
def remove_ramp_xr(
    ds,
    dset_name,
    outfile=None,
    deramp_order=1,
    mask=None,
    mask_val=0,
    overwrite=False,
    max_abs_val=None,
):
    from apertools import sario
    if not sario.check_dset(outfile, dset_name, overwrite):
        import xarray as xr
        return xr.open_dataset(outfile)

    logger.info("Removing ramp")

    if mask is None:
        mask = ds[dset_name] == mask_val
    if max_abs_val is not None and max_abs_val > 0:
        mask_abs = np.abs(ds[dset_name]) > max_abs_val
    else:
        mask_abs = np.ma.nomask

    outstack = remove_ramp(
        ds[dset_name].data,
        copy=True,
        deramp_order=deramp_order,
        mask=np.logical_or(mask, mask_abs),
    )
    if mask.ndim == 3:
        outstack[mask] = mask_val
    else:
        outstack[:, mask] = mask_val

    ds_out = ds.copy()
    ds_out[dset_name].data = outstack

    if outfile:
        # ext = os.path.splitext(infile)[1]
        # outfile = infile.replace(ext, "_ramp_removed" + out_format)
        if outfile.endswith("zarr"):
            from numcodecs import Blosc
            compressor = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE)
            mode = "w" if not os.path.exists(outfile) else "a"
            ds_out.to_zarr(outfile, encoding={"igrams": {"compressor": compressor}}, mode=mode)
        elif outfile.endswith("nc"):
            mode = "w" if not os.path.exists(outfile) else "a"
            ds_out.to_netcdf(outfile, engine="h5netcdf", mode=mode)
    return ds_out
Example #24
 def add_zarr_dataset(self, group, data, compress=False):
     h5f = zarr.open(self.z_file, 'r+')
     if group in h5f:
         del h5f[group]
     if compress:
         compressor = Blosc(cname='zstd',
                            clevel=3,
                            shuffle=Blosc.BITSHUFFLE)
         h5f.create_dataset(group,
                            data=data,
                            chunks=True,
                            compressor=compressor)
     else:
         h5f.create_dataset(group, data=data)
Example #25
def create_zarr_obj_array(
    g: zarr.Group,
    name: str,
    data,
    dtype: Union[str, Any] = None,
    overwrite: bool = True,
    chunk_size: int = 100000,
) -> zarr.hierarchy:
    """
    Creates and returns a Zarr object array.

    A Zarr object array can contain any type of object.
    https://zarr.readthedocs.io/en/stable/tutorial.html#object-arrays

    Args:
        g (zarr.hierarchy): Zarr group in which the array is created.
        name (str): Name of the new array.
        data (): Values to store; converted to a NumPy array of fixed-width strings.
        dtype (Union[str, Any]): Element dtype; inferred from the data when None or object.
        overwrite (bool): Whether to overwrite an existing array with the same name.
        chunk_size (int): Chunk length; pass None or False to disable chunking.

    Returns:
        A Zarr object Array.
    """

    from numcodecs import Blosc

    compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.BITSHUFFLE)

    data = np.array(data)
    if dtype is None or dtype == object:
        dtype = "U" + str(max([len(str(x)) for x in data]))
    if np.issubdtype(data.dtype, np.dtype("S")):
        data = data.astype("U")
        dtype = data.dtype
    if chunk_size is None or chunk_size is False:
        chunks = False
    else:
        chunks = (chunk_size, )
    return g.create_dataset(
        name,
        data=data,
        chunks=chunks,
        shape=len(data),
        dtype=dtype,
        overwrite=overwrite,
        compressor=compressor,
    )
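
A brief usage sketch with an in-memory group and made-up labels; plain Python strings end up stored as fixed-width unicode:

import zarr

g = zarr.group()
cell_names = create_zarr_obj_array(g, 'cell_names', ['cellA', 'cellB', 'cellC'])
print(cell_names.dtype)  # '<U5' for these labels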
Example #26
def create_zarr_dataset(g: zarr.hierarchy,
                        name: str,
                        chunks: tuple,
                        dtype: Any,
                        shape: Tuple,
                        overwrite: bool = True) -> zarr.hierarchy:
    from numcodecs import Blosc

    compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name,
                            chunks=chunks,
                            dtype=dtype,
                            shape=shape,
                            compressor=compressor,
                            overwrite=overwrite)
Example #27
def compress_zarr_dataset(data,
                          file_path,
                          compression='lz4',
                          clevel=5,
                          start_idx=0,
                          end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level
    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0 the data set will be exported to the specified index,
    excluding the sample at end_idx (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]

        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128
        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=type(x.flatten()[0]),
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compressor=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected
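
An illustrative call with an in-memory mapping of NumPy arrays (names and shapes are invented); any mapping whose values are array-like works as the data argument:

import numpy as np

data = {
    'x': np.random.rand(1024, 8).astype(np.float32),
    'policy': np.random.rand(1024, 64).astype(np.float32),
}
had_nan = compress_zarr_dataset(data, './data.zip', compression='lz4', clevel=5)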
Example #28
 def test_03_writer_send(self):
     with tempfile.NamedTemporaryFile() as tf:
         q_in = multiprocessing.Queue()
         q_out = multiprocessing.Queue()
         a = np.random.randint(0, np.iinfo(np.uint16).max, (4, 5, 6))
         writer = w.BlockWriter(tf.name, q_out, q_in, "zstd", 5)
         writer.start()
         writer.write(a, 1234)
         directory_offset, position, size = q_out.get()
         writer.close()
         self.assertEqual(directory_offset, 1234)
         self.assertEqual(position, 0)
         block = tf.file.read()
         self.assertEqual(len(block), size)
         a_out = np.frombuffer(Blosc("zstr", 5).decode(block),
                               a.dtype).reshape(a.shape)
         np.testing.assert_array_equal(a, a_out)
Example #29
def band_at_timepoint_to_zarr(
        timepoint_fn,
        timepoint_number,
        band,
        band_number,
        *,
        out_zarrs=None,
        min_level_shape=(1024, 1024),
        num_timepoints=None,
        num_bands=None,
):
    basepath = os.path.splitext(os.path.basename(timepoint_fn))[0]
    path = basepath + '/' + basepath + '_' + band + '.tif'
    image = ziptiff2array(timepoint_fn, path)
    shape = image.shape
    dtype = image.dtype
    max_layer = np.log2(
        np.max(np.array(shape) / np.array(min_level_shape))
    ).astype(int)
    pyramid = pyramid_gaussian(image, max_layer=max_layer, downscale=DOWNSCALE)
    im_pyramid = list(pyramid)
    if isinstance(out_zarrs, str):
        fout_zarr = out_zarrs
        out_zarrs = []
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.SHUFFLE, blocksize=0)
        for i in range(len(im_pyramid)):
            r, c = im_pyramid[i].shape
            out_zarrs.append(zarr.open(
                    os.path.join(fout_zarr, str(i)), 
                    mode='a', 
                    shape=(num_timepoints, num_bands, 1, r, c), 
                    dtype=np.int16,
                    chunks=(1, 1, 1, *min_level_shape), 
                    compressor=compressor,
                )
            )

    # for each resolution:
    for pyramid_level, downscaled in enumerate(im_pyramid):
        # convert back to int16
        downscaled = skimage.img_as_int(downscaled)
        # store into appropriate zarr
        out_zarrs[pyramid_level][timepoint_number, band_number, 0, :, :] = downscaled
    
    return out_zarrs
Example #30
def write_vis(xds,
              outfile='vis.zarr',
              partition='part0',
              compressor=None,
              append=True):
    """
    Write xarray Visibility Dataset to zarr format on disk
  
    Parameters
    ----------
    xds : xarray.core.dataset.Dataset
        Visibility Dataset to write to disk
    outfile : str
        output filename, generally ends in .zarr
    partition : str
        Name of partition to write into outfile. Overwrites existing partition of same name. Default is 'part0'
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    append : bool
        Append this partition in to an existing zarr directory. False will erase old zarr directory. Default=True
    
    Returns
    -------
    """
    import os
    from numcodecs import Blosc
    from itertools import cycle

    outfile = os.path.expanduser(outfile)

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    # need to manually remove the existing zarr directory (if any)
    if not append:
        tmp = os.system("rm -fr " + outfile)
        tmp = os.system("mkdir " + outfile)

    encoding = dict(
        zip(list(xds.data_vars), cycle([{
            'compressor': compressor
        }])))
    xds.to_zarr(os.path.join(outfile, partition), mode='w', encoding=encoding)