def test_vcf_to_zarr__parallel_compressor_and_filters(shared_datadir, is_path, tmp_path):
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
    output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
    regions = ["20", "21"]

    default_compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE)
    variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
    encoding = dict(
        variant_id=dict(compressor=variant_id_compressor),
        variant_id_mask=dict(filters=None),
    )
    vcf_to_zarr(
        path,
        output,
        regions=regions,
        chunk_length=5_000,
        compressor=default_compressor,
        encoding=encoding,
    )

    # look at actual Zarr store to check compressor and filters
    z = zarr.open(output)
    assert z["call_genotype"].compressor == default_compressor
    assert z["call_genotype"].filters is None
    assert z["call_genotype_mask"].filters == [PackBits()]
    assert z["variant_id"].compressor == variant_id_compressor
    assert z["variant_id_mask"].filters is None
def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path, tmp_path):
    path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
    output = tmp_path.joinpath("vcf.zarr").as_posix()

    default_compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE)
    variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
    encoding = dict(
        variant_id=dict(compressor=variant_id_compressor),
        variant_id_mask=dict(filters=None),
    )
    vcf_to_zarr(
        path,
        output,
        chunk_length=5,
        chunk_width=2,
        compressor=default_compressor,
        encoding=encoding,
    )

    # look at actual Zarr store to check compressor and filters
    z = zarr.open(output)
    assert z["call_genotype"].compressor == default_compressor
    assert z["call_genotype"].filters is None
    assert z["call_genotype_mask"].filters == [PackBits()]
    assert z["variant_id"].compressor == variant_id_compressor
    assert z["variant_id_mask"].filters is None
def reencode_usgs(src_fld, target):
    usgs_data_raw = []
    for fn in tqdm.tqdm(os.listdir(src_fld)):
        if not fn.endswith('.csv'):
            continue
        usgs_data_raw.extend(_read_usgs_csv(os.path.join(src_fld, fn)))
    usgs_data = [quake for quake in usgs_data_raw if _quake_ok(quake)]
    print(
        f'Quakes ok={len(usgs_data):d} broken={len(usgs_data_raw)-len(usgs_data):d}'
    )
    usgs_data.sort(key=lambda x: x['time'])

    usgs_zarr = zarr.open(target, 'w')
    usgs_zarr.zeros(
        'time',
        shape=(len(usgs_data),),
        chunks=(10000,),
        dtype='u8',
        compressor=Blosc(cname='lz4'),
    )
    usgs_zarr['time'][:] = [
        int(quake['time'].timestamp() * 1000) for quake in usgs_data
    ]

    fields = [
        'lon', 'lat', 'depth', 'mag', 'horizontalError', 'depthError', 'magError'
    ]
    usgs_zarr.zeros(
        'data',
        shape=(len(fields), len(usgs_data)),
        chunks=(len(fields), 10000),
        dtype='f4',
        compressor=Blosc(cname='lz4'),
    )
    usgs_zarr.attrs['fields'] = fields
    for field_index, field in enumerate(fields):
        usgs_zarr['data'][field_index, :] = [
            quake[field] for quake in usgs_data
        ]
def execute(array, write_path=None, **kwargs):
    """
    Compute a dask array on a temporary cluster. If write_path is given, the
    result is streamed to a zarr array on disk and that array is returned;
    otherwise the computed result is returned to the local process.
    """

    with ClusterWrap.cluster(**kwargs) as cluster:

        # if user wants to write to disk
        if write_path is not None:
            compressor = Blosc(
                cname='zstd', clevel=4, shuffle=Blosc.BITSHUFFLE,
            )
            zarr_disk = zarr.open(
                write_path, 'w',
                shape=array.shape,
                chunks=array.chunksize,
                dtype=array.dtype,
                compressor=compressor,
            )
            to_zarr(array, zarr_disk)
            return zarr_disk

        # otherwise user wants result returned to local process
        return array.compute()
def write_dense(zarr_cache_dir, key, dense_name, chunk_factors):
    compressor = Blosc(cname='blosclz', clevel=3, shuffle=Blosc.SHUFFLE)
    store = zarr.open(zarr_cache_dir, mode='a')
    if len(store[key]) == 3:
        # assume csr sparse matrix - parse as such
        array_keys = list(store[key].array_keys())
        X = sp.csr_matrix((store[key + "/" + array_keys[0]],
                           store[key + "/" + array_keys[1]],
                           store[key + "/" + array_keys[2]]))
    else:
        # assume dense matrix
        # TODO: checking for other cases of sparse matrices/mixed groups
        X = store[key]
    if dense_name not in store or X.shape != store[dense_name].shape:
        store.create_dataset(dense_name,
                             shape=X.shape,
                             dtype=X.dtype,
                             fill_value=0,
                             chunks=(int(X.shape[0] / chunk_factors[0]),
                                     int(X.shape[1] / chunk_factors[1])),
                             compressor=compressor,
                             overwrite=True)
    if sp.issparse(X):
        X = X.tocoo()
        store[dense_name].set_coordinate_selection((X.row, X.col), X.data)
    else:
        store[dense_name] = X
    return None

### end celery queue task function definitions
def hdf2zarr(hdf_file, zarr_file=None, hdf_key=[], zarr_key=[], chunksize=None):
    hf = h5py.File(hdf_file, 'r')
    if zarr_file is None:
        zarr_file = hdf_file[:-4] + '.zarr'
    zf = zarr.open(zarr_file, mode='a')
    compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
    for i, hk in enumerate(hdf_key):
        array = np.asarray(hf[hk])
        if len(zarr_key) > i:
            zk = zarr_key[i]
        else:
            zk = hk
        zf.create_dataset(zk, data=array, shape=array.shape,
                          compressor=compressor, dtype=array.dtype,
                          chunks=chunksize)
    hf.close()
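# Minimal usage sketch for hdf2zarr (hypothetical file names and dataset keys;
# assumes h5py, numpy, zarr and the function above are available in this module).
import h5py
import numpy as np

with h5py.File('example.hdf', 'w') as hf:
    hf.create_dataset('raw/image', data=np.random.rand(64, 64, 64))

hdf2zarr('example.hdf', zarr_file='example.zarr',
         hdf_key=['raw/image'], zarr_key=['image'],
         chunksize=(32, 32, 32))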
def _check_storage_parms(storage_parms, default_outfile, graph_name):
    from numcodecs import Blosc
    parms_passed = True

    if not _check_parms(storage_parms, 'to_disk', [bool], default=False):
        parms_passed = False
    if not _check_parms(storage_parms, 'graph_name', [str], default=graph_name):
        parms_passed = False

    if storage_parms['to_disk']:
        if not _check_parms(storage_parms, 'outfile', [str], default=default_outfile):
            parms_passed = False
        if not _check_parms(storage_parms, 'append', [bool], default=False):
            parms_passed = False
        if not _check_parms(storage_parms, 'compressor', [Blosc],
                            default=Blosc(cname='zstd', clevel=2, shuffle=0)):
            parms_passed = False
        if not _check_parms(storage_parms, 'chunks_on_disk', [dict], default={}):
            parms_passed = False
        if not _check_parms(storage_parms, 'chunks_return', [dict], default={}):
            parms_passed = False

    return parms_passed
def __init__(self, src, dest, compressor=None, chunk_size=(64, 64, 64)):
    """
    :param src: glob for tiffs or a zarr store
    :param dest: the destination folder for zarr arrays
    :param compressor: numcodecs compressor to use on each chunk.
        Default is Zstd level 1 with bitshuffle
    """
    self.files = None
    self.z_arr = None
    self.chunksize = chunk_size
    if isinstance(src, str):
        # Assume it's a glob if src is a string
        self.files = sorted(glob.glob(src))
        self.z_extent = len(self.files)
        img0 = tifffile.imread(self.files[0])
        self.y_extent, self.x_extent = img0.shape
        self.dtype = img0.dtype
    elif isinstance(src, zarr.NestedDirectoryStore):
        self.z_arr = zarr.open(src, mode='r')
        self.z_extent, self.y_extent, self.x_extent = self.z_arr.shape
        self.dtype = self.z_arr.dtype
    else:
        raise ValueError('Unrecognized data source for ZarrStack')
    self.dest = dest
    if compressor is None:
        self.compressor = Blosc(cname='zstd', clevel=1,
                                shuffle=Blosc.BITSHUFFLE)
    else:
        self.compressor = compressor
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    out_name = self.base_path.name.replace(''.join(self.base_path.suffixes), '')
    self.base_path = self.base_path.with_name(out_name)
    self.base_path.mkdir(exist_ok=True)
    self.root = zarr.open(str(self.base_path.joinpath("data.zarr").resolve()), mode='a')
    if "0" in self.root.group_keys():
        self.root = self.root["0"]
    else:
        self.root = self.root.create_group("0")

    self.writers = {}
    max_scale = int(self.scale_info(-1)['key'])
    compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
    for S in range(10, len(self.info['scales'])):
        scale_info = self.scale_info(S)
        key = str(max_scale - int(scale_info['key']))
        if key not in self.root.array_keys():
            self.writers[key] = self.root.zeros(
                key,
                shape=(1, self.max_output_depth, 1) + (scale_info['size'][1], scale_info['size'][0]),
                chunks=(1, 1, 1, CHUNK_SIZE, CHUNK_SIZE),
                dtype=self.dtype,
                compressor=compressor)
        else:
            self.root[key].resize(
                (1, self.max_output_depth, 1) + (scale_info['size'][1], scale_info['size'][0]))
            self.writers[key] = self.root[key]
def __init__(
        self, max_strokes, max_stroke_length, batch_size, max_per_class=1000,
        root_dir=os.environ["QUICKDRAW_DATA_ROOT"],
        arr_dir="processed_data", transform=None):
    self.arr_dir = arr_dir
    self.root_dir = root_dir
    self.max_strokes = max_strokes
    self.max_stroke_length = max_stroke_length
    self.max_per_class = max_per_class
    self.batch_size = batch_size
    self.transform = transform
    self.zarr_kwargs = dict(
        compressor=Blosc(),
        chunks=(512,),
        dtype=object,
        object_codec=numcodecs.Pickle()
    )
    if not os.path.exists(self.get_arr_dir()):
        self.preprocess(root_dir)
    self.drawings, self.classes = (
        zarr.open(self.get_arr_path("drawings"), "r"),
        zarr.open(self.get_arr_path("classes"), "r")[:]
    )
    with open(self.get_json_path()) as f:
        d = json.load(f)
    self.class2label, self.country2label = d["class2label"], d["country2label"]
def create_zarr_dataset(g: zarr.Group, name: str, chunks: tuple, dtype: Any,
                        shape: Tuple, overwrite: bool = True) -> zarr.hierarchy:
    """
    Creates and returns a Zarr array.

    Args:
        g (zarr.hierarchy): Zarr group in which the dataset is created.
        name (str): Name of the new dataset.
        chunks (tuple): Chunk shape of the dataset.
        dtype (Any): Data type of the dataset.
        shape (Tuple): Shape of the dataset.
        overwrite (bool): Whether to overwrite an existing dataset of the same name.

    Returns:
        A Zarr Array.
    """
    from numcodecs import Blosc

    compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name, chunks=chunks, dtype=dtype, shape=shape,
                            compressor=compressor, overwrite=overwrite)
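# Minimal usage sketch for create_zarr_dataset (hypothetical store path and
# dataset name; assumes the function above is defined in this module).
import numpy as np
import zarr

g = zarr.open_group('example_counts.zarr', mode='a')
counts = create_zarr_dataset(g, 'counts', chunks=(1000, 50),
                             dtype=np.float32, shape=(5000, 200))
counts[:1000, :] = np.random.rand(1000, 200).astype(np.float32)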
def create_Ye_output(out_path, nproxies, nens, nyears, recon_yr_range):
    nbytes_in_nens = nens * 8

    # number of nens chunks in 5mb
    n_units_in_mb = 5 / (nbytes_in_nens / 1024**2)
    base_chunk_size = np.sqrt(n_units_in_mb / 2)

    nyr_chunk = int(base_chunk_size)
    nproxy_chunk = 2 * nyr_chunk

    if nyr_chunk > nyears:
        nyr_chunk = nyears
    if nproxy_chunk > nproxies:
        nproxy_chunk = nproxies

    chunk_shape = (nproxy_chunk, nyr_chunk, nens)

    compressor = Blosc(cname='zstd', clevel=4, shuffle=Blosc.BITSHUFFLE)
    ye_arr = zarr.open(out_path, mode='w',
                       shape=(nproxies, nyears, nens),
                       chunks=chunk_shape,
                       compressor=compressor,
                       dtype=np.float64)
    ye_arr.attrs['recon_time_range'] = recon_yr_range

    return ye_arr
def test_03_write(self):
    a = np.random.randint(0, 65535, (64, 64, 64), np.uint16)
    with make_files(1) as (dir_file, block_files):
        directory = Directory(1024, 1024, 1024, np.uint16, dir_file,
                              compression=Compression.zstd,
                              block_filenames=block_files)
        directory.create()
        directory.write_block(a, 64, 128, 192)
        directory.close()
        with open(block_files[0], "rb") as fd:
            block = fd.read()
        blosc = Blosc("zstd")
        a_out = np.frombuffer(blosc.decode(block), np.uint16).reshape(64, 64, 64)
        np.testing.assert_array_equal(a, a_out)
def write_image(xds, outfile='image.zarr'):
    """
    Write image dataset to xarray zarr format on disk

    Parameters
    ----------
    xds : xarray.core.dataset.Dataset
        image Dataset to write to disk
    outfile : str
        output filename, generally ends in .zarr

    Returns
    -------
    """
    import os
    from numcodecs import Blosc
    from itertools import cycle

    outfile = os.path.expanduser(outfile)
    compressor = Blosc(cname='zstd', clevel=2, shuffle=0)
    encoding = dict(zip(list(xds.data_vars),
                        cycle([{'compressor': compressor}])))
    xds.to_zarr(outfile, mode='w', encoding=encoding)
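# Minimal usage sketch for write_image (hypothetical variable contents and
# output path; assumes an xarray image Dataset is already in memory).
import numpy as np
import xarray as xr

xds = xr.Dataset({'IMAGE': (('l', 'm'), np.zeros((256, 256), dtype=np.float32))})
write_image(xds, outfile='image.zarr')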
def test_compression_opts(self, tmp_path):
    self.filename = tmp_path / 'testfile.zspy'
    from numcodecs import Blosc
    comp = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE)
    BaseSignal([1, 2, 3]).save(self.filename, compressor=comp)
    f = zarr.open(self.filename.__str__(), mode='r+')
    d = f['Experiments/__unnamed__/data']
    assert d.compressor == comp
def block_writer_process(
        path: str,
        compression: str,
        compression_level: int,
        q_in: multiprocessing.Queue,
        q_out: multiprocessing.Queue):
    """
    The process function for a writer process

    :param path: the path to the file that the writer writes to
    :param compression: the compression method used for Blosc
    :param compression_level: the compression level to be used
    :param q_in: WriterMessages come down this queue. The process ends when
        this queue is closed and nothing remains to be read.
    :param q_out: We send the offset and size down this queue to indicate
        that the message has been passed
    """
    pid = os.getpid()
    logger.info("%d: Starting block writer process for %s" % (pid, path))
    with open(path, "r+b") as fd:
        fd.seek(0, os.SEEK_END)
        position = fd.tell()
        blosc = Blosc(cname=compression, clevel=compression_level)
        while True:
            try:
                msg = q_in.get()
                logger.debug("%d: Got message from queue" % pid)
            except IOError:
                logger.exception("%d: Queue failed with I/O error" % pid)
                break
            if msg == EOT:
                logger.info("%d: Got end-of-process message" % pid)
                break
            logger.debug("%d: Position = %d" % (pid, position))
            a = msg.get()
            block = blosc.encode(a)
            count = len(block)
            logger.debug("%d: Writing block of length %d" % (pid, count))
            fd.write(block)
            q_out.put((msg.directory_offset, position, count))
            position += len(block)
            logger.debug("%d: Task done: %d" % (pid, position))
    logger.info("Making sure q_out is empty: %d" % pid)
    while not q_out.empty():
        time.sleep(.25)
    logger.info("Exiting process. PID=%d" % os.getpid())
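# Minimal usage sketch for block_writer_process run as a child process
# (hypothetical file path; assumes the EOT sentinel and a WriterMessage type
# with .get() and .directory_offset are defined elsewhere in this module).
# The target file must already exist because it is opened in "r+b" mode.
import multiprocessing

open('blocks.bin', 'wb').close()  # create the (empty) block file
q_in = multiprocessing.Queue()
q_out = multiprocessing.Queue()
writer = multiprocessing.Process(
    target=block_writer_process,
    args=('blocks.bin', 'zstd', 5, q_in, q_out),
)
writer.start()
# ... put WriterMessage objects on q_in and read (directory_offset, position,
# size) tuples from q_out ...
q_in.put(EOT)
writer.join()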
def compose_position_fields(fields, spacing, output, blocksize=[256, ] * 3,
                            displacement=None):
    """
    Compose a stack of position fields into a single field and write the
    result to a zarr array at `output`.
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(fields[0].shape[:-1]) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap fields as dask arrays
        fields_da = da.stack(
            [da.from_array(f, chunks=blocksize + [3, ]) for f in fields])

        # accumulate
        composed = da.sum(fields_da, axis=0)

        # modify for multiple position fields
        if displacement is not None:
            raise NotImplementedError(
                "composing displacement fields not implemented yet")
        else:
            grid = position_grid_dask(composed.shape[:3],
                                      blocksize) * spacing.astype(np.float32)
            composed = composed - (len(fields) - 1) * grid

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        composed_disk = zarr.open(
            output, 'w',
            shape=composed.shape,
            chunks=composed.chunksize,
            dtype=composed.dtype,
            compressor=compressor,
        )
        da.to_zarr(composed, composed_disk)

        # return pointer to zarr file
        return composed_disk
def parallel_write(filename: str, darray: dask.array.Array) -> None:
    """Distribute Zarr writing task to workers using dask.
    Input filename should have extension .zarr"""
    client = Client()
    out = darray.to_zarr(filename,
                         compressor=Blosc(cname='zstd', clevel=3,
                                          shuffle=Blosc.BITSHUFFLE),
                         compute=False)
    try:
        fut = client.compute(out)
        progress(fut)  # show a progress bar for the write
    except BrokenPipeError:
        print('Process complete (likely)...')
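# Minimal usage sketch for parallel_write (hypothetical array and output path).
import dask.array as da

darray = da.random.random((4000, 4000), chunks=(1000, 1000))
parallel_write('random.zarr', darray)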
def convert_processed_to_zarr(filename, outname, chunk_size, axis_transpose):
    tiff_f = tifffile.TiffFile(filename)
    d_mmap = tiff_f.pages[0].asarray(out='memmap')
    tiff_f.close()
    d_transposed = d_mmap.transpose(axis_transpose)

    compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.SHUFFLE, blocksize=0)
    z_arr = zarr.open(outname,
                      mode='a',
                      shape=(d_transposed.shape[0], d_transposed.shape[1],
                             d_transposed.shape[2]),
                      dtype=d_transposed.dtype,
                      chunks=(1, None, None),
                      compressor=compressor)

    start = 0
    end = 0
    num_chunks = z_arr.shape[0] // chunk_size
    global_start = time.time()

    # TODO: tqdm for progress bar?
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size

        copy_start = time.time()
        print("Start: {}\tEnd: {}".format(start, end))
        print("Copying d_transposed[{}:{}, :, :]".format(start, end))
        current_slice = np.copy(d_transposed[start:end, :, :])
        copy_end = time.time()
        print("Copying complete: {} minutes.".format(
            (copy_end - copy_start) / 60))

        print("Assigning slice into zarr...")
        z_arr[start:end, :, :] = current_slice
        assign_end = time.time()
        print("Assigned: {} minutes".format((assign_end - copy_end) / 60))

        del current_slice
        print("{} chunks remaining...".format(num_chunks - i - 1))
        print("#*#" * 20)

    if z_arr.shape[0] % chunk_size != 0:
        print("Copying remainder...")
        final_slice = np.copy(d_transposed[end:, :, :])
        print("Assigning remainder...")
        z_arr[end:, :, :] = final_slice

    global_end = time.time()
    print("TOTAL TIME: {}".format(global_end - global_start))
def test_compression(self, compressor, tmp_path):
    if compressor == "blosc":
        from numcodecs import Blosc
        compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
    s = Signal1D(np.ones((3, 3)))
    s.save(tmp_path / 'test_compression.zspy',
           overwrite=True,
           compressor=compressor)
    load(tmp_path / 'test_compression.zspy')
def write_xarray(ds: xr.Dataset, out_file: str) -> None:
    if out_file.endswith('.nc'):
        comp = dict(zlib=True, complevel=5)
        encoding = {var: comp for var in ds.data_vars}
        ds.to_netcdf(out_file, mode='w', encoding=encoding)
    elif out_file.endswith('.zarr'):
        compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
        encoding = {var: {'compressor': compressor} for var in ds.data_vars}
        ds.to_zarr(out_file, mode='w', encoding=encoding)
    else:
        raise ValueError('Unknown file format: ' + out_file)
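# Minimal usage sketch for write_xarray showing both supported output formats
# (hypothetical dataset contents and file names).
import numpy as np
import xarray as xr

ds = xr.Dataset({'t2m': (('time', 'lat', 'lon'),
                         np.zeros((12, 90, 180), dtype=np.float32))})
write_xarray(ds, 'temperature.zarr')  # Blosc zstd-compressed zarr store
write_xarray(ds, 'temperature.nc')    # zlib-compressed NetCDF file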
def read_block(self, x, y, z):
    offset = self.offsetof(x, y, z)
    shape = self.get_block_size(x, y, z)
    idx = offset % len(self.block_filenames)
    directory_offset = self.directory_offset + \
        offset * self.directory_entry_size
    if self.filesize(self.directory_filename) < \
            directory_offset + self.directory_entry_size:
        return np.zeros(shape, self.dtype)
    fd = self.file_handle(self.directory_filename)
    fd.seek(directory_offset, os.SEEK_SET)
    data = fd.read(self.directory_entry_size)
    m = np.frombuffer(data, dtype=np.uint8)
    offset, size = self.decode_directory_entry(m)
    if size == 0:
        return np.zeros(shape, self.dtype)
    fd = self.file_handle(self.block_filenames[idx])
    fd.seek(offset)
    compressed = fd.read(size)
    blosc = Blosc(self.compression, self.compression_level)
    data = blosc.decode(compressed)
    return np.frombuffer(data, self.dtype).reshape(shape)
def remove_ramp_xr(
    ds,
    dset_name,
    outfile=None,
    deramp_order=1,
    mask=None,
    mask_val=0,
    overwrite=False,
    max_abs_val=None,
):
    from apertools import sario

    if not sario.check_dset(outfile, dset_name, overwrite):
        import xarray as xr
        return xr.open_dataset(outfile)

    logger.info("Removing ramp")
    if mask is None:
        mask = ds[dset_name] == mask_val
    if max_abs_val is not None and max_abs_val > 0:
        mask_abs = np.abs(ds[dset_name]) > max_abs_val
    else:
        mask_abs = np.ma.nomask

    outstack = remove_ramp(
        ds[dset_name].data,
        copy=True,
        deramp_order=deramp_order,
        mask=np.logical_or(mask, mask_abs),
    )
    if mask.ndim == 3:
        outstack[mask] = mask_val
    else:
        outstack[:, mask] = mask_val

    ds_out = ds.copy()
    ds_out[dset_name].data = outstack

    if outfile:
        # ext = os.path.splitext(infile)[1]
        # outfile = infile.replace(ext, "_ramp_removed" + out_format)
        if outfile.endswith("zarr"):
            from numcodecs import Blosc
            compressor = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE)
            mode = "w" if not os.path.exists(outfile) else "a"
            ds_out.to_zarr(outfile,
                           encoding={"igrams": {"compressor": compressor}},
                           mode=mode)
        elif outfile.endswith("nc"):
            mode = "w" if not os.path.exists(outfile) else "a"
            ds_out.to_netcdf(outfile, engine="h5netcdf", mode=mode)
    return ds_out
def add_zarr_dataset(self, group, data, compress=False):
    h5f = zarr.open(self.z_file, 'r+')
    if group in h5f:
        del h5f[group]
    if compress:
        compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
        h5f.create_dataset(group, data=data, chunks=True, compressor=compressor)
    else:
        h5f.create_dataset(group, data=data)
def create_zarr_obj_array(
    g: zarr.Group,
    name: str,
    data,
    dtype: Union[str, Any] = None,
    overwrite: bool = True,
    chunk_size: int = 100000,
) -> zarr.hierarchy:
    """
    Creates and returns a Zarr object array.

    A Zarr object array can contain any type of object.
    https://zarr.readthedocs.io/en/stable/tutorial.html#object-arrays

    Args:
        g (zarr.hierarchy): Zarr group in which the array is created.
        name (str): Name of the new array.
        data (): Values to store in the array.
        dtype (Union[str, Any]): Data type of the array; inferred from the data
            when None or object.
        overwrite (bool): Whether to overwrite an existing array of the same name.
        chunk_size (int): Number of elements per chunk; None or False disables chunking.

    Returns:
        A Zarr object Array.
    """
    from numcodecs import Blosc

    compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.BITSHUFFLE)
    data = np.array(data)
    if dtype is None or dtype == object:
        dtype = "U" + str(max([len(str(x)) for x in data]))
    if np.issubdtype(data.dtype, np.dtype("S")):
        data = data.astype("U")
        dtype = data.dtype
    if chunk_size is None or chunk_size is False:
        chunks = False
    else:
        chunks = (chunk_size,)
    return g.create_dataset(
        name,
        data=data,
        chunks=chunks,
        shape=len(data),
        dtype=dtype,
        overwrite=overwrite,
        compressor=compressor,
    )
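# Minimal usage sketch for create_zarr_obj_array (hypothetical group path,
# array name and labels; assumes the function above is defined in this module).
import zarr

g = zarr.open_group('example_metadata.zarr', mode='a')
cell_ids = ['cell_%d' % i for i in range(1000)]
ids_arr = create_zarr_obj_array(g, 'cell_ids', cell_ids, chunk_size=500)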
def create_zarr_dataset(g: zarr.hierarchy, name: str, chunks: tuple, dtype: Any,
                        shape: Tuple, overwrite: bool = True) -> zarr.hierarchy:
    from numcodecs import Blosc

    compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name, chunks=chunks, dtype=dtype, shape=shape,
                            compressor=compressor, overwrite=overwrite)
def compress_zarr_dataset(data, file_path, compression='lz4', clevel=5,
                          start_idx=0, end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level

    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0 the data set will be exported to the specified index,
        excluding the sample at end_idx (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]

        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128

        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=type(x.flatten()[0]),
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected
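# Minimal usage sketch for compress_zarr_dataset (hypothetical input store and
# export path; assumes an existing zarr group of training arrays).
import logging
import zarr

data = zarr.open('training_data.zarr', mode='r')
had_nans = compress_zarr_dataset(data, './export/data.zip',
                                 compression='lz4', clevel=5)
if had_nans:
    logging.warning("NaN values detected in the exported data")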
def test_03_writer_send(self):
    with tempfile.NamedTemporaryFile() as tf:
        q_in = multiprocessing.Queue()
        q_out = multiprocessing.Queue()
        a = np.random.randint(0, np.iinfo(np.uint16).max, (4, 5, 6))
        writer = w.BlockWriter(tf.name, q_out, q_in, "zstd", 5)
        writer.start()
        writer.write(a, 1234)
        directory_offset, position, size = q_out.get()
        writer.close()
        self.assertEqual(directory_offset, 1234)
        self.assertEqual(position, 0)
        block = tf.file.read()
        self.assertEqual(len(block), size)
        a_out = np.frombuffer(Blosc("zstd", 5).decode(block),
                              a.dtype).reshape(a.shape)
        np.testing.assert_array_equal(a, a_out)
def band_at_timepoint_to_zarr(
        timepoint_fn,
        timepoint_number,
        band,
        band_number,
        *,
        out_zarrs=None,
        min_level_shape=(1024, 1024),
        num_timepoints=None,
        num_bands=None,
):
    basepath = os.path.splitext(os.path.basename(timepoint_fn))[0]
    path = basepath + '/' + basepath + '_' + band + '.tif'
    image = ziptiff2array(timepoint_fn, path)
    shape = image.shape
    dtype = image.dtype
    max_layer = np.log2(
        np.max(np.array(shape) / np.array(min_level_shape))
    ).astype(int)
    pyramid = pyramid_gaussian(image, max_layer=max_layer, downscale=DOWNSCALE)
    im_pyramid = list(pyramid)
    if isinstance(out_zarrs, str):
        fout_zarr = out_zarrs
        out_zarrs = []
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.SHUFFLE,
                           blocksize=0)
        for i in range(len(im_pyramid)):
            r, c = im_pyramid[i].shape
            out_zarrs.append(zarr.open(
                os.path.join(fout_zarr, str(i)),
                mode='a',
                shape=(num_timepoints, num_bands, 1, r, c),
                dtype=np.int16,
                chunks=(1, 1, 1, *min_level_shape),
                compressor=compressor,
            ))

    # for each resolution:
    for pyramid_level, downscaled in enumerate(im_pyramid):
        # convert back to int16
        downscaled = skimage.img_as_int(downscaled)
        # store into appropriate zarr
        out_zarrs[pyramid_level][timepoint_number, band_number, 0, :, :] = downscaled
    return out_zarrs
def write_vis(xds, outfile='vis.zarr', partition='part0', compressor=None, append=True):
    """
    Write xarray Visibility Dataset to zarr format on disk

    Parameters
    ----------
    xds : xarray.core.dataset.Dataset
        Visibility Dataset to write to disk
    outfile : str
        output filename, generally ends in .zarr
    partition : str
        Name of partition to write into outfile. Overwrites existing partition
        of same name. Default is 'part0'
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk
        using zarr. If None, the zstd compression algorithm is used with
        compression level 2.
    append : bool
        Append this partition in to an existing zarr directory. False will
        erase old zarr directory. Default=True

    Returns
    -------
    """
    import os
    from numcodecs import Blosc
    from itertools import cycle

    outfile = os.path.expanduser(outfile)
    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    # need to manually remove existing zarr directory (if any)
    if not append:
        tmp = os.system("rm -fr " + outfile)
        tmp = os.system("mkdir " + outfile)

    encoding = dict(zip(list(xds.data_vars),
                        cycle([{'compressor': compressor}])))
    xds.to_zarr(os.path.join(outfile, partition), mode='w', encoding=encoding)
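# Minimal usage sketch for write_vis; `xds` stands in for a converted visibility
# Dataset, and the partition name and output path are hypothetical.
import numpy as np
import xarray as xr
from numcodecs import Blosc

xds = xr.Dataset({'DATA': (('time', 'baseline', 'chan'),
                           np.zeros((10, 5, 64), dtype=np.float32))})
write_vis(xds, outfile='vis.zarr', partition='part0',
          compressor=Blosc(cname='lz4', clevel=1, shuffle=0), append=False)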