def sizefilter(filename_src, dataset_src, filename_tgt, dataset_tgt, thr, dat_file=None):
    srcf = zarr.open(filename_src, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    tgtf = zarr.open(filename_tgt, mode="a")
    tgtf.empty(
        name=dataset_tgt,
        shape=srcf[dataset_src].shape,
        compressor=numcodecs.GZip(6),
        dtype="uint64",
        chunks=srcf[dataset_src].chunks,
    )
    tgt = np.array(srcf[dataset_src][:])
    ids, counts = np.unique(tgt, return_counts=True)
    if dat_file is not None:
        np.savetxt(dat_file, counts, "%.4g")
    remove_ids = []
    for id, count in zip(ids, counts):
        if count <= thr:
            remove_ids.append(id)
    tgt[np.isin(tgt, remove_ids)] = BG_VAL
    tgtf[dataset_tgt][:] = tgt.astype(np.uint64)
    tgtf[dataset_tgt].attrs["offset"] = srcf[dataset_src].attrs["offset"]
def run(self):
    thrs = [127, 42]
    progress = 0.0
    self.set_progress_percentage(progress)
    for s in self.samples:
        filename = os.path.join(os.path.dirname(self.input().fn), s + ".n5")
        dataset_src = "clefts_cropped"
        dataset_tgt = "clefts_cropped_thr{0:}"
        f = zarr.open(filename, mode="a")
        for t in thrs:
            f.empty(
                name=dataset_tgt.format(t),
                shape=f[dataset_src].shape,
                compressor=numcodecs.GZip(6),
                dtype="uint8",
                chunks=f[dataset_src].chunks,
            )
            f[dataset_tgt.format(t)][:] = (f[dataset_src][:] > t).astype(np.uint8)
            f[dataset_tgt.format(t)].attrs["offset"] = f[dataset_src].attrs["offset"]
        progress += 100.0 / len(self.samples)
        try:
            self.set_progress_percentage(progress)
        except Exception:
            pass
    done = self.output().open("w")
    done.close()
def test_read_zarr(self):
    from z5py.dataset import Dataset
    dtypes = list(Dataset._dtype_dict.keys())
    zarr_compressors = {'blosc': numcodecs.Blosc(),
                        'zlib': numcodecs.Zlib(),
                        'raw': None,
                        'bzip2': numcodecs.BZ2()}
    # The conda-forge version of numcodecs is not up to date for python 3.5
    # and GZip is missing; that's why we check explicitly here to not fail the test.
    if hasattr(numcodecs, 'GZip'):
        zarr_compressors.update({'gzip': numcodecs.GZip()})

    zarr.open(self.path)
    for dtype in dtypes:
        for compression in zarr_compressors:
            data = np.random.randint(0, 127, size=self.shape).astype(dtype)
            # write the data with zarr
            key = 'test_%s_%s' % (dtype, compression)
            ar = zarr.open(os.path.join(self.path, key), mode='w',
                           shape=self.shape, chunks=self.chunks,
                           dtype=dtype, compressor=zarr_compressors[compression])
            ar[:] = data
            # read with z5py
            out = z5py.File(self.path)[key][:]
            self.assertEqual(data.shape, out.shape)
            self.assertTrue(np.allclose(data, out))
def slicefilter(filename_src, dataset_src, filename_tgt, dataset_tgt, thr, dat_file=None):
    srcf = zarr.open(filename_src, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    tgtf = zarr.open(filename_tgt, mode="a")
    tgtf.empty(
        name=dataset_tgt,
        shape=srcf[dataset_src].shape,
        compressor=numcodecs.GZip(6),
        dtype="uint64",
        chunks=srcf[dataset_src].chunks,
    )
    tgt = np.array(srcf[dataset_src][:])
    ids, relabeled = np.unique(tgt, return_inverse=True)
    relabeled = relabeled.reshape(tgt.shape) + 1
    if BG_VAL1 in ids:
        relabeled[tgt == BG_VAL1] = 0
    if BG_VAL2 in ids:
        relabeled[tgt == BG_VAL2] = 0
    obj_slices = scipy.ndimage.measurements.find_objects(relabeled)
    set_to_bg = []
    for k, obs in enumerate(obj_slices):
        # find_objects returns None for labels that are absent
        if obs is not None:
            if relabeled[obs].shape[0] <= thr:
                set_to_bg.append(k + 1)
    tgt[np.isin(relabeled, set_to_bg)] = 0
    tgtf[dataset_tgt][:] = tgt.astype(np.uint64)
    tgtf[dataset_tgt].attrs["offset"] = srcf[dataset_src].attrs["offset"]
def cc2(filename_src, dataset_src_high_thr, dataset_src_low_thr, filename_tgt, dataset_tgt):
    srcf = zarr.open(filename_src, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    assert (
        srcf[dataset_src_high_thr].attrs["offset"]
        == srcf[dataset_src_low_thr].attrs["offset"]
    )
    assert srcf[dataset_src_high_thr].shape == srcf[dataset_src_low_thr].shape
    tgtf = zarr.open(filename_tgt, mode="a")
    tgtf.empty(
        name=dataset_tgt,
        shape=srcf[dataset_src_high_thr].shape,
        compressor=numcodecs.GZip(6),
        dtype="uint64",
        chunks=srcf[dataset_src_high_thr].chunks,
    )
    data_high_thr = np.array(srcf[dataset_src_high_thr][:])
    data_low_thr = np.array(srcf[dataset_src_low_thr][:])
    tgt = np.ones(data_low_thr.shape, dtype=np.uint64)
    maxid = scipy.ndimage.label(data_low_thr, output=tgt)
    maxes = scipy.ndimage.maximum(
        data_high_thr, labels=tgt, index=list(range(1, maxid + 1))
    )
    maxes = np.array([0] + list(maxes))
    factors = maxes[tgt]
    tgt *= factors.astype(np.uint64)
    maxid = scipy.ndimage.label(tgt, output=tgt)
    tgtf[dataset_tgt][:] = tgt.astype(np.uint64)
    tgtf[dataset_tgt].attrs["offset"] = srcf[dataset_src_high_thr].attrs["offset"]
    tgtf[dataset_tgt].attrs["max_id"] = maxid
def crop_to_seg(filename_src, dataset_src, filename_tgt, dataset_tgt, offset, shape):
    srcf = zarr.open(filename_src, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    tgtf = zarr.open(filename_tgt, mode="a")
    chunk_size = tuple(min(c, s) for c, s in zip(srcf[dataset_src].chunks, shape))
    if os.path.exists(os.path.join(filename_tgt, dataset_tgt)):
        assert (
            tgtf[dataset_tgt].shape == shape
            and tgtf[dataset_tgt].dtype == srcf[dataset_src].dtype
            and tgtf[dataset_tgt].chunks == chunk_size
        )
        skip_ds_creation = True
    else:
        skip_ds_creation = False
    if not skip_ds_creation:
        tgtf.empty(
            name=dataset_tgt,
            shape=shape,
            compressor=numcodecs.GZip(6),
            dtype=srcf[dataset_src].dtype,
            chunks=chunk_size,
        )
    bb = tuple(slice(off, off + sh, None) for off, sh in zip(offset, shape))
    tgtf[dataset_tgt][:] = srcf[dataset_src][bb]
    tgtf[dataset_tgt].attrs["offset"] = offset[::-1]
def save_mask(data_file, data_dsname_raw, data_dsname_mask, chunks, cut_axis):
    f = zarr.open(data_file, "a")
    raw_ds = f[data_dsname_raw]
    mask_ds = f.empty(name=data_dsname_mask,
                      shape=raw_ds.shape,
                      chunks=(256, 256, 256),
                      dtype=np.uint64,
                      compressor=numcodecs.GZip(6))
    mask_ds.attrs["pixelResolution"] = raw_ds.attrs["pixelResolution"]
    start = (0, 0, 0)
    end = mask_ds.shape
    boundary = int(0.5 * mask_ds.shape[cut_axis])
    for z, y, x in itertools.product(range(start[0], end[0], chunks[0]),
                                     range(start[1], end[1], chunks[1]),
                                     range(start[2], end[2], chunks[2])):
        sl = (
            slice(z, min(z + chunks[0], end[0])),
            slice(y, min(y + chunks[1], end[1])),
            slice(x, min(x + chunks[2], end[2]))
        )
        if sl[cut_axis].stop <= boundary:
            mask_ds[sl] = np.ones(get_slice_shape(sl), dtype=np.uint64)
        elif boundary <= sl[cut_axis].start:
            mask_ds[sl] = np.zeros(get_slice_shape(sl), dtype=np.uint64)
        else:
            sl_before = tuple(
                slice(sl[k].start, sl[k].stop, sl[k].step) if k != cut_axis
                else slice(sl[k].start, boundary, sl[k].step)
                for k in range(3)
            )
            mask_ds[sl_before] = np.ones(get_slice_shape(sl_before), dtype=np.uint64)
            sl_after = tuple(
                slice(sl[k].start, sl[k].stop, sl[k].step) if k != cut_axis
                else slice(boundary, sl[k].stop, sl[k].step)
                for k in range(3)
            )
            mask_ds[sl_after] = np.zeros(get_slice_shape(sl_after), dtype=np.uint64)
def _decode_codec_metadata(meta: Mapping) -> Optional[Codec]:
    if meta is None:
        return None
    # only support gzip for now
    if meta['codec'] != 'https://purl.org/zarr/spec/codec/gzip/1.0':
        raise NotImplementedError
    codec = numcodecs.GZip(level=meta['configuration']['level'])
    return codec
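# A minimal usage sketch for _decode_codec_metadata (not from the original
# source): the metadata mapping below is an assumed example of the gzip codec
# entry that the function checks for.
meta = {
    "codec": "https://purl.org/zarr/spec/codec/gzip/1.0",
    "configuration": {"level": 6},
}
codec = _decode_codec_metadata(meta)
assert isinstance(codec, numcodecs.GZip)
assert codec.level == 6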
def add_ds(target, name, shape, dtype, chunks, resolution, offset, **kwargs):
    logging.info("Preparing dataset {0:} in {1:}".format(name, target.path))
    ds = target.empty(name=name, shape=shape, chunks=chunks, dtype=dtype,
                      compressor=numcodecs.GZip(6))
    ds.attrs["resolution"] = resolution
    ds.attrs["offset"] = offset
    for k in kwargs:
        ds.attrs[k] = kwargs[k]
    return ds
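# A minimal usage sketch for add_ds (not from the original source); the
# container name, dataset name, and attribute values are assumptions made
# purely for illustration.
target = zarr.open("example.n5", mode="a")
ds = add_ds(
    target,
    name="clefts",
    shape=(128, 128, 128),
    dtype="uint8",
    chunks=(64, 64, 64),
    resolution=(4, 4, 4),
    offset=(0, 0, 0),
)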
def run(self):
    thr_high = 127
    thr_low = 42
    dataset_src = "clefts_cropped_thr{0:}"
    dataset_tgt = "clefts_cropped_thr{0:}_cc{1:}".format(thr_high, thr_low)
    progress = 0.0
    self.set_progress_percentage(progress)
    for s in self.samples:
        filename = os.path.join(os.path.dirname(self.input().fn), s + ".n5")
        f = zarr.open(filename, mode="a")
        assert (
            f[dataset_src.format(thr_high)].attrs["offset"]
            == f[dataset_src.format(thr_low)].attrs["offset"]
        )
        assert (
            f[dataset_src.format(thr_high)].shape
            == f[dataset_src.format(thr_low)].shape
        )
        f.empty(
            name=dataset_tgt,
            shape=f[dataset_src.format(thr_high)].shape,
            compressor=numcodecs.GZip(6),
            dtype="uint64",
            chunks=f[dataset_src.format(thr_high)].chunks,
        )
        data_high_thr = np.array(f[dataset_src.format(thr_high)][:])
        data_low_thr = np.array(f[dataset_src.format(thr_low)][:])
        tgt = np.ones(data_low_thr.shape, dtype=np.uint64)
        maxid = scipy.ndimage.label(data_low_thr, output=tgt)
        maxes = scipy.ndimage.maximum(
            data_high_thr, labels=tgt, index=list(range(1, maxid + 1))
        )
        maxes = np.array([0] + list(maxes))
        factors = maxes[tgt]
        tgt *= factors.astype(np.uint64)
        maxid = scipy.ndimage.label(tgt, output=tgt)
        f[dataset_tgt][:] = tgt.astype(np.uint64)
        f[dataset_tgt].attrs["offset"] = f[dataset_src.format(thr_high)].attrs["offset"]
        f[dataset_tgt].attrs["max_id"] = maxid
        progress += 100.0 / len(self.samples)
        try:
            self.set_progress_percentage(progress)
        except Exception:
            pass
    done = self.output().open("w")
    done.close()
def cc(filename_src, dataset_src, filename_tgt, dataset_tgt):
    srcf = zarr.open(filename_src, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    tgtf = zarr.open(filename_tgt, mode="a")
    tgtf.empty(name=dataset_tgt,
               shape=srcf[dataset_src].shape,
               compressor=numcodecs.GZip(6),
               dtype="uint64",
               chunks=srcf[dataset_src].chunks)
    data = np.array(srcf[dataset_src][:])
    tgt = np.ones(data.shape, dtype=np.uint64)
    maxid = scipy.ndimage.label(data, output=tgt)
    tgtf[dataset_tgt][:] = tgt.astype(np.uint64)
    if "offset" in srcf[dataset_src].attrs.keys():
        tgtf[dataset_tgt].attrs["offset"] = srcf[dataset_src].attrs["offset"]
    tgtf[dataset_tgt].attrs["max_id"] = maxid
def threshold(filename_src, dataset_src, filename_tgt, dataset_tgt, thr):
    srcf = zarr.open(filename_src, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    tgtf = zarr.open(filename_tgt, mode="a")
    tgtf.create_dataset(
        name=dataset_tgt,
        shape=srcf[dataset_src].shape,
        compressor=numcodecs.GZip(6),
        dtype="uint8",
        chunks=srcf[dataset_src].chunks,
    )
    ds = srcf[dataset_src][:]
    print(np.sum(ds > thr))
    print(np.min(ds))
    print(np.max(ds))
    tgtf[dataset_tgt][:] = (srcf[dataset_src][:] > thr).astype(np.uint8)
    if "offset" in srcf[dataset_src].attrs.keys():
        tgtf[dataset_tgt].attrs["offset"] = srcf[dataset_src].attrs["offset"]
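# A minimal sketch (not from the original source) combining threshold and cc
# from above: binarize a prediction dataset at 127, then label the connected
# components of the result. All file and dataset names are assumptions.
threshold(
    filename_src="predictions.n5",
    dataset_src="clefts",
    filename_tgt="predictions.n5",
    dataset_tgt="clefts_thr127",
    thr=127,
)
cc(
    filename_src="predictions.n5",
    dataset_src="clefts_thr127",
    filename_tgt="predictions.n5",
    dataset_tgt="clefts_thr127_cc",
)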
def initialize_group(
    group_path: Pathlike,
    arrays: Sequence[NDArray[Any]],
    array_paths: Sequence[str],
    chunks: Sequence[int],
    group_attrs: Dict[str, Any] = {},
    compressor: Codec = numcodecs.GZip(-1),
    array_attrs: Optional[Sequence[Dict[str, Any]]] = None,
    modes: Tuple[AccessMode, AccessMode] = ("w", "w"),
    group_kwargs: Dict[str, Any] = {},
    array_kwargs: Dict[str, Any] = {},
) -> zarr.hierarchy.Group:
    group_access_mode, array_access_mode = modes
    group = access(group_path, mode=group_access_mode, attrs=group_attrs, **group_kwargs)
    if array_attrs is None:
        _array_attrs: Tuple[Dict[str, Any], ...] = ({},) * len(arrays)
    else:
        _array_attrs = array_attrs
    for name, arr, attrs, chnks in zip(array_paths, arrays, _array_attrs, chunks):
        path = os.path.join(group.path, name)
        z_arr = zarr.open_array(
            store=group.store,
            mode=array_access_mode,
            fill_value=0,
            path=path,
            shape=arr.shape,
            dtype=arr.dtype,
            chunks=chnks,
            compressor=compressor,
            **array_kwargs)
        z_arr.attrs.update(attrs)
    return group
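# A minimal usage sketch for initialize_group (not from the original source):
# two small arrays written into one group with the default GZip compressor.
# The group path and attribute values are assumptions; per the zip in the
# function body, `chunks` is one chunk spec per array.
raw = np.zeros((64, 64, 64), dtype="uint8")
labels = np.zeros((64, 64, 64), dtype="uint64")
group = initialize_group(
    "example.zarr",
    arrays=[raw, labels],
    array_paths=["raw", "labels"],
    chunks=[(32, 32, 32), (32, 32, 32)],
    array_attrs=[{"resolution": (4, 4, 4)}, {"resolution": (4, 4, 4)}],
)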
def run(self):
    progress = 0.0
    self.set_progress_percentage(progress)
    if "unaligned" in self.de:
        aligned = False
    else:
        aligned = True
    for s in self.samples:
        filename = os.path.join(os.path.dirname(self.input().fn), self.de, s + ".n5")
        datasets_src = ["clefts", "pre_dist", "post_dist"]
        datasets_tgt = ["clefts_cropped", "pre_dist_cropped", "post_dist_cropped"]
        off = offsets[s][aligned]
        sh = shapes[s][aligned]
        f = zarr.open(filename, mode="a")
        for dss, dst in zip(datasets_src, datasets_tgt):
            chunk_size = tuple(min(c, shi) for c, shi in zip(f[dss].chunks, sh))
            f.create_dataset(
                name=dst,
                shape=sh,
                compressor=numcodecs.GZip(6),
                dtype=f[dss].dtype,
                chunks=chunk_size,
            )
            bb = tuple(slice(o, o + shi, None) for o, shi in zip(off, sh))
            f[dst][:] = f[dss][bb]
            f[dst].attrs["offset"] = off[::-1]
            progress += 100.0 / (len(self.samples) * len(datasets_src))
            try:
                self.set_progress_percentage(progress)
            except Exception:
                pass
    done = self.output().open("w")
    done.close()
def set_mask_to_zero(
    filename_src,
    dataset_src,
    filename_mask,
    dataset_mask,
    filename_tgt,
    dataset_tgt,
    offset,
    shape,
):
    logging.info("setting mask to zero for " + filename_src + "/" + dataset_src)
    srcf = zarr.open(filename_src, mode="r")
    maskf = zarr.open(filename_mask, mode="r")
    if not os.path.exists(filename_tgt):
        os.makedirs(filename_tgt)
    tgtf = zarr.open(filename_tgt, mode="a")
    # grps = ""
    # for grp in dataset_tgt.split("/")[:-1]:
    #     grps += grp
    #     if not os.path.exists(os.path.join(filename_tgt, grps)):
    #         tgtf.create_group(grps)
    #     grps += "/"
    chunk_size = tuple(min(c, s) for c, s in zip(srcf[dataset_src].chunks, shape))
    tgtf.empty(
        dataset_tgt,
        shape=shape,
        compressor=numcodecs.GZip(6),
        dtype=srcf[dataset_src].dtype,
        chunks=chunk_size,
    )
    a = srcf[dataset_src][:]
    a[maskf[dataset_mask][:] == 0] = 0
    tgtf[dataset_tgt][:] = a
    tgtf[dataset_tgt].attrs["offset"] = offset[::-1]
import zarr
import numcodecs
from skimage.data import astronaut

# choose chunks s.t. we do have overhanging edge-chunks
CHUNKS = (100, 100, 1)
STR_TO_COMPRESSOR = {
    'gzip': numcodecs.GZip(),
    'blosc': numcodecs.Blosc(),
    'zlib': numcodecs.Zlib()
}


def generate_zarr_format(compressors=['gzip', 'blosc', 'zlib', None]):
    path = '../data/zarr.zr'
    im = astronaut()
    f = zarr.open(path)
    for compressor in compressors:
        name = compressor if compressor is not None else 'raw'
        compressor_impl = STR_TO_COMPRESSOR[compressor] if compressor is not None else None
        f.create_dataset(name, data=im, chunks=CHUNKS, compressor=compressor_impl)


# this needs PR https://github.com/zarr-developers/zarr/pull/309
def generate_n5_format(compressors=['gzip', None]):
    path = '../data/zarr.n5'
def prepare_cell_inference(n_jobs, raw_data_path, dataset_id, sigma, raw_ds,
                           setup_path, output_path, factor, min_sc, max_sc,
                           float_range, safe_scale, n_cpus, finish_interrupted):
    # assert os.path.exists(setup_path), "Path to experiment directory does not exist"
    # sys.path.append(setup_path)
    # import unet_template
    if raw_data_path.endswith('/'):
        raw_data_path = raw_data_path[:-1]
    assert os.path.exists(
        raw_data_path
    ), "Path to N5 dataset with raw data and mask does not exist"
    # assert os.path.exists(os.path.join(setup_path, "blur.meta"))
    rf = zarr.open(raw_data_path, mode="r")
    assert raw_ds in rf, "Raw data not present in N5 dataset"
    shape_vc = rf[raw_ds].shape
    output_dir, out_file = get_output_paths(raw_data_path, setup_path, output_path)

    if not finish_interrupted:
        names = blur_tf(size, sigma)
        input_shape_vc = Coordinate(size)
        output_shape_wc = Coordinate(size) * voxel_size
        output_shape_vc = Coordinate(size)
        chunk_shape_vc = Coordinate(size)
        chunk_shape_wc = output_shape_wc
        full_shape_wc = Coordinate(shape_vc) * voxel_size
        full_shape_vc_output = Coordinate(shape_vc)

        # offset file, e.g.
        # "(...)/setup01/HeLa_Cell2_4x4x4nm/offsets_volumes_masks_foreground_shape180x180x180.json"
        offset_filename = "offsets_{0:}_shape{1:}x{2:}x{3:}.json".format(
            mask_ds.replace("/", "_"), *output_shape_wc)
        offset_file = os.path.join(output_dir, offset_filename)

        # prepare datasets
        factor, scale, shift = get_contrast_adjustment(rf, raw_ds, factor, min_sc, max_sc)
        f = zarr.open(out_file)
        dataset_target_keys = ["raw_blurred"]
        for dstk in dataset_target_keys:
            if dstk not in f:
                ds = f.empty(name=dstk,
                             shape=full_shape_vc_output,
                             compressor=numcodecs.GZip(6),
                             dtype="uint8",
                             chunks=chunk_shape_vc)
            else:
                ds = f[dstk]
            ds.attrs["resolution"] = tuple(voxel_size)[::-1]
            ds.attrs["offset"] = (0, 0, 0)
            ds.attrs["raw_data_path"] = raw_data_path
            ds.attrs["raw_ds"] = raw_ds
            ds.attrs["parent_dataset_id"] = dataset_id
            ds.attrs["sigma"] = sigma
            ds.attrs["raw_scale"] = scale
            ds.attrs["raw_shift"] = shift
            ds.attrs["raw_normalize_factor"] = factor
            ds.attrs["float_range"] = float_range
            ds.attrs["safe_scale"] = safe_scale

        if not os.path.exists(offset_file):
            generate_full_list(offset_file, output_shape_wc, raw_data_path, raw_ds)

        shapes_file = os.path.join(
            setup_path, "shapes_steps_{0:}x{1:}x{2:}.json".format(*size))
        if not os.path.exists(shapes_file):
            shapes = {
                "input_shape_vc": tuple(int(isv) for isv in input_shape_vc),
                "output_shape_vc": tuple(int(osv) for osv in output_shape_vc),
                "chunk_shape_vc": tuple(int(csv) for csv in chunk_shape_vc)
            }
            with open(shapes_file, "w") as f:
                json.dump(shapes, f)

    p_proc = re.compile(r"list_gpu_\d+_\S+_processed.txt")
    print(any([p_proc.match(f) is not None for f in os.listdir(out_file)]))
    if any([p_proc.match(f) is not None for f in os.listdir(out_file)]):
        print("Redistributing offset lists over {0:} jobs".format(n_jobs))
        redistribute_offset_lists(list(range(n_jobs)), out_file)
    else:
        with open(offset_file, 'r') as f:
            offset_list = json.load(f)
        offset_list_from_precomputed(offset_list, list(range(n_jobs)), out_file)
    return input_shape_vc, output_shape_vc, chunk_shape_vc
def run(self):
    src = os.path.join(config_loader.get_config()["synapses"]["cremieval_path"],
                       "{0:}/{1:}.n5")
    tgt = os.path.join(os.path.dirname(self.input().fn), "{0:}", "{1:}.n5")
    output_shape = (71, 650, 650)
    gpu_list = []
    for i in range(8):
        # decode the bytes returned by Popen so the substring check works on str
        nvsmi = subprocess.Popen(
            "nvidia-smi -d PIDS -q -i {0:}".format(i),
            shell=True,
            stdout=subprocess.PIPE,
        ).stdout.read().decode()
        if "None" in nvsmi:
            gpu_list.append(i)
    completed = []
    for de in self.data_eval:
        for s in self.samples:
            srcf = zarr.open(src.format(de, s), mode="r")
            shape = srcf["volumes/raw"].shape
            tgtf = zarr.open(tgt.format(de, s), mode="a")
            if not os.path.exists(os.path.join(tgt.format(de, s), "clefts")):
                tgtf.empty(
                    name="clefts",
                    shape=shape,
                    compressor=numcodecs.GZip(6),
                    dtype="uint8",
                    chunks=output_shape,
                )
                completed.append(False)
            else:
                if self.check_completeness()[0]:
                    completed.append(True)
                else:
                    completed.append(False)
            if not os.path.exists(os.path.join(tgt.format(de, s), "pre_dist")):
                tgtf.empty(
                    name="pre_dist",
                    shape=shape,
                    compressor=numcodecs.GZip(6),
                    dtype="uint8",
                    chunks=output_shape,
                )
                completed.append(False)
            else:
                if self.check_completeness()[0]:
                    completed.append(True)
                else:
                    completed.append(False)
            if not os.path.exists(os.path.join(tgt.format(de, s), "post_dist")):
                tgtf.empty(
                    name="post_dist",
                    shape=shape,
                    compressor=numcodecs.GZip(6),
                    dtype="uint8",
                    chunks=output_shape,
                )
                completed.append(False)
            else:
                if self.check_completeness()[0]:
                    completed.append(True)
                else:
                    completed.append(False)
            get_offset_lists(
                shape, gpu_list, tgt.format(de, s), output_shape=output_shape
            )
    if all(completed):
        self.finish()
        return
    self.submit_inference(self.data_eval, gpu_list)
    reprocess_attempts = 0
    while reprocess_attempts < 4:
        complete, reprocess_list = self.check_completeness(gpu_list)
        if complete:
            self.finish()
            return
        else:
            self.set_status_message(
                "Reprocessing {0:}, try {1:}".format(
                    list(reprocess_list), reprocess_attempts
                )
            )
            self.submit_inference(tuple(reprocess_list), gpu_list)
            reprocess_attempts += 1
    if reprocess_attempts >= 4:
        raise AssertionError
def _ensure_datasets_exist(self, volume_config):
    dtype = volume_config["zarr"]["creation-settings"]["dtype"]
    create_if_necessary = volume_config["zarr"]["create-if-necessary"]
    writable = volume_config["zarr"]["writable"]
    if writable is None:
        writable = create_if_necessary

    mode = 'r'
    if writable:
        mode = 'a'
    self._filemode = mode

    block_shape = volume_config["zarr"]["creation-settings"]["chunk-shape"][::-1]
    global_offset = volume_config["zarr"]["global-offset"][::-1]
    bounding_box_zyx = np.array(volume_config["geometry"]["bounding-box"])[:, ::-1]
    creation_shape = np.array(volume_config["zarr"]["creation-settings"]["shape"][::-1])
    replace_default_entries(creation_shape, bounding_box_zyx[1] - global_offset)

    compression = volume_config["zarr"]["creation-settings"]["compression"]
    if compression == 'gzip':
        compressor = numcodecs.GZip()
    elif compression.startswith('blosc-'):
        cname = compression[len('blosc-'):]
        compressor = numcodecs.Blosc(cname)
    else:
        assert compression == "", f"Unimplemented compression: {compression}"
        compressor = None  # no compression

    if create_if_necessary:
        max_scale = volume_config["zarr"]["creation-settings"]["max-scale"]
        if max_scale == -1:
            if -1 in creation_shape:
                raise RuntimeError("Can't auto-determine the appropriate max-scale to create "
                                   "(or extend) the data with, because you didn't specify a "
                                   "volume creation shape (or bounding box).")
            max_scale = choose_pyramid_depth(creation_shape, 512)
        available_scales = [*range(1 + max_scale)]
    else:
        available_scales = volume_config["geometry"]["available-scales"]
        if not os.path.exists(self._path):
            raise RuntimeError(f"File does not exist: {self._path}\n"
                               "You did not specify 'create-if-necessary' in the config, "
                               "so I won't create it.")
        if self._dataset_name and not os.path.exists(f"{self._path}/{self._dataset_name}"):
            raise RuntimeError(f"Dataset does not exist: {self._path}/{self._dataset_name}\n"
                               "You did not specify 'create-if-necessary' in the config, "
                               "so I won't create it.")

    for scale in available_scales:
        if scale == 0:
            name = self._dataset_name
        else:
            name = self._dataset_name[:-1] + f'{scale}'

        if name not in self.zarr_file:
            if not writable:
                raise RuntimeError(f"Dataset for scale {scale} does not exist, and you "
                                   "didn't specify 'writable' in the config, so I won't create it.")
            if dtype == "auto":
                raise RuntimeError(f"Can't create Zarr array {self._path}/{self._dataset_name}: "
                                   "No dtype specified in the config.")

            # Use 128 if the user didn't specify a chunkshape
            replace_default_entries(block_shape, 3 * [128])

            # zarr misbehaves if the chunks are larger than the shape,
            # which could happen here if we aren't careful (for higher scales).
            scaled_shape = (creation_shape // (2 ** scale))
            chunks = np.minimum(scaled_shape, block_shape).tolist()
            if (chunks != block_shape) and (scale == 0):
                logger.warning(f"Block shape ({block_shape}) is too small for "
                               f"the dataset shape ({creation_shape}). Shrinking block shape.")

            self._zarr_datasets[scale] = self.zarr_file.create_dataset(
                name,
                shape=scaled_shape.tolist(),
                dtype=np.dtype(dtype),
                chunks=chunks,
                compressor=compressor)
import numpy as np
import numcodecs
import zarr


def write_n5(path, shape, block_size, compressor):
    store = zarr.N5Store(path)
    data = np.arange(np.prod(shape), dtype=np.uint16)
    data = data.reshape(shape)
    data_transpose = data.transpose()
    z = zarr.zeros(
        data_transpose.shape,
        chunks=block_size[::-1],
        store=store,
        dtype=data.dtype,
        overwrite=True,
        compressor=compressor)
    z[...] = data_transpose


write_n5(path='raw', shape=[5, 4], block_size=[3, 2], compressor=None)
write_n5(path='gzip', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.GZip())
write_n5(path='bzip2', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.BZ2())
write_n5(path='xz', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.LZMA(preset=4))
write_n5(path='blosc', shape=[5, 4], block_size=[3, 2], compressor=numcodecs.Blosc())
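# A minimal read-back check (not from the original source), assuming the
# 'gzip' container written above exists in the working directory. Because
# write_n5 stores the transposed array, the shape comes back as (4, 5).
z = zarr.open(zarr.N5Store('gzip'), mode='r')
print(z.shape, z.compressor)
print(z[...].transpose())  # restores the original (5, 4) layout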
from pathlib import Path
from typing import Tuple, List, Union
import logging
import time

import dask
import dask.array as da
import numcodecs
import zarr

from fst.pyramid import lazy_pyramid, get_downsampled_offset

OUTPUT_DTYPES = ("same", "uint8", "uint16")
OUTPUT_FMTS = ("n5",)
program_name = "dat_to_n5.py"
grid_spacing_unit = "nm"
max_chunksize = 1024
compressor = numcodecs.GZip(level=-1)
# all channels will be stored as subgroups under root_group_path
root_group_path = Path("volumes/raw/")
# raw data are stored z | c y x, we will split images into two channels along the channel axis
channel_dim = 0
downscale_factor = 2

# set up logging
logger = logging.getLogger(program_name)
c_handler = logging.StreamHandler()
c_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
c_handler.setFormatter(c_formatter)
logger.addHandler(c_handler)
logger.setLevel(logging.INFO)
def prepare_cell_inference(n_jobs, raw_data_path, dataset_id, iteration, raw_ds,
                           mask_ds, setup_path, output_path, factor, min_sc,
                           max_sc, float_range, safe_scale, n_cpus,
                           finish_interrupted):
    assert os.path.exists(setup_path), "Path to experiment directory does not exist"
    sys.path.append(setup_path)
    import setup_config
    if raw_data_path.endswith('/'):
        raw_data_path = raw_data_path[:-1]
    assert os.path.exists(
        raw_data_path
    ), "Path to N5 dataset with raw data and mask does not exist"
    assert os.path.exists(
        os.path.join(
            setup_path,
            "{0:}_train_checkpoint_{1:}.meta".format(setup_config.network_name, iteration)))
    assert os.path.exists(
        os.path.join(
            setup_path,
            "{0:}_train_checkpoint_{1:}.index".format(setup_config.network_name, iteration)))
    assert os.path.exists(
        os.path.join(
            setup_path,
            "{0:}_train_checkpoint_{1:}.data-00000-of-00001".format(
                setup_config.network_name, iteration)))
    assert os.path.exists(
        os.path.join(setup_path, setup_config.network_name + "_io_names.json"))
    rf = zarr.open(raw_data_path, mode="r")
    assert raw_ds in rf, "Raw data not present in N5 dataset"
    if mask_ds is not None:
        assert mask_ds in rf, "Mask data not present in N5 dataset"
    shape_vc = rf[raw_ds].shape
    output_dir, out_file = get_output_paths(raw_data_path, setup_path, output_path)

    if not finish_interrupted:
        net_name, input_shape_vc, output_shape_vc = setup_config.build_net(
            steps=setup_config.steps_inference, mode="inference")
        voxel_size_input = setup_config.voxel_size
        voxel_size_output = setup_config.voxel_size
        output_shape_wc = Coordinate(output_shape_vc) * voxel_size_output
        chunk_shape_vc = output_shape_vc
        chunk_shape_wc = Coordinate(output_shape_vc) * voxel_size_output
        full_shape_wc = Coordinate(shape_vc) * voxel_size_input
        full_shape_vc_output = full_shape_wc / voxel_size_output

        # offset file, e.g.
        # "(...)/setup01/HeLa_Cell2_4x4x4nm/offsets_volumes_masks_foreground_shape180x180x180.json"
        if mask_ds is not None:
            offset_filename = "offsets_{0:}_shape{1:}x{2:}x{3:}.json".format(
                mask_ds.replace("/", "_"), *output_shape_wc)
        else:
            offset_filename = "offsets_{0:}_shape{1:}x{2:}x{3:}.json".format(
                "nomask", *output_shape_wc)
        offset_file = os.path.join(output_dir, offset_filename)

        # prepare datasets
        factor, scale, shift = get_contrast_adjustment(rf, raw_ds, factor, min_sc, max_sc)
        f = zarr.open(out_file)
        for out_name in setup_config.output_names:
            if out_name not in f:
                ds = f.empty(name=out_name + "_predicted",
                             shape=full_shape_vc_output,
                             compressor=numcodecs.GZip(6),
                             dtype="uint8",
                             chunks=chunk_shape_vc)
            else:
                ds = f[out_name + "_predicted"]
            ds.attrs["resolution"] = tuple(voxel_size_output)[::-1]
            ds.attrs["offset"] = (0, 0, 0)
            ds.attrs["raw_data_path"] = raw_data_path
            ds.attrs["raw_ds"] = raw_ds
            ds.attrs["parent_dataset_id"] = dataset_id
            ds.attrs["iteration"] = iteration
            ds.attrs["raw_scale"] = scale
            ds.attrs["raw_shift"] = shift
            ds.attrs["raw_normalize_factor"] = factor
            ds.attrs["float_range"] = float_range
            ds.attrs["safe_scale"] = safe_scale

        if not os.path.exists(offset_file):
            if mask_ds is not None:
                generate_list_for_mask(offset_file, output_shape_wc, raw_data_path,
                                       mask_ds, n_cpus)
            else:
                generate_full_list(offset_file, output_shape_wc, raw_data_path, raw_ds)

        shapes_file = os.path.join(
            setup_path, "shapes_steps{0:}.json".format(setup_config.steps_inference))
        if not os.path.exists(shapes_file):
            shapes = {
                "input_shape_vc": tuple(int(isv) for isv in input_shape_vc),
                "output_shape_vc": tuple(int(osv) for osv in output_shape_vc),
                "chunk_shape_vc": tuple(int(csv) for csv in chunk_shape_vc)
            }
            with open(shapes_file, "w") as f:
                json.dump(shapes, f)

    p_proc = re.compile(r"list_gpu_\d+_\S+_processed.txt")
    print(any([p_proc.match(f) is not None for f in os.listdir(out_file)]))
    if any([p_proc.match(f) is not None for f in os.listdir(out_file)]):
        print("Redistributing offset lists over {0:} jobs".format(n_jobs))
        redistribute_offset_lists(list(range(n_jobs)), out_file)
    else:
        with open(offset_file, 'r') as f:
            offset_list = json.load(f)
        offset_list_from_precomputed(offset_list, list(range(n_jobs)), out_file)
    return input_shape_vc, output_shape_vc, chunk_shape_vc
help="Maximum distance for stardist computation", type=float, default=None) args = parser.parse_args() directory = args.directory max_dist = args.max_dist # generate a dataset with a binary sphere sphere = raster_geometry.sphere(200, 70).astype( np.uint64) # image size: 200, radius: 70 sphere2 = raster_geometry.sphere(200, 50).astype(np.uint64) sphere = (sphere + sphere2 - 1).astype(np.uint64) f = zarr.open(os.path.join(directory, "sphere.n5"), mode="a") f.create_dataset(name="sphere", shape=sphere.shape, compressor=numcodecs.GZip(6), dtype=sphere.dtype, chunks=(50, 50, 50), overwrite=True) f["sphere"].attrs["offset"] = (0, 0, 0) f["sphere"].attrs["resolution"] = (1, 1, 1) f["sphere"][:] = sphere # declare arrays to use labels = gp.ArrayKey("LABELS") stardists = gp.ArrayKey("STARDIST") # prepare requests scan_request = gp.BatchRequest() scan_request[stardists] = gp.Roi((0, 0, 0), (50, 50, 50)) request = gp.BatchRequest()
def prepare_cell_inference(
    n_jobs: int,
    raw_data_path: str,
    dataset_id: str,
    iteration: int,
    raw_ds: str,
    mask_ds: Optional[str],
    setup_path: str,
    output_path: Optional[str],
    factor: Optional[int],
    min_sc: Optional[int],
    max_sc: Optional[int],
    float_range: Tuple[int, int],
    safe_scale: bool,
    n_cpus: int,
    finish_interrupted: bool,
    resolution: Optional[Tuple[int, int, int]] = None
) -> Tuple[Coordinate, Coordinate, Coordinate]:
    """
    Set up output directories for inference, prepare offset lists and read input shape,
    output shape and chunk shape from the network setup.

    Args:
        n_jobs: Number of jobs to split inference over.
        raw_data_path: Path to n5 container that contains raw data.
        dataset_id: Identifier of parent dataset.
        iteration: Iteration to pull inference for.
        raw_ds: Dataset in n5 container (`raw_data_path`) for raw data.
        mask_ds: Dataset in n5 container (`raw_data_path`) for mask data. Can be None
            if no mask exists.
        setup_path: Path containing setup.
        output_path: N5 container to save output to, autogenerated if None.
        factor: Factor to normalize raw data by, inferred from datatype if None.
        min_sc: Minimum intensity (mapped to -1).
        max_sc: Maximum intensity (mapped to 1).
        float_range: Range of output floats for conversion to uint8.
        safe_scale: If True, values are scaled such that all values within `float_range`
            fall within (0, 255) and are not cropped. If False, values at the lower end
            of `float_range` may be scaled to < 0 and then cropped to 0.
        n_cpus: Number of cpus to use per job.
        finish_interrupted: Whether this run is finishing an interrupted inference job.
        resolution: Resolution of raw data, inferred from metadata if None.

    Returns:
        Input shape, output shape and chunk shape in voxel coordinates.
    """
    # todo: use setup_utils instead
    assert os.path.exists(setup_path), "Path to experiment directory does not exist"
    sys.path.append(setup_path)
    import unet_template
    if raw_data_path.endswith('/'):
        raw_data_path = raw_data_path[:-1]
    assert os.path.exists(
        raw_data_path
    ), "Path to N5 dataset with raw data and mask does not exist"
    assert os.path.exists(
        os.path.join(setup_path, "unet_train_checkpoint_{0:}.meta".format(iteration)))
    assert os.path.exists(
        os.path.join(setup_path, "unet_train_checkpoint_{0:}.index".format(iteration)))
    assert os.path.exists(
        os.path.join(
            setup_path,
            "unet_train_checkpoint_{0:}.data-00000-of-00001".format(iteration)))
    assert os.path.exists(os.path.join(setup_path, "net_io_names.json"))
    rf = zarr.open(raw_data_path, mode="r")
    assert raw_ds in rf, "Raw data not present in N5 dataset"
    if mask_ds is not None:
        assert mask_ds in rf, "Mask data not present in N5 dataset"
    shape_vc = rf[raw_ds].shape
    output_dir, out_file = get_output_paths(raw_data_path, setup_path, output_path,
                                            iteration)

    if not finish_interrupted:
        net_name, input_shape_vc, output_shape_vc = unet_template.build_net(
            steps=unet_template.steps_inference, mode="inference")
        voxel_size_input = unet_template.voxel_size_input
        voxel_size_output = unet_template.voxel_size
        output_shape_wc = Coordinate(output_shape_vc) * voxel_size_output
        chunk_shape_vc = output_shape_vc
        chunk_shape_wc = Coordinate(output_shape_vc) * voxel_size_output
        full_shape_wc = Coordinate(shape_vc) * voxel_size_input
        full_shape_vc_output = full_shape_wc / voxel_size_output

        # offset file, e.g.
        # "(...)/setup01/HeLa_Cell2_4x4x4nm/offsets_volumes_masks_foreground_shape180x180x180.json"
        if mask_ds is not None:
            offset_filename = "offsets_{0:}_shape{1:}x{2:}x{3:}.json".format(
                mask_ds.replace("/", "_"), *output_shape_wc)
        else:
            offset_filename = "offsets_{0:}_shape{1:}x{2:}x{3:}.json".format(
                "nomask", *output_shape_wc)
        offset_file = os.path.join(output_dir, offset_filename)

        # prepare datasets
        factor, scale, shift = get_contrast_adjustment(rf, raw_ds, factor, min_sc, max_sc)
        f = zarr.open(out_file)
        for label in unet_template.labels:
            if label.labelname not in f:
                ds = f.empty(name=label.labelname,
                             shape=full_shape_vc_output,
                             compressor=numcodecs.GZip(6),
                             dtype="uint8",
                             chunks=chunk_shape_vc)
            else:
                ds = f[label.labelname]
            ds.attrs["resolution"] = tuple(voxel_size_output)[::-1]
            ds.attrs["offset"] = (0, 0, 0)
            ds.attrs["raw_data_path"] = raw_data_path
            ds.attrs["raw_ds"] = raw_ds
            ds.attrs["parent_dataset_id"] = dataset_id
            ds.attrs["iteration"] = iteration
            ds.attrs["raw_scale"] = scale
            ds.attrs["raw_shift"] = shift
            ds.attrs["raw_normalize_factor"] = factor
            ds.attrs["float_range"] = float_range
            ds.attrs["safe_scale"] = safe_scale
            ds.attrs["mask_ds"] = mask_ds

        if not os.path.exists(offset_file):
            if mask_ds is not None:
                generate_list_for_mask(offset_file, output_shape_wc, raw_data_path,
                                       mask_ds, n_cpus)
            else:
                generate_full_list(offset_file, output_shape_wc, raw_data_path, raw_ds,
                                   raw_voxel_size=resolution)

        shapes_file = os.path.join(
            setup_path, "shapes_steps{0:}.json".format(unet_template.steps_inference))
        if not os.path.exists(shapes_file):
            shapes = {
                "input_shape_vc": tuple(int(isv) for isv in input_shape_vc),
                "output_shape_vc": tuple(int(osv) for osv in output_shape_vc),
                "chunk_shape_vc": tuple(int(csv) for csv in chunk_shape_vc)
            }
            with open(shapes_file, "w") as f:
                json.dump(shapes, f)

    p_proc = re.compile(r"list_gpu_\d+_\S+_processed.txt")
    print(any([p_proc.match(f) is not None for f in os.listdir(out_file)]))
    if any([p_proc.match(f) is not None for f in os.listdir(out_file)]):
        print("Redistributing offset lists over {0:} jobs".format(n_jobs))
        redistribute_offset_lists(list(range(n_jobs)), out_file)
    else:
        with open(offset_file, 'r') as f:
            offset_list = json.load(f)
        offset_list_from_precomputed(offset_list, list(range(n_jobs)), out_file)
    return input_shape_vc, output_shape_vc, chunk_shape_vc