def test_open():
    path = tempfile.mktemp()
    atexit.register(
        lambda: shutil.rmtree(path) if os.path.exists(path) else None
    )
    z = open(path, mode='w', shape=100, chunks=10, dtype='i4')
    z[:] = 42
    eq((100,), z.shape)
    eq((10,), z.chunks)
    assert_array_equal(np.full(100, fill_value=42, dtype='i4'), z[:])
    z2 = open(path, mode='r')
    eq((100,), z2.shape)
    eq((10,), z2.chunks)
    assert_array_equal(z[:], z2[:])

def _create_array(self, data, **kwargs):
    # determine chunks
    chunks = default_chunks(data)
    kwargs.setdefault('chunks', chunks)

    # create array
    if 'path' in kwargs:
        kwargs['mode'] = 'w'
        kwargs['shape'] = data.shape
        # ensure dtype is specified
        dtype = kwargs.get('dtype', None)
        if not dtype:
            kwargs['dtype'] = data.dtype
        z = zarr.open(**kwargs)
        z[:] = data
    else:
        z = zarr.array(data, **kwargs)
    return z

def test_index_image(Image, image_search, tmp_path):
    """Test that indexing images is working correctly"""
    tmp_storage = zarr.open(
        str(tmp_path / 'tmp.zarr'),
        mode='a',
        shape=image_search.storage['/image_features'].shape,
        chunks=image_search.storage['/image_features'].chunks,
        dtype=np.float32)
    tmp_storage[:] = image_search.storage["/image_features"][:]

    # choose a random image to delete from the image index
    image_to_be_deleted = Image.query.order_by(func.random()).first()
    image_search.delete_index(image_to_be_deleted)
    assert np.sum(np.any(tmp_storage[:] != 0, axis=1)) - 1 == np.sum(
        np.any(image_search.storage['/image_features'][:] != 0, axis=1))

    image_search.index_model(Image, threaded=False)  # index all missing images
    assert np.sum(np.any(tmp_storage[:] != 0, axis=1)) == np.sum(
        np.any(image_search.storage['/image_features'][:] != 0, axis=1))

def execute(
    array,
    write_path=None,
    cluster_kwargs={},
    worker_buffer=4,
):
    """Compute a Dask array on a temporary cluster, optionally writing the
    result to a Zarr store on disk instead of returning it in memory."""

    # start the cluster
    with ClusterWrap.cluster(**cluster_kwargs) as cluster:

        # print dashboard url
        print("cluster dashboard link: ", cluster.get_dashboard())
        sys.stdout.flush()

        # scale cluster based on array chunks and buffer
        nchunks = np.prod(array.numblocks)
        cluster.scale_cluster(nchunks + worker_buffer)

        # if the user wants to write result to disk
        if write_path:
            compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
            array_disk = zarr.open(
                write_path, 'w',
                shape=array.shape,
                chunks=array.chunksize,
                dtype=array.dtype,
                compressor=compressor,
            )
            da.to_zarr(array, array_disk)
            return array_disk

        # if the user wants the result back in memory
        else:
            return array.compute()

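# A minimal, hypothetical usage sketch for `execute` above. The array shape,
# chunking, and output path are illustrative assumptions, not values from the
# original project; it also assumes ClusterWrap is importable in this scope.
import dask.array as da

if __name__ == "__main__":
    arr = da.random.random((1024, 1024), chunks=(256, 256))
    # With write_path set, the on-disk zarr array is returned instead of an
    # in-memory numpy result.
    result = execute(
        arr,
        write_path="result.zarr",   # hypothetical output location
        cluster_kwargs={},          # forwarded to ClusterWrap.cluster
        worker_buffer=4,
    )
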
def getMetaData(self, key=None):
    iZTable = zarr.open(self._FactorDB.MainDir + os.sep + self.Name, mode="r")
    with self._FactorDB._DataLock:
        if key is not None:
            if key not in iZTable.attrs:
                return None
            MetaData = iZTable.attrs[key]
            if isinstance(MetaData, dict):
                Type = MetaData.get("_Type")
                if Type == "Array":
                    return np.array(MetaData["List"])
                elif Type == "Series":
                    return pd.read_json(MetaData["Json"], typ="series")
                elif Type == "DataFrame":
                    return pd.read_json(MetaData["Json"], typ="frame")
                else:
                    return MetaData
            else:
                return MetaData
        MetaData = {}
        for iKey in iZTable.attrs:
            MetaData[iKey] = self.getMetaData(key=iKey)
        return MetaData

def merge_and_store(i, src, dst):
    """
    Merge a dimension of the Dask Array and store the results.

    Parameters
    ----------
    i : int
        The block number of the final array.
    src, dst : path-like
        The files to read from and write to.
    """
    mapper = get_mapper(src)
    A = zarr.Array(mapper, read_only=True)
    zchunks = (A.shape[0], ) + A.chunks[1:]
    store = zarr.open(dst, mode="a", shape=A.shape, chunks=zchunks, dtype=A.dtype)
    ochunks = da.core.normalize_chunks(zchunks, A.shape)
    slices = da.core.slices_from_chunks(ochunks)
    store[slices[i]] = A[slices[i]]

def batchwise_to_zarr(
    arr: dask.array.core.Array,
    zarr_dir_name: str,
    rm: bool = False,
    batch_size: int = 1,
):
    dir_p = Path(zarr_dir_name)
    if dir_p.exists():
        if rm:
            print("##########################################")
            print("removing " + str(dir_p))
            shutil.rmtree(dir_p)
        else:
            print("##########################################")
            print(str(dir_p) + " already exists")
            return

    if False:  # arr.nbytes < 8 * 1024 ** 3:
        # If the array fits into memory, a direct call to the to_zarr method
        # is possible (although it seems to imply a compute() for the whole
        # array, or at least a part that is too big to handle for bigger arrays).
        arr.to_zarr(zarr_dir_name)
    else:
        # If the array is bigger than memory, we explicitly compute a part of
        # it and write it to the zarr array. This takes longer but gives us
        # control over the memory usage.
        z = zr.open(zarr_dir_name, mode="w", shape=arr.shape, chunks=arr.chunksize)
        # ncores = 32
        slices = batchSlices(arr.shape[-1], batch_size)
        print("result shape:", arr.shape)
        # print(629, slices)
        # for s in slices:
        for s in tqdm(slices):  # Holger
            z[..., s] = arr[..., s].compute()

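# `batchSlices` is used above but not defined in this snippet. A minimal
# sketch, assuming it simply splits the last axis into contiguous slices of
# at most `batch_size` elements:
def batchSlices(n: int, batch_size: int = 1):
    # e.g. n=5, batch_size=2 -> [slice(0, 2), slice(2, 4), slice(4, 5)]
    return [slice(start, min(start + batch_size, n))
            for start in range(0, n, batch_size)]
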
def __init__(self, cr: CrReader, zarr_fn: str, chunk_size=(1000, 1000),
             dtype: str = 'uint32'):
    """
    Args:
        cr: A CrReader object, containing the Cellranger data.
        zarr_fn: The file name for the Zarr hierarchy.
        chunk_size: The requested size of chunks to load into memory and process.
        dtype: The dtype of the data.
    """
    self.cr = cr
    self.fn = zarr_fn
    self.chunkSizes = chunk_size
    self.z = zarr.open(self.fn, mode='w')
    self._ini_cell_data()
    for assay_name in set(self.cr.assayFeats.columns):
        create_zarr_count_assay(self.z, assay_name, chunk_size,
                                self.cr.nCells,
                                self.cr.feature_ids(assay_name),
                                self.cr.feature_names(assay_name),
                                dtype)

def read_series(dataset: zarr.Array) -> Union[np.ndarray, pd.Categorical]:
    if "categories" in dataset.attrs:
        categories = dataset.attrs["categories"]
        if isinstance(categories, str):
            categories_key = categories
            parent_name = dataset.name.rstrip(dataset.basename)
            parent = zarr.open(dataset.store)[parent_name]
            categories_dset = parent[categories_key]
            categories = categories_dset[...]
            ordered = categories_dset.attrs.get("ordered", False)
        else:
            # TODO: remove this code at some point post 0.7
            # TODO: Add tests for this
            warn(
                f"Your file {str(dataset.file.name)!r} has invalid categorical "
                "encodings due to being written from a development version of "
                "AnnData. Rewrite the file to ensure you can read it in the future.",
                FutureWarning,
            )
            # older files do not store the ordered flag; fall back to False
            ordered = False
        return pd.Categorical.from_codes(dataset[...], categories, ordered=ordered)
    else:
        return dataset[...]

def __enter__(self):
    if self.separate:
        if self.out_block_type.lower() == 'zarr':
            self.root = zarr.open(self.zarr_file, mode='w')
    else:
        if 'compress' in self.kwargs:
            logger.warning('\nCannot write concurrently to a compressed raster when using a combination of processes and threads.\nTherefore, compression will be applied after the initial write.')
            del self.kwargs['compress']

        # An alternative here is to leave the writeable object open as self.
        # However, this does not seem to work when used within a Dask
        # client environment because the `self.dst_` object cannot be pickled.

        # Create the output file
        with rio.open(self.filename, mode='w', **self.kwargs) as dst_:
            pass

    return self

def generate_mel_cache(self, audio_params):
    record_ids_list = list(
        sorted(set(self.data_all[SampleDataset.k_recording_id])))
    file_path = self.get_file_path(record_ids_list[0])
    single_item = read_as_melspectrogram(audio_params, file_path, None)
    dataset_shape = (len(record_ids_list), ) + single_item.shape
    chunks = (1, ) + single_item.shape
    zarr_group_name = self.gen_group_name(audio_params)
    zarr_root = zarr.open(self.folder_path + '_cache.zarr', mode='a')
    zarr_mel = zarr.convenience.open(str(
        Path(zarr_root.store.path).joinpath(zarr_group_name)), mode='a')
    stored_records = zarr_mel.attrs.get('record_ids_list', [])
    if stored_records != record_ids_list:
        shutil.rmtree(Path(zarr_root.store.path).joinpath(zarr_group_name))
        zarr_mel = zarr_root.create_dataset(zarr_group_name,
                                            shape=dataset_shape,
                                            dtype=np.float16,
                                            chunks=chunks)
        record_path_list = list(map(self.get_file_path, record_ids_list))
        map_iterator = zip(record_path_list,
                           (audio_params, ) * len(record_path_list))
        with Pool(multiprocessing.cpu_count() // 2) as pool:
            with tqdm(desc=f"Preparing mel cache [{zarr_group_name}]",
                      total=len(record_ids_list)) as t:
                for record_id, record_mel in pool.imap(
                        get_mel, map_iterator):  # type: str, np.ndarray
                    idx = record_ids_list.index(record_id)
                    zarr_mel[idx, ...] = record_mel.astype(np.float16)
                    t.update()
        zarr_mel.attrs['record_ids_list'] = record_ids_list
    return zarr.convenience.open(str(
        Path(zarr_root.store.path).joinpath(zarr_group_name)), mode='r')

def getFactorMetaData(self, factor_names=None, key=None):
    if factor_names is None:
        factor_names = self.FactorNames
    elif set(factor_names).isdisjoint(self.FactorNames):
        return super().getFactorMetaData(factor_names=factor_names, key=key)
    if key == "DataType":
        return self._DataType.loc[factor_names]
    with self._FactorDB._DataLock:
        MetaData = {}
        ZTable = zarr.open(self._FactorDB.MainDir + os.sep + self.Name, mode="r")
        for iFactorName in factor_names:
            if iFactorName in self.FactorNames:
                iZFactor = ZTable[iFactorName]
                if key is None:
                    MetaData[iFactorName] = pd.Series(iZFactor.attrs)
                elif key in iZFactor.attrs:
                    MetaData[iFactorName] = iZFactor.attrs[key]
    if not MetaData:
        return super().getFactorMetaData(factor_names=factor_names, key=key)
    if key is None:
        return pd.DataFrame(MetaData).loc[:, factor_names]
    else:
        return pd.Series(MetaData).loc[factor_names]

def _write_segment_info(self):
    """ This function creates the info file needed to segment the image """
    if self.image_type != 'segmentation':
        raise TypeError(
            'The NeuroglancerWriter object must have image_type = "segmentation" to use write_segment_info.'
        )

    op = pathlib.Path(self.base_path).joinpath("infodir")
    op.mkdir(exist_ok=True)
    op = op.joinpath("info")

    # Get the labels
    root = zarr.open(str(self.base_path.joinpath("labels.zarr")))
    labels = set()
    for d in root.array_keys():
        labels = labels.union(set(root[d][:].squeeze().tolist()))

    inlineinfo = {
        "ids": [str(item) for item in labels],
        "properties": [
            {
                "id": "label",
                "type": "label",
                "values": [str(item) for item in labels]
            },
            {
                "id": "description",
                "type": "label",
                "values": [str(item) for item in labels]
            }
        ]
    }

    info = {
        "@type": "neuroglancer_segment_properties",
        "inline": inlineinfo
    }

    # writing all the information into the file
    with open(op, 'w') as writer:
        json.dump(info, writer, indent=2)

def test_rechunk_array(tmp_path, shape, source_chunks, dtype, dims,
                       target_chunks, max_mem):
    ### Create source array ###
    store_source = str(tmp_path / "source.zarr")
    source_array = zarr.ones(shape, chunks=source_chunks, dtype=dtype,
                             store=store_source)
    # add some attributes
    source_array.attrs["foo"] = "bar"
    if dims:
        source_array.attrs[_DIMENSION_KEY] = dims

    ### Create targets ###
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")

    delayed = api.rechunk(source_array, target_chunks, max_mem, target_store,
                          temp_store=temp_store)
    assert isinstance(delayed, api.Rechunked)

    target_array = zarr.open(target_store)

    if isinstance(target_chunks, dict):
        target_chunks_list = [target_chunks[d] for d in dims]
    else:
        target_chunks_list = target_chunks

    assert target_array.chunks == tuple(target_chunks_list)
    assert dict(source_array.attrs) == dict(target_array.attrs)

    result = delayed.execute()
    assert isinstance(result, zarr.Array)
    a_tar = dsa.from_zarr(target_array)
    assert dsa.equal(a_tar, 1).all().compute()

def validate_gp(self, pattern):
    min_supp = self.d_set.thd_supp
    n = self.d_set.attr_size
    gen_pattern = GP()
    z_root = zarr.open(self.d_set.z_file, 'r')
    grp_name = 'dataset/' + self.d_set.step_name + '/rank_matrix'
    ranks = z_root[grp_name][:]  # [:] TO BE REMOVED
    main_bin = ranks[:, pattern.gradual_items[0].attribute_col]
    for i in range(len(pattern.gradual_items)):
        gi = pattern.gradual_items[i]
        if i == 0:
            if gi.is_decrement():
                main_bin = np.where(main_bin == 0.5, 1,
                                    np.where(main_bin == 1, 0.5, 0))
            gen_pattern.add_gradual_item(gi)
            continue
        else:
            bin_2 = ranks[:, gi.attribute_col].copy()
            if gi.is_decrement():
                bin_2 = np.where(bin_2 == 0.5, 1, np.where(bin_2 == 1, 0.5, 0))
            # Rank multiplication
            temp_bin = np.where(main_bin == bin_2, main_bin, 0)
            # print(str(main_bin) + ' + ' + str(bin_2) + ' = ' + str(temp_bin))
            supp = float(np.count_nonzero(temp_bin)) / float(n * (n - 1.0) / 2.0)
            if supp >= min_supp:
                main_bin = temp_bin.copy()
                gen_pattern.add_gradual_item(gi)
                gen_pattern.set_support(supp)
    if len(gen_pattern.gradual_items) <= 1:
        return pattern
    else:
        return gen_pattern

def __init__(self, filename, ds_name):
    self.filename = filename
    self.ds_name = ds_name

    ds = zarr.open(filename)[ds_name]

    self.voxel_size = gp.Coordinate(ds.attrs['resolution'])
    self.spatial_dims = len(self.voxel_size)

    if 'offset' in ds.attrs:
        self.offset = gp.Coordinate(ds.attrs['offset'])
    else:
        self.offset = gp.Coordinate((0, ) * self.spatial_dims)

    self.shape = gp.Coordinate(ds.shape)
    self.spatial_shape = gp.Coordinate(self.shape[-self.spatial_dims:])
    self.roi = gp.Roi(self.offset, self.spatial_shape * self.voxel_size)

    # map axis name -> dimension index
    self.axes = {a: d for d, a in enumerate(ds.attrs['axes'])}

    if 'c' in self.axes:
        self.num_channels = self.shape[self.axes['c']]
    else:
        self.num_channels = 0

    if 's' in self.axes:
        self.num_samples = self.shape[self.axes['s']]
    else:
        self.num_samples = 0

    # gt specific
    if 'num_classes' in ds.attrs:
        self.num_classes = ds.attrs['num_classes']
    else:
        self.num_classes = 0

    if 'background_label' in ds.attrs:
        self.background_label = ds.attrs['background_label']
    else:
        self.background_label = None

def get_validation_errors(
    self,
    schema_version: Optional[str] = None,
    devel_debug: bool = False,
) -> List[str]:
    try:
        data = zarr.open(self.filepath)
    except Exception as e:
        if devel_debug:
            raise
        lgr.warning(
            "Error opening %s: %s: %s",
            self.filepath,
            type(e).__name__,
            e,
            extra={"validating": True},
        )
        return [str(e)]
    if isinstance(data, zarr.Group) and not data:
        msg = "Zarr group is empty"
        if devel_debug:
            raise ValueError(msg)
        lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True})
        return [msg]
    try:
        next(self.filepath.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH)))
    except StopIteration:
        pass
    else:
        msg = f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep"
        if devel_debug:
            raise ValueError(msg)
        lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True})
        return [msg]
    # TODO: Should this be appended to the above errors?
    return super().get_validation_errors(
        schema_version=schema_version, devel_debug=devel_debug
    )

def _open_zarr_root(self, path):  # TODO: Use case where user opens an already HCS-store?
    """
    Change the current Zarr root to an existing store.
    If the store doesn't exist, raise an error.

    Parameters
    ----------
    path : (str) path to store. Must end in .zarr

    Returns
    -------

    """
    if os.path.exists(path):
        self.store = zarr.open(path)
        self.__root_store_path = path
    else:
        raise FileNotFoundError(
            f'No store found at {path}, check spelling or create new store with create_zarr'
        )

def napari_get_reader(path):
    """Implementation of the napari_get_reader hook specification.

    Parameters
    ----------
    path : str or list of str
        Path to file, or list of paths.

    Returns
    -------
    function or None
        If the path is a recognized format, return a function that accepts the
        same path or list of paths, and returns a list of layer data tuples.
    """
    # Inspect dataset
    path = Path(path)
    dataset = zarr.open(path.as_posix(), mode='r')

    # If dataset is a full dataset return reader
    if dataset.attrs['waver'] and dataset.attrs['dataset']:
        return load_simulation_dataset
    else:
        return None

def encode_dset(model, hparams, dset_path, emb_path):
    dataset = TripleEmbeddingDataset(dset_path, emb_path)
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    # model props
    model_props = get_model_properties(hparams)
    print(model_props)

    # encode
    z = zarr.open(str(dset_path), "r+")
    out_shape = (len(z.id), int(hparams["model_n"]["n"][-1]))
    z_result = {}
    for k in ["query", "pos", "neg"]:
        z_result[k] = z.zeros(
            f"{k}/result/{model_props}",
            shape=out_shape,
            chunks=(args.batch_size, None),
            overwrite=True,
            compressor=Zstd(),
        )

def __init__(self, img_directory, whole_dataset=False, training=True,
             transform=None):
    super(IDR0017_FullImgs_ZarrDataset, self).__init__()
    self.img_directory = img_directory
    self.training = training
    self.transform = transform

    img_dataset = zarr.open(img_directory, mode="r")
    img_array = img_dataset["images"]
    assert len(img_array) > 0, "No images found in path"
    self.data = img_array

    if whole_dataset:
        self.idxs = list(range(len(img_array)))
    else:
        train_idxs, test_idxs = train_test_split(
            list(range(len(img_array))),
            test_size=0.1,
            random_state=42,
            shuffle=True)
        self.idxs = train_idxs if training else test_idxs

def load_sim_data(load_path, use_zarr=False):
    """Loads simulation data from the given folder.

    Parameters
    ----------
    load_path : str
        Path of the simulation data folder.
    use_zarr : bool, optional
        If True, the simulation data will be given as a zarr array, rather
        than as a numpy array. The former is useful if the data is very large.

    Returns
    -------
    tuple
        A tuple `(node_mappings, cnode_mappings, ts, X_states)`, containing
        all simulation data. `X_states` is either an `np.ndarray` or a
        `zarr.core.Array`. If `use_zarr=True`, the latter will be given.
    """
    node_mappings_path = 'node_mappings.pkl'
    cnode_mappings_path = 'cnode_mappings.pkl'
    ts_path = 'ts.npy'
    X_states_path = 'X_states.zarr'

    node_mappings = pickle.load(
        open("%s/%s" % (load_path, node_mappings_path), "rb"))
    cnode_mappings = pickle.load(
        open("%s/%s" % (load_path, cnode_mappings_path), "rb"))
    ts = np.load("%s/%s" % (load_path, ts_path))
    X_states = zarr.open("%s/%s" % (load_path, X_states_path),
                         chunks=(len(ts), 1))
    if not use_zarr:
        X_states = X_states[:]

    sim_data = (node_mappings, cnode_mappings, ts, X_states)
    return sim_data

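# Hypothetical usage sketch for `load_sim_data`; the folder name is an
# illustrative assumption, not taken from the original project. With
# use_zarr=True, X_states stays on disk and only the sliced part is read.
node_mappings, cnode_mappings, ts, X_states = load_sim_data(
    "sim_output", use_zarr=True)
first_node_trajectory = X_states[:, 0]  # reads a single column from the store
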
def run(self):
    progress = 0.0
    self.set_progress_percentage(progress)
    if "unaligned" in self.de:
        aligned = False
    else:
        aligned = True
    for s in self.samples:
        filename = os.path.join(
            os.path.dirname(self.input()[0].fn), self.de, s + ".n5"
        )
        datasets_src = ["clefts", "pre_dist", "post_dist"]
        datasets_tgt = ["clefts_cropped", "pre_dist_cropped", "post_dist_cropped"]
        off = offsets[s][aligned]
        sh = shapes[s][aligned]
        f = zarr.open(filename, mode="a")
        for dss, dst in zip(datasets_src, datasets_tgt):
            chunk_size = tuple(min(c, shi) for c, shi in zip(f[dss].chunks, sh))
            f.create_dataset(
                name=dst,
                shape=sh,
                compressor=numcodecs.GZip(6),
                dtype=f[dss].dtype,
                chunks=chunk_size,
            )
            bb = tuple(slice(o, o + shi, None) for o, shi in zip(off, sh))
            f[dst][:] = f[dss][bb]
            f[dst].attrs["offset"] = off[::-1]
            progress += 100.0 / (len(self.samples) * len(datasets_src))
            try:
                self.set_progress_percentage(progress)
            except:
                pass
    done = self.output().open("w")
    done.close()

def read_zarr(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData:
    """Read from a hierarchical Zarr array store.

    Parameters
    ----------
    store
        The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class.
    """
    if isinstance(store, Path):
        store = str(store)

    f = zarr.open(store, mode="r")
    d = {}
    for k in f.keys():
        # Backwards compat
        if k.startswith("raw."):
            continue
        if k in {"obs", "var"}:
            d[k] = read_dataframe(f[k])
        else:  # Base case
            d[k] = read_attribute(f[k])

    # Backwards compat
    raw = {}
    if "raw.var" in f:
        raw["var"] = read_dataframe(f["raw.var"])  # Backwards compat
    if "raw.varm" in f:
        raw["varm"] = read_attribute(f["raw.varm"])
    if "raw.X" in f:
        raw["X"] = read_attribute(f["raw.X"])
    if len(raw) > 0:
        assert "raw" not in d
        d["raw"] = raw

    _clean_uns(d)

    return AnnData(**d)

def main():
    parser = argparse.ArgumentParser(
        description="Update a Zarr's start_date and "
                    "stop_date attributes to match its data.")
    parser.add_argument("zarr", metavar="PATH_OR_URL")
    parser.add_argument("--dry-run", "-d", action="store_true",
                        help="Don't actually write metadata")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Report progress to standard output")
    args = parser.parse_args()
    ds = xr.open_zarr(args.zarr)
    z = zarr.open(args.zarr)
    t0 = ds.time[0].values
    t1 = ds.time[-1].values
    if args.verbose:
        print("First/last times:", t0, t1)
    new_attrs = dict(start_date=pd.to_datetime(t0).strftime("%Y-%m-%d"),
                     stop_date=pd.to_datetime(t1).strftime("%Y-%m-%d"))
    if args.verbose:
        for title, dic in ("Old", z.attrs), ("New", new_attrs):
            print(f"{title} attributes:")
            for key in "start_date", "stop_date":
                print(f' {key}: ' +
                      (dic[key] if key in dic else "not present"))
    if args.dry_run:
        if args.verbose:
            print("Dry run -- not updating.")
    else:
        z.attrs.update(new_attrs)
        zarr.consolidate_metadata(args.zarr)
        if args.verbose:
            print("Attributes updated.")

def fromzarr(path, group=None, dataset=None, chunk_size=None):
    import zarr

    if isinstance(path, zarr.Array):
        arr = path
        if isinstance(arr.store, FSMap):
            root = arr.store.root
            path, dataset = root.rsplit('/', 1)
        else:
            path = arr.store.path
            if '/' in arr.path and group is None:
                group = arr.path.rsplit('/', 1)[0]
            dataset = arr.basename
            if not dataset:
                path, dataset = path.rsplit('/', 1)
        shape = arr.shape
    elif isinstance(path, str):
        fs = get_fs(path, None)
        fs_map = FSMap(path, fs)

        if group is None and dataset is None:
            arr = zarr.open(fs_map)
            if isinstance(arr, zarr.Array):
                return fromzarr(arr, chunk_size=chunk_size)

        g = zarr.group(store=fs_map)
        arr = g[TensorFromZarr.get_path(group, dataset)]
        shape = arr.shape
    else:
        raise TypeError('`path` passed has wrong type, '
                        'expect str or zarr.Array, '
                        f'got {type(path)}')

    chunk_size = chunk_size if chunk_size is not None else arr.chunks

    op = TensorFromZarr(filename=path, group=group, dataset=dataset,
                        dtype=arr.dtype)
    return op(shape, chunk_size=chunk_size, order=TensorOrder(arr.order))

def setFactorMetaData(self, table_name, ifactor_name, key=None, value=None,
                      meta_data=None):
    with self._DataLock:
        iZTable = zarr.open(self.MainDir + os.sep + table_name, mode="a")
        iZFactor = iZTable[ifactor_name]
        if key is not None:
            if key in iZFactor.attrs:
                del iZFactor.attrs[key]
            if isinstance(value, np.ndarray):
                iZFactor.attrs[key] = {"_Type": "Array", "List": value.tolist()}
            elif isinstance(value, pd.Series):
                iZFactor.attrs[key] = {"_Type": "Series", "Json": value.to_json(index=True)}
            elif isinstance(value, pd.DataFrame):
                iZFactor.attrs[key] = {"_Type": "DataFrame", "Json": value.to_json(index=True)}
            elif value is not None:
                iZFactor.attrs[key] = value
    if meta_data is not None:
        for iKey in meta_data:
            self.setFactorMetaData(table_name, ifactor_name=ifactor_name,
                                   key=iKey, value=meta_data[iKey],
                                   meta_data=None)
    return 0

def __init__(
    self,
    file: str,
    band: int = 0,
    as_crs: Optional[int] = 4326,
    crs_code: Optional[int] = None,
):
    """For representing a geotiff

    Args:
        file (str): Location of the geotiff file
        band (int): The band of the tiff file to use. Defaults to 0.
        as_crs (Optional[int]): The epsg crs code to read the data as.
            Defaults to 4326 (WGS84).
        crs_code (Optional[int]): The epsg crs code of the tiff file.
            Include this if the crs code can't be detected.
    """
    self.file = file
    self._as_crs = crs_code if as_crs is None else as_crs
    tif = TiffFile(self.file)
    if not tif.is_geotiff:
        raise Exception("Not a geotiff file")
    store = tif.aszarr(key=band)
    self._z = zarr.open(store, mode="r")
    store.close()
    if isinstance(crs_code, int):
        self._crs_code: int = crs_code
    else:
        self._crs_code = self._get_crs_code(tif.geotiff_metadata)
    self._tif_shape: List[int] = self._z.shape
    scale: Tuple[float, float, float] = tif.geotiff_metadata["ModelPixelScale"]
    tilePoint: List[float] = tif.geotiff_metadata["ModelTiepoint"]
    self._tifTrans: TifTransformer = TifTransformer(
        self._tif_shape[0], self._tif_shape[1], scale, tilePoint)
    tif.close()

def main(input_paths, out_path):
    in_files = [h5py.File(in_path) for in_path in input_paths]
    in_file_sizes = [0] * len(in_files)
    out_file = zarr.open(out_path, mode='a')

    datasets = list(in_files[0].keys())
    datasets_per_file = defaultdict(dict)
    for in_path, in_file in enumerate(in_files):
        in_file_sizes[in_path] = len(in_file[datasets[0]])
        for dataset in datasets:
            datasets_per_file[in_path][dataset] = in_file[dataset]

    for dataset in datasets:
        current_infile = datasets_per_file[0][dataset]
        args = dict(
            name=dataset,
            shape=current_infile.shape,
            chunks=CHUNK_SHAPES.get(dataset,
                                    (CHUNK_SIZE, ) + current_infile.shape[1:]),
            dtype=current_infile.dtype,
        )
        args.update(ENCODERS.get(dataset, DEFAULT_ENCODER))
        print("Dataset %s: %s" % (dataset, args))
        out_set = out_file.create_dataset(**args)

        for in_path in range(len(input_paths)):
            print("In-file %s..." % input_paths[in_path])
            current_infile = datasets_per_file[in_path][dataset]
            num_batches = int(np.ceil(len(current_infile) / BATCH_SIZE))
            for index_batch in tqdm.tqdm(range(num_batches)):
                start = index_batch * BATCH_SIZE
                end = min(len(current_infile), (index_batch + 1) * BATCH_SIZE)
                out_set[start:end] = current_infile[start:end]

    out_file.store.close()
    return

def labels_to_zarr(path, data, meta):
    """Write a 2D+ labels layer to zarr, chunked along the last two
    dimensions, presumed shape (..., y, x).

    Parameters
    ----------
    path : str
        Path to save to disk. Must end with .zarr
    data : array
        Labels data to be written
    meta : dict
        Labels metadata

    Returns
    -------
    str or None
        path if any labels were written, otherwise None
    """
    if not path.endswith('.zarr'):
        return None

    zarr_shape = data.shape
    zarr_dtype = data.dtype
    # we assume x,y are the final two dimensions and chunk accordingly
    zarr_chunks = tuple([1 for i in range(len(zarr_shape) - 2)] + [1024, 1024])

    # TODO: compression type? Get from user?
    out_zarr = zarr.open(
        path,
        mode='w',
        shape=zarr_shape,
        dtype=zarr_dtype,
        chunks=zarr_chunks
    )
    out_zarr[:] = data[:]
    return path

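# Hypothetical usage sketch for `labels_to_zarr`; the array shape, output
# path, and empty metadata dict are illustrative assumptions.
import numpy as np

labels = np.zeros((3, 2048, 2048), dtype=np.uint16)
written_path = labels_to_zarr("labels_output.zarr", labels, meta={})
assert written_path == "labels_output.zarr"
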
def set_provenance(self, src_file_names, prov_dict):
    """Set the Provenance group in the nc file.

    Parameters
    ----------
    src_file_names
        list of source filenames
    prov_dict
        dictionary containing file conversion parameters
            prov_dict['conversion_software_name']
            prov_dict['conversion_software_version']
            prov_dict['conversion_time']
    """
    # create group
    files = ", ".join([os.path.basename(file) for file in src_file_names])
    if self.format == '.nc':
        file = netCDF4.Dataset(self.file_path, "a", format="NETCDF4")
        pr = file.createGroup("Provenance")

        # dimensions
        pr.createDimension("filenames", None)

        # variables
        pr_src_fnames = pr.createVariable(files, str, "filenames")
        pr_src_fnames.long_name = "Source filenames"

        # set group attributes
        for k, v in prov_dict.items():
            pr.setncattr(k, v)

        # close nc file
        file.close()
    elif self.format == '.zarr':
        file = zarr.open(self.file_path, 'a')
        pr = file.create_group('Provenance')
        pr_src_fnames = pr.create_dataset('filenames', data=files)
        pr_src_fnames.attrs['long_name'] = "Source filenames"
        for k, v in prov_dict.items():
            pr[k] = v

def subset_assay_zarr(
    zarr_fn: str,
    in_grp: str,
    out_grp: str,
    cells_idx: np.ndarray,
    feat_idx: np.ndarray,
    chunk_size: tuple,
):
    """
    Selects a subset of the data in an assay in the specified Zarr hierarchy.

    For the arguments `cells_idx` and `feat_idx`, refer to the documentation
    for numpy.split:
    https://numpy.org/doc/stable/reference/generated/numpy.split.html

    Args:
        zarr_fn: The file name for the Zarr hierarchy.
        in_grp: Group in Zarr hierarchy to subset.
        out_grp: Group name in Zarr hierarchy to write subsetted assay to.
        cells_idx: A list of cell indices to keep.
        feat_idx: A list of feature indices to keep.
        chunk_size: The requested size of chunks to load into memory and process.

    Returns:
        None
    """
    z = zarr.open(zarr_fn, "r+")
    ig = z[in_grp]
    og = create_zarr_dataset(z, out_grp, chunk_size, "uint32",
                             (len(cells_idx), len(feat_idx)))
    pos_start, pos_end = 0, 0
    for i in tqdmbar(
            np.array_split(cells_idx, len(cells_idx) // chunk_size[0] + 1)):
        pos_end += len(i)
        og[pos_start:pos_end, :] = ig.get_orthogonal_selection((i, feat_idx))
        pos_start = pos_end
    return None

def open(self):
    if self.conn is None:
        self.conn = zarr.open(self.url, mode=self.mode)
        self.attrs = self.conn.attrs
    return self