def write_multiscale(pyramid: List, group: zarr.Group) -> None:
    """Write a pyramid with multiscale metadata to disk."""
    paths = []
    for path, dataset in enumerate(pyramid):
        group.create_dataset(str(path), data=dataset)
        paths.append({"path": str(path)})
    multiscales = [{"version": "0.1", "datasets": paths}]
    group.attrs["multiscales"] = multiscales
def write_array(self, group: zarr.Group, name: str, array: np.ndarray) -> None:
    if array.dtype.kind == 'V':
        # Structured (record) arrays are written as data frames.
        self.write_dataframe(group, name, array)
    else:
        # Zarr cannot store object dtype directly; fall back to str.
        dtype = str if array.dtype.kind == 'O' else array.dtype
        group.create_dataset(name, data=array, shape=array.shape,
                             chunks=calc_chunk(array.shape), dtype=dtype,
                             compressor=COMPRESSOR, overwrite=True)
def write_multiscale(
    pyramid: List,
    group: zarr.Group,
    chunks: Union[Tuple[Any, ...], int] = None,
) -> None:
    """Write a pyramid with multiscale metadata to disk."""
    paths = []
    for path, dataset in enumerate(pyramid):
        # TODO: chunks here could be different per layer
        group.create_dataset(str(path), data=dataset, chunks=chunks)
        paths.append({"path": str(path)})
    multiscales = [{"version": "0.2", "datasets": paths}]
    group.attrs["multiscales"] = multiscales
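# A minimal usage sketch for write_multiscale above. The 2x downsampling by
# slicing and the store path are assumptions for illustration; any
# pyramid-building scheme works, largest level first.
import numpy as np
import zarr

image = np.random.randint(0, 255, size=(256, 256), dtype=np.uint8)
pyramid = [image, image[::2, ::2], image[::4, ::4]]

root = zarr.group(store=zarr.DirectoryStore("image.zarr"))
write_multiscale(pyramid, root, chunks=(64, 64))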
def create_zarr_count_assay(z: zarr.Group, assay_name: str, chunk_size: Tuple[int, int],
                            n_cells: int, feat_ids: Union[np.ndarray, List[str]],
                            feat_names: Union[np.ndarray, List[str]],
                            dtype: str = 'uint32') -> zarr.hierarchy:
    """
    Creates and returns a Zarr array with name 'counts'.

    Args:
        z (zarr.Group): The Zarr group wherein the assay group is created.
        assay_name (str): Name of the assay group.
        chunk_size (Tuple[int, int]): Chunk shape of the counts matrix.
        n_cells (int): Number of cells (rows) of the counts matrix.
        feat_ids (Union[np.ndarray, List[str]]): Feature IDs, one per column.
        feat_names (Union[np.ndarray, List[str]]): Feature names, one per column.
        dtype (str = 'uint32'): Data type of the counts matrix.

    Returns:
        A Zarr array.
    """
    g = z.create_group(assay_name, overwrite=True)
    g.attrs['is_assay'] = True
    g.attrs['misc'] = {}
    create_zarr_obj_array(g, 'featureData/ids', feat_ids)
    create_zarr_obj_array(g, 'featureData/names', feat_names)
    create_zarr_obj_array(g, 'featureData/I',
                          [True for _ in range(len(feat_ids))], 'bool')
    return create_zarr_dataset(g, 'counts', chunk_size, dtype,
                               (n_cells, len(feat_ids)), overwrite=True)
def create_zarr_dataset(g: zarr.Group, name: str, chunks: tuple, dtype: Any,
                        shape: Tuple, overwrite: bool = True) -> zarr.hierarchy:
    """
    Creates and returns a Zarr array.

    Args:
        g (zarr.hierarchy): The Zarr group wherein the array is created.
        name (str): Name of the new array.
        chunks (tuple): Chunk shape of the array.
        dtype (Any): Data type of the array.
        shape (Tuple): Shape of the array.
        overwrite (bool): Whether to overwrite an existing array of the same name.

    Returns:
        A Zarr Array.
    """
    from numcodecs import Blosc
    compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name, chunks=chunks, dtype=dtype, shape=shape,
                            compressor=compressor, overwrite=overwrite)
def create_zarr_obj_array(g: zarr.Group, name: str, data,
                          dtype: Union[str, Any] = None,
                          overwrite: bool = True) -> zarr.hierarchy:
    """
    Creates and returns a Zarr object array.

    A Zarr object array can contain any type of object.
    https://zarr.readthedocs.io/en/stable/tutorial.html#object-arrays

    Args:
        g (zarr.hierarchy): The Zarr group wherein the array is created.
        name (str): Name of the new array.
        data: Data to be stored in the array.
        dtype (Union[str, Any]): Data type of the array; inferred if None.
        overwrite (bool): Whether to overwrite an existing array of the same name.

    Returns:
        A Zarr object Array.
    """
    data = np.array(data)
    if dtype is None or dtype == object:
        # Use a fixed-width unicode dtype wide enough for the longest element.
        dtype = 'U' + str(max([len(str(x)) for x in data]))
    if np.issubdtype(data.dtype, np.dtype('S')):
        data = data.astype('U')
        dtype = data.dtype
    return g.create_dataset(name, data=data, chunks=(100000,),
                            shape=len(data), dtype=dtype, overwrite=overwrite)
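# A usage sketch tying the three helpers above together; the store path and
# feature lists are illustrative assumptions.
import zarr

root = zarr.group(store=zarr.DirectoryStore("assay.zarr"))
counts = create_zarr_count_assay(
    root, 'RNA', chunk_size=(1000, 1000), n_cells=5000,
    feat_ids=['ENSG0001', 'ENSG0002'], feat_names=['GeneA', 'GeneB'],
)
# `counts` is an empty (5000, 2) uint32 array ready to be filled in chunks.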
def write_dataframe(self, group: zarr.Group, name: str,
                    df: Union[pd.DataFrame, np.recarray]) -> None:
    data_type = 'data_frame' if isinstance(df, pd.DataFrame) else 'record_array'

    sub_group = group.create_group(name, overwrite=True)
    attrs_dict = {'data_type': data_type}
    cols = list(df.columns if data_type == 'data_frame' else df.dtype.names)
    attrs_dict['columns'] = cols

    if data_type == 'data_frame':
        attrs_dict['index_name'] = df.index.name if df.index.name is not None else 'index'
        # Create a group to store category keys for categorical columns.
        sub_group.create_group('_categories', overwrite=True)
        self.write_series(sub_group, '_index', df.index.values, data_type)

    for col in cols:
        self.write_series(sub_group, col,
                          (df[col].values if data_type == 'data_frame' else df[col]),
                          data_type)

    sub_group.attrs.update(**attrs_dict)
def write_csr(self, group: zarr.Group, name: str, matrix: csr_matrix) -> None:
    sub_group = group.create_group(name, overwrite=True)
    sub_group.attrs.update(data_type='csr_matrix', shape=matrix.shape)
    sub_group.create_dataset('data', data=matrix.data, shape=matrix.data.shape,
                             chunks=calc_chunk(matrix.data.shape),
                             dtype=matrix.data.dtype, compressor=COMPRESSOR,
                             overwrite=True)
    sub_group.create_dataset('indices', data=matrix.indices,
                             shape=matrix.indices.shape,
                             chunks=calc_chunk(matrix.indices.shape),
                             dtype=matrix.indices.dtype, compressor=COMPRESSOR,
                             overwrite=True)
    sub_group.create_dataset('indptr', data=matrix.indptr,
                             shape=matrix.indptr.shape,
                             chunks=calc_chunk(matrix.indptr.shape),
                             dtype=matrix.indptr.dtype, compressor=COMPRESSOR,
                             overwrite=True)
def write_series(self, group: zarr.Group, name: str, array: np.ndarray,
                 data_type: str) -> None:
    if data_type == 'data_frame':
        if not is_categorical_dtype(array) and name != '_index' and is_string_dtype(array):
            keywords = set(array)
            if len(keywords) <= array.size / 10.0:  # at least 10x reduction
                array = pd.Categorical(array, categories=natsorted(keywords))
        if is_categorical_dtype(array):
            # Write category keys.
            categories = group.require_group('_categories')
            values = array.categories.values
            if isinstance(values[0], bytes):
                values = np.array([x.decode() for x in values], dtype=object)
            dtype = str if values.dtype.kind == 'O' else values.dtype
            categories.create_dataset(name, data=values, shape=values.shape,
                                      chunks=calc_chunk(values.shape),
                                      dtype=dtype, compressor=COMPRESSOR,
                                      overwrite=True)
            # Write codes.
            codes_arr = group.create_dataset(name, data=array.codes,
                                             shape=array.codes.shape,
                                             chunks=calc_chunk(array.codes.shape),
                                             dtype=array.codes.dtype,
                                             compressor=COMPRESSOR,
                                             overwrite=True)
            codes_arr.attrs['ordered'] = array.ordered
            return None

    dtype = str if array.dtype.kind == 'O' else array.dtype
    group.create_dataset(name, data=array, shape=array.shape,
                         chunks=calc_chunk(array.shape), dtype=dtype,
                         compressor=COMPRESSOR, overwrite=True)
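# A small illustration of the categorical encoding write_series relies on:
# a repetitive string column is stored as integer codes plus a separate list
# of category keys, which is what makes the 10x size reduction possible.
import pandas as pd

col = pd.Categorical(['lung', 'liver', 'lung', 'lung'])
print(col.codes)       # [1 0 1 1]           -> stored under the column name
print(col.categories)  # ['liver', 'lung']   -> stored under '_categories'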
def _set_time_units_like(source_store: zarr.Group, target_store: zarr.Group):
    """Modify all time-like variables in source_store to use same units as
    corresponding variable in target_store. The provided source_store must be
    opened in a mode such that it can be modified (e.g. mode='r+')"""
    for variable, source_array in source_store.items():
        target_array = target_store[variable]
        if "units" in source_array.attrs and "since" in source_array.attrs["units"]:
            _set_array_time_units_like(source_array, target_array)
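# _set_array_time_units_like is referenced above but not shown in this section.
# A minimal sketch of the behavior it presumably has, assuming CF-style
# "<unit> since <date>" units and the cftime package; the real helper may differ.
import cftime

def _set_array_time_units_like(source_array, target_array):
    target_units = target_array.attrs["units"]
    # Decode with the source units, then re-encode with the target units.
    dates = cftime.num2date(source_array[:], units=source_array.attrs["units"])
    source_array[:] = cftime.date2num(dates, units=target_units)
    source_array.attrs["units"] = target_units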
def _init_coord(group: zarr.Group, coord: xr.DataArray):
    # fill_value=NaN is needed below for xr.open_zarr to successfully load this
    # coordinate if decode_cf=True. Otherwise, time=0 gets filled in as nan.
    # Very confusing...
    arr = np.asarray(coord)
    out_array = group.array(
        name=coord.name, data=arr, fill_value=_fill_value(arr.dtype)
    )
    out_array.attrs.update(coord.attrs)
    _set_dims(out_array, coord.dims)
def read_group(group: zarr.Group):
    if "encoding-type" in group.attrs:
        enctype = group.attrs["encoding-type"]
        EncodingVersions[enctype].check(group.name, group.attrs["encoding-version"])
        if enctype == "dataframe":
            return read_dataframe(group)
        elif enctype == "csr_matrix":
            return read_csr(group)
        elif enctype == "csc_matrix":
            return read_csc(group)
    # At the moment, just treat raw as normal group
    return {k: read_attribute(group[k]) for k in group.keys()}
def conv_chrom(fname: str, block_size: int, root: Group, chrom: int) -> None:
    # First pass: count the positions belonging to this chromosome.
    num_positions = 0
    with open(fname) as tfam:
        for line in tfam:  # redo this with generators
            tokens = line.rstrip().split(' ')
            l_chrom = int(tokens[0])
            if l_chrom < chrom:
                continue
            elif l_chrom > chrom:
                break
            num_positions += 1

    chrom_group = root.create_group(f'chromosome-{chrom}')
    all_calls = chrom_group.zeros('calls', shape=(num_positions, NUM_SAMPLES),
                                  dtype='B')
    block = []
    all_positions = []
    all_alleles = []
    current_position = 0
    # Second pass: encode the calls and write them out block by block.
    with open(fname) as tfam:
        for line in tfam:  # redo this with generators
            tokens = line.rstrip().split(' ')
            l_chrom = int(tokens[0])
            if l_chrom < chrom:
                continue
            elif l_chrom > chrom:
                break
            position = int(tokens[3])
            all_positions.append(position)
            calls = tokens[4:]
            alleles = ''.join(set(calls) - set(['0']))
            if len(alleles) == 1:  # monomorphic: repeat the single allele
                alleles += alleles
            all_alleles.append(alleles)
            sample_calls = np.empty(shape=NUM_SAMPLES, dtype='B')
            for sample_position in range(NUM_SAMPLES):
                a1, a2 = calls[2 * sample_position: 2 * sample_position + 2]
                try:
                    sample_calls[sample_position] = encode_alleles(a1, a2, alleles)
                except Exception:
                    print(chrom, current_position, sample_position)
                    raise
            block.append(sample_calls)
            current_position += 1
            if current_position % block_size == 0:
                all_calls[current_position - block_size:current_position, :] = block
                block = []
    if len(block) > 0:
        all_calls[-len(block):, :] = block
    chrom_group.array('positions', all_positions)
    chrom_group.array('alleles', all_alleles)
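# encode_alleles is referenced above but not defined in this section. A
# hypothetical sketch of what it might look like, assuming a biallelic
# alternate-allele-count encoding with 3 for missing calls; the real
# implementation may differ.
def encode_alleles(a1: str, a2: str, alleles: str) -> int:
    if a1 == '0' or a2 == '0':
        return 3  # missing call
    # Count copies of the second (alternate) allele.
    return (a1 == alleles[1]) + (a2 == alleles[1])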
def read_mapping(self, group: zarr.Group) -> dict:
    res_dict = {}
    if 'scalar' in group.attrs:
        res_dict.update(group.attrs['scalar'])
    for key in group.array_keys():
        res_dict[key] = self.read_array(group, key)
    for key in group.group_keys():
        sub_group = group[key]
        data_type = sub_group.attrs['data_type']
        value = None
        if data_type == 'data_frame' or data_type == 'record_array':
            value = self.read_dataframe(sub_group)
        elif data_type == 'csr_matrix':
            value = self.read_csr(sub_group)
        else:
            assert data_type == 'dict'
            value = self.read_mapping(sub_group)
        res_dict[key] = value
    return res_dict
def write_multiscale(
    pyramid: List,
    group: zarr.Group,
    chunks: Union[Tuple[Any, ...], int] = None,
    fmt: Format = CurrentFormat(),
    axes: Union[str, List[str]] = None,
) -> None:
    """
    Write a pyramid with multiscale metadata to disk.

    Parameters
    ----------
    pyramid: List of np.ndarray
      the image data to save. Largest level first.
      All image arrays MUST be up to 5-dimensional with dimensions
      ordered (t, c, z, y, x)
    group: zarr.Group
      the group within the zarr store to store the data in
    chunks: int or tuple of ints
      size of the saved chunks to store the image
    fmt: Format
      The format of the ome_zarr data which should be used.
      Defaults to the most current.
    axes: str or list of str
      the names of the axes. e.g. "tczyx". Not needed for v0.1 or v0.2
      or for v0.3 if 2D or 5D. Otherwise this must be provided
    """
    dims = len(pyramid[0].shape)
    axes = _validate_axes_names(dims, axes, fmt)
    paths = []
    for path, dataset in enumerate(pyramid):
        # TODO: chunks here could be different per layer
        group.create_dataset(str(path), data=dataset, chunks=chunks)
        paths.append(str(path))
    write_multiscales_metadata(group, paths, fmt, axes)
def create_zarr_obj_array(
    g: zarr.Group,
    name: str,
    data,
    dtype: Union[str, Any] = None,
    overwrite: bool = True,
    chunk_size: int = 100000,
) -> zarr.hierarchy:
    """
    Creates and returns a Zarr object array.

    A Zarr object array can contain any type of object.
    https://zarr.readthedocs.io/en/stable/tutorial.html#object-arrays

    Args:
        g (zarr.hierarchy): The Zarr group wherein the array is created.
        name (str): Name of the new array.
        data: Data to be stored in the array.
        dtype (Union[str, Any]): Data type of the array; inferred if None.
        overwrite (bool): Whether to overwrite an existing array of the same name.
        chunk_size (int): Chunk length; None or False disables chunking.

    Returns:
        A Zarr object Array.
    """
    from numcodecs import Blosc

    compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.BITSHUFFLE)
    data = np.array(data)
    if dtype is None or dtype == object:
        # Use a fixed-width unicode dtype wide enough for the longest element.
        dtype = "U" + str(max([len(str(x)) for x in data]))
    if np.issubdtype(data.dtype, np.dtype("S")):
        data = data.astype("U")
        dtype = data.dtype
    if chunk_size is None or chunk_size is False:
        chunks = False
    else:
        chunks = (chunk_size,)
    return g.create_dataset(
        name,
        data=data,
        chunks=chunks,
        shape=len(data),
        dtype=dtype,
        overwrite=overwrite,
        compressor=compressor,
    )
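# A quick usage sketch for the variant above: dtype is inferred from the
# longest string, and chunk_size=None stores the data as a single chunk.
import zarr

g = zarr.group()
ids = create_zarr_obj_array(g, "ids", ["cell_1", "cell_22"], chunk_size=None)
print(ids.dtype)  # <U7, wide enough for the longest element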
def _init_data_var(
    group: zarr.Group,
    array: xr.DataArray,
    start_shape: Sequence[int],
    start_chunks,
    dims,
):
    shape = tuple(start_shape) + array.data.shape
    chunks = tuple(start_chunks) + array.data.shape
    out_array = group.empty(
        name=array.name, shape=shape, chunks=chunks, dtype=array.dtype
    )
    out_array.attrs.update(array.attrs)
    dims = list(dims) + list(array.dims)
    _set_dims(out_array, dims)
def read_dataframe(self, group: zarr.Group) -> Union[pd.DataFrame, np.ndarray]:
    columns = group.attrs.get('columns', None)
    if columns is None:
        columns = [col for col in group.array_keys() if col != '_index']
    if group.attrs['data_type'] == 'data_frame':
        data = {col: self.read_series(group, col) for col in columns}
        _index = self.read_series(group, '_index')
        index = pd.Index(_index, name=group.attrs['index_name'], dtype=_index.dtype)
        # Passing columns=columns here as well would slow down construction.
        df = pd.DataFrame(data=data, index=index)
        return df
    else:
        array = np.rec.fromarrays([self.read_series(group, col) for col in columns],
                                  names=columns)
        return array
def write_unimodal_data(self, group: zarr.Group, name: str, data: UnimodalData,
                        overwrite: bool = True) -> None:
    """ Write UnimodalData """
    sub_group = group.require_group(name, overwrite=overwrite)
    attrs_dict = {'data_type': 'UnimodalData', '_cur_matrix': data.current_matrix()}
    sub_group.attrs.update(**attrs_dict)
    self.write_dataframe(sub_group, 'barcode_metadata', data.barcode_metadata)
    self.write_dataframe(sub_group, 'feature_metadata', data.feature_metadata)
    if overwrite or data.matrices.is_dirty():
        self.write_mapping(sub_group, 'matrices', data.matrices, overwrite=overwrite)
    if overwrite or data.metadata.is_dirty():
        self.write_mapping(sub_group, 'metadata', data.metadata, overwrite=overwrite)
    if overwrite or data.barcode_multiarrays.is_dirty():
        self.write_mapping(sub_group, 'barcode_multiarrays',
                           data.barcode_multiarrays, overwrite=overwrite)
    if overwrite or data.feature_multiarrays.is_dirty():
        self.write_mapping(sub_group, 'feature_multiarrays',
                           data.feature_multiarrays, overwrite=overwrite)
def write_mapping(self, group: zarr.Group, name: str, mapping: dict,
                  overwrite=True) -> None:
    sub_group = group.require_group(name, overwrite=overwrite)
    # If overwrite == True, there should be no 'scalar' attribute yet.
    scalar_dict = sub_group.attrs.pop('scalar', {})

    def _write_one_pair(key, value):
        if is_scalar(value):
            scalar_dict[key] = value
        elif isinstance(value, np.ndarray):
            self.write_array(sub_group, key, value)
        elif isinstance(value, pd.DataFrame):
            self.write_dataframe(sub_group, key, value)
        elif is_dict_like(value):
            self.write_mapping(sub_group, key, value)
        elif issparse(value):
            assert isinstance(value, csr_matrix)
            self.write_csr(sub_group, key, value)
        else:
            # Assume value is either list or tuple; convert it to np.ndarray.
            self.write_array(sub_group, key,
                             value.astype(str) if is_categorical_dtype(value)
                             else np.array(value))

    if overwrite:
        for key, value in mapping.items():
            _write_one_pair(key, value)
    else:
        for key in mapping.deleted:
            if key in scalar_dict:
                del scalar_dict[key]
            else:
                del sub_group[key]
        for key in mapping.modified:
            _write_one_pair(key, mapping[key])

    attrs_dict = {'data_type': 'dict'}
    if len(scalar_dict) > 0:
        attrs_dict['scalar'] = scalar_dict
    sub_group.attrs.update(**attrs_dict)
def _shift_store(group: zarr.Group, dim: str, n_shift: int):
    """Shift local zarr store which represents an xarray dataset by n_shift
    along dim

    Args:
        group: zarr Group for an xarray dataset backed by a DirectoryStore
        dim: name of dimension of xarray dataset along which to shift zarr
        n_shift: how far to shift. The chunk size along dim of every array in
            group must evenly divide n_shift.

    Note:
        The zarr store represented by group will no longer be valid after this
        function is called since its chunks will not be listed starting at 0.
        It is intended that the output of this function be copied into another
        zarr store as a method of appending.
    """
    for array in group.values():
        if dim in array.attrs[XARRAY_DIM_NAMES_ATTR]:
            axis = array.attrs[XARRAY_DIM_NAMES_ATTR].index(dim)
            _shift_array(array, axis, n_shift)
def _assert_chunks_match(source_group: zarr.Group, target_group: zarr.Group, dim: str):
    """Ensure chunks for source and target groups are valid for appending.

    Specifically:
        1. all arrays in source_group have corresponding arrays in target_group.
        2. chunk size is same for each array in source and target group.
        3. dim length is a multiple of chunk size for target group.

    In addition, log a warning if dim length is not a multiple of chunk size
    for source group."""
    for key, source_array in source_group.items():
        if key not in target_group:
            raise KeyError(
                f"Cannot append {source_array} because there is no corresponding array "
                f"in {target_group}.")
        if dim != key and dim in source_array.attrs[XARRAY_DIM_NAMES_ATTR]:
            axis = source_array.attrs[XARRAY_DIM_NAMES_ATTR].index(dim)
            target_array = target_group[key]
            _assert_array_chunks_match(source_array, target_array, axis)
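# A sketch of the append workflow these helpers support; the store paths and
# the final copy step are assumptions. Shifting renames the source chunks so
# that, once copied into the target store, they continue the target's chunk
# grid along the append dimension.
import zarr

source = zarr.open_group("new_segment.zarr", mode="r+")
target = zarr.open_group("full_run.zarr", mode="r")

_assert_chunks_match(source, target, dim="time")
_shift_store(source, dim="time", n_shift=_get_dim_size(target, "time"))
# ...then copy the shifted source chunk files into the target store.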
def _create_zarr(
    dims: Sequence[str],
    coords: Mapping[str, xr.DataArray],
    group: zarr.Group,
    template: xr.Dataset,
):
    ds = template
    group.attrs.update(ds.attrs)
    start_shape = [len(coords[dim]) for dim in dims]
    start_chunks = (1,) * len(start_shape)

    for name in ds:
        _init_data_var(group, ds[name], start_shape, start_chunks, dims)

    for name in ds.coords:
        _init_coord(group, ds[name])

    for name in coords:
        _init_coord(group, coords[name])

    group.attrs["DIMS"] = dims
def write_multiscales_metadata(
    group: zarr.Group,
    paths: List[str],
    fmt: Format = CurrentFormat(),
    axes: List[str] = None,
) -> None:
    """
    Write the multiscales metadata in the group.

    Parameters
    ----------
    group: zarr.Group
      the group within the zarr store to write the metadata in.
    paths: list of str
      The list of paths to the datasets for this multiscale image.
    fmt: Format
      The format of the ome_zarr data which should be used.
      Defaults to the most current.
    axes: list of str
      the names of the axes. e.g. ["t", "c", "z", "y", "x"].
      Ignored for versions 0.1 and 0.2. Required for version 0.3 or greater.
    """
    multiscales = [
        {
            "version": fmt.version,
            "datasets": [{"path": str(p)} for p in paths],
        }
    ]
    if axes is not None:
        if fmt.version in ("0.1", "0.2"):
            LOGGER.info("axes ignored for version 0.1 or 0.2")
        else:
            _validate_axes(axes, fmt)
            multiscales[0]["axes"] = axes
    group.attrs["multiscales"] = multiscales
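# For example, write_multiscales_metadata(group, ["0", "1"], fmt, ["y", "x"])
# for a version 0.3 format would leave metadata like this in group.attrs
# (a sketch of the resulting structure, not a spec excerpt):
#
#   "multiscales": [
#       {
#           "version": "0.3",
#           "datasets": [{"path": "0"}, {"path": "1"}],
#           "axes": ["y", "x"],
#       }
#   ]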
def read_array(self, group: zarr.Group, name: str) -> Union[np.ndarray, np.recarray]:
    if name in group.group_keys():
        # Record arrays are stored as sub-groups; see read_dataframe.
        return self.read_dataframe(group[name])
    else:
        return group[name][...]
def _get_dim_size(group: zarr.Group, dim: str):
    """Get length of dim, assuming it is same for all arrays that contain it"""
    for array in group.values():
        if dim in array.attrs[XARRAY_DIM_NAMES_ATTR]:
            axis = array.attrs[XARRAY_DIM_NAMES_ATTR].index(dim)
            return array.shape[axis]