def test_concatenate_and_rechunk__shape_mismatch():
    """Arrays whose trailing dimensions differ must be rejected with ValueError."""
    first = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4")
    second = zarr.zeros((5, 4), chunks=(2, 4), dtype="i4")
    with pytest.raises(ValueError, match="Zarr arrays must have matching shapes"):
        concatenate_and_rechunk([first, second])
def test_concatenate_and_rechunk__1d():
    """Two 1-d zarr arrays concatenate into one evenly chunked dask array."""
    pieces = []
    for lo, hi in ((0, 5), (5, 10)):
        piece = zarr.zeros(hi - lo, chunks=2, dtype="i4")
        piece[:] = np.arange(lo, hi)
        pieces.append(piece)
    result = concatenate_and_rechunk(pieces)
    # 10 elements rechunked uniformly into chunks of 2
    assert result.chunks == ((2, 2, 2, 2, 2),)
    np.testing.assert_array_equal(result.compute(), np.arange(10))
def test_concatenate_and_rechunk__2d():
    """2-d arrays concatenate along axis 0; the second axis keeps one chunk."""
    blocks = []
    for offset in (0, 15):
        block = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4")
        block[:] = np.arange(offset, offset + 15).reshape(5, 3)
        blocks.append(block)
    result = concatenate_and_rechunk(blocks)
    assert result.chunks == ((2, 2, 2, 2, 2), (3,))
    np.testing.assert_array_equal(result.compute(), np.arange(30).reshape(10, 3))
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    """Concatenate the per-file Zarr groups in ``zarr_files`` into ``output``.

    Variables in ``vars_to_rechunk`` are concatenated and rechunked in a single
    dask computation; variables in ``vars_to_copy`` are copied verbatim from the
    first input group. Top-level and per-variable attributes are taken from the
    first group.
    """
    zarr_groups = [zarr.open_group(f) for f in zarr_files]
    first_zarr_group = zarr_groups[0]

    # Create (or truncate) the top-level output group.
    zarr.open_group(str(output), mode="w")

    # Copy variables that are to be rechunked.
    # NOTE: this uses the _to_zarr function defined here, which is needed to
    # avoid race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if var in {"variant_id", "variant_allele"}:
            # String variables get a fixed-length bytes dtype wide enough for
            # the longest value across all input groups.
            # BUG FIX: was the garbled literal f"S12,964", which ignored
            # max_len entirely; the correct spec is e.g. "S12".
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk(
            [group[var] for group in zarr_groups], dtype=dtype
        )
        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            str(output),
            component=var,
            overwrite=True,
            compute=False,
            fill_value=None,
            attrs=first_zarr_group[var].attrs.asdict(),
        )
        delayed.append(d)
    da.compute(*delayed)

    # Copy unchanged variables and top-level metadata.
    with zarr.open_group(str(output)) as output_zarr:
        # Copy variables that are not rechunked (e.g. sample_id).
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)
        # Copy top-level attributes.
        output_zarr.attrs.update(first_zarr_group.attrs)
def test_concatenate_and_rechunk__tiny_file():
    """An input array smaller than one chunk is absorbed correctly."""
    a = zarr.zeros(4, chunks=3, dtype="i4")
    a[:] = np.arange(4)

    # this zarr array lies entirely within the second chunk
    b = zarr.zeros(1, chunks=3, dtype="i4")
    b[:] = np.arange(4, 5)

    c = zarr.zeros(5, chunks=3, dtype="i4")
    c[:] = np.arange(5, 10)

    result = concatenate_and_rechunk([a, b, c])
    assert result.chunks == ((3, 3, 3, 1),)
    np.testing.assert_array_equal(result.compute(), np.arange(10))
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    """Concatenate the per-file Zarr groups in ``zarr_files`` into ``output``.

    Variables in ``vars_to_rechunk`` are concatenated and rechunked in one
    dask computation; variables in ``vars_to_copy`` are copied verbatim from
    the first input group, whose attributes are also propagated.
    """
    zarr_groups = [zarr.open_group(f) for f in zarr_files]
    first_zarr_group = zarr_groups[0]
    with zarr.open_group(str(output)) as output_zarr:
        var_to_attrs = {}  # attributes to copy after the arrays are written
        delayed = []  # do all the rechunking operations in one computation
        for var in vars_to_rechunk:
            var_to_attrs[var] = first_zarr_group[var].attrs.asdict()
            dtype = None
            # String variables get a fixed-length bytes dtype wide enough for
            # the longest value across all input groups.
            # BUG FIX: both branches previously assigned the garbled literal
            # f"S12,964", which ignored max_len; the correct spec is e.g. "S12".
            if var == "variant_id":
                max_len = _get_max_len(zarr_groups, "max_variant_id_length")
                dtype = f"S{max_len}"
            elif var == "variant_allele":
                max_len = _get_max_len(zarr_groups, "max_variant_allele_length")
                dtype = f"S{max_len}"
            arr = concatenate_and_rechunk(
                [group[var] for group in zarr_groups], dtype=dtype
            )
            d = arr.to_zarr(
                str(output),
                component=var,
                overwrite=True,
                compute=False,
                fill_value=None,
            )
            delayed.append(d)
        da.compute(*delayed)

        # Copy variables that are not rechunked (e.g. sample_id).
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # Copy attributes: top-level first, then per-variable (deferred until
        # after da.compute so the metadata is not clobbered by the array write).
        output_zarr.attrs.update(first_zarr_group.attrs)
        for var, attrs in var_to_attrs.items():
            output_zarr[var].attrs.update(attrs)
def concat_zarrs_optimized(
    zarr_files: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
    fix_strings: bool = False,
) -> None:
    """Concatenate the per-file Zarr groups in ``zarr_files`` into ``output``.

    Variables in ``vars_to_rechunk`` are concatenated and rechunked in a single
    dask computation; variables in ``vars_to_copy`` are copied verbatim from the
    first input group. When ``fix_strings`` is True, string variables are
    converted to fixed-length bytes wide enough for the longest value seen.
    Output metadata is consolidated at the end.
    """
    if isinstance(output, Path):
        output = str(output)

    zarr_groups = [zarr.open_group(f) for f in zarr_files]
    first_zarr_group = zarr_groups[0]

    # Create (or truncate) the top-level output group.
    zarr.open_group(output, mode="w")

    # Copy variables that are to be rechunked.
    # NOTE: this uses the _to_zarr function defined here, which is needed to
    # avoid race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if fix_strings and var in {"variant_id", "variant_allele"}:
            # BUG FIX: was the garbled literal f"S12,964", which ignored
            # max_len entirely; the correct spec is e.g. "S12".
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk(
            [group[var] for group in zarr_groups], dtype=dtype
        )

        _to_zarr_kwargs = dict(
            compressor=first_zarr_group[var].compressor,
            filters=first_zarr_group[var].filters,
            fill_value=None,
        )
        if not fix_strings and arr.dtype == "O":
            # We assume that all object dtypes are variable length strings
            var_len_str_codec = numcodecs.VLenUTF8()
            _to_zarr_kwargs["object_codec"] = var_len_str_codec
            # Remove from filters to avoid double encoding error
            if var_len_str_codec in first_zarr_group[var].filters:
                filters = list(first_zarr_group[var].filters)
                filters.remove(var_len_str_codec)
                _to_zarr_kwargs["filters"] = filters

        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            output,
            component=var,
            overwrite=True,
            compute=False,
            attrs=first_zarr_group[var].attrs.asdict(),
            **_to_zarr_kwargs,
        )
        delayed.append(d)
    da.compute(*delayed)

    # Copy unchanged variables and top-level metadata.
    with zarr.open_group(output) as output_zarr:
        # Copy variables that are not rechunked (e.g. sample_id).
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # Copy top-level attributes, recomputing the running maximum of
        # alt alleles across all input groups when present.
        group_attrs = dict(first_zarr_group.attrs)
        if "max_alt_alleles_seen" in group_attrs:
            max_alt_alleles_seen = _get_max_len(zarr_groups, "max_alt_alleles_seen")
            group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
        output_zarr.attrs.update(group_attrs)

    # Consolidate metadata so readers can open the store with one read.
    zarr.consolidate_metadata(output)