prefix="vcf_to_zarr_", dir=tempdir, storage_options=tempdir_storage_options ) as tmpdir: paths = vcf_to_zarrs( input, tmpdir, regions, temp_chunk_length, chunk_width, tempdir_storage_options, ploidy=ploidy, mixed_ploidy=mixed_ploidy, truncate_calls=truncate_calls, ) ds = zarrs_to_dataset(paths, chunk_length, chunk_width, tempdir_storage_options) # Ensure Dask task graph is efficient, see https://github.com/dask/dask/issues/5105 with dask.config.set({"optimization.fuse.ave-width": dask_fuse_avg_width}): ds.to_zarr(output, mode="w") def vcf_to_zarrs( input: Union[PathType, Sequence[PathType]], output: PathType, regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]], chunk_length: int = 10_000, chunk_width: int = 1_000, output_storage_options: Optional[Dict[str, str]] = None, ploidy: int = 2, mixed_ploidy: bool = False,
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
    concat_algorithm: Optional[Literal["xarray_internal"]] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the output Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    concat_algorithm
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal``
        will use built-in Xarray APIs, which can exhibit high memory usage, see
        https://github.com/dask/dask/issues/6745.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))
    else:
        # read each contig separately, concatenate, rechunk, then save to zarr
        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(
            prefix="vcfzarr_to_zarr_", suffix=".zarr", dir=tempdir
        ) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig, variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)
                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)
                zarr_files.append(str(contig_zarr_file))

            if concat_algorithm == "xarray_internal":
                ds = zarrs_to_dataset(zarr_files)
                ds.to_zarr(output, mode="w")
            else:
                # Use the optimized algorithm in `concatenate_and_rechunk`
                _concat_zarrs_optimized(zarr_files, output, vars_to_rechunk, vars_to_copy)
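# Illustrative usage sketch (hypothetical paths, not part of the source):
# convert a per-contig scikit-allel store (e.g. one written by
# ``allel.vcf_to_zarr``) into a single sgkit-format Zarr store.
#
#     vcfzarr_to_zarr(
#         "data/calls.allel.zarr",    # hypothetical input path
#         "data/calls.sgkit.zarr",    # hypothetical output path
#         grouped_by_contig=True,
#         consolidated=True,
#     )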
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal``
        will use built-in Xarray APIs, which can exhibit high memory usage, see
        https://github.com/dask/dask/issues/6745.
    chunk_length
        Length (number of variants) of chunks in which data are stored, by default 10,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    chunk_width
        Width (number of samples) to use when storing chunks in output, by default 1,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    dask_fuse_avg_width
        Setting for Dask's ``optimization.fuse.ave-width``, see
        https://github.com/dask/dask/issues/5105
    """
    if concat_algorithm == "xarray_internal":
        ds = zarrs_to_dataset(urls, chunk_length, chunk_width, storage_options)

        with dask.config.set({"optimization.fuse.ave-width": dask_fuse_avg_width}):
            ds.to_zarr(output, mode="w")
    else:
        vars_to_rechunk = []
        vars_to_copy = []

        storage_options = storage_options or {}
        ds = xr.open_zarr(  # type: ignore[no-untyped-call]
            fsspec.get_mapper(urls[0], **storage_options), concat_characters=False
        )
        for (var, arr) in ds.data_vars.items():
            if arr.dims[0] == "variants":
                vars_to_rechunk.append(var)