Example #1
0
        prefix="vcf_to_zarr_", dir=tempdir, storage_options=tempdir_storage_options
    ) as tmpdir:

        paths = vcf_to_zarrs(
            input,
            tmpdir,
            regions,
            temp_chunk_length,
            chunk_width,
            tempdir_storage_options,
            ploidy=ploidy,
            mixed_ploidy=mixed_ploidy,
            truncate_calls=truncate_calls,
        )

        ds = zarrs_to_dataset(paths, chunk_length, chunk_width, tempdir_storage_options)

        # Ensure Dask task graph is efficient, see https://github.com/dask/dask/issues/5105
        with dask.config.set({"optimization.fuse.ave-width": dask_fuse_avg_width}):
            ds.to_zarr(output, mode="w")


def vcf_to_zarrs(
    input: Union[PathType, Sequence[PathType]],
    output: PathType,
    regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]],
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    output_storage_options: Optional[Dict[str, str]] = None,
    ploidy: int = 2,
    mixed_ploidy: bool = False,
Example #2
0
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
    concat_algorithm: Optional[Literal["xarray_internal"]] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the ouput Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    concat_algorithm
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal`` will
        use built-in Xarray APIs, which can exhibit high memory usage, see https://github.com/dask/dask/issues/6745.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))

    else:
        # read each contig separately, concatenate, rechunk, then save to zarr

        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(prefix="vcfzarr_to_zarr_",
                                         suffix=".zarr",
                                         dir=tempdir) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig,
                                         variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)

                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)

                zarr_files.append(str(contig_zarr_file))

            if concat_algorithm == "xarray_internal":
                ds = zarrs_to_dataset(zarr_files)
                ds.to_zarr(output, mode="w")
            else:
                # Use the optimized algorithm in `concatenate_and_rechunk`
                _concat_zarrs_optimized(zarr_files, output, vars_to_rechunk,
                                        vars_to_copy)
Example #3
0
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal`` will
        use built-in Xarray APIs, which can exhibit high memory usage, see https://github.com/dask/dask/issues/6745.
    chunk_length
        Length (number of variants) of chunks in which data are stored, by default 10,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    chunk_width
        Width (number of samples) to use when storing chunks in output, by default 1,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    dask_fuse_avg_width
        Setting for Dask's ``optimization.fuse.ave-width``, see https://github.com/dask/dask/issues/5105
    """
    if concat_algorithm == "xarray_internal":
        ds = zarrs_to_dataset(urls, chunk_length, chunk_width, storage_options)

        with dask.config.set(
            {"optimization.fuse.ave-width": dask_fuse_avg_width}):
            ds.to_zarr(output, mode="w")
    else:

        vars_to_rechunk = []
        vars_to_copy = []
        storage_options = storage_options or {}
        ds = xr.open_zarr(  # type: ignore[no-untyped-call]
            fsspec.get_mapper(urls[0], **storage_options),
            concat_characters=False)
        for (var, arr) in ds.data_vars.items():
            if arr.dims[0] == "variants":
                vars_to_rechunk.append(var)