prefix="vcf_to_zarr_", dir=tempdir, storage_options=tempdir_storage_options ) as tmpdir: paths = vcf_to_zarrs( input, tmpdir, regions, temp_chunk_length, chunk_width, tempdir_storage_options, ploidy=ploidy, mixed_ploidy=mixed_ploidy, truncate_calls=truncate_calls, ) ds = zarrs_to_dataset(paths, chunk_length, chunk_width, tempdir_storage_options) # Ensure Dask task graph is efficient, see https://github.com/dask/dask/issues/5105 with dask.config.set({"optimization.fuse.ave-width": dask_fuse_avg_width}): ds.to_zarr(output, mode="w") def vcf_to_zarrs( input: Union[PathType, Sequence[PathType]], output: PathType, regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]], chunk_length: int = 10_000, chunk_width: int = 1_000, output_storage_options: Optional[Dict[str, str]] = None, ploidy: int = 2, mixed_ploidy: bool = False,
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
    concat_algorithm: Optional[Literal["xarray_internal"]] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the output Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    concat_algorithm
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal``
        will use built-in Xarray APIs, which can exhibit high memory usage, see
        https://github.com/dask/dask/issues/6745.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))
    else:
        # read each contig separately, concatenate, rechunk, then save to zarr
        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(
            prefix="vcfzarr_to_zarr_", suffix=".zarr", dir=tempdir
        ) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig, variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)
                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)
                zarr_files.append(str(contig_zarr_file))

            if concat_algorithm == "xarray_internal":
                ds = zarrs_to_dataset(zarr_files)
                ds.to_zarr(output, mode="w")
            else:
                # Use the optimized algorithm in `concatenate_and_rechunk`
                _concat_zarrs_optimized(zarr_files, output, vars_to_rechunk, vars_to_copy)
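# Illustrative usage sketch (hypothetical paths, not part of the source):
# convert a per-contig scikit-allel store (e.g. one written by
# ``allel.vcf_to_zarr``) into a single sgkit-format Zarr store.
#
#     vcfzarr_to_zarr(
#         "data/calls.allel.zarr",    # hypothetical input path
#         "data/calls.sgkit.zarr",    # hypothetical output path
#         grouped_by_contig=True,
#         consolidated=True,
#     )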
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal``
        will use built-in Xarray APIs, which can exhibit high memory usage, see
        https://github.com/dask/dask/issues/6745.
    chunk_length
        Length (number of variants) of chunks in which data are stored, by default 10,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    chunk_width
        Width (number of samples) to use when storing chunks in output, by default 1,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    dask_fuse_avg_width
        Setting for Dask's ``optimization.fuse.ave-width``, see
        https://github.com/dask/dask/issues/5105
    """
    if concat_algorithm == "xarray_internal":
        ds = zarrs_to_dataset(urls, chunk_length, chunk_width, storage_options)

        with dask.config.set({"optimization.fuse.ave-width": dask_fuse_avg_width}):
            ds.to_zarr(output, mode="w")
    else:
        vars_to_rechunk = []
        vars_to_copy = []

        storage_options = storage_options or {}
        ds = xr.open_zarr(  # type: ignore[no-untyped-call]
            fsspec.get_mapper(urls[0], **storage_options), concat_characters=False
        )
        for (var, arr) in ds.data_vars.items():
            if arr.dims[0] == "variants":
                vars_to_rechunk.append(var)