Example #1
def _metadata_is_consolidated(m: fsspec.FSMap) -> bool:
    try:
        zarr.open_consolidated(m)
        consolidated = True
    except KeyError:
        # group with un-consolidated metadata, or array
        consolidated = False
    return consolidated
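As a quick illustration of how this helper is meant to be used, here is a minimal, hedged sketch; the store path "example.zarr" is a placeholder for any existing fsspec-compatible location:

import fsspec

# Hypothetical usage of _metadata_is_consolidated; "example.zarr" is only a
# placeholder for an existing store location.
m = fsspec.get_mapper("example.zarr")
print(_metadata_is_consolidated(m))  # True only if a .zmetadata key is present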
Example #2
def test_zarr_array_to_parquet_table(dataset):
    """
    Test converting from a zarr array to a parquet table, specifying a list of
    variables to store and setting 'snappy' compression.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        zarrstore: str = os.path.join(tmpdir, "temp.zarr")
        dataset.to_zarr(store=zarrstore, consolidated=True)
        zarrarray: zarr.hierarchy.Group = zarr.open_consolidated(store=zarrstore)

        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        ndarray_to_parquet(
            ndarray=zarrarray,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr", "delta_time"],
            compression="snappy",
        )

        df: dask.dataframe.core.DataFrame = dask.dataframe.read_parquet(
            path=parquetpath
        )
        assert len(df) == 1404
        assert list(df.columns) == [
            "longitude",
            "latitude",
            "h_corr_1",
            "h_corr_2",
            "delta_time_1",
            "delta_time_2",
        ]
        assert all(np.issubdtype(dtype, np.float64) for dtype in df.dtypes)
Example #3
    def _open_snp_sites(self):
        if self._cache_snp_sites is None:
            path = f"{self.path}/v3/snp_genotypes/all/sites/"
            store = SafeStore(self.fs.get_mapper(path))
            root = zarr.open_consolidated(store=store)
            self._cache_snp_sites = root
        return self._cache_snp_sites
Example #4
def get_tracking_ids(zstore):
    """given a GC zarr location, fetch the associated dataset/netCDF tracking IDs"""
    from requests import session
    from zarr import open_consolidated
    from fsspec import get_mapper

    # request params
    client = session()
    base_url = 'http://hdl.handle.net/api/handles/'
    dset_id_query = '?type=IS_PART_OF'
    version_query = '?type=VERSION_NUMBER'

    # get primary tracking id
    netcdf_tracking_ids = open_consolidated(
        get_mapper(zstore)).attrs['tracking_id'].split('\n')
    file_tracking_id = netcdf_tracking_ids[0]

    version_ids = []
    dataset_ids = []

    # query for dataset_tracking_id
    dset_id_url = base_url + file_tracking_id[4:] + dset_id_query
    r = client.get(dset_id_url)
    r.raise_for_status()
    dataset_tracking_id = r.json()['values'][0]['data']['value']

    # mark as ambiguous if multiple dataset_ids were erroneously reported
    if ';' in dataset_tracking_id:
        dataset_tracking_id = "ambiguous"

    return dataset_tracking_id, netcdf_tracking_ids
Example #5
    def open_group(cls, store, mode='r', synchronizer=None, group=None,
                   consolidated=False, consolidate_on_close=False):
        import zarr
        min_zarr = '2.2'

        if LooseVersion(zarr.__version__) < min_zarr:  # pragma: no cover
            raise NotImplementedError("Zarr version %s or greater is "
                                      "required by xarray. See zarr "
                                      "installation "
                                      "http://zarr.readthedocs.io/en/stable/"
                                      "#installation" % min_zarr)

        if consolidated or consolidate_on_close:
            if LooseVersion(
                    zarr.__version__) <= '2.2.1.dev2':  # pragma: no cover
                raise NotImplementedError("Zarr version 2.2.1.dev2 or greater "
                                          "is required by for consolidated "
                                          "metadata.")

        open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
        if consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(zarr_group, consolidate_on_close)
Example #6
    def open_group(
        cls,
        store,
        mode="r",
        synchronizer=None,
        group=None,
        consolidated=False,
        consolidate_on_close=False,
        chunk_store=None,
        append_dim=None,
        write_region=None,
    ):
        import zarr

        # zarr doesn't support pathlib.Path objects yet. zarr-python#601
        if isinstance(store, pathlib.Path):
            store = os.fspath(store)

        open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
        if chunk_store:
            open_kwargs["chunk_store"] = chunk_store

        if consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(zarr_group, consolidate_on_close, append_dim, write_region)
Example #7
def get_schema(url, coords):

    mapper = fsspec.get_mapper(url)
    group = zarr.open_consolidated(mapper)
    schema = synth.read_schema_from_zarr(group, coords)

    return schema
Example #8
    def _open_zarr(self):
        fmap = fsspec.get_mapper(
            f's3://{self.bucket_name}/{self.dataset_id}',
            **self.storage_options,
        )
        self._zarr_group = zarr.open_consolidated(fmap)
        self._total_size = np.sum(
            [arr.nbytes for _, arr in self._zarr_group.items()])
        self._total_size_repr = memory_repr(self._total_size)
Example #9
    def consolidate_metadata(self, metadata_key='.zmetadata'):
        '''
        Wrapper over zarr.consolidate_metadata that also passes the chunk store
        when reopening the zarr store.
        '''
        zarr.consolidate_metadata(self.store, metadata_key=metadata_key)
        store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
        self.zgroup = zarr.open_consolidated(
            self.store, metadata_key=metadata_key, mode=store_mode_cons,
            chunk_store=self.zgroup.chunk_store, path=self.store_path)
        return self.zgroup
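For context, a self-contained sketch of the consolidate-then-reopen round trip this wrapper performs, written against the zarr-python 2.x API used throughout these examples (paths are placeholders):

import zarr

# Write a small group, consolidate its metadata into .zmetadata, then reopen
# it read-only through the consolidated metadata.
root = zarr.open_group("example_consolidate.zarr", mode="w")
root.create_dataset("x", data=[1, 2, 3])
zarr.consolidate_metadata("example_consolidate.zarr")
reopened = zarr.open_consolidated("example_consolidate.zarr", mode="r")
print(reopened["x"][:])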
Example #10
    def _open_snp_genotypes(self, *, sample_set):
        try:
            return self._cache_snp_genotypes[sample_set]
        except KeyError:
            release = self._lookup_release(sample_set=sample_set)
            path = f"{self.path}/{release}/snp_genotypes/all/{sample_set}/"
            store = SafeStore(self.fs.get_mapper(path))
            root = zarr.open_consolidated(store=store)
            self._cache_snp_genotypes[sample_set] = root
            return root
Example #11
    def _open_site_filters(self, *, mask, analysis):
        key = mask, analysis
        try:
            return self._cache_site_filters[key]
        except KeyError:
            path = f"{self.path}/v3/site_filters/{analysis}/{mask}/"
            store = SafeStore(self.fs.get_mapper(path))
            root = zarr.open_consolidated(store=store)
            self._cache_site_filters[key] = root
            return root
Example #12
def fetch_zarr(zarr_url, storage_options={'anon': True}):
    zg = zarr.open_consolidated(fsspec.get_mapper(zarr_url, **storage_options),
                                mode='r')
    dimensions = {}
    variable_arrays = {}
    for k, a in zg.arrays():
        if k in a.attrs['_ARRAY_DIMENSIONS']:
            dimensions[k] = a.attrs['_ARRAY_DIMENSIONS']
        else:
            variable_arrays[k] = a.attrs['_ARRAY_DIMENSIONS']
    return zg, dimensions, variable_arrays
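A hedged local sketch of calling fetch_zarr: the store is first written with xarray so it carries the _ARRAY_DIMENSIONS attributes the function relies on; the path is a placeholder and empty storage_options bypass the anonymous-S3 default.

import numpy as np
import xarray as xr

# Write a tiny consolidated store locally, then split its arrays into
# dimensions and data variables with fetch_zarr.
xr.Dataset(
    {"t2m": ("time", np.arange(3.0))},
    coords={"time": np.arange(3)},
).to_zarr("example_fetch.zarr", consolidated=True, mode="w")

zg, dims, data_vars = fetch_zarr("example_fetch.zarr", storage_options={})
print(dims)       # {'time': ['time']}
print(data_vars)  # {'t2m': ['time']}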
Example #13
def _open_with_xarray_or_zarr(
    m: fsspec.FSMap, consolidated: bool
) -> Tuple[Union[xr.Dataset, zarr.hierarchy.Group, zarr.core.Array], bool]:
    try:
        result = xr.open_zarr(m, consolidated=consolidated)
        is_xarray_dataset = True
    except KeyError:
        # xarray requires the _ARRAY_DIMENSIONS attribute; assume it is missing if a KeyError is raised
        result = zarr.open_consolidated(m) if consolidated else zarr.open(m)
        is_xarray_dataset = False
    return result, is_xarray_dataset
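A small sketch of the fallback path, under the assumption (made by the except clause above) that xarray raises KeyError when the _ARRAY_DIMENSIONS attribute is absent; the store below is written with plain zarr, so it lacks that attribute:

import fsspec
import zarr

# "plain_group.zarr" is a placeholder; plain zarr writes no _ARRAY_DIMENSIONS.
zarr.save_group("plain_group.zarr", x=[1, 2, 3])
m = fsspec.get_mapper("plain_group.zarr")
result, is_xarray = _open_with_xarray_or_zarr(m, consolidated=False)
print(is_xarray)  # expected: False (falls back to zarr.open)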
Example #14
    def open_mask_group(self):
        """Open the zarr group that contains the masks

        Returns
        -------
        mask_group : zarr.Group
        """

        mapper = self.mask_fs.get_mapper(self.mask_path)
        zgroup = zarr.open_consolidated(mapper)
        return zgroup
Example #15
    def _load(self):
        import zarr

        if self._grp is None:

            # obtain the zarr root group
            if isinstance(self._urlpath, zarr.hierarchy.Group):
                # use already-opened group, allows support for nested groups
                # as catalogs
                root = self._urlpath

            else:

                # obtain store
                if isinstance(self._urlpath, str):
                    # open store from url
                    from fsspec import get_mapper
                    store = get_mapper(self._urlpath, **self._storage_options)
                else:
                    # assume store passed directly
                    store = self._urlpath

                # open root group
                if self._consolidated:
                    # use consolidated metadata
                    root = zarr.open_consolidated(store=store, mode='r')
                else:
                    root = zarr.open_group(store=store, mode='r')

            # deal with component path
            if self._component is None:
                self._grp = root
            else:
                self._grp = root[self._component]

            # use zarr attributes as metadata
            self.metadata.update(self._grp.attrs.asdict())

        # build catalog entries
        entries = {}
        for k, v in self._grp.items():
            if isinstance(v, zarr.core.Array):
                entry = LocalCatalogEntry(name=k,
                                          description='',
                                          driver='ndzarr',
                                          args=dict(urlpath=v),
                                          catalog=self)
            else:
                entry = LocalCatalogEntry(name=k,
                                          description='',
                                          driver='zarr_cat',
                                          args=dict(urlpath=v))
            entries[k] = entry
        self._entries = entries
Example #16
def structure_mesh(allen_id):
    if allen_id in _cache:
        return _cache[allen_id]
    fs = HTTPFileSystem()
    # Todo: Use AWS store after Scott / Lydia upload
    store = fs.get_mapper(
        "https://thewtex.github.io/allen-ccf-itk-vtk-zarr/meshes/{0}.zarr".
        format(allen_id))
    root = zarr.open_consolidated(store)
    mesh = zarr_to_vtkjs(root)
    _cache[allen_id] = mesh
    return mesh
Example #17
def get_version(zstore, method='fsspec'):

    client = requests.session()
    baseurl = 'http://hdl.handle.net/api/handles/'
    query1 = '?type=IS_PART_OF'
    query2 = '?type=VERSION_NUMBER'

    # get the `netcdf_tracking_ids` from the zstore metadata
    if method == 'fsspec':
        mapper = fsspec.get_mapper(zstore)
    else:
        mapper = zstore
    group = zarr.open_consolidated(mapper)
    tracking_ids = group.attrs['tracking_id']

    # query the dataset handler to obtain `dataset_tracking_id` and `version`
    versions = []
    datasets = []
    for file_tracking_id in tracking_ids.split('\n')[0:1]:
        url = baseurl + file_tracking_id[4:] + query1
        r = client.get(url)
        r.raise_for_status()
        dataset_tracking_id = r.json()['values'][0]['data']['value']
        datasets += [dataset_tracking_id]
        if ';' in dataset_tracking_id:
            # multiple dataset_ids erroneously reported
            dtracks = dataset_tracking_id.split(';')
            vs = []
            for dtrack in dtracks:
                url2 = baseurl + dtrack[4:] + query2
                r = client.get(url2)
                r.raise_for_status()
                vs += [r.json()['values'][0]['data']['value']]
            v = sorted(vs)[-1]
        else:
            url2 = baseurl + dataset_tracking_id[4:] + query2
            r = client.get(url2)
            r.raise_for_status()
            v = r.json()['values'][0]['data']['value']
        versions += [v]

    version_id = list(set(versions))
    dataset_id = list(set(datasets))

    assert len(version_id) == 1

    return dataset_id[0], version_id[0]
Example #18
    def open_group(
        cls,
        store,
        mode="r",
        synchronizer=None,
        group=None,
        consolidated=False,
        consolidate_on_close=False,
    ):
        import zarr

        open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
        if consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(zarr_group, consolidate_on_close)
Example #19
    def open_group(
        cls,
        store,
        mode="r",
        synchronizer=None,
        group=None,
        consolidated=False,
        consolidate_on_close=False,
        chunk_store=None,
        storage_options=None,
        append_dim=None,
        write_region=None,
        safe_chunks=True,
    ):

        # zarr doesn't support pathlib.Path objects yet. zarr-python#601
        if isinstance(store, pathlib.Path):
            store = os.fspath(store)

        open_kwargs = dict(
            mode=mode,
            synchronizer=synchronizer,
            path=group,
        )
        if LooseVersion(zarr.__version__) >= "2.5.0":
            open_kwargs["storage_options"] = storage_options
        elif storage_options:
            raise ValueError(
                "Storage options only compatible with zarr>=2.5.0")
        if chunk_store:
            open_kwargs["chunk_store"] = chunk_store

        if consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(zarr_group, consolidate_on_close, append_dim, write_region,
                   safe_chunks)
Example #20
def append_zarr_along_time(source_path: str,
                           target_path: str,
                           fs: fsspec.AbstractFileSystem,
                           dim: str = "time"):
    """Append local zarr store at source_path to zarr store at target_path along time.
    
    Args:
        source_path: Local path to zarr store that represents an xarray dataset.
        target_path: Local or remote url for zarr store to be appended to.
        fs: Filesystem for target_path.
        dim: (optional) name of time dimension. Defaults to "time".

    Raises:
        ValueError: If the chunk size in time does not evenly divide the length
            of the time dimension for the zarr store at source_path.

    Warning:
        The zarr store at source_path will be modified in place.
    """

    merged_time = _get_merged_time_coordinate(source_path, target_path, dim,
                                              fs)
    if fs.exists(target_path):
        source_store = zarr.open(source_path, mode="r+")
        target_store = zarr.open_consolidated(fsspec.get_mapper(target_path))
        _assert_chunks_match(source_store, target_store, dim)
        _set_time_units_like(source_store, target_store)
        _shift_store(source_store, dim, _get_dim_size(target_store, dim))
    elif fs.protocol == "file":
        os.makedirs(target_path)

    upload_dir(source_path, target_path)
    _overwrite_time_array_with_single_chunk(target_path, merged_time, dim)

    _, _, absolute_target_paths = fsspec.get_fs_token_paths(target_path)
    consolidate_metadata(fs, absolute_target_paths[0])
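Not the helper above, but for comparison: the same append-along-time idea can be sketched with xarray's built-in to_zarr(append_dim=...), which handles the coordinate bookkeeping itself (paths are placeholders).

import numpy as np
import xarray as xr

first = xr.Dataset({"a": ("time", np.arange(3.0))},
                   coords={"time": np.arange(3)})
second = xr.Dataset({"a": ("time", np.arange(3.0))},
                    coords={"time": np.arange(3, 6)})

first.to_zarr("appended.zarr", consolidated=True, mode="w")
second.to_zarr("appended.zarr", append_dim="time")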
Example #21
def update_slice(store: Union[str, MutableMapping],
                 insert_index: int,
                 dataslice: xr.Dataset,
                 mode: str,
                 dimension: str = "time") -> None:
    """
    Update existing Zarr dataset with new data slice.

    :param store: A Zarr store.
    :param insert_index: index at which to insert
    :param dataslice: slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param dimension: name of dimension perpendicular to slice
    """

    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    append_dim_var_names = []
    encoding = {}

    # Neither Zarr nor xarray offer an explicit API function to check whether
    # a Zarr is consolidated. Here we use the workaround of attempting to
    # open as consolidated, and catching the resulting exception if this
    # isn't possible. In the case of a consolidated Zarr, there is a slight
    # inefficiency, since the consolidated metadata object is fetched twice
    # (by Zarr and thereafter by xarray). See comments on PR #48 for
    # discussion of possible optimizations.
    consolidated = True
    try:
        _ = zarr.open_consolidated(store)
    except KeyError:
        consolidated = False

    with xr.open_zarr(store, consolidated=consolidated) as ds:
        for var_name in ds.variables:
            var = ds[var_name]
            if var.ndim >= 1 and dimension in var.dims:
                if var.dims[0] != dimension:
                    # TODO: Remove this restriction -- it's not fundamentally
                    #   necessary. Removal should be accompanied by appropriate
                    #   unit tests and the addition of a warning to the user
                    #   about potential slowness / inefficiency.
                    raise ValueError(f"dimension '{dimension}' of variable "
                                     f"{var_name!r} must be first dimension")
                append_dim_var_names.append(var_name)
                enc = dict(ds[var_name].encoding)
                # xarray 0.17+ supports engine-preferred chunks when the backend
                # exposes them, and the zarr backend does; but if we pass the
                # resulting 'preferred_chunks' entry back when writing to zarr,
                # xarray raises an "unsupported encoding" error, so drop it.
                if 'preferred_chunks' in enc:
                    del enc['preferred_chunks']
                encoding[var_name] = enc

    temp_dir = tempfile.TemporaryDirectory(prefix='nc2zarr-slice-',
                                           suffix='.zarr')
    dataslice.to_zarr(temp_dir.name, encoding=encoding)
    slice_root_group = zarr.open(temp_dir.name, mode='r')
    slice_arrays = dict(slice_root_group.arrays())

    root_group = zarr.open(store, mode='r+')
    for var_name, var_array in root_group.arrays():
        if var_name in append_dim_var_names:
            slice_array = slice_arrays[var_name]
            if insert_mode:
                # Add one empty step
                empty = zarr.creation.empty(slice_array.shape,
                                            dtype=var_array.dtype)
                var_array.append(empty, axis=0)
                # Shift contents
                var_array[insert_index + 1:, ...] = \
                    var_array[insert_index:-1, ...]
            # Replace slice
            var_array[insert_index, ...] = slice_array[0]

    if consolidated:
        zarr.consolidate_metadata(store)
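A self-contained usage sketch for update_slice: a small time-indexed store is written with xarray, then the slice at time index 1 is replaced with zeros. Variable names and paths are placeholders, and the store is laid out so that "time" is the first dimension, as the function requires.

import tempfile

import numpy as np
import xarray as xr

with tempfile.TemporaryDirectory() as tmp:
    target = f"{tmp}/base.zarr"
    base = xr.Dataset(
        {"temp": (("time", "x"), np.ones((3, 4)))},
        coords={"time": np.arange(3), "x": np.arange(4)},
    )
    base.to_zarr(target, consolidated=True)

    # Replace the existing slice at index 1 with zeros.
    new_slice = base.isel(time=[1]) * 0
    update_slice(target, insert_index=1, dataslice=new_slice, mode="replace")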
Example #22
    def open_group(
        cls,
        store,
        mode="r",
        synchronizer=None,
        group=None,
        consolidated=False,
        consolidate_on_close=False,
        chunk_store=None,
        storage_options=None,
        append_dim=None,
        write_region=None,
        safe_chunks=True,
        stacklevel=2,
    ):

        # zarr doesn't support pathlib.Path objects yet. zarr-python#601
        if isinstance(store, pathlib.Path):
            store = os.fspath(store)

        open_kwargs = dict(
            mode=mode,
            synchronizer=synchronizer,
            path=group,
        )
        if LooseVersion(zarr.__version__) >= "2.5.0":
            open_kwargs["storage_options"] = storage_options
        elif storage_options:
            raise ValueError("Storage options only compatible with zarr>=2.5.0")

        if chunk_store:
            open_kwargs["chunk_store"] = chunk_store
            if consolidated is None:
                consolidated = False

        if consolidated is None:
            try:
                zarr_group = zarr.open_consolidated(store, **open_kwargs)
            except KeyError:
                warnings.warn(
                    "Failed to open Zarr store with consolidated metadata, "
                    "falling back to try reading non-consolidated metadata. "
                    "This is typically much slower for opening a dataset. "
                    "To silence this warning, consider:\n"
                    "1. Consolidating metadata in this existing store with "
                    "zarr.consolidate_metadata().\n"
                    "2. Explicitly setting consolidated=False, to avoid trying "
                    "to read consolidate metadata, or\n"
                    "3. Explicitly setting consolidated=True, to raise an "
                    "error in this case instead of falling back to try "
                    "reading non-consolidated metadata.",
                    RuntimeWarning,
                    stacklevel=stacklevel,
                )
                zarr_group = zarr.open_group(store, **open_kwargs)
        elif consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(
            zarr_group,
            mode,
            consolidate_on_close,
            append_dim,
            write_region,
            safe_chunks,
        )
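A sketch of the user-facing effect of the consolidated=None branch, assuming an xarray version where consolidated defaults to None: opening a store written without consolidated metadata emits the RuntimeWarning above and then falls back to zarr.open_group (the path is a placeholder).

import numpy as np
import xarray as xr

xr.Dataset({"a": ("t", np.arange(3))}).to_zarr(
    "unconsolidated.zarr", consolidated=False, mode="w"
)
ds = xr.open_zarr("unconsolidated.zarr")  # warns, then opens the plain group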
Example #23
    def variation_main_pass(self):
        path = self.base_dir / 'variation/main/zarr/pass/ag1000g.phase2.ar1.pass'
        return zarr.open_consolidated(str(path))
Example #24
    def haplotypes_main(self):
        path = self.base_dir / 'haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes'
        return zarr.open_consolidated(str(path))
Example #25
def load_cmip(
    activity_ids: str = "CMIP",
    experiment_ids: str = "historical",
    member_ids: str = "r1i1p1f1",
    source_ids: str = "MIROC6",
    table_ids: str = "day",
    grid_labels: str = "gn",
    variable_ids: List[str] = ["tasmax"],
    return_type: str = 'zarr',
) -> xr.Dataset:
    """Loads CMIP6 GCM dataset based on input criteria.
    Parameters
    ----------
    activity_ids : list, optional
        activity_ids in CMIP6 catalog, by default ["CMIP", "ScenarioMIP"],
    experiment_ids : list, optional
        experiment_ids in CMIP6 catalog, by default ["historical", "ssp370"],  ex:#  "ssp126", "ssp245",  "ssp585"
    member_ids : list, optional
        member_ids in CMIP6 catalog, by default ["r1i1p1f1"]
    source_ids : list, optional
        source_ids in CMIP6 catalog, by default ["MIROC6"]
    table_ids : list, optional
        table_ids in CMIP6 catalog, by default ["day"]
    grid_labels : list, optional
        grid_labels in CMIP6 catalog, by default ["gn"]
    variable_ids : list, optional
        variable_ids in CMIP6 catalog, by default ['tasmax']
    Returns
    -------
    ds : xr.Dataset or zarr group
        Dataset or zarr group with CMIP data
    """

    if isinstance(variable_ids, str):
        variable_ids = [variable_ids]

    col = cat.cmip6()

    for i, var in enumerate(variable_ids):
        stores = (
            col.search(
                activity_id=activity_ids,
                experiment_id=experiment_ids,
                member_id=member_ids,
                source_id=source_ids,
                table_id=table_ids,
                grid_label=grid_labels,
                variable_id=[var],
            )
            .df['zstore']
            .to_list()
        )

        storage_options = config.get('data_catalog.era5.storage_options')
        if len(stores) > 1:
            raise ValueError('can only get 1 store at a time')
        if return_type == 'zarr':
            ds = zarr.open_consolidated(stores[0], mode='r', storage_options=storage_options)
        elif return_type == 'xr':
            ds = xr.open_zarr(stores[0], consolidated=True, storage_options=storage_options)

        # flip the lats if necessary and drop the extra dims/vars like bnds
        ds = gcm_munge(ds)
        ds = lon_to_180(ds)

        # convert to mm/day - helpful to prevent rounding errors from very tiny numbers
        if var == 'pr':
            ds['pr'] *= 86400

        if i == 0:
            ds_out = ds
        else:
            ds_out[var] = ds[var]

    return ds_out
Example #26
    def test_end_to_end_file_conversion(self, _callback_post):
        """
        Full end-to-end test of the adapter from call to `main` to Harmony callbacks, including
        ensuring the contents of the file are correct.  Mocks S3 interactions using @mock_s3.
        """
        conn = boto3.resource('s3')
        conn.create_bucket(Bucket='example-bucket',
                           CreateBucketConfiguration={
                               'LocationConstraint':
                               os.environ['AWS_DEFAULT_REGION']
                           })

        netcdf_file = create_full_dataset()
        netcdf_file2 = create_full_dataset()
        try:
            message = mock_message_for(netcdf_file, netcdf_file2)
            main([
                'harmony_netcdf_to_zarr', '--harmony-action', 'invoke',
                '--harmony-input', message
            ],
                 config=self.config)
        finally:
            os.remove(netcdf_file)
            os.remove(netcdf_file2)

        callbacks = parse_callbacks(_callback_post)

        # -- Progress and Callback Assertions --
        # Assert that we got three callbacks: one for the first file, one for the second, and the final status message
        self.assertEqual(len(callbacks), 3)
        self.assertEqual(callbacks[0]['progress'], '50')
        self.assertEqual(callbacks[0]['item[type]'], 'application/x-zarr')
        self.assertEqual(callbacks[1]['progress'], '100')
        self.assertEqual(callbacks[1]['item[type]'], 'application/x-zarr')
        self.assertEqual(callbacks[2], {'status': 'successful'})
        self.assertNotEqual(callbacks[0]['item[href]'],
                            callbacks[1]['item[href]'])
        self.assertTrue(callbacks[0]['item[href]'].endswith('.zarr'))
        self.assertTrue(callbacks[1]['item[href]'].endswith('.zarr'))

        # Now calls back with spatial and temporal if present in the incoming message
        self.assertEqual(callbacks[0]['item[temporal]'],
                         '2020-01-01T00:00:00.000Z,2020-01-02T00:00:00.000Z')
        self.assertEqual(callbacks[0]['item[bbox]'], '-11.1,-22.2,33.3,44.4')

        # Open the Zarr file that the adapter called back with
        zarr_location = callbacks[0]['item[href]']
        store = s3fs.S3FileSystem().get_mapper(root=zarr_location, check=False)
        out = zarr.open_consolidated(store)

        # -- Hierarchical Structure Assertions --
        contents = textwrap.dedent("""
            /
             ├── data
             │   ├── horizontal
             │   │   ├── east (1, 3, 3) int64
             │   │   └── west (1, 3, 3) float64
             │   └── vertical
             │       ├── north (1, 3, 3) float64
             │       └── south (1, 3, 3) float64
             ├── location
             │   ├── lat (3, 3) float64
             │   └── lon (3, 3) float64
             └── time (1,) float64
            """).strip()
        self.assertEqual(str(out.tree()), contents)

        # -- Metadata Assertions --
        # Root level values
        self.assertEqual(dict(out.attrs), ROOT_METADATA_VALUES)

        # Group metadata
        self.assertEqual(out['data'].attrs['description'],
                         'Group to hold the data')

        # Variable metadata
        var = out['data/vertical/north']
        self.assertEqual(var.attrs['coordinates'], 'lon lat')

        # -- Data Assertions --
        # Nested Byte Arrays
        self.assertEqual(out['data/vertical/north'][0, 0, 2], 16)
        self.assertEqual(out['data/vertical/north'][0, 2, 0], 0)
        self.assertEqual(out['data/vertical/south'][0, 2, 0], 16)
        self.assertEqual(out['data/vertical/south'][0, 0, 2], 0)
        self.assertEqual(out['data/horizontal/east'][0, 2, 2],
                         16)  # scale_factor = 2
        self.assertEqual(out['data/horizontal/east'][0, 0, 0], 0)
        self.assertEqual(out['data/horizontal/west'][0, 0, 0], 16)
        self.assertEqual(out['data/horizontal/west'][0, 2, 2], 0)

        # 'east' attributes scale_factor removed
        self.assertFalse(hasattr(out['data/horizontal/east'], 'scale_factor'))

        # 'east' attributes present and scaled
        self.assertEqual(out['data/horizontal/east'].attrs['valid_range'],
                         [0.0, 50.0])
        self.assertEqual(out['data/horizontal/east'].attrs['valid_min'], 0.0)
        self.assertEqual(out['data/horizontal/east'].attrs['valid_max'], 50.0)
        self.assertEqual(out['data/horizontal/east'].attrs['_FillValue'],
                         254.0)
        self.assertFalse(hasattr(out['data/horizontal/east'], 'missing_value'))

        # 2D Nested Float Arrays
        self.assertEqual(out['location/lat'][0, 1], 5.5)
        self.assertEqual(out['location/lon'][0, 1], -5.5)

        # 1D Root-Level Float Array sharing its name with a dimension
        self.assertEqual(out['time'][0], 166536)
Example #27
    def variation_main_pass_biallelic(self):
        path = self.base_dir / 'variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic'
        return zarr.open_consolidated(str(path))
Example #28
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the output Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))

    else:
        # read each contig separately, concatenate, rechunk, then save to zarr

        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(prefix="vcfzarr_to_zarr_",
                                         suffix=".zarr",
                                         dir=tempdir) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig,
                                         variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)

                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)

                zarr_files.append(str(contig_zarr_file))

            concat_zarrs_optimized(zarr_files,
                                   output,
                                   vars_to_rechunk,
                                   vars_to_copy,
                                   fix_strings=True)
Example #29
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
    concat_algorithm: Optional[Literal["xarray_internal"]] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the output Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    concat_algorithm
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal`` will
        use built-in Xarray APIs, which can exhibit high memory usage, see https://github.com/dask/dask/issues/6745.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))

    else:
        # read each contig separately, concatenate, rechunk, then save to zarr

        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(prefix="vcfzarr_to_zarr_",
                                         suffix=".zarr",
                                         dir=tempdir) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig,
                                         variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)

                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)

                zarr_files.append(str(contig_zarr_file))

            if concat_algorithm == "xarray_internal":
                ds = zarrs_to_dataset(zarr_files)
                ds.to_zarr(output, mode="w")
            else:
                # Use the optimized algorithm in `concatenate_and_rechunk`
                _concat_zarrs_optimized(zarr_files, output, vars_to_rechunk,
                                        vars_to_copy)
Example #30
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset
    **kwargs : optional
        Extra keyword arguments. ``storage_options`` is passed through to
        DaskMSStore; any other keyword arguments are ignored with a warning.

    Returns
    -------
    datasets : list of Datasets
        Dataset(s) representing the data in the zarr store
    """

    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name,
            group) in enumerate(sorted(table_group.groups(),
                                       key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets