Example #1
 def test_do_not_overwrite_user_coordinates(self) -> None:
     orig = Dataset(
         coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])},
         data_vars={"a": ("x", [1, 2, 3]), "b": ("x", [3, 5, 6])},
     )
     orig["a"].encoding["coordinates"] = "y"
     orig["b"].encoding["coordinates"] = "z"
     enc, _ = conventions.encode_dataset_coordinates(orig)
     assert enc["a"].attrs["coordinates"] == "y"
     assert enc["b"].attrs["coordinates"] == "z"
     orig["a"].attrs["coordinates"] = "foo"
     with pytest.raises(ValueError, match=r"'coordinates' found in both attrs"):
         conventions.encode_dataset_coordinates(orig)
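
These tests exercise xarray's conventions.encode_dataset_coordinates, which moves a "coordinates" entry from a variable's encoding into its attrs when a dataset is serialized. A minimal sketch of that behavior, separate from the test suite (the dataset here is illustrative):

import xarray as xr
from xarray import conventions

ds = xr.Dataset(
    coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7])},
    data_vars={"a": ("x", [1, 2, 3])},
)
ds["a"].encoding["coordinates"] = "y"  # request "y" in the CF attribute
enc, global_attrs = conventions.encode_dataset_coordinates(ds)
print(enc["a"].attrs["coordinates"])  # "y"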
Example #2
 def test_multidimensional_coordinates(self):
     # regression test for GH1763
     # Set up test case with coordinates that have overlapping (but not
     # identical) dimensions.
     zeros1 = np.zeros((1, 5, 3))
     zeros2 = np.zeros((1, 6, 3))
     zeros3 = np.zeros((1, 5, 4))
     orig = Dataset({
         'lon1': (['x1', 'y1'], zeros1.squeeze(0), {}),
         'lon2': (['x2', 'y1'], zeros2.squeeze(0), {}),
         'lon3': (['x1', 'y2'], zeros3.squeeze(0), {}),
         'lat1': (['x1', 'y1'], zeros1.squeeze(0), {}),
         'lat2': (['x2', 'y1'], zeros2.squeeze(0), {}),
         'lat3': (['x1', 'y2'], zeros3.squeeze(0), {}),
         'foo1': (['time', 'x1', 'y1'], zeros1,
                  {'coordinates': 'lon1 lat1'}),
         'foo2': (['time', 'x2', 'y1'], zeros2,
                  {'coordinates': 'lon2 lat2'}),
         'foo3': (['time', 'x1', 'y2'], zeros3,
                  {'coordinates': 'lon3 lat3'}),
         'time': ('time', [0.], {'units': 'hours since 2017-01-01'}),
     })
     orig = conventions.decode_cf(orig)
     # Encode the coordinates, as they would be in a netCDF output file.
     enc, attrs = conventions.encode_dataset_coordinates(orig)
     # Make sure we have the right coordinates for each variable.
     foo1_coords = enc['foo1'].attrs.get('coordinates', '')
     foo2_coords = enc['foo2'].attrs.get('coordinates', '')
     foo3_coords = enc['foo3'].attrs.get('coordinates', '')
     assert set(foo1_coords.split()) == set(['lat1', 'lon1'])
     assert set(foo2_coords.split()) == set(['lat2', 'lon2'])
     assert set(foo3_coords.split()) == set(['lat3', 'lon3'])
     # Should not have any global coordinates.
     assert 'coordinates' not in attrs
Example #3
 def test_multidimensional_coordinates(self) -> None:
     # regression test for GH1763
     # Set up test case with coordinates that have overlapping (but not
     # identical) dimensions.
     zeros1 = np.zeros((1, 5, 3))
     zeros2 = np.zeros((1, 6, 3))
     zeros3 = np.zeros((1, 5, 4))
     orig = Dataset(
         {
             "lon1": (["x1", "y1"], zeros1.squeeze(0), {}),
             "lon2": (["x2", "y1"], zeros2.squeeze(0), {}),
             "lon3": (["x1", "y2"], zeros3.squeeze(0), {}),
             "lat1": (["x1", "y1"], zeros1.squeeze(0), {}),
             "lat2": (["x2", "y1"], zeros2.squeeze(0), {}),
             "lat3": (["x1", "y2"], zeros3.squeeze(0), {}),
             "foo1": (["time", "x1", "y1"], zeros1, {"coordinates": "lon1 lat1"}),
             "foo2": (["time", "x2", "y1"], zeros2, {"coordinates": "lon2 lat2"}),
             "foo3": (["time", "x1", "y2"], zeros3, {"coordinates": "lon3 lat3"}),
             "time": ("time", [0.0], {"units": "hours since 2017-01-01"}),
         }
     )
     orig = conventions.decode_cf(orig)
     # Encode the coordinates, as they would be in a netCDF output file.
     enc, attrs = conventions.encode_dataset_coordinates(orig)
     # Make sure we have the right coordinates for each variable.
     foo1_coords = enc["foo1"].attrs.get("coordinates", "")
     foo2_coords = enc["foo2"].attrs.get("coordinates", "")
     foo3_coords = enc["foo3"].attrs.get("coordinates", "")
     assert set(foo1_coords.split()) == {"lat1", "lon1"}
     assert set(foo2_coords.split()) == {"lat2", "lon2"}
     assert set(foo3_coords.split()) == {"lat3", "lon3"}
     # Should not have any global coordinates.
     assert "coordinates" not in attrs
Example #4
    def test_emit_coordinates_attribute_in_encoding(self) -> None:
        orig = Dataset(
            {"a": 1, "b": 1},
            coords={"t": np.array("2004-11-01T00:00:00", dtype=np.datetime64)},
        )

        orig["a"].encoding["coordinates"] = None
        enc, _ = conventions.encode_dataset_coordinates(orig)

        # check coordinate attribute not emitted for 'a' (suppressed via encoding)
        assert "coordinates" not in enc["a"].attrs
        assert "coordinates" not in enc["a"].encoding

        # check coordinate attribute emitted for 'b'
        assert enc["b"].attrs.get("coordinates") == "t"
        assert "coordinates" not in enc["b"].encoding
Example #5
 def test_var_with_coord_attr(self) -> None:
     # regression test for GH6310
     # don't overwrite user-defined "coordinates" attributes
     orig = Dataset(
         {"values": ("time", np.zeros(2), {"coordinates": "time lon lat"})},
         coords={
             "time": ("time", np.zeros(2)),
             "lat": ("time", np.zeros(2)),
             "lon": ("time", np.zeros(2)),
         },
     )
     # Encode the coordinates, as they would be in a netCDF output file.
     enc, attrs = conventions.encode_dataset_coordinates(orig)
     # Make sure we have the right coordinates for each variable.
     values_coords = enc["values"].attrs.get("coordinates", "")
     assert set(values_coords.split()) == {"time", "lat", "lon"}
     # Should not have any global coordinates.
     assert "coordinates" not in attrs
Example #6
import dask.array
import xarray
import zarr
from xarray.backends.zarr import (
    DIMENSION_KEY,
    encode_zarr_attr_value,
    encode_zarr_variable,
    extract_zarr_variable_encoding,
)
from xarray.conventions import encode_dataset_coordinates

# Note: _encode_zarr_attributes, _setup_array_rechunk, _validate_options and
# ZARR_OPTIONS are private helpers defined elsewhere in this module.


def _setup_rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    target_options=None,
    temp_store=None,
    temp_options=None,
):
    if temp_options is None:
        temp_options = target_options
    target_options = target_options or {}
    temp_options = temp_options or {}

    if isinstance(source, xarray.Dataset):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a dataset."
            )

        variables, attrs = encode_dataset_coordinates(source)
        attrs = _encode_zarr_attributes(attrs)

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(attrs)

        copy_specs = []
        for name, variable in variables.items():
            # This isn't strictly necessary because a shallow copy
            # also occurs in `encode_dataset_coordinates`, but do it
            # anyway in case the coord encoding function changes.
            variable = variable.copy()

            # Update the array encoding with provided options and apply it;
            # note that at this point the `options` may contain any valid property
            # applicable for the `encoding` parameter in Dataset.to_zarr other than "chunks"
            options = target_options.get(name, {})
            if "chunks" in options:
                raise ValueError(
                    f"Chunks must be provided in ``target_chunks`` rather than options (variable={name})"
                )
            variable.encoding.update(options)
            variable = encode_zarr_variable(variable)

            # Extract the array encoding to get a default chunking, a step
            # which will also ensure that the target chunking is compatible
            # with the current chunking (only necessary for on-disk arrays)
            variable_encoding = extract_zarr_variable_encoding(
                variable, raise_on_invalid=False, name=name
            )
            variable_chunks = target_chunks.get(name, variable_encoding["chunks"])

            # Restrict options to only those that are specific to zarr and
            # not managed internally
            options = {k: v for k, v in options.items() if k in ZARR_OPTIONS}
            _validate_options(options)

            # Extract array attributes along with reserved property for
            # xarray dimension names
            variable_attrs = _encode_zarr_attributes(variable.attrs)
            variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(variable.dims)

            copy_spec = _setup_array_rechunk(
                dask.array.asarray(variable),
                variable_chunks,
                max_mem,
                target_group,
                target_options=options,
                temp_store_or_group=temp_group,
                temp_options=options,
                name=name,
            )
            copy_spec.write.array.attrs.update(variable_attrs)  # type: ignore
            copy_specs.append(copy_spec)

        return copy_specs, temp_group, target_group

    elif isinstance(source, zarr.hierarchy.Group):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a group."
            )

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(source.attrs)

        copy_specs = []
        for array_name, array_target_chunks in target_chunks.items():
            copy_spec = _setup_array_rechunk(
                source[array_name],
                array_target_chunks,
                max_mem,
                target_group,
                target_options=target_options.get(array_name),
                temp_store_or_group=temp_group,
                temp_options=temp_options.get(array_name),
                name=array_name,
            )
            copy_specs.append(copy_spec)

        return copy_specs, temp_group, target_group

    elif isinstance(source, (zarr.core.Array, dask.array.Array)):
        copy_spec = _setup_array_rechunk(
            source,
            target_chunks,
            max_mem,
            target_store,
            target_options=target_options,
            temp_store_or_group=temp_store,
            temp_options=temp_options,
        )
        intermediate = copy_spec.intermediate.array
        target = copy_spec.write.array
        return [copy_spec], intermediate, target

    else:
        raise ValueError(
            f"Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset (not {type(source)})."
        )
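
_setup_rechunk is internal to rechunker; callers normally reach it through the public rechunk function, which builds the copy plan and returns an executable Rechunked object. A hedged usage sketch (store paths, chunk sizes, and the memory limit are placeholders):

import numpy as np
import xarray as xr
from rechunker import rechunk

ds = xr.Dataset({"air": (("time", "x"), np.zeros((100, 50)))})
plan = rechunk(
    ds,                                             # xarray.Dataset source
    target_chunks={"air": {"time": 100, "x": 10}},  # per-variable chunk spec
    max_mem="50MB",
    target_store="target.zarr",
    temp_store="temp.zarr",
)
plan.execute()  # runs the copy plan (with dask, by default)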