Code Example #1
 def test_sel_unsorted_datetime_index_raises(self) -> None:
     index = PandasIndex(pd.to_datetime(["2001", "2000", "2002"]), "x")
     with pytest.raises(KeyError):
         # pandas will try to convert this into an array indexer. We should
         # raise instead, so we can be sure the result of indexing with a
         # slice is always a view.
         index.sel({"x": slice("2001", "2002")})
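For contrast, here is a minimal sketch (an assumption, not taken from the original suite) of the sorted case, where the same label slice resolves cleanly to a positional slice:

 def test_sel_sorted_datetime_index_slice(self) -> None:
     # with a monotonic index, pandas resolves the label slice to
     # positions, so indexing with the result stays a view
     index = PandasIndex(pd.to_datetime(["2000", "2001", "2002"]), "x")
     actual = index.sel({"x": slice("2001", "2002")})
     assert actual.dim_indexers["x"] == slice(1, 3)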
Code Example #2
File: test_indexes.py Project: aulemahal/xarray
    def test_copy(self):
        expected = PandasIndex([1, 2, 3], "x")
        actual = expected.copy()

        assert actual.index.equals(expected.index)
        assert actual.index is not expected.index
        assert actual.dim == expected.dim
Code Example #3
 def test_sel_boolean(self) -> None:
     # index should be ignored and indexer dtype should not be coerced
     # see https://github.com/pydata/xarray/issues/5727
     index = PandasIndex(pd.Index([0.0, 2.0, 1.0, 3.0]), "x")
     actual = index.sel({"x": [False, True, False, True]})
     expected_dim_indexers = {"x": [False, True, False, True]}
     np.testing.assert_array_equal(actual.dim_indexers["x"],
                                   expected_dim_indexers["x"])
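For comparison, a short sketch (assumed, not from the suite) with non-boolean list labels, which do go through the index, so values are looked up rather than taken positionally:

 index = PandasIndex(pd.Index([0.0, 2.0, 1.0, 3.0]), "x")
 actual = index.sel({"x": [2.0, 3.0]})
 np.testing.assert_array_equal(actual.dim_indexers["x"], [1, 3])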
Code Example #4
    def test_copy(self) -> None:
        expected = PandasIndex([1, 2, 3], "x", coord_dtype=np.int32)
        actual = expected.copy()

        assert actual.index.equals(expected.index)
        assert actual.index is not expected.index
        assert actual.dim == expected.dim
        assert actual.coord_dtype == expected.coord_dtype
Code Example #5
    # assumed: used as a pytest fixture by the surrounding test class
    @pytest.fixture
    def unique_indexes(self) -> list[PandasIndex]:
        x_idx = PandasIndex(pd.Index([1, 2, 3], name="x"), "x")
        y_idx = PandasIndex(pd.Index([4, 5, 6], name="y"), "y")
        z_pd_midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]],
                                               names=["one", "two"])
        z_midx = PandasMultiIndex(z_pd_midx, "z")

        return [x_idx, y_idx, z_midx]
Code Example #6
    def test_sel_datetime(self) -> None:
        index = PandasIndex(
            pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]), "x")
        actual = index.sel({"x": "2001-01-01"})
        expected_dim_indexers = {"x": 1}
        assert actual.dim_indexers == expected_dim_indexers

        actual = index.sel({"x": index.to_pandas_index().to_numpy()[1]})
        assert actual.dim_indexers == expected_dim_indexers
Code Example #7
File: test_indexes.py Project: aulemahal/xarray
    def test_query_datetime(self):
        index = PandasIndex(
            pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]), "x")
        actual = index.query({"x": "2001-01-01"})
        expected = (1, None)
        assert actual == expected

        actual = index.query({"x": index.to_pandas_index().to_numpy()[1]})
        assert actual == expected
Code Example #8
    def test_unstack(self) -> None:
        pd_midx = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]],
                                             names=["one", "two"])
        index = PandasMultiIndex(pd_midx, "x")

        new_indexes, new_pd_idx = index.unstack()
        assert list(new_indexes) == ["one", "two"]
        assert new_indexes["one"].equals(PandasIndex(["a", "b"], "one"))
        assert new_indexes["two"].equals(PandasIndex([1, 2, 3], "two"))
        assert new_pd_idx.equals(pd_midx)
Code Example #9
    # the original test is parametrized over string dtypes (assumed values)
    @pytest.mark.parametrize("dtype", [str, bytes])
    def test_concat_str_dtype(self, dtype) -> None:
        a = PandasIndex(np.array(["a"], dtype=dtype), "x", coord_dtype=dtype)
        b = PandasIndex(np.array(["b"], dtype=dtype), "x", coord_dtype=dtype)
        expected = PandasIndex(np.array(["a", "b"], dtype=dtype),
                               "x",
                               coord_dtype=dtype)

        actual = PandasIndex.concat([a, b], "x")
        assert actual.equals(expected)
        assert np.issubdtype(actual.coord_dtype, dtype)
Code Example #10
File: test_indexes.py Project: mathause/xarray
    def test_concat_periods(self):
        periods = pd.period_range("2000-01-01", periods=10)
        indexes = [PandasIndex(periods[:5], "t"), PandasIndex(periods[5:], "t")]
        expected = PandasIndex(periods, "t")
        actual = PandasIndex.concat(indexes, dim="t")
        assert actual.equals(expected)
        assert isinstance(actual.index, pd.PeriodIndex)

        positions = [list(range(5)), list(range(5, 10))]
        actual = PandasIndex.concat(indexes, dim="t", positions=positions)
        assert actual.equals(expected)
        assert isinstance(actual.index, pd.PeriodIndex)
Code Example #11
    def test_constructor(self) -> None:
        pd_idx = pd.Index([1, 2, 3])
        index = PandasIndex(pd_idx, "x")

        assert index.index.equals(pd_idx)
        # makes a shallow copy
        assert index.index is not pd_idx
        assert index.dim == "x"

        # test no name set for pd.Index
        pd_idx.name = None
        index = PandasIndex(pd_idx, "x")
        assert index.index.name == "x"
Code Example #12
File: test_indexes.py Project: mathause/xarray
    def test_create_variables(self) -> None:
        # pandas has only Float64Index but variable dtype should be preserved
        data = np.array([1.1, 2.2, 3.3], dtype=np.float32)
        pd_idx = pd.Index(data, name="foo")
        index = PandasIndex(pd_idx, "x", coord_dtype=data.dtype)
        index_vars = {
            "foo": IndexVariable(
                "x", data, attrs={"unit": "m"}, encoding={"fill_value": 0.0}
            )
        }

        actual = index.create_variables(index_vars)
        assert_identical(actual["foo"], index_vars["foo"])
        assert actual["foo"].dtype == index_vars["foo"].dtype
        assert actual["foo"].dtype == index.coord_dtype
Code Example #13
File: test_indexes.py Project: aulemahal/xarray
    def test_getitem(self):
        pd_idx = pd.Index([1, 2, 3])
        expected = PandasIndex(pd_idx, "x")
        actual = expected[1:]

        assert actual.index.equals(pd_idx[1:])
        assert actual.dim == expected.dim
Code Example #14
File: test_indexes.py Project: mathause/xarray
    def test_rename(self) -> None:
        index = PandasIndex(pd.Index([1, 2, 3], name="a"), "x", coord_dtype=np.int32)

        # shortcut
        new_index = index.rename({}, {})
        assert new_index is index

        new_index = index.rename({"a": "b"}, {})
        assert new_index.index.name == "b"
        assert new_index.dim == "x"
        assert new_index.coord_dtype == np.int32

        new_index = index.rename({}, {"x": "y"})
        assert new_index.index.name == "a"
        assert new_index.dim == "y"
        assert new_index.coord_dtype == np.int32
Code Example #15
 def test_sel(self) -> None:
     # TODO: add tests that aren't just for edge cases
     index = PandasIndex(pd.Index([1, 2, 3]), "x")
     with pytest.raises(KeyError, match=r"not all values found"):
         index.sel({"x": [0]})
     with pytest.raises(KeyError):
         index.sel({"x": 0})
     with pytest.raises(ValueError, match=r"does not have a MultiIndex"):
         index.sel({"x": {"one": 0}})
Code Example #16
    def test_getitem(self) -> None:
        pd_idx = pd.Index([1, 2, 3])
        expected = PandasIndex(pd_idx, "x", coord_dtype=np.int32)
        actual = expected[1:]

        assert actual.index.equals(pd_idx[1:])
        assert actual.dim == expected.dim
        assert actual.coord_dtype == expected.coord_dtype
Code Example #17
    def test_from_variables_index_adapter(self) -> None:
        # test index type is preserved when variable wraps a pd.Index
        data = pd.Series(["foo", "bar"], dtype="category")
        pd_idx = pd.Index(data)
        var = xr.Variable("x", pd_idx)

        index = PandasIndex.from_variables({"x": var})
        assert isinstance(index.index, pd.CategoricalIndex)
Code Example #18
File: test_indexes.py Project: aulemahal/xarray
    def test_from_pandas_index(self):
        pd_idx = pd.Index([1, 2, 3], name="foo")

        index, index_vars = PandasIndex.from_pandas_index(pd_idx, "x")

        assert index.dim == "x"
        assert index.index is pd_idx
        assert index.index.name == "foo"
        xr.testing.assert_identical(index_vars["foo"],
                                    IndexVariable("x", [1, 2, 3]))

        # test no name set for pd.Index
        pd_idx.name = None
        index, index_vars = PandasIndex.from_pandas_index(pd_idx, "x")
        assert "x" in index_vars
        assert index.index is not pd_idx
        assert index.index.name == "x"
Code Example #19
File: test_indexes.py Project: aulemahal/xarray
    def test_from_variables(self):
        var = xr.Variable("x", [1, 2, 3],
                          attrs={"unit": "m"},
                          encoding={"dtype": np.int32})

        index, index_vars = PandasIndex.from_variables({"x": var})
        xr.testing.assert_identical(var.to_index_variable(), index_vars["x"])
        assert index.dim == "x"
        assert index.index.equals(index_vars["x"].to_index())

        var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]])
        with pytest.raises(ValueError, match=r".*only accepts one variable.*"):
            PandasIndex.from_variables({"x": var, "foo": var2})

        with pytest.raises(ValueError,
                           match=r".*only accepts a 1-dimensional variable.*"):
            PandasIndex.from_variables({"foo": var2})
Code Example #20
    def test_reindex_like(self) -> None:
        index1 = PandasIndex([0, 1, 2], "x")
        index2 = PandasIndex([1, 2, 3, 4], "x")

        expected = {"x": [1, 2, -1, -1]}
        actual = index1.reindex_like(index2)
        assert actual.keys() == expected.keys()
        np.testing.assert_array_equal(actual["x"], expected["x"])

        index3 = PandasIndex([1, 1, 2], "x")
        with pytest.raises(ValueError, match=r".*index has duplicate values"):
            index3.reindex_like(index2)
Code Example #21
    def test_from_variables(self) -> None:
        # pandas has only Float64Index but variable dtype should be preserved
        data = np.array([1.1, 2.2, 3.3], dtype=np.float32)
        var = xr.Variable("x",
                          data,
                          attrs={"unit": "m"},
                          encoding={"dtype": np.float64})

        index = PandasIndex.from_variables({"x": var})
        assert index.dim == "x"
        assert index.index.equals(pd.Index(data))
        assert index.coord_dtype == data.dtype

        var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]])
        with pytest.raises(ValueError, match=r".*only accepts one variable.*"):
            PandasIndex.from_variables({"x": var, "foo": var2})

        with pytest.raises(ValueError,
                           match=r".*only accepts a 1-dimensional variable.*"):
            PandasIndex.from_variables({"foo": var2})
Code Example #22
def test_concat_index_not_same_dim() -> None:
    ds1 = Dataset(coords={"x": ("x", [1, 2])})
    ds2 = Dataset(coords={"x": ("y", [3, 4])})
    # TODO: use public API for setting a non-default index, when available
    ds2._indexes["x"] = PandasIndex([3, 4], "y")

    with pytest.raises(
        ValueError,
        match=r"Cannot concatenate along dimension 'x' indexes with dimensions.*",
    ):
        concat([ds1, ds2], dim="x")
Code Example #23
def test_safe_cast_to_index():
    dates = pd.date_range("2000-01-01", periods=10)
    x = np.arange(5)
    td = x * np.timedelta64(1, "D")
    midx = pd.MultiIndex.from_tuples([(0,)], names=["a"])
    for expected, array in [
        (dates, dates.values),
        (pd.Index(x, dtype=object), x.astype(object)),
        (pd.Index(td), td),
        (pd.Index(td, dtype=object), td.astype(object)),
        (midx, PandasIndex(midx)),
    ]:
        actual = utils.safe_cast_to_index(array)
        assert_array_equal(expected, actual)
        assert expected.dtype == actual.dtype
Code Example #24
    def test_join(self) -> None:
        index1 = PandasIndex(["a", "aa", "aaa"], "x", coord_dtype="<U3")
        index2 = PandasIndex(["aa", "aaa", "aaaa"], "x", coord_dtype="<U4")

        expected = PandasIndex(["aa", "aaa"], "x")
        actual = index1.join(index2)
        assert actual.equals(expected)
        assert actual.coord_dtype == "<U4"

        expected = PandasIndex(["a", "aa", "aaa", "aaaa"], "x")
        actual = index1.join(index2, how="outer")
        assert actual.equals(expected)
        assert actual.coord_dtype == "<U4"
Code Example #25
    def test_concat_dim_error(self) -> None:
        indexes = [PandasIndex([0, 1], "x"), PandasIndex([2, 3], "y")]

        with pytest.raises(ValueError,
                           match=r"Cannot concatenate.*dimensions.*"):
            PandasIndex.concat(indexes, "x")
Code Example #26
File: parallel.py Project: gabrielmpp/xarray
def map_blocks(
    func: Callable[..., T_DSorDA],
    obj: Union[DataArray, Dataset],
    args: Sequence[Any] = (),
    kwargs: Optional[Mapping[str, Any]] = None,
    template: Optional[Union[DataArray, Dataset]] = None,
) -> T_DSorDA:
    """Apply a function to each block of a DataArray or Dataset.

    .. warning::
        This function is experimental and its signature may change.

    Parameters
    ----------
    func : callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter ``obj``. The function will receive a subset or 'block' of ``obj`` (see below),
        corresponding to one chunk along each chunked dimension. ``func`` will be
        executed as ``func(subset_obj, *subset_args, **kwargs)``.

        This function must return either a single DataArray or a single Dataset.

        This function cannot add a new chunked dimension.
    obj : DataArray, Dataset
        Passed to the function as its first argument, one block at a time.
    args : sequence
        Passed to func after unpacking and subsetting any xarray objects by blocks.
        xarray objects in args must be aligned with obj, otherwise an error is raised.
    kwargs : mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        subset to blocks. Passing dask collections in kwargs is not allowed.
    template : DataArray or Dataset, optional
        xarray object representing the final result after compute is called. If not provided,
        the function will be first run on mocked-up data, that looks like ``obj`` but
        has sizes 0, to determine properties of the returned object such as dtype,
        variable names, attributes, new dimensions and new indexes (if any).
        ``template`` must be provided if the function changes the size of existing dimensions.
        When provided, ``attrs`` on variables in `template` are copied over to the result. Any
        ``attrs`` set by ``func`` will be ignored.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of the
    function.

    Notes
    -----
    This function is designed for when ``func`` needs to manipulate a whole xarray object
    subset to each block. Each block is loaded into memory. In the more common case where
    ``func`` can work on numpy arrays, it is recommended to use ``apply_ufunc``.

    If none of the variables in ``obj`` is backed by dask arrays, calling this function is
    equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks
    xarray.DataArray.map_blocks

    Examples
    --------
    Calculate an anomaly from climatology using ``.groupby()``. Using
    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
    its indices, and its methods like ``.groupby()``.

    >>> def calculate_anomaly(da, groupby_type="time.month"):
    ...     gb = da.groupby(groupby_type)
    ...     clim = gb.mean(dim="time")
    ...     return gb - clim
    ...
    >>> time = xr.cftime_range("1990-01", "1992-01", freq="M")
    >>> month = xr.DataArray(time.month, coords={"time": time}, dims=["time"])
    >>> np.random.seed(123)
    >>> array = xr.DataArray(
    ...     np.random.rand(len(time)),
    ...     dims=["time"],
    ...     coords={"time": time, "month": month},
    ... ).chunk()
    >>> array.map_blocks(calculate_anomaly, template=array).compute()
    <xarray.DataArray (time: 24)>
    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 1 2 3 4 5 6 7 8 9 10 11 12

    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
    to the function being applied in ``xr.map_blocks()``:

    >>> array.map_blocks(
    ...     calculate_anomaly,
    ...     kwargs={"groupby_type": "time.year"},
    ...     template=array,
    ... )  # doctest: +ELLIPSIS
    <xarray.DataArray (time: 24)>
    dask.array<<this-array>-calculate_anomaly, shape=(24,), dtype=float64, chunksize=(24,), chunktype=numpy.ndarray>
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 dask.array<chunksize=(24,), meta=np.ndarray>
    """

    def _wrapper(
        func: Callable,
        args: List,
        kwargs: dict,
        arg_is_array: Iterable[bool],
        expected: dict,
    ):
        """
        Wrapper function that receives datasets in args; converts to dataarrays when necessary;
        passes these to the user function `func` and checks returned objects for expected shapes/sizes/etc.
        """

        converted_args = [
            dataset_to_dataarray(arg) if is_array else arg
            for is_array, arg in zip(arg_is_array, args)
        ]

        result = func(*converted_args, **kwargs)

        # check all dims are present
        missing_dimensions = set(expected["shapes"]) - set(result.sizes)
        if missing_dimensions:
            raise ValueError(
                f"Dimensions {missing_dimensions} missing on returned object."
            )

        # check that index lengths and values are as expected
        for name, index in result.xindexes.items():
            if name in expected["shapes"]:
                if len(index) != expected["shapes"][name]:
                    raise ValueError(
                        f"Received dimension {name!r} of length {len(index)}. Expected length {expected['shapes'][name]}."
                    )
            if name in expected["indexes"]:
                expected_index = expected["indexes"][name]
                if not index.equals(expected_index):
                    raise ValueError(
                        f"Expected index {name!r} to be {expected_index!r}. Received {index!r} instead."
                    )

        # check that all expected variables were returned
        check_result_variables(result, expected, "coords")
        if isinstance(result, Dataset):
            check_result_variables(result, expected, "data_vars")

        return make_dict(result)

    if template is not None and not isinstance(template, (DataArray, Dataset)):
        raise TypeError(
            f"template must be a DataArray or Dataset. Received {type(template).__name__} instead."
        )
    if not isinstance(args, Sequence):
        raise TypeError("args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in kwargs.values():
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in kwargs yet. Please compute or "
                "load values before passing to map_blocks."
            )

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    all_args = [obj] + list(args)
    is_xarray = [isinstance(arg, (Dataset, DataArray)) for arg in all_args]
    is_array = [isinstance(arg, DataArray) for arg in all_args]

    # there should be a better way to group this. partition?
    xarray_indices, xarray_objs = unzip(
        (index, arg) for index, arg in enumerate(all_args) if is_xarray[index]
    )
    others = [
        (index, arg) for index, arg in enumerate(all_args) if not is_xarray[index]
    ]

    # all xarray objects must be aligned. This is consistent with apply_ufunc.
    aligned = align(*xarray_objs, join="exact")
    xarray_objs = tuple(
        dataarray_to_dataset(arg) if is_da else arg
        for is_da, arg in zip(is_array, aligned)
    )

    _, npargs = unzip(
        sorted(list(zip(xarray_indices, xarray_objs)) + others, key=lambda x: x[0])
    )

    # check that chunk sizes are compatible
    input_chunks = dict(npargs[0].chunks)
    input_indexes = dict(npargs[0].xindexes)
    for arg in xarray_objs[1:]:
        assert_chunks_compatible(npargs[0], arg)
        input_chunks.update(arg.chunks)
        input_indexes.update(arg.xindexes)

    if template is None:
        # infer template by providing zero-shaped arrays
        template = infer_template(func, aligned[0], *args, **kwargs)
        template_indexes = set(template.xindexes)
        preserved_indexes = template_indexes & set(input_indexes)
        new_indexes = template_indexes - set(input_indexes)
        indexes = {dim: input_indexes[dim] for dim in preserved_indexes}
        indexes.update({k: template.xindexes[k] for k in new_indexes})
        output_chunks = {
            dim: input_chunks[dim] for dim in template.dims if dim in input_chunks
        }

    else:
        # template xarray object has been provided with proper sizes and chunk shapes
        indexes = dict(template.xindexes)
        if isinstance(template, DataArray):
            output_chunks = dict(
                zip(template.dims, template.chunks)  # type: ignore[arg-type]
            )
        else:
            output_chunks = dict(template.chunks)

    for dim in output_chunks:
        if dim in input_chunks and len(input_chunks[dim]) != len(output_chunks[dim]):
            raise ValueError(
                "map_blocks requires that one block of the input maps to one block of output. "
                f"Expected number of output chunks along dimension {dim!r} to be {len(input_chunks[dim])}. "
                f"Received {len(output_chunks[dim])} instead. Please provide template if not provided, or "
                "fix the provided template."
            )

    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}"
        )

    # We're building a new HighLevelGraph hlg. We'll have one new layer
    # for each variable in the dataset, which is the result of the
    # func applied to the values.

    graph: Dict[Any, Any] = {}
    new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict)
    gname = "{}-{}".format(
        dask.utils.funcname(func), dask.base.tokenize(npargs[0], args, kwargs)
    )

    # map dims to list of chunk indexes
    ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()}
    # mapping from chunk index to slice bounds
    input_chunk_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items()
    }
    output_chunk_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in output_chunks.items()
    }

    def subset_dataset_to_block(
        graph: dict, gname: str, dataset: Dataset, input_chunk_bounds, chunk_index
    ):
        """
        Creates a task that subsets an xarray dataset to a block determined by chunk_index.
        Block extents are determined by input_chunk_bounds.
        It also creates subtasks that subset the constituent variables of the dataset.
        """

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        chunk_tuple = tuple(chunk_index.values())
        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index[dim]]

                chunk_variable_task = (f"{name}-{gname}-{chunk[0]}",) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array possibly with dimensions chunked on other variables
                # index into variable appropriately
                subsetter = {
                    dim: _get_chunk_slicer(dim, chunk_index, input_chunk_bounds)
                    for dim in variable.dims
                }
                subset = variable.isel(subsetter)
                chunk_variable_task = (
                    f"{name}-{gname}-{dask.base.tokenize(subset)}",
                ) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs)

    # iterate over all possible chunk combinations
    for chunk_tuple in itertools.product(*ichunk.values()):
        # mapping from dimension name to chunk index
        chunk_index = dict(zip(ichunk.keys(), chunk_tuple))

        blocked_args = [
            subset_dataset_to_block(graph, gname, arg, input_chunk_bounds, chunk_index)
            if isxr
            else arg
            for isxr, arg in zip(is_xarray, npargs)
        ]

        # expected["shapes", "coords", "data_vars", "indexes"] are used to
        # raise nice error messages in _wrapper
        expected = {}
        # input chunk 0 along a dimension maps to output chunk 0 along the same dimension
        # even if length of dimension is changed by the applied function
        expected["shapes"] = {
            k: output_chunks[k][v] for k, v in chunk_index.items() if k in output_chunks
        }
        expected["data_vars"] = set(template.data_vars.keys())  # type: ignore[assignment]
        expected["coords"] = set(template.coords.keys())  # type: ignore[assignment]
        # TODO: benbovy - flexible indexes: clean this up
        # for now assumes pandas index (thus can be indexed) but it won't be the case for
        # all indexes
        expected_indexes = {}
        for dim in indexes:
            idx = indexes[dim].to_pandas_index()[
                _get_chunk_slicer(dim, chunk_index, output_chunk_bounds)
            ]
            expected_indexes[dim] = PandasIndex(idx)
        expected["indexes"] = expected_indexes

        from_wrapper = (gname,) + chunk_tuple
        graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected)

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{name}-{gname}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l,)
            for dim in variable.dims:
                if dim in chunk_index:
                    key += (chunk_index[dim],)
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    # output can have new dimensions with exactly one chunk
                    key += (0,)

            # We're adding multiple new layers to the graph:
            # The first new layer is the result of the computation on
            # the array.
            # Then we add one layer per variable, which extracts the
            # result for that variable, and depends on just the first new
            # layer.
            new_layers[gname_l][key] = (operator.getitem, from_wrapper, name)

    hlg = HighLevelGraph.from_collections(
        gname,
        graph,
        dependencies=[arg for arg in npargs if dask.is_dask_collection(arg)],
    )

    # This adds in the getitems for each variable in the dataset.
    hlg = HighLevelGraph(
        {**hlg.layers, **new_layers},
        dependencies={
            **hlg.dependencies,
            **{name: {gname} for name in new_layers.keys()},
        },
    )

    result = Dataset(coords=indexes, attrs=template.attrs)
    for index in result.xindexes:
        result[index].attrs = template[index].attrs
        result[index].encoding = template[index].encoding

    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in output_chunks:
                var_chunks.append(output_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]),))
            elif dim in template.dims:
                # new unindexed dimension
                var_chunks.append((template.sizes[dim],))

        data = dask.array.Array(
            hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype
        )
        result[name] = (dims, data, template[name].attrs)
        result[name].encoding = template[name].encoding

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore[return-value]
    return result  # type: ignore[return-value]
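A minimal usage sketch (assuming xarray and dask are installed; the dataset and variable names here are illustrative, not from the source), exercising both the eager fallback described in the Notes and the chunked path:

import numpy as np
import xarray as xr

ds = xr.Dataset({"a": ("x", np.arange(4))})

# no dask-backed variables: map_blocks just calls func(obj) eagerly
doubled = xr.map_blocks(lambda block: block * 2, ds)
assert doubled["a"].values.tolist() == [0, 2, 4, 6]

# chunked input: a lazy result is assembled from the graph built above
lazy = xr.map_blocks(lambda block: block * 2, ds.chunk({"x": 2}))
assert lazy.compute()["a"].values.tolist() == [0, 2, 4, 6]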
Code Example #27
    def test_map_index_queries(self) -> None:
        def create_sel_results(
            x_indexer,
            x_index,
            other_vars,
            drop_coords,
            drop_indexes,
            rename_dims,
        ):
            dim_indexers = {"x": x_indexer}
            index_vars = x_index.create_variables()
            indexes = {k: x_index for k in index_vars}
            variables = {}
            variables.update(index_vars)
            variables.update(other_vars)

            return indexing.IndexSelResult(
                dim_indexers=dim_indexers,
                indexes=indexes,
                variables=variables,
                drop_coords=drop_coords,
                drop_indexes=drop_indexes,
                rename_dims=rename_dims,
            )

        def test_indexer(
            data: T_Xarray,
            x: Any,
            expected: indexing.IndexSelResult,
        ) -> None:
            results = indexing.map_index_queries(data, {"x": x})

            assert results.dim_indexers.keys() == expected.dim_indexers.keys()
            assert_array_equal(results.dim_indexers["x"],
                               expected.dim_indexers["x"])

            assert results.indexes.keys() == expected.indexes.keys()
            for k in results.indexes:
                assert results.indexes[k].equals(expected.indexes[k])

            assert results.variables.keys() == expected.variables.keys()
            for k in results.variables:
                assert_array_equal(results.variables[k], expected.variables[k])

            assert set(results.drop_coords) == set(expected.drop_coords)
            assert set(results.drop_indexes) == set(expected.drop_indexes)
            assert results.rename_dims == expected.rename_dims

        data = Dataset({"x": ("x", [1, 2, 3])})
        mindex = pd.MultiIndex.from_product([["a", "b"], [1, 2], [-1, -2]],
                                            names=("one", "two", "three"))
        mdata = DataArray(range(8), [("x", mindex)])

        test_indexer(data, 1, indexing.IndexSelResult({"x": 0}))
        test_indexer(data, np.int32(1), indexing.IndexSelResult({"x": 0}))
        test_indexer(data, Variable([], 1), indexing.IndexSelResult({"x": 0}))
        test_indexer(mdata, ("a", 1, -1), indexing.IndexSelResult({"x": 0}))

        expected = create_sel_results(
            [True, True, False, False, False, False, False, False],
            PandasIndex(pd.Index([-1, -2]), "three"),
            {
                "one": Variable((), "a"),
                "two": Variable((), 1)
            },
            ["x"],
            ["one", "two"],
            {"x": "three"},
        )
        test_indexer(mdata, ("a", 1), expected)

        expected = create_sel_results(
            slice(0, 4, None),
            PandasMultiIndex(
                pd.MultiIndex.from_product([[1, 2], [-1, -2]],
                                           names=("two", "three")),
                "x",
            ),
            {"one": Variable((), "a")},
            [],
            ["one"],
            {},
        )
        test_indexer(mdata, "a", expected)

        expected = create_sel_results(
            [True, True, True, True, False, False, False, False],
            PandasMultiIndex(
                pd.MultiIndex.from_product([[1, 2], [-1, -2]],
                                           names=("two", "three")),
                "x",
            ),
            {"one": Variable((), "a")},
            [],
            ["one"],
            {},
        )
        test_indexer(mdata, ("a", ), expected)

        test_indexer(mdata, [("a", 1, -1), ("b", 2, -2)],
                     indexing.IndexSelResult({"x": [0, 7]}))
        test_indexer(mdata, slice("a", "b"),
                     indexing.IndexSelResult({"x": slice(0, 8, None)}))
        test_indexer(
            mdata,
            slice(("a", 1), ("b", 1)),
            indexing.IndexSelResult({"x": slice(0, 6, None)}),
        )
        test_indexer(
            mdata,
            {
                "one": "a",
                "two": 1,
                "three": -1
            },
            indexing.IndexSelResult({"x": 0}),
        )

        expected = create_sel_results(
            [True, True, False, False, False, False, False, False],
            PandasIndex(pd.Index([-1, -2]), "three"),
            {
                "one": Variable((), "a"),
                "two": Variable((), 1)
            },
            ["x"],
            ["one", "two"],
            {"x": "three"},
        )
        test_indexer(mdata, {"one": "a", "two": 1}, expected)

        expected = create_sel_results(
            [True, False, True, False, False, False, False, False],
            PandasIndex(pd.Index([1, 2]), "two"),
            {
                "one": Variable((), "a"),
                "three": Variable((), -1)
            },
            ["x"],
            ["one", "three"],
            {"x": "two"},
        )
        test_indexer(mdata, {"one": "a", "three": -1}, expected)

        expected = create_sel_results(
            [True, True, True, True, False, False, False, False],
            PandasMultiIndex(
                pd.MultiIndex.from_product([[1, 2], [-1, -2]],
                                           names=("two", "three")),
                "x",
            ),
            {"one": Variable((), "a")},
            [],
            ["one"],
            {},
        )
        test_indexer(mdata, {"one": "a"}, expected)
Code Example #28
 def test_concat_empty(self) -> None:
     idx = PandasIndex.concat([], "x")
     assert idx.coord_dtype is np.dtype("O")
Code Example #29
 def test_to_pandas_index(self) -> None:
     pd_idx = pd.Index([1, 2, 3], name="foo")
     index = PandasIndex(pd_idx, "x")
     assert index.to_pandas_index() is index.index
Code Example #30
 def test_equals(self) -> None:
     index1 = PandasIndex([1, 2, 3], "x")
     index2 = PandasIndex([1, 2, 3], "x")
     assert index1.equals(index2) is True
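A sketch of the negative case (an assumption, not part of the original test), continuing the body above: indexes with differing values compare unequal.

     assert index1.equals(PandasIndex([1, 2, 4], "x")) is False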