def test_sel_unsorted_datetime_index_raises(self) -> None:
    index = PandasIndex(pd.to_datetime(["2001", "2000", "2002"]), "x")
    with pytest.raises(KeyError):
        # pandas will try to convert this into an array indexer. We should
        # raise instead, so we can be sure the result of indexing with a
        # slice is always a view.
        index.sel({"x": slice("2001", "2002")})
def test_copy(self):
    expected = PandasIndex([1, 2, 3], "x")
    actual = expected.copy()

    assert actual.index.equals(expected.index)
    assert actual.index is not expected.index
    assert actual.dim == expected.dim
def test_sel_boolean(self) -> None:
    # index should be ignored and indexer dtype should not be coerced
    # see https://github.com/pydata/xarray/issues/5727
    index = PandasIndex(pd.Index([0.0, 2.0, 1.0, 3.0]), "x")
    actual = index.sel({"x": [False, True, False, True]})
    expected_dim_indexers = {"x": [False, True, False, True]}
    np.testing.assert_array_equal(
        actual.dim_indexers["x"], expected_dim_indexers["x"]
    )
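# Hedged standalone sketch (not part of the xarray test suite) of the coercion
# pitfall referenced above: if the boolean indexer were cast to the float64
# dtype of the index, it would stop being a mask and select wrong positions.
import numpy as np

mask = np.array([False, True, False, True])
print(np.asarray(mask, dtype=np.float64))  # [0. 1. 0. 1.] -- no longer usable as a mask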
def test_copy(self) -> None:
    expected = PandasIndex([1, 2, 3], "x", coord_dtype=np.int32)
    actual = expected.copy()

    assert actual.index.equals(expected.index)
    assert actual.index is not expected.index
    assert actual.dim == expected.dim
    assert actual.coord_dtype == expected.coord_dtype
def unique_indexes(self) -> list[PandasIndex]:
    x_idx = PandasIndex(pd.Index([1, 2, 3], name="x"), "x")
    y_idx = PandasIndex(pd.Index([4, 5, 6], name="y"), "y")
    z_pd_midx = pd.MultiIndex.from_product(
        [["a", "b"], [1, 2]], names=["one", "two"]
    )
    z_midx = PandasMultiIndex(z_pd_midx, "z")

    return [x_idx, y_idx, z_midx]
def test_sel_datetime(self) -> None:
    index = PandasIndex(
        pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]), "x"
    )
    actual = index.sel({"x": "2001-01-01"})
    expected_dim_indexers = {"x": 1}
    assert actual.dim_indexers == expected_dim_indexers

    actual = index.sel({"x": index.to_pandas_index().to_numpy()[1]})
    assert actual.dim_indexers == expected_dim_indexers
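# For reference, a minimal pandas-only sketch of the lookup that backs this
# kind of label-based datetime selection: string labels are parsed to
# timestamps before positional resolution.
import pandas as pd

idx = pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"])
print(idx.get_loc("2001-01-01"))  # 1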
def test_query_datetime(self):
    index = PandasIndex(
        pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]), "x"
    )
    actual = index.query({"x": "2001-01-01"})
    expected = (1, None)
    assert actual == expected

    actual = index.query({"x": index.to_pandas_index().to_numpy()[1]})
    assert actual == expected
def test_unstack(self) -> None:
    pd_midx = pd.MultiIndex.from_product(
        [["a", "b"], [1, 2, 3]], names=["one", "two"]
    )
    index = PandasMultiIndex(pd_midx, "x")

    new_indexes, new_pd_idx = index.unstack()
    assert list(new_indexes) == ["one", "two"]
    assert new_indexes["one"].equals(PandasIndex(["a", "b"], "one"))
    assert new_indexes["two"].equals(PandasIndex([1, 2, 3], "two"))
    assert new_pd_idx.equals(pd_midx)
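# A short standalone sketch of the pandas structure this test relies on: each
# MultiIndex level carries its own name and flat index, which is what
# unstack() promotes to separate single-dimension indexes.
import pandas as pd

pd_midx = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["one", "two"])
print(pd_midx.levels[0])  # Index(['a', 'b'], dtype='object', name='one')
print(pd_midx.levels[1])  # Index([1, 2, 3], dtype='int64', name='two')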
def test_concat_str_dtype(self, dtype) -> None:
    a = PandasIndex(np.array(["a"], dtype=dtype), "x", coord_dtype=dtype)
    b = PandasIndex(np.array(["b"], dtype=dtype), "x", coord_dtype=dtype)
    expected = PandasIndex(
        np.array(["a", "b"], dtype=dtype), "x", coord_dtype=dtype
    )

    actual = PandasIndex.concat([a, b], "x")
    assert actual.equals(expected)
    assert np.issubdtype(actual.coord_dtype, dtype)
def test_concat_periods(self):
    periods = pd.period_range("2000-01-01", periods=10)
    indexes = [PandasIndex(periods[:5], "t"), PandasIndex(periods[5:], "t")]
    expected = PandasIndex(periods, "t")
    actual = PandasIndex.concat(indexes, dim="t")
    assert actual.equals(expected)
    assert isinstance(actual.index, pd.PeriodIndex)

    positions = [list(range(5)), list(range(5, 10))]
    actual = PandasIndex.concat(indexes, dim="t", positions=positions)
    assert actual.equals(expected)
    assert isinstance(actual.index, pd.PeriodIndex)
def test_constructor(self) -> None:
    pd_idx = pd.Index([1, 2, 3])
    index = PandasIndex(pd_idx, "x")

    assert index.index.equals(pd_idx)
    # makes a shallow copy
    assert index.index is not pd_idx
    assert index.dim == "x"

    # test no name set for pd.Index
    pd_idx.name = None
    index = PandasIndex(pd_idx, "x")
    assert index.index.name == "x"
def test_create_variables(self) -> None:
    # pandas has only Float64Index but variable dtype should be preserved
    data = np.array([1.1, 2.2, 3.3], dtype=np.float32)
    pd_idx = pd.Index(data, name="foo")
    index = PandasIndex(pd_idx, "x", coord_dtype=data.dtype)
    index_vars = {
        "foo": IndexVariable(
            "x", data, attrs={"unit": "m"}, encoding={"fill_value": 0.0}
        )
    }

    actual = index.create_variables(index_vars)
    assert_identical(actual["foo"], index_vars["foo"])
    assert actual["foo"].dtype == index_vars["foo"].dtype
    assert actual["foo"].dtype == index.coord_dtype
def test_getitem(self):
    pd_idx = pd.Index([1, 2, 3])
    expected = PandasIndex(pd_idx, "x")
    actual = expected[1:]

    assert actual.index.equals(pd_idx[1:])
    assert actual.dim == expected.dim
def test_rename(self) -> None:
    index = PandasIndex(pd.Index([1, 2, 3], name="a"), "x", coord_dtype=np.int32)

    # shortcut
    new_index = index.rename({}, {})
    assert new_index is index

    new_index = index.rename({"a": "b"}, {})
    assert new_index.index.name == "b"
    assert new_index.dim == "x"
    assert new_index.coord_dtype == np.int32

    new_index = index.rename({}, {"x": "y"})
    assert new_index.index.name == "a"
    assert new_index.dim == "y"
    assert new_index.coord_dtype == np.int32
def test_sel(self) -> None:
    # TODO: add tests that aren't just for edge cases
    index = PandasIndex(pd.Index([1, 2, 3]), "x")

    with pytest.raises(KeyError, match=r"not all values found"):
        index.sel({"x": [0]})
    with pytest.raises(KeyError):
        index.sel({"x": 0})
    with pytest.raises(ValueError, match=r"does not have a MultiIndex"):
        index.sel({"x": {"one": 0}})
def test_getitem(self) -> None:
    pd_idx = pd.Index([1, 2, 3])
    expected = PandasIndex(pd_idx, "x", coord_dtype=np.int32)
    actual = expected[1:]

    assert actual.index.equals(pd_idx[1:])
    assert actual.dim == expected.dim
    assert actual.coord_dtype == expected.coord_dtype
def test_from_variables_index_adapter(self) -> None:
    # test index type is preserved when variable wraps a pd.Index
    data = pd.Series(["foo", "bar"], dtype="category")
    pd_idx = pd.Index(data)
    var = xr.Variable("x", pd_idx)

    index = PandasIndex.from_variables({"x": var})
    assert isinstance(index.index, pd.CategoricalIndex)
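# Standalone pandas-only sketch of the behaviour being preserved here:
# wrapping a categorical Series in pd.Index yields a CategoricalIndex.
import pandas as pd

data = pd.Series(["foo", "bar"], dtype="category")
print(isinstance(pd.Index(data), pd.CategoricalIndex))  # True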
def test_from_pandas_index(self):
    pd_idx = pd.Index([1, 2, 3], name="foo")

    index, index_vars = PandasIndex.from_pandas_index(pd_idx, "x")

    assert index.dim == "x"
    assert index.index is pd_idx
    assert index.index.name == "foo"
    xr.testing.assert_identical(index_vars["foo"], IndexVariable("x", [1, 2, 3]))

    # test no name set for pd.Index
    pd_idx.name = None
    index, index_vars = PandasIndex.from_pandas_index(pd_idx, "x")
    assert "x" in index_vars
    assert index.index is not pd_idx
    assert index.index.name == "x"
def test_from_variables(self):
    var = xr.Variable(
        "x", [1, 2, 3], attrs={"unit": "m"}, encoding={"dtype": np.int32}
    )

    index, index_vars = PandasIndex.from_variables({"x": var})
    xr.testing.assert_identical(var.to_index_variable(), index_vars["x"])
    assert index.dim == "x"
    assert index.index.equals(index_vars["x"].to_index())

    var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]])
    with pytest.raises(ValueError, match=r".*only accepts one variable.*"):
        PandasIndex.from_variables({"x": var, "foo": var2})

    with pytest.raises(
        ValueError, match=r".*only accepts a 1-dimensional variable.*"
    ):
        PandasIndex.from_variables({"foo": var2})
def test_reindex_like(self) -> None:
    index1 = PandasIndex([0, 1, 2], "x")
    index2 = PandasIndex([1, 2, 3, 4], "x")

    expected = {"x": [1, 2, -1, -1]}
    actual = index1.reindex_like(index2)
    assert actual.keys() == expected.keys()
    np.testing.assert_array_equal(actual["x"], expected["x"])

    index3 = PandasIndex([1, 1, 2], "x")
    with pytest.raises(ValueError, match=r".*index has duplicate values"):
        index3.reindex_like(index2)
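# The -1 markers in the expected indexer follow pandas' get_indexer convention
# for labels absent from the index; a minimal sketch:
import pandas as pd

print(pd.Index([0, 1, 2]).get_indexer([1, 2, 3, 4]))  # [ 1  2 -1 -1]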
def test_from_variables(self) -> None:
    # pandas has only Float64Index but variable dtype should be preserved
    data = np.array([1.1, 2.2, 3.3], dtype=np.float32)
    var = xr.Variable(
        "x", data, attrs={"unit": "m"}, encoding={"dtype": np.float64}
    )

    index = PandasIndex.from_variables({"x": var})
    assert index.dim == "x"
    assert index.index.equals(pd.Index(data))
    assert index.coord_dtype == data.dtype

    var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]])
    with pytest.raises(ValueError, match=r".*only accepts one variable.*"):
        PandasIndex.from_variables({"x": var, "foo": var2})

    with pytest.raises(
        ValueError, match=r".*only accepts a 1-dimensional variable.*"
    ):
        PandasIndex.from_variables({"foo": var2})
def test_concat_index_not_same_dim() -> None:
    ds1 = Dataset(coords={"x": ("x", [1, 2])})
    ds2 = Dataset(coords={"x": ("y", [3, 4])})
    # TODO: use public API for setting a non-default index, when available
    ds2._indexes["x"] = PandasIndex([3, 4], "y")

    with pytest.raises(
        ValueError,
        match=r"Cannot concatenate along dimension 'x' indexes with dimensions.*",
    ):
        concat([ds1, ds2], dim="x")
def test_safe_cast_to_index():
    dates = pd.date_range("2000-01-01", periods=10)
    x = np.arange(5)
    td = x * np.timedelta64(1, "D")
    midx = pd.MultiIndex.from_tuples([(0,)], names=["a"])
    for expected, array in [
        (dates, dates.values),
        (pd.Index(x, dtype=object), x.astype(object)),
        (pd.Index(td), td),
        (pd.Index(td, dtype=object), td.astype(object)),
        (midx, PandasIndex(midx)),
    ]:
        actual = utils.safe_cast_to_index(array)
        assert_array_equal(expected, actual)
        assert expected.dtype == actual.dtype
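# A hedged sketch of the dtype round trips exercised above: pandas keeps
# timedelta64 data as a TimedeltaIndex, while object-dtype input stays object.
import numpy as np
import pandas as pd

td = np.arange(5) * np.timedelta64(1, "D")
print(pd.Index(td).dtype)  # timedelta64[ns]
print(pd.Index(td.astype(object)).dtype)  # object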
def test_join(self) -> None:
    index1 = PandasIndex(["a", "aa", "aaa"], "x", coord_dtype="<U3")
    index2 = PandasIndex(["aa", "aaa", "aaaa"], "x", coord_dtype="<U4")

    expected = PandasIndex(["aa", "aaa"], "x")
    actual = index1.join(index2)
    assert actual.equals(expected)
    assert actual.coord_dtype == "<U4"

    expected = PandasIndex(["a", "aa", "aaa", "aaaa"], "x")
    actual = index1.join(index2, how="outer")
    assert actual.equals(expected)
    assert actual.coord_dtype == "<U4"
def test_concat_dim_error(self) -> None:
    indexes = [PandasIndex([0, 1], "x"), PandasIndex([2, 3], "y")]

    with pytest.raises(ValueError, match=r"Cannot concatenate.*dimensions.*"):
        PandasIndex.concat(indexes, "x")
def map_blocks(
    func: Callable[..., T_DSorDA],
    obj: Union[DataArray, Dataset],
    args: Sequence[Any] = (),
    kwargs: Mapping[str, Any] = None,
    template: Union[DataArray, Dataset] = None,
) -> T_DSorDA:
    """Apply a function to each block of a DataArray or Dataset.

    .. warning::
        This function is experimental and its signature may change.

    Parameters
    ----------
    func : callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter ``obj``. The function will receive a subset or 'block' of ``obj``
        (see below), corresponding to one chunk along each chunked dimension. ``func``
        will be executed as ``func(subset_obj, *subset_args, **kwargs)``.

        This function must return either a single DataArray or a single Dataset.

        This function cannot add a new chunked dimension.
    obj : DataArray, Dataset
        Passed to the function as its first argument, one block at a time.
    args : sequence
        Passed to func after unpacking and subsetting any xarray objects by blocks.
        xarray objects in args must be aligned with obj, otherwise an error is raised.
    kwargs : mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        subset to blocks. Passing dask collections in kwargs is not allowed.
    template : DataArray or Dataset, optional
        xarray object representing the final result after compute is called. If not
        provided, the function will first be run on mocked-up data that looks like
        ``obj`` but has sizes 0, to determine properties of the returned object such
        as dtype, variable names, attributes, new dimensions and new indexes (if any).
        ``template`` must be provided if the function changes the size of existing
        dimensions. When provided, ``attrs`` on variables in `template` are copied
        over to the result. Any ``attrs`` set by ``func`` will be ignored.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of
    the function.

    Notes
    -----
    This function is designed for when ``func`` needs to manipulate a whole xarray
    object subset to each block. Each block is loaded into memory. In the more common
    case where ``func`` can work on numpy arrays, it is recommended to use
    ``apply_ufunc``.

    If none of the variables in ``obj`` is backed by dask arrays, calling this
    function is equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks
    xarray.DataArray.map_blocks

    Examples
    --------
    Calculate an anomaly from climatology using ``.groupby()``. Using
    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
    its indices, and its methods like ``.groupby()``.

    >>> def calculate_anomaly(da, groupby_type="time.month"):
    ...     gb = da.groupby(groupby_type)
    ...     clim = gb.mean(dim="time")
    ...     return gb - clim
    ...
    >>> time = xr.cftime_range("1990-01", "1992-01", freq="M")
    >>> month = xr.DataArray(time.month, coords={"time": time}, dims=["time"])
    >>> np.random.seed(123)
    >>> array = xr.DataArray(
    ...     np.random.rand(len(time)),
    ...     dims=["time"],
    ...     coords={"time": time, "month": month},
    ... ).chunk()
    >>> array.map_blocks(calculate_anomaly, template=array).compute()
    <xarray.DataArray (time: 24)>
    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 1 2 3 4 5 6 7 8 9 10 11 12

    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
    to the function being applied in ``xr.map_blocks()``:

    >>> array.map_blocks(
    ...     calculate_anomaly,
    ...     kwargs={"groupby_type": "time.year"},
    ...     template=array,
    ... )  # doctest: +ELLIPSIS
    <xarray.DataArray (time: 24)>
    dask.array<<this-array>-calculate_anomaly, shape=(24,), dtype=float64, chunksize=(24,), chunktype=numpy.ndarray>
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 dask.array<chunksize=(24,), meta=np.ndarray>
    """

    def _wrapper(
        func: Callable,
        args: List,
        kwargs: dict,
        arg_is_array: Iterable[bool],
        expected: dict,
    ):
        """
        Wrapper function that receives datasets in args; converts to dataarrays when
        necessary; passes these to the user function `func` and checks returned
        objects for expected shapes/sizes/etc.
        """
        converted_args = [
            dataset_to_dataarray(arg) if is_array else arg
            for is_array, arg in zip(arg_is_array, args)
        ]

        result = func(*converted_args, **kwargs)

        # check all dims are present
        missing_dimensions = set(expected["shapes"]) - set(result.sizes)
        if missing_dimensions:
            raise ValueError(
                f"Dimensions {missing_dimensions} missing on returned object."
            )

        # check that index lengths and values are as expected
        for name, index in result.xindexes.items():
            if name in expected["shapes"]:
                if len(index) != expected["shapes"][name]:
                    raise ValueError(
                        f"Received dimension {name!r} of length {len(index)}. "
                        f"Expected length {expected['shapes'][name]}."
                    )
            if name in expected["indexes"]:
                expected_index = expected["indexes"][name]
                if not index.equals(expected_index):
                    raise ValueError(
                        f"Expected index {name!r} to be {expected_index!r}. "
                        f"Received {index!r} instead."
                    )

        # check that all expected variables were returned
        check_result_variables(result, expected, "coords")
        if isinstance(result, Dataset):
            check_result_variables(result, expected, "data_vars")

        return make_dict(result)

    if template is not None and not isinstance(template, (DataArray, Dataset)):
        raise TypeError(
            f"template must be a DataArray or Dataset. Received {type(template).__name__} instead."
        )
    if not isinstance(args, Sequence):
        raise TypeError("args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in kwargs.values():
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in kwargs yet. Please compute or "
                "load values before passing to map_blocks."
            )

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    all_args = [obj] + list(args)
    is_xarray = [isinstance(arg, (Dataset, DataArray)) for arg in all_args]
    is_array = [isinstance(arg, DataArray) for arg in all_args]

    # there should be a better way to group this. partition?
    xarray_indices, xarray_objs = unzip(
        (index, arg) for index, arg in enumerate(all_args) if is_xarray[index]
    )
    others = [
        (index, arg) for index, arg in enumerate(all_args) if not is_xarray[index]
    ]

    # all xarray objects must be aligned. This is consistent with apply_ufunc.
    aligned = align(*xarray_objs, join="exact")
    xarray_objs = tuple(
        dataarray_to_dataset(arg) if is_da else arg
        for is_da, arg in zip(is_array, aligned)
    )

    _, npargs = unzip(
        sorted(list(zip(xarray_indices, xarray_objs)) + others, key=lambda x: x[0])
    )

    # check that chunk sizes are compatible
    input_chunks = dict(npargs[0].chunks)
    input_indexes = dict(npargs[0].xindexes)
    for arg in xarray_objs[1:]:
        assert_chunks_compatible(npargs[0], arg)
        input_chunks.update(arg.chunks)
        input_indexes.update(arg.xindexes)

    if template is None:
        # infer template by providing zero-shaped arrays
        template = infer_template(func, aligned[0], *args, **kwargs)
        template_indexes = set(template.xindexes)
        preserved_indexes = template_indexes & set(input_indexes)
        new_indexes = template_indexes - set(input_indexes)
        indexes = {dim: input_indexes[dim] for dim in preserved_indexes}
        indexes.update({k: template.xindexes[k] for k in new_indexes})
        output_chunks = {
            dim: input_chunks[dim] for dim in template.dims if dim in input_chunks
        }
    else:
        # template xarray object has been provided with proper sizes and chunk shapes
        indexes = dict(template.xindexes)
        if isinstance(template, DataArray):
            output_chunks = dict(
                zip(template.dims, template.chunks)  # type: ignore[arg-type]
            )
        else:
            output_chunks = dict(template.chunks)

    for dim in output_chunks:
        if dim in input_chunks and len(input_chunks[dim]) != len(output_chunks[dim]):
            raise ValueError(
                "map_blocks requires that one block of the input maps to one block of output. "
                f"Expected number of output chunks along dimension {dim!r} to be {len(input_chunks[dim])}. "
                f"Received {len(output_chunks[dim])} instead. Please provide template if not provided, or "
                "fix the provided template."
            )

    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}"
        )

    # We're building a new HighLevelGraph hlg. We'll have one new layer
    # for each variable in the dataset, which is the result of the
    # func applied to the values.

    graph: Dict[Any, Any] = {}
    new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict)
    gname = "{}-{}".format(
        dask.utils.funcname(func), dask.base.tokenize(npargs[0], args, kwargs)
    )

    # map dims to list of chunk indexes
    ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()}
    # mapping from chunk index to slice bounds
    input_chunk_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items()
    }
    output_chunk_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in output_chunks.items()
    }

    def subset_dataset_to_block(
        graph: dict, gname: str, dataset: Dataset, input_chunk_bounds, chunk_index
    ):
        """
        Creates a task that subsets an xarray dataset to a block determined by
        chunk_index. Block extents are determined by input_chunk_bounds. Also creates
        subtasks that subset the constituent variables of a dataset.
        """

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        chunk_tuple = tuple(chunk_index.values())
        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index[dim]]

                chunk_variable_task = (f"{name}-{gname}-{chunk[0]}",) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array possibly with dimensions chunked on other variables
                # index into variable appropriately
                subsetter = {
                    dim: _get_chunk_slicer(dim, chunk_index, input_chunk_bounds)
                    for dim in variable.dims
                }
                subset = variable.isel(subsetter)
                chunk_variable_task = (
                    f"{name}-{gname}-{dask.base.tokenize(subset)}",
                ) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs)

    # iterate over all possible chunk combinations
    for chunk_tuple in itertools.product(*ichunk.values()):
        # mapping from dimension name to chunk index
        chunk_index = dict(zip(ichunk.keys(), chunk_tuple))

        blocked_args = [
            subset_dataset_to_block(graph, gname, arg, input_chunk_bounds, chunk_index)
            if isxr
            else arg
            for isxr, arg in zip(is_xarray, npargs)
        ]

        # expected["shapes", "coords", "data_vars", "indexes"] are used to
        # raise nice error messages in _wrapper
        expected = {}
        # input chunk 0 along a dimension maps to output chunk 0 along the same dimension
        # even if length of dimension is changed by the applied function
        expected["shapes"] = {
            k: output_chunks[k][v] for k, v in chunk_index.items() if k in output_chunks
        }
        expected["data_vars"] = set(template.data_vars.keys())  # type: ignore[assignment]
        expected["coords"] = set(template.coords.keys())  # type: ignore[assignment]
        # TODO: benbovy - flexible indexes: clean this up
        # for now assumes pandas index (thus can be indexed) but it won't be the case for
        # all indexes
        expected_indexes = {}
        for dim in indexes:
            idx = indexes[dim].to_pandas_index()[
                _get_chunk_slicer(dim, chunk_index, output_chunk_bounds)
            ]
            expected_indexes[dim] = PandasIndex(idx)
        expected["indexes"] = expected_indexes

        from_wrapper = (gname,) + chunk_tuple
        graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected)

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{name}-{gname}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l,)
            for dim in variable.dims:
                if dim in chunk_index:
                    key += (chunk_index[dim],)
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    # output can have new dimensions with exactly one chunk
                    key += (0,)

            # We're adding multiple new layers to the graph:
            # The first new layer is the result of the computation on
            # the array.
            # Then we add one layer per variable, which extracts the
            # result for that variable, and depends on just the first new
            # layer.
            new_layers[gname_l][key] = (operator.getitem, from_wrapper, name)

    hlg = HighLevelGraph.from_collections(
        gname,
        graph,
        dependencies=[arg for arg in npargs if dask.is_dask_collection(arg)],
    )

    # This adds in the getitems for each variable in the dataset.
    hlg = HighLevelGraph(
        {**hlg.layers, **new_layers},
        dependencies={
            **hlg.dependencies,
            **{name: {gname} for name in new_layers.keys()},
        },
    )

    result = Dataset(coords=indexes, attrs=template.attrs)
    for index in result.xindexes:
        result[index].attrs = template[index].attrs
        result[index].encoding = template[index].encoding

    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in output_chunks:
                var_chunks.append(output_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]),))
            elif dim in template.dims:
                # new unindexed dimension
                var_chunks.append((template.sizes[dim],))

        data = dask.array.Array(
            hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype
        )
        result[name] = (dims, data, template[name].attrs)
        result[name].encoding = template[name].encoding

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore[return-value]
    return result  # type: ignore[return-value]
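# A minimal usage sketch for map_blocks (assuming dask is installed; this
# mirrors the docstring example rather than documenting new behaviour): the
# user function receives one fully loaded block per chunk of the input, so no
# template is needed when sizes are unchanged.
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(10), dims="x").chunk({"x": 5})
result = xr.map_blocks(lambda block: block + 1, da)  # func runs once per block
print(result.compute())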
def test_map_index_queries(self) -> None:
    def create_sel_results(
        x_indexer,
        x_index,
        other_vars,
        drop_coords,
        drop_indexes,
        rename_dims,
    ):
        dim_indexers = {"x": x_indexer}
        index_vars = x_index.create_variables()
        indexes = {k: x_index for k in index_vars}
        variables = {}
        variables.update(index_vars)
        variables.update(other_vars)

        return indexing.IndexSelResult(
            dim_indexers=dim_indexers,
            indexes=indexes,
            variables=variables,
            drop_coords=drop_coords,
            drop_indexes=drop_indexes,
            rename_dims=rename_dims,
        )

    def test_indexer(
        data: T_Xarray,
        x: Any,
        expected: indexing.IndexSelResult,
    ) -> None:
        results = indexing.map_index_queries(data, {"x": x})

        assert results.dim_indexers.keys() == expected.dim_indexers.keys()
        assert_array_equal(results.dim_indexers["x"], expected.dim_indexers["x"])

        assert results.indexes.keys() == expected.indexes.keys()
        for k in results.indexes:
            assert results.indexes[k].equals(expected.indexes[k])

        assert results.variables.keys() == expected.variables.keys()
        for k in results.variables:
            assert_array_equal(results.variables[k], expected.variables[k])

        assert set(results.drop_coords) == set(expected.drop_coords)
        assert set(results.drop_indexes) == set(expected.drop_indexes)
        assert results.rename_dims == expected.rename_dims

    data = Dataset({"x": ("x", [1, 2, 3])})
    mindex = pd.MultiIndex.from_product(
        [["a", "b"], [1, 2], [-1, -2]], names=("one", "two", "three")
    )
    mdata = DataArray(range(8), [("x", mindex)])

    test_indexer(data, 1, indexing.IndexSelResult({"x": 0}))
    test_indexer(data, np.int32(1), indexing.IndexSelResult({"x": 0}))
    test_indexer(data, Variable([], 1), indexing.IndexSelResult({"x": 0}))
    test_indexer(mdata, ("a", 1, -1), indexing.IndexSelResult({"x": 0}))

    expected = create_sel_results(
        [True, True, False, False, False, False, False, False],
        PandasIndex(pd.Index([-1, -2]), "three"),
        {"one": Variable((), "a"), "two": Variable((), 1)},
        ["x"],
        ["one", "two"],
        {"x": "three"},
    )
    test_indexer(mdata, ("a", 1), expected)

    expected = create_sel_results(
        slice(0, 4, None),
        PandasMultiIndex(
            pd.MultiIndex.from_product([[1, 2], [-1, -2]], names=("two", "three")),
            "x",
        ),
        {"one": Variable((), "a")},
        [],
        ["one"],
        {},
    )
    test_indexer(mdata, "a", expected)

    expected = create_sel_results(
        [True, True, True, True, False, False, False, False],
        PandasMultiIndex(
            pd.MultiIndex.from_product([[1, 2], [-1, -2]], names=("two", "three")),
            "x",
        ),
        {"one": Variable((), "a")},
        [],
        ["one"],
        {},
    )
    test_indexer(mdata, ("a",), expected)

    test_indexer(
        mdata, [("a", 1, -1), ("b", 2, -2)], indexing.IndexSelResult({"x": [0, 7]})
    )
    test_indexer(
        mdata, slice("a", "b"), indexing.IndexSelResult({"x": slice(0, 8, None)})
    )
    test_indexer(
        mdata,
        slice(("a", 1), ("b", 1)),
        indexing.IndexSelResult({"x": slice(0, 6, None)}),
    )
    test_indexer(
        mdata,
        {"one": "a", "two": 1, "three": -1},
        indexing.IndexSelResult({"x": 0}),
    )

    expected = create_sel_results(
        [True, True, False, False, False, False, False, False],
        PandasIndex(pd.Index([-1, -2]), "three"),
        {"one": Variable((), "a"), "two": Variable((), 1)},
        ["x"],
        ["one", "two"],
        {"x": "three"},
    )
    test_indexer(mdata, {"one": "a", "two": 1}, expected)

    expected = create_sel_results(
        [True, False, True, False, False, False, False, False],
        PandasIndex(pd.Index([1, 2]), "two"),
        {"one": Variable((), "a"), "three": Variable((), -1)},
        ["x"],
        ["one", "three"],
        {"x": "two"},
    )
    test_indexer(mdata, {"one": "a", "three": -1}, expected)

    expected = create_sel_results(
        [True, True, True, True, False, False, False, False],
        PandasMultiIndex(
            pd.MultiIndex.from_product([[1, 2], [-1, -2]], names=("two", "three")),
            "x",
        ),
        {"one": Variable((), "a")},
        [],
        ["one"],
        {},
    )
    test_indexer(mdata, {"one": "a"}, expected)
def test_concat_empty(self) -> None:
    idx = PandasIndex.concat([], "x")
    assert idx.coord_dtype is np.dtype("O")
def test_to_pandas_index(self) -> None:
    pd_idx = pd.Index([1, 2, 3], name="foo")
    index = PandasIndex(pd_idx, "x")
    assert index.to_pandas_index() is index.index
def test_equals(self) -> None:
    index1 = PandasIndex([1, 2, 3], "x")
    index2 = PandasIndex([1, 2, 3], "x")
    assert index1.equals(index2) is True