Beispiel #1
0
def median(array, dim, keep_attrs=False, skipna=False, **kwargs):
    """Compute the median of a dask-backed xarray object along one dimension.

    This function does not scale!
    It rechunks along the given dimension so that ``dim`` sits in a single
    chunk, so make sure your other chunk sizes are small enough that each
    block will fit into memory.

    Datasets are handled by applying this function to every data variable.
    Arrays that are not dask-backed are delegated to ``array.median``.

    :param array: An xarray.DataArray or xarray.Dataset wrapping one or more dask arrays
    :type array: xarray.DataArray or xarray.Dataset
    :param dim: The name of the dim in array to calculate the median
    :type dim: str
    :param keep_attrs: If True, ``array.attrs`` is attached to the result;
        otherwise the result carries no attributes
    :type keep_attrs: bool
    :param skipna: If True, reduce with ``numpy.nanmedian`` instead of
        ``numpy.median``
    :type skipna: bool
    :param kwargs: Extra keyword arguments handed on to the underlying
        reduction call (``Dataset.apply``, ``DataArray.median`` or
        ``dask.array.map_blocks``, depending on the input)
    :returns: The input reduced over ``dim``; coordinates that depend on
        ``dim`` are dropped
    :rtype: xarray.DataArray or xarray.Dataset
    """
    # isinstance (rather than an exact type check) so that Dataset
    # subclasses are also dispatched per-variable.
    if isinstance(array, xr.Dataset):
        return array.apply(median,
                           dim=dim,
                           keep_attrs=keep_attrs,
                           skipna=skipna,
                           **kwargs)

    # Anything without a dask graph can use xarray's own implementation.
    if not hasattr(array.data, 'dask'):
        return array.median(dim,
                            keep_attrs=keep_attrs,
                            skipna=skipna,
                            **kwargs)

    # The median is not decomposable across chunks, so the whole reduced
    # dimension has to live in one chunk before mapping over blocks.
    array = array.chunk({dim: -1})
    axis = array.dims.index(dim)
    median_func = np.nanmedian if skipna else np.median

    # np.median / np.nanmedian average the middle values, so integer and
    # boolean inputs come back as float64. Declare that to dask instead of
    # the input dtype, otherwise the lazy array advertises a wrong dtype.
    if array.dtype.kind in "iub":
        out_dtype = np.dtype(np.float64)
    else:
        out_dtype = array.dtype

    blocks = dask.array.map_blocks(median_func,
                                   array.data,
                                   dtype=out_dtype,
                                   drop_axis=axis,
                                   axis=axis,
                                   **kwargs)

    # Keep only coordinates that do not involve the reduced dimension.
    new_coords = {
        k: v
        for k, v in array.coords.items() if k != dim and dim not in v.dims
    }
    new_dims = tuple(d for d in array.dims if d != dim)
    new_attrs = array.attrs if keep_attrs else None

    return xr.DataArray(blocks,
                        coords=new_coords,
                        dims=new_dims,
                        attrs=new_attrs)
def test_xarray_reduce_multiple_groupers():
    """Count-reduce over two groupers at once.

    The same expected result is checked for three call styles: groupers
    passed as DataArrays, groupers passed by name (with ``fill_value=0``),
    and a chunked input built inside ``raise_if_dask_computes()``.
    """
    y_labels = np.array(
        ["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"])
    x_labels = np.array([1, 2, 2, 1])

    # 4 x 12 array of ones, labelled along both dims, then broadcast along
    # a new leading "z" dimension of length 4.
    grouping_coords = {
        "labels2": ("x", x_labels),
        "labels": ("y", y_labels),
    }
    da = xr.DataArray(np.ones((4, 12)),
                      dims=("x", "y"),
                      coords=grouping_coords)
    da = da.expand_dims(z=4)

    expected_coords = {
        "labels": ["a", "c", "b", "f"],
        "labels2": [1, 2],
    }
    expected = xr.DataArray(
        [[4, 4], [10, 10], [8, 8], [2, 2]],
        dims=("labels", "labels2"),
        coords=expected_coords,
    ).expand_dims(z=4)

    # Groupers given as DataArray objects.
    by_arrays = xarray_reduce(da, da.labels, da.labels2, func="count")
    xr.testing.assert_identical(expected, by_arrays)

    # Groupers given by coordinate name.
    by_names = xarray_reduce(da,
                             "labels",
                             "labels2",
                             func="count",
                             fill_value=0)
    xr.testing.assert_identical(expected, by_names)

    # Chunked input: the reduction is set up under raise_if_dask_computes().
    with raise_if_dask_computes():
        chunked = da.chunk({"x": 2, "z": 1})
        lazy_result = xarray_reduce(chunked,
                                    da.labels,
                                    da.labels2,
                                    func="count")
    xr.testing.assert_identical(expected, lazy_result)
Beispiel #3
0
def test_min_count_specific(dask, func, dim):
    """Check ``min_count`` for reduction ``func`` on a mostly-NaN array.

    ``min_count`` is first set to exactly the number of non-NaN values seen
    by the reduction (per row/column when ``dim`` is given, otherwise over
    the whole array), then to one more, which should turn every result
    into NaN.

    :param dask: whether to run against a chunked array
    :param func: name of the DataArray reduction method to call
    :param dim: dimension to reduce over, or a falsy value to reduce over
        everything
    """
    if dask and not has_dask:
        pytest.skip("requires dask")

    # 6 x 6 all-NaN array with four cells set to 2.
    da = DataArray(np.full((6, 6), np.nan, dtype=np.float64),
                   dims=("a", "b"))
    for row, col in ((0, 0), (0, 3), (3, 0), (3, 3)):
        da[row][col] = 2
    if dask:
        da = da.chunk({"a": 3, "b": 3})

    # min_count equals the number of non-NaNs in a row/column/the whole
    # array; `expected` is the matching reduction result.
    if not dim:
        min_count = 4
        expected = DataArray(8.0 if func == "sum" else 16.0)
    else:
        min_count = 2
        remaining_dim = "a" if dim == "b" else "b"
        expected = DataArray([4.0, np.nan, np.nan] * 2,
                             dims=(remaining_dim,))

    def reduce_with(count):
        # Build the reduction under raise_if_dask_computes().
        with raise_if_dask_computes():
            return getattr(da, func)(dim, skipna=True, min_count=count)

    # Exactly enough valid values.
    actual = reduce_with(min_count)
    assert_dask_array(actual, dask)
    assert_allclose(actual, expected)

    # One more than available: should get all NaN.
    expected *= np.nan
    actual = reduce_with(min_count + 1)
    assert_dask_array(actual, dask)
    assert_allclose(actual, expected)