def dask_hist2d(x: da.Array, y: da.Array, bins: int, range, density=False):
    """Blocked 2D histogram over a pair of equally-shaped dask arrays.

    One fast per-chunk histogram is computed via ``_block_fast_hist2d`` for
    each (x, y) chunk pair, the results are stacked along a leading axis,
    and that axis is summed to produce the final ``(bins, bins)`` counts.

    Parameters
    ----------
    x, y : da.Array
        Coordinate arrays; must have identical shapes (and are assumed to
        share chunking so chunk pairs line up — TODO confirm at call sites).
    bins : int
        Number of bins along each of the two dimensions.
    range :
        Bin-range specification forwarded verbatim to ``_block_fast_hist2d``.
    density : bool, optional
        Participates only in the task-name token here; the returned array
        holds the summed per-chunk results.

    Returns
    -------
    da.Array
        A ``(bins, bins)`` dask array of summed per-chunk histograms.
    """
    if x.shape != y.shape:
        raise ValueError(
            f"Mismatch in argument shapes: x.shape == {x.shape}; y.shape == {y.shape}"
        )

    token = tokenize(x, y, bins, range, density)
    name = "histogram2d-sum-" + token

    x_keys = flatten(x.__dask_keys__())
    y_keys = flatten(y.__dask_keys__())

    # One task per chunk pair; each task yields one (bins, bins) block.
    dsk = {
        (name, i, 0, 0): (_block_fast_hist2d, xi, yi, bins, range)
        for i, (xi, yi) in enumerate(zip(x_keys, y_keys))
    }
    # Match numpy's histogram2d output dtype.
    dtype = np.histogram2d([], [])[0].dtype

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=(x, y))

    # Turn graph into a 3D array of shape (nchunks, nbins, nbins).
    nchunks = len(list(flatten(x.__dask_keys__())))
    chunks = ((1,) * nchunks, (bins,), (bins,))
    mapped = Array(graph, name, chunks, dtype=dtype)
    # Collapse the chunk axis to obtain the final histogram.
    n = mapped.sum(axis=0)
    return n
# Example #2
    def get_variables(self):
        """Return an ordered mapping of raster bands (plus lat/lon) as Variables."""
        def read_band(idx):
            # Pull one band into memory, replacing the nodata value with NaN
            # when the array dtype can represent it.
            rb = ds.GetRasterBand(idx)
            data = rb.ReadAsArray()
            fill = rb.GetNoDataValue()
            if fill is not None:
                try:
                    data[data == fill] = data.dtype.type(nan)
                except ValueError:
                    # Integer dtypes cannot hold NaN; leave values untouched.
                    pass
            return data

        ds = self.ds
        dims = ['lat', 'lon']
        shape = (ds.RasterYSize, ds.RasterXSize)
        chunks = ((ds.RasterYSize, ), (ds.RasterXSize, ))
        variables = OrderedDict()
        for iband in range(1, ds.RasterCount + 1):
            band = ds.GetRasterBand(iband)
            dt = dtype(gdal_array.codes[band.DataType])
            if with_dask:
                # Lazy single-chunk array: the band is only read on compute.
                arr = Array({('x', 0, 0): (read_band, iband)}, 'x', chunks,
                            shape=shape, dtype=dt)
            else:
                arr = read_band(iband)
            attrs = band.GetMetadata_Dict()
            try:
                # Float dtypes: advertise NaN as the fill value.
                dt.type(nan)
                attrs['_FillValue'] = nan
            except ValueError:
                # Integer dtypes: fall back to GDAL's nodata value, if any.
                no_data = band.GetNoDataValue()
                attrs.update({'_FillValue': no_data} if no_data else {})
            variables['Band%i' % iband] = Variable(dims, arr, attrs)
        variables['lat'], variables['lon'] = self._load_GeoTransform()
        return FrozenOrderedDict(variables)
def downscale_dask(
    array: Any,
    reduction: Callable[[NDArray[Any], Tuple[int, ...]], NDArray[Any]],
    scale_factors: Union[int, Sequence[int], Dict[int, int]],
    **kwargs: Any,
) -> Any:
    """Downscale a dask array chunk-wise with ``reduction``.

    Each chunk is reduced independently, so the chunking is first aligned to
    the coarsening factors and the array shape must divide evenly by them.

    NOTE(review): despite the annotation, a dict-valued ``scale_factors``
    looks incompatible with the ``np.array``/``np.empty`` calls below —
    confirm the intended argument forms.
    """
    shape_arr = np.array(array.shape)
    factors_arr = np.array(scale_factors)
    if np.any(shape_arr % factors_arr):
        raise ValueError(
            f"Coarsening factors {scale_factors} do not align with array shape {array.shape}."
        )

    array = align_chunks(array, scale_factors)
    name = "downscale-" + tokenize(reduction, array, scale_factors)

    # One task per chunk: apply the reduction with the coarsening factors.
    dsk = {}
    for key in flatten(array.__dask_keys__()):
        dsk[(name,) + key[1:]] = (apply, reduction, [key, scale_factors], kwargs)

    # Every chunk shrinks by its axis' factor.
    new_chunks = []
    for axis, sizes in enumerate(array.chunks):
        new_chunks.append(tuple(int(size // scale_factors[axis]) for size in sizes))
    chunks = tuple(new_chunks)

    # Run the reduction on a minimal sample to infer the output meta.
    meta = reduction(
        np.empty(scale_factors, dtype=array.dtype), scale_factors, **kwargs
    )
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[array])
    return Array(graph, name, chunks, meta=meta)
# Example #4
    def _query_resample_kdtree(self,
                               resample_kdtree,
                               target_lons,
                               target_lats,
                               valid_output_index,
                               reduce_data=True):
        """Query kd-tree on slice of target coordinates.

        Builds one dask task per 2D chunk of ``valid_output_index``; each
        task queries ``resample_kdtree`` for the valid points inside that
        chunk and returns index/distance planes stacked on a trailing axis.

        Returns
        -------
        tuple
            ``(index_array, distance_array)`` dask arrays: per-pixel
            neighbour indices (cast to unsigned int) and distances.
        """
        from dask.base import tokenize
        from dask.array import Array

        def query(target_lons, target_lats, valid_output_index, c_slice):
            # Mask of valid output pixels within this chunk.
            voi = valid_output_index[c_slice].compute()
            shape = voi.shape
            voir = voi.ravel()
            target_lons_valid = target_lons[c_slice].ravel()[voir]
            target_lats_valid = target_lats[c_slice].ravel()[voir]

            coords = self.transform_lonlats(target_lons_valid,
                                            target_lats_valid)
            distance_array, index_array = np.stack(
                resample_kdtree.query(
                    coords.compute(),
                    k=self.neighbours,
                    eps=self.epsilon,
                    distance_upper_bound=self.radius_of_influence))

            # Scatter results back to full chunk shape; invalid pixels stay NaN.
            # (`np.float` was removed from NumPy; it was an alias of builtin
            # float, i.e. float64, so np.float64 is the faithful replacement.)
            res_ia = np.full(shape, fill_value=np.nan, dtype=np.float64)
            res_da = np.full(shape, fill_value=np.nan, dtype=np.float64)
            res_ia[voi] = index_array
            res_da[voi] = distance_array
            return np.stack([res_ia, res_da], axis=-1)

        # NOTE(review): tokenize(1000) is constant, so the graph name is not
        # unique per input — confirm this is intentional.
        token = tokenize(1000)
        name = 'query-' + token

        # One task per 2D chunk of the valid-output mask.
        dsk = {}
        vstart = 0

        for i, vck in enumerate(valid_output_index.chunks[0]):
            hstart = 0
            for j, hck in enumerate(valid_output_index.chunks[1]):
                c_slice = (slice(vstart,
                                 vstart + vck), slice(hstart, hstart + hck))
                dsk[(name, i, j, 0)] = (query, target_lons, target_lats,
                                        valid_output_index, c_slice)
                hstart += hck
            vstart += vck

        # Trailing axis of size 2 holds (index, distance) planes.
        res = Array(dsk,
                    name,
                    shape=list(valid_output_index.shape) + [2],
                    chunks=list(valid_output_index.chunks) + [2],
                    dtype=target_lons.dtype)

        index_array = res[:, :, 0].astype(np.uint)
        distance_array = res[:, :, 1]
        return index_array, distance_array
# Example #5
def test_ragged_blockdims():
    """Ragged (non-uniform) chunk sizes survive reductions and elementwise ops."""
    blocks = {
        ('x', 0, 0): np.ones((2, 2)),
        ('x', 0, 1): np.ones((2, 3)),
        ('x', 1, 0): np.ones((5, 2)),
        ('x', 1, 1): np.ones((5, 3)),
    }

    a = Array(blocks, 'x', chunks=[(2, 5), (2, 3)], shape=(7, 5))
    s = symbol('s', '7 * 5 * int')

    # Reducing one axis keeps the other axis' ragged chunking.
    assert compute(s.sum(axis=0), a).chunks == ((2, 3),)
    assert compute(s.sum(axis=1), a).chunks == ((2, 5),)

    # Elementwise ops preserve the chunk structure wholesale.
    assert compute(s + 1, a).chunks == a.chunks
# Example #6
def _dask_imread(files, imread=None, preprocess=None, coerce_shape=None):
    """ Read a stack of images into a dask array """
    from dask.array import Array
    from dask.base import tokenize
    from functools import partial

    if not imread:
        from skimage.io import imread

    def _imread(open_file):
        # Materialize one image from an open-file wrapper.
        with open_file as f:
            return imread(f)

    def add_leading_dimension(x):
        return x[None, ...]

    filenames = [f.path for f in files]

    name = 'imread-%s' % tokenize(filenames)

    if coerce_shape is not None:
        reshape = partial(_coerce_shape, shape=coerce_shape)

    # Read the first image eagerly to learn the per-image shape and dtype.
    with files[0] as f:
        sample = imread(f)
    if coerce_shape is not None:
        sample = reshape(sample)
    if preprocess:
        sample = preprocess(sample)

    keys = [(name, i) + (0, ) * len(sample.shape) for i in range(len(files))]

    def task_for(f):
        # Per-file pipeline: read -> (reshape) -> (preprocess) -> add a
        # length-1 leading axis so images stack along axis 0.
        t = (_imread, f)
        if coerce_shape is not None:
            t = (reshape, t)
        if preprocess:
            t = (preprocess, t)
        return (add_leading_dimension, t)

    dsk = {key: task_for(f) for key, f in zip(keys, files)}

    # One image per leading chunk; trailing axes are single chunks.
    chunks = ((1, ) * len(files), ) + tuple((d, ) for d in sample.shape)

    return Array(dsk, name, chunks, sample.dtype)
# Example #7
    def _load_GeoTransform(self):
        """Calculate latitude and longitude variable calculated from the
        gdal.Open.GetGeoTransform method"""
        def load_lon():
            # origin + column index * pixel width
            return arange(ds.RasterXSize) * b[1] + b[0]

        def load_lat():
            # origin + row index * pixel height
            return arange(ds.RasterYSize) * b[5] + b[3]

        ds = self.ds
        b = self.ds.GetGeoTransform()  # bbox, interval
        if not with_dask:
            lat = load_lat()
            lon = load_lon()
        else:
            # Single-chunk lazy arrays; coordinates are built on compute.
            nlat = self.ds.RasterYSize
            nlon = self.ds.RasterXSize
            lat = Array({('lat', 0): (load_lat, )}, 'lat', (nlat, ),
                        shape=(nlat, ), dtype=float)
            lon = Array({('lon', 0): (load_lon, )}, 'lon', (nlon, ),
                        shape=(nlon, ), dtype=float)
        return Variable(('lat', ), lat), Variable(('lon', ), lon)
def dask_hist1d(
    a: Array, bins=None, range=None, normed=False, weights=None, density=None
):
    """
    Blocked variant of :func:`numpy.histogram`, but using the fast-histogram module.

    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the ``bins`` or the number of ``bins``
        and a ``range`` argument is required as computing ``min`` and ``max``
        over blocked arrays is an expensive operation that must be performed
        explicitly.
        If `bins` is an int, it defines the number of equal-width
        bins in the given range (10, by default). If `bins` is a
        sequence, it defines a monotonically increasing array of bin edges,
        including the rightmost edge, allowing for non-uniform bin widths.
    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, range
        is simply ``(a.min(), a.max())``.  Values outside the range are
        ignored. The first element of the range must be less than or
        equal to the second. `range` affects the automatic bin
        computation as well. While bin width is computed to be optimal
        based on the actual data within `range`, the bin count will fill
        the entire range including portions containing no data.
    normed : bool, optional
        This is equivalent to the ``density`` argument, but produces incorrect
        results for unequal bin widths. It should not be used.
    weights : array_like, optional
        A dask.array.Array of weights, of the same block structure as ``a``.  Each value in
        ``a`` only contributes its associated weight towards the bin count
        (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unity
        width are chosen; it is not a probability *mass* function.
        Overrides the ``normed`` keyword if given.
        If ``density`` is True, ``bins`` cannot be a single-number delayed
        value. It must be a concrete number, or a (possibly-delayed)
        array/sequence of the bin edges.
    Returns
    -------
    hist : dask Array
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : dask Array of dtype float
        Return the bin edges ``(length(hist)+1)``.


    Examples
    --------
    Using number of bins and range:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.arange(10000), chunks=10)
    >>> h, bins = da.histogram(x, bins=10, range=[0, 10000])
    >>> bins
    array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
            8000.,  9000., 10000.])
    >>> h.compute()
    array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

    Explicitly specifying the bins:

    >>> h, bins = da.histogram(x, bins=np.array([0, 5000, 10000]))
    >>> bins
    array([    0,  5000, 10000])
    >>> h.compute()
    array([5000, 5000])
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
        # ^ `np.ndim` is not implemented by Dask array.
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    # Reject dask collections other than Array/Delayed for each argument.
    # (Fixed: the loop previously tested `bins` for every argument instead
    # of the loop variable `val`, so bad `range`/`weights` slipped through.)
    for argname, val in [("bins", bins), ("range", range), ("weights", weights)]:
        if not isinstance(val, (Array, Delayed)) and is_dask_collection(val):
            raise TypeError(
                "Dask types besides Array and Delayed are not supported "
                "for `histogram`. For argument `{}`, got: {!r}".format(argname, val)
            )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(
                    f"range must be a sequence or array of length 2, but got {len(range)} items"
                )
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(
                f"Expected a sequence or array for range, not {range}"
            ) from None

    token = tokenize(a, bins, range, weights, density)
    name = "histogram-sum-" + token

    if scalar_bins:
        bins = _linspace_from_delayed(range[0], range[1], bins + 1)
        # ^ NOTE `range[1]` is safe because of the above check, and the initial check
        # that range must not be None if `scalar_bins`
    else:
        if not isinstance(bins, (Array, np.ndarray)):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(
                f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}"
            )

    (bins_ref, range_ref), deps = unpack_collections([bins, range])

    # Map the histogram to all bins, forming a 2D array of histograms, stacked for each chunk
    if weights is None:
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref)
            for i, k in enumerate(flatten(a.__dask_keys__()))
        }
        dtype = np.histogram([])[0].dtype
    else:
        a_keys = flatten(a.__dask_keys__())
        w_keys = flatten(weights.__dask_keys__())
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref, w)
            for i, (k, w) in enumerate(zip(a_keys, w_keys))
        }
        dtype = weights.dtype

    deps = (a,) + deps
    if weights is not None:
        deps += (weights,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    # Turn graph into a 2D Array of shape (nchunks, nbins)
    nchunks = len(list(flatten(a.__dask_keys__())))
    nbins = bins.size - 1  # since `bins` is 1D
    chunks = ((1,) * nchunks, (nbins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # We need to replicate normed and density options from numpy
    if density is not None:
        if density:
            db = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            return n / db / n.sum(), bins
        else:
            return n, bins
    else:
        return n, bins
# Example #9
def array(self, futures, shape=None, chunks=None, dtype=None):
    """
    Turns a set of future arrays (result of a distributed operation),
    associated to a cartesian communicator (``self``), into a Dask Array.

    Parameters
    ----------
    futures: tuple(futures)
        A set of future arrays associated to the communicator; one future
        per communicator element is required.
    shape: tuple(int), optional
        The shape of the array. Derived from ``chunks`` and the
        communicator dimensions when omitted.
    chunks: tuple(int), optional
        The chunks of the array. Derived from the futures when omitted.
    dtype: dtype, optional
        The dtype of the array. Derived from the futures when omitted.

    Raises
    ------
    ValueError
        If ``futures`` and the communicator differ in length.
    TypeError
        If the futures do not share a single dtype.
    NotImplementedError
        If the futures have non-uniform chunk shapes.
    """
    if not len(futures) == len(self):
        raise ValueError("futures and cart must have the same length")

    if chunks is None or dtype is None:
        # Inspect the remote arrays once to recover dtype/chunk metadata.
        infos = self.client.map(lambda arr: (arr.dtype, arr.shape), futures)
        infos = tuple(_.result() for _ in infos)

        if dtype is None:
            dtype = infos[0][0]
        if not all((dtype == dtp for (dtp, _) in infos)):
            raise TypeError(
                f"Futures have different dtypes {[info[0] for info in infos]}")

        if chunks is None:
            chunks = infos[0][1]
            if not all((chunks == chn for (_, chn) in infos)):
                # TODO: normalize chunks using shape
                raise NotImplementedError(
                    "Futures with non-uniform chunks not supported yet")

    if shape is None:
        # Total shape = per-chunk shape scaled by the cart dimensions.
        shape = list(chunks)
        for _i, _l in self.normalize_dims():
            shape[_i] *= _l

    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    self.check_dims(tuple(len(chunk) for chunk in chunks))

    # All graph keys share one task-name prefix taken from the first future.
    # (Hoisted out of the loop below: it is loop-invariant.)
    name = next(iter(futures)).key
    if isinstance(name, tuple):
        name = name[0]
    assert isinstance(name, str)

    dask = {}
    idxs, _ = zip(*self.normalize_dims())
    for coords, future in zip(self.normalize_coords(), futures):
        # Place each future at its cartesian coordinates; non-cart axes at 0.
        key = [0] * len(shape)
        for _i, _c in zip(idxs, coords):
            key[_i] = _c
        dask[(name, ) + tuple(key)] = future

    return Array(dask,
                 next(iter(dask.keys()))[0],
                 chunks,
                 dtype=dtype,
                 shape=shape)