import dask.array as da
import numpy as np
from dask.array.core import Array
from dask.base import tokenize
from dask.core import flatten
from dask.highlevelgraph import HighLevelGraph


def dask_hist2d(x: da.Array, y: da.Array, bins: int, range, density=False):
    if x.shape != y.shape:
        raise ValueError(
            f"Mismatch in argument shapes: x.shape == {x.shape}; y.shape == {y.shape}"
        )
    token = tokenize(x, y, bins, range, density)
    name = "histogram2d-sum-" + token
    x_keys = flatten(x.__dask_keys__())
    y_keys = flatten(y.__dask_keys__())
    # One task per pair of corresponding blocks; _block_fast_hist2d (defined
    # elsewhere) histograms a single block, returning a (1, bins, bins) array.
    dsk = {
        (name, i, 0, 0): (_block_fast_hist2d, xi, yi, bins, range)
        for i, (xi, yi) in enumerate(zip(x_keys, y_keys))
    }
    dtype = np.histogram2d([], [])[0].dtype
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=(x, y))
    # Turn the graph into a 3D array of shape (nchunks, nbins, nbins).
    nchunks = len(list(flatten(x.__dask_keys__())))
    chunks = ((1,) * nchunks, (bins,), (bins,))
    mapped = Array(graph, name, chunks, dtype=dtype)
    # Sum over the chunk axis to get the final 2D histogram.
    n = mapped.sum(axis=0)
    return n
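# --- Usage sketch for dask_hist2d (not part of the original source). The
# per-block helper `_block_fast_hist2d` is assumed to wrap
# fast_histogram.histogram2d; the version below is an illustrative guess.
import dask.array as da
from fast_histogram import histogram2d as fast_histogram2d


def _block_fast_hist2d(x, y, bins, range):
    # Histogram one block, adding a leading axis so that per-chunk
    # histograms stack along axis 0 of the mapped array.
    return fast_histogram2d(x.ravel(), y.ravel(), bins=bins, range=range)[None, ...]


x = da.random.normal(size=1_000_000, chunks=250_000)
y = da.random.normal(size=1_000_000, chunks=250_000)
h = dask_hist2d(x, y, bins=64, range=((-4.0, 4.0), (-4.0, 4.0)))
print(h.compute().sum())  # <= 1_000_000; out-of-range samples are dropped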
def get_variables(self):
    def load(band):
        band = ds.GetRasterBand(band)
        a = band.ReadAsArray()
        no_data = band.GetNoDataValue()
        if no_data is not None:
            try:
                a[a == no_data] = a.dtype.type(nan)
            except ValueError:
                # integer dtypes cannot hold NaN; keep the raw fill value
                pass
        return a

    ds = self.ds
    dims = ['lat', 'lon']
    chunks = ((ds.RasterYSize, ), (ds.RasterXSize, ))
    shape = (ds.RasterYSize, ds.RasterXSize)
    variables = OrderedDict()
    for iband in range(1, ds.RasterCount + 1):
        band = ds.GetRasterBand(iband)
        dt = dtype(gdal_array.codes[band.DataType])
        if with_dask:
            # Use a unique graph name per band so keys cannot collide when
            # several bands end up in the same graph.
            name = 'gdal-band-%i' % iband
            dsk = {(name, 0, 0): (load, iband)}
            arr = Array(dsk, name, chunks, shape=shape, dtype=dt)
        else:
            arr = load(iband)
        attrs = band.GetMetadata_Dict()
        try:
            dt.type(nan)
            attrs['_FillValue'] = nan
        except ValueError:
            no_data = band.GetNoDataValue()
            attrs.update({'_FillValue': no_data} if no_data else {})
        variables['Band%i' % iband] = Variable(dims, arr, attrs)
    variables['lat'], variables['lon'] = self._load_GeoTransform()
    return FrozenOrderedDict(variables)
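# --- Illustration (not from the original source): the single-chunk graph
# pattern used above, in isolation. A one-entry graph maps the only block
# key to a loader task, deferring I/O until compute.
import numpy as np
from dask.array.core import Array


def load_demo():
    return np.arange(6.0).reshape(2, 3)


demo = Array({('demo', 0, 0): (load_demo,)}, 'demo',
             chunks=((2,), (3,)), shape=(2, 3), dtype=float)
assert (demo.compute() == load_demo()).all()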
from typing import Any, Callable, Dict, Sequence, Tuple, Union

import numpy as np
from dask.array.core import Array
from dask.base import tokenize
from dask.core import flatten
from dask.highlevelgraph import HighLevelGraph
from dask.utils import apply
from numpy.typing import NDArray


def downscale_dask(
    array: Any,
    reduction: Callable[[NDArray[Any], Tuple[int, ...]], NDArray[Any]],
    scale_factors: Union[int, Sequence[int], Dict[int, int]],
    **kwargs: Any,
) -> Any:
    if not np.all((np.array(array.shape) % np.array(scale_factors)) == 0):
        raise ValueError(
            f"Coarsening factors {scale_factors} do not align with array shape {array.shape}."
        )
    # align_chunks (defined elsewhere) rechunks so that every chunk size is
    # divisible by the corresponding scale factor.
    array = align_chunks(array, scale_factors)
    name = "downscale-" + tokenize(reduction, array, scale_factors)
    # One task per block: reduction(block, scale_factors, **kwargs).
    dsk = {
        (name,) + key[1:]: (apply, reduction, [key, scale_factors], kwargs)
        for key in flatten(array.__dask_keys__())
    }
    chunks = tuple(
        tuple(int(size // scale_factors[axis]) for size in sizes)
        for axis, sizes in enumerate(array.chunks)
    )
    # Infer the output meta by reducing a minimal, single-window empty array.
    meta = reduction(
        np.empty(scale_factors, dtype=array.dtype), scale_factors, **kwargs
    )
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[array])
    return Array(graph, name, chunks, meta=meta)
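# --- Usage sketch for downscale_dask (not from the original source). It
# assumes `scale_factors` is given as one factor per axis and that
# `align_chunks` is available in scope. `windowed_mean` is an illustrative
# reduction with the (block, window) signature the function expects.
import dask.array as da


def windowed_mean(a: np.ndarray, window: tuple, **kwargs) -> np.ndarray:
    # Fold each axis into (n_windows, window_size) pairs, then average over
    # every window axis.
    folded = []
    for size, w in zip(a.shape, window):
        folded.extend((size // w, w))
    window_axes = tuple(range(1, 2 * a.ndim, 2))
    return a.reshape(folded).mean(axis=window_axes, **kwargs)


arr = da.from_array(np.arange(16.0).reshape(4, 4), chunks=2)
small = downscale_dask(arr, windowed_mean, (2, 2))
print(small.compute())  # 2x2 array of 2x2-window means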
def _query_resample_kdtree(self,
                           resample_kdtree,
                           target_lons,
                           target_lats,
                           valid_output_index,
                           reduce_data=True):
    """Query kd-tree on slice of target coordinates."""
    from dask.array import Array
    from dask.base import tokenize

    def query(target_lons, target_lats, valid_output_index, c_slice):
        voi = valid_output_index[c_slice].compute()
        shape = voi.shape
        voir = voi.ravel()
        target_lons_valid = target_lons[c_slice].ravel()[voir]
        target_lats_valid = target_lats[c_slice].ravel()[voir]
        coords = self.transform_lonlats(target_lons_valid, target_lats_valid)
        distance_array, index_array = np.stack(
            resample_kdtree.query(
                coords.compute(),
                k=self.neighbours,
                eps=self.epsilon,
                distance_upper_bound=self.radius_of_influence))
        # np.float was removed in NumPy 1.24; use explicit float64 instead.
        res_ia = np.full(shape, fill_value=np.nan, dtype=np.float64)
        res_da = np.full(shape, fill_value=np.nan, dtype=np.float64)
        res_ia[voi] = index_array
        res_da[voi] = distance_array
        # Stack index and distance planes along a new last axis.
        return np.stack([res_ia, res_da], axis=-1)

    # Tokenize the actual inputs so different queries get distinct names.
    token = tokenize(target_lons, target_lats, valid_output_index)
    name = 'query-' + token
    dsk = {}
    vstart = 0
    for i, vck in enumerate(valid_output_index.chunks[0]):
        hstart = 0
        for j, hck in enumerate(valid_output_index.chunks[1]):
            c_slice = (slice(vstart, vstart + vck),
                       slice(hstart, hstart + hck))
            dsk[(name, i, j, 0)] = (query, target_lons, target_lats,
                                    valid_output_index, c_slice)
            hstart += hck
        vstart += vck

    res = Array(dsk, name,
                shape=list(valid_output_index.shape) + [2],
                chunks=list(valid_output_index.chunks) + [2],
                dtype=target_lons.dtype)
    # np.uint was removed in NumPy 2.0; use an explicit width.
    index_array = res[:, :, 0].astype(np.uint64)
    distance_array = res[:, :, 1]
    return index_array, distance_array
import numpy as np
from blaze import compute, symbol
from dask.array.core import Array


def test_ragged_blockdims():
    dsk = {('x', 0, 0): np.ones((2, 2)),
           ('x', 0, 1): np.ones((2, 3)),
           ('x', 1, 0): np.ones((5, 2)),
           ('x', 1, 1): np.ones((5, 3))}

    # dtype is required by modern dask; the blocks are float64 ones.
    a = Array(dsk, 'x', chunks=[(2, 5), (2, 3)], shape=(7, 5), dtype=float)
    s = symbol('s', '7 * 5 * int')

    assert compute(s.sum(axis=0), a).chunks == ((2, 3),)
    assert compute(s.sum(axis=1), a).chunks == ((2, 5),)
    assert compute(s + 1, a).chunks == a.chunks
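# --- Illustration (not from the original source): the same ragged-chunk
# construction with plain dask, no blaze, showing that reductions collapse
# the reduced axis's chunks exactly as the test above asserts.
import numpy as np
from dask.array.core import Array

dsk = {('x', 0, 0): np.ones((2, 2)), ('x', 0, 1): np.ones((2, 3)),
       ('x', 1, 0): np.ones((5, 2)), ('x', 1, 1): np.ones((5, 3))}
a = Array(dsk, 'x', chunks=((2, 5), (2, 3)), shape=(7, 5), dtype=float)
assert a.sum(axis=0).chunks == ((2, 3),)
assert a.sum(axis=1).chunks == ((2, 5),)
assert a.sum().compute() == 35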
def _dask_imread(files, imread=None, preprocess=None, coerce_shape=None):
    """Read a stack of images into a dask array"""
    from dask.array import Array
    from dask.base import tokenize
    from functools import partial

    if not imread:
        from skimage.io import imread

    def _imread(open_file):
        with open_file as f:
            return imread(f)

    def add_leading_dimension(x):
        return x[None, ...]

    filenames = [f.path for f in files]
    name = 'imread-%s' % tokenize(filenames)

    if coerce_shape is not None:
        # _coerce_shape (defined elsewhere) pads/crops images to a fixed shape
        reshape = partial(_coerce_shape, shape=coerce_shape)

    # Read one sample image eagerly to learn the shape and dtype.
    with files[0] as f:
        sample = imread(f)
    if coerce_shape is not None:
        sample = reshape(sample)
    if preprocess:
        sample = preprocess(sample)

    keys = [(name, i) + (0, ) * len(sample.shape) for i in range(len(files))]

    if coerce_shape is not None:
        if preprocess:
            values = [(add_leading_dimension,
                       (preprocess, (reshape, (_imread, f))))
                      for f in files]
        else:
            values = [(add_leading_dimension, (reshape, (_imread, f)))
                      for f in files]
    elif preprocess:
        values = [(add_leading_dimension, (preprocess, (_imread, f)))
                  for f in files]
    else:
        values = [(add_leading_dimension, (_imread, f)) for f in files]
    dsk = dict(zip(keys, values))

    chunks = ((1, ) * len(files), ) + tuple((d, ) for d in sample.shape)

    return Array(dsk, name, chunks, sample.dtype)
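# --- Usage sketch (not from the original source): `files` is assumed to be
# a list of fsspec OpenFile objects, which matches how the function uses
# them (context-manager protocol plus a `.path` attribute). The glob path
# is illustrative.
import fsspec

files = fsspec.open_files('data/frames/*.png')
stack = _dask_imread(files)          # shape: (len(files),) + sample.shape
first_frame = stack[0].compute()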
def _load_GeoTransform(self):
    """Calculate latitude and longitude variables from the
    gdal.Open.GetGeoTransform method"""
    def load_lon():
        return arange(ds.RasterXSize) * b[1] + b[0]

    def load_lat():
        return arange(ds.RasterYSize) * b[5] + b[3]

    ds = self.ds
    b = self.ds.GetGeoTransform()  # bbox, interval
    if with_dask:
        lat = Array({('lat', 0): (load_lat, )}, 'lat',
                    (self.ds.RasterYSize, ),
                    shape=(self.ds.RasterYSize, ), dtype=float)
        lon = Array({('lon', 0): (load_lon, )}, 'lon',
                    (self.ds.RasterXSize, ),
                    shape=(self.ds.RasterXSize, ), dtype=float)
    else:
        lat = load_lat()
        lon = load_lon()
    return Variable(('lat', ), lat), Variable(('lon', ), lon)
def dask_hist1d(
    a: Array, bins=None, range=None, normed=False, weights=None, density=None
):
    """
    Blocked variant of :func:`numpy.histogram`, but using the fast-histogram
    module.

    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the ``bins`` or the number of ``bins``
        and a ``range`` argument is required, as computing ``min`` and
        ``max`` over blocked arrays is an expensive operation that must be
        performed explicitly.
        If `bins` is an int, it defines the number of equal-width bins in
        the given range (10, by default). If `bins` is a sequence, it
        defines a monotonically increasing array of bin edges, including
        the rightmost edge, allowing for non-uniform bin widths.
    range : (float, float), optional
        The lower and upper range of the bins. If not provided, range is
        simply ``(a.min(), a.max())``. Values outside the range are
        ignored. The first element of the range must be less than or equal
        to the second. `range` affects the automatic bin computation as
        well. While bin width is computed to be optimal based on the
        actual data within `range`, the bin count will fill the entire
        range including portions containing no data.
    normed : bool, optional
        This is equivalent to the ``density`` argument, but produces
        incorrect results for unequal bin widths. It should not be used.
    weights : array_like, optional
        A dask.array.Array of weights, of the same block structure as
        ``a``. Each value in ``a`` only contributes its associated weight
        towards the bin count (instead of 1). If ``density`` is True, the
        weights are normalized, so that the integral of the density over
        the range remains 1.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the probability
        *density* function at the bin, normalized such that the *integral*
        over the range is 1. Note that the sum of the histogram values
        will not be equal to 1 unless bins of unity width are chosen; it
        is not a probability *mass* function.
        Overrides the ``normed`` keyword if given.
        If ``density`` is True, ``bins`` cannot be a single-number delayed
        value. It must be a concrete number, or a (possibly-delayed)
        array/sequence of the bin edges.

    Returns
    -------
    hist : dask Array
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : dask Array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    Examples
    --------
    Using number of bins and range:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.arange(10000), chunks=10)
    >>> h, bins = da.histogram(x, bins=10, range=[0, 10000])
    >>> bins
    array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
            8000.,  9000., 10000.])
    >>> h.compute()
    array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

    Explicitly specifying the bins:

    >>> h, bins = da.histogram(x, bins=np.array([0, 5000, 10000]))
    >>> bins
    array([    0,  5000, 10000])
    >>> h.compute()
    array([5000, 5000])
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
        # ^ `np.ndim` is not implemented by Dask array.
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    for argname, val in [("bins", bins), ("range", range), ("weights", weights)]:
        if not isinstance(val, (Array, Delayed)) and is_dask_collection(val):
            raise TypeError(
                "Dask types besides Array and Delayed are not supported "
                "for `histogram`. For argument `{}`, got: {!r}".format(argname, val)
            )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(
                    f"range must be a sequence or array of length 2, but got {len(range)} items"
                )
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(
                f"Expected a sequence or array for range, not {range}"
            ) from None

    token = tokenize(a, bins, range, weights, density)
    name = "histogram-sum-" + token

    if scalar_bins:
        bins = _linspace_from_delayed(range[0], range[1], bins + 1)
        # ^ NOTE `range[1]` is safe because of the above check, and the
        # initial check that range must not be None if `scalar_bins`
    else:
        if not isinstance(bins, (Array, np.ndarray)):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(
                f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}"
            )

    (bins_ref, range_ref), deps = unpack_collections([bins, range])

    # Map the histogram to all bins, forming a 2D array of histograms,
    # stacked for each chunk
    if weights is None:
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref)
            for i, k in enumerate(flatten(a.__dask_keys__()))
        }
        dtype = np.histogram([])[0].dtype
    else:
        a_keys = flatten(a.__dask_keys__())
        w_keys = flatten(weights.__dask_keys__())
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref, w)
            for i, (k, w) in enumerate(zip(a_keys, w_keys))
        }
        dtype = weights.dtype

    deps = (a,) + deps
    if weights is not None:
        deps += (weights,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    # Turn graph into a 2D Array of shape (nchunks, nbins)
    nchunks = len(list(flatten(a.__dask_keys__())))
    nbins = bins.size - 1  # since `bins` is 1D
    chunks = ((1,) * nchunks, (nbins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # We need to replicate normed and density options from numpy
    if density is not None:
        if density:
            db = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            return n / db / n.sum(), bins
        else:
            return n, bins
    else:
        return n, bins
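# --- Sketch of the per-block helper assumed above (not from the original
# source). fast_histogram only supports uniformly spaced bins, so this
# version derives the bin count and range from the (assumed uniform) edge
# array; it is an illustrative guess, not the original implementation.
import numpy as np
from fast_histogram import histogram1d


def _block_fast_hist1d(a, bins, range=None, weights=None):
    a = np.asarray(a).ravel()
    if weights is not None:
        weights = np.asarray(weights).ravel()
    hist = histogram1d(a, bins=len(bins) - 1,
                       range=(bins[0], bins[-1]), weights=weights)
    if weights is None:
        # Match the integer dtype dask_hist1d declares for unweighted counts.
        hist = hist.astype(np.histogram([])[0].dtype)
    return hist[None, :]  # leading axis so per-chunk histograms stack


# Usage:
import dask.array as da

x = da.random.normal(size=100_000, chunks=10_000)
h, edges = dask_hist1d(x, bins=50, range=(-5.0, 5.0))
print(h.compute())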
def array(self, futures, shape=None, chunks=None, dtype=None):
    """
    Turns a set of future arrays (the result of a distributed operation),
    associated to a cartesian communicator, into a Dask Array.

    Parameters
    ----------
    futures: tuple(Future)
        A set of future arrays, one per process of the cartesian
        communicator
    shape: tuple(int)
        The shape of the array
    chunks: tuple(int)
        The chunks of the array
    dtype: dtype
        The dtype of the array
    """
    if len(futures) != len(self):
        raise ValueError("futures and cart must have the same length")

    if chunks is None or dtype is None:
        # Fetch dtype and shape of every future to fill in the blanks
        infos = self.client.map(lambda arr: (arr.dtype, arr.shape), futures)
        infos = tuple(_.result() for _ in infos)
        if dtype is None:
            dtype = infos[0][0]
            if not all(dtype == dtp for (dtp, _) in infos):
                raise TypeError(
                    f"Futures have different dtypes {[info[0] for info in infos]}")
        if chunks is None:
            chunks = infos[0][1]
            if not all(chunks == chn for (_, chn) in infos):
                # TODO: normalize chunks using shape
                raise NotImplementedError(
                    "Futures with non-uniform chunks not supported yet")

    if shape is None:
        shape = list(chunks)
        for _i, _l in self.normalize_dims():
            shape[_i] *= _l

    chunks = normalize_chunks(chunks, shape, dtype=dtype)
    self.check_dims(tuple(len(chunk) for chunk in chunks))

    # All keys of an Array must share one name; derive it once from the
    # first future.
    name = next(iter(futures)).key
    if isinstance(name, tuple):
        name = name[0]
    assert isinstance(name, str)

    dask = {}
    idxs, _ = zip(*self.normalize_dims())
    for coords, future in zip(self.normalize_coords(), futures):
        key = [0] * len(shape)
        for _i, _c in zip(idxs, coords):
            key[_i] = _c
        dask[(name, ) + tuple(key)] = future

    return Array(dask, name, chunks, dtype=dtype, shape=shape)
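# --- Illustration (not from the original source): distributed Futures can
# be placed directly as values in a dask graph, which is what the method
# above relies on. A fresh tokenized name avoids any collision with the
# futures' own keys.
import numpy as np
from dask.array.core import Array
from dask.base import tokenize
from distributed import Client

client = Client(processes=False)
futures = [client.submit(np.full, (2, 3), float(i)) for i in range(2)]
name = 'from-futures-' + tokenize([f.key for f in futures])
dsk = {(name, i, 0): fut for i, fut in enumerate(futures)}
arr = Array(dsk, name, chunks=((2, 2), (3,)), dtype=float, shape=(4, 3))
print(arr.compute())  # rows 0-1 are 0.0, rows 2-3 are 1.0
client.close()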