def _run_dask_cupy(data: da.Array,
                   cellsize_x: Union[int, float],
                   cellsize_y: Union[int, float]) -> da.Array:
    msg = 'Upstream bug in dask prevents cupy backed arrays'
    raise NotImplementedError(msg)

    _func = partial(_run_cupy,
                    cellsize_x=cellsize_x,
                    cellsize_y=cellsize_y)

    out = data.map_overlap(_func,
                           depth=(1, 1),
                           boundary=cupy.nan,
                           dtype=cupy.float32,
                           meta=cupy.array(()))
    return out
def histogram(
    arr: da.Array,
    eda_dtype: DType,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""
    if len(arr.shape) != 1:
        raise ValueError("Histogram only supports 1-d array.")

    srs = dd.from_dask_array(arr)

    if isinstance(eda_dtype, Continuous):
        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = arr.min(axis=0), arr.max(axis=0)

        if bins is None:
            raise ValueError("num_bins cannot be None if calculating numerical histograms.")

        counts, edges = da.histogram(arr, bins, range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges
    elif isinstance(eda_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        # Dask array's unique is way slower than value_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {eda_dtype}")
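# Usage sketch (added for illustration, not from the original source). Assumes
# the module-level imports used by `histogram` above (da, dd, dask, np) and that
# `Continuous` is the dtype class it dispatches on and can be instantiated with
# no arguments.
def _histogram_usage_example() -> None:
    arr = da.from_array(np.random.rand(1_000), chunks=100)
    # Numerical histogram with an explicit range; results are dask collections.
    counts, centers, edges = histogram(arr, Continuous(), bins=10, range=(0, 1))
    counts, centers, edges = dask.compute(counts, centers, edges)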
def test_ragged_blockdims():
    dsk = {('x', 0, 0): np.ones((2, 2)),
           ('x', 0, 1): np.ones((2, 3)),
           ('x', 1, 0): np.ones((5, 2)),
           ('x', 1, 1): np.ones((5, 3))}

    a = Array(dsk, 'x', chunks=[(2, 5), (2, 3)], shape=(7, 5))
    s = symbol('s', '7 * 5 * int')

    assert compute(s.sum(axis=0), a).chunks == ((2, 3),)
    assert compute(s.sum(axis=1), a).chunks == ((2, 5),)
    assert compute(s + 1, a).chunks == a.chunks
def spearman_nxn(data: da.Array) -> da.Array:
    """
    Calculate the Spearman correlation matrix (n x n) for the n columns of data.
    """
    _, ncols = data.shape

    data = data.compute()  # TODO: How to compute rank distributedly?
    ranks = np.empty_like(data)
    for j in range(ncols):
        ranks[:, j] = pd.Series(data[:, j]).rank()
    ranks = da.from_array(ranks)

    corrmat = pearson_nxn(ranks)
    return corrmat
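# Usage sketch (added for illustration, not from the original source). Assumes
# the module-level imports used by `spearman_nxn` above (da, np, pd) and that
# `pearson_nxn` is available in the same module.
def _spearman_nxn_usage_example() -> None:
    data = da.from_array(np.random.rand(100, 3), chunks=(50, 3))
    # Rank-based correlation of the three columns; yields a 3 x 3 matrix.
    corrmat = spearman_nxn(data).compute()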
def calc_hist_kde(
    data: da.Array, bins: int, bandwidth: float
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Calculate a density histogram and its corresponding kernel density
    estimate over a given series. The kernel is gaussian.

    Parameters
    ----------
    data: da.Array
        one numerical column over which to compute the histogram and kde
    bins : int
        number of bins to use in the histogram
    bandwidth: float
        bandwidth for the kde

    Returns
    -------
    Tuple[pd.DataFrame, np.ndarray, np.ndarray]
        The histogram in a dataframe, range of points for the kde,
        and the kde calculated at the specified points
    """
    minv, maxv = dask.compute(data.min(), data.max())
    hist_arr, bins_arr = da.histogram(data, range=[minv, maxv], bins=bins, density=True)
    hist_arr = hist_arr.compute()
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
    })
    pts_rng = np.linspace(minv, maxv, 1000)
    pdf = gaussian_kde(data.compute(), bw_method=bandwidth)(pts_rng)
    return hist_df, pts_rng, pdf
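# Usage sketch (added for illustration, not from the original source). Assumes
# the module-level names used by `calc_hist_kde` above (dask, da, np, pd,
# gaussian_kde, _format_bin_intervals) are importable as in that module.
def _calc_hist_kde_usage_example() -> None:
    data = da.from_array(np.random.randn(5_000), chunks=1_000)
    # Density histogram with 20 bins plus a gaussian KDE on 1000 grid points.
    hist_df, pts_rng, pdf = calc_hist_kde(data, bins=20, bandwidth=0.3)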
def apply(X: Array, YP: Array, BX: Array, BYP: Array) -> Array:
    # Collapse selected variant blocks and alphas into single
    # new covariate dimension
    assert YP.shape[2] == BYP.shape[2]
    n_group_covar = n_covar + BYP.shape[2] * n_alpha_1

    BYP = BYP.reshape((n_outcome, n_sample_block, -1))
    BG = da.concatenate((BX, BYP), axis=-1)
    BG = BG.rechunk((-1, None, -1))
    assert_block_shape(BG, 1, n_sample_block, 1)
    assert_chunk_shape(BG, n_outcome, 1, n_group_covar)
    assert_array_shape(BG, n_outcome, n_sample_block, n_group_covar)

    YP = YP.reshape((n_outcome, n_sample, -1))
    XYP = da.broadcast_to(X, (n_outcome, n_sample, n_covar))
    XG = da.concatenate((XYP, YP), axis=-1)
    XG = XG.rechunk((-1, None, -1))
    assert_block_shape(XG, 1, n_sample_block, 1)
    assert_chunk_shape(XG, n_outcome, sample_chunks[0], n_group_covar)
    assert_array_shape(XG, n_outcome, n_sample, n_group_covar)

    YG = da.map_blocks(
        # Block chunks:
        # (n_outcome, sample_chunks[0], n_group_covar) @
        # (n_outcome, n_group_covar, 1) [after transpose]
        lambda x, b: x @ b.transpose((0, 2, 1)),
        XG,
        BG,
        chunks=(n_outcome, sample_chunks, 1),
    )
    assert_block_shape(YG, 1, n_sample_block, 1)
    assert_chunk_shape(YG, n_outcome, sample_chunks[0], 1)
    assert_array_shape(YG, n_outcome, n_sample, 1)

    YG = da.squeeze(YG, axis=-1).T
    assert_block_shape(YG, n_sample_block, 1)
    assert_chunk_shape(YG, sample_chunks[0], n_outcome)
    assert_array_shape(YG, n_sample, n_outcome)
    return YG
def _run_dask_cupy(data: da.Array) -> da.Array:
    msg = 'Upstream bug in dask prevents cupy backed arrays'
    raise NotImplementedError(msg)

    # add any func args
    # TODO: probably needs cellsize args
    _func = partial(_run_cupy)

    out = data.map_overlap(_func,
                           depth=(1, 1),
                           boundary=cupy.nan,
                           dtype=cupy.float32,
                           meta=cupy.array(()))
    return out
def _dask_imread(files, imread=None, preprocess=None, coerce_shape=None):
    """ Read a stack of images into a dask array """
    from dask.array import Array
    from dask.base import tokenize
    from functools import partial

    if not imread:
        from skimage.io import imread

    def _imread(open_file):
        with open_file as f:
            return imread(f)

    def add_leading_dimension(x):
        return x[None, ...]

    filenames = [f.path for f in files]

    name = 'imread-%s' % tokenize(filenames)

    if coerce_shape is not None:
        reshape = partial(_coerce_shape, shape=coerce_shape)

    with files[0] as f:
        sample = imread(f)
    if coerce_shape is not None:
        sample = reshape(sample)
    if preprocess:
        sample = preprocess(sample)

    keys = [(name, i) + (0, ) * len(sample.shape) for i in range(len(files))]

    if coerce_shape is not None:
        if preprocess:
            values = [(add_leading_dimension,
                       (preprocess, (reshape, (_imread, f))))
                      for f in files]
        else:
            values = [(add_leading_dimension,
                       (reshape, (_imread, f)))
                      for f in files]
    elif preprocess:
        values = [(add_leading_dimension, (preprocess, (_imread, f))) for f in files]
    else:
        values = [(add_leading_dimension, (_imread, f)) for f in files]
    dsk = dict(zip(keys, values))

    chunks = ((1, ) * len(files), ) + tuple((d, ) for d in sample.shape)

    return Array(dsk, name, chunks, sample.dtype)
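# Usage sketch (added for illustration, not from the original source). The glob
# path is hypothetical; `files` are assumed to be fsspec OpenFile objects, which
# provide both the `.path` attribute and the context manager `_dask_imread`
# relies on above.
def _dask_imread_usage_example() -> None:
    import fsspec

    files = fsspec.open_files("data/frames/*.png")  # hypothetical location
    # Lazy (n_images, H, W[, C]) stack, one image per chunk along the first axis.
    stack = _dask_imread(files)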
def _load_GeoTransform(self):
    """Calculate the latitude and longitude variables from the
    gdal.Open.GetGeoTransform method"""
    def load_lon():
        return arange(ds.RasterXSize) * b[1] + b[0]

    def load_lat():
        return arange(ds.RasterYSize) * b[5] + b[3]

    ds = self.ds
    b = self.ds.GetGeoTransform()  # bbox, interval
    if with_dask:
        lat = Array(
            {('lat', 0): (load_lat, )}, 'lat', (self.ds.RasterYSize, ),
            shape=(self.ds.RasterYSize, ), dtype=float)
        lon = Array(
            {('lon', 0): (load_lon, )}, 'lon', (self.ds.RasterXSize, ),
            shape=(self.ds.RasterXSize, ), dtype=float)
    else:
        lat = load_lat()
        lon = load_lon()
    return Variable(('lat', ), lat), Variable(('lon', ), lon)
def pairwise_distances(X: da.Array,
                       Y: ArrayLike,
                       metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
                       n_jobs: Optional[int] = None,
                       **kwargs: Any):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y), ))
    return X.map_blocks(metrics.pairwise_distances,
                        Y,
                        dtype=float,
                        chunks=chunks,
                        metric=metric,
                        **kwargs)
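# Usage sketch (added for illustration, not from the original source). Assumes
# the module imports sklearn.metrics as `metrics`, as `pairwise_distances`
# above requires.
def _pairwise_distances_usage_example() -> None:
    X = da.random.random((1_000, 8), chunks=(250, 8))
    Y = np.random.rand(5, 8)
    # Distances from every chunked row of X to the 5 in-memory reference rows.
    D = pairwise_distances(X, Y, metric="euclidean")  # shape (1000, 5)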
def scatter_with_regression(
    x: da.Array, y: da.Array, sample_size: int, k: Optional[int] = None
) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array], Optional[da.Array]]:
    """Fit a linear regression line over two given arrays and sample
    the points for a scatter plot.

    Parameters
    ----------
    x : da.Array
    y : da.Array
    sample_size : int
    k : Optional[int] = None
        Highlight the k points which influence the pearson correlation most
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    xp1 = da.vstack([x, da.ones_like(x)]).T
    xp1 = xp1.rechunk((xp1.chunks[0], -1))

    mask = ~(da.isnan(x) | da.isnan(y))
    # if chunk size in the first dimension is 1, lstsq will use sfqr instead of tsqr,
    # where the former does not support nan in shape.
    if len(xp1.chunks[0]) == 1:
        xp1 = xp1.rechunk((2, -1))
        y = y.rechunk((2, -1))
        mask = mask.rechunk((2, -1))

    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xp1[mask], y[mask])

    if sample_size < x.shape[0]:
        samplesel = da.random.choice(x.shape[0], int(sample_size), chunks=x.chunksize)
        x = x[samplesel]
        y = y[samplesel]

    if k is None:
        return (coeffa, coeffb), (x, y), None

    influences = pearson_influence(x, y)
    return (coeffa, coeffb), (x, y), influences
def _rechunk(dask_array: da.Array):
    ndim_to_chunks = {
        2: {0: -1, 1: -1},
        3: {0: "auto", 1: -1, 2: -1},
        4: {0: "auto", 1: "auto", 2: -1, 3: -1},
    }
    return dask_array.rechunk(ndim_to_chunks[dask_array.ndim])
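# Usage sketch (added for illustration, not from the original source). Assumes
# dask.array is imported as `da`, as in the function above.
def _rechunk_usage_example() -> None:
    stack = da.random.random((64, 512, 512), chunks=(8, 128, 128))
    # Each 512 x 512 plane becomes a single chunk along the trailing axes,
    # while the leading axis keeps an automatically chosen chunk size.
    rechunked = _rechunk(stack)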
def _unchunk_ifneeded(data: da.Array, axis: int) -> da.Array:
    """Returns `data` unchunked along `axis`.

    Parameters
    ----------
    data : :class:`dask.array.Array`
        Data which may be chunked along `axis`.
    axis : :class:`int`
        Axis number which specifies the axis to unchunk.

    Returns
    -------
    data : :class:`dask.array.Array`
        A dask array which is not chunked along the specified axis.
    """
    if isinstance(data, da.Array):
        shape = data.shape
        chunksize = data.chunksize
        axis = _check_axis(axis, data.ndim)
        if shape[axis] != chunksize[axis]:
            data = data.rechunk({axis: -1})
        return data
    else:
        raise TypeError("data must be a dask array.")
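# Usage sketch (added for illustration, not from the original source). Assumes
# dask.array is imported as `da` and `_check_axis` is available in the module,
# as required by `_unchunk_ifneeded` above.
def _unchunk_ifneeded_usage_example() -> None:
    data = da.random.random((100, 100), chunks=(50, 50))
    # Force a single chunk along axis 1, e.g. before an operation that needs
    # the whole axis in memory; the chunksize becomes (50, 100).
    data = _unchunk_ifneeded(data, axis=1)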
def _max_str_len(arr: Array) -> Array:
    return arr.map_blocks(lambda s: np.char.str_len(s.astype(str)), dtype=np.int8).max()
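# Usage sketch (added for illustration, not from the original source). Assumes
# dask.array is imported as `da` alongside the `np` used above.
def _max_str_len_usage_example() -> None:
    arr = da.from_array(np.array(["A", "CT", "GATTACA"]), chunks=2)
    # Longest string length across all blocks; here the result is 7.
    longest = _max_str_len(arr).compute()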
def array(self, futures, shape=None, chunks=None, dtype=None):
    """
    Turns a set of future arrays (result of a distributed operation),
    associated to a cartesian communicator, into a Dask Array.

    Parameters
    ----------
    futures: tuple(futures)
        A set of future arrays associated to the cartesian communicator
    shape: tuple(int)
        The shape of the array
    chunks: tuple(int)
        The chunks of the array
    dtype: dtype
        The dtype of the array
    """
    if not len(futures) == len(self):
        raise ValueError("futures and cart must have the same length")

    if chunks is None or dtype is None:
        infos = self.client.map(lambda arr: (arr.dtype, arr.shape), futures)
        infos = tuple(_.result() for _ in infos)

        if dtype is None:
            dtype = infos[0][0]
            if not all((dtype == dtp for (dtp, _) in infos)):
                raise TypeError(
                    f"Futures have different dtypes {[info[0] for info in infos]}")

        if chunks is None:
            chunks = infos[0][1]
            if not all((chunks == chn for (_, chn) in infos)):
                # TODO: normalize chunks using shape
                raise NotImplementedError(
                    "Futures with non-uniform chunks not supported yet")

    if shape is None:
        shape = list(chunks)
        for _i, _l in self.normalize_dims():
            shape[_i] *= _l

    chunks = normalize_chunks(chunks, shape, dtype=dtype)
    self.check_dims(tuple(len(chunk) for chunk in chunks))

    dask = {}
    idxs, _ = zip(*self.normalize_dims())
    for coords, future in zip(self.normalize_coords(), futures):
        key = [0] * len(shape)
        for _i, _c in zip(idxs, coords):
            key[_i] = _c
        name = next(iter(futures)).key
        if isinstance(name, tuple):
            name = name[0]
        assert isinstance(name, str)
        key = (name, ) + tuple(key)
        dask[key] = future

    return Array(dask,
                 next(iter(dask.keys()))[0],
                 chunks,
                 dtype=dtype,
                 shape=shape)
def _stage_2(
    YP: Array,
    X: Array,
    Y: Array,
    alphas: Optional[NDArray] = None,
    normalize: bool = True,
    _glow_adj_alpha: bool = False,
    _glow_adj_scaling: bool = False,
) -> Tuple[Array, Array]:
    """Stage 2 - WGR Meta Regression

    This stage will train separate ridge regression models for each outcome
    using the predictions from stage 1 for that same outcome as features. These
    predictions are then evaluated based on R2 score to determine an optimal
    "meta" estimator (see `_stage_1` for the "base" estimator description).
    Results then include only predictions and coefficients from this optimal model.

    For more details, see the level 1 regression model described in step 1 of
    [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert YP.ndim == 4
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0]
    assert YP.chunks[2] == X.chunks[0] == Y.chunks[0]
    # Assert single chunks for covariates and outcomes
    assert X.numblocks[1] == Y.numblocks[1] == 1
    # Extract shape statistics
    n_variant_block, n_alpha_1 = YP.shape[:2]
    n_sample_block = Y.numblocks[0]
    n_sample, n_outcome = Y.shape
    n_covar = X.shape[1]
    n_indvar = n_covar + n_variant_block * n_alpha_1
    sample_chunks = Y.chunks[0]

    if normalize:
        assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
        assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome)
        # See: https://github.com/projectglow/glow/issues/260
        if _glow_adj_scaling:
            YP = da.map_blocks(
                lambda x: (x - x.mean(axis=2, keepdims=True))
                / x.std(axis=2, keepdims=True),
                YP,
            )
        else:
            YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True)
    # Transpose for refit on level 1 predictions
    YP = YP.transpose((3, 2, 0, 1))
    assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1)

    if alphas is None:
        # See: https://github.com/projectglow/glow/issues/255
        if _glow_adj_alpha:
            alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome)
        else:
            alphas = get_alphas(n_variant_block * n_alpha_1)
    n_alpha_2 = alphas.size

    YR = []
    BR = []
    for i in range(n_outcome):
        # Slice and reshape to new 2D covariate matrix;
        # The order of raveling in trailing dimensions is important
        # and later reshapes will assume variants, alphas order
        XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1))
        # Prepend covariates and chunk along first dim only
        XPB = da.concatenate((X, XPB), axis=1)
        XPB = XPB.rechunk(chunks=(None, -1))
        assert_array_shape(XPB, n_sample, n_indvar)
        assert XPB.numblocks == (n_sample_block, 1)
        # Extract outcome vector
        YB = Y[:, [i]]
        assert XPB.ndim == YB.ndim == 2
        # Fit and predict folds for each parameter
        BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:]
        assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1)
        assert_array_shape(YPB, n_alpha_2, n_sample, 1)
        BR.append(BB)
        YR.append(YPB)

    # Concatenate predictions along outcome dimension
    YR = da.concatenate(YR, axis=2)
    assert_block_shape(YR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1)
    assert_array_shape(YR, n_alpha_2, n_sample, n_outcome)
    # Move samples to last dim so all others are batch
    # dims for R2 calculations
    YR = da.transpose(YR, (0, 2, 1))
    assert_array_shape(YR, n_alpha_2, n_outcome, n_sample)
    YR = YR.rechunk((-1, -1, None))
    assert_block_shape(YR, 1, 1, n_sample_block)
    assert YR.shape[1:] == Y.T.shape

    # Concatenate betas along outcome dimension
    BR = da.concatenate(BR, axis=2)
    assert_block_shape(BR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(BR, n_alpha_2, n_indvar, 1)
    assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome)

    # Compute R2 scores within each sample block for each outcome + alpha
    R2 = da.stack(
        [
            r2_score(YR.blocks[..., i], Y.T.blocks[..., i])
            # Avoid warnings on R2 calculations for blocks with single rows
            if YR.chunks[-1][i] > 1
            else da.full(YR.shape[:-1], np.nan)
            for i in range(n_sample_block)
        ]
    )
    assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome)
    # Coerce to finite or nan before nan-aware mean
    R2 = da.where(da.isfinite(R2), R2, np.nan)
    # Find highest mean alpha score for each outcome across blocks
    R2M = da.nanmean(R2, axis=0)
    assert_array_shape(R2M, n_alpha_2, n_outcome)
    # Identify index for the alpha value with the highest mean score
    R2I = da.argmax(R2M, axis=0)
    assert_array_shape(R2I, n_outcome)

    # Choose the predictions corresponding to the model with best score
    YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1)
    YRM = YRM.rechunk((None, -1))
    assert_block_shape(YRM, n_sample_block, 1)
    assert_chunk_shape(YRM, sample_chunks[0], n_outcome)
    assert_array_shape(YRM, n_sample, n_outcome)
    # Choose the betas corresponding to the model with the best score
    BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1)
    BRM = BRM.rechunk((None, -1))
    assert_block_shape(BRM, n_sample_block, 1)
    assert_chunk_shape(BRM, n_indvar, n_outcome)
    assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome)
    return BRM, YRM
def _map_blocks_asnumpy(x: Array) -> Array:
    if da.utils.is_cupy_type(x._meta):  # pragma: no cover
        import cupy as cp  # type: ignore[import]

        x = x.map_blocks(cp.asnumpy)
    return x
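# Usage sketch (added for illustration, not from the original source). Assumes
# dask.array is imported as `da`; CuPy is only needed when the input is
# actually GPU-backed.
def _map_blocks_asnumpy_usage_example() -> None:
    x = da.random.random((10, 10), chunks=5)
    # A NumPy-backed array passes through unchanged; a CuPy-backed array
    # would be copied block-by-block to host memory.
    x = _map_blocks_asnumpy(x)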
def est_motion_part(
    varr: darr.Array, npart: int, chunk_nfm: int, alt_error=5, **kwargs
) -> Tuple[darr.Array, darr.Array]:
    """
    Construct dask graph for the recursive motion estimation algorithm.

    Parameters
    ----------
    varr : darr.Array
        Input dask array representing movie data.
    npart : int
        Number of frames/chunks to combine for the recursive algorithm.
    chunk_nfm : int
        Number of frames in each parallel task.
    alt_error : int, optional
        Error threshold between estimated shifts from two alternative methods,
        specified in pixels. By default `5`.

    Returns
    -------
    temps : darr.Array
        Registration template for the movie.
    shifts : darr.Array
        Estimated motion.

    See Also
    --------
    estimate_motion
    """
    if chunk_nfm is None:
        chunk_nfm = varr.chunksize[0]
    varr = varr.rechunk((chunk_nfm, None, None))
    arr_opt = fct.partial(custom_arr_optimize, keep_patterns=["^est_motion_chunk"])
    if kwargs.get("mesh_size", None):
        param = get_bspline_param(varr[0].compute(), kwargs["mesh_size"])
    tmp_ls = []
    sh_ls = []
    for blk in varr.blocks:
        res = da.delayed(est_motion_chunk)(
            blk, None, alt_error=alt_error, npart=npart, **kwargs
        )
        if alt_error:
            tmp = darr.from_delayed(
                res[0], shape=(3, blk.shape[1], blk.shape[2]), dtype=blk.dtype
            )
        else:
            tmp = darr.from_delayed(
                res[0], shape=(blk.shape[1], blk.shape[2]), dtype=blk.dtype
            )
        if kwargs.get("mesh_size", None):
            sh = darr.from_delayed(
                res[1],
                shape=(blk.shape[0], 2, int(param[1]), int(param[0])),
                dtype=float,
            )
        else:
            sh = darr.from_delayed(res[1], shape=(blk.shape[0], 2), dtype=float)
        tmp_ls.append(tmp)
        sh_ls.append(sh)
    with da.config.set(array_optimize=arr_opt):
        temps = da.optimize(darr.stack(tmp_ls, axis=0))[0]
        shifts = da.optimize(darr.concatenate(sh_ls, axis=0))[0]
    while temps.shape[0] > 1:
        tmp_ls = []
        sh_ls = []
        for idx in np.arange(0, temps.numblocks[0], npart):
            tmps = temps.blocks[idx:idx + npart]
            sh_org = shifts.blocks[idx:idx + npart]
            sh_org_ls = [sh_org.blocks[i] for i in range(sh_org.numblocks[0])]
            res = da.delayed(est_motion_chunk)(
                tmps, sh_org_ls, alt_error=alt_error, npart=npart, **kwargs
            )
            if alt_error:
                tmp = darr.from_delayed(
                    res[0], shape=(3, tmps.shape[1], tmps.shape[2]), dtype=tmps.dtype
                )
            else:
                tmp = darr.from_delayed(
                    res[0], shape=(tmps.shape[1], tmps.shape[2]), dtype=tmps.dtype
                )
            sh_new = darr.from_delayed(res[1], shape=sh_org.shape, dtype=sh_org.dtype)
            tmp_ls.append(tmp)
            sh_ls.append(sh_new)
        temps = darr.stack(tmp_ls, axis=0)
        shifts = darr.concatenate(sh_ls, axis=0)
    return temps, shifts
def _encode_dask_array(
    values: da.Array,
    uniques: Optional[np.ndarray] = None,
    encode: bool = False,
    onehot_dtype: Optional[np.dtype] = None,
):
    """One-hot or label encode a dask array.

    Parameters
    ----------
    values : da.Array, shape [n_samples,]
    uniques : np.ndarray, shape [n_uniques,]
    encode : bool, default False
        Whether to encode the values (True) or just discover the uniques.
    onehot_dtype : np.dtype, optional
        Optional dtype for the resulting one-hot encoded array. This changes
        the shape, dtype, and underlying storage of the returned dask array.

        ======= ================= =========================
        thing   onehot_dtype=None onehot_dtype=onehot_dtype
        ======= ================= =========================
        shape   (n_samples,)      (n_samples, len(uniques))
        dtype   np.intp           onehot_dtype
        storage np.ndarray        scipy.sparse.csr_matrix
        ======= ================= =========================

    Returns
    -------
    uniques : ndarray
        The discovered uniques (uniques=None) or just `uniques`
    encoded : da.Array, optional
        The encoded values. Only returned when ``encode=True``.
    """
    if uniques is None:
        if encode and onehot_dtype:
            raise ValueError(
                "Cannot use 'encode' and 'onehot_dtype' simultaneously.")
        if encode:
            uniques, encoded = da.unique(values, return_inverse=True)
            return uniques, encoded
        else:
            return da.unique(values)

    if encode:
        if onehot_dtype:
            dtype = onehot_dtype
            new_axis: Optional[int] = 1
            chunks = values.chunks + (len(uniques), )
        else:
            dtype = np.dtype("int")
            new_axis = None
            chunks = values.chunks
        return (
            uniques,
            values.map_blocks(
                _check_and_search_block,
                uniques,
                onehot_dtype=onehot_dtype,
                dtype=dtype,
                new_axis=new_axis,
                chunks=chunks,
            ),
        )
    else:
        return uniques
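# Usage sketch (added for illustration, not from the original source). Assumes
# the `da` and `np` imports used by `_encode_dask_array` above.
def _encode_dask_array_usage_example() -> None:
    values = da.from_array(np.array(["a", "b", "a", "c"]), chunks=2)
    # Discover the categories only.
    uniques = _encode_dask_array(values).compute()
    # Discover the categories and label-encode the values in one pass.
    uniques, encoded = _encode_dask_array(values, uniques=None, encode=True)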
def dask_hist1d(
    a: Array, bins=None, range=None, normed=False, weights=None, density=None
):
    """
    Blocked variant of :func:`numpy.histogram`, but using the fast-histogram module.

    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the ``bins`` or the number of ``bins``
        and a ``range`` argument is required as computing ``min`` and ``max``
        over blocked arrays is an expensive operation that must be performed
        explicitly.
        If `bins` is an int, it defines the number of equal-width bins in the
        given range (10, by default). If `bins` is a sequence, it defines a
        monotonically increasing array of bin edges, including the rightmost
        edge, allowing for non-uniform bin widths.
    range : (float, float), optional
        The lower and upper range of the bins. If not provided, range is
        simply ``(a.min(), a.max())``. Values outside the range are ignored.
        The first element of the range must be less than or equal to the
        second. `range` affects the automatic bin computation as well. While
        bin width is computed to be optimal based on the actual data within
        `range`, the bin count will fill the entire range including portions
        containing no data.
    normed : bool, optional
        This is equivalent to the ``density`` argument, but produces incorrect
        results for unequal bin widths. It should not be used.
    weights : array_like, optional
        A dask.array.Array of weights, of the same block structure as ``a``.
        Each value in ``a`` only contributes its associated weight towards the
        bin count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1.
    density : bool, optional
        If ``False``, the result will contain the number of samples in each
        bin. If ``True``, the result is the value of the probability *density*
        function at the bin, normalized such that the *integral* over the
        range is 1. Note that the sum of the histogram values will not be
        equal to 1 unless bins of unity width are chosen; it is not a
        probability *mass* function. Overrides the ``normed`` keyword if
        given. If ``density`` is True, ``bins`` cannot be a single-number
        delayed value. It must be a concrete number, or a (possibly-delayed)
        array/sequence of the bin edges.

    Returns
    -------
    hist : dask Array
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : dask Array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    Examples
    --------
    Using number of bins and range:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.arange(10000), chunks=10)
    >>> h, bins = da.histogram(x, bins=10, range=[0, 10000])
    >>> bins
    array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
            8000.,  9000., 10000.])
    >>> h.compute()
    array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

    Explicitly specifying the bins:

    >>> h, bins = da.histogram(x, bins=np.array([0, 5000, 10000]))
    >>> bins
    array([    0,  5000, 10000])
    >>> h.compute()
    array([5000, 5000])
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
        # ^ `np.ndim` is not implemented by Dask array.
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    for argname, val in [("bins", bins), ("range", range), ("weights", weights)]:
        # Check `val` (not `bins`) so that each argument is validated and the
        # error message matches the offending argument.
        if not isinstance(val, (Array, Delayed)) and is_dask_collection(val):
            raise TypeError(
                "Dask types besides Array and Delayed are not supported "
                "for `histogram`. For argument `{}`, got: {!r}".format(argname, val)
            )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(
                    f"range must be a sequence or array of length 2, but got {len(range)} items"
                )
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(
                f"Expected a sequence or array for range, not {range}"
            ) from None

    token = tokenize(a, bins, range, weights, density)
    name = "histogram-sum-" + token

    if scalar_bins:
        bins = _linspace_from_delayed(range[0], range[1], bins + 1)
        # ^ NOTE `range[1]` is safe because of the above check, and the initial check
        # that range must not be None if `scalar_bins`
    else:
        if not isinstance(bins, (Array, np.ndarray)):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(
                f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}"
            )

    (bins_ref, range_ref), deps = unpack_collections([bins, range])

    # Map the histogram to all bins, forming a 2D array of histograms, stacked for each chunk
    if weights is None:
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref)
            for i, k in enumerate(flatten(a.__dask_keys__()))
        }
        dtype = np.histogram([])[0].dtype
    else:
        a_keys = flatten(a.__dask_keys__())
        w_keys = flatten(weights.__dask_keys__())
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref, w)
            for i, (k, w) in enumerate(zip(a_keys, w_keys))
        }
        dtype = weights.dtype

    deps = (a,) + deps
    if weights is not None:
        deps += (weights,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    # Turn graph into a 2D Array of shape (nchunks, nbins)
    nchunks = len(list(flatten(a.__dask_keys__())))
    nbins = bins.size - 1  # since `bins` is 1D
    chunks = ((1,) * nchunks, (nbins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # We need to replicate normed and density options from numpy
    if density is not None:
        if density:
            db = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            return n / db / n.sum(), bins
        else:
            return n, bins
    else:
        return n, bins