Example no. 1
def _run_dask_cupy(data: da.Array, cellsize_x: Union[int, float],
                   cellsize_y: Union[int, float]) -> da.Array:
    msg = 'Upstream bug in dask prevents cupy backed arrays'
    raise NotImplementedError(msg)

    _func = partial(_run_cupy, cellsize_x=cellsize_x, cellsize_y=cellsize_y)

    out = data.map_overlap(_func,
                           depth=(1, 1),
                           boundary=cupy.nan,
                           dtype=cupy.float32,
                           meta=cupy.array(()))
    return out
Example no. 2
def histogram(
    arr: da.Array,
    eda_dtype: DType,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""
    if len(arr.shape) != 1:
        raise ValueError("Histogram only supports 1-d array.")
    srs = dd.from_dask_array(arr)
    if isinstance(eda_dtype, Continuous):
        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = arr.min(axis=0), arr.max(axis=0)

        if bins is None:
            raise ValueError("num_bins cannot be None if calculating numerical histograms.")

        counts, edges = da.histogram(arr, bins, range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges
    elif isinstance(eda_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        # Dask array's unique is way slower than the values_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)

        value_counts = srs.value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()

        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {eda_dtype}")
Example no. 3
def test_ragged_blockdims():
    dsk = {('x', 0, 0): np.ones((2, 2)),
           ('x', 0, 1): np.ones((2, 3)),
           ('x', 1, 0): np.ones((5, 2)),
           ('x', 1, 1): np.ones((5, 3))}

    a = Array(dsk, 'x', chunks=[(2, 5), (2, 3)], shape=(7, 5))
    s = symbol('s', '7 * 5 * int')

    assert compute(s.sum(axis=0), a).chunks == ((2, 3),)
    assert compute(s.sum(axis=1), a).chunks == ((2, 5),)

    assert compute(s + 1, a).chunks == a.chunks
Example no. 4
def spearman_nxn(data: da.Array) -> da.Array:
    """
    Compute the n x n Spearman correlation matrix for n columns.
    """
    _, ncols = data.shape
    data = data.compute()  # TODO: how to compute ranks in a distributed way?

    ranks = np.empty_like(data)
    for j in range(ncols):
        ranks[:, j] = pd.Series(data[:, j]).rank()
    ranks = da.from_array(ranks)
    corrmat = pearson_nxn(ranks)
    return corrmat
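
The rank-then-Pearson idea can be sketched on its own, with da.corrcoef standing in for pearson_nxn (which is not shown here); this is only an illustration of the pattern, not the library's implementation:

import numpy as np
import pandas as pd
import dask.array as da

data = np.random.rand(100, 3)
ranks = np.empty_like(data)
for j in range(data.shape[1]):
    ranks[:, j] = pd.Series(data[:, j]).rank()

# Pearson correlation of the ranks is the Spearman correlation matrix.
corrmat = da.corrcoef(da.from_array(ranks).T)  # rows are variables here
print(corrmat.compute().shape)  # (3, 3)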
Example no. 5
def calc_hist_kde(
        data: da.Array, bins: int,
        bandwidth: float) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Calculate a density histogram and its corresponding kernel density
    estimate over a given series. The kernel is Gaussian.

    Parameters
    ----------
    data: da.Array
        one numerical column over which to compute the histogram and kde
    bins : int
        number of bins to use in the histogram
    bandwidth: float
        bandwidth for the kde

    Returns
    -------
    Tuple[pd.DataFrame, np.ndarray, np.ndarray]
        The histogram in a dataframe, range of points for the kde,
        and the kde calculated at the specified points
    """
    minv, maxv = dask.compute(data.min(), data.max())
    hist_arr, bins_arr = da.histogram(data,
                                      range=[minv, maxv],
                                      bins=bins,
                                      density=True)
    hist_arr = hist_arr.compute()
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
    })
    pts_rng = np.linspace(minv, maxv, 1000)
    pdf = gaussian_kde(data.compute(), bw_method=bandwidth)(pts_rng)
    return hist_df, pts_rng, pdf
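
Stripped of the surrounding helpers (_format_bin_intervals is omitted), the density-histogram-plus-KDE pattern looks like this sketch:

import numpy as np
import dask
import dask.array as da
from scipy.stats import gaussian_kde

data = da.from_array(np.random.normal(size=10_000), chunks=2_500)
minv, maxv = dask.compute(data.min(), data.max())
hist, edges = da.histogram(data, bins=50, range=[minv, maxv], density=True)

# Evaluate a Gaussian KDE on a fixed grid of points.
pts = np.linspace(minv, maxv, 1000)
pdf = gaussian_kde(data.compute(), bw_method=0.3)(pts)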
Example no. 6
    def apply(X: Array, YP: Array, BX: Array, BYP: Array) -> Array:
        # Collapse selected variant blocks and alphas into single
        # new covariate dimension
        assert YP.shape[2] == BYP.shape[2]
        n_group_covar = n_covar + BYP.shape[2] * n_alpha_1

        BYP = BYP.reshape((n_outcome, n_sample_block, -1))
        BG = da.concatenate((BX, BYP), axis=-1)
        BG = BG.rechunk((-1, None, -1))
        assert_block_shape(BG, 1, n_sample_block, 1)
        assert_chunk_shape(BG, n_outcome, 1, n_group_covar)
        assert_array_shape(BG, n_outcome, n_sample_block, n_group_covar)

        YP = YP.reshape((n_outcome, n_sample, -1))
        XYP = da.broadcast_to(X, (n_outcome, n_sample, n_covar))
        XG = da.concatenate((XYP, YP), axis=-1)
        XG = XG.rechunk((-1, None, -1))
        assert_block_shape(XG, 1, n_sample_block, 1)
        assert_chunk_shape(XG, n_outcome, sample_chunks[0], n_group_covar)
        assert_array_shape(XG, n_outcome, n_sample, n_group_covar)

        YG = da.map_blocks(
            # Block chunks:
            # (n_outcome, sample_chunks[0], n_group_covar) @
            # (n_outcome, n_group_covar, 1) [after transpose]
            lambda x, b: x @ b.transpose((0, 2, 1)),
            XG,
            BG,
            chunks=(n_outcome, sample_chunks, 1),
        )
        assert_block_shape(YG, 1, n_sample_block, 1)
        assert_chunk_shape(YG, n_outcome, sample_chunks[0], 1)
        assert_array_shape(YG, n_outcome, n_sample, 1)
        YG = da.squeeze(YG, axis=-1).T
        assert_block_shape(YG, n_sample_block, 1)
        assert_chunk_shape(YG, sample_chunks[0], n_outcome)
        assert_array_shape(YG, n_sample, n_outcome)
        return YG
Example no. 7
def _run_dask_cupy(data: da.Array) -> da.Array:
    msg = 'Upstream bug in dask prevents cupy backed arrays'
    raise NotImplementedError(msg)

    # add any func args
    # TODO: probably needs cellsize args
    _func = partial(_run_cupy)

    out = data.map_overlap(_func,
                           depth=(1, 1),
                           boundary=cupy.nan,
                           dtype=cupy.float32,
                           meta=cupy.array(()))
    return out
Example no. 8
def _dask_imread(files, imread=None, preprocess=None, coerce_shape=None):
    """ Read a stack of images into a dask array """
    from dask.array import Array
    from dask.base import tokenize
    from functools import partial

    if not imread:
        from skimage.io import imread

    def _imread(open_file):
        with open_file as f:
            return imread(f)

    def add_leading_dimension(x):
        return x[None, ...]

    filenames = [f.path for f in files]

    name = 'imread-%s' % tokenize(filenames)

    if coerce_shape is not None:
        reshape = partial(_coerce_shape, shape=coerce_shape)

    with files[0] as f:
        sample = imread(f)
    if coerce_shape is not None:
        sample = reshape(sample)
    if preprocess:
        sample = preprocess(sample)

    keys = [(name, i) + (0, ) * len(sample.shape) for i in range(len(files))]

    if coerce_shape is not None:
        if preprocess:
            values = [(add_leading_dimension,
                       (preprocess, (reshape, (_imread, f)))) for f in files]
        else:
            values = [(add_leading_dimension, (reshape, (_imread, f)))
                      for f in files]
    elif preprocess:
        values = [(add_leading_dimension, (preprocess, (_imread, f)))
                  for f in files]
    else:
        values = [(add_leading_dimension, (_imread, f)) for f in files]
    dsk = dict(zip(keys, values))

    chunks = ((1, ) * len(files), ) + tuple((d, ) for d in sample.shape)

    return Array(dsk, name, chunks, sample.dtype)
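
The Array construction from a raw task graph can be illustrated with plain NumPy blocks in place of image files (a sketch only; make_block is a hypothetical stand-in for reading one image and adding the leading dimension):

import numpy as np
from dask.array import Array
from dask.base import tokenize

def make_block(i):
    # Stand-in for imread + add_leading_dimension: one (1, 2, 2) block.
    return np.full((1, 2, 2), i, dtype=float)

n = 3
name = 'imread-%s' % tokenize(list(range(n)))
dsk = {(name, i, 0, 0): (make_block, i) for i in range(n)}
chunks = ((1,) * n, (2,), (2,))
stack = Array(dsk, name, chunks, np.dtype(float))
print(stack.compute().shape)  # (3, 2, 2)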
Example no. 9
    def _load_GeoTransform(self):
        """Calculate latitude and longitude variable calculated from the
        gdal.Open.GetGeoTransform method"""
        def load_lon():
            return arange(ds.RasterXSize) * b[1] + b[0]

        def load_lat():
            return arange(ds.RasterYSize) * b[5] + b[3]

        ds = self.ds
        b = self.ds.GetGeoTransform()  # bbox, interval
        if with_dask:
            lat = Array({('lat', 0): (load_lat, )},
                        'lat', (self.ds.RasterYSize, ),
                        shape=(self.ds.RasterYSize, ),
                        dtype=float)
            lon = Array({('lon', 0): (load_lon, )},
                        'lon', (self.ds.RasterXSize, ),
                        shape=(self.ds.RasterXSize, ),
                        dtype=float)
        else:
            lat = load_lat()
            lon = load_lon()
        return Variable(('lat', ), lat), Variable(('lon', ), lon)
Example no. 10
def pairwise_distances(X: da.Array,
                       Y: ArrayLike,
                       metric: Union[str, Callable[[ArrayLike, ArrayLike],
                                                   float]] = "euclidean",
                       n_jobs: Optional[int] = None,
                       **kwargs: Any):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y), ))
    return X.map_blocks(metrics.pairwise_distances,
                        Y,
                        dtype=float,
                        chunks=chunks,
                        metric=metric,
                        **kwargs)
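
A minimal usage sketch, assuming `metrics` refers to scikit-learn's metrics module:

import numpy as np
import dask.array as da
from sklearn import metrics

X = da.random.random((100, 4), chunks=(25, 4))
Y = np.random.random((7, 4))

# One distance matrix per block of X against the full (in-memory) Y.
chunks = (X.chunks[0], (len(Y),))
D = X.map_blocks(metrics.pairwise_distances, Y,
                 dtype=float, chunks=chunks, metric="euclidean")
print(D.shape)  # (100, 7)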
Example no. 11
def scatter_with_regression(
    x: da.Array,
    y: da.Array,
    sample_size: int,
    k: Optional[int] = None
) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array],
           Optional[da.Array]]:
    """Calculate pearson correlation on 2 given arrays.

    Parameters
    ----------
    xarr : da.Array
    yarr : da.Array
    sample_size : int
    k : Optional[int] = None
        Highlight k points which influence pearson correlation most
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    xp1 = da.vstack([x, da.ones_like(x)]).T
    xp1 = xp1.rechunk((xp1.chunks[0], -1))

    mask = ~(da.isnan(x) | da.isnan(y))
    # If there is only one chunk along the first dimension, lstsq will use sfqr
    # instead of tsqr; sfqr does not support NaN (unknown) chunk sizes.

    if len(xp1.chunks[0]) == 1:
        xp1 = xp1.rechunk((2, -1))
        y = y.rechunk((2, -1))
        mask = mask.rechunk((2, -1))

    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xp1[mask], y[mask])

    if sample_size < x.shape[0]:
        samplesel = da.random.choice(x.shape[0],
                                     int(sample_size),
                                     chunks=x.chunksize)
        x = x[samplesel]
        y = y[samplesel]

    if k is None:
        return (coeffa, coeffb), (x, y), None

    influences = pearson_influence(x, y)
    return (coeffa, coeffb), (x, y), influences
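
The straight-line fit at the core of this function can be sketched on its own (pearson_influence, the NaN mask, and the sampling step are omitted):

import dask.array as da

x = da.random.random(1_000, chunks=250)
y = 2 * x + 1

# Design matrix [x, 1] for a least-squares straight-line fit.
xp1 = da.vstack([x, da.ones_like(x)]).T
xp1 = xp1.rechunk((xp1.chunks[0], -1))

(slope, intercept), _, _, _ = da.linalg.lstsq(xp1, y)
print(float(slope), float(intercept))  # ~2.0 and ~1.0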
Example no. 12
def _rechunk(dask_array: da.Array):
    ndim_to_chunks = {
        2: {
            0: -1,
            1: -1
        },
        3: {
            0: "auto",
            1: -1,
            2: -1
        },
        4: {
            0: "auto",
            1: "auto",
            2: -1,
            3: -1
        },
    }
    return dask_array.rechunk(ndim_to_chunks[dask_array.ndim])
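
A small usage sketch, assuming dask.array is imported as `da` and the helper above is in scope:

import dask.array as da

arr = da.zeros((50, 64, 64), chunks=(10, 16, 16))
rechunked = _rechunk(arr)
# The leading axis keeps "auto" chunking; the last two axes collapse to one chunk each.
print(rechunked.chunks[1], rechunked.chunks[2])  # (64,) (64,)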
Example no. 13
def _unchunk_ifneeded(data: da.Array, axis: int) -> da.Array:
    """Returns `data` unchunked along `axis`.

    Parameters
    ----------
    data : :class:`dask.array.Array`
        Data which may be chunked along `axis`.
    axis : :class:`int`
        Axis number which specifies the axis to unchunk.

    Returns
    -------
        data : :class:`dask.array.Array`
            A dask array which is not chunked along the specified axis.
    """
    if isinstance(data, da.Array):
        shape = data.shape
        chunksize = data.chunksize
        axis = _check_axis(axis, data.ndim)
        if shape[axis] != chunksize[axis]:
            data = data.rechunk({axis: -1})
        return data
    else:
        raise TypeError("data must be a dask array.")
Example no. 14
def _max_str_len(arr: Array) -> Array:
    return arr.map_blocks(lambda s: np.char.str_len(s.astype(str)),
                          dtype=np.int8).max()
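
A quick usage sketch, assuming the function above is in scope along with its numpy (np) and dask.array imports:

import numpy as np
import dask.array as da

arr = da.from_array(np.array(["a", "bb", "cccc"], dtype=object), chunks=2)
print(int(_max_str_len(arr)))  # 4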
Example no. 15
def array(self, futures, shape=None, chunks=None, dtype=None):
    """
    Turns a set of future arrays (result of a distributed operation),
    associated to a cartesian communicator, into a Dask Array.

    Parameters
    ----------
    cart: CartComm
        A cartesian communicator with dimensions equal to the number of chunks
    futures: tuple(futures)
        A set of future arrays associated to the cart
    dims_axes: tuple
        The axes associated to the dimensions of the cart
    shape: tuple(int)
        The shape of the array
    chunks: tuple(int)
        The chunks of the array
    dtype: tuple(int)
        The dtype of the array
    """
    if not len(futures) == len(self):
        raise ValueError("futures and cart must have the same length")

    if chunks is None or dtype is None:
        infos = self.client.map(lambda arr: (arr.dtype, arr.shape), futures)
        infos = tuple(_.result() for _ in infos)

        if dtype is None:
            dtype = infos[0][0]
        if not all((dtype == dtp for (dtp, _) in infos)):
            raise TypeError(
                f"Futures have different dtypes {[info[0] for info in infos]}")

        if chunks is None:
            chunks = infos[0][1]
            if not all((chunks == chn for (_, chn) in infos)):
                # TODO: normalize chunks using shape
                raise NotImplementedError(
                    "Futures with non-uniform chunks not supported yet")

    if shape is None:
        shape = list(chunks)
        for _i, _l in self.normalize_dims():
            shape[_i] *= _l

    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    self.check_dims(tuple(len(chunk) for chunk in chunks))

    dask = {}
    idxs, _ = zip(*self.normalize_dims())
    for coords, future in zip(self.normalize_coords(), futures):
        key = [0] * len(shape)
        for _i, _c in zip(idxs, coords):
            key[_i] = _c

        name = next(iter(futures)).key
        if isinstance(name, tuple):
            name = name[0]
        assert isinstance(name, str)
        key = (name, ) + tuple(key)
        dask[key] = future

    return Array(dask,
                 next(iter(dask.keys()))[0],
                 chunks,
                 dtype=dtype,
                 shape=shape)
Example no. 16
def _stage_2(
    YP: Array,
    X: Array,
    Y: Array,
    alphas: Optional[NDArray] = None,
    normalize: bool = True,
    _glow_adj_alpha: bool = False,
    _glow_adj_scaling: bool = False,
) -> Tuple[Array, Array]:
    """Stage 2 - WGR Meta Regression

    This stage will train separate ridge regression models for each outcome
    using the predictions from stage 1 for that same outcome as features. These
    predictions are then evaluated based on R2 score to determine an optimal
    "meta" estimator (see `_stage_1` for the "base" estimator description). Results
    then include only predictions and coefficients from this optimal model.

    For more details, see the level 1 regression model described in step 1
    of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert YP.ndim == 4
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0]
    assert YP.chunks[2] == X.chunks[0] == Y.chunks[0]
    # Assert single chunks for covariates and outcomes
    assert X.numblocks[1] == Y.numblocks[1] == 1
    # Extract shape statistics
    n_variant_block, n_alpha_1 = YP.shape[:2]
    n_sample_block = Y.numblocks[0]
    n_sample, n_outcome = Y.shape
    n_covar = X.shape[1]
    n_indvar = n_covar + n_variant_block * n_alpha_1
    sample_chunks = Y.chunks[0]

    if normalize:
        assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
        assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome)
        # See: https://github.com/projectglow/glow/issues/260
        if _glow_adj_scaling:
            YP = da.map_blocks(
                lambda x: (x - x.mean(axis=2, keepdims=True))
                / x.std(axis=2, keepdims=True),
                YP,
            )
        else:
            YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True)
    # Transpose for refit on level 1 predictions
    YP = YP.transpose((3, 2, 0, 1))
    assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1)

    if alphas is None:
        # See: https://github.com/projectglow/glow/issues/255
        if _glow_adj_alpha:
            alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome)
        else:
            alphas = get_alphas(n_variant_block * n_alpha_1)
    n_alpha_2 = alphas.size

    YR = []
    BR = []
    for i in range(n_outcome):
        # Slice and reshape to new 2D covariate matrix;
        # The order of raveling in trailing dimensions is important
        # and later reshapes will assume variants, alphas order
        XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1))
        # Prepend covariates and chunk along first dim only
        XPB = da.concatenate((X, XPB), axis=1)
        XPB = XPB.rechunk(chunks=(None, -1))
        assert_array_shape(XPB, n_sample, n_indvar)
        assert XPB.numblocks == (n_sample_block, 1)
        # Extract outcome vector
        YB = Y[:, [i]]
        assert XPB.ndim == YB.ndim == 2
        # Fit and predict folds for each parameter
        BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:]
        assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1)
        assert_array_shape(YPB, n_alpha_2, n_sample, 1)
        BR.append(BB)
        YR.append(YPB)

    # Concatenate predictions along outcome dimension
    YR = da.concatenate(YR, axis=2)
    assert_block_shape(YR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1)
    assert_array_shape(YR, n_alpha_2, n_sample, n_outcome)
    # Move samples to last dim so all others are batch
    # dims for R2 calculations
    YR = da.transpose(YR, (0, 2, 1))
    assert_array_shape(YR, n_alpha_2, n_outcome, n_sample)
    YR = YR.rechunk((-1, -1, None))
    assert_block_shape(YR, 1, 1, n_sample_block)
    assert YR.shape[1:] == Y.T.shape

    # Concatenate betas along outcome dimension
    BR = da.concatenate(BR, axis=2)
    assert_block_shape(BR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(BR, n_alpha_2, n_indvar, 1)
    assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome)

    # Compute R2 scores within each sample block for each outcome + alpha
    R2 = da.stack(
        [
            r2_score(YR.blocks[..., i], Y.T.blocks[..., i])
            # Avoid warnings on R2 calculations for blocks with single rows
            if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan)
            for i in range(n_sample_block)
        ]
    )
    assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome)
    # Coerce to finite or nan before nan-aware mean
    R2 = da.where(da.isfinite(R2), R2, np.nan)
    # Find highest mean alpha score for each outcome across blocks
    R2M = da.nanmean(R2, axis=0)
    assert_array_shape(R2M, n_alpha_2, n_outcome)
    # Identify index for the alpha value with the highest mean score
    R2I = da.argmax(R2M, axis=0)
    assert_array_shape(R2I, n_outcome)

    # Choose the predictions corresponding to the model with best score
    YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1)
    YRM = YRM.rechunk((None, -1))
    assert_block_shape(YRM, n_sample_block, 1)
    assert_chunk_shape(YRM, sample_chunks[0], n_outcome)
    assert_array_shape(YRM, n_sample, n_outcome)
    # Choose the betas corresponding to the model with the best score
    BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1)
    BRM = BRM.rechunk((None, -1))
    assert_block_shape(BRM, n_sample_block, 1)
    assert_chunk_shape(BRM, n_indvar, n_outcome)
    assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome)
    return BRM, YRM
Example no. 17
def _map_blocks_asnumpy(x: Array) -> Array:
    if da.utils.is_cupy_type(x._meta):  # pragma: no cover
        import cupy as cp  # type: ignore[import]

        x = x.map_blocks(cp.asnumpy)
    return x
Example no. 18
def est_motion_part(varr: darr.Array,
                    npart: int,
                    chunk_nfm: int,
                    alt_error=5,
                    **kwargs) -> Tuple[darr.Array, darr.Array]:
    """
    Construct dask graph for the recursive motion estimation algorithm.

    Parameters
    ----------
    varr : darr.Array
        Input dask array representing movie data.
    npart : int
        Number of frames/chunks to combine for the recursive algorithm.
    chunk_nfm : int
        Number of frames in each parallel task.
    alt_error : int, optional
        Error threshold between estimated shifts from two alternative methods,
        specified in pixels. By default `5`.

    Returns
    -------
    temps : darr.Array
        Registration template for the movie.
    shifts : darr.Array
        Estimated motion.

    See Also
    --------
    estimate_motion
    """
    if chunk_nfm is None:
        chunk_nfm = varr.chunksize[0]
    varr = varr.rechunk((chunk_nfm, None, None))
    arr_opt = fct.partial(custom_arr_optimize,
                          keep_patterns=["^est_motion_chunk"])
    if kwargs.get("mesh_size", None):
        param = get_bspline_param(varr[0].compute(), kwargs["mesh_size"])
    tmp_ls = []
    sh_ls = []
    for blk in varr.blocks:
        res = da.delayed(est_motion_chunk)(blk,
                                           None,
                                           alt_error=alt_error,
                                           npart=npart,
                                           **kwargs)
        if alt_error:
            tmp = darr.from_delayed(res[0],
                                    shape=(3, blk.shape[1], blk.shape[2]),
                                    dtype=blk.dtype)
        else:
            tmp = darr.from_delayed(res[0],
                                    shape=(blk.shape[1], blk.shape[2]),
                                    dtype=blk.dtype)
        if kwargs.get("mesh_size", None):
            sh = darr.from_delayed(
                res[1],
                shape=(blk.shape[0], 2, int(param[1]), int(param[0])),
                dtype=float,
            )
        else:
            sh = darr.from_delayed(res[1],
                                   shape=(blk.shape[0], 2),
                                   dtype=float)
        tmp_ls.append(tmp)
        sh_ls.append(sh)
    with da.config.set(array_optimize=arr_opt):
        temps = da.optimize(darr.stack(tmp_ls, axis=0))[0]
        shifts = da.optimize(darr.concatenate(sh_ls, axis=0))[0]
    while temps.shape[0] > 1:
        tmp_ls = []
        sh_ls = []
        for idx in np.arange(0, temps.numblocks[0], npart):
            tmps = temps.blocks[idx:idx + npart]
            sh_org = shifts.blocks[idx:idx + npart]
            sh_org_ls = [sh_org.blocks[i] for i in range(sh_org.numblocks[0])]
            res = da.delayed(est_motion_chunk)(tmps,
                                               sh_org_ls,
                                               alt_error=alt_error,
                                               npart=npart,
                                               **kwargs)
            if alt_error:
                tmp = darr.from_delayed(res[0],
                                        shape=(3, tmps.shape[1],
                                               tmps.shape[2]),
                                        dtype=tmps.dtype)
            else:
                tmp = darr.from_delayed(res[0],
                                        shape=(tmps.shape[1], tmps.shape[2]),
                                        dtype=tmps.dtype)
            sh_new = darr.from_delayed(res[1],
                                       shape=sh_org.shape,
                                       dtype=sh_org.dtype)
            tmp_ls.append(tmp)
            sh_ls.append(sh_new)
        temps = darr.stack(tmp_ls, axis=0)
        shifts = darr.concatenate(sh_ls, axis=0)
    return temps, shifts
Example no. 19
def _encode_dask_array(
    values: da.Array,
    uniques: Optional[np.ndarray] = None,
    encode: bool = False,
    onehot_dtype: Optional[np.dtype] = None,
):
    """One-hot or label encode a dask array.

    Parameters
    ----------
    values : da.Array, shape [n_samples,]
    uniques : np.ndarray, shape [n_uniques,]
    encode : bool, default False
        Whether to encode the values (True) or just discover the uniques.
    onehot_dtype : np.dtype, optional
        Optional dtype for the resulting one-hot encoded array. This changes
        the shape, dtype, and underlying storage of the returned dask array.

        ======= ================= =========================
        thing   onehot_dtype=None onehot_dtype=onehot_dtype
        ======= ================= =========================
        shape   (n_samples,)      (n_samples, len(uniques))
        dtype   np.intp           onehot_dtype
        storage np.ndarray        scipy.sparse.csr_matrix
        ======= ================= =========================

    Returns
    -------
    uniques : ndarray
        The discovered uniques (uniques=None) or just `uniques`
    encoded : da.Array, optional
        The encoded values. Only returned when ``encode=True``.
    """

    if uniques is None:
        if encode and onehot_dtype:
            raise ValueError(
                "Cannot use 'encode` and 'onehot_dtype' simultaneously.")
        if encode:
            uniques, encoded = da.unique(values, return_inverse=True)
            return uniques, encoded
        else:
            return da.unique(values)

    if encode:
        if onehot_dtype:
            dtype = onehot_dtype
            new_axis: Optional[int] = 1
            chunks = values.chunks + (len(uniques), )
        else:
            dtype = np.dtype("int")
            new_axis = None
            chunks = values.chunks

        return (
            uniques,
            values.map_blocks(
                _check_and_search_block,
                uniques,
                onehot_dtype=onehot_dtype,
                dtype=dtype,
                new_axis=new_axis,
                chunks=chunks,
            ),
        )
    else:
        return uniques
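
When the uniques are not yet known and encode=True, the work reduces to da.unique with return_inverse; a minimal sketch:

import numpy as np
import dask.array as da

values = da.from_array(np.array(["b", "a", "b", "c"]), chunks=2)
uniques, encoded = da.unique(values, return_inverse=True)
print(uniques.compute(), encoded.compute())  # ['a' 'b' 'c'] [1 0 1 2]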
Example no. 20
def dask_hist1d(
    a: Array, bins=None, range=None, normed=False, weights=None, density=None
):
    """
    Blocked variant of :func:`numpy.histogram`, but using the fast-histogram module.

    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the bin edges, or the number of bins.
        In the latter case a ``range`` argument is also required, since
        computing ``min`` and ``max`` over blocked arrays is an expensive
        operation that must be performed explicitly.
        If `bins` is an int, it defines the number of equal-width
        bins in the given range (10, by default). If `bins` is a
        sequence, it defines a monotonically increasing array of bin edges,
        including the rightmost edge, allowing for non-uniform bin widths.
    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, range
        is simply ``(a.min(), a.max())``.  Values outside the range are
        ignored. The first element of the range must be less than or
        equal to the second. `range` affects the automatic bin
        computation as well. While bin width is computed to be optimal
        based on the actual data within `range`, the bin count will fill
        the entire range including portions containing no data.
    normed : bool, optional
        This is equivalent to the ``density`` argument, but produces incorrect
        results for unequal bin widths. It should not be used.
    weights : array_like, optional
        A dask.array.Array of weights, of the same block structure as ``a``.  Each value in
        ``a`` only contributes its associated weight towards the bin count
        (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unity
        width are chosen; it is not a probability *mass* function.
        Overrides the ``normed`` keyword if given.
        If ``density`` is True, ``bins`` cannot be a single-number delayed
        value. It must be a concrete number, or a (possibly-delayed)
        array/sequence of the bin edges.

    Returns
    -------
    hist : dask Array
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : dask Array of dtype float
        Return the bin edges ``(length(hist)+1)``.


    Examples
    --------
    Using number of bins and range:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.arange(10000), chunks=10)
    >>> h, bins = da.histogram(x, bins=10, range=[0, 10000])
    >>> bins
    array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
            8000.,  9000., 10000.])
    >>> h.compute()
    array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

    Explicitly specifying the bins:

    >>> h, bins = da.histogram(x, bins=np.array([0, 5000, 10000]))
    >>> bins
    array([    0,  5000, 10000])
    >>> h.compute()
    array([5000, 5000])
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
        # ^ `np.ndim` is not implemented by Dask array.
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    for argname, val in [("bins", bins), ("range", range), ("weights", weights)]:
        if not isinstance(val, (Array, Delayed)) and is_dask_collection(val):
            raise TypeError(
                "Dask types besides Array and Delayed are not supported "
                "for `histogram`. For argument `{}`, got: {!r}".format(argname, val)
            )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(
                    f"range must be a sequence or array of length 2, but got {len(range)} items"
                )
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(
                f"Expected a sequence or array for range, not {range}"
            ) from None

    token = tokenize(a, bins, range, weights, density)
    name = "histogram-sum-" + token

    if scalar_bins:
        bins = _linspace_from_delayed(range[0], range[1], bins + 1)
        # ^ NOTE `range[1]` is safe because of the above check, and the initial check
        # that range must not be None if `scalar_bins`
    else:
        if not isinstance(bins, (Array, np.ndarray)):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(
                f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}"
            )

    (bins_ref, range_ref), deps = unpack_collections([bins, range])

    # Map the histogram to all bins, forming a 2D array of histograms, stacked for each chunk
    if weights is None:
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref)
            for i, k in enumerate(flatten(a.__dask_keys__()))
        }
        dtype = np.histogram([])[0].dtype
    else:
        a_keys = flatten(a.__dask_keys__())
        w_keys = flatten(weights.__dask_keys__())
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref, w)
            for i, (k, w) in enumerate(zip(a_keys, w_keys))
        }
        dtype = weights.dtype

    deps = (a,) + deps
    if weights is not None:
        deps += (weights,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    # Turn graph into a 2D Array of shape (nchunks, nbins)
    nchunks = len(list(flatten(a.__dask_keys__())))
    nbins = bins.size - 1  # since `bins` is 1D
    chunks = ((1,) * nchunks, (nbins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # We need to replicate normed and density options from numpy
    if density is not None:
        if density:
            db = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            return n / db / n.sum(), bins
        else:
            return n, bins
    else:
        return n, bins
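
The per-block helper `_block_fast_hist1d` is referenced but not shown above. A plausible stand-in using the fast_histogram package, purely for illustration (the actual helper may differ):

import numpy as np
from fast_histogram import histogram1d

def _block_fast_hist1d(x, bins, range=None, weights=None):
    # `bins` arrives here as the concrete array of bin edges; fast-histogram
    # instead wants a bin count and a (min, max) range, and assumes
    # uniform-width bins. The `range` argument is unused because the edges
    # already encode it.
    counts = histogram1d(x, bins=len(bins) - 1,
                         range=(bins[0], bins[-1]), weights=weights)
    # Reshape to (1, nbins) so the per-chunk results stack along axis 0
    # and can be summed by the caller.
    return counts.reshape(1, -1)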