Example 1
    def _transform_col(self, X_col: da.Array, quantiles: ArrayLike,
                       inverse: bool) -> ArrayLike:
        output_distribution = self.output_distribution

        if not inverse:
            lower_bound_x = quantiles[0]
            upper_bound_x = quantiles[-1]
            lower_bound_y = 0
            upper_bound_y = 1
        else:
            lower_bound_x = 0
            upper_bound_x = 1
            lower_bound_y = quantiles[0]
            upper_bound_y = quantiles[-1]
            # for the inverse transform, first map the data back to a
            # uniform distribution
            if output_distribution == "normal":
                X_col = X_col.map_blocks(stats.norm.cdf)
                # else the output distribution is already uniform

        if output_distribution == "normal":
            lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x
            upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x
        if output_distribution == "uniform":
            lower_bounds_idx = X_col == lower_bound_x
            upper_bounds_idx = X_col == upper_bound_x

        if not inverse:
            # See the note in scikit-learn: interpolating forwards and
            # backwards and averaging the two avoids mapping repeated
            # extreme values inconsistently
            X_col = 0.5 * (
                X_col.map_blocks(np.interp, quantiles, self.references_) -
                (-X_col).map_blocks(np.interp, -quantiles[::-1],
                                    -self.references_[::-1]))
        else:
            X_col = X_col.map_blocks(np.interp, self.references_, quantiles)

        X_col[upper_bounds_idx] = upper_bound_y
        X_col[lower_bounds_idx] = lower_bound_y

        if not inverse:
            if output_distribution == "normal":
                X_col = X_col.map_blocks(stats.norm.ppf)
                # find the value to clip the data to avoid mapping to
                # infinity. Clip such that the inverse transform will be
                # consistent
                clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
                clip_max = stats.norm.ppf(1 -
                                          (BOUNDS_THRESHOLD - np.spacing(1)))
                X_col = da.clip(X_col, clip_min, clip_max)

            # else the output distribution is uniform and the ppf is the
            # identity function, so we leave X_col unchanged

        return X_col
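
A minimal usage sketch for the method above, assuming it is the
_transform_col of dask_ml.preprocessing.QuantileTransformer (the public
entry points would then be fit_transform and inverse_transform):

import dask.array as da
from dask_ml.preprocessing import QuantileTransformer

X = da.random.random((1000, 2), chunks=(250, 2))  # row-chunked features

qt = QuantileTransformer(output_distribution="normal")
Xt = qt.fit_transform(X)           # each column flows through _transform_col
X_back = qt.inverse_transform(Xt)  # exercises the inverse=True branch

print(Xt.shape, X_back.shape)  # (1000, 2) (1000, 2); both stay lazy dask arrays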
Example 2
def pairwise_distances(X: da.Array,
                       Y: ArrayLike,
                       metric: Union[str, Callable[[ArrayLike, ArrayLike],
                                                   float]] = "euclidean",
                       n_jobs: Optional[int] = None,
                       **kwargs: Any) -> da.Array:
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    # ``n_jobs`` is accepted for API compatibility but not used here.
    # Each output block keeps X's row chunks and gets a single column
    # chunk of width len(Y).
    chunks = (X.chunks[0], (len(Y), ))
    return X.map_blocks(metrics.pairwise_distances,
                        Y,
                        dtype=float,
                        chunks=chunks,
                        metric=metric,
                        **kwargs)
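
A short usage sketch, assuming this is dask_ml.metrics.pairwise_distances
(X is a dask array while Y must fit in memory):

import dask.array as da
import numpy as np
from dask_ml.metrics import pairwise_distances

X = da.random.random((1000, 3), chunks=(250, 3))  # dask-backed samples
Y = np.random.random((5, 3))                      # in-memory reference rows

D = pairwise_distances(X, Y, metric="euclidean")
print(D.shape)   # (1000, 5)
print(D.chunks)  # ((250, 250, 250, 250), (5,)): row chunks inherited from X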
Example 3
def _max_str_len(arr: Array) -> Array:
    # Longest string length in ``arr``, computed lazily block by block.
    # Note: ``dtype=np.int8`` assumes no string exceeds 127 characters.
    return arr.map_blocks(lambda s: np.char.str_len(s.astype(str)),
                          dtype=np.int8).max()
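
A quick check of the helper above (assuming _max_str_len is in scope; the
result is a zero-dimensional dask array until computed):

import dask.array as da
import numpy as np

arr = da.from_array(np.array(["a", "bb", "cccc"], dtype=object), chunks=2)
print(_max_str_len(arr).compute())  # 4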
Example 4
def _map_blocks_asnumpy(x: Array) -> Array:
    # If the chunks are CuPy arrays, copy each one back to host memory so
    # downstream code can assume NumPy-backed blocks; otherwise a no-op.
    if da.utils.is_cupy_type(x._meta):  # pragma: no cover
        import cupy as cp  # type: ignore[import]

        x = x.map_blocks(cp.asnumpy)
    return x
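
A sketch of both branches, assuming _map_blocks_asnumpy is in scope. On a
NumPy-backed array the function is a no-op; the CuPy branch needs a GPU:

import dask.array as da

x = da.ones((4,), chunks=2)    # NumPy-backed chunks
y = _map_blocks_asnumpy(x)     # meta is already numpy, so x is returned as-is
print(y is x)                  # True

# With CuPy-backed chunks, e.g. da.from_array(cupy.ones(4)), each block
# would instead be copied to host memory via cupy.asnumpy.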
Example 5
def _encode_dask_array(
    values: da.Array,
    uniques: Optional[np.ndarray] = None,
    encode: bool = False,
    onehot_dtype: Optional[np.dtype] = None,
):
    """One-hot or label encode a dask array.

    Parameters
    ----------
    values : da.Array, shape [n_samples,]
    uniques : np.ndarray, shape [n_uniques,]
    encode : bool, default False
        Whether to encode the values (True) or just discover the uniques.
    onehot_dtype : np.dtype, optional
        Optional dtype for the resulting one-hot encoded array. This changes
        the shape, dtype, and underlying storage of the returned dask array.

        ======= ================= =========================
        attr    onehot_dtype=None onehot_dtype specified
        ======= ================= =========================
        shape   (n_samples,)      (n_samples, len(uniques))
        dtype   np.intp           onehot_dtype
        storage np.ndarray        scipy.sparse.csr_matrix
        ======= ================= =========================

    Returns
    -------
    uniques : ndarray
        The discovered uniques (when ``uniques=None``) or the passed-in
        ``uniques``.
    encoded : da.Array, optional
        The encoded values. Only returned when ``encode=True``.
    """

    if uniques is None:
        if encode and onehot_dtype:
            raise ValueError(
                "Cannot use 'encode' and 'onehot_dtype' simultaneously.")
        if encode:
            uniques, encoded = da.unique(values, return_inverse=True)
            return uniques, encoded
        else:
            return da.unique(values)

    if encode:
        if onehot_dtype:
            dtype = onehot_dtype
            new_axis: Optional[int] = 1
            chunks = values.chunks + (len(uniques), )
        else:
            dtype = np.dtype("int")
            new_axis = None
            chunks = values.chunks

        return (
            uniques,
            values.map_blocks(
                _check_and_search_block,
                uniques,
                onehot_dtype=onehot_dtype,
                dtype=dtype,
                new_axis=new_axis,
                chunks=chunks,
            ),
        )
    else:
        return uniques
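
A usage sketch of the discovery-plus-encode path, assuming
_encode_dask_array is in scope (this path only needs da.unique, not the
_check_and_search_block helper):

import dask.array as da
import numpy as np

values = da.from_array(np.array(["a", "b", "a", "c"]), chunks=2)

uniques, encoded = _encode_dask_array(values, encode=True)
print(uniques.compute())  # ['a' 'b' 'c']
print(encoded.compute())  # [0 1 0 2]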