def _transform_col(
    self, X_col: da.Array, quantiles: ArrayLike, inverse: bool
) -> ArrayLike:
    output_distribution = self.output_distribution

    if not inverse:
        lower_bound_x = quantiles[0]
        upper_bound_x = quantiles[-1]
        lower_bound_y = 0
        upper_bound_y = 1
    else:
        lower_bound_x = 0
        upper_bound_x = 1
        lower_bound_y = quantiles[0]
        upper_bound_y = quantiles[-1]
        # for inverse transform, match a uniform distribution
        if output_distribution == "normal":
            X_col = X_col.map_blocks(stats.norm.cdf)
            # else output distribution is already a uniform distribution

    if output_distribution == "normal":
        lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x
        upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x
    if output_distribution == "uniform":
        lower_bounds_idx = X_col == lower_bound_x
        upper_bounds_idx = X_col == upper_bound_x

    if not inverse:
        # See the note in scikit-learn. This trick is to avoid
        # repeated extreme values
        X_col = 0.5 * (
            X_col.map_blocks(np.interp, quantiles, self.references_)
            - (-X_col).map_blocks(
                np.interp, -quantiles[::-1], -self.references_[::-1]
            )
        )
    else:
        X_col = X_col.map_blocks(np.interp, self.references_, quantiles)

    X_col[upper_bounds_idx] = upper_bound_y
    X_col[lower_bounds_idx] = lower_bound_y

    if not inverse:
        if output_distribution == "normal":
            X_col = X_col.map_blocks(stats.norm.ppf)
            # find the value to clip the data to avoid mapping to
            # infinity. Clip such that the inverse transform will be
            # consistent
            clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
            clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))
            X_col = da.clip(X_col, clip_min, clip_max)
        # else output distribution is uniform and the ppf is the
        # identity function so we leave X_col unchanged

    return X_col
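# Usage sketch (added for illustration, not part of the original module):
# assuming this method backs dask_ml.preprocessing.QuantileTransformer, the
# hypothetical helper below shows the forward and inverse paths that exercise
# the `inverse` flag handled above.
def _example_quantile_transformer():
    import dask.array as da
    from dask_ml.preprocessing import QuantileTransformer

    X = da.random.random((1000, 3), chunks=(250, 3))
    qt = QuantileTransformer(n_quantiles=100, output_distribution="normal")
    Xt = qt.fit_transform(X)           # forward path: maps each column onto N(0, 1)
    X_back = qt.inverse_transform(Xt)  # inverse path: maps back to the original scale
    return Xt, X_back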
def pairwise_distances(
    X: da.Array,
    Y: ArrayLike,
    metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
    n_jobs: Optional[int] = None,
    **kwargs: Any,
):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y),))
    return X.map_blocks(
        metrics.pairwise_distances,
        Y,
        dtype=float,
        chunks=chunks,
        metric=metric,
        **kwargs,
    )
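# Usage sketch (added for illustration): assuming this is the
# dask_ml.metrics.pairwise_distances wrapper, Y must be an in-memory ndarray,
# and the result keeps X's row chunking with a single block of columns.
def _example_pairwise_distances():
    import dask.array as da
    import numpy as np
    from dask_ml.metrics import pairwise_distances

    X = da.random.random((1000, 4), chunks=(250, 4))
    Y = np.random.random((5, 4))
    D = pairwise_distances(X, Y, metric="euclidean")
    return D.compute()  # ndarray of shape (1000, 5)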
def _max_str_len(arr: Array) -> Array:
    # Cast each block to str, measure per-element string lengths, and reduce
    # to the global maximum as a lazy scalar.
    return arr.map_blocks(
        lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
    ).max()
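# Usage sketch (added for illustration): the hypothetical call below feeds the
# helper a small object-dtype dask array; each block is cast to str, measured
# with np.char.str_len, and reduced to a lazy scalar maximum.
def _example_max_str_len():
    import dask.array as da
    import numpy as np

    arr = da.from_array(np.array(["a", "bc", "def"], dtype=object), chunks=2)
    return int(_max_str_len(arr).compute())  # 3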
def _map_blocks_asnumpy(x: Array) -> Array:
    if da.utils.is_cupy_type(x._meta):  # pragma: no cover
        import cupy as cp  # type: ignore[import]

        x = x.map_blocks(cp.asnumpy)
    return x
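# Usage sketch (added for illustration): with a NumPy-backed array the helper
# is a no-op; with a CuPy-backed array (e.g. da.from_array(cp.ones(10))) every
# chunk would instead be moved to host memory via cp.asnumpy.
def _example_map_blocks_asnumpy():
    import dask.array as da

    x = da.ones((10,), chunks=5)  # NumPy-backed, so returned unchanged
    return _map_blocks_asnumpy(x)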
def _encode_dask_array(
    values: da.Array,
    uniques: Optional[np.ndarray] = None,
    encode: bool = False,
    onehot_dtype: Optional[np.dtype] = None,
):
    """One-hot or label encode a dask array.

    Parameters
    ----------
    values : da.Array, shape [n_samples,]
    uniques : np.ndarray, shape [n_uniques,]
    encode : bool, default False
        Whether to encode the values (True) or just discover the uniques.
    onehot_dtype : np.dtype, optional
        Optional dtype for the resulting one-hot encoded array. This changes
        the shape, dtype, and underlying storage of the returned dask array.

        ======= ================= =========================
        thing   onehot_dtype=None onehot_dtype=onehot_dtype
        ======= ================= =========================
        shape   (n_samples,)      (n_samples, len(uniques))
        dtype   np.intp           onehot_dtype
        storage np.ndarray        scipy.sparse.csr_matrix
        ======= ================= =========================

    Returns
    -------
    uniques : ndarray
        The discovered uniques (uniques=None) or just `uniques`
    encoded : da.Array, optional
        The encoded values. Only returned when ``encode=True``.
    """
    if uniques is None:
        if encode and onehot_dtype:
            raise ValueError(
                "Cannot use 'encode' and 'onehot_dtype' simultaneously."
            )
        if encode:
            uniques, encoded = da.unique(values, return_inverse=True)
            return uniques, encoded
        else:
            return da.unique(values)

    if encode:
        if onehot_dtype:
            dtype = onehot_dtype
            new_axis: Optional[int] = 1
            chunks = values.chunks + (len(uniques),)
        else:
            dtype = np.dtype("int")
            new_axis = None
            chunks = values.chunks
        return (
            uniques,
            values.map_blocks(
                _check_and_search_block,
                uniques,
                onehot_dtype=onehot_dtype,
                dtype=dtype,
                new_axis=new_axis,
                chunks=chunks,
            ),
        )
    else:
        return uniques
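# Usage sketch (added for illustration): the hypothetical helper below
# exercises only the discovery + label-encoding path (uniques=None,
# encode=True), which relies on da.unique alone; the one-hot path additionally
# needs the module-level _check_and_search_block helper referenced above.
def _example_encode_dask_array():
    import dask.array as da
    import numpy as np

    values = da.from_array(np.array(["a", "b", "a", "c"]), chunks=2)
    uniques, encoded = _encode_dask_array(values, encode=True)
    return uniques.compute(), encoded.compute()  # (['a' 'b' 'c'], [0 1 0 2])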