def transform(self, X, y=None):
    kwargs = self.kw_args if self.kw_args else {}

    # Dispatch on input type: dask collections stay lazy, while pandas and
    # NumPy inputs are transformed eagerly.
    if isinstance(X, da.Array):
        if self.validate:
            X = check_array(X, accept_dask_array=True, accept_unknown_chunks=True)
        XP = X.map_blocks(self.func, dtype=X.dtype, chunks=X.chunks, **kwargs)
    elif isinstance(X, dd.DataFrame):
        if self.validate:
            X = check_array(X, accept_dask_dataframe=True, preserve_pandas_dataframe=True)
        XP = X.map_partitions(self.func, **kwargs)
    elif isinstance(X, pd.DataFrame):
        if self.validate:
            X = check_array(X, accept_dask_array=False, preserve_pandas_dataframe=True)
        XP = self.func(X, **kwargs)
    else:
        if self.validate:
            X = check_array(X, accept_dask_array=False)
        XP = self.func(X, **kwargs)

    return XP
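# A minimal usage sketch of the dask-array branch above. Assumption: the
# function being applied is elementwise, so `np.log1p` stands in for
# `self.func`; `map_blocks` then applies it per block while keeping the
# chunk structure intact.
import dask.array as da
import numpy as np

X = da.random.uniform(size=(100, 4), chunks=(25, 4))
XP = X.map_blocks(np.log1p, dtype=X.dtype, chunks=X.chunks)
assert XP.chunks == X.chunks  # elementwise functions preserve block shapes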
def transform(self, X, y=None):
    if isinstance(X, da.Array):
        n_cols = len(self._transformer.get_feature_names())
        X = check_array(X, accept_multiple_blocks=False, accept_unknown_chunks=True)
        chunks = (X.chunks[0], n_cols)
        XP = X.map_blocks(self._transformer.transform, dtype=X.dtype, chunks=chunks)
    elif isinstance(X, pd.DataFrame):
        XP = X.pipe(self._transformer.transform)
        if self.preserve_dataframe:
            columns = self._transformer.get_feature_names(X.columns)
            XP = pd.DataFrame(data=XP, columns=columns, index=X.index)
    elif isinstance(X, dd.DataFrame):
        XP = X.map_partitions(self._transformer.transform)
        if self.preserve_dataframe:
            columns = self._transformer.get_feature_names(X.columns)
            XP = dd.from_dask_array(XP, columns, X.index)
    else:
        # typically X is an instance of np.ndarray
        XP = self._transformer.transform(X)

    return XP
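# Hedged sketch of the chunk bookkeeping in the dask-array branch above,
# assuming scikit-learn's PolynomialFeatures as the wrapped
# `self._transformer` (the original may wrap a different fitted transformer,
# and newer scikit-learn spells the method `get_feature_names_out`): since the
# output column count is known from the fitted transformer, the new
# (rows, n_output_features) chunk structure can be declared up front.
import dask.array as da
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

est = PolynomialFeatures(degree=2).fit(np.ones((1, 3)))
n_cols = len(est.get_feature_names_out())  # 10 columns for 3 features, degree 2
X = da.random.uniform(size=(40, 3), chunks=(10, 3))
XP = X.map_blocks(est.transform, dtype=X.dtype, chunks=(X.chunks[0], n_cols))
assert XP.shape == (40, n_cols)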
def _fit(self, X, handle_unknown="error"):
    X = check_array(
        X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
    )
    _, n_features = X.shape

    if self.categories != "auto":
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet supported")
        if len(self.categories) != n_features:
            raise ValueError(
                "Shape mismatch: if n_values is an array,"
                " it has to be of shape (n_features,)."
            )

    self.categories_ = []
    self.dtypes_ = []

    for i in range(n_features):
        Xi = X[:, i]
        if self.categories == "auto":
            cats = _encode(Xi)
        else:
            cats = np.array(self.categories[i], dtype=X.dtype)
        self.categories_.append(cats)
        self.dtypes_.append(None)

    # Materialize all per-column category arrays in a single pass.
    self.categories_ = dask.compute(self.categories_)[0]
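# A minimal sketch of the per-column category discovery above, assuming plain
# `da.unique` in place of the internal `_encode` helper: uniques are built
# lazily per column, then materialized with one `dask.compute` call, which is
# why `self.categories_` is computed once rather than column by column.
import dask
import dask.array as da
import numpy as np

X = da.from_array(np.array([[0, 2], [1, 2], [0, 3]]), chunks=(2, 2))
categories = [da.unique(X[:, i]) for i in range(X.shape[1])]
(categories,) = dask.compute(categories)  # single pass over the data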
def check_array(
    array,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    estimator=None,
    distributed=False,
    chunks="16MB",
):
    # Validate eagerly, then optionally re-wrap the result as a dask array.
    _X = dmlu.check_array(
        array,
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )
    if distributed:
        return da.from_array(_X, chunks=chunks)
    return _X
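# Hypothetical usage of the wrapper above, assuming it is in scope:
# `distributed=` and `chunks=` are this wrapper's own additions on top of the
# underlying check_array keywords, switching between an eager ndarray result
# and a lazily chunked dask array.
import numpy as np

X = np.random.uniform(size=(1000, 4))
X_da = check_array(X, distributed=True, chunks=(250, 4))  # 4-block dask array
X_np = check_array(X, distributed=False)                  # plain ndarray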
def split(self, X, y=None, groups=None):
    X = check_array(X)

    for i in range(self.n_splits):
        if self.blockwise:
            yield self._split_blockwise(X)
        else:
            yield self._split(X)
def split(self, X, y=None, groups=None):
    X = check_array(X)
    rng = check_random_state(self.random_state)

    for i in range(self.n_splits):
        seeds = draw_seed(rng, 0, 2 ** 32 - 1, size=len(X.chunks[0]), dtype="uint")
        if self.blockwise:
            yield self._split_blockwise(X, seeds)
        else:
            yield self._split(X)
def split(self, X, y=None, groups=None):
    X = check_array(X, ensure_2d=False, allow_nd=True)
    rng = check_random_state(self.random_state)

    for i in range(self.n_splits):
        seeds = draw_seed(rng, 0, _I4MAX, size=len(X.chunks[0]), dtype="uint")
        if self.blockwise:
            yield self._split_blockwise(X, seeds)
        else:
            yield self._split(X)
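# Sketch of the per-chunk seeding pattern above, using NumPy directly in place
# of the internal `draw_seed` helper and assuming `_I4MAX` is the signed
# 32-bit maximum: one seed is drawn per block along axis 0 so each block can
# be shuffled reproducibly and independently.
import dask.array as da
import numpy as np

X = da.random.uniform(size=(100,), chunks=25)
rng = np.random.RandomState(0)
seeds = rng.randint(0, 2 ** 31 - 1, size=len(X.chunks[0]), dtype="uint32")
assert len(seeds) == 4  # one seed per block along axis 0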
def split(self, X, y=None, groups=None):
    X = check_array(X)
    n_samples = X.shape[0]
    n_splits = self.n_splits

    # Distribute samples as evenly as possible: the first (n_samples %
    # n_splits) folds absorb one extra sample each.
    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[: n_samples % n_splits] += 1
    chunks = X.chunks[0]

    seeds = [None] * len(chunks)
    if self.shuffle:
        rng = check_random_state(self.random_state)
        seeds = draw_seed(rng, 0, _I4MAX, size=len(chunks), dtype="uint")

    test_current = 0
    for fold_size in fold_sizes:
        test_start, test_stop = test_current, test_current + fold_size
        yield self._split(test_start, test_stop, n_samples, chunks, seeds)
        test_current = test_stop
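# Worked sketch of the fold-size arithmetic above: with 10 samples and 3
# splits, each fold gets 10 // 3 == 3 samples, the first 10 % 3 == 1 fold
# absorbs the remainder, and the test ranges are the contiguous intervals
# [0, 4), [4, 7), [7, 10).
import numpy as np

n_samples, n_splits = 10, 3
fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
fold_sizes[: n_samples % n_splits] += 1
assert fold_sizes.tolist() == [4, 3, 3]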
def _transform_new(self, X):
    X = check_array(
        X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
    )
    _, n_features = X.shape

    # We encode each column independently, as they have different categories.
    Xs = [
        _encode_dask_array(
            X[:, i],
            uniques=self.categories_[i],
            encode=True,
            onehot_dtype=self.dtype,
        )[1]
        for i in range(n_features)
    ]
    X = da.concatenate(Xs, axis=1)

    if not self.sparse:
        X = X.map_blocks(lambda x: x.toarray(), dtype=self.dtype)

    return X.rechunk({1: X.shape[1]})
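# Minimal stand-in for the per-column one-hot expansion above, assuming a
# simple `np.eye` lookup per block instead of the internal `_encode_dask_array`
# helper: each block is expanded independently with the new column count
# declared up front, and a final rechunk keeps all output columns in a single
# block, mirroring the `rechunk({1: X.shape[1]})` call.
import dask.array as da
import numpy as np

X = da.from_array(np.array([[0], [1], [0], [2]]), chunks=(2, 1))
n_categories = 3
onehot = X.map_blocks(
    lambda b: np.eye(n_categories)[b[:, 0]],
    dtype="f8",
    chunks=(X.chunks[0], n_categories),
)
onehot = onehot.rechunk({1: n_categories})
assert onehot.shape == (4, 3)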
def _check_array(self, X, *args, **kwargs):
    X = check_array(X, accept_dask_dataframe=True, **kwargs)
    return X
def test_check_array_1d():
    arr = da.random.uniform(size=(10,), chunks=5)
    check_array(arr, ensure_2d=False)
def test_check_array_raises():
    X = da.random.uniform(size=(10, 5), chunks=2)
    with pytest.raises(TypeError) as m:
        check_array(X)
    assert m.match("Chunking is only allowed on the first axis.")
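# Companion sketch to the failing case above, assuming the same `check_array`
# used in the test: rechunking so the array is split only along the first axis
# is the usual fix, since column-wise chunking is what triggers the TypeError.
import dask.array as da

X = da.random.uniform(size=(10, 5), chunks=2)
X_ok = X.rechunk({0: 2, 1: 5})  # one block along axis 1
check_array(X_ok)               # no longer raises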
def _check_array(
    self, X: Union[ArrayLike, DataFrameType], *args: Any, **kwargs: Any
) -> Union[ArrayLike, DataFrameType]:
    X = check_array(X, accept_dask_dataframe=True, **kwargs)
    return X