Esempio n. 1
0
    def transform(self, X, y=None):
        """Apply ``self.func`` to ``X``, dispatching on the container type.

        Dask arrays are processed block by block, dask dataframes partition
        by partition, and anything else is handed to ``self.func`` directly.
        When ``self.validate`` is truthy, ``X`` is first passed through
        ``check_array`` with options matching its container type.

        Parameters
        ----------
        X : dask array, dask dataframe, pandas dataframe or other array-like
            Data to transform.
        y : ignored
            Not used by this method.

        Returns
        -------
        The result of applying ``self.func`` (with ``self.kw_args`` as
        keyword arguments) to ``X``.
        """
        extra = self.kw_args or {}

        if isinstance(X, da.Array):
            if self.validate:
                X = check_array(
                    X, accept_dask_array=True, accept_unknown_chunks=True
                )
            # The output is declared to keep the input's dtype and chunking.
            return X.map_blocks(
                self.func, dtype=X.dtype, chunks=X.chunks, **extra
            )

        if isinstance(X, dd.DataFrame):
            if self.validate:
                X = check_array(
                    X,
                    accept_dask_dataframe=True,
                    preserve_pandas_dataframe=True,
                )
            return X.map_partitions(self.func, **extra)

        # In-memory inputs: pandas frames and everything else differ only in
        # the validation options, the function call itself is the same.
        if self.validate:
            validation = {"accept_dask_array": False}
            if isinstance(X, pd.DataFrame):
                validation["preserve_pandas_dataframe"] = True
            X = check_array(X, **validation)
        return self.func(X, **extra)
Esempio n. 2
0
    def transform(self, X, y=None):
        """Run the wrapped transformer on ``X``, dispatching on its type.

        Parameters
        ----------
        X : dask array, pandas dataframe, dask dataframe or other array-like
            Data to transform.
        y : ignored
            Not used by this method.

        Returns
        -------
        The transformed data. For pandas and dask dataframes the result is
        wrapped back into a dataframe when ``self.preserve_dataframe`` is
        truthy, using the wrapped transformer's feature names as columns.
        """
        inner = self._transformer

        if isinstance(X, da.Array):
            # The output width is taken from the transformer's feature names.
            n_cols = len(inner.get_feature_names())
            X = check_array(
                X, accept_multiple_blocks=False, accept_unknown_chunks=True
            )
            out_chunks = (X.chunks[0], n_cols)
            return X.map_blocks(
                inner.transform, dtype=X.dtype, chunks=out_chunks
            )

        if isinstance(X, pd.DataFrame):
            result = X.pipe(inner.transform)
            if self.preserve_dataframe:
                names = inner.get_feature_names(X.columns)
                result = pd.DataFrame(
                    data=result, columns=names, index=X.index
                )
            return result

        if isinstance(X, dd.DataFrame):
            result = X.map_partitions(inner.transform)
            if self.preserve_dataframe:
                names = inner.get_feature_names(X.columns)
                result = dd.from_dask_array(result, names, X.index)
            return result

        # Fallback: typically X is an instance of np.ndarray.
        return inner.transform(X)
Esempio n. 3
0
    def _fit(self, X, handle_unknown="error"):
        """Determine the categories of every feature column of ``X``.

        Populates ``self.categories_`` (one entry per column) and
        ``self.dtypes_`` (currently ``None`` for every column).

        Parameters
        ----------
        X : array-like or dataframe
            Input data; validated with ``check_array`` and expected to be
            two-dimensional (its shape is unpacked into two values).
        handle_unknown : str, default="error"
            Accepted for interface compatibility; not used by this method.

        Raises
        ------
        ValueError
            If ``self.categories`` is given explicitly and any entry is not
            sorted, or if the number of entries does not match the number
            of features in ``X``.
        """
        X = check_array(X,
                        accept_dask_dataframe=True,
                        dtype=None,
                        preserve_pandas_dataframe=True)

        _, n_features = X.shape

        if self.categories != "auto":
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    # The trailing space matters: adjacent literals are
                    # concatenated without any separator.
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")
            if len(self.categories) != n_features:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self.categories_ = []
        self.dtypes_ = []

        for i in range(n_features):
            Xi = X[:, i]
            if self.categories == "auto":
                cats = _encode(Xi)
            else:
                cats = np.array(self.categories[i], dtype=X.dtype)
            self.categories_.append(cats)
            self.dtypes_.append(None)

        # Evaluate all collected per-column categories in a single
        # ``dask.compute`` call.
        self.categories_ = dask.compute(self.categories_)[0]
Esempio n. 4
0
def check_array(array,
                accept_sparse=False,
                *,
                accept_large_sparse=True,
                dtype="numeric",
                order=None,
                copy=False,
                force_all_finite=True,
                ensure_2d=True,
                allow_nd=False,
                ensure_min_samples=1,
                ensure_min_features=1,
                estimator=None,
                distributed=False,
                chunks="16MB"):
    """Validate ``array`` and optionally wrap the result in a dask array.

    All validation options are forwarded unchanged to ``dmlu.check_array``.

    Parameters
    ----------
    array : object
        Input to validate.
    accept_sparse, accept_large_sparse, dtype, order, copy, \
force_all_finite, ensure_2d, allow_nd, ensure_min_samples, \
ensure_min_features, estimator
        Passed through to ``dmlu.check_array``.
    distributed : bool, default=False
        When truthy, the validated array is converted with
        ``da.from_array``.
    chunks : default="16MB"
        Chunk specification handed to ``da.from_array``; only used when
        ``distributed`` is truthy.

    Returns
    -------
    The validated array, or a dask array built from it when
    ``distributed`` is truthy.
    """
    validation_options = dict(
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )
    checked = dmlu.check_array(array, **validation_options)

    if not distributed:
        return checked
    return da.from_array(checked, chunks=chunks)
Esempio n. 5
0
    def split(self, X, y=None, groups=None):
        """Yield ``self.n_splits`` splits of ``X``.

        ``X`` is validated once with ``check_array``. Each split is produced
        by ``self._split_blockwise`` when ``self.blockwise`` is truthy and
        by ``self._split`` otherwise. ``y`` and ``groups`` are not used.
        """
        X = check_array(X)

        for _ in range(self.n_splits):
            # The strategy is looked up on every iteration, as the flag is
            # read lazily while the generator is consumed.
            make_split = (
                self._split_blockwise if self.blockwise else self._split
            )
            yield make_split(X)
Esempio n. 6
0
 def split(self, X, y=None, groups=None):
     """Yield ``self.n_splits`` splits of ``X``, drawing seeds per split.

     A random state is built from ``self.random_state``. For every split,
     one seed per entry of ``X.chunks[0]`` is drawn; the seeds are passed
     to ``self._split_blockwise`` when ``self.blockwise`` is truthy, while
     ``self._split`` is called without them. ``y`` and ``groups`` are not
     used.
     """
     X = check_array(X)
     rng = check_random_state(self.random_state)
     for _ in range(self.n_splits):
         # Seeds are drawn on every iteration, whichever branch is taken.
         n_blocks = len(X.chunks[0])
         seeds = draw_seed(rng, 0, 2 ** 32 - 1, size=n_blocks, dtype="uint")
         if not self.blockwise:
             yield self._split(X)
         else:
             yield self._split_blockwise(X, seeds)
Esempio n. 7
0
 def split(self, X, y=None, groups=None):
     """Yield ``self.n_splits`` splits of ``X``, drawing seeds per split.

     ``X`` is validated with ``ensure_2d=False`` and ``allow_nd=True``. For
     every split, one seed in the range bounded by ``0`` and ``_I4MAX`` is
     requested per entry of ``X.chunks[0]``. The seeds are only forwarded
     to ``self._split_blockwise``; ``self._split`` receives ``X`` alone.
     ``y`` and ``groups`` are not used.
     """
     X = check_array(X, ensure_2d=False, allow_nd=True)
     rng = check_random_state(self.random_state)
     for _ in range(self.n_splits):
         block_count = len(X.chunks[0])
         seeds = draw_seed(rng, 0, _I4MAX, size=block_count, dtype="uint")
         yield (
             self._split_blockwise(X, seeds)
             if self.blockwise
             else self._split(X)
         )
Esempio n. 8
0
    def split(self, X, y=None, groups=None):
        """Yield one split per fold over consecutive ranges of samples.

        The ``n_samples`` rows of ``X`` are divided into ``self.n_splits``
        contiguous folds whose sizes differ by at most one: the first
        ``n_samples % n_splits`` folds receive one extra sample. For each
        fold, ``self._split`` is called with the fold's start and stop
        positions, the sample count, the chunk sizes along the first axis
        and the per-chunk seeds.

        Parameters
        ----------
        X : array-like
            Data to split; validated with ``check_array`` and expected to
            expose ``chunks``.
        y, groups : ignored
            Not used by this method.

        Yields
        ------
        The value returned by ``self._split`` for each fold.
        """
        X = check_array(X)
        n_samples = X.shape[0]
        n_splits = self.n_splits
        # Builtin ``int`` replaces the ``np.int`` alias, which was removed
        # from NumPy; the resulting dtype is the same.
        fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1

        chunks = X.chunks[0]
        # Without shuffling every chunk gets a ``None`` seed.
        seeds = [None] * len(chunks)
        if self.shuffle:
            rng = check_random_state(self.random_state)
            seeds = draw_seed(rng, 0, _I4MAX, size=len(chunks), dtype="uint")

        test_current = 0
        for fold_size in fold_sizes:
            test_start, test_stop = test_current, test_current + fold_size
            yield self._split(test_start, test_stop, n_samples, chunks, seeds)
            test_current = test_stop
Esempio n. 9
0
    def _transform_new(self, X):
        """Encode every column of ``X`` and join the results column-wise.

        ``X`` is validated with ``check_array`` and must be two-dimensional.
        Column ``i`` is encoded with ``_encode_dask_array`` against
        ``self.categories_[i]``; the encoded pieces are concatenated along
        axis 1. When ``self.sparse`` is falsy, each block is converted with
        ``toarray()``. The result is rechunked so that axis 1 forms a
        single chunk.
        """
        X = check_array(X,
                        accept_dask_dataframe=True,
                        dtype=None,
                        preserve_pandas_dataframe=True)

        _, n_features = X.shape

        # Columns are encoded one at a time because each has its own set of
        # categories.
        encoded_columns = []
        for col in range(n_features):
            outcome = _encode_dask_array(
                X[:, col],
                uniques=self.categories_[col],
                encode=True,
                onehot_dtype=self.dtype,
            )
            # Only the second element of the helper's result is kept.
            encoded_columns.append(outcome[1])

        stacked = da.concatenate(encoded_columns, axis=1)

        if not self.sparse:
            stacked = stacked.map_blocks(lambda x: x.toarray(),
                                         dtype=self.dtype)

        total_columns = stacked.shape[1]
        return stacked.rechunk({1: total_columns})
Esempio n. 10
0
 def _check_array(self, X, *args, **kwargs):
     """Validate ``X`` via ``check_array``, allowing dask dataframes.

     Keyword arguments are forwarded to ``check_array``; positional
     ``*args`` are accepted but not forwarded.
     """
     return check_array(X, accept_dask_dataframe=True, **kwargs)
Esempio n. 11
0
def test_check_array_1d():
    """``check_array`` should not raise on a 1-D input when
    ``ensure_2d=False``."""
    one_dimensional = da.random.uniform(size=(10,), chunks=5)
    check_array(one_dimensional, ensure_2d=False)
Esempio n. 12
0
def test_check_array_raises():
    """``check_array`` should raise ``TypeError`` with the chunking message
    for a ``(10, 5)`` array created with ``chunks=2``."""
    chunked = da.random.uniform(size=(10, 5), chunks=2)
    with pytest.raises(TypeError) as excinfo:
        check_array(chunked)

    assert excinfo.match("Chunking is only allowed on the first axis.")
Esempio n. 13
0
 def _check_array(self, X: Union[ArrayLike, DataFrameType], *args: Any,
                  **kwargs: Any) -> Union[ArrayLike, DataFrameType]:
     """Validate ``X`` via ``check_array``, allowing dask dataframes.

     Keyword arguments are forwarded to ``check_array``; positional
     ``*args`` are accepted but not forwarded.
     """
     return check_array(X, accept_dask_dataframe=True, **kwargs)