Ejemplo n.º 1
0
    def fit(self, X: XSeries, y: XSeries) -> None:
        """Fit one mean encoder per fold, using only that fold's training part.

        The fitted encoders are stored, in split order, in
        ``self.mean_encoders_``.

        Args:
            X : Feature values; a ``cudf.Series`` (when cuDF is available) or
                a ``numpy.ndarray``.
            y : Target values, indexed in step with ``X``.

        Raises:
            RuntimeError: If ``X`` is neither of the supported types.
        """
        # TODO(smly): warn to use fit_transform instead of fit().
        # transform() is recommended for encoding test set.
        if cudf_is_available() and isinstance(X, cudf.Series):
            encoder_cls = _CuPy_MeanEncoder
        elif isinstance(X, np.ndarray):
            X = column_or_1d(X, warn=True)
            y = column_or_1d(y, warn=True)
            encoder_cls = _MeanEncoder
        else:
            raise RuntimeError(
                "Unexpected data type: {}".format(type(X)))

        self.mean_encoders_ = []

        # The input type was validated above, so the encoder class is chosen
        # once. Only the training indices of each split are needed for fitting.
        for trn_idx, _ in self.fold.split(X):
            encoder = encoder_cls()
            encoder.fit(X[trn_idx], y[trn_idx])
            self.mean_encoders_.append(encoder)
def test_concat_combination(dataframes):
    """Check ConcatCombination output columns and concatenated values.

    Exercises the default settings, ``output_suffix=""`` together with
    ``drop_origin=True``, and three-way combinations (``r=3``) for every
    frame in ``dataframes``.
    """

    def column_as_list(frame, name):
        # cuDF columns are read back through Arrow; others via tolist().
        if cudf_is_available() and isinstance(frame, cudf.DataFrame):
            return frame[name].to_arrow().to_pylist()
        return frame[name].tolist()

    # Default: original columns are kept and "_combi" is appended.
    for frame in dataframes:
        encoded = ConcatCombination().fit_transform(frame)
        expected_columns = [
            "col1",
            "col2",
            "col3",
            "col1col2_combi",
            "col1col3_combi",
            "col2col3_combi",
        ]
        assert encoded.columns.tolist() == expected_columns
        assert column_as_list(encoded, "col1col3_combi") == ["aX", "bY"]

    # No suffix, original columns dropped.
    for frame in dataframes:
        combiner = ConcatCombination(output_suffix="", drop_origin=True)
        encoded = combiner.fit_transform(frame)
        expected_columns = ["col1col2", "col1col3", "col2col3"]
        assert encoded.columns.tolist() == expected_columns
        assert column_as_list(encoded, "col2col3") == ["@X", "%Y"]

    # Three-way combination.
    for frame in dataframes:
        combiner = ConcatCombination(
            output_suffix="", drop_origin=True, r=3)
        encoded = combiner.fit_transform(frame)
        assert encoded.columns.tolist() == ["col1col2col3"]
        assert column_as_list(encoded, "col1col2col3") == ["a@X", "b%Y"]
Ejemplo n.º 3
0
    def transform(self, X):
        """Replace each value of ``X`` with its entry from the fitted lookup table.

        Values are looked up in ``self.classes_`` and mapped to the second
        column of ``self.lut_``. Positions where ``np.isnan`` is true receive
        ``self._default_missing``.

        Args:
            X: 1-d values; a ``cudf.Series`` is first converted with
                ``to_array()``.
        Returns:
            The array ``X`` after the in-place replacements below.
        """
        # Requires a prior fit: ``classes_`` must exist on the instance.
        check_is_fitted(self, "classes_")
        if cudf_is_available() and isinstance(X, cudf.Series):
            X = X.to_array()
        X = column_or_1d(X, warn=True)

        # Label encoding if necessary
        if self._label_encoding_uniques is not None:
            X = self._label_encoding_uniques.get_indexer(pd.Series(X))

        # missing: NaN entries. unseen: not in classes_ XOR missing.
        missing_mask = np.isnan(X)
        encode_mask = np.invert(missing_mask)
        unseen_mask = np.bitwise_xor(np.isin(X, self.classes_, invert=True),
                                     missing_mask)

        # NOTE(review): presumably the largest class is a sentinel reserved
        # for unseen values -- confirm against the matching fit().
        # NOTE(review): X is written in place from here on; if column_or_1d
        # returned a view, the caller's array is modified too -- confirm.
        X[unseen_mask] = np.max(self.classes_)
        indices = np.searchsorted(self.classes_, X[encode_mask])

        # Map class position -> row in lut_ -> value stored in lut_ column 1.
        X[encode_mask] = np.take(
            self.lut_[:, 1],
            np.take(np.searchsorted(self.lut_[:, 0], self.classes_), indices),
        )

        if np.any(missing_mask):
            X[missing_mask] = self._default_missing

        return X
Ejemplo n.º 4
0
    def transform(self, X: XSeries) -> XSeries:
        """Encode ``X`` as the mean over all per-fold encoders.

        Each encoder in ``self.mean_encoders_`` transforms the whole of ``X``;
        the results are stacked column-wise and averaged per row.

        Args:
            X : Values to encode.
        Returns:
            Any : Row-wise mean of the per-fold encodings.
        """
        check_is_fitted(self, "mean_encoders_")

        # Encoding for testing part. Different result from `fit_transform()`
        # result.
        on_gpu = cudf_is_available() and isinstance(X, cudf.Series)
        fold_count = self.fold.get_n_splits()
        make_zeros = cupy.zeros if on_gpu else np.zeros
        per_fold = make_zeros((X.shape[0], fold_count))
        for column, fold_encoder in enumerate(self.mean_encoders_):
            per_fold[:, column] = fold_encoder.transform(X)
        return np.mean(per_fold, axis=1)
Ejemplo n.º 5
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        For every selected column, the values are replaced by their position
        in ``self._uniques[col]`` and written to
        ``output_prefix + col + output_suffix`` on a copy of the input.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        new_df = input_df.copy()

        input_cols = self._input_cols
        if not input_cols:
            input_cols = new_df.columns.tolist()

        if self._exclude_cols:
            # Build a new list instead of calling ``remove`` on it:
            # ``input_cols`` may be ``self._input_cols`` itself, and mutating
            # it would make a second ``transform`` call fail with ValueError.
            input_cols = [
                col for col in input_cols if col not in self._exclude_cols
            ]

        for col in input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            if cudf_is_available() and isinstance(new_df, cudf.DataFrame):
                X = self._uniques[col].get_indexer(new_df[col].to_array())
            else:
                X = self._uniques[col].get_indexer(new_df[col])
            if self._unseen == "n_unique":
                # Values that are not missing and not among the fitted
                # uniques get the code ``len(uniques)``.
                missing_values = new_df[col].isna()
                unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
                unseen_mask = np.bitwise_xor(missing_values, unseen_values)
                X[unseen_mask] = len(self._uniques[col])

            new_df[out_col] = X

        return new_df
Ejemplo n.º 6
0
    def fit_transform(self, X: XSeries, y: XSeries) -> XNDArray:
        """Fit the per-fold encoders, then encode ``X`` out-of-fold.

        Every row is encoded by the encoder of the fold in which that row
        belongs to the held-out part.

        Args:
            X : Values to encode.
            y : Target values passed on to ``fit``.
        Returns:
            XNDArray : Out-of-fold encoding, one entry per row of ``X``.
        """
        self.fit(X, y)
        check_is_fitted(self, "mean_encoders_")

        # Encoding for training data.
        if cudf_is_available() and isinstance(X, cudf.Series):
            encoded = cupy.zeros(X.shape[0])
        elif isinstance(X, np.ndarray):
            encoded = np.zeros(X.shape[0])
        else:
            raise RuntimeError

        for fold_no, (_, holdout_idx) in enumerate(self.fold.split(X)):
            fold_encoder = self.mean_encoders_[fold_no]
            encoded[holdout_idx] = fold_encoder.transform(X[holdout_idx])
        return encoded
Ejemplo n.º 7
0
def test_internal_target_encoder_with_cudf():
    """Check ``_TargetEncoder`` fit_transform/transform results on cuDF input.

    NOTE(review): the guard below returns whenever ``cudf`` or ``cupy`` is
    not None, and also whenever cuDF is unavailable, so the body appears
    never to execute. The intended condition was presumably
    ``cudf is None or cupy is None``. Confirm the body still matches the
    current API before enabling it (e.g. a Series built from a 2-d array
    and ``X[:, 0]`` indexing).
    """
    if not cudf_is_available() or cudf is not None or cupy is not None:
        # Skip test.
        return

    X = cudf.Series(
        np.array([[2, 2], [2, 4], [2, 6], [8, 7], [8, 8], [8, 9], [8, 10]]))
    y = cudf.Series(np.array([1, 1, 0, 1, 1, 1, 0]))

    # Without shuffling, the first split holds out the first four rows.
    fold = KFold(n_splits=2, shuffle=False)
    trn_idx, tst_idx = next(fold.split(X))
    assert np.array_equal(tst_idx, np.array([0, 1, 2, 3]))

    encoder = _TargetEncoder(fold=fold)

    # Test `fit_transform()`.
    y_trn = encoder.fit_transform(X[:, 0], y)
    assert np.allclose(y_trn.values,
                       np.array([
                           0.0,
                           0.0,
                           0.0,
                           0.66666667,
                           1.0,
                           1.0,
                           1.0,
                       ]))

    # Test `transform()` on new values.
    X_tst = np.array([8, 0, 2])
    y_tst = encoder.transform(X_tst)
    assert np.allclose(
        y_tst.values,
        np.array([0.83333334, 0., 0.33333334]),
    )
Ejemplo n.º 8
0
def test_select_numerical_cudf(pandas_dataframe):
    """SelectNumerical keeps only the "num" column of a cuDF frame.

    Does nothing when cuDF is unavailable.
    """
    if cudf_is_available():
        gpu_frame = cudf.from_pandas(pandas_dataframe)
        selected = SelectNumerical().fit_transform(gpu_frame)
        assert selected.columns.tolist() == ["num"]
Ejemplo n.º 9
0
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    frames = [pd.DataFrame({"var1": [1, 2, 3]})]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
Ejemplo n.º 10
0
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    host_frame = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1, 2, 3],
        "c": [1, 2, 3],
        "d": [1, 2, 9],
    })
    frames = [host_frame]
    if cudf_is_available():
        frames.append(cudf.from_pandas(host_frame))
    return frames
Ejemplo n.º 11
0
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    host_frame = pd.DataFrame({"col": ["a", "a", "b"]})
    frames = [host_frame]
    if cudf_is_available():
        frames.append(cudf.from_pandas(host_frame))
    return frames
Ejemplo n.º 12
0
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    host_frame = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": ["a", "a", "a", "b", "b"],
        "c": [0, 0, 1, 1, 1],
    })
    frames = [host_frame]
    if cudf_is_available():
        frames.append(cudf.from_pandas(host_frame))
    return frames
Ejemplo n.º 13
0
def dataframes():
    """Return wide test frames: a "target" column plus col0..col99.

    The pandas frame comes first, followed by a cuDF copy when available.
    """
    host_frame = pd.DataFrame({"target": [1, 0, 0]})
    for idx in range(100):
        name = "col{}".format(idx)
        host_frame.loc[:, name] = np.array([1, 2, 3])

    frames = [host_frame]
    if cudf_is_available():
        frames.append(cudf.from_pandas(host_frame))
    return frames
Ejemplo n.º 14
0
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    columns = {
        "col": ["A", "B", "B"],
        "num": [1, 2, 3],
    }
    host_frame = pd.DataFrame(columns)
    frames = [host_frame]
    if cudf_is_available():
        frames.append(cudf.from_pandas(host_frame))
    return frames
Ejemplo n.º 15
0
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    columns = {
        "col1": ["2", "2", "2", "8", "8", "8", "8"],
        "col2": [2, 4, 6, 7, 8, 9, 10],
        "target": [1, 1, 0, 1, 1, 1, 0],
    }
    host_frame = pd.DataFrame(columns)
    frames = [host_frame]
    if cudf_is_available():
        frames.append(cudf.from_pandas(host_frame))
    return frames
Ejemplo n.º 16
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Apply the configured function to each selected column.

        The work is done on a pandas frame (cuDF input is converted and the
        result converted back). Each result is written to
        ``output_prefix + col + output_suffix``; missing values are filled
        with ``self._fillna`` first when it is not None.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame; only the generated columns when
                ``self._drop_origin`` is set.
        Raises:
            RuntimeError: If ``input_df`` is neither a pandas nor a cuDF frame.
        """
        if isinstance(input_df, pd.DataFrame):
            frame = input_df.copy()
        elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            frame = input_df.to_pandas()
        else:
            raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

        source_cols = self._input_cols
        if not source_cols:
            source_cols = frame.columns.tolist()
        if len(self._exclude_cols) > 0:
            source_cols = [
                name for name in source_cols if name not in self._exclude_cols
            ]

        produced = []
        for name in source_cols:
            target = self._output_prefix + name + self._output_suffix
            series = frame[name]
            if self._fillna is not None:
                series = series.fillna(self._fillna)
            frame[target] = series.apply(self._lambda_func)
            produced.append(target)

        # Hand back the same frame flavour that was passed in.
        if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            frame = cudf.from_pandas(frame)

        return frame[produced] if self._drop_origin else frame
def dataframes():
    """Return the test frames: pandas first, then a cuDF copy if available."""
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4, 5],
        "col2": [2, 3, 4, 5, 6],
        "col3": [3, 4, 5, 6, 7],
    })

    if cudf_is_available():
        df_cuda = cudf.from_pandas(df)
        return [df, df_cuda]
    return [df]
Ejemplo n.º 18
0
def dataframes_targetencoder():
    """Return (train, test) frame pairs for the target-encoder tests.

    The pandas pair comes first, followed by a cuDF pair when available.
    """
    train = pd.DataFrame({
        "col1": [2, 2, 2, 8, 8, 8, 8],
        "col2": [2, 4, 6, 7, 8, 9, 10],
        "target": [1, 1, 0, 1, 1, 1, 0],
    })
    test = pd.DataFrame({
        "col1": [2, 8],
        "col2": [2, 8],
    })

    pairs = [(train, test)]
    if cudf_is_available():
        train_cuda = cudf.from_pandas(train)
        test_cuda = cudf.from_pandas(test)
        pairs.append((train_cuda, test_cuda))
    return pairs
def test_arithmetic_combinations(dataframes):
    """Check pairwise "+" combinations: column names and one summed column."""
    expected_columns = [
        "col1",
        "col2",
        "col3",
        "col1col2_plus",
        "col1col3_plus",
        "col2col3_plus",
    ]
    expected_sums = [5, 7, 9, 11, 13]

    for frame in dataframes:
        combiner = ArithmeticCombinations(operator="+", output_suffix="_plus")
        combined = combiner.fit_transform(frame)

        assert combined.columns.tolist() == expected_columns

        # cuDF columns are read back through Arrow; others via tolist().
        summed = combined["col2col3_plus"]
        if cudf_is_available() and isinstance(combined, cudf.DataFrame):
            actual = summed.to_arrow().to_pylist()
        else:
            actual = summed.tolist()
        assert actual == expected_sums
Ejemplo n.º 20
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Keep the first of each group of columns with identical values.

        The kept column names are stored in ``self._selected_cols``.
        Duplicates are found on a pandas view of the data; the returned
        frame is selected from the original ``input_df``.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            host_df = input_df.to_pandas()
        else:
            host_df = input_df

        # Transposing turns columns into rows, so drop_duplicates compares
        # whole columns.
        unique_rows = host_df.T.drop_duplicates(keep="first")
        self._selected_cols = unique_rows.index.values.tolist()
        return input_df[self._selected_cols]
Ejemplo n.º 21
0
    def fit(self, X, y=None):
        """Learn the distinct values of ``X`` and how often each occurs.

        Sets ``classes_``, ``counts_`` and the two-column lookup table
        ``lut_`` (class, count). One extra class, ``max(classes_) + 1``, is
        appended with the count ``self._default_unseen``.

        Args:
            X: 1-d values; a ``cudf.Series`` is first converted with
                ``to_array()``.
            y: Not used by this method.
        Returns:
            self
        """
        if cudf_is_available() and isinstance(X, cudf.Series):
            X = X.to_array()
        X = column_or_1d(X, warn=True)

        # Reset first so that refitting on integer data does not keep the
        # uniques learned from an earlier, label-encoded fit.
        self._label_encoding_uniques = None

        # Label encoding if necessary
        if not np.can_cast(X.dtype, np.int64):
            X, uniques = pd.Series(X).factorize()
            self._label_encoding_uniques = uniques

        self.classes_, self.counts_ = np.unique(X[np.isfinite(X)],
                                                return_counts=True)

        self.classes_ = np.append(self.classes_, [np.max(self.classes_) + 1])
        self.counts_ = np.append(self.counts_, [self._default_unseen])
        self.lut_ = np.hstack(
            [self.classes_.reshape(-1, 1),
             self.counts_.reshape(-1, 1)])
        return self
Ejemplo n.º 22
0
    def fit(self, X: CSeries, y: CSeries):
        """Compute the mean of ``y`` for every distinct value of ``X``.

        Sets ``classes_``, ``class_means_`` and the two-column lookup table
        ``lut_`` (class, mean). One extra class, ``max(classes_) + 1``, is
        appended with the mean ``self.default_unseen_``.

        Args:
            X: Input values. When the dtype cannot be cast to an integer,
                they are factorized on the host and the uniques are kept in
                ``self._label_encoding_uniques``.
            y: Target values; must be a ``cudf.Series`` (asserted below).
        """
        # Label encoding if necessary
        # NOTE(review): `cupy.int` looks like an alias of the builtin int
        # that may be missing in newer CuPy releases -- confirm.
        if not cupy.can_cast(X.dtype, cupy.int):
            if cudf_is_available() and isinstance(X, cudf.Series):
                X = X.to_array()
            X, uniques = pd.Series(cupy.asnumpy(X)).factorize()
            X = cudf.Series(X)
            self._label_encoding_uniques = uniques

        # `counts` is not used further in this method.
        self.classes_, counts = cupy.unique(X, return_counts=True)
        self.class_means_ = cupy.zeros_like(self.classes_, dtype="float64")

        assert isinstance(y, cudf.Series)
        # Group-wise mean of y per X value, brought back as a pandas frame.
        df = cudf.DataFrame()
        df.insert(0, "X", X)
        df.insert(0, "y", y.values)
        agg = df.groupby("X").agg("mean").to_pandas()

        # Copy each group's mean into the slot of its class.
        for idx, uniq_value in enumerate(self.classes_):
            uniq_value = cupy.asnumpy(uniq_value).item()
            mean_value = agg.loc[uniq_value]["y"]
            self.class_means_[idx] = mean_value

        # Append the extra class (max + 1) and its default mean.
        self.classes_ = cupy.array(
            np.append(cupy.asnumpy(self.classes_),
                      [cupy.asnumpy(cupy.max(self.classes_)) + 1]))
        self.class_means_ = cupy.array(
            np.append(cupy.asnumpy(self.class_means_),
                      [cupy.asnumpy(self.default_unseen_)]))

        self.lut_ = cupy.hstack(
            [self.classes_.reshape(-1, 1),
             self.class_means_.reshape(-1, 1)])
Ejemplo n.º 23
0
def test_cudf_is_available():
    """cudf_is_available() is True exactly when the cudf module is not None."""
    expected = cudf is not None
    assert cudf_is_available() is expected
Ejemplo n.º 24
0
def test_internal_cupy_mean_encoder_fit_transform():
    """Check ``_CuPy_MeanEncoder`` on seen, unseen and missing values.

    NOTE(review): the guard below returns whenever ``cudf`` or ``cupy`` is
    not None, and also whenever cuDF is unavailable, so the body appears
    never to execute. The intended condition was presumably
    ``cudf is None or cupy is None``. Confirm the body still matches the
    encoder's current API before enabling it.
    """
    if not cudf_is_available() or cudf is not None or cupy is not None:
        # Skip test.
        return

    X = np.array([[2, 2], [2, 4], [2, 6], [8, 7], [8, 8], [8, 9], [8, 10]])
    y = np.array([1, 1, 0, 0, 1, 1, 0])

    X = cupy.asarray(X)
    y = cupy.asarray(y)

    col_idx = 0
    encoder = _CuPy_MeanEncoder()
    y_mean = encoder.fit_transform(X[:, col_idx], y)

    assert np.array_equal(
        cupy.asnumpy(encoder.classes_),
        np.array([2, 8, 9]),  # 9 (max + 1) is assigned for unseen values.
    )
    assert cupy.allclose(
        encoder.class_means_,
        cupy.array([
            0.66666667,  # 2 = 2/3
            0.5,  # 8 = 2/4
            0.0,  # 9 = extra class for unseen values
        ])
    )
    assert cupy.allclose(
        y_mean,
        cupy.array([
            0.66666667,
            0.66666667,
            0.66666667,
            0.5,
            0.5,
            0.5,
            0.5,
        ]))

    # Unseen values
    col_idx = 0
    X_test = cupy.array([9, 1, 8, 2])
    y_mean = encoder.transform(X_test)

    assert cupy.allclose(
        y_mean,
        cupy.array([
            0.0,  # 9 = recognized as seen value since (max+1) is assigned for unseen value.
            0.0,  # 1 = unseen value
            0.5,  # 8 = 2/4
            0.66666667,  # 2 = 2/3
        ]),
    )

    # Missing value
    col_idx = 0
    X_test = cupy.array([[cupy.nan, 2], [1, 1], [8, 4]])
    y_mean = encoder.transform(X_test[:, col_idx])

    assert cupy.allclose(
        y_mean,
        cupy.array([
            0.0,  # NaN = missing value
            0.0,  # 1 = unseen value
            0.5,  # 8 = 2/4
        ]
                   ),
    )