Ejemplo n.º 1
0
def test_targetencoder_multi_column():
    """
    Encode two categorical columns jointly and compare with fixed answers.

    The same expected arrays are checked for both usage patterns:
    `fit_transform` on the training frame, and a separate `fit` followed
    by `transform`.
    """
    feature_cols = ['cat_1', 'cat_2']
    train = cudf.DataFrame({
        'cat_1': ['a', 'b', 'b', 'a', 'a', 'b'],
        'cat_2': [1, 1, 2, 2, 1, 2],
        'label': [1, 0, 1, 1, 0, 1]
    })
    test = cudf.DataFrame({
        'cat_1': ['b', 'b', 'a', 'b'],
        'cat_2': [1, 2, 1, 2]
    })
    expected_train = np.array([2. / 3, 2. / 3, 1., 2. / 3, 2. / 3, 1.])
    expected_test = np.array([0., 1., 0.5, 1.])

    # Pattern 1: fit and encode the training frame in one call.
    enc = TargetEncoder()
    got_train = enc.fit_transform(train[feature_cols], train.label)
    got_test = enc.transform(test[feature_cols])
    assert array_equal(got_train, expected_train)
    assert array_equal(got_test, expected_test)

    # Pattern 2: fit first, then encode both frames with `transform`.
    enc = TargetEncoder()
    enc.fit(train[feature_cols], train.label)
    got_train = enc.transform(train[feature_cols])
    got_test = enc.transform(test[feature_cols])
    assert array_equal(got_train, expected_train)
    assert array_equal(got_test, expected_test)
Ejemplo n.º 2
0
def test_get_params():
    """
    Check that `get_params` returns each constructor argument that was
    passed in, with its value unchanged.
    """
    expected = {
        'n_folds': 5,
        'smooth': 1,
        'seed': 49,
        'split_method': 'customize'
    }
    reported = TargetEncoder(**expected).get_params()
    for name in expected:
        assert expected[name] == reported[name]
Ejemplo n.º 3
0
def test_targetencoder_cupy():
    """
    Encode cupy arrays and check that the output is a cupy array too.

    The test input contains the values 3 and 4, which never occur in the
    training input.
    """
    features = cp.array([1, 2, 2, 1])
    targets = cp.array([1, 0, 1, 1])
    queries = cp.array([1, 2, 3, 4])
    expected = np.array([1., 0.5, 0.75, 0.75])

    enc = TargetEncoder()
    # Only the fitted state is needed here; the encoded training values
    # are discarded.
    enc.fit_transform(features, targets)
    encoded = enc.transform(queries)

    assert array_equal(encoded, expected)
    print(type(encoded))
    assert isinstance(encoded, cp.ndarray)
Ejemplo n.º 4
0
def test_one_category():
    """
    Encode data whose training column holds a single category ('a').

    The test column mixes that category with values absent from the
    training data ('b', 'c', 'd').
    """
    train = cudf.DataFrame({
        'category': ['a', 'a', 'a', 'a'],
        'label': [3, 0, 0, 3]
    })
    test = cudf.DataFrame({'category': ['c', 'b', 'a', 'd']})
    expected_train = np.array([1., 2., 2., 1.])
    expected_test = np.array([1.5, 1.5, 1.5, 1.5])

    enc = TargetEncoder()
    got_train = enc.fit_transform(train.category, train.label)
    assert array_equal(got_train, expected_train)

    got_test = enc.transform(test.category)
    assert array_equal(got_test, expected_test)
Ejemplo n.º 5
0
def test_targetencoder_transform():
    """
    Check `transform` on a test column after fitting with either
    `fit_transform` or `fit`; both must give the same expected values.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    test = cudf.DataFrame({'category': ['b', 'b', 'a', 'b']})
    expected = np.array([0.5, 0.5, 1., 0.5])

    # Fitted through `fit_transform` (its return value is not checked).
    enc = TargetEncoder()
    enc.fit_transform(train.category, train.label)
    assert array_equal(enc.transform(test.category), expected)

    # Fitted through `fit` alone.
    enc = TargetEncoder()
    enc.fit(train.category, train.label)
    assert array_equal(enc.transform(test.category), expected)
Ejemplo n.º 6
0
def test_targetencoder_pandas():
    """
    Encode pandas columns and check that the output is a numpy array.

    The test column contains 'c' and 'd', which never occur in the
    training column.
    """
    train = pandas.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    test = pandas.DataFrame({'category': ['c', 'b', 'a', 'd']})
    expected = np.array([0.75, 0.5, 1., 0.75])

    enc = TargetEncoder()
    # Only the fitted state is needed; the encoded training values are
    # discarded.
    enc.fit_transform(train.category, train.label)
    encoded = enc.transform(test.category)

    assert array_equal(encoded, expected)
    print(type(encoded))
    assert isinstance(encoded, np.ndarray)
Ejemplo n.º 7
0
def test_targetencoder_random(n_samples, dtype):
    """
    Compare `transform` on random data against a groupby-mean reference.

    The reference is the per-category mean of the training labels, joined
    onto the test values; test values with no match receive the overall
    mean of the training labels.
    """
    # Draw order (train x, train y, test x) is kept fixed because all
    # three come from the same cupy random stream.
    train_x = cp.random.randint(0, 1000, n_samples).astype(dtype)
    train_y = cp.random.randint(0, 2, n_samples).astype(dtype)
    test_x = cp.random.randint(0, 1000, n_samples).astype(dtype)

    enc = TargetEncoder()
    enc.fit_transform(train_x, train_y)
    encoded = enc.transform(test_x)

    category_means = cudf.DataFrame({'x': train_x, 'y': train_y}).groupby(
        'x', as_index=False).agg({'y': 'mean'})

    reference = cudf.DataFrame({'x': test_x})
    # Remember the original row order so it can be restored after the merge.
    reference['row_id'] = cp.arange(len(reference))
    reference = reference.merge(category_means, on='x', how='left')
    reference = reference.sort_values('row_id')

    global_mean = cp.mean(train_y).item()
    expected = reference['y'].fillna(global_mean).values
    assert array_equal(encoded, expected)
Ejemplo n.º 8
0
def test_targetencoder_var():
    """
    Check the encoded training values when the encoder is built with
    `stat='var'`, for both `fit_transform` and `fit` + `transform`.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'b'],
        'label': [1, 0, 1, 1]
    })
    expected = np.array([.25, 0., .5, .5])

    enc = TargetEncoder(stat='var')
    encoded = enc.fit_transform(train.category, train.label)
    assert array_equal(encoded, expected)

    enc = TargetEncoder(stat='var')
    enc.fit(train.category, train.label)
    encoded = enc.transform(train.category)
    assert array_equal(encoded, expected)
Ejemplo n.º 9
0
def test_targetencoder_smooth():
    """
    Check the encoded training values for several `smooth` settings.

    Each setting is exercised through `fit_transform` and through
    `fit` + `transform`; both must match the same expected row.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    # (smooth value, expected encoding of the four training rows)
    cases = [
        (0, np.array([1., 1., 0., 1.])),
        (1, np.array([0.875, 0.875, 0.375, 0.875])),
        (2, np.array([0.8333, 0.8333, 0.5, 0.8333])),
        (10000, np.array([0.75, 0.75, 0.75, 0.75])),
    ]
    for smooth, expected in cases:
        enc = TargetEncoder(smooth=smooth)
        encoded = enc.fit_transform(train.category, train.label)
        assert array_equal(encoded, expected)

        enc = TargetEncoder(smooth=smooth)
        enc.fit(train.category, train.label)
        encoded = enc.transform(train.category)
        assert array_equal(encoded, expected)
Ejemplo n.º 10
0
def test_transform_with_index():
    """
    Check `transform` on a frame built with a custom, unsorted index.

    The column is passed once as a Series and once as a single-column
    DataFrame; both must give the same expected values.
    """
    frame = cudf.DataFrame(
        {
            "a": [1, 1, 2, 3],
            "b": [True, False, False, True]
        },
        index=[9, 4, 5, 3],
    )
    expected = cp.asarray([0, 1, 0.5, 0.5])

    enc = TargetEncoder()
    enc.fit(frame.a, y=frame.b)

    # Series input.
    encoded = enc.transform(frame.a)
    assert array_equal(encoded, expected)

    # Single-column DataFrame input.
    encoded = enc.transform(frame[["a"]])
    assert array_equal(encoded, expected)
Ejemplo n.º 11
0
def test_targetencoder_customized_fold_id():
    """
    Split the data with a user-supplied `fold_ids` array.

    Here the 1st sample is in fold 0, the 2nd and 3rd samples are in
    fold 1, and the 4th sample is in fold 2.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    fold_ids = [0, 1, 1, 2]
    expected = np.array([1., 0.75, 0.75, 1.])

    enc = TargetEncoder(split_method='customize')
    encoded = enc.fit_transform(
        train.category, train.label, fold_ids=fold_ids)
    assert array_equal(encoded, expected)

    enc = TargetEncoder(split_method='customize')
    enc.fit(train.category, train.label, fold_ids=fold_ids)
    encoded = enc.transform(train.category)
    assert array_equal(encoded, expected)
Ejemplo n.º 12
0
    def cv(
        self,
        y_train: AoS,
        train_features: XDataFrame,
        test_features: XDataFrame,
        y_valid: Optional[AoS],
        valid_features: Optional[XDataFrame],
        feature_name: List[str],
        folds_ids: List[Tuple[np.ndarray, np.ndarray]],
        target_scaler: Optional[MinMaxScaler],
        config: dict,
        log: bool = True,
    ) -> Tuple[
        List[Model], np.ndarray, np.ndarray, Optional[np.ndarray], pd.DataFrame, dict
    ]:
        """Train one model per fold and collect out-of-fold, test and
        (optional) validation predictions.

        For every ``(trn_idx, val_idx)`` pair in ``folds_ids`` a model is
        trained with ``self.fit``; its predictions fill the out-of-fold
        array at ``val_idx`` and are averaged into the test (and validation)
        predictions with weight ``1 / len(folds_ids)``.

        When ``config["target_encoding"]`` is truthy, a ``<col>_TE`` feature
        is added for each column in ``config["categorical_cols"]``:

        * test and validation frames are encoded with an encoder fitted on
          the full training data;
        * inside each fold, the encoder is fitted on the fold's training
          part and applied to the fold's validation part.

        Args:
            y_train: Training target; a ``pd.Series`` is converted to its
                underlying values.
            train_features: Training feature frame (copied, not modified).
            test_features: Test feature frame (copied, not modified).
            y_valid: Target of the optional hold-out set, or ``None``.
            valid_features: Features of the optional hold-out set, or
                ``None``.
            feature_name: Feature names used as the index of the feature
                importance table. NOTE: this list is modified in place -
                each ``<col>_TE`` name is appended (once) when target
                encoding is enabled.
            folds_ids: One ``(train_indices, valid_indices)`` pair per fold.
            target_scaler: Passed through to ``self.post_process``.
            config: Settings; this method reads ``"target_encoding"`` and
                ``"categorical_cols"`` and forwards the dict to
                ``get_sampling``, ``self.fit`` and ``self.post_process``.
            log: When true, the final scores are also written via
                ``logging.info``.

        Returns:
            ``(models, oof_preds, test_preds, valid_preds,
            feature_importance, evals_results)``. ``valid_preds`` is ``None``
            when no validation features were given. ``feature_importance``
            is the per-feature mean over the folds.

        Side effects:
            Sets ``self.fold`` during training and stores the predictions
            from before post-processing in ``self.raw_oof_preds``,
            ``self.raw_test_preds`` and ``self.raw_valid_preds``.
        """
        # initialize
        valid_exists = valid_features is not None
        test_preds = np.zeros(len(test_features))
        oof_preds = np.zeros(len(train_features))
        valid_preds = np.zeros(len(valid_features)) if valid_exists else None
        best_iteration = 0.0
        cv_score_list: List[dict] = []
        models: List[Model] = []

        with timer("make X"):
            X_train = train_features.copy()
            X_test = test_features.copy()
            X_valid = valid_features.copy() if valid_features is not None else None

        with timer("make y"):
            y = y_train.values if isinstance(y_train, pd.Series) else y_train
            y_valid = y_valid.values if isinstance(y_valid, pd.Series) else y_valid

        if config["target_encoding"]:
            with timer("target encoding for test"):
                cat_cols = config["categorical_cols"]
                for cat_col in cat_cols:
                    te_col = cat_col + "_TE"
                    encoder = TargetEncoder(n_folds=4, smooth=0.3)
                    encoder.fit(X_train[cat_col], y)
                    X_test[te_col] = encoder.transform(X_test[cat_col])
                    # The validation frame must carry the same encoded
                    # columns as the test frame, otherwise the fold models
                    # are asked to predict on a different feature set than
                    # the one they were trained on.
                    if X_valid is not None:
                        X_valid[te_col] = encoder.transform(X_valid[cat_col])
                    # Guard against duplicates so that calling cv() again
                    # with the same list does not corrupt the importance
                    # index.
                    if te_col not in feature_name:
                        feature_name.append(te_col)

        importances = pd.DataFrame(index=feature_name)

        for i_fold, (trn_idx, val_idx) in enumerate(folds_ids):
            with timer(f"fold {i_fold}"):
                self.fold = i_fold
                with timer("get train data and valid data"):
                    # get train data and valid data
                    x_trn = X_train.iloc[trn_idx]
                    y_trn = y[trn_idx]
                    x_val = X_train.iloc[val_idx]
                    y_val = y[val_idx]

                if config["target_encoding"]:
                    with timer("target encoding"):
                        # Work on explicit copies: adding columns to the
                        # result of `.iloc` is chained assignment.
                        x_trn = x_trn.copy()
                        x_val = x_val.copy()
                        cat_cols = config["categorical_cols"]
                        for cat_col in cat_cols:
                            te_col = cat_col + "_TE"
                            encoder = TargetEncoder(n_folds=4, smooth=0.3)
                            x_trn[te_col] = encoder.fit_transform(
                                x_trn[cat_col], y_trn
                            )
                            x_val[te_col] = encoder.transform(x_val[cat_col])

                logging.info(f"train size: {x_trn.shape}, valid size: {x_val.shape}")
                print(f"train size: {x_trn.shape}, valid size: {x_val.shape}")

                with timer("get sampling"):
                    x_trn, y_trn = get_sampling(x_trn, y_trn, config)

                with timer("train model"):
                    # train model
                    model, best_score = self.fit(x_trn, y_trn, x_val, y_val, config)
                    cv_score_list.append(best_score)
                    models.append(model)
                    best_iteration += self.get_best_iteration(model) / len(folds_ids)

                with timer("predict oof and test"):
                    # predict oof and test
                    oof_preds[val_idx] = self.predict(model, x_val).reshape(-1)
                    test_preds += self.predict(model, X_test).reshape(-1) / len(
                        folds_ids
                    )

                    if valid_exists:
                        # Use the prepared copy (with any `_TE` columns),
                        # not the raw `valid_features`.
                        valid_preds += self.predict(model, X_valid).reshape(
                            -1
                        ) / len(folds_ids)

                with timer("get feature importance"):
                    # get feature importances
                    importances_tmp = pd.DataFrame(
                        self.get_feature_importance(model),
                        columns=[f"gain_{i_fold+1}"],
                        index=feature_name,
                    )
                    importances = importances.join(importances_tmp, how="inner")

        # summary of feature importance
        feature_importance = importances.mean(axis=1)

        # save raw prediction
        self.raw_oof_preds = oof_preds
        self.raw_test_preds = test_preds
        self.raw_valid_preds = valid_preds

        # post_process (if you have any)
        y, oof_preds, test_preds, y_valid, valid_preds = self.post_process(
            oof_preds=oof_preds,
            test_preds=test_preds,
            valid_preds=valid_preds,
            y_train=y_train,
            y_valid=y_valid,
            train_features=train_features,
            test_features=test_features,
            valid_features=valid_features,
            target_scaler=target_scaler,
            config=config,
        )

        # print oof score
        oof_score = calc_metric(y, oof_preds)
        print(f"oof score: {oof_score:.5f}")

        if valid_exists:
            valid_score = calc_metric(y_valid, valid_preds)
            print(f"valid score: {valid_score:.5f}")

        if log:
            logging.info(f"oof score: {oof_score:.5f}")
            if valid_exists:
                logging.info(f"valid score: {valid_score:.5f}")

        evals_results = {
            "evals_result": {
                "oof_score": oof_score,
                "cv_score": {
                    f"cv{i + 1}": cv_score for i, cv_score in enumerate(cv_score_list)
                },
                # NOTE: shapes are taken from X_train, which does not hold
                # the per-fold `_TE` columns.
                "n_data": np.shape(X_train)[0],
                "best_iteration": best_iteration,
                "n_features": np.shape(X_train)[1],
                "feature_importance": feature_importance.sort_values(
                    ascending=False
                ).to_dict(),
            }
        }

        if valid_exists:
            evals_results["valid_score"] = valid_score
        return (
            models,
            oof_preds,
            test_preds,
            valid_preds,
            feature_importance,
            evals_results,
        )