Example #1
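The snippets here and below omit their imports; a minimal preamble along these lines is assumed (the target_encoding module path matches the import that appears in Example #9):

import numpy as np
import pandas as pd
from itertools import product
from target_encoding import TargetEncoder

Note that two APIs appear in these examples: some tests use a TargetEncoder taking cols=/n_folds= with fit/transform, while others use the target_encoding package's encoder taking alpha/max_unique/split with transform_train/transform_test.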
def test_column_equality():

    X = pd.DataFrame(np.random.randn(1000, 4), columns=list('ABCD'))
    X['category'] = np.random.choice([-5, 5, 10], size=len(X))
    y = pd.Series(np.random.randn(1000))

    for i in range(2, 10):
        te = TargetEncoder(cols=['category'], n_folds=i)
        te.fit(X, y)
        encoded_data = te.transform(X, y)
        # With out-of-fold encoding, each category takes a distinct value
        # per fold, so unique encoded values = n_categories * n_folds.
        n_unique_encoded = len(set(encoded_data['category']))
        assert len(set(X['category'])) * i == n_unique_encoded, \
            'Encoded data does not have matching unique values'
Example #2
def test_input_transform():
    """Test result of target encoding"""
    for alpha in np.arange(0, 1000, 10):
        out_feature = get_out(FEATURE, TARGET, alpha)
        out_dataset = get_out(DATASET, TARGET, alpha)

        enc = TargetEncoder(alpha=alpha, max_bins=30, split=())

        result = enc.transform_train(FEATURE.reshape(-1, 1), TARGET)
        assert (result == out_feature).all()

        result = enc.transform_train(DATASET, TARGET)
        assert (result == out_dataset).all()
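
get_out is a helper defined elsewhere in the test module. For orientation, the usual smoothed-mean target encoding it presumably checks against would look roughly like this (a sketch under that assumption, not the library's actual code):

def get_out_sketch(feature, target, alpha):
    # Shrink each category's mean target toward the global mean;
    # alpha controls the shrinkage strength.
    global_mean = target.mean()
    out = np.empty(len(target), dtype=float)
    for value in np.unique(feature):
        mask = feature == value
        out[mask] = (target[mask].sum() + alpha * global_mean) / (mask.sum() + alpha)
    return out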
Example #3
def test_input_transform():

    for alpha in np.arange(0, 1000, 10):
        (X_1_ls, X_1_np, X_1_pd,
         X_2_ls, X_2_np, X_2_pd,
         y_ls, y_np, y_pd,
         out_1, out_2) = generate_data_alpha(alpha)

        enc = TargetEncoder(alpha=alpha, max_unique=30, split=[])

        # The encoder should give identical results whether the inputs
        # arrive as lists, numpy arrays, or pandas objects.
        for y in [y_ls, y_np, y_pd]:
            for X in [X_1_ls, X_1_np, X_1_pd]:
                assert (enc.transform_train(X, y) == out_1).all()
            for X in [X_2_ls, X_2_np, X_2_pd]:
                assert (enc.transform_train(X, y) == out_2).all()
Example #4
def test_fallback():
    X = pd.DataFrame(np.random.randint(0, 10, size=(50, 4)),
                     columns=list('ABCD'))
    X['category'] = np.random.choice([-5, 5, 10], size=len(X))
    X.loc[:, 'category_orig'] = X['category']

    y = pd.Series(np.random.randn(50))
    X['target'] = y

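    # NaN-out the target for category 5 so those rows carry no signal
    # and must be encoded with the fallback value.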
    y = np.where((X.category == 5), np.nan, X.target)
    te = TargetEncoder(cols=['category'])
    te.fit(X, y)
    encoded_data = te.transform(X, y)

    cat_5_val = encoded_data.loc[encoded_data['category_orig'] == 5,
                                 'category'].mean()
    assert round(cat_5_val, 2) == round(te.fallback, 2), \
        'Fallback amount does not match'
Example #5
def test_mean_vals_for_regular():
    X = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD'))
    X['category'] = np.random.choice([-5, 5, 10], size=len(X))
    X.loc[:, 'category_orig'] = X['category']
    y = pd.Series(np.random.randn(50))

    te = TargetEncoder(cols=['category'])
    te.fit(X, y)
    encoded_data = te.transform(X, y)

    X.loc[:, 'target'] = y
    grouped_data = X.groupby(['category']).mean().reset_index()
    means_to_test = grouped_data[['category', 'target']]

    check_data = encoded_data.merge(means_to_test,
                                    left_on='category_orig',
                                    right_on='category',
                                    how='left')
    # The encoded column should equal the per-category target means.
    te_vals = np.array(check_data['category_x'])
    original_vals = np.array(check_data['target'])

    assert np.array_equal(te_vals, original_vals), 'Mean values are not equal'
Example #6
def test_kfold():
    y = pd.DataFrame(np.random.randn(20, 1), columns=['target'])
    y = y.sample(frac=1, random_state=123)

    zero_data = np.zeros(shape=(20, 1))
    X = pd.DataFrame(zero_data)
    X.columns = ['category']

    X.loc[:5] = 'A'
    X.loc[5:10] = 'B'
    X.loc[10:15] = 'C'
    X.loc[15:] = 'D'
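    # .loc label slicing is end-inclusive; each later assignment overwrites
    # the boundary row, leaving five rows per category.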

    X.loc[:, 'category_orig'] = X['category']
    X = X.sample(frac=1, random_state=123)

    n_folds = 4
    te = TargetEncoder(cols=['category'], n_folds=n_folds)
    te.fit(X, y)
    results = te.transform(X, y)

    X.loc[:, 'target'] = y
    num_fold_in_df = set(te._kfold_numbering(X, y, n_folds=n_folds)['fold'])

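    # Out-of-fold encoding: a row's encoded value is the mean target of its
    # category over the *other* folds, hence the specific row indices below
    # (which are fixed by random_state=123).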
    ## Manually calculating target encoded fold values
    fold_1_a = np.mean([
        X.loc[0]['target'], X.loc[3]['target'], X.loc[1]['target'],
        X.loc[2]['target']
    ])
    fold_2_b = np.mean([
        X.loc[5]['target'], X.loc[8]['target'], X.loc[9]['target'],
        X.loc[6]['target']
    ])
    fold_3_c = np.mean(
        [X.loc[14]['target'], X.loc[12]['target'], X.loc[13]['target']])
    fold_4_d = np.mean([
        X.loc[17]['target'], X.loc[19]['target'], X.loc[15]['target'],
        X.loc[16]['target']
    ])

    assert len(num_fold_in_df) == n_folds, "Please double check the fold counts"
    assert results.loc[4]['category'] == fold_1_a
    assert results.loc[7]['category'] == fold_2_b
    assert results.loc[11]['category'] == fold_3_c
    assert results.loc[18]['category'] == fold_4_d
Example #7
def test_correct_init():
    """Test call of target encoding module"""
    for alpha in np.arange(0, 31, 10):
        for max_bins in np.arange(1, 100, 30):
            for model in [TargetEncoderClassifier, TargetEncoderRegressor]:
                enc = model(alpha=alpha, max_bins=max_bins)
                enc.transform_train(HUGE_DATASET, HUGE_TARGET)
                enc.transform_test(HUGE_DATASET)

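            # Exercise every split configuration: no split (), one level (n,),
            # and two nested levels (n, m).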
            for split_1, split_2 in product(range(1, 4), range(1, 4)):
                split = []
                if split_1 != 1:
                    split.append(split_1)

                if split_2 != 1:
                    split.append(split_2)

                split = tuple(split)

                enc = TargetEncoder(alpha=alpha,
                                    max_bins=max_bins,
                                    split=split)
                enc.transform_train(HUGE_DATASET, HUGE_TARGET)
                enc.transform_test(HUGE_DATASET)
Example #8
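An end-to-end usage script; train, test, cv (a CV splitter), len_uniques (per-column unique counts, as built in Example #9), and sample_submission are assumed to be defined earlier:
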
X = train.drop(['target', 'id'], axis=1)
y = train['target']

ALPHA = 70
MAX_UNIQUE = max(len_uniques)
FEATURES_COUNT = X.shape[1]

enc = TargetEncoderClassifier(alpha=ALPHA,
                              max_unique=MAX_UNIQUE,
                              used_features=FEATURES_COUNT)
score = cross_val_score(enc, X, y, scoring='roc_auc', cv=cv)
print(f'score: {score.mean():.4}, std: {score.std():.4}')

enc.fit(X, y)
pred_enc = enc.predict_proba(test.drop('id', axis=1))[:, 1]

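# Passing the same CV splitter as split=[cv] makes the encoder compute the
# train-time encodings out of fold, avoiding target leakage.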
enc = TargetEncoder(alpha=ALPHA, max_unique=MAX_UNIQUE, split=[cv])
X_train = enc.transform_train(X=X, y=y)
X_test = enc.transform_test(test.drop('id', axis=1))

lin = LogisticRegression()
score = cross_val_score(lin, X_train, y, scoring='roc_auc', cv=cv)
print(f'score: {score.mean():.4}, std: {score.std():.4}')

lin.fit(X_train, y)
pred_lin = lin.predict_proba(X_test)[:, 1]

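# Unweighted blend of the two models' probabilities; the sum preserves
# ranking, which is all that ROC AUC measures.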
sample_submission['target'] = pred_enc + pred_lin
sample_submission.to_csv('submission.csv', index=False)

print(sample_submission.head())
Example #9
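A larger integration example: the fit() method of a custom logistic-regression model wrapper (its surrounding class and helpers such as OOBImpute, make_features, and save_obj are assumed, not shown). It wires several category_encoders encoders and the target_encoding TargetEncoder into one sklearn column-transformer pipeline:
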
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

        min_count = np.min(np.unique(y, return_counts=True)[1])
        if min_count < 9:
            self.params['cv_search'] = False
        if min_count < 3:
            self.params['grid_search_iterations'] = False
            self.params['cv_search'] = False

        # save pre-datatable-imputed X
        X_dt = X

        # Apply OOB imputation
        self.oob_imputer = OOBImpute(self._impute_num_type,
                                     self._impute_int_type,
                                     self._impute_bool_type,
                                     self._impute_cat_type, self._oob_bool,
                                     self._oob_cat)
        X = self.oob_imputer.fit_transform(X)

        # convert to pandas for sklearn
        X = X.to_pandas()
        X_orig_cols_names = list(X.columns)
        if self._kaggle_features:
            self.features = make_features()
            X = self.features.fit_transform(X)
        else:
            self.features = None
        # print("LR: pandas dtypes: %s" % (str(list(X.dtypes))))

        # FEATURE GROUPS

        # Choose which features are numeric or categorical
        cat_features = [
            x for x in X_orig_cols_names
            if CatOriginalTransformer.is_me_transformed(x)
        ]
        catlabel_features = [
            x for x in X_orig_cols_names if CatTransformer.is_me_transformed(x)
        ]
        # can add explicit column name list to below force_cats
        force_cats = cat_features + catlabel_features

        # choose if numeric is treated as categorical
        if not self._num_as_cat:
            numerical_features = (X.dtypes == 'float') | (
                X.dtypes == 'float32') | (X.dtypes == 'float64')
        else:
            numerical_features = X.dtypes == 'invalid'
            # force oob imputation for numerics
            self.oob_imputer = OOBImpute('oob', 'oob', 'oob',
                                         self._impute_cat_type, self._oob_bool,
                                         self._oob_cat)
            X = self.oob_imputer.fit_transform(X_dt)
            X = X.to_pandas()
            X = self.features.fit_transform(X)
        if self._kaggle_features:
            numerical_features = self.features.update_numerical_features(
                numerical_features)

        categorical_features = ~numerical_features
        # below can lead to overlap between what is numeric and what is categorical
        more_cats = pd.Series([x in force_cats for x in categorical_features.index],
                              index=categorical_features.index)
        categorical_features = categorical_features | more_cats
        if self._kaggle_features:
            categorical_features = self.features.update_categorical_features(
                categorical_features)

        if self._debug:
            import uuid
            struuid = str(uuid.uuid4())
            Xy = X.copy()
            Xy.loc[:, 'target'] = y
            Xy.to_csv("munged_%s.csv" % struuid)

        cat_X = X.loc[:, categorical_features]
        num_X = X.loc[:, numerical_features]
        if self._debug:
            print("LR: Cat names: %s" % str(list(cat_X.columns)))
            print("LR: Num names: %s" % str(list(num_X.columns)))

        # TRANSFORMERS
        lr_params = copy.deepcopy(self.params)
        lr_params.pop('grid_search_by_iterations', None)
        lr_params.pop('cv_search', None)
        grid_search = False  # WIP

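        # Build (transformer, columns) pairs for make_column_transformer;
        # each encoder family is gated by its _use_* flag and pops its own
        # hyperparameters out of lr_params.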
        full_features_list = []
        transformers = []
        if self._use_numerics and any(numerical_features.values):
            impute_params = {}
            impute_params['strategy'] = lr_params.pop('strategy', 'mean')
            full_features_list.extend(list(num_X.columns))
            transformers.append(
                (make_pipeline(SimpleImputer(**impute_params),
                               StandardScaler()), numerical_features))
        # http://contrib.scikit-learn.org/categorical-encoding/
        if self._use_ordinal_encoding and any(categorical_features.values):
            ord_params = dict(handle_missing='value', handle_unknown='value')
            full_features_list.extend(list(cat_X.columns))
            # Note: OrdinalEncoder doesn't handle unseen features, while CategoricalEncoder used to
            import category_encoders as ce
            transformers.append(
                (ce.OrdinalEncoder(**ord_params), categorical_features))
        if self._use_catboost_encoding and any(categorical_features.values):
            cb_params = dict(handle_missing='value', handle_unknown='value')
            cb_params['sigma'] = lr_params.pop('sigma')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.CatBoostEncoder(**cb_params), categorical_features))
        if self._use_woe_encoding and any(categorical_features.values):
            woe_params = dict(handle_missing='value', handle_unknown='value')
            woe_params['randomized'] = lr_params.pop('randomized')
            woe_params['sigma'] = lr_params.pop('sigma_woe')
            woe_params['regularization'] = lr_params.pop('regularization')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.WOEEncoder(**woe_params), categorical_features))
        if self._use_target_encoding and any(categorical_features.values):
            te_params = dict(handle_missing='value', handle_unknown='value')
            te_params['min_samples_leaf'] = lr_params.pop('min_samples_leaf')
            te_params['smoothing'] = lr_params.pop('smoothing')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.TargetEncoder(**te_params), categorical_features))
        if self._use_target_encoding_other and any(
                categorical_features.values):
            full_features_list.extend(list(cat_X.columns))
            len_uniques = []
            cat_X_copy = cat_X.copy()
            for c in cat_X.columns:
                le = LabelEncoder()
                le.fit(cat_X[c])
                cat_X_copy[c] = le.transform(cat_X_copy[c])
                len_uniques.append(len(le.classes_))
            if self._debug:
                uniques_series = pd.Series(len_uniques,
                                           index=list(cat_X.columns))
                print("uniques_series: %s" % uniques_series)
            ALPHA = 75
            MAX_UNIQUE = max(len_uniques)
            # FEATURES_COUNT = cat_X.shape[1]
            cv = StratifiedKFold(n_splits=5,
                                 shuffle=True,
                                 random_state=self.params['random_state'])
            split_cv = [cv]
            # split_cv = [3, 3]
            from target_encoding import TargetEncoder
            transformers.append(
                (TargetEncoder(alpha=ALPHA,
                               max_unique=MAX_UNIQUE,
                               split_in=split_cv), categorical_features))
        if self._use_ohe_encoding and any(categorical_features.values):
            transformers.append(
                (OneHotEncoder(handle_unknown='ignore',
                               sparse=True), categorical_features))
        assert len(transformers) > 0, "should have some features"

        preprocess = make_column_transformer(*transformers)

        # ESTIMATOR
        lr_defaults = dict(penalty='l2',
                           dual=False,
                           tol=1e-4,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None,
                           random_state=None,
                           solver='warn',
                           max_iter=100,
                           multi_class='warn',
                           verbose=0,
                           warm_start=False,
                           n_jobs=None,
                           l1_ratio=None)
        allowed_lr_kwargs_keys = lr_defaults.keys()
        lr_params_copy = copy.deepcopy(lr_params)
        for k, v in lr_params_copy.items():
            if k not in allowed_lr_kwargs_keys:
                lr_params.pop(k, None)
        del lr_params_copy

        can_score = self.num_classes == 2 and 'AUC' in self.params_base[
            'score_f_name'].upper()
        # print("LR: can_score: %s" % str(can_score))
        if can_score:
            scorer = make_scorer(roc_auc_score,
                                 greater_is_better=True,
                                 needs_proba=True)
        else:
            scorer = None

        if not ('C' in lr_params or 'l1_ratios' in lr_params):
            # override
            self.params['cv_search'] = False

        if not self.params['cv_search']:
            estimator = LogisticRegression(**lr_params)
            estimator_name = 'logisticregression'
        else:
            lr_params_cv = copy.deepcopy(lr_params)
            if 'C' in lr_params:
                lr_params_cv['Cs'] = self.get_param_range(
                    self.params['C'],
                    self.params['fit_count'],
                    func_type='log')
                # print("LR: CV: Cs: %s" % str(lr_params_cv['Cs']))
            if 'l1_ratios' in lr_params:
                lr_params_cv['l1_ratios'] = self.get_param_range(
                    self.params['l1_ratio'],
                    self.params['fit_count'],
                    func_type='linear')
                # print("LR: CV: l1_ratios: %s" % str(lr_params_cv['l1_ratios']))
            lr_params_cv.pop('n_jobs', None)
            lr_params_cv.pop('C', None)
            lr_params_cv.pop('l1_ratio', None)
            if lr_params_cv['penalty'] == 'none':
                lr_params_cv['penalty'] = 'l2'
            estimator = LogisticRegressionCV(n_jobs=self.params['n_jobs'],
                                             cv=3,
                                             refit=True,
                                             scoring=scorer,
                                             **lr_params_cv)
            estimator_name = 'logisticregressioncv'

        # PIPELINE
        model = make_pipeline(preprocess, estimator)

        # FIT
        if self.params['grid_search_iterations'] and can_score:
            # WIP FIXME for multiclass and other scorers
            from sklearn.model_selection import GridSearchCV

            max_iter_range = self.get_param_range(
                self.params['max_iter'],
                self.params['fit_count'],
                range_limit=self._overfit_limit_iteration_step,
                func_type='log')
            # print("LR: max_iter_range: %s" % str(max_iter_range))
            param_grid = {
                '%s__max_iter' % estimator_name: max_iter_range,
            }
            grid_clf = GridSearchCV(model,
                                    param_grid,
                                    n_jobs=self.params['n_jobs'],
                                    cv=3,
                                    iid=True,
                                    refit=True,
                                    scoring=scorer)
            grid_clf.fit(X, y)
            model = grid_clf.best_estimator_
            # print("LR: best_index=%d best_score: %g best_params: %s" % (
            #    grid_clf.best_index_, grid_clf.best_score_, str(grid_clf.best_params_)))
        elif grid_search:
            # WIP
            from sklearn.model_selection import GridSearchCV

            param_grid = {
                'columntransformer__pipeline__simpleimputer__strategy':
                ['mean', 'median'],
                '%s__C' % estimator_name: [0.1, 0.5, 1.0],
            }
            grid_clf = GridSearchCV(model, param_grid, cv=10, iid=False)
            grid_clf.fit(X, y)
            model = grid_clf.best_estimator_
            # self.best_params = grid_clf.best_params_
        else:
            model.fit(X, y)

        # get actual LR model
        lr_model = model.named_steps[estimator_name]

        if self._debug and False:  # intentionally disabled debug dump
            import uuid
            struuid = str(uuid.uuid4())
            save_obj(
                model.named_steps['columntransformer'].fit_transform(X, y),
                "columns_csr_%s.pkl" % struuid)

        # average importances over classes
        importances = np.average(np.array(lr_model.coef_), axis=0)
        # average iterations over classes (can't take max_iter per class)
        iterations = np.average(lr_model.n_iter_)
        # print("LR: iterations: %d" % iterations)

        # reduce OHE features to original names
        ohe_features_short = []
        if self._use_ohe_encoding and any(categorical_features.values):
            input_features = [x + self._ohe_postfix for x in cat_X.columns]
            ohe_features = pd.Series(
                model.named_steps['columntransformer'].
                named_transformers_['onehotencoder'].get_feature_names(
                    input_features=input_features))

            def f(x):
                # strip the OHE level suffix to recover the source column name
                return '_'.join(x.split(self._ohe_postfix + '_')[:-1])

            # identify OHE features and map each back to its original column
            ohe_features_short = ohe_features.apply(f)
            full_features_list.extend(list(ohe_features_short))

        # aggregate our own features
        if self._kaggle_features:
            self.features.aggregate(full_features_list, importances)

        msg = "LR: num=%d cat=%d : ohe=%d : imp=%d full=%d" % (
            len(num_X.columns), len(cat_X.columns), len(ohe_features_short),
            len(importances), len(full_features_list))
        if self._debug:
            print(msg)
        assert len(importances) == len(full_features_list), msg

        # aggregate importances by dai feature name
        importances = pd.Series(
            np.abs(importances),
            index=full_features_list).groupby(level=0).mean()
        assert len(importances) == len(
            X_orig_cols_names), "%d %d %s : %s %s" % (
                len(importances), len(X_orig_cols_names), msg,
                str(list(X.columns)), str(list(X.dtypes)))

        # save hyper parameter searched results for next search
        self.params['max_iter'] = iterations
        if self.params['cv_search']:
            self.params['C'] = np.average(lr_model.C_, axis=0)
        if 'l1_ratios' in lr_params and self.params['cv_search']:
            self.params['l1_ratio'] = np.average(lr_model.l1_ratio_, axis=0)
        if 'fit_count' in self.params:
            self.params['fit_count'] += 1
        else:
            self.params['fit_count'] = 0

        self.set_model_properties(model=(model, self.features),
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=iterations)
        self.features = None
Example #10
def test_correct_init_encoder():

    X, y = generate_data_init()

    for alpha in np.arange(0, 100, 10):
        enc = TargetEncoder(alpha=alpha)
        enc.transform_train(X, y)
        enc.transform_test(X)

    for max_unique in np.arange(2, 100, 10):
        enc = TargetEncoder(max_unique=max_unique)
        enc.transform_train(X, y)
        enc.transform_test(X)

    for split_1, split_2 in product(range(1, 6), range(1, 6)):
        # A first-level split of 1 is meaningless, so skip those combinations;
        # a second-level value of 1 just means "no nested split".
        if split_1 == 1:
            continue
        split = [split_1]
        if split_2 != 1:
            split.append(split_2)

        enc = TargetEncoder(split=split)
        enc.transform_train(X, y)
        enc.transform_test(X)