# Shared imports for the examples below. The sklearn / lightgbm / numpy /
# pandas imports follow the standard public APIs; the nyaggle and
# category_encoders import paths are assumed and may need adjusting.
from typing import Callable, Iterable, List, Optional, Union

import numpy as np
import pandas as pd
from category_encoders.utils import convert_input, convert_input_vector
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge, RidgeClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, KFold,
                                     train_test_split)
from sklearn.svm import SVC, SVR
from sklearn.utils import multiclass
from sklearn.utils.multiclass import type_of_target
from nyaggle.ensemble.common import EnsembleResult  # assumed location
from nyaggle.testing import make_classification_df  # assumed location
from nyaggle.validation import Take, cross_validate  # assumed location


def test_cv_lgbm_df():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=20,
                                  n_cat_features=1,
                                  class_sep=0.98,
                                  random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(
        models, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    # make sure models are trained
    assert roc_auc_score(y_test, models[0].predict_proba(X_test)[:, 1]) >= 0.85
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20 + 1
    assert models[0].booster_.num_trees() < 300  # making sure early stopping worked


def test_cv_lgbm():
    X, y = make_classification(n_samples=1024,
                               n_features=20,
                               class_sep=0.98,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(
        models,
        X_train,
        y_train,
        X_test,
        cv=5,
        eval_func=roc_auc_score,
        fit_params={'early_stopping_rounds': 200})

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    # make sure models are trained
    assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20


def test_cv_partial_evaluate():
    X, y = make_classification(n_samples=1024,
                               n_features=20,
                               class_sep=0.98,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    model = RidgeClassifier(alpha=1.0)

    n = 0

    def _fold_count(*args):
        nonlocal n
        n += 1

    cv = Take(2, KFold(5))

    pred_oof, pred_test, scores, _ = cross_validate(model,
                                                    X_train,
                                                    y_train,
                                                    X_test,
                                                    cv=cv,
                                                    eval_func=roc_auc_score,
                                                    on_each_fold=_fold_count)

    assert len(scores) == 2 + 1
    assert scores[-1] >= 0.8  # overall auc
    assert n == 2
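

# A small illustration, not from the original listing: assuming nyaggle's Take
# follows the sklearn splitter interface and simply yields only the first N
# folds of the wrapped splitter, Take(2, KFold(5)) above evaluates just two of
# the five folds, which is why only two fold scores (plus the overall score)
# are returned.
def example_take_splits():
    X, y = make_classification(n_samples=100, n_features=5, random_state=0)
    cv = Take(2, KFold(5))
    n_folds = sum(1 for _ in cv.split(X, y))
    print(n_folds)  # expected: 2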


def test_cv_sklean_binary():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    model = RidgeClassifier(alpha=1.0)

    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)

    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test score


def test_fit_params_callback():
    X, y = make_classification(n_samples=1024,
                               n_features=20,
                               class_sep=0.98,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    sample_weights = np.random.randint(1, 10, size=len(X_train))
    sample_weights = sample_weights / sample_weights.sum()

    def fit_params(n: int, train_index: List[int], valid_index: List[int]):
        return {
            'early_stopping_rounds': 100,
            'sample_weight': list(sample_weights[train_index]),
            'eval_sample_weight': [list(sample_weights[valid_index])]
        }

    result_w_weight = cross_validate(models,
                                     X_train,
                                     y_train,
                                     X_test,
                                     cv=5,
                                     eval_func=roc_auc_score,
                                     fit_params=fit_params)

    result_wo_weight = cross_validate(models,
                                      X_train,
                                      y_train,
                                      X_test,
                                      cv=5,
                                      eval_func=roc_auc_score,
                                      fit_params={'early_stopping_rounds': 50})

    assert result_w_weight.scores[-1] != result_wo_weight.scores[-1]


def test_cv_sklean_regression():
    X, y = make_regression(n_samples=1024, n_features=20, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    model = Ridge(alpha=1.0)

    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=5, eval_func=r2_score)

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.95  # overall r2
    assert r2_score(y_train, pred_oof) == scores[-1]
    assert r2_score(y_test, pred_test) >= 0.95  # test r2


def _make_1st_stage_preds(X, y, X_test):
    if type_of_target(y) == 'continuous':
        models = [
            SVR(),
            Ridge(random_state=0),
            RandomForestRegressor(n_estimators=30, random_state=0)
        ]
    else:
        models = [
            SVC(random_state=0),
            LogisticRegression(random_state=0),
            RandomForestClassifier(n_estimators=30, random_state=0)
        ]

    results = [cross_validate(m, X, y, X_test, cv=5) for m in models]

    return [r.oof_prediction for r in results], [r.test_prediction for r in results]


def stacking(test_predictions: List[np.ndarray],
             oof_predictions: List[np.ndarray],
             y: pd.Series,
             estimator: Optional[BaseEstimator] = None,
             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
             groups: Optional[pd.Series] = None,
             type_of_target: str = 'auto',
             eval_func: Optional[Callable] = None) -> EnsembleResult:
    """
    Perform stacking on predictions.

    Args:
        test_predictions:
            List of predicted values on test data.
        oof_predictions:
            List of predicted values on out-of-fold training data.
        y:
            Target value
        estimator:
            Estimator used for the 2nd-level model.
            If ``None``, the default estimator (auto-tuned linear model) will be used.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (the instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
        eval_func:
            Evaluation metric used for calculating result score. Used only if ``oof_predictions`` and ``y`` are given.
    Returns:
        Namedtuple with the following members

        * test_prediction:
            numpy array, Average prediction on test data.
        * oof_prediction:
            numpy array, Average prediction on Out-of-Fold validation data. ``None`` if ``oof_predictions`` is ``None``.
        * score:
            float, Calculated score on Out-of-Fold data. ``None`` if ``eval_func`` is ``None``.
    """
    assert len(oof_predictions) == len(test_predictions), \
        "Number of oof and test predictions should be the same"

    def _stack(predictions):
        if predictions[0].ndim == 1:
            predictions = [p.reshape(len(p), -1) for p in predictions]
        return np.hstack(predictions)

    X_train = convert_input(_stack(oof_predictions))
    y = convert_input_vector(y, X_train.index)
    X_test = convert_input(_stack(test_predictions))

    assert len(X_train) == len(y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    if estimator is None:
        # if estimator is None, tuned linear estimator is used
        if type_of_target == 'continuous':
            estimator = Ridge(normalize=True, random_state=0)
            param_grid = {
                'alpha': [0.001, 0.01, 0.1, 1, 10],
            }
        else:
            estimator = LogisticRegression(random_state=0)
            param_grid = {
                'penalty': ['l1', 'l2'],
                'C': [0.001, 0.01, 0.1, 1, 10],
            }
        grid_search = GridSearchCV(estimator, param_grid, cv=cv)
        grid_search.fit(X_train, y, groups=groups)
        estimator = grid_search.best_estimator_

    result = cross_validate(estimator,
                            X_train,
                            y,
                            X_test,
                            cv=cv,
                            groups=groups,
                            eval_func=eval_func)
    score = result.scores[-1] if result.scores else None

    return EnsembleResult(result.test_prediction, result.oof_prediction, score)
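

# A minimal usage sketch, not taken from the original listing: it assumes the
# imports above and simply chains _make_1st_stage_preds into stacking, i.e. the
# first-stage out-of-fold predictions, the test predictions and the training
# target are fed to the (auto-tuned) 2nd-level linear model.
def example_stacking_usage():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    # 1st stage: out-of-fold and test predictions from the three base models.
    oof_predictions, test_predictions = _make_1st_stage_preds(X_train, y_train, X_test)

    # 2nd stage: stack the base predictions; with estimator=None a grid-searched
    # linear model is used and eval_func scores the stacked out-of-fold result.
    result = stacking(test_predictions,
                      oof_predictions,
                      y_train,
                      cv=5,
                      eval_func=roc_auc_score)

    print(result.score)                                   # out-of-fold AUC
    print(roc_auc_score(y_test, result.test_prediction))  # AUC on the held-out test set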