コード例 #1
0
ファイル: modelling.py プロジェクト: jackdry/pyml
def xgb_cv(X,
           y,
           params,
           cv,
           states=1,
           pipe=None,
           early_stopping_rounds=20,
           verbose=False):
    """Cross-validate an ``XGBClassifier`` over several folds and seeds.

    For every fold produced by ``cv`` and every seed in ``range(states)``
    the classifier is fitted on the training part of the fold with early
    stopping monitored on the held-out part (metric: ``"auc"``).

    Args:
        X: Feature table; indexed positionally with ``.iloc``.
        y: Target values; indexed positionally with ``.iloc``.
        params: Keyword arguments forwarded to ``XGBClassifier``.
            ``tree_method`` defaults to ``"gpu_hist"`` when not given here.
        cv: Splitter exposing ``get_n_splits()`` and ``split(X, y)``.
        states: Number of random seeds (``0 .. states - 1``) fitted per fold.
        pipe: Optional transformer. When given, it is fitted on each
            training fold via ``fit_transform(X_train, y_train,
            X_test=X_test)`` and applied to the held-out fold via
            ``transform(X_test)``.
        early_stopping_rounds: Forwarded to ``XGBClassifier.fit``.
        verbose: Forwarded to ``XGBClassifier.fit``.

    Returns:
        A dict with three entries:

        * ``"importance"``: DataFrame indexed by ``(fold, state)`` holding
          ``feature_importances_`` of every fit, columns ordered from the
          highest to the lowest mean importance.
        * ``"scores"``: one list per fold with ``best_score`` of each seed.
        * ``"iterations"``: one list per fold with ``best_iteration`` of
          each seed.
    """
    n_splits = cv.get_n_splits()
    scores = [[] for _ in range(n_splits)]
    iterations = [[] for _ in range(n_splits)]
    importance = []

    # "gpu_hist" is only a default: merging it *under* params lets callers
    # pick another tree_method instead of hitting a duplicate-keyword
    # TypeError.
    clf = XGBClassifier(**{"tree_method": "gpu_hist", **params})

    with tqdm(total=n_splits * states, ncols=50) as pbar:
        # y is handed to the splitter so label-aware splitters (e.g.
        # stratified ones) work; label-agnostic splitters ignore it.
        for i, (train_index, test_index) in enumerate(cv.split(X, y)):
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index], y.iloc[test_index]
            if pipe is not None:
                # Fit the preprocessing on the training fold only, then
                # apply the fitted transform to the held-out fold.
                X_train = pipe.fit_transform(X_train, y_train, X_test=X_test)
                X_test = pipe.transform(X_test)
            for state in range(states):
                # Same estimator object is re-fitted with a new seed.
                clf.random_state = state
                # NOTE: the held-out fold doubles as the early-stopping
                # set, so best_score is chosen on the data it is reported
                # for.
                clf.fit(X_train,
                        y_train,
                        eval_set=[(X_test, y_test)],
                        eval_metric="auc",
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=verbose)
                iterations[i].append(clf.best_iteration)
                scores[i].append(clf.best_score)
                importance.append([i, state] +
                                  clf.feature_importances_.tolist())
                pbar.update(1)

    # Column names come from the last fold's (possibly transformed)
    # training frame, so it must still expose ``.columns``.
    importance = pd.DataFrame(importance,
                              columns=["fold", "state"] +
                              X_train.columns.tolist())
    # sort columns from most to least important ("fold" and "state" take
    # part in the ranking but are moved into the index right after)
    importance = importance[importance.mean().sort_values(
        ascending=False).index]
    importance = importance.set_index(["fold", "state"])
    return {
        "importance": importance,
        "scores": scores,
        "iterations": iterations
    }