def xgb_cv(X, y, params, cv, states=1, pipe=None, early_stopping_rounds=20,
           verbose=False, tree_method="gpu_hist"):
    """Cross-validate an XGBoost classifier with early stopping.

    For each CV fold, optionally fits/applies ``pipe`` to the fold's data,
    then trains one classifier per random seed in ``range(states)``,
    early-stopping on the fold's test set with AUC as the eval metric.

    Parameters
    ----------
    X, y : pandas DataFrame / Series (indexed positionally via ``iloc``).
    params : dict of keyword arguments forwarded to ``XGBClassifier``.
    cv : scikit-learn splitter (``KFold``, ``StratifiedKFold``, ...).
    states : int, number of random seeds trained per fold.
    pipe : optional transformer with ``fit_transform(X, y, X_test=...)``
        and ``transform(X)`` methods, applied per fold.
    early_stopping_rounds : int, passed to ``fit``.
    verbose : bool, passed to ``fit``.
    tree_method : str, XGBoost tree construction algorithm
        (default ``"gpu_hist"``, preserving the previous hard-coded value).

    Returns
    -------
    dict with keys:
        "importance" : DataFrame indexed by (fold, state), feature columns
            sorted by mean importance, descending.
        "scores" : list (per fold) of lists (per state) of best AUC scores.
        "iterations" : same structure, best boosting iteration counts.
    """
    importance = []
    n_splits = cv.get_n_splits()
    total = n_splits * states
    scores = [[] for _ in range(n_splits)]
    iterations = [[] for _ in range(n_splits)]
    with tqdm(total=total, ncols=50) as pbar:
        # Pass y so stratified splitters work; plain KFold simply ignores it.
        for i, (train_index, test_index) in enumerate(cv.split(X, y)):
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index], y.iloc[test_index]
            if pipe is not None:
                # NOTE(review): non-standard pipeline API — fit_transform also
                # receives the test fold (presumably for transforms that need
                # its feature distribution); confirm against the pipe class.
                X_train = pipe.fit_transform(X_train, y_train, X_test=X_test)
                X_test = pipe.transform(X_test)
            for state in range(states):
                # Fresh estimator per (fold, state) so no fitted state or
                # mutated attributes leak between runs.
                clf = XGBClassifier(**params, tree_method=tree_method)
                clf.random_state = state
                # NOTE(review): eval_metric / early_stopping_rounds in fit()
                # are deprecated in xgboost >= 1.6 (moved to the constructor);
                # kept here for compatibility with the version in use.
                clf.fit(X_train, y_train,
                        eval_set=[(X_test, y_test)],
                        eval_metric="auc",
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=verbose)
                iterations[i].append(clf.best_iteration)
                scores[i].append(clf.best_score)
                importance.append(
                    [i, state] + clf.feature_importances_.tolist())
                pbar.update(1)
    # Column names come from the LAST fold's X_train — assumes pipe produces
    # the same feature set on every fold (TODO confirm).
    importance = pd.DataFrame(
        importance, columns=["fold", "state"] + X_train.columns.tolist())
    importance = importance.set_index(["fold", "state"])
    # Sort feature columns from most to least important. Indexing first keeps
    # "fold"/"state" out of the mean/sort (previously they were sorted along
    # with the features and only dropped afterwards).
    importance = importance[
        importance.mean().sort_values(ascending=False).index]
    return {
        "importance": importance,
        "scores": scores,
        "iterations": iterations,
    }