def lgbm_cv(X, y, params, cv, states=1, pipe=None, early_stopping_rounds=20, verbose=False):
    """Cross-validate an ``LGBMClassifier`` over CV folds and random seeds.

    For every fold produced by ``cv`` and every seed in ``range(states)`` a
    classifier is fitted on the training part and evaluated (AUC, with early
    stopping) on the held-out part.

    Args:
        X: Feature table; indexed positionally with ``.iloc``.
        y: Target values; indexed positionally with ``.iloc``.
        params: Keyword arguments forwarded to ``LGBMClassifier``.
        cv: Splitter exposing ``get_n_splits()`` and ``split(X, y)``.
        states: Number of random seeds (``0 .. states - 1``) tried per fold.
        pipe: Optional transformer. It is fitted on the training part via
            ``fit_transform(X_train, y_train, X_test=X_test)`` and then
            applied to the held-out part with ``transform``.
        early_stopping_rounds: Forwarded to ``LGBMClassifier.fit``.
        verbose: Forwarded to ``LGBMClassifier.fit``.

    Returns:
        dict with three entries:
            ``"importance"``: DataFrame indexed by ``(fold, state)`` with one
                column per feature (gain importance), columns ordered from
                highest to lowest mean importance.
            ``"scores"``: per fold, the list of best validation AUCs
                (one per seed).
            ``"iterations"``: per fold, the list of ``best_iteration_``
                values (one per seed).
    """
    n_splits = cv.get_n_splits()
    scores = [[] for _ in range(n_splits)]
    iterations = [[] for _ in range(n_splits)]
    importance_rows = []
    feature_names = None

    clf = LGBMClassifier(**params, importance_type="gain")

    with tqdm(total=n_splits * states, ncols=50) as pbar:
        # y is passed so that stratified splitters (which require the
        # target) work; splitters that do not need it simply ignore it.
        for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index], y.iloc[test_index]

            if pipe is not None:
                # Fit the transformer on the training part only.
                X_train = pipe.fit_transform(X_train, y_train, X_test=X_test)
                X_test = pipe.transform(X_test)

            # The pipe may add or drop columns, so take the names from the
            # data the model is actually trained on.
            feature_names = X_train.columns.tolist()

            for state in range(states):
                clf.set_params(random_state=state)
                # NOTE(review): `early_stopping_rounds` / `verbose` as fit()
                # keywords were removed in LightGBM 4.0 -- confirm the pinned
                # version or migrate to callbacks.
                clf.fit(
                    X_train,
                    y_train,
                    eval_set=[(X_test, y_test)],
                    eval_metric="auc",
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=verbose,
                )
                iterations[fold].append(clf.best_iteration_)
                scores[fold].append(clf.best_score_["valid_0"]["auc"])
                importance_rows.append(
                    [fold, state] + clf.feature_importances_.tolist()
                )
                pbar.update(1)

    if feature_names is None:
        # No fold was produced; fall back to the input columns so an empty
        # importance table can still be built.
        feature_names = X.columns.tolist()

    importance = pd.DataFrame(
        importance_rows, columns=["fold", "state"] + feature_names
    )
    importance = importance.set_index(["fold", "state"])

    # Sort columns from most to least important.
    ranked = importance.mean().sort_values(ascending=False).index
    importance = importance[ranked]

    return {
        "importance": importance,
        "scores": scores,
        "iterations": iterations,
    }
def get_model(PARAMS):
    """Build the ``LGBMClassifier`` for the given tuned hyper-parameters.

    The tuned values ``num_leaves``, ``max_depth``, ``min_child_weight``,
    ``subsample`` and ``colsample_bytree`` are read from ``PARAMS`` with
    ``PARAMS.get`` (a missing key yields ``None``); every other setting is
    fixed here.

    All settings are passed through the constructor. LightGBM's scikit-learn
    wrapper only forwards extra options (those that are not named
    ``__init__`` parameters, e.g. ``scale_pos_weight``, ``max_bin``,
    ``device`` and the ``gpu_*`` options) when they are registered via the
    constructor ``**kwargs`` or ``set_params``; assigning them as plain
    attributes leaves them out of ``get_params()`` so they never reach
    training.

    Because ``early_stopping_rounds`` is now an active setting, the returned
    model should be fitted with an ``eval_set``.

    Args:
        PARAMS: Mapping holding the tuned hyper-parameters.

    Returns:
        An unfitted ``LGBMClassifier``.
    """
    # scale_pos_weight = number of negative samples / number of positive
    # samples; set to counter the class imbalance problem.
    scale_pos_weight = 577.88

    return LGBMClassifier(
        objective="binary",
        num_leaves=PARAMS.get("num_leaves"),
        max_depth=PARAMS.get("max_depth"),
        n_estimators=10000,
        min_child_weight=PARAMS.get("min_child_weight"),
        subsample=PARAMS.get("subsample"),
        subsample_freq=1,
        colsample_bytree=PARAMS.get("colsample_bytree"),
        random_state=42,
        n_jobs=-1,
        # Options below are not named constructor parameters and travel
        # through **kwargs.
        early_stopping_rounds=20,
        scale_pos_weight=scale_pos_weight,
        max_bin=63,
        device="gpu",
        gpu_use_dp=False,
        gpu_platform_id=0,
        gpu_device_id=0,
    )