Example #1
import lightgbm
import pandas as pd
from lightgbm import LGBMClassifier
from tqdm import tqdm


def lgbm_cv(X,
            y,
            params,
            cv,
            states=1,
            pipe=None,
            early_stopping_rounds=20,
            verbose=False):
    """Cross-validate an LGBMClassifier over `cv`, repeating each fold with
    `states` different random seeds; returns AUC scores, best iterations and
    gain-based feature importances."""
    importance = []  # one row per (fold, state): gain importances
    n = cv.get_n_splits() * states
    scores = [[] for _ in range(cv.get_n_splits())]      # AUC per fold/state
    iterations = [[] for _ in range(cv.get_n_splits())]  # best iteration per fold/state
    clf = LGBMClassifier(**params, importance_type="gain")
    with tqdm(total=n, ncols=50) as pbar:
        for i, (train_index, test_index) in enumerate(cv.split(X, y)):
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index], y.iloc[test_index]
            if pipe is not None:
                # `pipe` is assumed to be a custom transformer whose
                # fit_transform accepts the test fold as a keyword argument
                # (e.g. for leakage-aware encodings); a plain sklearn
                # Pipeline.fit_transform does not take an X_test kwarg.
                X_train = pipe.fit_transform(X_train, y_train, X_test=X_test)
                X_test = pipe.transform(X_test)
            # repeat the fold with several seeds to average out LightGBM's
            # stochasticity
            for state in range(states):
                clf.random_state = state
                clf.fit(X_train,
                        y_train,
                        eval_set=[(X_test, y_test)],
                        eval_metric="auc",
                        # the early_stopping_rounds/verbose fit kwargs were
                        # removed in LightGBM 4.0; callbacks replace them
                        callbacks=[
                            lightgbm.early_stopping(early_stopping_rounds,
                                                    verbose=verbose),
                            lightgbm.log_evaluation(1 if verbose else 0),
                        ])
                iterations[i].append(clf.best_iteration_)
                scores[i].append(clf.best_score_["valid_0"]["auc"])
                importance.append([i, state] +
                                  clf.feature_importances_.tolist())
                pbar.update(1)
    importance = pd.DataFrame(importance,
                              columns=["fold", "state"] +
                              X_train.columns.tolist())
    importance = importance.set_index(["fold", "state"])
    # order feature columns from most to least important (mean gain)
    importance = importance[importance.mean().sort_values(
        ascending=False).index]
    return {
        "importance": importance,
        "scores": scores,
        "iterations": iterations
    }
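

# A minimal usage sketch for lgbm_cv. Everything below (the synthetic
# dataset, fold count, and params dict) is illustrative and assumed, not
# taken from the original snippet; any pandas data, lightgbm params dict
# and sklearn splitter would work the same way.
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold

X_demo, y_demo = make_classification(n_samples=1000, n_features=20,
                                     random_state=0)
X_demo = pd.DataFrame(X_demo, columns=[f"f{j}" for j in range(20)])
y_demo = pd.Series(y_demo)

results = lgbm_cv(X_demo,
                  y_demo,
                  params={"objective": "binary", "n_estimators": 500},
                  cv=KFold(n_splits=5, shuffle=True, random_state=0),
                  states=2)
print(results["importance"].mean().head())  # top features by mean gain
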
def get_model(PARAMS):
    """Build an LGBMClassifier from a dict of tuned hyper-parameters."""
    # scale_pos_weight = number of negative samples / number of positive
    # samples; it counters the heavy class imbalance of this dataset
    scale_pos_weight = 577.88
    # Pass everything through the constructor: attributes assigned after
    # construction that are not __init__ parameters (scale_pos_weight,
    # max_bin, device, gpu_*, and the original early_stopping_rounds) are
    # silently ignored by fit(), whereas constructor kwargs reach the
    # booster. Early stopping is supplied at fit time instead (see lgbm_cv).
    model = LGBMClassifier(
        num_leaves=PARAMS.get("num_leaves"),
        max_depth=PARAMS.get("max_depth"),
        n_estimators=10000,
        scale_pos_weight=scale_pos_weight,
        objective="binary",
        min_child_weight=PARAMS.get("min_child_weight"),
        subsample=PARAMS.get("subsample"),
        subsample_freq=1,
        colsample_bytree=PARAMS.get("colsample_bytree"),
        random_state=42,
        n_jobs=-1,
        max_bin=63,  # small bin count speeds up GPU histogram building
        device="gpu",
        gpu_use_dp=False,  # single precision on GPU
        gpu_platform_id=0,
        gpu_device_id=0,
    )

    return model
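

# Usage sketch for get_model. The PARAMS values below are illustrative
# placeholders (e.g. from a hyper-parameter search), not values from the
# original snippet; fitting this model requires a GPU build of LightGBM
# because of device="gpu".
PARAMS = {
    "num_leaves": 63,
    "max_depth": -1,
    "min_child_weight": 1e-3,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
}
model = get_model(PARAMS)

# The hard-coded scale_pos_weight (577.88) follows the usual recipe
# negatives / positives; from training labels y it could be derived as:
#     scale_pos_weight = (y == 0).sum() / (y == 1).sum()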