# Example #1
    def test_tune_best_score_reproducibility(self) -> None:
        """Two CV-tuner runs with the same ``optuna_seed`` must report the same best score.

        Uses the California housing dataset: ``load_boston`` was deprecated in
        scikit-learn 1.0 and removed in 1.2, so ``fetch_california_housing``
        is the documented replacement for a small regression fixture.
        """
        california = sklearn.datasets.fetch_california_housing()
        X_trainval, X_test, y_trainval, y_test = train_test_split(
            california.data, california.target, random_state=0)

        train = lgb.Dataset(X_trainval, y_trainval)
        # Fix LightGBM's own RNG so only the sampler seed controls the search.
        params = {
            "objective": "regression",
            "metric": "rmse",
            "random_seed": 0
        }

        tuner_first_try = lgb.LightGBMTunerCV(
            params,
            train,
            early_stopping_rounds=3,
            folds=KFold(n_splits=3),
            optuna_seed=10,
        )
        tuner_first_try.run()
        best_score_first_try = tuner_first_try.best_score

        # Identical configuration, fresh tuner: the search must replay exactly.
        tuner_second_try = lgb.LightGBMTunerCV(
            params,
            train,
            early_stopping_rounds=3,
            folds=KFold(n_splits=3),
            optuna_seed=10,
        )
        tuner_second_try.run()
        best_score_second_try = tuner_second_try.best_score

        assert best_score_second_try == best_score_first_try
    def tune(self, X, y):
        """Search LightGBM hyper-parameters with Optuna's CV tuner.

        Fits on ``(X, y)`` as a binary-classification task scored by AUC,
        prints the best score and parameters, and returns the best params.
        """
        train_set = lgb_opt.Dataset(X, label=y)

        base_params = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "boosting_type": "gbdt",
        }

        cv_tuner = lgb_opt.LightGBMTunerCV(
            base_params,
            train_set,
            verbose_eval=100,
            early_stopping_rounds=100,
            folds=KFold(n_splits=3),
        )
        cv_tuner.run()

        print("Best score:", cv_tuner.best_score)
        best_params = cv_tuner.best_params
        print("Best params:", best_params)
        print("  Params: ")
        for key, value in best_params.items():
            print("    {}: {}".format(key, value))
        return best_params
# Example #3
    def test_tune_best_score_reproducibility(self) -> None:
        """Running the CV tuner twice with identical seeds must reproduce best_score."""
        california = sklearn.datasets.fetch_california_housing()
        X_trainval, X_test, y_trainval, y_test = train_test_split(
            california.data, california.target, random_state=0
        )

        train = lgb.Dataset(X_trainval, y_trainval)
        # Pin every source of nondeterminism inside LightGBM itself.
        params = {
            "objective": "regression",
            "metric": "rmse",
            "random_seed": 0,
            "deterministic": True,
            "force_col_wise": True,
            "verbosity": -1,
        }

        def run_once() -> float:
            # A fresh tuner per attempt; optuna_seed fixes the sampler's RNG.
            tuner = lgb.LightGBMTunerCV(
                params,
                train,
                early_stopping_rounds=3,
                folds=KFold(n_splits=3),
                optuna_seed=10,
            )
            tuner.run()
            return tuner.best_score

        assert run_once() == run_once()
# Example #4
def _single_train(features, targets, params):
    """Tune LightGBM on one target column via 3-fold CV.

    Returns the tuner's ``(best_params, best_score)`` pair.
    """
    dataset = lgb.Dataset(features, targets)
    cv_tuner = lgb.LightGBMTunerCV(
        params,
        dataset,
        verbose_eval=100,
        early_stopping_rounds=100,
        folds=KFold(n_splits=3),
    )
    cv_tuner.run()
    return cv_tuner.best_params, cv_tuner.best_score
    # NOTE(review): this fragment sits after `_single_train`'s return statement,
    # so as pasted it is unreachable dead code. It reads like the body of a
    # separate training script (it references a module-level `train` DataFrame
    # with a `revenue` column) — presumably it belongs in its own function;
    # confirm before relying on it.
    data = train.drop(['revenue'], axis=1)
    target = train.revenue
    # Log-transform the target so RMSE on logtarget behaves like RMSLE on revenue.
    logtarget = np.log1p(target)
    dtrain = lgb.Dataset(data, label=logtarget)

    ### set the parameters and optimize the hyper-parameters ####
    params = {
        "objective": "rmse",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    # return_cvbooster=True keeps the fitted CV boosters retrievable via
    # tuner.get_best_booster() (see the commented-out save block below).
    tuner = lgb.LightGBMTunerCV(params,
                                dtrain,
                                verbose_eval=100,
                                early_stopping_rounds=100,
                                folds=KFold(n_splits=10),
                                return_cvbooster=True)
    tuner.run()
    ### Print the results ###
    print("Best score:", tuner.best_score)
    best_params = tuner.best_params
    print("Best params:", best_params)
    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    ### Save the best model ####
    # model = tuner.get_best_booster()
    # model.save_model('lgbm_model.txt')
def training(train, test, validation_size, estimator, target_variable, drop_list, target_type, cv_folds, scoring_cv, cv=True, final=False, hypertuning=False):
            """Fit, tune and evaluate a model selected by `estimator`.

            Parameters
            ----------
            train, test : pandas.DataFrame
                Training and hold-out frames; both must contain `target_variable`.
            validation_size : float
                Fraction of the training split held out for LightGBM early stopping.
            estimator : str
                One of "log_sk", "lin_reg", "gb", "rf", "lgbm".
            target_variable : str
                Name of the target column.
            drop_list : list
                Columns dropped to form the feature matrix (should include the target).
            target_type : str
                "bin" → classification branch, "con" → regression branch.
            cv_folds : int
                Number of cross-validation folds for CV / grid search.
            scoring_cv : str or callable
                Scoring passed to scikit-learn CV / grid search.
            cv : bool, default True
                Run cross_val_score when not hyper-tuning.
            final : bool, default False
                If True, refit on train+test combined (skips hold-out metrics).
            hypertuning : bool, default False
                If True, run a grid / Optuna search instead of fixed params.

            Returns
            -------
            The fitted model (grid-search best estimator when hypertuning).
            """

            import matplotlib.pyplot as plt
            import pandas as pd
            import lightgbm as lgbm
            import training
            import os
            import sklearn
            import numpy as np
            import seaborn as sns
            import re
            import matplotlib.pyplot as plt
            import math
            from datetime import datetime
            import datetime

            import statsmodels.api as sm
            from sklearn.model_selection import train_test_split
            from scipy import stats
            from sklearn.feature_selection import SelectFromModel
            from sklearn.model_selection import cross_val_score, validation_curve
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.ensemble import RandomForestRegressor
            from sklearn.ensemble import GradientBoostingRegressor
            from sklearn.ensemble import GradientBoostingClassifier
            from sklearn.model_selection import RandomizedSearchCV
            from sklearn.model_selection import train_test_split
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from sklearn import ensemble
            from sklearn.linear_model import LogisticRegression
            # LinearRegression was used below but never imported (NameError).
            from sklearn.linear_model import LinearRegression
            from sklearn.model_selection import cross_val_score
            from sklearn.model_selection import GridSearchCV
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.impute import SimpleImputer
            from sklearn.preprocessing import KBinsDiscretizer
            from sklearn.metrics import mean_squared_log_error
            from sklearn.metrics import make_scorer
            from sklearn.model_selection import KFold
            # NOTE: plot_roc_curve was removed from sklearn 1.2 (RocCurveDisplay
            # replaces it); it was imported but never used here, so it is dropped.
            from sklearn.metrics import (confusion_matrix,  
                                    accuracy_score, 
                                    recall_score,
                                    roc_curve,
                                    roc_auc_score,
                                    mean_squared_error) 

            import xgboost
            import shap
            from catboost import CatBoostClassifier
            from catboost import CatBoostRegressor
            import lightgbm as lgbm
            import optuna.integration.lightgbm as lgb
            from optuna.integration import _lightgbm_tuner as tuner
            from optuna.integration._lightgbm_tuner import LightGBMTuner 
            from optuna.integration._lightgbm_tuner import LightGBMTunerCV 

            # NOTE(review): `score_func` is not defined anywhere in this function
            # or its imports — presumably a module-level RMSLE helper; confirm.
            rmsle_scorer = make_scorer(score_func)

            train_y = train[target_variable]
            train_x = train.drop(columns=drop_list)

            test_y = test[target_variable]
            test_x = test.drop(columns=drop_list) 

            column_names = list(train_x.columns)
            
            if final==True:
                # DataFrame.append was removed in pandas 2.0; pd.concat is the
                # drop-in replacement with identical row-stacking behavior.
                train_x = pd.concat([train_x, test_x])
                train_y = pd.concat([train_y, test_y])

            if target_type=="bin":

                if estimator == "log_sk":
                    model = LogisticRegression(max_iter=1000)
                    log_sk = model.fit(train_x, train_y)
                    fitted_model = log_sk

                if estimator == "gb" and hypertuning==False:
                    model = ensemble.GradientBoostingClassifier(learning_rate = 0.1, max_depth=3, n_estimators= 100)
                    gb = model.fit(train_x, train_y)
                    fitted_model = gb   

                if estimator == "gb" and hypertuning==True:

                    param_grid = {
                                    'n_estimators': [100, 200, 400],
                                    'max_depth': [3, 5, 7],
                                    'learning_rate': [0.1, 0.05, 0.025, 0.01, 0.001, 0.005],
                                    'random_state': [42]
                                }

                    gb = ensemble.GradientBoostingClassifier()
                    gb_grid = GridSearchCV(gb, param_grid, cv=cv_folds, scoring=scoring_cv)
                    gb_grid.fit(train_x, train_y)
                    print('Optimal parameters for gradient boosting classifier = ', gb_grid.best_params_)
                    gb = gb_grid.best_estimator_
                    fitted_model = gb

                if estimator == "rf" and hypertuning==False:
                    model = ensemble.RandomForestClassifier(max_depth= 80, max_features= 5, min_samples_leaf= 3, min_samples_split= 12, n_estimators= 100)
                    rf = model.fit(train_x, train_y)
                    fitted_model=rf

                if estimator == "rf" and hypertuning==True:

                    param_grid = {
                                    'bootstrap': [True],
                                    'max_depth': [10, 20, 30],
                                    'max_features': [2, 3, 5],
                                    'min_samples_leaf': [3, 5, 10],
                                    'min_samples_split': [8, 12],
                                    'n_estimators': [100, 300, 500],
                                    'n_jobs': [3]
                                }

                    rf = RandomForestClassifier()
                    rf_grid = GridSearchCV(rf, param_grid, cv=cv_folds, scoring=scoring_cv)
                    rf_grid.fit(train_x, train_y)
                    print('Optimal parameters for random forest classifier = ', rf_grid.best_params_)
                    rf = rf_grid.best_estimator_
                    fitted_model = rf

                if cv and hypertuning==False:
                    cross_val_accuracy = cross_val_score(estimator=model
                            , X=train_x
                            , y=train_y
                            , cv=cv_folds
                            , scoring=scoring_cv)

                    print(f'The average cross validation accuracy of the model is {round(cross_val_accuracy.mean(), 2)}')
                    print(cross_val_accuracy)

            if target_type=="con":

                if estimator == "lgbm" and hypertuning==False:

                    train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=validation_size, shuffle=True, random_state=42)
                    train_data=lgb.Dataset(train_x,label=train_y)
                    valid_data=lgb.Dataset(valid_x,label=valid_y)

                    model = lgbm.LGBMRegressor(random_state=42, n_estimators=1000)
                    lgbm_model = model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], eval_metric=scoring_cv, verbose = -1)
                    fitted_model = lgbm_model

                if estimator == "lin_reg" and hypertuning==False:
                    # LinearRegression is a closed-form solver and takes no
                    # max_iter argument; the original call raised a TypeError.
                    model = LinearRegression()
                    lin_reg = model.fit(train_x, train_y)
                    fitted_model = lin_reg

                if estimator == "gb" and hypertuning==False:
                    model = ensemble.GradientBoostingRegressor(learning_rate = 0.001, max_depth=5, n_estimators= 100)
                    gb = model.fit(train_x, train_y)
                    fitted_model = gb   

                if estimator == "rf" and hypertuning==False:
                    model = ensemble.RandomForestRegressor(max_depth= 30, max_features= 5, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 500, n_jobs= -1)
                    rf = model.fit(train_x, train_y)
                    fitted_model=rf

                if estimator == "gb" and hypertuning==True:
                    # {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'random_state': 42}
                    param_grid = {
                                'n_estimators': [100,500,1000],
                                'max_features': ["auto","sqrt","log2",0.6,0.8],
                                'min_samples_leaf':[30,50,70],
                                'min_samples_split':[10,20,500,100],
                                'max_depth' : [10,15,20,25],
                                'learning_rate':[0.1,0.01,0.001]
                                }

                    gb = ensemble.GradientBoostingRegressor()
                    gb_grid = GridSearchCV(gb, param_grid, cv=cv_folds, scoring=scoring_cv)
                    gb_grid.fit(train_x, train_y)
                    print('Optimal parameters for gradient boosting regressor = ', gb_grid.best_params_)
                    gb = gb_grid.best_estimator_
                    fitted_model = gb

                if estimator == "lgbm" and hypertuning==True:
                    if __name__ == "__main__":

                            dtrain = lgb.Dataset(train_x, label=train_y)

                            params = {
                                    "objective": "regression",
                                    "metric": "rmse",
                                    "verbosity": -1,
                                    "boosting_type": "gbdt",
                                }

                            tuner = lgb.LightGBMTunerCV(
                                    params, dtrain, verbose_eval=100, early_stopping_rounds=100, folds=KFold(n_splits=5)
                                )

                            tuner.run()

                            print("Best score:", tuner.best_score)
                            best_params = tuner.best_params
                            print("Best params:", best_params)
                            print("  Params: ")
                            for key, value in best_params.items():
                                print("    {}: {}".format(key, value))


                if estimator == "rf" and hypertuning==True: 
                    # {'bootstrap': True, 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100, 'n_jobs': 1}
                    # max_depth= 80, max_features= 5, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 300, n_jobs= 1
                    # {'bootstrap': True, 'max_depth': 100, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 500, 'n_jobs': 4}
                    
                    param_grid = {
                                    'max_depth': [10, 20, 30],
                                    'max_features': [2, 3, 5],
                                    'min_samples_leaf': [3, 5, 10],
                                    'min_samples_split': [8, 12],
                                    'n_estimators': [100, 300, 500],
                                    'n_jobs': [4]
                                }

                    rf = RandomForestRegressor()
                    rf_grid = GridSearchCV(rf, param_grid, cv=cv_folds, scoring=scoring_cv)
                    rf_grid.fit(train_x, train_y)
                    print('Optimal parameters for random forest regressor = ', rf_grid.best_params_)
                    rf = rf_grid.best_estimator_
                    fitted_model = rf


                if cv and hypertuning==False:
                    cross_val_rmse = cross_val_score(estimator=model
                            , X=train_x
                            , y=train_y
                            , cv=cv_folds
                            , scoring=scoring_cv)

                    print(f'The average cross validation rmsle of the model is {-1*round(cross_val_rmse.mean(), 2)}')
                    print(cross_val_rmse)

                if estimator=="gb" or estimator=="rf" or estimator=="lgbm":
                    list_all_Features = train_x.columns.tolist()

                    # Feature importance
                    fi_df = pd.DataFrame({"Feature": list_all_Features, "Importance": fitted_model.feature_importances_}).sort_values(by="Importance", ascending=False)
                    fi_selected=fi_df[:15]
                    important_feature_list = fi_selected["Feature"].tolist()

                    if estimator=="gb":
                        fi_selected.to_excel(r'fi_selected.xlsx')
                        fig = plt.figure(figsize=(20,10))
                        feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)
                        feat_importances.nlargest(30).plot(kind='barh', color="green")
                        plt.title("Feature Importance from Gradient Boosting")
                        plt.savefig('Feature Importance from Gradient Boosting.png',  bbox_inches = "tight")

                    if estimator=="rf":
                        fi_selected.to_excel(r'fi_selected.xlsx')
                        fig = plt.figure(figsize=(20,20))
                        feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)
                        feat_importances.nlargest(30).plot(kind='barh', color="green")
                        plt.title("Feature Importance from Random Forest")
                        plt.savefig('Feature Importance from Random Forest.png',  bbox_inches = "tight")

                    if estimator=="lgbm":
                        fi_selected.to_excel(r'fi_selected.xlsx')
                        feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)
                        # SHAP explanations on the validation split created above.
                        explainer = shap.TreeExplainer(fitted_model)
                        shap_values = explainer.shap_values(valid_x)

                        shap.initjs()

                        force_plot = shap.force_plot(explainer.expected_value, shap_values[0,:], valid_x.iloc[0,:])
                        shap.save_html("index_force_plot.htm", force_plot)
                        force_plot_all = shap.force_plot(explainer.expected_value, shap_values, valid_x)
                        shap.save_html("index_force_plot_all.htm", force_plot_all)
                        plt.figure(figsize=(10,20))
                        shap.summary_plot(shap_values, valid_x, show=False)
                        plt.savefig('summary_plot.png',  bbox_inches = "tight")

                        top_features = feat_importances.nlargest(10)
                        top_features = top_features.reset_index()
                        top_features = top_features['index'].to_list()    

                        for i in top_features:
                            plt.figure(figsize=(20,20))
                            shap.dependence_plot(i, shap_values, valid_x, show=False)
                            plt.savefig(f"dep_plot_{i}.png",  bbox_inches = "tight")

                if final==False and target_type=="con":
                    yhat = fitted_model.predict(test_x).astype(float)
                    y_pred = list(yhat.astype(float))
                    y_true = list(test_y) 
                    print(np.sqrt(mean_squared_error(y_true, y_pred)))

                if final==False and target_type=="bin":
                    yhat = fitted_model.predict(test_x) 
                    y_pred = list(map(round, yhat)) 
                    cm = confusion_matrix(test_y, y_pred)  
                    print ("Confusion Matrix : \n", cm) 
                    # Bug fix: `prediction` was an undefined name; the computed
                    # predictions live in `y_pred`.
                    print('Test accuracy = ', accuracy_score(test_y, y_pred))
                    print('Test recall = ', recall_score(test_y, y_pred))

            # Bug fix: the return was nested inside the `target_type=="con"`
            # branch, so classification calls implicitly returned None.
            return fitted_model
# Example #7
import sklearn.datasets
from sklearn.model_selection import KFold

import optuna.integration.lightgbm as lgb


if __name__ == "__main__":
    # Binary-classification demo: let Optuna's stepwise tuner search the
    # LightGBM hyper-parameter space with 3-fold cross-validation.
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_set = lgb.Dataset(data, label=target)

    search_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    cv_tuner = lgb.LightGBMTunerCV(
        search_params,
        train_set,
        verbose_eval=100,
        early_stopping_rounds=100,
        folds=KFold(n_splits=3),
    )
    cv_tuner.run()

    # Report the winning configuration.
    print("Best score:", cv_tuner.best_score)
    best_params = cv_tuner.best_params
    print("Best params:", best_params)
    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))
# Example #8
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
import optuna.integration.lightgbm as lgb

# Synthetic binary-classification benchmark for Optuna's LightGBM CV tuner.
X, y = make_classification(10 ** 4, 100, shift=0.3, random_state=666)
train_set = lgb.Dataset(X, label=y)

tuning_params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "n_jobs": 16,
    "verbosity": -1,
}

# Stratified folds keep the class balance identical across the 5 splits.
cv_tuner = lgb.LightGBMTunerCV(
    tuning_params,
    train_set,
    verbose_eval=100,
    early_stopping_rounds=100,
    folds=StratifiedKFold(5),
    show_progress_bar=False,
)
cv_tuner.run()

# Bare expressions (notebook residue): evaluated but not printed as a script.
cv_tuner.best_score
cv_tuner.best_params