def test_lightgbm_classifier():
    """RFE feature selection with a LightGBM binary classifier on BankNote.

    Checks that RFE keeps exactly ``n_features - 5`` variables and that the
    selected variables (in order) match the expected golden list.
    """
    # Classification learner factory.
    # Fixed: original had a duplicated assignment (`params = params = {...}`).
    params = {
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
        'boosting_type': 'gbdt',
    }
    # split_random_seed pins the train/validation split so the selection is deterministic.
    classifier_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
        early_stopping_rounds=5, verbose_eval=50, split_random_seed=42)
    fs = RFE(classifier_cls)
    dataset = BankNote()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', \
        exclude=[target_column])

    # Feature selection: keep all but 5 features (at least 1).
    x_columns = [col for col in features_df.columns if col != target_column]
    x_df = features_df[x_columns]
    y_df = features_df[[target_column]]
    n_vars = max(x_df.shape[1] - 5, 1)
    # Return value of fit is not needed; only fs.selected_variables is asserted.
    fs.fit(x_df, y_df, n_vars)

    # Assertions
    assert len(fs.selected_variables) == n_vars
    assert fs.selected_variables == ['Variance', 'Kurtosis', 'Skewness.ABS(* - MEAN(*))', \
        'Skewness', 'Variance.ABS(* - MEAN(*))', 'Entropy', 'Variance.ABS(* - Q25(*))', \
        'Kurtosis.ABS(* - MEDIAN(*))', 'Kurtosis.ABS(* - Q75(*))', 'Skewness.ABS(* - MEDIAN(*))', \
        'Kurtosis.ABS(* - Q25(*))', 'Kurtosis.ABS(* - MEAN(*))', 'Variance.ABS(* - MEDIAN(*))', \
        'Entropy.ABS(* - MEDIAN(*))', 'Entropy.ABS(* - Q25(*))']
def test_lightgbm_regression():
    """Boruta feature selection with a LightGBM RMSE regressor on Abalone.

    Verifies that Boruta selects exactly 13 variables and that the selected
    variables (in order) match the expected golden list.
    """
    lgbm_params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'learning_rate': 0.1,
        'verbose': -1,
    }
    # Deterministic split (seed 0) so Boruta's selection is reproducible.
    regressor_cls = get_lightgbm_learner_learning_api(lgbm_params, num_boost_round=2000, \
        early_stopping_rounds=5, split_random_seed=0)
    selector = Boruta(regressor_cls)

    data = Abalone()
    label = data.y_column

    # Features generation
    features_df = data.df.kxy.generate_features(entity=None, max_lag=None, \
        entity_name='*', exclude=[label])

    # Feature selection
    predictors = list(features_df.columns)
    predictors.remove(label)
    selector.fit(features_df[predictors], features_df[[label]])

    picked = selector.selected_variables
    assert len(picked) == 13
    assert picked == ['Shucked weight.ABS(* - Q75(*))', 'Shucked weight.ABS(* - MEDIAN(*))', \
        'Shucked weight.ABS(* - MEAN(*))', 'Shucked weight', 'Shell weight.ABS(* - MEAN(*))', \
        'Shell weight', 'Shucked weight.ABS(* - Q25(*))', 'Sex_I', 'Diameter', 'Whole weight', \
        'Shell weight.ABS(* - Q75(*))', 'Whole weight.ABS(* - MEAN(*))', 'Height']
def test_non_additive_lean_boosted_classifier():
    """LeanML fit (non-additive learning) with LightGBM on BankNote.

    Asserts the reported testing accuracy string and the exact set/order of
    selected variables.
    """
    # Classification
    booster_params = {
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
        'boosting_type': 'gbdt',
    }
    learner_func = get_lightgbm_learner_learning_api(booster_params, \
        num_boost_round=10000, split_random_seed=42)

    data = BankNote()
    label = data.y_column

    # Features generation
    features_df = data.df.kxy.generate_features(entity=None, max_lag=None, \
        entity_name='*', exclude=[label])
    # The target must be integer-coded for classification.
    features_df[label] = features_df[label].astype(int)

    # Model building: stop as soon as performance degrades once.
    results = features_df.kxy.fit(label, learner_func, \
        problem_type='classification', additive_learning=False, \
        return_scores=True, n_down_perf_before_stop=1)

    assert results['Testing Accuracy'] == '0.964'
    assert results['Selected Variables'] == ['Variance', 'Skewness.ABS(* - Q25(*))', \
        'Kurtosis', 'Skewness', 'Entropy']
def test_non_additive_lean_boosted_regressor():
    """LeanML fit (non-additive learning) with LightGBM on Abalone.

    Asserts the reported testing R-squared string and the exact set/order of
    selected variables.
    """
    # Regression
    booster_params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }
    learner_func = get_lightgbm_learner_learning_api(booster_params, \
        num_boost_round=10000, split_random_seed=42)

    data = Abalone()
    label = data.y_column

    # Features generation
    features_df = data.df.kxy.generate_features(entity=None, max_lag=None, \
        entity_name='*', exclude=[label])

    # Model building: stop as soon as performance degrades once.
    results = features_df.kxy.fit(label, learner_func, \
        problem_type='regression', additive_learning=False, \
        return_scores=True, n_down_perf_before_stop=1)

    assert results['Testing R-Squared'] == '0.554'
    assert results['Selected Variables'] == ['Shell weight', 'Shucked weight', 'Whole weight', \
        'Shell weight.ABS(* - Q25(*))', 'Viscera weight.ABS(* - MEDIAN(*))', \
        'Viscera weight.ABS(* - MEAN(*))', 'Height', 'Length', 'Diameter', 'Sex_I', \
        'Shucked weight.ABS(* - MEDIAN(*))', 'Diameter.ABS(* - MEDIAN(*))', \
        'Viscera weight.ABS(* - Q75(*))', 'Viscera weight.ABS(* - Q25(*))', \
        'Diameter.ABS(* - Q25(*))', 'Sex_M', 'Sex_F']
def lightgbm_classification_benchmark():
    """Run the LeanML vs Boruta vs RFE classification benchmark with LightGBM."""
    learner_func = get_lightgbm_learner_learning_api(
        {'objective': 'binary', 'metric': ['auc', 'binary_logloss']},
        num_boost_round=10000, early_stopping_rounds=50, verbose_eval=50)
    classification_benchmark(learner_func, 'lightgbm')
def lightgbm_regression_benchmark():
    """Run the LeanML vs Boruta vs RFE regression benchmark with LightGBM."""
    booster_params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'verbose': -1,
    }
    learner_func = get_lightgbm_learner_learning_api(booster_params, \
        num_boost_round=10000, early_stopping_rounds=50, verbose_eval=50)
    regression_benchmark(learner_func, 'lightgbm')
def test_lean_boosted_lightgbm_learning_regressor():
    """Round-trip save/load of a single LightGBM model trained via LeanML fit.

    Trains on Abalone, saves the first underlying model to disk, reloads it
    through the learner factory's ``path=`` argument, and checks predictions
    are unchanged.
    """
    # Regression learner factory.
    # Fixed: original had a duplicated assignment (`params = params = {...}`).
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }
    lightgbm_regressor_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
        early_stopping_rounds=50, verbose_eval=50, split_random_seed=42)
    dataset = Abalone()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', \
        exclude=[target_column])

    # Model building
    results = features_df.kxy.fit(target_column, lightgbm_regressor_cls, \
        problem_type='regression', additive_learning=True, return_scores=True, \
        n_down_perf_before_stop=1)
    model = results['predictor'].models[0]
    feature_columns = results['Selected Variables']
    x = features_df[feature_columns].values
    predictions = model.predict(x)

    # NOTE(review): 'lightbm' (sic) in the cache filename is a typo, kept for
    # backward compatibility with existing cache files.
    path = '../kxy/misc/cache/%s-%s.sav' % (dataset.name, 'lightbm-learning-api-regressor')
    model.save(path)
    loaded_model = lightgbm_regressor_cls(path=path)
    loaded_predictions = loaded_model.predict(x)

    assert np.allclose(predictions, loaded_predictions)
def test_leanml_predictor_lightgbm():
    """Round-trip save/load of a whole LeanMLPredictor backed by LightGBM.

    Trains on Abalone, saves the predictor, reloads it via
    ``LeanMLPredictor.load`` and checks predictions are unchanged.
    """
    # Regression learner factory.
    # Fixed: original had a duplicated assignment (`params = params = {...}`).
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }
    lightgbm_regressor_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
        early_stopping_rounds=50, verbose_eval=50, split_random_seed=42)
    dataset = Abalone()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', \
        exclude=[target_column])

    # Model building
    results = features_df.kxy.fit(target_column, lightgbm_regressor_cls, \
        problem_type='regression')
    feature_columns = results['Selected Variables']
    predictor = results['predictor']
    predictions = predictor.predict(features_df[feature_columns])

    # Save then reload the full predictor; loading needs the learner factory
    # to rebuild the underlying models.
    path = '../kxy/misc/cache/%s-%s.sav' % (dataset.name, 'lightbm-learning-api-regressor')
    predictor.save(path)
    loaded_predictor = LeanMLPredictor.load(path, lightgbm_regressor_cls)
    loaded_predictions = loaded_predictor.predict(features_df[feature_columns])

    assert np.allclose(predictions, loaded_predictions)
# function, the class should also define a save(self, path) method to # save a model to disk, and a load(cls, path) class method to load a # saved model from disk. # See kxy.learning.base_learners for helper functions that allow you # create learner functions that return instances of popular predictive # models (e.g. lightgbm, xgboost, sklearn, tensorflow, pytorch models # etc.). from kxy.learning import get_lightgbm_learner_learning_api params = { 'objective': 'binary', 'metric': ['auc', 'binary_logloss'], } lightgbm_learner_func = get_lightgbm_learner_learning_api(params, \ num_boost_round=10000, early_stopping_rounds=50, verbose_eval=50, \ split_random_seed=0) # 5. Fit a LightGBM classifier wrapped around LeanML feature selection results = train_features_df.kxy.fit(target_column, \ lightgbm_learner_func, problem_type='classification', \ feature_selection_method='leanml') predictor = results['predictor'] # 6. Make predictions from a dataframe of test features test_predictions_df = predictor.predict(test_features_df) # 7. Compute out-of-sample accuracy and AUC from sklearn.metrics import accuracy_score, roc_auc_score accuracy = accuracy_score( test_labels_df[target_column].values, \