def test_ensemble_supports_cv_with_user_defined_transforms(self):
    """Cross-validate a VotingRegressor behind user-defined transforms.

    For each ``split_start`` mode, a standalone LightGbmRegressor and a
    three-learner VotingRegressor are cross-validated behind the same
    Indicator + Handler steps, and the ensemble's average L2 is asserted
    to be strictly lower than the single learner's.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
    lgbm_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ols_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ogd_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'shuffle': False,
        'normalize': 'Yes'
    }

    for split_start in ['before_transforms', 'after_transforms']:
        # Baseline: a single LightGBM learner behind the transforms.
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            LightGbmRegressor(**lgbm_args)
        ]

        cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
        l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)

        # The stream is re-created before the second CV run, as in the
        # original test; NOTE(review): the reason is not visible here.
        data = FileDataStream(path, schema)
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ]

        cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
        l2_avg_ensemble = cv_results['metrics_summary'].loc['Average',
                                                            'L2(avg)']

        # assertLess reports both values on failure, unlike assertTrue.
        self.assertLess(l2_avg_ensemble, l2_avg_lgbm)
def test_ensemble_supports_get_fit_info(self):
    """Check the fit-info entry reported for a trailing VotingRegressor.

    Builds a pipeline that rescales ``yy`` into ``new_y``, one-hot encodes
    the two text columns, drops ``yy`` and ends with a three-learner
    VotingRegressor, then verifies the last node returned by
    ``Pipeline.get_fit_info``.
    """
    df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                           workclass=['X', 'X', 'Y', 'Y', 'Y'],
                           yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

    # The label role points at 'new_y', which only exists after the
    # MeanVarianceScaler step below has produced it from 'yy'.
    col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

    r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
    r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
    r3 = LightGbmRegressor(normalize="Yes") << col_info

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        ColumnDropper() << 'yy',
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])

    info = pipeline.get_fit_info(df)

    # Last entry of the first element of the returned structure;
    # presumably the per-step list, so this is the ensemble node.
    last_info_node = info[0][-1]
    self.assertEqual(last_info_node['inputs'],
                     ['Feature:education,workclass', 'Label:new_y'])
    self.assertEqual(last_info_node['name'], 'VotingRegressor')
    # assertIsInstance reports the actual type when the check fails.
    self.assertIsInstance(last_info_node['operator'], VotingRegressor)
    self.assertEqual(last_info_node['outputs'], ['Score'])
    self.assertEqual(last_info_node['schema_after'], ['Score'])
    self.assertEqual(last_info_node['type'], 'regressor')
def test_lightgbmregressor(self):
    """Fit a default LightGbmRegressor on airquality and bound its R^2.

    The held-out R^2 is asserted to lie strictly between 0.32 and 0.33.
    """
    # No random_state is passed to train_test_split below, so the split
    # presumably relies on this global NumPy seed for reproducibility.
    np.random.seed(0)

    airquality = get_dataset("airquality").as_df().fillna(0)
    airquality = airquality[airquality.Ozone.notnull()]

    predictors = airquality.loc[:, airquality.columns != 'Ozone']
    target = airquality['Ozone']
    X_train, X_test, y_train, y_test = train_test_split(predictors, target)

    # Fit on the training part, then score the held-out part.
    model = LightGbmRegressor().fit(X_train, y_train)
    predicted = model.predict(X_test)

    r2 = r2_score(y_test, predicted)
    lower_bound = 0.32
    upper_bound = 0.33
    assert_greater(r2, lower_bound,
                   "should be greater than %s" % lower_bound)
    assert_less(r2, upper_bound,
                "sum should be less than %s" % upper_bound)
def test_ensemble_supports_user_defined_transforms(self):
    """Check a VotingRegressor placed after a RangeFilter transform.

    Two extra rows are added to a copy of ``test_df``. Each learner is
    first fitted and scored on its own; an 'Average' ensemble of fresh
    instances is then fitted behind ``RangeFilter(min=0, max=10)`` on
    ``c1``. The ensemble must return three rows whose scores match the
    mean of the individual predictions to five decimal places.
    """
    test2_df = test_df.copy(deep=True)
    # DataFrame.append was removed in pandas 2.0. pd.concat keeps the
    # same row order, index labels and column handling.
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    test2_df = pd.concat([test2_df, extra_rows])

    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    # The ensemble gets fresh, unfitted instances of the same learners.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)

    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    # NOTE(review): three rows are expected, presumably because the
    # added row with c1=11 falls outside the filter range - confirm
    # against the test_df fixture.
    self.assertEqual(len(result4), 3)

    for row in (0, 1, 2):
        average = (result1[row] + result2[row] + result3[row]) / 3
        self.assertAlmostEqual(average, result4.loc[row, 'Score'],
                               places=5)
def test_ensemble_with_average_and_median_combiner(self):
    """Compare 'Average' and 'Median' ensembles with per-learner output.

    Each learner is fitted and scored alone on ``test_df``. A
    VotingRegressor over fresh instances of the same learners must then
    reproduce the mean (to five places) and the exact median of those
    predictions for the first two rows.
    """
    learner_specs = [
        (OrdinaryLeastSquaresRegressor, olsrArgs),
        (OnlineGradientDescentRegressor, ogdArgs),
        (LightGbmRegressor, lgbmArgs),
    ]

    # Stand-alone predictions, one learner at a time.
    solo_results = []
    for learner_type, learner_args in learner_specs:
        learner = learner_type(**learner_args)
        learner.fit(train_df)
        solo_results.append(learner.predict(test_df))
    result1, result2, result3 = solo_results

    def ensemble_predictions(combiner):
        # Fit an ensemble of fresh learners and score test_df with it.
        estimators = [learner_type(**learner_args)
                      for learner_type, learner_args in learner_specs]
        pipeline = Pipeline([VotingRegressor(estimators=estimators,
                                             combiner=combiner)])
        pipeline.fit(train_df)
        return pipeline.predict(test_df)

    averaged = ensemble_predictions('Average')
    for row in (0, 1):
        expected_mean = (result1[row] + result2[row] + result3[row]) / 3
        self.assertAlmostEqual(expected_mean, averaged.loc[row, 'Score'],
                               places=5)

    medians = ensemble_predictions('Median')
    for row in (0, 1):
        # With three values the median is the middle one after sorting.
        ordered = sorted([result1.loc[row],
                          result2.loc[row],
                          result3.loc[row]])
        self.assertEqual(ordered[1], medians.loc[row, 'Score'])
def test_data_role_info_has_been_removed_from_estimators(self):
    """Check which role attributes exist after fitting a VotingRegressor.

    After the pipeline is fitted, neither the ensemble nor any of its
    three estimators may expose ``feature_column_name``, while each
    estimator must expose ``feature_column_name_`` (trailing underscore).
    """
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)
    vr = VotingRegressor(estimators=[r1, r2, r3], combiner='Average')

    pipeline = Pipeline([vr])
    pipeline.fit(train_df)

    self.assertFalse(hasattr(vr, 'feature_column_name'))

    # Same pair of checks for every estimator; the message names the
    # failing one, which three copy-pasted assertTrue calls did not.
    for index in (0, 1, 2):
        estimator = vr.estimators[index]
        self.assertFalse(
            hasattr(estimator, 'feature_column_name'),
            msg="estimator %d still has feature_column_name" % index)
        self.assertTrue(
            hasattr(estimator, 'feature_column_name_'),
            msg="estimator %d is missing feature_column_name_" % index)
def test_schema_sep_default(self):
    """Check the schema of a CSV-backed FileDataStream and train on it.

    A two-row frame is written to ``data.csv`` and read back with
    ``collapse=False`` and float32 numerics. The schema string is
    checked with and without the separator, then a small
    OneHotVectorizer + LightGbmRegressor pipeline is fitted and used to
    predict on the same stream.
    """
    frame = pandas.DataFrame(
        dict(real=[0.1, 2.2], text=['word', 'class'], y=[1, 3]))
    frame.to_csv('data.csv', index=False, header=True)

    stream = FileDataStream.read_csv('data.csv',
                                     collapse=False,
                                     numeric_dtype=numpy.float32)

    expected_plain = "col=real:R4:0 col=text:TX:1 col=y:R4:2 header=+"
    expected_with_sep = ("col=real:R4:0 col=text:TX:1 col=y:R4:2 "
                         "header=+ sep=,")

    assert str(stream.schema) == expected_plain
    assert stream.schema.to_string() == expected_plain
    assert stream.schema.to_string(add_sep=True) == expected_with_sep

    encoder = OneHotVectorizer(columns=['text'])
    regressor = LightGbmRegressor(min_data_per_leaf=1)
    experiment = Pipeline([encoder, regressor])
    experiment.fit(stream, 'y')

    predictions = experiment.predict(stream)
    assert predictions is not None
    assert len(predictions) > 0
def nimbus_training(X, y):
    """Fit a one-step LightGbmRegressor pipeline and return it.

    Args:
        X: training features, passed straight to ``Pipeline.fit``.
        y: training target, passed straight to ``Pipeline.fit``.

    Returns:
        The ``Pipeline`` object after ``fit`` has been called on it.
    """
    # Options forwarded to the Gbdt booster.
    booster_options = dict(
        l1_regularization=0.00000239,
        l2_regularization=0.0132,
        feature_fraction=0.98,
        subsample_fraction=0.99,
        subsample_frequency=5,
    )
    # Options forwarded to the LightGbmRegressor itself.
    learner_options = dict(
        random_state=26,
        evaluation_metric='MeanAbsoluteError',
        number_of_iterations=100,
        use_categorical_split=True,
    )

    regressor = LightGbmRegressor(booster=Gbdt(**booster_options),
                                  **learner_options)
    model = Pipeline([regressor])
    model.fit(X, y, verbose=100)
    return model
def test_split_start_with_transforms_with_presteps(self):
    """Inspect the dry-run graph for ``split_start='after_transforms'``.

    The CV is run with ``dry_run=True`` and the returned JSON is parsed.
    The MissingValueHandler node must appear among the top-level nodes
    but not among the CrossValidator node's sub-nodes, and a
    ModelCombiner node must be present at the top level.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    pipeline_steps = [
        Indicator() << {
            'Ozone_ind': 'Ozone',
            'Solar_R_ind': 'Solar_R'
        },
        Handler(replace_with='Mean') << {
            'Solar_R': 'Solar_R',
            'Ozone': 'Ozone'
        },
        LightGbmRegressor(feature=[
            'Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'
        ], label='Wind')
    ]

    results = CV(pipeline_steps).fit(data,
                                     split_start='after_transforms',
                                     dry_run=True)
    results = json.loads(results)

    node_names = [ep['Name'] for ep in results['nodes']]
    # First node whose name mentions the cross-validator.
    cv_node = [
        ep for ep in results['nodes']
        if 'Models.CrossValidator' in ep['Name']
    ][0]
    cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

    # assertIn / assertNotIn print the searched list on failure.
    self.assertIn('Transforms.MissingValueHandler', node_names)
    self.assertNotIn('Transforms.MissingValueHandler', cv_sub_node_names)
    self.assertIn('Transforms.ModelCombiner', node_names)
def test_metrics_evaluate_binary_from_filedatastream(self):
    """Check metrics from ``Pipeline.test`` on a FileDataStream.

    A OneHotVectorizer + LightGbmRegressor pipeline is fitted on the
    'infert' data set read as a stream, and the L1, L2 and loss-function
    averages it reports on the same stream are compared with fixed
    reference values to four decimals.
    """
    infert_path = get_dataset('infert').as_filepath()
    stream = FileDataStream.read_csv(infert_path)

    encoder = OneHotVectorizer(columns={'edu': 'education'})
    regressor = LightGbmRegressor(feature=['induced', 'edu'],
                                  label='age',
                                  number_of_threads=1)
    experiment = Pipeline([encoder, regressor])
    experiment.fit(stream, verbose=0)

    metrics, _ = experiment.test(stream)

    # TODO: debug fluctuations, and increase decimal precision on checks
    expected_l1 = 4.104164
    expected_l2 = 24.15286

    assert_almost_equal(metrics['L1(avg)'][0],
                        expected_l1,
                        decimal=4,
                        err_msg="L1 loss should be %s" % expected_l1)
    assert_almost_equal(metrics['L2(avg)'][0],
                        expected_l2,
                        decimal=4,
                        err_msg="L2(avg) should be %s" % expected_l2)
    # The loss-function average is checked against the L2 reference.
    assert_almost_equal(metrics['Loss-fn(avg)'][0],
                        expected_l2,
                        decimal=4,
                        err_msg="Loss-fn(avg)loss should be %s" % expected_l2)
###############################################################################
# LightGbmRegressor
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Build train and test sets from the built-in 'airquality' data set.
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

df = get_dataset("airquality").as_df().fillna(0)
df = df[df.Ozone.notnull()]

# 'Ozone' is the target; every other column is a predictor.
predictors = df.loc[:, df.columns != 'Ozone']
target = df['Ozone']
X_train, X_test, y_train, y_test = train_test_split(predictors, target)

# Fit on the training part and score the held-out part.
ftree = LightGbmRegressor().fit(X_train, y_train)
scores = ftree.predict(X_test)

# Report the coefficient of determination on the held-out part.
r_squared = r2_score(y_test, scores)
print('R-squared fit:', r_squared)
#RMSE: 1.629386 import lightgbm as lgb from lightgbm import LGBMRegressor regressor2 = LGBMRegressor(random_state=0) #RMSE: 1.673168 from catboost import CatBoostRegressor regressor3 = CatBoostRegressor(iterations=2000, random_state = 0, verbose = 200) import xgboost as xgb from xgboost import XGBRegressor regressor4 = xgb.XGBRegressor() from nimbusml.ensemble import LightGbmRegressor regressor5 = LightGbmRegressor(random_state=0) from sklearn.ensemble import RandomForestRegressor regressor6 = RandomForestRegressor(n_estimators = 1000, random_state = 0) from sklearn.ensemble import GradientBoostingRegressor regressor7 = GradientBoostingRegressor(random_state=0) from sklearn import linear_model regressor8 = linear_model.BayesianRidge() from sklearn.svm import SVR regressor9 = SVR(kernel = 'rbf') from sklearn.neural_network import MLPRegressor regressor10 = MLPRegressor(random_state=0, max_iter=1000)
'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), 'IidSpikeDetector': IidSpikeDetector(columns=['F0']), 'IidChangePointDetector': IidChangePointDetector(columns=['F0']), 'SsaSpikeDetector': SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
# Load the 'infert' data set as a file-backed stream and preview it.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
preview = data.head()
print(preview)
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Training pipeline: one-hot encode 'education' into 'edu', then regress
# 'age' on 'induced' and 'edu'.
encoder = OneHotVectorizer(columns={'edu': 'education'})
regressor = LightGbmRegressor(feature=['induced', 'edu'],
                              label='age',
                              booster=Gbdt(reg_lambda=0.1))
pipeline = Pipeline([encoder, regressor])

# Fit, then evaluate on the same stream and keep the per-row scores.
# TODO: Replace with CV
trained = pipeline.fit(data)
metrics, predictions = trained.test(data, output_scores=True)

# Show the first predictions.
print(predictions.head())
#       Score
# 0  34.008430
# 1  34.008430
# 2  33.160175
# 3  33.160175
# 4  32.472412
categorical_columns = [ 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'ethnicity', 'sex', 'native-country-region' ] file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \ 'col=education:TX:2 col=marital-status:TX:3 ' \ 'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \ 'col=sex:TX:7 col=native-country-region:TX:8 header+' label_column = 'label' learners = [ FastForestBinaryClassifier(), FastForestRegressor(), FastTreesBinaryClassifier(), FastTreesRegressor(), FastTreesTweedieRegressor(), LightGbmRegressor(), LightGbmBinaryClassifier(), AveragedPerceptronBinaryClassifier(), FastLinearBinaryClassifier(), FastLinearClassifier(), FastLinearRegressor(), LogisticRegressionBinaryClassifier(), LogisticRegressionClassifier(), OnlineGradientDescentRegressor(), SgdBinaryClassifier(), # SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor() ] learners_not_supported = [
'minimum_example_count_per_leaf': 1, 'normalize': 'Yes' } if show_individual_predictions: r1 = OrdinaryLeastSquaresRegressor(**olsrArgs) r1.fit(train_df) result = r1.predict(test_df) print(result) r2 = OnlineGradientDescentRegressor(**ogdArgs) r2.fit(train_df) result = r2.predict(test_df) print(result) r3 = LightGbmRegressor(**lgbmArgs) r3.fit(train_df) result = r3.predict(test_df) print(result) # Perform a prediction using an ensemble # of all three of the above predictors. r1 = OrdinaryLeastSquaresRegressor(**olsrArgs) r2 = OnlineGradientDescentRegressor(**ogdArgs) r3 = LightGbmRegressor(**lgbmArgs) pipeline = Pipeline( [VotingRegressor(estimators=[r1, r2, r3], combiner='Average')]) pipeline.fit(train_df) result = pipeline.predict(test_df)
#SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor(), OneVsRestClassifier(FastLinearBinaryClassifier()), GamRegressor(), GamBinaryClassifier(), PcaAnomalyDetector(), FactorizationMachineBinaryClassifier(), KMeansPlusPlus(n_clusters=2), NaiveBayesClassifier(), FastForestBinaryClassifier(number_of_trees=2), FastForestRegressor(number_of_trees=2), FastTreesBinaryClassifier(number_of_trees=2), FastTreesRegressor(number_of_trees=2), FastTreesTweedieRegressor(number_of_trees=2), LightGbmRegressor(number_of_iterations=2), LightGbmClassifier(), LightGbmBinaryClassifier(number_of_iterations=2) ] learners_not_supported = [ #PcaTransformer(), # REVIEW: crashes ] class TestModelSummary(unittest.TestCase): def test_model_summary(self): for learner in learners: pipeline = Pipeline( [OneHotVectorizer() << categorical_columns, learner])
NOBINARY_CHECKS = [ 'check_estimator_sparse_data', 'check_dtype_object', 'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmRanker': LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TensorFlowScorer': TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples', 'frozen_saved_model.pb'), columns={'c': ['a', 'b']}), } MULTI_OUTPUT_EX = [ 'FastLinearClassifier', 'FastLinearRegressor', 'LogisticRegressionClassifier', 'FastTreesRegressor',