Example #1
    def test_ensemble_supports_cv_with_user_defined_transforms(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        for split_start in ['before_transforms', 'after_transforms']:
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                LightGbmRegressor(**lgbm_args)
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            r1 = OrdinaryLeastSquaresRegressor(**ols_args)
            r2 = OnlineGradientDescentRegressor(**ogd_args)
            r3 = LightGbmRegressor(**lgbm_args)

            data = FileDataStream(path, schema)
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
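
A quick exploratory follow-up (a sketch, assuming the cv_results object from the last loop iteration above is still in scope); only the 'metrics_summary' table is asserted on, so this just makes the indexed row visible:

# Sketch: the summary table indexed above; the 'Average' row is the one
# both branches of this test read their L2(avg) values from.
print(cv_results['metrics_summary'])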
Example #2
    def test_ensemble_supports_get_fit_info(self):
        df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

        r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
        r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
        r3 = LightGbmRegressor(normalize="Yes") << col_info

        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            ColumnDropper() << 'yy',
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])

        info = pipeline.get_fit_info(df)

        last_info_node = info[0][-1]
        self.assertEqual(last_info_node['inputs'],
                         ['Feature:education,workclass', 'Label:new_y'])
        self.assertEqual(last_info_node['name'], 'VotingRegressor')
        self.assertTrue(isinstance(last_info_node['operator'], VotingRegressor))
        self.assertEqual(last_info_node['outputs'], ['Score'])
        self.assertEqual(last_info_node['schema_after'], ['Score'])
        self.assertEqual(last_info_node['type'], 'regressor')
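
get_fit_info's first element is a list with one dict per pipeline step, and the test only inspects the last one; a small sketch that walks all of them, assuming the `info` object from above:

# Sketch: dump every step's name and outputs, not just the last node.
for node in info[0]:
    print(node['name'], '->', node.get('outputs'))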
Example #3
    def test_lightgbmregressor(self):
        np.random.seed(0)

        df = get_dataset("airquality").as_df().fillna(0)
        df = df[df.Ozone.notnull()]

        X_train, X_test, y_train, y_test = train_test_split(
            df.loc[:, df.columns != 'Ozone'], df['Ozone'])

        # Train a model and score
        ftree = LightGbmRegressor().fit(X_train, y_train)
        scores = ftree.predict(X_test)

        r2 = r2_score(y_test, scores)
        assert_greater(r2, 0.32, "r2 should be greater than %s" % 0.32)
        assert_less(r2, 0.33, "r2 should be less than %s" % 0.33)
Example #4
    def test_ensemble_supports_user_defined_transforms(self):
        test2_df = test_df.copy(deep=True)
        test2_df = pd.concat(
            [test2_df, pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})],
            ignore_index=True)

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test2_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test2_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test2_df)

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([
            RangeFilter(min=0, max=10, columns='c1'),
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test2_df)

        self.assertEqual(len(result4), 3)

        average1 = (result1[0] + result2[0] + result3[0]) / 3
        average2 = (result1[1] + result2[1] + result3[1]) / 3
        average3 = (result1[2] + result2[2] + result3[2]) / 3
        self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
        self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)
        self.assertAlmostEqual(average3, result4.loc[2, 'Score'], places=5)
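
This test and several below rely on module-level fixtures (train_df, test_df, olsrArgs, ogdArgs, lgbmArgs) that the excerpt omits; a minimal sketch of plausible definitions follows, with every concrete value an illustrative assumption (the tail of what is presumably lgbmArgs is visible in Example #16):

import pandas as pd

# Illustrative fixtures only; the real test module defines its own values.
train_df = pd.DataFrame({'c1': [2, 3, 4, 5, 6, 7],
                         'c2': [4, 5, 6, 7, 8, 9]})
test_df = pd.DataFrame({'c1': [2.5, 4.5], 'c2': [1, 1]})

olsrArgs = {'feature': ['c1'], 'label': 'c2', 'normalize': 'Yes'}
ogdArgs = {'feature': ['c1'], 'label': 'c2', 'shuffle': False,
           'normalize': 'Yes'}
lgbmArgs = {'feature': ['c1'], 'label': 'c2',
            'minimum_example_count_per_leaf': 1, 'normalize': 'Yes'}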
Example #5
    def test_ensemble_with_average_and_median_combiner(self):
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test_df)

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([VotingRegressor(estimators=[r1, r2, r3], combiner='Average')])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test_df)

        average1 = (result1[0] + result2[0] + result3[0]) / 3
        average2 = (result1[1] + result2[1] + result3[1]) / 3
        self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
        self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([VotingRegressor(estimators=[r1, r2, r3], combiner='Median')])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test_df)

        median1 = sorted([result1.loc[0], result2.loc[0], result3.loc[0]])[1]
        median2 = sorted([result1.loc[1], result2.loc[1], result3.loc[1]])[1]

        self.assertEqual(median1, result4.loc[0, 'Score'])
        self.assertEqual(median2, result4.loc[1, 'Score'])
Example #6
    def test_data_role_info_has_been_removed_from_estimators(self):
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)
        vr = VotingRegressor(estimators=[r1, r2, r3], combiner='Average')

        pipeline = Pipeline([vr])
        pipeline.fit(train_df)

        self.assertTrue(not hasattr(vr, 'feature_column_name'))

        self.assertTrue(not hasattr(vr.estimators[0], 'feature_column_name'))
        self.assertTrue(hasattr(vr.estimators[0], 'feature_column_name_'))

        self.assertTrue(not hasattr(vr.estimators[1], 'feature_column_name'))
        self.assertTrue(hasattr(vr.estimators[1], 'feature_column_name_'))

        self.assertTrue(not hasattr(vr.estimators[2], 'feature_column_name'))
        self.assertTrue(hasattr(vr.estimators[2], 'feature_column_name_'))
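
The trailing underscore marks the role info captured at fit time; a one-line sketch reading it back from the first base estimator (the printed value depends on the training fixtures, so nothing is asserted):

# Post-fit role info survives under the trailing-underscore attribute.
print(vr.estimators[0].feature_column_name_)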
Example #7
 def test_schema_sep_default(self):
     data = pandas.DataFrame(
         dict(real=[0.1, 2.2], text=['word', 'class'], y=[1, 3]))
     data.to_csv('data.csv', index=False, header=True)
     ds = FileDataStream.read_csv('data.csv',
                                  collapse=False,
                                  numeric_dtype=numpy.float32)
     assert str(
         ds.schema) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 header=+"
     assert ds.schema.to_string() == "col=real:R4:0 col=text:TX:1 " \
                                     "col=y:R4:2 header=+"
     assert ds.schema.to_string(
         add_sep=True) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 " \
                          "header=+ sep=,"
     exp = Pipeline([
         OneHotVectorizer(columns=['text']),
         LightGbmRegressor(min_data_per_leaf=1)
     ])
     exp.fit(ds, 'y')
     pred = exp.predict(ds)
     assert pred is not None
     assert len(pred) > 0
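
add_sep=True is what makes the separator explicit in the schema string; a small sketch, assuming the data.csv written above is still on disk, that passes sep at read time instead:

# Sketch: an explicit sep=',' yields the same parse as the default.
ds2 = FileDataStream.read_csv('data.csv', sep=',',
                              collapse=False,
                              numeric_dtype=numpy.float32)
print(ds2.schema)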
Example #8
# Imports assumed by this snippet (not shown in the original excerpt).
from nimbusml import Pipeline
from nimbusml.ensemble import LightGbmRegressor
from nimbusml.ensemble.booster import Gbdt


def nimbus_training(X, y):
    params = {
        "model": {
            "random_state": 26,
            "evaluation_metric": 'MeanAbsoluteError',
            "number_of_iterations": 100,
            "use_categorical_split": True
        },
        "booster": {
            "l1_regularization": 0.00000239,
            "l2_regularization": 0.0132,
            "feature_fraction": 0.98,
            "subsample_fraction": 0.99,
            "subsample_frequency": 5,
        }
    }
    model = Pipeline([
        LightGbmRegressor(booster=Gbdt(**params["booster"]), **params["model"])
    ])

    model.fit(X, y, verbose=100)

    return model
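
A possible way to exercise the helper; the synthetic X and y below are illustrative stand-ins, not data from the original source:

import numpy as np
import pandas as pd

# Hypothetical inputs: a small numeric feature frame and a target series.
rng = np.random.RandomState(26)
X = pd.DataFrame({'f0': rng.rand(200).astype(np.float32),
                  'f1': rng.rand(200).astype(np.float32)})
y = pd.Series((3 * X['f0'] - 2 * X['f1']).astype(np.float32), name='target')

model = nimbus_training(X, y)
print(model.predict(X).head())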
Example #9
    def test_split_start_with_transforms_with_presteps(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        pipeline_steps = [
            Indicator() << {
                'Ozone_ind': 'Ozone',
                'Solar_R_ind': 'Solar_R'
            },
            Handler(replace_with='Mean') << {
                'Solar_R': 'Solar_R',
                'Ozone': 'Ozone'
            },
            LightGbmRegressor(feature=[
                'Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'
            ],
                              label='Wind')
        ]

        results = CV(pipeline_steps).fit(data,
                                         split_start='after_transforms',
                                         dry_run=True)
        results = json.loads(results)

        node_names = [ep['Name'] for ep in results['nodes']]
        cv_node = [
            ep for ep in results['nodes']
            if 'Models.CrossValidator' in ep['Name']
        ][0]
        cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

        self.assertTrue('Transforms.MissingValueHandler' in node_names)
        self.assertTrue(
            'Transforms.MissingValueHandler' not in cv_sub_node_names)
        self.assertTrue('Transforms.ModelCombiner' in node_names)
Example #10
 def test_metrics_evaluate_binary_from_filedatastream(self):
     path = get_dataset('infert').as_filepath()
     data = FileDataStream.read_csv(path)
     e = Pipeline([
         OneHotVectorizer(columns={'edu': 'education'}),
         LightGbmRegressor(feature=['induced', 'edu'],
                           label='age',
                           number_of_threads=1)
     ])
     e.fit(data, verbose=0)
     metrics, _ = e.test(data)
     # TODO: debug fluctuations, and increase decimal precision on checks
     assert_almost_equal(metrics['L1(avg)'][0],
                         4.104164,
                         decimal=4,
                         err_msg="L1 loss should be %s" % 4.104164)
     assert_almost_equal(metrics['L2(avg)'][0],
                         24.15286,
                         decimal=4,
                         err_msg="L2(avg) should be %s" % 24.15286)
     assert_almost_equal(metrics['Loss-fn(avg)'][0],
                         24.15286,
                         decimal=4,
                         err_msg="Loss-fn(avg) should be %s" % 24.15286)
###############################################################################
# LightGbmRegressor
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

df = get_dataset("airquality").as_df().fillna(0)
df = df[df.Ozone.notnull()]

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# train a model and score
ftree = LightGbmRegressor().fit(X_train, y_train)
scores = ftree.predict(X_test)

# evaluate the model
print('R-squared fit:', r2_score(y_test, scores))
Example #12
# RMSE: 1.629386

from lightgbm import LGBMRegressor
regressor2 = LGBMRegressor(random_state=0)
# RMSE: 1.673168

from catboost import CatBoostRegressor
regressor3 = CatBoostRegressor(iterations=2000, random_state=0, verbose=200)

from xgboost import XGBRegressor
regressor4 = XGBRegressor()

from nimbusml.ensemble import LightGbmRegressor
regressor5 = LightGbmRegressor(random_state=0)

from sklearn.ensemble import RandomForestRegressor
regressor6 = RandomForestRegressor(n_estimators=1000, random_state=0)

from sklearn.ensemble import GradientBoostingRegressor
regressor7 = GradientBoostingRegressor(random_state=0)

from sklearn import linear_model
regressor8 = linear_model.BayesianRidge()

from sklearn.svm import SVR
regressor9 = SVR(kernel='rbf')

from sklearn.neural_network import MLPRegressor
regressor10 = MLPRegressor(random_state=0, max_iter=1000)
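
The # RMSE comments above suggest each regressor was pushed through one shared fit/predict/score loop; a minimal sketch of such a loop, assuming pre-split X_train, X_test, y_train, y_test arrays that this excerpt does not define:

import numpy as np
from sklearn.metrics import mean_squared_error

# Hypothetical scoring loop over a few of the regressors defined above.
for name, reg in [('lightgbm', regressor2),
                  ('xgboost', regressor4),
                  ('random_forest', regressor6)]:
    reg.fit(X_train, y_train)
    rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))
    print('%s RMSE: %f' % (name, rmse))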
Example #13
NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
    'IidSpikeDetector':
    IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
    IidChangePointDetector(columns=['F0']),
    'SsaSpikeDetector':
    SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
Example #14
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmRegressor(feature=['induced', 'edu'],
                      label='age',
                      booster=Gbdt(reg_lambda=0.1))
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0  34.008430
# 1  34.008430
# 2  33.160175
# 3  33.160175
# 4  32.472412
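
The metrics frame returned alongside the predictions can be inspected the same way; its columns include L1(avg) and L2(avg), the same names indexed in Example #10:

# The companion metrics frame from pipeline.fit(data).test(data) above.
print(metrics)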
Example #15
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'ethnicity', 'sex', 'native-country-region'
]
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \
              'col=education:TX:2 col=marital-status:TX:3 ' \
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
Example #16
    'minimum_example_count_per_leaf': 1,
    'normalize': 'Yes'
}

if show_individual_predictions:
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result = r1.predict(test_df)
    print(result)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result = r2.predict(test_df)
    print(result)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result = r3.predict(test_df)
    print(result)

# Perform a prediction using an ensemble
# of all three of the above predictors.

r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
r2 = OnlineGradientDescentRegressor(**ogdArgs)
r3 = LightGbmRegressor(**lgbmArgs)
pipeline = Pipeline(
    [VotingRegressor(estimators=[r1, r2, r3], combiner='Average')])

pipeline.fit(train_df)
result = pipeline.predict(test_df)
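
A short follow-up to inspect the ensemble output; the combined prediction lands in the Score column, which Examples #4 and #5 verify equals the average of the three base predictions:

# The averaged ensemble prediction is reported in the 'Score' column.
print(result)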
Example #17
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2),
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]

learners_not_supported = [
    # PcaTransformer(),  # REVIEW: crashes
]


class TestModelSummary(unittest.TestCase):

    def test_model_summary(self):
        for learner in learners:
            pipeline = Pipeline(
                [OneHotVectorizer() << categorical_columns, learner])
Example #18
NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TensorFlowScorer':
    TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples',
                                        'frozen_saved_model.pb'),
                     columns={'c': ['a', 'b']}),
}

MULTI_OUTPUT_EX = [
    'FastLinearClassifier', 'FastLinearRegressor',
    'LogisticRegressionClassifier', 'FastTreesRegressor',