Example #1
0
def check_cv(pipeline,
             X,
             y=None,
             n_folds=2,
             groups=None,
             split_start='before_transforms',
             expected_metrics=None,
             **params):
    """Cross-validate ``pipeline`` and validate the results.

    Builds a ``CV`` object from ``pipeline``, fits it on ``X``/``y`` and
    passes the outcome to ``check_cv_results``.

    :param pipeline: pipeline (or steps) handed to ``CV``.
    :param X: training data passed to ``CV.fit``.
    :param y: optional labels passed to ``CV.fit``.
    :param n_folds: number of folds, forwarded as ``cv``.
    :param groups: forwarded unchanged to ``CV.fit``.
    :param split_start: forwarded to ``CV.fit``. The special value
        ``'try_all'`` fits once for ``'after_transforms'``,
        ``'before_transforms'`` and every positive and negative node index
        of ``pipeline.nodes``.
    :param expected_metrics: mapping handed to ``check_cv_results``;
        ``None`` (the default) means an empty dict.
    :param params: extra keyword arguments forwarded to ``CV.fit``. They
        are NOT forwarded in ``'try_all'`` mode, where a ``graph_id``
        derived from the split value is passed instead.
    :return: the results of the (last) ``CV.fit`` call.
    """
    # A fresh dict per call avoids sharing one mutable default object
    # between invocations.
    if expected_metrics is None:
        expected_metrics = {}

    cv = CV(pipeline)

    if split_start != 'try_all':
        results = cv.fit(X,
                         y,
                         cv=n_folds,
                         groups=groups,
                         split_start=split_start,
                         **params)
        check_cv_results(cv._learner_type, results, n_folds, expected_metrics)
        return results

    # 'try_all': exercise both named split points plus every valid
    # positive and negative index into the pipeline's nodes.
    len_pipeline = len(pipeline.nodes)
    values_to_test = ['after_transforms', 'before_transforms']
    values_to_test.extend(range(len_pipeline))
    values_to_test.extend(range(-len_pipeline, 0))

    for s in values_to_test:
        graph_id = '_split_start={}'.format(str(s))
        results = cv.fit(X,
                         y,
                         cv=n_folds,
                         groups=groups,
                         split_start=s,
                         graph_id=graph_id)
        check_cv_results(cv._learner_type, results, n_folds,
                         expected_metrics)

    return results
Example #2
0
    def test_unseen_classes(self):
        """CV.fit must raise ``Warning`` when folds miss some classes."""
        # Build a dataset such that the cv splits miss some of the classes:
        # the labels from index 95 onward are overwritten with 0..4.
        features = random_df()
        labels = random_series()
        labels[95:] = range(5)

        failure_msg = ("CV didn't raise Warning exception "
                       "b/c of minority class issue")
        with self.assertRaises(Warning, msg=failure_msg):
            CV([FastLinearClassifier()]).fit(features, labels, cv=3)
Example #3
0
    def test_ensemble_supports_cv_with_user_defined_transforms(self):
        """Check CV of a VotingRegressor behind user-defined transforms.

        For each ``split_start`` mode, cross-validates a lone
        LightGbmRegressor and then an averaging VotingRegressor (OLS, OGD
        and LightGBM) behind the same Indicator/Handler transforms, and
        asserts that the ensemble's average L2 is lower than LightGBM's.
        """
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        feature_cols = ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp']

        def learner_args(**overrides):
            # Every learner shares features/label/normalize; each gets its
            # own dict and its own copy of the feature list.
            args = {
                'feature': list(feature_cols),
                'label': 'Wind',
                'normalize': 'Yes'
            }
            args.update(overrides)
            return args

        lgbm_args = learner_args()
        ols_args = learner_args()
        ogd_args = learner_args(shuffle=False)

        def average_l2(stream, learner, split_start):
            # Fresh transform instances for every CV run, followed by the
            # learner under test.
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                learner
            ]
            cv_results = CV(pipeline_steps).fit(stream,
                                                split_start=split_start)
            return cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        for split_start in ['before_transforms', 'after_transforms']:
            l2_avg_lgbm = average_l2(data,
                                     LightGbmRegressor(**lgbm_args),
                                     split_start)

            r1 = OrdinaryLeastSquaresRegressor(**ols_args)
            r2 = OnlineGradientDescentRegressor(**ogd_args)
            r3 = LightGbmRegressor(**lgbm_args)

            # The stream is re-created before the ensemble run, as in the
            # original sequence of operations.
            data = FileDataStream(path, schema)
            ensemble = VotingRegressor(estimators=[r1, r2, r3],
                                       combiner='Average')
            l2_avg_ensemble = average_l2(data, ensemble, split_start)

            # assertLess reports both values when the comparison fails.
            self.assertLess(l2_avg_ensemble, l2_avg_lgbm)
Example #4
0
 def test_unsupported_pipelines(self):
     """Building CV from a pipeline ending in OneHotVectorizer must fail.

     Each candidate node is wrapped in a single-step Pipeline, and
     constructing ``CV`` from it is expected to raise ``ValueError``.
     """
     template = ("CV doesn't support pipeline ending with {}, but "
                 "didn't raise exception")
     for node in [OneHotVectorizer()]:
         candidate = Pipeline([node])
         failure_msg = template.format(candidate._last_node_type())
         with self.assertRaises(ValueError, msg=failure_msg):
             CV(candidate)
Example #5
0
 def check_cv_with_non_defaults(self,
                                label_name='label',
                                group_id='groupid',
                                features='Features_1',
                                **params):
     """Run 4-fold CV on a ToKey + LightGbmRanker pipeline and check it.

     ``group_id`` and ``label_name`` are mapped by ``ToKey`` to the columns
     'groupid2' and 'label2', which the ranker then uses for its GroupId
     and Label roles; ``features`` is the single Feature role column.
     ``params`` is accepted but not used here.
     """
     key_step = ToKey(columns={'groupid2': group_id, 'label2': label_name})
     ranker_step = LightGbmRanker() << {
         Role.GroupId: 'groupid2',
         Role.Label: 'label2',
         Role.Feature: [features]
     }

     dataset = self.data(label_name, group_id, features)
     validator = CV([key_step, ranker_step])
     # NOTE(review): groups is the literal 'groupid', not the ``group_id``
     # argument -- confirm this is intended for non-default group columns.
     fold_results = validator.fit(dataset, groups='groupid', cv=4)
     check_cv_results(validator._learner_type,
                      fold_results,
                      n_folds=4,
                      expected_metrics={})
Example #6
0
    def test_split_start_with_transforms_with_presteps(self):
        """Inspect the dry-run graph for split_start='after_transforms'.

        Fits CV with ``dry_run=True``, parses the returned JSON graph and
        asserts that the missing-value handler appears among the top-level
        nodes but not among the CrossValidator node's sub-nodes, and that
        a ModelCombiner node is present at the top level.
        """
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        pipeline_steps = [
            Indicator() << {
                'Ozone_ind': 'Ozone',
                'Solar_R_ind': 'Solar_R'
            },
            Handler(replace_with='Mean') << {
                'Solar_R': 'Solar_R',
                'Ozone': 'Ozone'
            },
            LightGbmRegressor(
                feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                         'Temp'],
                label='Wind')
        ]

        results = CV(pipeline_steps).fit(data,
                                         split_start='after_transforms',
                                         dry_run=True)
        results = json.loads(results)

        node_names = [ep['Name'] for ep in results['nodes']]
        cv_nodes = [
            ep for ep in results['nodes']
            if 'Models.CrossValidator' in ep['Name']
        ]
        # Fail with a readable message instead of an IndexError when the
        # graph holds no CrossValidator node.
        self.assertTrue(cv_nodes,
                        'no Models.CrossValidator node found in the graph')
        cv_sub_node_names = [
            ep['Name'] for ep in cv_nodes[0]['Inputs']['Nodes']
        ]

        # assertIn/assertNotIn print the container contents on failure.
        self.assertIn('Transforms.MissingValueHandler', node_names)
        self.assertNotIn('Transforms.MissingValueHandler', cv_sub_node_names)
        self.assertIn('Transforms.ModelCombiner', node_names)
Example #7
0
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

# Locate the 'infert' dataset file, read its schema with float32 numeric
# columns, and open it as a file data stream.
path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

# Two-step pipeline: a one-hot vectorizer followed by a logistic regression
# classifier that uses 'age', 'spontaneous' and 'edu' as features and
# 'induced' as the label.
# NOTE(review): presumably 'edu' is the output column produced from the
# 'education' input column -- confirm against the OneHotVectorizer docs.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['age', 'spontaneous', 'edu'],
                                 label='induced')
])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# Print the summary statistics of the metrics
print(cv_results['metrics_summary'])

# Print the metrics for all folds
print(cv_results['metrics'])

# Print the confusion matrix rows whose Fold column equals 1
cm = cv_results['confusion_matrix']
print(cm[cm.Fold == 1])

# Case 2: Using CV with split_start option

# Rebind path and schema to the 'airquality' dataset for this case.
path = get_dataset("airquality").as_filepath()
schema = DataSchema.read_schema(path)