def check_cv(pipeline, X, y=None, n_folds=2, groups=None,
             split_start='before_transforms', expected_metrics=None,
             **params):
    """Run cross-validation over *pipeline* and validate the results.

    Parameters
    ----------
    pipeline : list of nimbusml steps (or Pipeline) handed to ``CV``.
    X, y : training data (``y`` may be ``None`` for data streams that
        carry the label column themselves).
    n_folds : number of CV folds (default 2).
    groups : optional group column for grouped splits.
    split_start : where the fold split happens in the graph; the special
        value ``'try_all'`` exercises every valid position, including
        negative indices.
    expected_metrics : dict of metric name -> expected value forwarded to
        ``check_cv_results``. Defaults to an empty dict; a ``None``
        sentinel is used instead of a mutable ``{}`` default so the dict
        is not shared across calls.
    **params : extra keyword arguments forwarded to ``CV.fit`` (only in
        the non-'try_all' branch, which keeps the 'try_all' sweep free of
        conflicting options such as ``graph_id``).

    Returns
    -------
    The results of the last ``CV.fit`` call performed.
    """
    if expected_metrics is None:
        expected_metrics = {}
    cv = CV(pipeline)
    if split_start == 'try_all':
        # Sweep every meaningful split position: the two symbolic values
        # plus every positive and negative node index of the pipeline.
        len_pipeline = len(pipeline.nodes)
        values_to_test = ['after_transforms', 'before_transforms']
        values_to_test.extend(range(len_pipeline))
        values_to_test.extend(range(-len_pipeline, 0))
        for s in values_to_test:
            # Distinct graph_id per position so graphs don't collide.
            graph_id = '_split_start={}'.format(str(s))
            results = cv.fit(X, y, cv=n_folds, groups=groups,
                             split_start=s, graph_id=graph_id)
            check_cv_results(cv._learner_type, results, n_folds,
                             expected_metrics)
    else:
        results = cv.fit(X, y, cv=n_folds, groups=groups,
                         split_start=split_start, **params)
        check_cv_results(cv._learner_type, results, n_folds,
                         expected_metrics)
    return results
def test_unseen_classes(self):
    """CV must raise a Warning when folds miss some of the classes."""
    # Build a dataset where the last five rows introduce five extra
    # classes, so a 3-fold split cannot show every class to every fold.
    X = random_df()
    y = random_series()
    y[95:] = range(5)
    msg = 'CV didn\'t raise Warning exception b/c of minority class issue'
    with self.assertRaises(Warning, msg=msg):
        CV([FastLinearClassifier()]).fit(X, y, cv=3)
def test_ensemble_supports_cv_with_user_defined_transforms(self):
    """A voting ensemble run through CV should beat the single LightGBM
    baseline on average L2, for both split_start modes."""
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}

    # All three regressors consume the same feature set and label.
    feature_cols = ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp']
    lgbm_args = {'feature': list(feature_cols), 'label': 'Wind',
                 'normalize': 'Yes'}
    ols_args = {'feature': list(feature_cols), 'label': 'Wind',
                'normalize': 'Yes'}
    ogd_args = {'feature': list(feature_cols), 'label': 'Wind',
                'shuffle': False, 'normalize': 'Yes'}

    for split_start in ['before_transforms', 'after_transforms']:
        # Baseline: a single LightGBM regressor behind the transforms.
        baseline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            LightGbmRegressor(**lgbm_args)
        ]
        baseline_results = CV(baseline_steps).fit(
            data, split_start=split_start)
        l2_avg_lgbm = baseline_results['metrics_summary'].loc[
            'Average', 'L2(avg)']

        # Ensemble: three regressors averaged by a VotingRegressor.
        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)
        # Re-open the stream for the second CV run.
        data = FileDataStream(path, schema)
        ensemble_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ]
        ensemble_results = CV(ensemble_steps).fit(
            data, split_start=split_start)
        l2_avg_ensemble = ensemble_results['metrics_summary'].loc[
            'Average', 'L2(avg)']

        self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
def test_unsupported_pipelines(self):
    """Constructing CV around a pipeline ending in a pure transform
    (no trainable learner) must raise ValueError."""
    for step in [OneHotVectorizer()]:
        pipeline = Pipeline([step])
        msg = ('CV doesn\'t support pipeline ending with {}, but '
               'didn\'t raise exception'.format(
                   pipeline._last_node_type()))
        with self.assertRaises(ValueError, msg=msg):
            CV(pipeline)
def check_cv_with_non_defaults(self, label_name='label',
                               group_id='groupid',
                               features='Features_1', **params):
    """Run a 4-fold grouped CV over a LightGbmRanker with non-default
    column names and validate the results.

    Parameters
    ----------
    label_name : name of the label column in the generated data.
    group_id : name of the group-id column in the generated data.
    features : name of the feature column.
    **params : accepted for signature compatibility; not forwarded
        (unchanged from the original behavior).
    """
    steps = [
        # Re-key the group and label columns under fixed internal names
        # so the ranker roles below are independent of the inputs.
        ToKey(columns={
            'groupid2': group_id,
            'label2': label_name
        }),
        LightGbmRanker() << {
            Role.GroupId: 'groupid2',
            Role.Label: 'label2',
            Role.Feature: [features]
        }
    ]
    data = self.data(label_name, group_id, features)
    cv = CV(steps)
    # Split on the caller-supplied group column. The original
    # hard-coded groups='groupid', which silently broke any call
    # passing a non-default group_id.
    results = cv.fit(data, groups=group_id, cv=4)
    check_cv_results(cv._learner_type, results, n_folds=4,
                     expected_metrics={})
def test_split_start_with_transforms_with_presteps(self):
    """With split_start='after_transforms', the missing-value handler
    must appear in the outer graph, not inside the CV node's folds."""
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    pipeline_steps = [
        Indicator() << {'Ozone_ind': 'Ozone',
                        'Solar_R_ind': 'Solar_R'},
        Handler(replace_with='Mean') << {'Solar_R': 'Solar_R',
                                         'Ozone': 'Ozone'},
        LightGbmRegressor(
            feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                     'Temp'],
            label='Wind')
    ]

    # dry_run=True returns the serialized entry-point graph as JSON
    # instead of actually training.
    graph_json = CV(pipeline_steps).fit(
        data, split_start='after_transforms', dry_run=True)
    graph = json.loads(graph_json)

    node_names = [node['Name'] for node in graph['nodes']]
    cv_node = [node for node in graph['nodes']
               if 'Models.CrossValidator' in node['Name']][0]
    cv_sub_node_names = [node['Name']
                         for node in cv_node['Inputs']['Nodes']]

    # The handler ran at the outer level...
    self.assertTrue('Transforms.MissingValueHandler' in node_names)
    # ...and was NOT replicated inside the cross-validation folds.
    self.assertTrue(
        'Transforms.MissingValueHandler' not in cv_sub_node_names)
    self.assertTrue('Transforms.ModelCombiner' in node_names)
from nimbusml.preprocessing.missing_values import Indicator, Handler # Case 1: Default usage of CV path = get_dataset('infert').as_filepath() schema = DataSchema.read_schema(path, numeric_dtype=np.float32) data = FileDataStream.read_csv(path, schema=schema) pipeline = Pipeline([ OneHotVectorizer(columns={'edu': 'education'}), LogisticRegressionClassifier(feature=['age', 'spontaneous', 'edu'], label='induced') ]) # Do 3-fold cross-validation cv_results = CV(pipeline).fit(data, cv=3) # print summary statistic of metrics print(cv_results['metrics_summary']) # print metrics for all folds print(cv_results['metrics']) # print confusion matrix for fold 1 cm = cv_results['confusion_matrix'] print(cm[cm.Fold == 1]) # Case 2: Using CV with split_start option path = get_dataset("airquality").as_filepath() schema = DataSchema.read_schema(path)