# Imports assumed by the suites below (following the usual h2o-3 pyunit layout,
# where the shared test utilities are importable as tests.pyunit_utils);
# `pu` and `pyunit_utils` are two aliases of the same module, both used in this file.
import shutil
import tempfile
from functools import partial

import h2o
from tests import pyunit_utils
from tests import pyunit_utils as pu


def test_suite_stackedensemble_validation_frame(blending=False):

    def test_validation_metrics_are_computed_when_providing_validation_frame():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_valid = train_stacked_ensemble(ds, base_models, validation_frame=ds.valid)
        assert se_valid.model_performance(valid=True) is not None
        assert type(se_valid.model_performance(valid=True)) == h2o.model.metrics_base.H2OBinomialModelMetrics
        assert type(se_valid.auc(valid=True)) == float

    def test_a_better_model_is_produced_with_validation_frame():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_no_valid = train_stacked_ensemble(ds, base_models)
        se_valid = train_stacked_ensemble(ds, base_models, validation_frame=ds.valid)
        assert se_no_valid.model_performance(valid=True) is None
        assert se_valid.model_performance(valid=True) is not None
        se_no_valid_perf = se_no_valid.model_performance(test_data=ds.test)
        se_valid_perf = se_valid.model_performance(test_data=ds.test)
        tolerance = 1e-3  # ad hoc tolerance, as there's no guarantee perf will actually be better with a validation frame
        assert se_no_valid_perf.auc() < se_valid_perf.auc() or (se_no_valid_perf.auc() - se_valid_perf.auc()) < tolerance, \
            "Expected that a better model would be produced when passing a validation frame, but obtained: " \
            "AUC (no validation) = {}, AUC (validation frame) = {}".format(se_no_valid_perf.auc(), se_valid_perf.auc())

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_validation_metrics_are_computed_when_providing_validation_frame,
        test_a_better_model_is_produced_with_validation_frame
    ]]
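# Hedged sketch of the three helpers that every suite in this file assumes
# (signatures inferred from the call sites; the dataset path, target column and
# hyperparameters below are illustrative assumptions, not the originals).
from h2o.estimators import (H2OGradientBoostingEstimator,
                            H2ORandomForestEstimator,
                            H2OStackedEnsembleEstimator)


def prepare_data(blending=False):
    # Returns a namespace with x, y and the train/valid/test splits
    # (plus a blend split when running in blending mode).
    fr = h2o.import_file(pu.locate("smalldata/higgs/higgs_train_10k.csv"))  # assumed dataset
    target = "response"
    fr[target] = fr[target].asfactor()
    ratios = [.6, .2, .1] if blending else [.7, .2]
    splits = fr.split_frame(ratios=ratios, seed=1)
    ds = pu.ns(x=[c for c in fr.columns if c != target], y=target,
               train=splits[0], valid=splits[1], test=splits[-1])
    return ds.extend(blend=splits[2]) if blending else ds


def train_base_models(ds):
    # Simplified to a single dataset; the real helper also accepts a namespace
    # of per-model datasets (see pu.ns(gbm=..., drf=...) in the training-frame
    # suite in this file). Cross-validation with kept predictions enables stacking.
    common = dict(nfolds=3, fold_assignment="Modulo",
                  keep_cross_validation_predictions=True, seed=1)
    gbm = H2OGradientBoostingEstimator(ntrees=10, **common)
    gbm.train(x=ds.x, y=ds.y, training_frame=ds.train)
    drf = H2ORandomForestEstimator(ntrees=10, **common)
    drf.train(x=ds.x, y=ds.y, training_frame=ds.train)
    return [gbm, drf]


def train_stacked_ensemble(ds, base_models, validation_frame=None, **kwargs):
    # kwargs such as keep_levelone_frame are forwarded to the estimator;
    # in blending mode the blend split is passed as the blending frame.
    se = H2OStackedEnsembleEstimator(base_models=base_models, seed=1, **kwargs)
    se.train(x=ds.x, y=ds.y, training_frame=ds.train,
             validation_frame=validation_frame,
             blending_frame=getattr(ds, "blend", None))
    return se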
def test_suite_stackedensemble_binary_model(blending=False):

    def test_saved_binary_model_produces_same_predictions_as_original():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_model = train_stacked_ensemble(ds, base_models)
        # Predict with the ensemble in the Py client
        preds_py = se_model.predict(ds.test)
        tmp_dir = tempfile.mkdtemp()
        try:
            bin_file = h2o.save_model(se_model, tmp_dir)
            # Load the binary model and predict
            bin_model = h2o.load_model(pu.locate(bin_file))
            preds_bin = bin_model.predict(ds.test)
        finally:
            shutil.rmtree(tmp_dir)
        # Predictions from the model in the Py client and from the reloaded binary model should be the same
        pred_diff = preds_bin - preds_py
        assert pred_diff["p0"].max() < 1e-11
        assert pred_diff["p1"].max() < 1e-11
        assert pred_diff["p0"].min() > -1e-11
        assert pred_diff["p1"].min() > -1e-11

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_saved_binary_model_produces_same_predictions_as_original
    ]]
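# The four bound checks above implement an element-wise closeness test between
# two prediction frames. A hedged, reusable sketch of the same idea (the helper
# name is hypothetical, not part of the shared test utilities):
def assert_predictions_close(left, right, cols=("p0", "p1"), tol=1e-11):
    diff = left - right  # element-wise difference of the two H2OFrames
    for col in cols:
        assert diff[col].max() < tol and diff[col].min() > -tol, \
            "column {} differs by more than {}".format(col, tol)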
def test_suite_stackedensemble_levelone_frame(blending=False):

    def test_levelone_frame_not_accessible_with__keep_levelone_frame__False():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        assert se.levelone_frame_id() is None, \
            "Level one frame should not be available when keep_levelone_frame is False."

    def test_levelone_frame_accessible_with__keep_levelone_frame__True():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
        assert se.levelone_frame_id() is not None, \
            "Level one frame should be available when keep_levelone_frame is True."

    def test_levelone_frame_has_expected_dimensions():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
        level_one_frame = h2o.get_frame(se.levelone_frame_id()["name"])
        se_training_frame = ds.blend if blending else ds.train
        # count_classes(probabilities) * count_models + 1 (target);
        # e.g. with a binomial target and 2 base models: 2 * 2 + 1 = 5 columns
        num_col_level_one_frame = (se_training_frame[ds.y].unique().nrow) * len(models) + 1
        assert level_one_frame.ncols == num_col_level_one_frame, \
            "The number of columns in a level one frame should be numClasses * numBaseModels + 1."
        assert level_one_frame.nrows == se_training_frame.nrows, \
            "The number of rows in the level one frame should match the training frame's number of rows."

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_levelone_frame_not_accessible_with__keep_levelone_frame__False,
        test_levelone_frame_accessible_with__keep_levelone_frame__True,
        test_levelone_frame_has_expected_dimensions
    ]]
def test_suite_stackedensemble_training_frame(blending=False):

    def test_base_models_can_use_different_x():
        """Test that passing in base models that use different subsets of
        the features works (different x, but same training_frame).
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]),
                         drf=ds.extend(x=ds.x[13:20]))
        bm = train_base_models(datasets)
        se = train_stacked_ensemble(ds, bm)
        se_nox = train_stacked_ensemble(ds.extend(x=None), bm)
        assert se.auc() > 0
        assert se.auc() == se_nox.auc()

    def test_base_models_can_use_different_compatible_training_frames():
        """Test that passing in base models that use different subsets of
        the features works (different training_frame).
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=None, train=ds.train[list(range(1, 11))].cbind(ds.train[ds.y])),
                         drf=ds.extend(x=None, train=ds.train[list(range(13, 20))].cbind(ds.train[ds.y])))
        bm = train_base_models(datasets)
        se = train_stacked_ensemble(ds, bm)
        assert se.auc() > 0

    def test_se_fails_when_base_models_use_incompatible_training_frames():
        """Test that SE fails when passing in base models that were trained
        with frames of different sizes.
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=None),
                         drf=ds.extend(x=None, train=ds.train[0:ds.train.nrows // 2, :]))
        bm = train_base_models(datasets)
        try:
            se = train_stacked_ensemble(ds, bm)
            assert blending, "Stacked Ensembles of models with different training frame sizes should fail in non-blending mode"
            se.predict(ds.train)
        except Exception as e:
            assert not blending, "No Exception should have been raised in blending mode"
            assert "Base models are inconsistent: they use different size (number of rows) training frames" in str(e), \
                "wrong error message: {}".format(str(e))

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_base_models_can_use_different_x,
        test_base_models_can_use_different_compatible_training_frames,
        test_se_fails_when_base_models_use_incompatible_training_frames
    ]]
def test_suite_stackedensemble_base_models(blending=False):

    def test_base_models_can_be_passed_as_objects_or_as_ids():
        """This test checks the following:
        1) That passing in a list of models for base_models works.
        2) That passing in a list of models and model_ids results in the same stacked ensemble.
        """
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se1 = train_stacked_ensemble(ds, [m.model_id for m in base_models])
        se2 = train_stacked_ensemble(ds, base_models)
        # Eval train AUC to assess equivalence
        assert se1.auc() == se2.auc()

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_base_models_can_be_passed_as_objects_or_as_ids
    ]]
def make_test(test, classifier):
    bound_test = partial(test, classifier)
    bound_test.__name__ = test.__name__
    pyunit_utils.tag_test(bound_test, classifier.__name__)
    return bound_test
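# Hedged usage sketch for make_test: it binds a parametrized check to one
# classifier class via functools.partial and tags the bound test with the
# class name, so a shared check can be fanned out over several estimators.
# The check body and the class list here are illustrative assumptions.
def _check_default_construction(classifier):
    assert classifier() is not None  # H2O estimators are default-constructible

classifier_tests = [make_test(_check_default_construction, clz)
                    for clz in (H2OGradientBoostingEstimator,
                                H2ORandomForestEstimator)]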
def test_suite_stackedensemble_gaussian(blending=False):

    def test_predict_on_se_model():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        for _ in range(2):  # repeat predict to verify consistency
            pred = se.predict(test_data=ds.test)
            assert pred.nrow == ds.test.nrow, "expected " + str(pred.nrow) + " to be equal to " + str(ds.test.nrow)
            assert pred.ncol == 1, "expected " + str(pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

    def test_se_performance_is_better_than_individual_models():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)

        def compute_perf(model):
            perf = pu.ns(
                train=model.model_performance(train=True),
                test=model.model_performance(test_data=ds.test)
            )
            print("{} training performance: ".format(model.model_id))
            print(perf.train)
            print("{} test performance: ".format(model.model_id))
            print(perf.test)
            return perf

        base_perfs = {}
        for model in base_models:
            base_perfs[model.model_id] = compute_perf(model)

        se = train_stacked_ensemble(ds, base_models)
        perf_se = compute_perf(se)

        # Check that stack perf is better (smaller) than the best (smallest) base learner perf.
        # assert_warn (defined elsewhere in this file) only warns instead of failing,
        # as the improvement is expected but not guaranteed.
        # Training RMSE
        baselearner_best_rmse_train = min([perf.train.rmse() for perf in base_perfs.values()])
        stack_rmse_train = perf_se.train.rmse()
        print("Best Base-learner Training RMSE: {}".format(baselearner_best_rmse_train))
        print("Ensemble Training RMSE: {}".format(stack_rmse_train))
        assert_warn(stack_rmse_train < baselearner_best_rmse_train,
                    "expected SE training RMSE would be smaller than the best of base learner training RMSE, but obtained: "
                    "RMSE (SE) = {}, RMSE (best base learner) = {}".format(stack_rmse_train, baselearner_best_rmse_train))

        # Test RMSE
        baselearner_best_rmse_test = min([perf.test.rmse() for perf in base_perfs.values()])
        stack_rmse_test = perf_se.test.rmse()
        print("Best Base-learner Test RMSE: {}".format(baselearner_best_rmse_test))
        print("Ensemble Test RMSE: {}".format(stack_rmse_test))
        assert_warn(stack_rmse_test < baselearner_best_rmse_test,
                    "expected SE test RMSE would be smaller than the best of base learner test RMSE, but obtained: "
                    "RMSE (SE) = {}, RMSE (best base learner) = {}".format(stack_rmse_test, baselearner_best_rmse_test))

    def test_validation_frame_produces_same_metric_as_perf_test():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, validation_frame=ds.test)
        se_perf = se.model_performance(test_data=ds.test)
        se_perf_validation_frame = se.model_performance(valid=True)
        # since the metrics object is not exactly the same, we can just test that RMSE is the same
        assert se_perf.rmse() == se_perf_validation_frame.rmse(), \
            "expected SE test RMSE to be the same as SE validation frame RMSE, but obtained: " \
            "RMSE (perf on test) = {}, RMSE (test passed as validation frame) = {}".format(se_perf.rmse(), se_perf_validation_frame.rmse())

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_predict_on_se_model,
        test_se_performance_is_better_than_individual_models,
        test_validation_frame_produces_same_metric_as_perf_test
    ]]
def test_suite_stackedensemble_binomial(blending=False):

    def test_predict_on_se_model():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        pred = se.predict(test_data=ds.test)
        assert pred.nrow == ds.test.nrow, "expected " + str(pred.nrow) + " to be equal to " + str(ds.test.nrow)
        assert pred.ncol == 3, "expected " + str(pred.ncol) + " to be equal to 3 but it was equal to " + str(pred.ncol)

    def test_se_performance_is_better_than_individual_models():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)

        def compute_perf(model):
            perf = pu.ns(
                train=model.model_performance(train=True),
                test=model.model_performance(test_data=ds.test)
            )
            print("{} training performance: ".format(model.model_id))
            print(perf.train)
            print("{} test performance: ".format(model.model_id))
            print(perf.test)
            return perf

        base_perfs = {}
        for model in base_models:
            base_perfs[model.model_id] = compute_perf(model)

        se = train_stacked_ensemble(ds, base_models)
        perf_se = compute_perf(se)

        # Check that stack perf is better (bigger) than the best (biggest) base learner perf:
        # Training AUC
        baselearner_best_auc_train = max([perf.train.auc() for perf in base_perfs.values()])
        stack_auc_train = perf_se.train.auc()
        print("Best Base-learner Training AUC: {}".format(baselearner_best_auc_train))
        print("Ensemble Training AUC: {}".format(stack_auc_train))
        assert stack_auc_train > baselearner_best_auc_train, \
            "expected SE training AUC would be greater than the best of base learner training AUC, but obtained: " \
            "AUC (SE) = {}, AUC (best base learner) = {}".format(stack_auc_train, baselearner_best_auc_train)

        # Test AUC
        baselearner_best_auc_test = max([perf.test.auc() for perf in base_perfs.values()])
        stack_auc_test = perf_se.test.auc()
        print("Best Base-learner Test AUC: {}".format(baselearner_best_auc_test))
        print("Ensemble Test AUC: {}".format(stack_auc_test))
        assert stack_auc_test > baselearner_best_auc_test, \
            "expected SE test AUC would be greater than the best of base learner test AUC, but obtained: " \
            "AUC (SE) = {}, AUC (best base learner) = {}".format(stack_auc_test, baselearner_best_auc_test)

    def test_validation_frame_produces_same_metric_as_perf_test():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, validation_frame=ds.test)
        se_perf = se.model_performance(test_data=ds.test)
        # since the metrics object is not exactly the same, we can just test that AUC is the same
        se_perf_validation_frame = se.model_performance(valid=True)
        assert se_perf.auc() == se_perf_validation_frame.auc(), \
            "expected SE test AUC to be the same as SE validation frame AUC, but obtained: " \
            "AUC (perf on test) = {}, AUC (test passed as validation frame) = {}".format(se_perf.auc(), se_perf_validation_frame.auc())

    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_predict_on_se_model,
        test_se_performance_is_better_than_individual_models,
        test_validation_frame_produces_same_metric_as_perf_test
    ]]
# Companion variant of make_test that parametrizes a check over transformer
# classes instead of classifiers.
def make_test(test, transformer):
    bound_test = partial(test, transformer)
    bound_test.__name__ = test.__name__
    pyunit_utils.tag_test(bound_test, transformer.__name__)
    return bound_test
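# Hedged driver sketch: the suites above are typically materialized once per
# mode (stacking and blending) and handed to the shared runner. pu.run_tests is
# assumed to exist alongside pu.tag_test in the pyunit utilities used here.
pu.run_tests(
    test_suite_stackedensemble_validation_frame()
    + test_suite_stackedensemble_validation_frame(blending=True)
    + test_suite_stackedensemble_binomial()
    + test_suite_stackedensemble_binomial(blending=True)
)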