Code Example #1
def test_suite_stackedensemble_validation_frame(blending=False):
    
    def test_validation_metrics_are_computed_when_providing_validation_frame():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_valid = train_stacked_ensemble(ds, base_models, validation_frame=ds.valid)
        
        assert se_valid.model_performance(valid=True) is not None
        assert type(se_valid.model_performance(valid=True)) == h2o.model.metrics_base.H2OBinomialModelMetrics
        assert type(se_valid.auc(valid=True)) == float
                    
        
    def test_a_better_model_is_produced_with_validation_frame():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_no_valid = train_stacked_ensemble(ds, base_models)
        se_valid = train_stacked_ensemble(ds, base_models, validation_frame=ds.valid)

        assert se_no_valid.model_performance(valid=True) is None
        assert se_valid.model_performance(valid=True) is not None
        
        se_no_valid_perf = se_no_valid.model_performance(test_data=ds.test)
        se_valid_perf = se_valid.model_performance(test_data=ds.test)
        tolerance = 1e-3  # ad hoc tolerance as there's no guarantee perf will actually be better with validation frame 
        assert se_no_valid_perf.auc() < se_valid_perf.auc() or (se_no_valid_perf.auc() - se_valid_perf.auc()) < tolerance, \
            "Expected that a better model would be produced when passing a validation frame, bot obtained: " \
            "AUC (no validation) = {}, AUC (validation frame) = {}".format(se_no_valid_perf.auc(), se_valid_perf.auc())
        
    
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_validation_metrics_are_computed_when_providing_validation_frame,
        test_a_better_model_is_produced_with_validation_frame
    ]]
Code Example #2
def test_suite_stackedensemble_binary_model(blending=False):

    def test_saved_binary_model_produces_same_predictions_as_original():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_model = train_stacked_ensemble(ds, base_models)
        
        #Predict in ensemble in Py client
        preds_py = se_model.predict(ds.test)
        
        tmp_dir = tempfile.mkdtemp()
        try:
            bin_file = h2o.save_model(se_model, tmp_dir)
            #Load binary model and predict
            bin_model = h2o.load_model(pu.locate(bin_file))
            preds_bin = bin_model.predict(ds.test)
        finally:
            shutil.rmtree(tmp_dir)

        #Predictions from model in Py and binary model should be the same
        pred_diff = preds_bin - preds_py
        assert pred_diff["p0"].max() < 1e-11
        assert pred_diff["p1"].max() < 1e-11
        assert pred_diff["p0"].min() > -1e-11
        assert pred_diff["p1"].min() > -1e-11
    
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_saved_binary_model_produces_same_predictions_as_original
    ]]
Code Example #3
def test_suite_stackedensemble_levelone_frame(blending=False):

    def test_levelone_frame_not_accessible_with__keep_levelone_frame__False():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        assert se.levelone_frame_id() is None, \
            "Level one frame should not be available when keep_levelone_frame is False."
    
    def test_levelone_frame_accessible_with__keep_levelone_frame__True():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
        assert se.levelone_frame_id() is not None, \
            "Level one frame should be available when keep_levelone_frame is True."
    
    def test_levelone_frame_has_expected_dimensions():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
        level_one_frame = h2o.get_frame(se.levelone_frame_id()["name"])
        
        se_training_frame = ds.blend if blending else ds.train
        
        num_col_level_one_frame = (se_training_frame[ds.y].unique().nrow) * len(models) + 1  # count_classes(probabilities) * count_models + 1 (target)
        assert level_one_frame.ncols == num_col_level_one_frame, \
            "The number of columns in a level one frame should be numClasses * numBaseModels + 1."
        assert level_one_frame.nrows == se_training_frame.nrows, \
            "The number of rows in the level one frame should match train number of rows. "
    
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_levelone_frame_not_accessible_with__keep_levelone_frame__False,
        test_levelone_frame_accessible_with__keep_levelone_frame__True,
        test_levelone_frame_has_expected_dimensions
    ]]
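For concreteness, the dimension check in test_levelone_frame_has_expected_dimensions follows the arithmetic in its inline comment: one probability column per class per base model, plus the target column. The snippet below is only a hypothetical instance of that arithmetic (the count of 3 base models is assumed for illustration; train_base_models is not shown in these excerpts):

# Hypothetical instance of the comment above: a binomial target (2 classes)
# stacked over 3 base models yields 2 * 3 + 1 = 7 level-one columns
# (one probability column per class per base model, plus the target).
num_classes, num_base_models = 2, 3
expected_ncols = num_classes * num_base_models + 1
assert expected_ncols == 7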
Code Example #4
def test_suite_stackedensemble_binary_model(blending=False):
    def test_saved_binary_model_produces_same_predictions_as_original():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_model = train_stacked_ensemble(ds, base_models)

        #Predict in ensemble in Py client
        preds_py = se_model.predict(ds.test)

        tmp_dir = tempfile.mkdtemp()
        try:
            bin_file = h2o.save_model(se_model, tmp_dir)
            #Load binary model and predict
            bin_model = h2o.load_model(pu.locate(bin_file))
            preds_bin = bin_model.predict(ds.test)
        finally:
            shutil.rmtree(tmp_dir)

        #Predictions from model in Py and binary model should be the same
        pred_diff = preds_bin - preds_py
        assert pred_diff["p0"].max() < 1e-11
        assert pred_diff["p1"].max() < 1e-11
        assert pred_diff["p0"].min() > -1e-11
        assert pred_diff["p1"].min() > -1e-11

    return [
        pu.tag_test(test, 'blending' if blending else None) for test in
        [test_saved_binary_model_produces_same_predictions_as_original]
    ]
Code Example #5
def test_suite_stackedensemble_training_frame(blending=False):
    def test_base_models_can_use_different_x():
        """
        test that passing in base models that use different subsets of 
        the features works. (different x, but same training_frame)
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]),
                         drf=ds.extend(x=ds.x[13:20]))

        bm = train_base_models(datasets)
        se = train_stacked_ensemble(ds, bm)
        se_nox = train_stacked_ensemble(ds.extend(x=None), bm)
        assert se.auc() > 0
        assert se.auc() == se_nox.auc()

    def test_base_models_can_use_different_compatible_training_frames():
        """
        test that passing in base models that use different subsets of 
        the features works. (different training_frame) 
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=None,
                                       train=ds.train[list(range(
                                           1, 11))].cbind(ds.train[ds.y])),
                         drf=ds.extend(x=None,
                                       train=ds.train[list(range(
                                           13, 20))].cbind(ds.train[ds.y])))
        bm = train_base_models(datasets)
        se = train_stacked_ensemble(ds, bm)
        assert se.auc() > 0

    def test_se_fails_when_base_models_use_incompatible_training_frames():
        """
        test that SE fails when passing in base models that were trained with frames of different size 
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=None),
                         drf=ds.extend(x=None,
                                       train=ds.train[0:ds.train.nrows //
                                                      2, :]))
        bm = train_base_models(datasets)
        try:
            se = train_stacked_ensemble(ds, bm)
            assert blending, "Stacked Ensembles of models with different training frame sizes should fail in non-blending mode"
            se.predict(ds.train)
        except Exception as e:
            assert not blending, "No Exception should have been raised in blending mode"
            assert "Base models are inconsistent: they use different size (number of rows) training frames" in str(
                e), "wrong error message: {}".format(str(e))
            # raise e

    return [
        pu.tag_test(test, 'blending' if blending else None) for test in [
            test_base_models_can_use_different_x,
            test_base_models_can_use_different_compatible_training_frames,
            test_se_fails_when_base_models_use_incompatible_training_frames
        ]
    ]
Code Example #6
def test_suite_stackedensemble_training_frame(blending=False):
    
    def test_base_models_can_use_different_x():
        """
        test that passing in base models that use different subsets of 
        the features works. (different x, but same training_frame)
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]), 
                         drf=ds.extend(x=ds.x[13:20]))
        
        bm = train_base_models(datasets)
        se = train_stacked_ensemble(ds, bm)
        se_nox = train_stacked_ensemble(ds.extend(x=None), bm)
        assert se.auc() > 0
        assert se.auc() == se_nox.auc()
        
    
    def test_base_models_can_use_different_compatible_training_frames():
        """
        test that passing in base models that use different subsets of 
        the features works. (different training_frame) 
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=None, 
                                       train=ds.train[list(range(1, 11))].cbind(ds.train[ds.y])), 
                         drf=ds.extend(x=None,
                                       train=ds.train[list(range(13, 20))].cbind(ds.train[ds.y])))
        bm = train_base_models(datasets)
        se = train_stacked_ensemble(ds, bm)
        assert se.auc() > 0
        
    def test_se_fails_when_base_models_use_incompatible_training_frames():
        """
        test that SE fails when passing in base models that were trained with frames of different size 
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=None),
                         drf=ds.extend(x=None, train=ds.train[0:ds.train.nrows//2,:]))
        bm = train_base_models(datasets)
        try:
            train_stacked_ensemble(ds, bm)
            assert False, "Stacked Ensembles of models with different training frame sizes should fail"
        except Exception as e:
            assert "Base models are inconsistent: they use different size (number of rows) training frames" in str(e), "wrong error message: {}".format(str(e))
            # raise e
    
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_base_models_can_use_different_x,
        test_base_models_can_use_different_compatible_training_frames,
        test_se_fails_when_base_models_use_incompatible_training_frames
    ]]
Code Example #7
def test_suite_stackedensemble_base_models(blending=False):
    def test_base_models_can_be_passed_as_objects_or_as_ids():
        """This test checks the following:
        1) That passing in a list of models for base_models works.
        2) That passing in a list of models and model_ids results in the same stacked ensemble.
        """
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se1 = train_stacked_ensemble(ds, [m.model_id for m in base_models])
        se2 = train_stacked_ensemble(ds, base_models)

        # Eval train AUC to assess equivalence
        assert se1.auc() == se2.auc()

    return [
        pu.tag_test(test, 'blending' if blending else None)
        for test in [test_base_models_can_be_passed_as_objects_or_as_ids]
    ]
Code Example #8
def test_suite_stackedensemble_base_models(blending=False):

    def test_base_models_can_be_passed_as_objects_or_as_ids():
        """This test checks the following:
        1) That passing in a list of models for base_models works.
        2) That passing in a list of models and model_ids results in the same stacked ensemble.
        """
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se1 = train_stacked_ensemble(ds, [m.model_id for m in base_models])
        se2 = train_stacked_ensemble(ds, base_models)

        # Eval train AUC to assess equivalence
        assert se1.auc() == se2.auc()
        
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_base_models_can_be_passed_as_objects_or_as_ids
    ]]
Code Example #9
def test_suite_stackedensemble_levelone_frame(blending=False):
    def test_levelone_frame_not_accessible_with__keep_levelone_frame__False():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        assert se.levelone_frame_id() is None, \
            "Level one frame should not be available when keep_levelone_frame is False."

    def test_levelone_frame_accessible_with__keep_levelone_frame__True():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
        assert se.levelone_frame_id() is not None, \
            "Level one frame should be available when keep_levelone_frame is True."

    def test_levelone_frame_has_expected_dimensions():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
        level_one_frame = h2o.get_frame(se.levelone_frame_id()["name"])

        se_training_frame = ds.blend if blending else ds.train

        num_col_level_one_frame = (
            se_training_frame[ds.y].unique().nrow) * len(
                models
            ) + 1  # count_classes(probabilities) * count_models + 1 (target)
        assert level_one_frame.ncols == num_col_level_one_frame, \
            "The number of columns in a level one frame should be numClasses * numBaseModels + 1."
        assert level_one_frame.nrows == se_training_frame.nrows, \
            "The number of rows in the level one frame should match train number of rows. "

    return [
        pu.tag_test(test, 'blending' if blending else None) for test in [
            test_levelone_frame_not_accessible_with__keep_levelone_frame__False,
            test_levelone_frame_accessible_with__keep_levelone_frame__True,
            test_levelone_frame_has_expected_dimensions
        ]
    ]
Code Example #10
def test_suite_stackedensemble_validation_frame(blending=False):
    def test_validation_metrics_are_computed_when_providing_validation_frame():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_valid = train_stacked_ensemble(ds,
                                          base_models,
                                          validation_frame=ds.valid)

        assert se_valid.model_performance(valid=True) is not None
        assert type(se_valid.model_performance(
            valid=True)) == h2o.model.metrics_base.H2OBinomialModelMetrics
        assert type(se_valid.auc(valid=True)) == float

    def test_a_better_model_is_produced_with_validation_frame():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_no_valid = train_stacked_ensemble(ds, base_models)
        se_valid = train_stacked_ensemble(ds,
                                          base_models,
                                          validation_frame=ds.valid)

        assert se_no_valid.model_performance(valid=True) is None
        assert se_valid.model_performance(valid=True) is not None

        se_no_valid_perf = se_no_valid.model_performance(test_data=ds.test)
        se_valid_perf = se_valid.model_performance(test_data=ds.test)
        tolerance = 1e-3  # ad hoc tolerance as there's no guarantee perf will actually be better with validation frame
        assert se_no_valid_perf.auc() < se_valid_perf.auc() or (se_no_valid_perf.auc() - se_valid_perf.auc()) < tolerance, \
            "Expected that a better model would be produced when passing a validation frame, bot obtained: " \
            "AUC (no validation) = {}, AUC (validation frame) = {}".format(se_no_valid_perf.auc(), se_valid_perf.auc())

    return [
        pu.tag_test(test, 'blending' if blending else None) for test in [
            test_validation_metrics_are_computed_when_providing_validation_frame,
            test_a_better_model_is_produced_with_validation_frame
        ]
    ]
Code Example #11
from functools import partial

def make_test(test, classifier):
    bound_test = partial(test, classifier)
    bound_test.__name__ = test.__name__
    pyunit_utils.tag_test(bound_test, classifier.__name__)
    return bound_test
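Code Example #11 defines only the binding helper, not a call site. The sketch below is a hypothetical usage: the parameterized test, its assertion, and the choice of H2OGradientBoostingEstimator are invented for illustration, and it assumes pyunit_utils is importable in the module that defines make_test (as the snippet implies).

from h2o.estimators import H2OGradientBoostingEstimator  # any estimator class could stand in here

# Hypothetical parameterized test: the first argument is supplied by make_test.
def test_classifier_has_expected_defaults(classifier):
    model = classifier()
    assert model.ntrees is not None  # illustrative check only

# Bind the test to a concrete estimator class; the bound callable keeps the
# test's name and is tagged with "H2OGradientBoostingEstimator".
bound_test = make_test(test_classifier_has_expected_defaults, H2OGradientBoostingEstimator)
bound_test()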
Code Example #12
def test_suite_stackedensemble_gaussian(blending=False):
    
    def test_predict_on_se_model():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        
        for i in range(2): # repeat predict to verify consistency
            pred = se.predict(test_data=ds.test)
            assert pred.nrow == ds.test.nrow, "expected " + str(pred.nrow) + " to be equal to " + str(ds.test.nrow)
            assert pred.ncol == 1, "expected " + str(pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

        
    def test_se_performance_is_better_than_individual_models():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)

        def compute_perf(model):
            perf = pu.ns(
                train=model.model_performance(train=True),
                test=model.model_performance(test_data=ds.test)
            )
            print("{} training performance: ".format(model.model_id))
            print(perf.train)
            print("{} test performance: ".format(model.model_id))
            print(perf.test)
            return perf

        base_perfs = {}
        for model in base_models:
            base_perfs[model.model_id] = compute_perf(model)

        
        se = train_stacked_ensemble(ds, base_models)
        perf_se = compute_perf(se)


        # Check that stack perf is better (smaller) than the best (smallest) base learner perf:
        # Training RMSE for each base learner
        baselearner_best_rmse_train = min([perf.train.rmse() for perf in base_perfs.values()])
        stack_rmse_train = perf_se.train.rmse()
        print("Best Base-learner Training RMSE:  {}".format(baselearner_best_rmse_train))
        print("Ensemble Training RMSE:  {}".format(stack_rmse_train))
        assert_warn(stack_rmse_train < baselearner_best_rmse_train,
            "expected SE training RMSE would be smaller than the best of base learner training RMSE, but obtained: " \
            "RMSE (SE) = {}, RMSE (best base learner) = {}".format(stack_rmse_train, baselearner_best_rmse_train))

        # Test RMSE for each base learner
        baselearner_best_rmse_test = min([perf.test.rmse() for perf in base_perfs.values()])
        stack_rmse_test = perf_se.test.rmse()
        print("Best Base-learner Test RMSE:  {}".format(baselearner_best_rmse_test))
        print("Ensemble Test RMSE:  {}".format(stack_rmse_test))
        assert_warn(stack_rmse_test < baselearner_best_rmse_test,
            "expected SE test RMSE would be smaller than the best of base learner test RMSE, but obtained: " \
            "RMSE (SE) = {}, RMSE (best base learner) = {}".format(stack_rmse_test, baselearner_best_rmse_test))
        
        
    def test_validation_frame_produces_same_metric_as_perf_test():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, validation_frame=ds.test)
        se_perf = se.model_performance(test_data=ds.test)
        se_perf_validation_frame = se.model_performance(valid=True)
        # since the metrics object is not exactly the same, we can just test that RMSE is the same
        assert se_perf.rmse() == se_perf_validation_frame.rmse(), \
            "expected SE test RMSE to be the same as SE validation frame RMSE, but obtained: " \
            "RMSE (perf on test) = {}, RMSE (test passed as validation frame) = {}".format(se_perf.rmse(), se_perf_validation_frame.rmse())

    
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_predict_on_se_model,
        test_se_performance_is_better_than_individual_models,
        test_validation_frame_produces_same_metric_as_perf_test
    ]]
Code Example #13
def test_suite_stackedensemble_binomial(blending=False):
    def test_predict_on_se_model():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        pred = se.predict(test_data=ds.test)
        assert pred.nrow == ds.test.nrow, "expected " + str(
            pred.nrow) + " to be equal to " + str(ds.test.nrow)
        assert pred.ncol == 3, "expected " + str(
            pred.ncol) + " to be equal to 3 but it was equal to " + str(
                pred.ncol)

    def test_se_performance_is_better_than_individual_models():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)

        def compute_perf(model):
            perf = pu.ns(train=model.model_performance(train=True),
                         test=model.model_performance(test_data=ds.test))
            print("{} training performance: ".format(model.model_id))
            print(perf.train)
            print("{} test performance: ".format(model.model_id))
            print(perf.test)
            return perf

        base_perfs = {}
        for model in base_models:
            base_perfs[model.model_id] = compute_perf(model)

        se = train_stacked_ensemble(ds, base_models)
        perf_se = compute_perf(se)

        # Check that stack perf is better (bigger) than the best (biggest) base learner perf:
        # Training AUC
        baselearner_best_auc_train = max(
            [perf.train.auc() for perf in base_perfs.values()])
        stack_auc_train = perf_se.train.auc()
        print("Best Base-learner Training AUC:  {}".format(
            baselearner_best_auc_train))
        print("Ensemble Training AUC:  {}".format(stack_auc_train))
        assert stack_auc_train > baselearner_best_auc_train, \
            "expected SE training AUC would be greater than the best of base learner training AUC, but obtained: " \
            "AUC (SE) = {}, AUC (best base learner) = {}".format(stack_auc_train, baselearner_best_auc_train)

        # Test AUC
        baselearner_best_auc_test = max(
            [perf.test.auc() for perf in base_perfs.values()])
        stack_auc_test = perf_se.test.auc()
        print("Best Base-learner Test AUC:  {}".format(
            baselearner_best_auc_test))
        print("Ensemble Test AUC:  {}".format(stack_auc_test))
        assert stack_auc_test > baselearner_best_auc_test, \
            "expected SE test AUC would be greater than the best of base learner test AUC, but obtained: " \
            "AUC (SE) = {}, AUC (best base learner) = {}".format(stack_auc_test, baselearner_best_auc_test)

    def test_validation_frame_produces_same_metric_as_perf_test():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, validation_frame=ds.test)
        se_perf = se.model_performance(test_data=ds.test)
        # since the metrics object is not exactly the same, we can just test that AUC is the same
        se_perf_validation_frame = se.model_performance(valid=True)
        assert se_perf.auc() == se_perf_validation_frame.auc(), \
            "expected SE test AUC to be the same as SE validation frame AUC, but obtained: " \
            "AUC (perf on test) = {}, AUC (test passed as validation frame) = {}".format(se_perf.auc(), se_perf_validation_frame.auc())

    return [
        pu.tag_test(test, 'blending' if blending else None) for test in [
            test_predict_on_se_model,
            test_se_performance_is_better_than_individual_models,
            test_validation_frame_produces_same_metric_as_perf_test
        ]
    ]
Code Example #14
from functools import partial

def make_test(test, transformer):
    bound_test = partial(test, transformer)
    bound_test.__name__ = test.__name__
    pyunit_utils.tag_test(bound_test, transformer.__name__)
    return bound_test
Code Example #15
def test_suite_stackedensemble_binomial(blending=False):
    
    def test_predict_on_se_model():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)
        pred = se.predict(test_data=ds.test)
        assert pred.nrow == ds.test.nrow, "expected " + str(pred.nrow) + " to be equal to " + str(ds.test.nrow)
        assert pred.ncol == 3, "expected " + str(pred.ncol) + " to be equal to 3 but it was equal to " + str(pred.ncol)
        
    
    def test_se_performance_is_better_than_individual_models():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        
        def compute_perf(model):
            perf = pu.ns(
                train=model.model_performance(train=True),
                test=model.model_performance(test_data=ds.test)
            )
            print("{} training performance: ".format(model.model_id))
            print(perf.train)
            print("{} test performance: ".format(model.model_id))
            print(perf.test)
            return perf

        base_perfs = {}
        for model in base_models:
            base_perfs[model.model_id] = compute_perf(model)

        se = train_stacked_ensemble(ds, base_models)
        perf_se = compute_perf(se)

        # Check that stack perf is better (bigger) than the best (biggest) base learner perf:
        # Training AUC
        baselearner_best_auc_train = max([perf.train.auc() for perf in base_perfs.values()])
        stack_auc_train = perf_se.train.auc()
        print("Best Base-learner Training AUC:  {}".format(baselearner_best_auc_train))
        print("Ensemble Training AUC:  {}".format(stack_auc_train))
        assert stack_auc_train > baselearner_best_auc_train, \
            "expected SE training AUC would be greater than the best of base learner training AUC, but obtained: " \
            "AUC (SE) = {}, AUC (best base learner) = {}".format(stack_auc_train, baselearner_best_auc_train)

        # Test AUC
        baselearner_best_auc_test = max([perf.test.auc() for perf in base_perfs.values()])
        stack_auc_test = perf_se.test.auc()
        print("Best Base-learner Test AUC:  {}".format(baselearner_best_auc_test))
        print("Ensemble Test AUC:  {}".format(stack_auc_test))
        assert stack_auc_test > baselearner_best_auc_test, \
            "expected SE test AUC would be greater than the best of base learner test AUC, but obtained: " \
            "AUC (SE) = {}, AUC (best base learner) = {}".format(stack_auc_test, baselearner_best_auc_test)
        
    
    def test_validation_frame_produces_same_metric_as_perf_test():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, validation_frame=ds.test)
        se_perf = se.model_performance(test_data=ds.test)
        # since the metrics object is not exactly the same, we can just test that AUC is the same
        se_perf_validation_frame = se.model_performance(valid=True)
        assert se_perf.auc() == se_perf_validation_frame.auc(), \
            "expected SE test AUC to be the same as SE validation frame AUC, but obtained: " \
            "AUC (perf on test) = {}, AUC (test passed as validation frame) = {}".format(se_perf.auc(), se_perf_validation_frame.auc())
        
    return [pu.tag_test(test, 'blending' if blending else None) for test in [
        test_predict_on_se_model,
        test_se_performance_is_better_than_individual_models,
        test_validation_frame_produces_same_metric_as_perf_test
    ]]
Code Example #16
def test_suite_stackedensemble_gaussian(blending=False):
    def test_predict_on_se_model():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models)

        for i in range(2):  # repeat predict to verify consistency
            pred = se.predict(test_data=ds.test)
            assert pred.nrow == ds.test.nrow, "expected " + str(
                pred.nrow) + " to be equal to " + str(ds.test.nrow)
            assert pred.ncol == 1, "expected " + str(
                pred.ncol) + " to be equal to 1 but it was equal to " + str(
                    pred.ncol)

    def test_se_performance_is_better_than_individual_models():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)

        def compute_perf(model):
            perf = pu.ns(train=model.model_performance(train=True),
                         test=model.model_performance(test_data=ds.test))
            print("{} training performance: ".format(model.model_id))
            print(perf.train)
            print("{} test performance: ".format(model.model_id))
            print(perf.test)
            return perf

        base_perfs = {}
        for model in base_models:
            base_perfs[model.model_id] = compute_perf(model)

        se = train_stacked_ensemble(ds, base_models)
        perf_se = compute_perf(se)

        # Check that stack perf is better (smaller) than the best (smallest) base learner perf:
        # Training RMSE for each base learner
        baselearner_best_rmse_train = min(
            [perf.train.rmse() for perf in base_perfs.values()])
        stack_rmse_train = perf_se.train.rmse()
        print("Best Base-learner Training RMSE:  {}".format(
            baselearner_best_rmse_train))
        print("Ensemble Training RMSE:  {}".format(stack_rmse_train))
        assert_warn(stack_rmse_train < baselearner_best_rmse_train,
            "expected SE training RMSE would be smaller than the best of base learner training RMSE, but obtained: " \
            "RMSE (SE) = {}, RMSE (best base learner) = {}".format(stack_rmse_train, baselearner_best_rmse_train))

        # Test RMSE for each base learner
        baselearner_best_rmse_test = min(
            [perf.test.rmse() for perf in base_perfs.values()])
        stack_rmse_test = perf_se.test.rmse()
        print("Best Base-learner Test RMSE:  {}".format(
            baselearner_best_rmse_test))
        print("Ensemble Test RMSE:  {}".format(stack_rmse_test))
        assert_warn(stack_rmse_test < baselearner_best_rmse_test,
            "expected SE test RMSE would be smaller than the best of base learner test RMSE, but obtained: " \
            "RMSE (SE) = {}, RMSE (best base learner) = {}".format(stack_rmse_test, baselearner_best_rmse_test))

    def test_validation_frame_produces_same_metric_as_perf_test():
        ds = prepare_data(blending)
        models = train_base_models(ds)
        se = train_stacked_ensemble(ds, models, validation_frame=ds.test)
        se_perf = se.model_performance(test_data=ds.test)
        se_perf_validation_frame = se.model_performance(valid=True)
        # since the metrics object is not exactly the same, we can just test that RMSE is the same
        assert se_perf.rmse() == se_perf_validation_frame.rmse(), \
            "expected SE test RMSE to be the same as SE validation frame RMSE, but obtained: " \
            "RMSE (perf on test) = {}, RMSE (test passed as validation frame) = {}".format(se_perf.rmse(), se_perf_validation_frame.rmse())

    return [
        pu.tag_test(test, 'blending' if blending else None) for test in [
            test_predict_on_se_model,
            test_se_performance_is_better_than_individual_models,
            test_validation_frame_produces_same_metric_as_perf_test
        ]
    ]
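All of the examples above are excerpts from larger test modules: they rely on import h2o, on pu (the pyunit test utilities providing tag_test and ns), and on module-level helpers such as prepare_data, train_base_models, train_stacked_ensemble, and assert_warn that are defined elsewhere in those files. Under those assumptions, a minimal driver sketch (not the project's actual runner) could collect and execute one suite in both its cross-validation and blending variants:

import h2o

if __name__ == "__main__":
    # Assumes a reachable H2O backend and that the helpers referenced by the
    # suite (prepare_data, train_base_models, train_stacked_ensemble, pu) are
    # defined in this module, as in the snippets above.
    h2o.init()
    tests = (test_suite_stackedensemble_binomial(blending=False)
             + test_suite_stackedensemble_binomial(blending=True))
    for test in tests:
        print("running {}".format(getattr(test, "__name__", test)))
        test()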