def test():
    """Two ensembles built from different base-model helpers must agree on
    base_models regardless of ordering."""
    env = _prepare_test_env()
    data = env["data"]

    ensembles = []
    for make_base_models in (base_models_1, base_models_2):
        se = H2OStackedEnsembleEstimator(base_models=make_base_models(env), seed=seed)
        se.train(data.x, data.y, data.train)
        ensembles.append(se)

    first, second = ensembles
    assert sorted(first.base_models) == sorted(second.base_models), error_message
def test_frames_cannot_be_passed_as_key():
    """Passing a frame *key* (string id) where an H2OFrame is required must
    raise H2OTypeError, both on the constructor and on train()."""
    ds = import_dataset()

    invalid_kw_args = [
        dict(training_frame=ds['train'].frame_id),
        dict(training_frame=ds['train'],
             validation_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], blending_frame=ds['blend'].frame_id),
    ]

    def assert_rejects_key(fn, kwargs):
        # The offending argument is whichever one was passed as a key (id)
        # rather than as the real training H2OFrame object.
        bad_attr = next(k for k, v in kwargs.items() if v is not ds['train'])
        try:
            fn(**kwargs)
            assert False, "should have thrown due to invalid frame"
        except H2OTypeError as e:
            assert "'{}' must be a valid H2OFrame".format(bad_attr) in str(e), str(e)

    # Constructor validation
    for kwargs in invalid_kw_args:
        assert_rejects_key(
            lambda **kw: H2OStackedEnsembleEstimator(base_models=[], **kw),
            kwargs)

    # train method validation
    se = H2OStackedEnsembleEstimator(base_models=[])
    for kwargs in invalid_kw_args:
        assert_rejects_key(lambda **kw: se.train(y=ds['target'], **kw), kwargs)
# Beispiel #3 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def test_frames_can_be_passed_as_key():
    """Frame ids (keys) are accepted for training/validation/blending frames;
    neither the constructor nor train() should reject them."""
    ds = import_dataset()

    kw_args = [
        dict(training_frame=ds['train'].frame_id),
        dict(training_frame=ds['train'], validation_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], blending_frame=ds['blend'].frame_id),
    ]

    # Constructor accepts frame keys without raising.
    for kwargs in kw_args:
        H2OStackedEnsembleEstimator(base_models=[], **kwargs)

    # train() accepts frame keys too; build fresh base models per scenario so
    # each ensemble gets consistent cross-validation predictions.
    base_model_params = dict(ntrees=3, nfolds=3, seed=seed, keep_cross_validation_predictions=True)
    for kwargs in kw_args:
        # Base models never see the blending frame.
        training_args = {k: v for k, v in kwargs.items() if k != 'blending_frame'}
        training_args['y'] = ds['target']

        base_models = []
        for estimator_cls in (H2OGradientBoostingEstimator, H2ORandomForestEstimator):
            model = estimator_cls(**base_model_params)
            model.train(**training_args)
            base_models.append(model)

        se = H2OStackedEnsembleEstimator(base_models=base_models)
        se.train(y=ds['target'], **kwargs)
def infer_uses_defaults_when_base_model_doesnt_support_distributions_test():
    """Check metalearner distribution inference for regression ensembles.

    When every base model declares a distribution/family ("tweedie" here), the
    GBM metalearner should inherit it.  As soon as a base model that does not
    support distributions (DRF) is present, the metalearner must fall back to
    the default ("gaussian") — regardless of base-model ordering.

    Improvement over the original: the three copy-pasted SE-build/assert
    sections are consolidated into one parameterized loop.
    """
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x_reg = train.columns
    y_reg = "petal_wid"
    x_reg.remove(y_reg)

    nfolds = 2
    # GLM with tweedie family — declares a distribution.
    glm_reg = H2OGeneralizedLinearEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        family="tweedie")
    glm_reg.train(x=x_reg, y=y_reg, training_frame=train)

    # GBM with explicit tweedie distribution.
    gbm_reg = H2OGradientBoostingEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        distribution="tweedie")
    gbm_reg.train(x=x_reg, y=y_reg, training_frame=train)

    # DRF has no distribution parameter at all.
    drf_reg = H2ORandomForestEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    drf_reg.train(x=x_reg, y=y_reg, training_frame=train)

    # (base models, expected inferred metalearner distribution)
    scenarios = [
        ([glm_reg, gbm_reg], "tweedie"),            # all base models tweedie
        ([glm_reg, gbm_reg, drf_reg], "gaussian"),  # DRF present -> default
        ([drf_reg, glm_reg, gbm_reg], "gaussian"),  # ordering must not matter
    ]
    for base_models, expected in scenarios:
        se_reg = H2OStackedEnsembleEstimator(training_frame=train,
                                             validation_frame=test,
                                             base_models=base_models,
                                             metalearner_algorithm="gbm")
        se_reg.train(x_reg, y_reg, train)

        actual = se_reg.metalearner().actual_params.get("distribution")
        assert actual == expected, \
            "Expected distribution {} but got {}".format(expected, actual)
def test_stacked_ensemble_is_able_to_use_imported_base_models():
    """Round-trip test: save base models and their CV holdout predictions to
    disk, wipe the cluster, re-import everything, and verify a new stacked
    ensemble can still be built from the re-uploaded base models — even with
    differently named training frames.
    """
    import tempfile, shutil, glob
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    drf = H2ORandomForestEstimator(nfolds=nfolds,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True)
    drf.train(x=x, y=y, training_frame=train)

    # Baseline SE built while everything is still in the cluster.
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, drf.model_id])
    se.train(x=x, y=y, training_frame=train)

    assert len(se.base_models) == 2

    TMP_DIR = tempfile.mkdtemp()
    try:
        # Persist both base models ...
        h2o.save_model(gbm, TMP_DIR + "/gbm.model")
        h2o.save_model(drf, TMP_DIR + "/drf.model")

        # ... remember the holdout-frame ids so the exported frames can be
        # re-registered under the exact keys the models reference ...
        gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id
        drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id
        h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout")
        h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout")

        # ... wipe every key from the backend ...
        h2o.remove_all()

        # ... then restore the holdout frames under their original ids,
        # followed by the models themselves.
        h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id)
        h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id)

        gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0])
        drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0])

        # Deliberately different frame names: the SE must not depend on the
        # original training-frame key.
        train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"), "some_other_name_of_training_frame")
        test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"), "some_other_name_of_test_frame")
        x = train.columns
        y = "species"
        x.remove(y)

        se_loaded = H2OStackedEnsembleEstimator(training_frame=train,
                                                validation_frame=test,
                                                base_models=[gbm.model_id, drf.model_id])
        se_loaded.train(x=x, y=y, training_frame=train)

        assert len(se_loaded.base_models) == 2
    finally:
        shutil.rmtree(TMP_DIR)
def test_offset_column_has_to_be_same_in_each_base_model():
    """SE training must fail when base models disagree on offset_column.

    gbm has no offset, gbm2/gbm3 use different offset columns; every
    inconsistent combination (in either order) must be rejected by the
    backend with an H2OResponseError.

    Improvement over the original: the three byte-identical try/except
    sections are collapsed into one loop over the base-model combinations.
    """
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    x = train.columns
    y = "petal_wid"
    x.remove(y)

    nfolds = 2
    # No offset column.
    gbm = H2OGradientBoostingEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    )
    gbm.train(x=x, y=y, training_frame=train)

    # offset_column="petal_len"
    gbm2 = H2OGradientBoostingEstimator(nfolds=nfolds,
                                        fold_assignment="Modulo",
                                        keep_cross_validation_predictions=True,
                                        offset_column="petal_len")
    gbm2.train(x=x, y=y, training_frame=train)

    # offset_column="sepal_len"
    gbm3 = H2OGradientBoostingEstimator(nfolds=nfolds,
                                        fold_assignment="Modulo",
                                        keep_cross_validation_predictions=True,
                                        offset_column="sepal_len")
    gbm3.train(x=x, y=y, training_frame=train)

    inconsistent_base_models = [
        [gbm.model_id, gbm2.model_id],   # no offset vs. petal_len
        [gbm2.model_id, gbm.model_id],   # same mismatch, reversed order
        [gbm2.model_id, gbm3.model_id],  # petal_len vs. sepal_len
    ]
    for base_models in inconsistent_base_models:
        try:
            se = H2OStackedEnsembleEstimator(
                training_frame=train,
                base_models=base_models,
                offset_column="petal_len")
            se.train(x=x, y=y, training_frame=train)
            assert False, "Should have failed with 'All base models have to have the same offset_column!'"
        except H2OResponseError:
            pass
def stackedensemble_base_models_test():
    """This test checks the following:
    1) That passing in a list of models for base_models works.
    2) That passing in a list of models and model_ids results in the same stacked ensemble.
    """

    # Load the higgs training data and mark the response as categorical.
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    y = "response"
    x = train.columns
    x.remove(y)
    train[y] = train[y].asfactor()

    nfolds = 5

    # Cross-validated GBM base model.
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Cross-validated DRF base model.
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Ensemble built from model ids ...
    stack1 = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_binomial1",
        base_models=[my_gbm.model_id, my_rf.model_id])
    stack1.train(x=x, y=y, training_frame=train)

    # ... and the same ensemble built from the model objects themselves.
    stack2 = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial2",
                                         base_models=[my_gbm, my_rf])
    stack2.train(x=x, y=y, training_frame=train)

    # Both ways of specifying base models must yield an equivalent ensemble.
    assert stack1.auc() == stack2.auc()
def stackedensemble_levelone_frame_test():
    """keep_levelone_frame=True must expose the level-one frame (base-model
    class probabilities + response); with the default (False) the level-one
    frame is unavailable."""

    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    y = "species"
    x = list(range(4))
    train[y] = train[y].asfactor()
    nfolds = 5
    num_base_models = 2
    # Expected level-one width: one probability column per class per base
    # model, plus the response column.
    num_col_level_one_frame = (
        train[y].unique().nrow
    ) * num_base_models + 1  #Predicting 3 classes across two base models + response (3*2+1)

    # train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="multinomial",
        nfolds=nfolds,
        ntrees=10,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)

    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM and RF above, keeping the
    # level-one frame so it can be fetched and inspected.
    stack = H2OStackedEnsembleEstimator(
        base_models=[my_gbm.model_id, my_rf.model_id],
        keep_levelone_frame=True)
    stack.train(
        x=x, y=y,
        training_frame=train)
    level_one_frame = h2o.get_frame(stack.levelone_frame_id()["name"])
    assert level_one_frame.ncols == num_col_level_one_frame, "The number of columns in a level one frame should be numClasses * numBaseModels + 1."
    assert level_one_frame.nrows == train.nrows, "The number of rows in the level one frame should match train number of rows. "

    # Same ensemble without keep_levelone_frame: no level-one frame is kept.
    stack2 = H2OStackedEnsembleEstimator(
        base_models=[my_gbm.model_id, my_rf.model_id])
    stack2.train(
        x=x, y=y,
        training_frame=train)
    assert stack2.levelone_frame_id(
    ) is None, "Level one frame is only available when keep_levelone_frame is True."
# Beispiel #9 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def grid_export_with_cv():
    """Save a CV grid with its holdout predictions, reload it after a cluster
    wipe, and train a stacked ensemble from the reloaded models.

    Bug fix: `map(...)` is lazy in Python 3, so the original code only called
    `cross_validation_holdout_predictions()` when the ids were iterated in the
    assertion loop — which happened *after* `h2o.remove_all()` had deleted the
    models.  The ids are now materialized eagerly, before the wipe.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search with cross-validation predictions kept.
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [1, 2]
    gs = H2OGridSearch(H2OGradientBoostingEstimator(nfolds=2, keep_cross_validation_predictions=True, seed=42),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)

    # Materialize the ids NOW — the models are removed below.
    holdout_frame_ids = [m.cross_validation_holdout_predictions().frame_id for m in gs.models]

    export_dir = pyunit_utils.locate("results")
    saved_path = h2o.save_grid(export_dir, gs.grid_id, export_cross_validation_predictions=True)

    h2o.remove_all()

    grid = h2o.load_grid(saved_path)

    assert grid is not None
    # The exported CV holdout frames must be restored along with the grid.
    for holdout_frame_id in holdout_frame_ids:
        assert h2o.get_frame(holdout_frame_id) is not None

    # A stacked ensemble trained from the reloaded grid proves the restored
    # CV predictions are usable.
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    stack = H2OStackedEnsembleEstimator(base_models=grid.model_ids)
    stack.train(x=list(range(4)), y=4, training_frame=train)

    predicted = stack.predict(train)
    assert predicted.nrow == train.nrow
def test_frames_can_be_passed_to_constructor():
    """Training/validation/blending frames given to the constructor are
    honored by a subsequent argument-less train() call."""
    ds = import_dataset()

    # Both base models share the same constructor configuration.
    base_params = dict(ntrees=10,
                       nfolds=0,
                       seed=seed,
                       training_frame=ds['train'],
                       validation_frame=ds['valid'])

    gbm = H2OGradientBoostingEstimator(**base_params)
    gbm.train(y=ds['target'])

    rf = H2ORandomForestEstimator(**base_params)
    rf.train(y=ds['target'])

    se = H2OStackedEnsembleEstimator(base_models=[gbm, rf],
                                     seed=seed,
                                     training_frame=ds['train'],
                                     validation_frame=ds['valid'],
                                     blending_frame=ds['blend'])
    se.train(y=ds['target'])

    assert se.auc() > 0
def train_stacked_ensemble(dataset, base_models, **kwargs):
    """Build and train a stacked ensemble over *base_models* on *dataset*.

    Extra keyword arguments are forwarded to train(); the dataset's optional
    `blend` attribute is used as the blending frame when present.
    """
    model_ids = [m.model_id for m in base_models]
    blend = getattr(dataset, 'blend', None)

    se = H2OStackedEnsembleEstimator(base_models=model_ids, seed=seed)
    se.train(x=dataset.x, y=dataset.y,
             training_frame=dataset.train,
             blending_frame=blend,
             **kwargs)
    return se
def test_stackedensemble_respects_the_max_runtime_secs():
    """SE training on a huge blending frame must abort once max_runtime_secs
    (1 second here) elapses."""
    max_runtime_secs = 1
    data = prepare_data()

    gs1 = H2OGridSearch(
        H2OGradientBoostingEstimator(fold_assignment="modulo", nfolds=3),
        hyper_params={"ntrees": [1, 2, 3, 4, 5]})
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    # Double the blending frame 15 times (2**15 x the original rows) so the
    # ensemble cannot possibly finish within the one-second budget.
    big_blending_frame = data.train
    for _ in range(15):
        big_blending_frame = big_blending_frame.rbind(big_blending_frame)

    se = H2OStackedEnsembleEstimator(
        base_models=gs1.model_ids,
        max_runtime_secs=max_runtime_secs,
        blending_frame=big_blending_frame)
    try:
        se.train(data.x, data.y, data.train)
        assert False, "This should have failed due to time out."
    except H2OResponseError:
        pass
def test_base_models_are_populated():
    """base_models must be reported as a list of model ids, both on the local
    SE object and on a freshly fetched copy of the same model."""
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    y = "species"
    x = train.columns
    x.remove(y)

    cv_params = dict(nfolds=2,
                     fold_assignment="Modulo",
                     keep_cross_validation_predictions=True)

    gbm = H2OGradientBoostingEstimator(**cv_params)
    gbm.train(x=x, y=y, training_frame=train)
    rf = H2ORandomForestEstimator(**cv_params)
    rf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, rf.model_id])
    se.train(x=x, y=y, training_frame=train)
    retrieved_se = get_model(se.model_id)

    assert len(se.base_models) == 2
    assert len(retrieved_se.base_models) == 2
    assert se.base_models == retrieved_se.base_models
    # ensure that we are getting the model_ids (strings), not model objects
    assert pu.is_type(se.base_models, [str])
    assert pu.is_type(retrieved_se.base_models, [str])
# Beispiel #14 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def stacking(trained_base_model_lst):
    """
    Build a stacking (stacked-ensemble) estimator on top of trained base models.
    :param trained_base_model_lst: a list of base algorithm models' id
    :return: a constructed stacking estimator
    """
    estimator = H2OStackedEnsembleEstimator(model_id="stacking_model",
                                            base_models=trained_base_model_lst)
    return estimator
 def test_base_models_work_properly_with_list_of_models():
     """A single-element base_models list (of model objects) must be reported
     back as a list containing that model's id."""
     env = _prepare_test_env()
     se = H2OStackedEnsembleEstimator(base_models=[env["drf"]])
     se.train(env["data"].x, env["data"].y, env["data"].train)
     assert se.base_models == [
         env["drf"].model_id
     ], "StackedEnsembles don't work properly with single model in base models"
def test_offset_column_is_inherited_from_base_models():
    """When all base models share the same offset_column, the SE metalearner
    must pick it up automatically."""
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    y = "petal_wid"
    x = train.columns
    x.remove(y)

    base_params = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True,
                       offset_column="petal_len")

    # Two GBMs trained with the identical offset column.
    base_model_ids = []
    for _ in range(2):
        model = H2OGradientBoostingEstimator(**base_params)
        model.train(x=x, y=y, training_frame=train)
        base_model_ids.append(model.model_id)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     base_models=base_model_ids)
    se.train(x=x, y=y, training_frame=train)

    offset_param = se.metalearner().actual_params["offset_column"]
    assert offset_param["column_name"] == "petal_len"
# Beispiel #17 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def test_frames_can_be_overridden_in_train_method():
    """Frames passed to train() must take precedence over those set on the
    constructor (which here are deliberately bogus)."""
    ds = import_dataset()

    # Bogus frame set on every constructor; train() must override it.
    dummy_frame = h2o.H2OFrame([1, 2, 3])
    dummy_frames = dict(training_frame=dummy_frame,
                        validation_frame=dummy_frame)

    gbm = H2OGradientBoostingEstimator(ntrees=10, nfolds=0, seed=seed,
                                       **dummy_frames)
    gbm.train(y=ds['target'],
              training_frame=ds['train'],
              validation_frame=ds['valid'])

    rf = H2ORandomForestEstimator(ntrees=10, nfolds=0, seed=seed,
                                  **dummy_frames)
    rf.train(y=ds['target'],
             training_frame=ds['train'],
             validation_frame=ds['valid'])

    se = H2OStackedEnsembleEstimator(base_models=[gbm, rf], seed=seed,
                                     blending_frame=dummy_frame,
                                     **dummy_frames)
    se.train(y=ds['target'],
             training_frame=ds['train'],
             validation_frame=ds['valid'],
             blending_frame=ds['blend'])

    assert se.auc() > 0
def stacked_ensemble_export():
    """Verify that POJO/MOJO download is rejected for stacked ensembles."""
    print("###### STACKED ENSEMBLE ######")
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    x = frame.columns
    y = "response"
    x.remove(y)
    # binomial classification target
    frame[y] = frame[y].asfactor()
    # cross-validated GBM base model
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=5,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=frame)
    # cross-validated DRF base model
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=5,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=frame)
    model = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_binomial1",
        base_models=[my_gbm.model_id, my_rf.model_id])
    model.train(x=x, y=y, training_frame=frame)
    # NOTE(review): "Stacked Enemble" looks like a typo, but it is the
    # expected-message fragment passed to expect_error — presumably it mirrors
    # the backend's own message; confirm against the server before "fixing" it.
    expect_error(model.download_pojo, "Stacked Enemble", "POJO")
    expect_error(model.download_mojo, "Stacked Enemble", "MOJO")
# Beispiel #19 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def test_SE_warns_when_all_basemodels_use_same_weights_column_and_SE_none():
    """If every base model was trained with the same weights_column but the SE
    itself has none, training the SE should emit a RuntimeWarning suggesting
    that weights column."""
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    # constant weights — the values don't matter, only that the column is used
    train["weights"] = 1
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True,
                                       weights_column="weights")
    gbm.train(x=x, y=y, training_frame=train)

    rf = H2ORandomForestEstimator(nfolds=nfolds,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True,
                                  weights_column="weights")
    rf.train(x=x, y=y, training_frame=train)

    # Note: no weights_column on the SE itself.
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     base_models=[gbm.model_id, rf.model_id])

    # Clear every module's __warningregistry__ so a warning already issued in
    # an earlier test is not suppressed by the "warn only once" machinery.
    for v in sys.modules.values():
        if getattr(v, '__warningregistry__', None):
            v.__warningregistry__ = {}
    with warnings.catch_warnings(record=True) as ws:
        # Record every RuntimeWarning raised during training.
        warnings.simplefilter("always", RuntimeWarning)
        se.train(x=x, y=y, training_frame=train)
        assert any((issubclass(w.category, RuntimeWarning)
                    and 'use weights_column="weights"' in str(w.message)
                    for w in ws))
# Beispiel #20 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def test_weights_column_is_propagated_to_metalearner():
    """A weights_column set on the SE must end up on the trained metalearner."""
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    # constant weights — only the column's presence matters here
    train["weights"] = 1
    y = "species"
    x = train.columns
    x.remove(y)

    base_params = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True,
                       weights_column="weights")

    gbm = H2OGradientBoostingEstimator(**base_params)
    gbm.train(x=x, y=y, training_frame=train)

    rf = H2ORandomForestEstimator(**base_params)
    rf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     base_models=[gbm.model_id, rf.model_id],
                                     weights_column="weights")
    se.train(x=x, y=y, training_frame=train)

    weights_param = se.metalearner().actual_params["weights_column"]
    assert weights_param["column_name"] == "weights"
def test_stackedensemble_propagates_the_max_runtime_secs():
    """max_runtime_secs set on the SE must be forwarded to its metalearner."""
    max_runtime_secs = 5
    data = prepare_data()

    gs1 = H2OGridSearch(
        H2OGradientBoostingEstimator(fold_assignment="modulo",
                                     nfolds=3,
                                     keep_cross_validation_predictions=True),
        hyper_params={"ntrees": [1, 3, 5]})
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    # The whole grid object may be passed as a base model.
    se = H2OStackedEnsembleEstimator(base_models=[gs1], max_runtime_secs=max_runtime_secs)
    se.train(data.x, data.y, data.train)
    metalearner = h2o.get_model(se.metalearner()["name"])

    # metalearner inherited a positive budget no larger than the configured max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] <= max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] > 0

    # the ensemble itself kept the configured max_runtime_secs
    assert se.max_runtime_secs == max_runtime_secs
# Beispiel #22 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def airline_gbm_random_grid():
    """Random-discrete GBM grid search on the airlines data, followed by a
    stacked ensemble over all grid models and a sanity predict() call."""
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin", "Dest"]

    # create hyperameter and search criteria lists (ranges are inclusive..exclusive))
    hyper_params_tune = {
        'max_depth':
        list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows':
        [2**x for x in range(0,
                             int(math.log(air_hex.nrow, 2) - 1) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }

    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 5,  ## build no more than 5 models
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "AUC",
        'stopping_tolerance': 1e-3
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_params_tune,
                             search_criteria=search_criteria_tune)

    # CV predictions are kept so the grid models can feed the stacked ensemble below.
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=1234)

    # under rare circumstances, GBM may not build enough model (5 in this case) within 600 seconds to return
    # hence, switch the compare from == to <= should fix the intermittent problem.
    assert len(air_grid.get_grid()
               ) <= 5, "Grid search has returned more models than allowed."
    print(air_grid.get_grid("logloss"))

    # Ensemble over every model produced by the grid search.
    stacker = H2OStackedEnsembleEstimator(selection_strategy="choose_all",
                                          base_models=air_grid.model_ids)
    stacker.train(model_id="my_ensemble",
                  y="IsDepDelayed",
                  training_frame=air_hex)
    predictions = stacker.predict(air_hex)  # training data
    print("preditions for ensemble are in: " + predictions.frame_id)
 def check_stackedensemble_with_AUTO_metalearner(data, models):
     """Train an SE with the default (AUTO) metalearner over *models* and
     verify its test predictions cover the full target domain."""
     se = H2OStackedEnsembleEstimator(base_models=models,
                                      metalearner_nfolds=5,
                                      seed=seed)
     se.train(data.x, data.y, data.train)
     results = scores_and_preds(se, data.test)
     print(results)
     assert data.domain == results.test_pclasses, "expected predicted classes {} but got {}".format(
         data.domain, results.test_pclasses)
 def test_validation_on_backend_works():
     """Passing a frame id as a base model must be rejected by the backend
     with the expected "Unsupported type" error message."""
     data = prepare_data()
     # A frame key is deliberately (and wrongly) used as a base model.
     se = H2OStackedEnsembleEstimator(base_models=[data.train.frame_id])
     try:
         se.train(data.x, data.y, data.train)
     except H2OResponseError as e:
         assert "Unsupported type \"class water.fvec.Frame\" as a base model." in str(e), \
             "StackedEnsembles' base models validation exception probably changed."
     else:
         assert False, "StackEnsembles' base models validation doesn't work properly."
 def check_stackedensemble_with_GLM_metalearner(data, models):
     """Train an SE with a GLM metalearner; this check expects its test
     predictions to collapse to a single class, i.e. NOT cover the whole
     target domain (contrast with the AUTO-metalearner check)."""
     se = H2OStackedEnsembleEstimator(base_models=models,
                                      metalearner_algorithm='GLM',
                                      metalearner_nfolds=5,
                                      seed=seed)
     se.train(data.x, data.y, data.train)
     results = scores_and_preds(se, data.test)
     print(results)
     assert data.domain != results.test_pclasses, "expected predictions not to include all target domain"
     assert len(results.test_pclasses) == 1
# Beispiel #26 (0) — example-list separator left over from extraction; kept as a comment so the file parses
def KAStackdEsnsumbleEstimator(modellist,
                               nfolds=nfolds,
                               stack_id="ka",
                               train=train_ka,
                               ensemble_tag="ka"):
    # NOTE(review): the defaults `nfolds` and `train_ka` are evaluated at
    # definition time and must already exist as module-level globals — verify.
    # NOTE(review): the `nfolds` and `stack_id` parameters are never used in
    # the body; `x` and `y` are read from module-level globals.
    """Build and train a binomial stacked ensemble over *modellist*.

    :param modellist: base models (or their ids) for the ensemble
    :param train: training frame used by the metalearner
    :param ensemble_tag: suffix for the resulting model id
    :return: the trained H2OStackedEnsembleEstimator
    """
    print("starting time:")
    ensemble = H2OStackedEnsembleEstimator(
        model_id=("airbnb_ensemble_binomial_" + ensemble_tag),
        base_models=modellist)
    ensemble.train(x=x, y=y, training_frame=train)
    return (ensemble)
def metalearner_parameters_test():
    """Verify that ``metalearner_algorithm`` and ``metalearner_params`` are
    forwarded to the backend and reported back via ``actual_params``.

    Fix: the naivebayes and xgboost checks were verbatim copies of the same
    build/train/assert sequence; they are now one local helper, so adding a
    third algorithm is a one-line change and the two paths cannot drift.
    """
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    rf = H2ORandomForestEstimator(nfolds=nfolds,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True)
    rf.train(x=x, y=y, training_frame=train)

    def _check_metalearner(algorithm, params, expected_params_json):
        # Train one SE with the given metalearner config and assert the
        # backend echoes both the algorithm and the JSON-encoded params.
        se = H2OStackedEnsembleEstimator(
            training_frame=train,
            validation_frame=test,
            base_models=[gbm.model_id, rf.model_id],
            metalearner_algorithm=algorithm,
            metalearner_params=params)
        se.train(x=x, y=y, training_frame=train)
        assert se.actual_params["metalearner_algorithm"] == algorithm
        assert expected_params_json == se.actual_params["metalearner_params"]

    _check_metalearner('naivebayes', {'min_prob': 0.5}, '{"min_prob": [0.5]}')
    _check_metalearner('xgboost', {'booster': 'dart'}, '{"booster": ["dart"]}')
 def check_stackedensemble_with_GLM_metalearner_with_standardization_disabled(
         data, models):
     """Train a stacked ensemble with a GLM metalearner (standardization off)
     and verify its test predictions cover the full target domain."""
     ensemble = H2OStackedEnsembleEstimator(
         base_models=models,
         metalearner_algorithm='GLM',
         metalearner_nfolds=5,
         metalearner_params=dict(standardize=False),
         seed=seed)
     ensemble.train(data.x, data.y, data.train)
     outcome = scores_and_preds(ensemble, data.test)
     print(outcome)
     assert data.domain == outcome.test_pclasses, "expected predicted classes {} but got {}".format(
         data.domain, outcome.test_pclasses)
    def train_ensemble_using_metalearner(algo, expected_algo):
        """Train a SE with the given metalearner algorithm and verify the
        backend actually used ``expected_algo`` (closes over my_gbm, my_rf,
        x, y, train from the enclosing scope)."""
        print("Training ensemble using {} metalearner.".format(algo))

        meta_params = dict(metalearner_nfolds=3)

        ensemble = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                               metalearner_algorithm=algo,
                                               **meta_params)
        ensemble.train(x=x, y=y, training_frame=train)
        assert ensemble.params['metalearner_algorithm']['actual'] == expected_algo
        if meta_params:
            assert ensemble.params['metalearner_nfolds']['actual'] == 3

        # Fetch the trained metalearner model and confirm its algorithm.
        metalearner = h2o.get_model(ensemble.metalearner()['name'])
        assert metalearner.algo == expected_algo, \
            "Expected that the metalearner would use {}, but actually used {}.".format(expected_algo, metalearner.algo)
        if meta_params:
            assert metalearner.params['nfolds']['actual'] == 3
def metalearner_property_test():
    """Check that the legacy dict-style ``metalearner()['name']`` access still
    works, agrees with ``metalearner().model_id``, and emits a
    DeprecationWarning."""
    train = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    rf = H2ORandomForestEstimator(nfolds=nfolds,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True)
    rf.train(x=x, y=y, training_frame=train)
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, rf.model_id])
    se.train(x=x, y=y, training_frame=train)

    legacy_metalearner = h2o.get_model(se.metalearner()["name"])

    # Backward compatibility: dict-style access still yields the model id.
    assert se.metalearner()["name"] == legacy_metalearner.model_id
    # New-style access resolves to the very same model.
    assert legacy_metalearner.model_id == se.metalearner().model_id

    # Reset per-module warning registries so an already-emitted
    # DeprecationWarning can fire again inside catch_warnings below.
    for module in sys.modules.values():
        if getattr(module, '__warningregistry__', None):
            module.__warningregistry__ = {}
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DeprecationWarning)
        # Trigger the deprecated access path.
        _ = se.metalearner()["name"]
        # At least one recorded warning must be the expected deprecation.
        assert any(issubclass(w.category, DeprecationWarning)
                   and "metalearner()['name']" in str(w.message)
                   for w in caught)