fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
my_GLM.train(x, y, train, validation_frame=valid)
my_GLM.model_performance(valid)

## Save models
h2o.save_model(model=my_GBM, force=True)
h2o.save_model(model=my_NN, force=True)
h2o.save_model(model=my_RF, force=True)
h2o.save_model(model=my_GLM, force=True)

## Stacked ensemble - about 21 seconds to run
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
models = [my_GBM.model_id, my_NN.model_id, my_RF.model_id, my_GLM.model_id]
my_SE = H2OStackedEnsembleEstimator(model_id="SE_glm_gbm_rf_nn",
                                    base_models=models,
                                    metalearner_algorithm='deeplearning',
                                    seed=12)
my_SE.train(x, y, train, validation_frame=valid)
h2o.save_model(model=my_SE, force=True)

# Compare model performance
all_models = [my_GBM, my_NN, my_RF, my_GLM, my_SE]
names = ["GBM", "NN", "RF", "GLM", "SE"]
test_perf = [m.model_performance(valid) for m in all_models]
pd.Series([p.rmse() for p in test_perf], index=names)
# Better performance on the Stacked Ensemble!

## Performance on the test data
my_SE.model_performance(test)

# Performance reached on the test data: RMSE = 120575 < 123000
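
# Hedged sketch: h2o.save_model returns the path it wrote to, so a model saved
# above can be reloaded in a later session with h2o.load_model (the variable
# names below are illustrative, not from the original snippet).
se_path = h2o.save_model(model=my_SE, force=True)
my_SE_reloaded = h2o.load_model(se_path)
my_SE_reloaded.model_performance(test)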
Example #2
def stackedensemble_metalearner_seed_test():

    # Import training set
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Metalearner params for the GBM metalearner
    gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=nfolds,
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train two SE models with the same metalearner seed
    stack_gbm1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=55555)
    stack_gbm2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = h2o.get_model(stack_gbm1.metalearner()['name'])
    meta_gbm2 = h2o.get_model(stack_gbm2.metalearner()['name'])

    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(
        train=True), "RMSE should match if same seed"

    # Train two SE models with different metalearner seeds
    stack_gbm3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=55555)
    stack_gbm4 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = h2o.get_model(stack_gbm3.metalearner()['name'])
    meta_gbm4 = h2o.get_model(stack_gbm4.metalearner()['name'])
    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(
        train=True), "RMSE should NOT match if diff seed"
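
# Hedged runner stanza (an assumption about how this pyunit test is invoked;
# h2o-3 tests are conventionally launched through pyunit_utils):
if __name__ == "__main__":
    pyunit_utils.standalone_test(stackedensemble_metalearner_seed_test)
else:
    stackedensemble_metalearner_seed_test()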
Example #3
gbm_final_model = gbm_grid_table.models[0]

train_performance = gbm_final_model.model_performance(train)
train_performance.plot()
valid_performance = gbm_final_model.model_performance(valid)
valid_performance.plot()
test_performance = gbm_final_model.model_performance(test)
test_performance.plot()

import time
ts = time.time()
ts = int(ts)
gbm_model = gbm_final_model
rf_model = rf_final_model
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial_" +
                                       str(ts),
                                       base_models=[gbm_model, rf_model])

ensemble.train(x=predictors,
               y=target,
               training_frame=train,
               validation_frame=valid)

train_performance = ensemble.model_performance(train)
train_performance.plot()
valid_performance = ensemble.model_performance(valid)
valid_performance.plot()
test_performance = ensemble.model_performance(test)
test_performance.plot()
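
# Hedged sketch: pull a single headline number from the metrics object above
# (auc() applies here on the assumption that this ensemble is binomial, as its
# model_id suggests).
print("Ensemble test AUC: {0:.4f}".format(test_performance.auc()))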

# Eval ensemble performance on the test data
def basic_inference_works_for_DRF_and_NB_test():
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))

    x_class = train.columns
    y_class = "species"
    x_class.remove(y_class)

    nfolds = 2

    nb_class = H2ONaiveBayesEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    )
    nb_class.train(x=x_class, y=y_class, training_frame=train)

    gbm_class = H2OGradientBoostingEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    )
    gbm_class.train(x=x_class, y=y_class, training_frame=train)

    drf_class = H2ORandomForestEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True)
    drf_class.train(x=x_class, y=y_class, training_frame=train)

    se_class_0 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[nb_class, gbm_class, drf_class],
        metalearner_algorithm="gbm")
    se_class_0.train(x_class, y_class, train)

    assert se_class_0.metalearner().actual_params.get("distribution") == "multinomial", \
        "Expected distribution {} but got {}".format("multinomial",
                                                     se_class_0.metalearner().actual_params.get("distribution"))

    se_class_1 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[gbm_class, drf_class, nb_class],
        metalearner_algorithm="gbm")
    se_class_1.train(x_class, y_class, train)

    assert se_class_1.metalearner().actual_params.get("distribution") == "multinomial", \
        "Expected distribution {} but got {}".format("multinomial",
                                                     se_class_1.metalearner().actual_params.get("distribution"))

    se_class_2 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[drf_class, nb_class, gbm_class],
        metalearner_algorithm="gbm")
    se_class_2.train(x_class, y_class, train)

    assert se_class_2.metalearner().actual_params.get("distribution") == "multinomial", \
        "Expected distribution {} but got {}".format("multinomial",
                                                     se_class_2.metalearner().actual_params.get("distribution"))

    se_class_3 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[nb_class, gbm_class, drf_class])
    se_class_3.train(x_class, y_class, train)

    assert se_class_3.metalearner().actual_params.get("family") == "multinomial", \
        "Expected family {} but got {}".format("multinomial",
                                               se_class_3.metalearner().actual_params.get("family"))

    se_class_4 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[gbm_class, drf_class, nb_class])
    se_class_4.train(x_class, y_class, train)

    assert se_class_4.metalearner().actual_params.get("family") == "multinomial", \
        "Expected family {} but got {}".format("multinomial",
                                               se_class_4.metalearner().actual_params.get("family"))

    se_class_5 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[drf_class, nb_class, gbm_class])
    se_class_5.train(x_class, y_class, train)

    assert se_class_5.metalearner().actual_params.get("family") == "multinomial", \
        "Expected family {} but got {}".format("multinomial",
                                               se_class_5.metalearner().actual_params.get("family"))
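
    # Hedged sketch: beyond asserting, the inferred settings can be inspected
    # directly on the metalearner (same actual_params accessor as above).
    meta = se_class_5.metalearner()
    print("inferred family:", meta.actual_params.get("family"))
    print("inferred link:", meta.actual_params.get("link"))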
Example #5
def airline_gbm_random_grid():
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["DayofMonth", "DayOfWeek"]

    hyper_parameters = {
        'learn_rate': [0.1, 0.2],
        'max_depth': [2, 3, 4],
        'ntrees': [5, 10, 15]
    }

    search_crit = {
        'strategy': "RandomDiscrete",
        'max_models': 5,
        'seed': 1234,
        'stopping_rounds': 3,
        'stopping_metric': "AUTO",
        'stopping_tolerance': 1e-2
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=5678)

    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    stacker = H2OStackedEnsembleEstimator(base_models=air_grid.model_ids)
    print("created H2OStackedEnsembleEstimator")
    stacker.train(model_id="my_ensemble",
                  y="IsDepDelayed",
                  training_frame=air_hex)
    print("trained H2OStackedEnsembleEstimator")
    predictions = stacker.predict(air_hex)  # training data
    print("predictions for ensemble are in: " + predictions.frame_id)

    # Check that the model can be retrieved
    assert stacker.model_id == "my_ensemble"
    modelcopy = h2o.get_model(stacker.model_id)
    assert modelcopy is not None
    assert modelcopy.model_id == "my_ensemble"

    # golden test for ensemble predictions:
    assert round(
        predictions[0, "YES"], 4
    ) == 0.4327, "Expected prediction for row: {0} to be: {1}; got: {2} instead.".format(
        0, 0.4327, round(predictions[0, "YES"], 4))
    assert round(
        predictions[1, "YES"], 4
    ) == 0.5214, "Expected prediction for row: {0} to be: {1}; got: {2} instead.".format(
        1, 0.5214, round(predictions[1, "YES"], 4))
    assert round(
        predictions[2, "YES"], 4
    ) == 0.4666, "Expected prediction for row: {0} to be: {1}; got: {2} instead.".format(
        2, 0.4666, round(predictions[2, "YES"], 4))

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   distribution="bernoulli")
    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    # added this part to check h2o.get_grid is working properly
    fetch_grid = h2o.get_grid(str(air_grid.grid_id))
    assert len(air_grid.get_grid()) == len(fetch_grid.get_grid())

    ################################################################################
    # PUBDEV-5145: make sure we give a good error message for JSON parse failures, like range() under 3.6
    hyper_parameters['max_depth'] = range(2, 4)
    search_crit['max_models'] = 1

    if sys.version_info[0] < 3:
        # no exception
        air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 hyper_params=hyper_parameters,
                                 search_criteria=search_crit)
        air_grid.train(x=myX,
                       y="IsDepDelayed",
                       training_frame=air_hex,
                       nfolds=5,
                       fold_assignment='Modulo',
                       keep_cross_validation_predictions=True,
                       distribution="bernoulli",
                       seed=5678)
    else:
        # MalformedJsonException in Java; check for the right error message in Python
        got_exception = False
        exc = None
        try:
            air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                     hyper_params=hyper_parameters,
                                     search_criteria=search_crit)
            air_grid.train(x=myX,
                           y="IsDepDelayed",
                           training_frame=air_hex,
                           nfolds=5,
                           fold_assignment='Modulo',
                           keep_cross_validation_predictions=True,
                           distribution="bernoulli",
                           seed=5678)
        except H2OResponseError as e:
            got_exception = True
            exc = e
        assert (type(exc) == H2OResponseError)
        print("Got an H2OResponseError, as expected with 3.x")
        assert ("Error: Can't parse the hyper_parameters dictionary"
                in str(exc))
        assert (got_exception)
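
        # Hedged aside (not part of the original test): under Python 3 the fix
        # is simply to materialize the range before submitting, e.g.
        #     hyper_parameters['max_depth'] = list(range(2, 4))
        # which serializes to JSON cleanly.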

    hyper_parameters['max_depth'] = 1
    search_crit['max_models'] = [1, 3]  # expecting an int
    # IllegalStateException in Java; check for the right error message in Python
    got_exception = False
    exc = None
    try:
        air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 hyper_params=hyper_parameters,
                                 search_criteria=search_crit)
        air_grid.train(x=myX,
                       y="IsDepDelayed",
                       training_frame=air_hex,
                       nfolds=5,
                       fold_assignment='Modulo',
                       keep_cross_validation_predictions=True,
                       distribution="bernoulli",
                       seed=5678)
    except H2OResponseError as e:
        got_exception = True
        exc = e
    assert (type(exc) == H2OResponseError)
    print("Got an H2OResponseError, as expected with 3.x")
    assert ("Error: Can't parse the search_criteria dictionary" in str(exc))
    assert (got_exception)
Example #6
    fold_assignment="Modulo")
glm_model.train(x=features,
                y=target,
                training_frame=train,
                model_id="glm_model")
glm_model.show()

# Deep Water Model
dw_model = H2ODeepWaterEstimator(epochs=3,
                                 network="lenet",
                                 ignore_const_cols=False,
                                 image_shape=[28, 28],
                                 channels=1,
                                 standardize=False,
                                 seed=1234,
                                 nfolds=nfolds,
                                 keep_cross_validation_predictions=True,
                                 fold_assignment="Modulo")
dw_model.train(x=features, y=target, training_frame=train, model_id="dw_model")
dw_model.show()

# Stacked Ensemble
stack_all = H2OStackedEnsembleEstimator(
    base_models=[gbm_model.model_id, glm_model.model_id, dw_model.model_id])
stack_all.train(x=features,
                y=target,
                training_frame=train,
                validation_frame=valid,
                model_id="stack_all")
stack_all.model_performance()
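
# Hedged aside: model_performance() with no argument returns training metrics,
# so for held-out numbers pass the validation frame explicitly.
stack_all.model_performance(valid)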
def infer_family_helper(family,
                        expected_family,
                        link,
                        expected_link,
                        kwargs1=None,
                        kwargs2=None):
    kwargs1 = dict() if kwargs1 is None else kwargs1
    kwargs2 = dict() if kwargs2 is None else kwargs2
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    if family == "multinomial":
        y = "species"
    elif family == "binomial":
        train["response"] = (train["species"] == "Iris-versicolor").asfactor()
        test["response"] = (test["species"] == "Iris-versicolor").asfactor()
        y = "response"
    elif family == "quasibinomial" or family == "fractionalbinomial":
        train["response"] = (train["species"] == "Iris-versicolor") / 2
        test["response"] = (test["species"] == "Iris-versicolor") / 2
        y = "response"
    elif family == "ordinal":
        y = "response"
        train[y] = (train["species"] == "Iris-versicolor")
        test[y] = (test["species"] == "Iris-versicolor")
        train[(train["species"] == "Iris-setosa"), y] = 2
        test[(test["species"] == "Iris-setosa"), y] = 2
        train[y] = train[y].asfactor()
        test[y] = test[y].asfactor()
    else:
        y = "petal_wid"

    x = train.columns
    x.remove(y)

    if "link" not in kwargs1 and link:
        kwargs1["link"] = link
    if "family" not in kwargs1:
        kwargs1["family"] = family

    if "link" not in kwargs2 and link:
        kwargs2["link"] = link
    if "family" not in kwargs2:
        kwargs2["family"] = family

    nfolds = 2
    glm = H2OGeneralizedLinearEstimator(nfolds=nfolds,
                                        fold_assignment="Modulo",
                                        keep_cross_validation_predictions=True,
                                        **kwargs1)
    glm.train(x=x, y=y, training_frame=train)

    glm2 = H2OGeneralizedLinearEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        **kwargs2)
    glm2.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[glm, glm2],
                                     metalearner_algorithm="glm")
    se.train(x, y, train)
    assert se.metalearner().actual_params.get("family") == expected_family, \
        "Expected family {} but got {}".format(expected_family, se.metalearner().actual_params.get("family"))
    if link:
        assert se.metalearner().actual_params.get("link") == expected_link, \
            "Expected link {} but got {}".format(expected_link, se.metalearner().actual_params.get("link"))

    se_auto = H2OStackedEnsembleEstimator(training_frame=train,
                                          validation_frame=test,
                                          base_models=[glm, glm2],
                                          metalearner_algorithm="auto")
    se_auto.train(x, y, train)
    assert se_auto.metalearner().actual_params.get("family") == expected_family, \
        "Expected family {} but got {}".format(expected_family, se_auto.metalearner().actual_params.get("family"))
    if link:
        assert se_auto.metalearner().actual_params.get("link") == expected_link, \
            "Expected link {} but got {}".format(expected_link, se_auto.metalearner().actual_params.get("link"))
    ntrees=200,
    sample_rate=0.7,
    col_sample_rate=0.4)

model_gbm.train(x=training_columns, y=predict_column, training_frame=train)

model_rf = H2ORandomForestEstimator(model_id="model_rf",
                                    nfolds=nfolds,
                                    fold_assignment="Modulo",
                                    keep_cross_validation_predictions=True,
                                    ntrees=200)

model_rf.train(x=training_columns, y=predict_column, training_frame=train)

model_glm = H2OGeneralizedLinearEstimator(
    model_id="model_glm",
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True)

model_glm.train(x=training_columns, y=predict_column, training_frame=train)

models = [model_gbm, model_rf, model_glm]
final_se = H2OStackedEnsembleEstimator(model_id="se_model", base_models=models)

final_se.train(x=training_columns, y=predict_column, training_frame=train)

# The best RMSE was obtained with the GBM
# RMSE: 116191.98025712594

model_gbm.model_performance(test)
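
# Hedged sketch: line up the ensemble against its base models on the test
# frame (reuses the model handles defined above).
for m in models + [final_se]:
    print("{0}: test RMSE = {1:.2f}".format(m.model_id, m.model_performance(test).rmse()))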
Example #9
#deep_water.train(x=pred_columns, y='target', training_frame=df)
deep_learn = H2ODeepLearningEstimator(
    nfolds=nfolds,
    hidden=[10, 10, 10, 10, 10, 10, 10, 10, 10],
    activation="Tanh",
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True)
deep_learn.train(x=pred_columns, y='target', training_frame=df)
lin = H2OGeneralizedLinearEstimator(nfolds=nfolds,
                                    fold_assignment="Modulo",
                                    keep_cross_validation_predictions=True)
lin.train(x=pred_columns, y='target', training_frame=df)

stack = H2OStackedEnsembleEstimator(model_id="my_ensemble",
                                    training_frame=df,
                                    base_models=[
                                        my_gbm.model_id, my_rf.model_id,
                                        deep_learn.model_id, lin.model_id
                                    ])
#stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=df, base_models=[my_gbm.model_id, my_rf.model_id, deep_water.model_id,deep_learn.model_id,lin.model_id])
#stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=df, base_models=[my_gbm.model_id, my_rf.model_id])
stack.train(x=pred_columns, y='target', training_frame=df)
stack.model_performance()

predictions = stack.predict(test)
h2o.download_csv(predictions, "../output/predictions_BIG.h2o")

#gbm_grid = H2OGradientBoostingEstimator(
#        ## more trees is better if the learning rate is small enough
#        ## here, use "more than enough" trees - we have early stopping
#        ntrees=10000,
#        ## smaller learning rate is better
def stackedensemble_gaussian():
    #
    # australia.csv: Gaussian
    #
    australia_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/extdata/australia.csv"),
        destination_frame="australia.hex")
    myX = [
        "premax", "salmax", "minairtemp", "maxairtemp", "maxsst",
        "maxsoilmoist", "Max_czcs"
    ]
    # myXSmaller = ["premax", "salmax","minairtemp", "maxairtemp", "maxsst", "maxsoilmoist"]
    # dependent = "runoffnew"

    my_gbm = H2OGradientBoostingEstimator(
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=5,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        distribution="AUTO")
    my_gbm.train(y="runoffnew", x=myX, training_frame=australia_hex)
    print("GBM performance: ")
    my_gbm.model_performance(australia_hex).show()

    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     max_depth=3,
                                     min_rows=2,
                                     nfolds=5,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True)
    my_rf.train(y="runoffnew", x=myX, training_frame=australia_hex)
    print("RF performance: ")
    my_rf.model_performance(australia_hex).show()

    my_dl = H2ODeepLearningEstimator(nfolds=5,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True)
    my_dl.train(y="runoffnew", x=myX, training_frame=australia_hex)
    print("DL performance: ")
    my_dl.model_performance(australia_hex).show()

    # NOTE: don't specify family
    my_glm = H2OGeneralizedLinearEstimator(
        nfolds=5,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True)
    my_glm.train(y="runoffnew", training_frame=australia_hex)
    # my_glm.train(y = "runoffnew", x = myX, training_frame = australia_hex)
    # my_glm.train(y = "runoffnew", x = myXSmaller, training_frame = australia_hex)  # test parameter error-checking
    print("GLM performance: ")
    my_glm.model_performance(australia_hex).show()

    stacker = H2OStackedEnsembleEstimator(
        selection_strategy="choose_all",
        base_models=[my_gbm.model_id, my_rf.model_id, my_glm.model_id])
    stacker.train(model_id="my_ensemble",
                  x=myX,
                  y="runoffnew",
                  training_frame=australia_hex)
    # test ignore_columns parameter checking
    # stacker.train(model_id="my_ensemble", y="runoffnew", training_frame=australia_hex, ignored_columns=["premax"])
    predictions = stacker.predict(australia_hex)  # training data
    print("Predictions for australia ensemble are in: " + predictions.frame_id)

    #
    # ecology.csv: Gaussian
    #
    ecology_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"),
        destination_frame="ecology_train")
    myX = [
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"
    ]
    #  myXSmaller = ["SegSumT", "SegTSeas", "SegLowFlow"]

    my_gbm = H2OGradientBoostingEstimator(
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=5,
        fold_assignment='Modulo',
        keep_cross_validation_predictions=True,
        distribution="AUTO")
    my_gbm.train(y="Angaus", x=myX, training_frame=ecology_train)
    print("GBM performance: ")
    my_gbm.model_performance(ecology_train).show()

    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     max_depth=3,
                                     min_rows=2,
                                     nfolds=5,
                                     fold_assignment='Modulo',
                                     keep_cross_validation_predictions=True)
    my_rf.train(y="Angaus", x=myX, training_frame=ecology_train)
    print("RF performance: ")
    my_rf.model_performance(ecology_train).show()

    my_dl = H2ODeepLearningEstimator(nfolds=5,
                                     fold_assignment='Modulo',
                                     keep_cross_validation_predictions=True)
    my_dl.train(y="Angaus", x=myX, training_frame=ecology_train)
    print("DL performance: ")
    my_dl.model_performance(ecology_train).show()

    # NOTE: don't specify family
    my_glm = H2OGeneralizedLinearEstimator(
        nfolds=5,
        fold_assignment='Modulo',
        keep_cross_validation_predictions=True)
    my_glm.train(y="Angaus", x=myX, training_frame=ecology_train)
    print("GLM performance: ")
    my_glm.model_performance(ecology_train).show()

    stacker = H2OStackedEnsembleEstimator(
        selection_strategy="choose_all",
        base_models=[my_gbm.model_id, my_rf.model_id, my_glm.model_id])
    print("created H2OStackedEnsembleEstimator: " + str(stacker))
    stacker.train(model_id="my_ensemble",
                  y="Angaus",
                  training_frame=ecology_train)
    print("trained H2OStackedEnsembleEstimator: " + str(stacker))
    print("trained H2OStackedEnsembleEstimator via get_model: " +
          str(h2o.get_model("my_ensemble")))

    predictions = stacker.predict(ecology_train)  # training data
    print("predictions for ensemble are in: " + predictions.frame_id)

    #
    # insurance.csv: Poisson
    #
    insurance_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/insurance.csv"),
        destination_frame="insurance_train")
    insurance_train["offset"] = insurance_train["Holders"].log()

    myX = list(range(3))

    my_gbm = H2OGradientBoostingEstimator(
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=5,
        fold_assignment='Modulo',
        keep_cross_validation_predictions=True,
        distribution='poisson')
    my_gbm.train(y="Claims", x=myX, training_frame=insurance_train)
    print("GBM performance: ")
    my_gbm.model_performance(insurance_train).show()

    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     max_depth=3,
                                     min_rows=2,
                                     nfolds=5,
                                     fold_assignment='Modulo',
                                     keep_cross_validation_predictions=True)
    my_rf.train(y="Claims", x=myX, training_frame=insurance_train)
    print("RF performance: ")
    my_rf.model_performance(insurance_train).show()

    my_dl = H2ODeepLearningEstimator(nfolds=5,
                                     fold_assignment='Modulo',
                                     keep_cross_validation_predictions=True,
                                     distribution='poisson')
    my_dl.train(y="Claims", x=myX, training_frame=insurance_train)
    print("DL performance: ")
    my_dl.model_performance(insurance_train).show()

    # NOTE: don't specify family
    my_glm = H2OGeneralizedLinearEstimator(
        nfolds=5,
        fold_assignment='Modulo',
        keep_cross_validation_predictions=True,
        family='poisson')
    my_glm.train(y="Claims", x=myX, training_frame=insurance_train)
    print("GLM performance: ")
    my_glm.model_performance(insurance_train).show()

    stacker = H2OStackedEnsembleEstimator(
        selection_strategy="choose_all",
        base_models=[my_gbm.model_id, my_rf.model_id, my_glm.model_id])
    print("created H2OStackedEnsembleEstimator: " + str(stacker))
    stacker.train(model_id="my_ensemble",
                  y="Claims",
                  training_frame=insurance_train)
    print("trained H2OStackedEnsembleEstimator: " + str(stacker))

    print("metalearner: ")
    print(h2o.get_model(stacker.metalearner()['name']))

    predictions = stacker.predict(insurance_train)  # training data
    print("preditions for ensemble are in: " + predictions.frame_id)
Example #11

gbm1.train(x=features, y=t, training_frame=train_hf)

gbm2.train(x=features, y=t, training_frame=train_hf)
#
# glm1.train(x = features,
#                   y = t,
#                   training_frame = train_hf)

# glm2.train(x = features,
#                   y = t,
#                   training_frame = train_hf)

all_ids = ['gbm1', 'gbm2']
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble",
                                       base_models=all_ids)

ensemble.train(x=features, y=t, training_frame=train_hf)

print(gbm1.model_performance(val_hf).auc())
print(gbm2.model_performance(val_hf).auc())
# print(glm1.model_performance(val_hf).auc())
# print(glm2.model_performance(val_hf).auc())
print(ensemble.model_performance(val_hf).auc())
print()
time_elapsed = time.time() - since
print('[timer]: complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                                    time_elapsed % 60))
'''AUC values printed by the run above:
0.6477481772734077
0.6477481772734077
0.6477510858544782
'''
from h2o.estimators.gbm import H2OGradientBoostingEstimator
mGBM = H2OGradientBoostingEstimator(nfolds=folds,
                                    fold_assignment="Modulo",
                                    keep_cross_validation_predictions=True)
mGBM.train(X, y, train)
print(mGBM.model_performance(test))

# XGBoost model. This gave the best result, achieving an RMSE below $123000
from h2o.estimators import H2OXGBoostEstimator
xgb = H2OXGBoostEstimator(nfolds=folds,
                          ntrees=60,
                          learn_rate=0.2,
                          fold_assignment="Modulo",
                          keep_cross_validation_predictions=True)
xgb.train(X, y, train)
print(xgb.model_performance(test))


print("##################################### Check Ensemble Model #####################")
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
# Train a stacked ensemble using above models
ensemble = H2OStackedEnsembleEstimator(base_models=[glm, rFm, mGBM, xgb])
ensemble.train(X, y, train)

# Eval ensemble performance on the test data
# Ensemble performance is the worst among all the models
print(ensemble.model_performance(test))


print("##################################### Start Training Deep Learning Model #####################")
# Split the train set into train and validation.
# Running deep learning with cross-validation is extremely slow, so just use a validation set.
# The best parameters were identified by building models with
# epochs of 50, 100, and 200,
# hidden layers of [200,200], [200,200,200], and [50,50],
# activation functions tanh, rectifier, and tanh_with_dropout,
# and rectifier-with-dropout ratios of [0.1,0.1], [0.25,0.25], [0.35,0.35], and [0.5,0.5]
# (a grid-search sketch follows below).
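
# Hedged sketch (grid values are taken from the comments above; the split and
# the variable names introduced here are assumptions): the sweep described
# could be expressed as an H2OGridSearch instead of hand-built models.
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
dl_params = {
    "epochs": [50, 100, 200],
    "hidden": [[200, 200], [200, 200, 200], [50, 50]],
    "activation": ["tanh", "rectifier", "tanh_with_dropout"],
}
dl_train, dl_valid = train.split_frame(ratios=[0.8], seed=1)
dl_grid = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=dl_params)
dl_grid.train(x=X, y=y, training_frame=dl_train, validation_frame=dl_valid)
print(dl_grid.get_grid(sort_by="rmse"))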
Example #14
r_rf.train(predictors, target, training_frame=rh_data)

r_deep = H2ODeepLearningEstimator(hidden=[200, 100, 100, 10, 10, 10],
                                  epochs=1000,
                                  rate=0.001,
                                  nfolds=6,
                                  keep_cross_validation_predictions=True,
                                  seed=1)
r_deep.train(predictors, target, training_frame=rh_data)

r_stack = H2OStackedEnsembleEstimator(metalearner_algorithm="deeplearning",
                                      metalearner_params={
                                          "hidden":
                                          [200, 100, 100, 10, 10, 10],
                                          "epochs": 100,
                                          "nesterov_accelerated_gradient": True
                                      },
                                      model_id="ensemble4",
                                      training_frame=rh_data,
                                      metalearner_nfolds=6,
                                      base_models=[r_gbm, r_rf, r_deep])

r_stack.train(predictors, target, training_frame=rh_data)
print(r_stack)

h2o.save_model(model=r_stack, path="stack_red3", force=True)

# saving trained RF red model
h2o.save_model(model=r_rf, path="rf_red2", force=True)

# printing key red RF model info
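
# Hedged completion (the original snippet is truncated here; the accessor
# choice is an assumption):
print(r_rf)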
Example #15
            b_models = aml.leader.full_parameters['base_models']
            base = [
                b_models['actual_value'][i]['name']
                for i in range(0, len(b_models['actual_value']))
            ]

            # Re-train the base models before building the stacked ensemble.
            print("stacked")
            # Drop GLM first (it errors on re-training); filtering into a new list
            # avoids skipping elements by mutating `base` while iterating over it.
            base = [b for b in base if 'GLM' not in b]
            for b_model in base:
                m = h2o.get_model(b_model)
                m.train(y=-1, training_frame=d)

            ensemble = H2OStackedEnsembleEstimator(base_models=base)
            ensemble.train(y=-1, training_frame=d)
            anytime_model = ensemble

        else:
            aml.leader.train(y=-1, training_frame=d)
            anytime_model = aml.leader

# In[27]:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

m = h2o.get_model('GBM_grid__1_AutoML_20200518_140119_model_4')
print(m)

# In[28]:
                              nfolds=nfolds,
                              fold_assignment="Modulo",
                              keep_cross_validation_predictions=True)

m1.train(x, y, train)
m2.train(x, y, train)
m3.train(x, y, train)
m4.train(x, y, train)

base_models = [m1, m2, m3, m4]

m5 = H2OStackedEnsembleEstimator(model_id="SE",
                                 base_models=base_models,
                                 metalearner_algorithm="deeplearning",
                                 metalearner_params={
                                     'epochs': 20,
                                     'hidden': [200, 200, 200],
                                     'l1': 1e-5,
                                 },
                                 metalearner_nfolds=nfolds,
                                 validation_frame=valid)

m5.train(x, y, train)

for m in [m1, m2, m3, m4, m5]:
    print("%3s RMSE: %.2f" % (m.model_id, m.model_performance(test).rmse()))

best_model = sorted(set(base_models + [m5]),
                    key=lambda x: x.model_performance(test).rmse())[0]

print(best_model.model_performance(test))
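
# Hedged follow-up (path is an assumption): persist the winning model for reuse.
best_path = h2o.save_model(best_model, path=".", force=True)
print("saved best model to: " + best_path)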
Example #17
rf = H2ORandomForestEstimator(nfolds=3, keep_cross_validation_predictions=True)
rf.train(x=x, y=y, training_frame=train, validation_frame=valid)
rf.cross_validation_models()
rf.cross_validation_metrics_summary()
rf.varimp_plot()
rf.varimp()
rf.mse(train=True, valid=True, xval=True)
rf.r2(train=True, valid=True, xval=True)
rf.mae(train=True, valid=True, xval=True)

# NOTE: this part throws an error, take a look!
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

stack = H2OStackedEnsembleEstimator(model_id="my_ensemble",
                                    training_frame=train,
                                    validation_frame=test,
                                    base_models=[gbm.model_id, rf.model_id])
stack.train(x=x, y=y, training_frame=train, validation_frame=valid)
stack.model_performance()
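
# Hedged aside: the stack above was trained with validation_frame=valid, so
# validation metrics are available directly on the model.
print(stack.model_performance(valid=True).rmse())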

y2 = data2['clicks']
X = data2.drop('clicks', axis=1)
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y2,
                                                  test_size=0.25,
                                                  random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.3,
Example #18
def stackedensemble_binomial_test():
    """This test check the following (for binomial classification):
    1) That H2OStackedEnsembleEstimator executes w/o erros on a 2-model 'manually constructed ensemble.
    2) That .predict() works on a stack.
    3) That .model_performance() works on a stack.
    4) That the training and test performance is better on ensemble vs the base learners.
    5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly.
    """

    # Import train and test datasets
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")

    print(train.summary())

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # set number of folds
    nfolds = 5

    # train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # evaluate the performance
    perf_gbm_train = my_gbm.model_performance(train=True)
    perf_gbm_test = my_gbm.model_performance(test_data=test)
    print("GBM training performance: ")
    print(perf_gbm_train)
    print("GBM test performance: ")
    print(perf_gbm_test)

    # train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)

    my_rf.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_rf_train = my_rf.model_performance(train=True)
    perf_rf_test = my_rf.model_performance(test_data=test)
    print("RF training performance: ")
    print(perf_rf_train)
    print("RF test performance: ")
    print(perf_rf_test)

    # Train a stacked ensemble using the GBM and RF above
    stack = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_binomial",
        base_models=[my_gbm.model_id, my_rf.model_id],
        selection_strategy="choose_all")

    stack.train(
        x=x, y=y, training_frame=train,
        validation_frame=test)  # also test that validation_frame is working

    # check that prediction works
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 3, "expected " + str(
        pred.ncol) + " to be equal to 3 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Check that stack perf is better (bigger) than the best(biggest) base learner perf:
    # Training AUC
    baselearner_best_auc_train = max(perf_gbm_train.auc(), perf_rf_train.auc())
    stack_auc_train = perf_stack_train.auc()
    print("Best Base-learner Training AUC:  {0}".format(
        baselearner_best_auc_train))
    print("Ensemble Training AUC:  {0}".format(stack_auc_train))
    assert stack_auc_train > baselearner_best_auc_train, "expected stack_auc_train to be greater than " \
                                                         "baselearner_best_auc_train, but it wasn't"

    # Test AUC
    baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc())
    stack_auc_test = perf_stack_test.auc()
    print("Best Base-learner Test AUC:  {0}".format(baselearner_best_auc_test))
    print("Ensemble Test AUC:  {0}".format(stack_auc_test))
    assert stack_auc_test > baselearner_best_auc_test, "expected stack_auc_test to be greater than " \
        "baselearner_best_auc_test, but it wasn't; baselearner_best_auc_test = " + \
        str(baselearner_best_auc_test) + ", stack_auc_test = " + str(stack_auc_test)

    # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test)
    # since the metrics object is not exactly the same, we can just test that AUC is the same
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_auc_test == perf_stack_validation_frame.auc(), "expected stack_auc_test to equal " \
        "perf_stack_validation_frame.auc(), but they differ: perf_stack_validation_frame.auc() = " + \
        str(perf_stack_validation_frame.auc()) + ", stack_auc_test = " + str(stack_auc_test)
def stackedensemble_gaussian_test():
    """This test checks the following (for gaussian regression):
    1) That H2OStackedEnsembleEstimator executes w/o errors on a 3-model manually constructed ensemble.
    2) That .predict() works on a stack.
    3) That .model_performance() works on a stack.
    4) That the training and test performance is better on ensemble vs the base learners.
    5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly.
    """

    col_types = [
        "numeric", "numeric", "numeric", "enum", "enum", "numeric", "numeric",
        "numeric", "numeric"
    ]
    dat = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/extdata/prostate.csv"),
        destination_frame="prostate_hex",
        col_types=col_types)
    train, test = dat.split_frame(ratios=[.8], seed=1)
    print(train.summary())

    # Identify predictors and response
    x = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    y = "AGE"

    # set number of folds
    nfolds = 5

    # train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="gaussian",
        max_depth=3,
        learn_rate=0.2,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # evaluate the performance
    perf_gbm_train = my_gbm.model_performance(train=True)
    perf_gbm_test = my_gbm.model_performance(test_data=test)
    print("GBM training performance: ")
    print(perf_gbm_train)
    print("GBM test performance: ")
    print(perf_gbm_test)

    # train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=30,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)

    my_rf.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_rf_train = my_rf.model_performance(train=True)
    perf_rf_test = my_rf.model_performance(test_data=test)
    print("RF training performance: ")
    print(perf_rf_train)
    print("RF test performance: ")
    print(perf_rf_test)

    # Train and cross-validate an extremely-randomized RF
    my_xrf = H2ORandomForestEstimator(ntrees=50,
                                      nfolds=nfolds,
                                      histogram_type="Random",
                                      fold_assignment="Modulo",
                                      keep_cross_validation_predictions=True,
                                      seed=1)

    my_xrf.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_xrf_train = my_xrf.model_performance(train=True)
    perf_xrf_test = my_xrf.model_performance(test_data=test)
    print("XRF training performance: ")
    print(perf_xrf_train)
    print("XRF test performance: ")
    print(perf_xrf_test)

    # Train a stacked ensemble using the GBM, RF, and XRF above
    stack = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_gaussian",
        base_models=[my_gbm.model_id, my_rf.model_id, my_xrf.model_id])

    stack.train(
        x=x, y=y, training_frame=train,
        validation_frame=test)  # also test that validation_frame is working

    # Check that prediction works
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 1, "expected " + str(
        pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

    # Does predict() have ugly side effects?
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 1, "expected " + str(
        pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Does performance() have ugly side effects?
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Training RMSE for each base learner
    baselearner_best_rmse_train = min(perf_gbm_train.rmse(),
                                      perf_rf_train.rmse(),
                                      perf_xrf_train.rmse())
    stack_rmse_train = perf_stack_train.rmse()
    print("Best Base-learner Training RMSE:  {0}".format(
        baselearner_best_rmse_train))
    print("Ensemble Training RMSE:  {0}".format(stack_rmse_train))
    #assert stack_rmse_train < baselearner_best_rmse_train, "expected stack_rmse_train would be less than " \
    #                                                     " found it wasn't baselearner_best_rmse_train"

    # Check that stack perf is better (smaller) than the best (smaller) base learner perf:
    # Test RMSE for each base learner
    baselearner_best_rmse_test = min(perf_gbm_test.rmse(), perf_rf_test.rmse(),
                                     perf_xrf_test.rmse())
    stack_rmse_test = perf_stack_test.rmse()
    print(
        "Best Base-learner Test RMSE:  {0}".format(baselearner_best_rmse_test))
    print("Ensemble Test RMSE:  {0}".format(stack_rmse_test))
    assert stack_rmse_test < baselearner_best_rmse_test, "expected stack_rmse_test to be less than " \
        "baselearner_best_rmse_test, but it wasn't; baselearner_best_rmse_test = " + \
        str(baselearner_best_rmse_test) + ", stack_rmse_test = " + str(stack_rmse_test)

    # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test)
    # since the metrics object is not exactly the same, we can just test that RMSE is the same
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_rmse_test == perf_stack_validation_frame.rmse(), "expected stack_rmse_test to equal " \
        "perf_stack_validation_frame.rmse(), but they differ: perf_stack_validation_frame.rmse() = " + \
        str(perf_stack_validation_frame.rmse()) + ", stack_rmse_test = " + str(stack_rmse_test)
def stackedensemble_multinomial_test():
    """This test checks the following (for multinomial classification):
    1) That H2OStackedEnsembleEstimator executes w/o errors on a 6-model manually constructed ensemble.
    2) That .predict() works on a stack.
    3) That .model_performance() works on a stack.
    4) That test performance is better on ensemble vs the base learners.
    5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly.
    """

    df = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    y = "C785"
    x = list(range(784))
    df[y] = df[y].asfactor()
    train = df[0:5000, :]
    test = df[5000:10000, :]
    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 2

    # train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="multinomial",
        nfolds=nfolds,
        ntrees=10,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # evaluate the performance
    perf_gbm_train = my_gbm.model_performance()
    perf_gbm_test = my_gbm.model_performance(test_data=test)
    print("GBM training performance: ")
    print(perf_gbm_train)
    print("GBM test performance: ")
    print(perf_gbm_test)

    # train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_rf_train = my_rf.model_performance()
    perf_rf_test = my_rf.model_performance(test_data=test)
    print("RF training performance: ")
    print(perf_rf_train)
    print("RF test performance: ")
    print(perf_rf_test)

    # Train and cross-validate an XGBoost GBM
    my_xgb = H2OXGBoostEstimator(ntrees=10,
                                 nfolds=nfolds,
                                 fold_assignment="Modulo",
                                 keep_cross_validation_predictions=True,
                                 seed=1)
    my_xgb.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_xgb_train = my_xgb.model_performance()
    perf_xgb_test = my_xgb.model_performance(test_data=test)
    print("XGB training performance: ")
    print(perf_xgb_train)
    print("XGB test performance: ")
    print(perf_xgb_test)

    # Train and cross-validate a Naive Bayes model
    my_nb = H2ONaiveBayesEstimator(nfolds=nfolds,
                                   fold_assignment="Modulo",
                                   keep_cross_validation_predictions=True,
                                   seed=1)
    my_nb.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_nb_train = my_nb.model_performance()
    perf_nb_test = my_nb.model_performance(test_data=test)
    print("NB training performance: ")
    print(perf_nb_train)
    print("NB test performance: ")
    print(perf_nb_test)

    # Train and cross-validate a Deep Learning model
    my_dnn = H2ODeepLearningEstimator(hidden=[10, 10],
                                      nfolds=nfolds,
                                      fold_assignment="Modulo",
                                      keep_cross_validation_predictions=True,
                                      seed=1)
    my_dnn.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_dnn_train = my_dnn.model_performance()
    perf_dnn_test = my_dnn.model_performance(test_data=test)
    print("DNN training performance: ")
    print(perf_dnn_train)
    print("DNN test performance: ")
    print(perf_dnn_test)

    # Train and cross-validate a GLM model
    my_glm = H2OGeneralizedLinearEstimator(
        family="multinomial",
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_glm.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_glm_train = my_glm.model_performance()
    perf_glm_test = my_glm.model_performance(test_data=test)
    print("GLM training performance: ")
    print(perf_glm_train)
    print("GLM test performance: ")
    print(perf_glm_test)

    # Train a stacked ensemble using the six base models above
    stack = H2OStackedEnsembleEstimator(base_models=[
        my_gbm.model_id, my_rf.model_id, my_xgb.model_id, my_nb.model_id,
        my_dnn.model_id, my_glm.model_id
    ])
    stack.train(
        x=x, y=y, training_frame=train,
        validation_frame=test)  # also test that validation_frame is working
    assert isinstance(
        stack, h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator)
    assert stack.type == "classifier"

    # Check that prediction works
    pred = stack.predict(test_data=test)
    print(pred)
    assert pred.nrow == test.nrow, "expected pred.nrow (" + str(
        pred.nrow) + ") to be equal to test.nrow (" + str(test.nrow) + ")"
    assert pred.ncol == 11, "expected pred.ncol to be equal to 11 " \
        "(predict column plus 10 class probabilities) but it was " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    assert isinstance(perf_stack_train,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)
    perf_stack_valid = stack.model_performance(valid=True)
    assert isinstance(perf_stack_valid,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)
    perf_stack_test = stack.model_performance(test_data=test)
    assert isinstance(perf_stack_test,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)

    # Check that stack perf is better (smaller) than the best (smallest) base learner perf:
    # Test Mean Per Class Error for each base learner
    baselearner_best_mean_per_class_error_test = min(perf_gbm_test.mean_per_class_error(), \
                                                     perf_rf_test.mean_per_class_error(), \
                                                     perf_xgb_test.mean_per_class_error(), \
                                                     perf_nb_test.mean_per_class_error(), \
                                                     perf_dnn_test.mean_per_class_error(),
                                                     perf_glm_test.mean_per_class_error())
    stack_mean_per_class_error_test = perf_stack_test.mean_per_class_error()
    print("Best Base-learner Test Mean Per Class Error:  {0}".format(
        baselearner_best_mean_per_class_error_test))
    print("Ensemble Test Mean Per Class Error:  {0}".format(
        stack_mean_per_class_error_test))
    assert stack_mean_per_class_error_test <= baselearner_best_mean_per_class_error_test, \
        "expected stack_mean_per_class_error_test to be less than or equal to " \
        "baselearner_best_mean_per_class_error_test, but it wasn't: " \
        "baselearner_best_mean_per_class_error_test = " + \
        str(baselearner_best_mean_per_class_error_test) + \
        ", stack_mean_per_class_error_test = " + str(stack_mean_per_class_error_test)

    # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test)
    # since the metrics object is not exactly the same, we can just test that the mean per-class error is the same
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_mean_per_class_error_test == perf_stack_validation_frame.mean_per_class_error(), \
        "expected stack_mean_per_class_error_test to be the same as " \
        "perf_stack_validation_frame.mean_per_class_error(), but it wasn't: " \
        "perf_stack_validation_frame.mean_per_class_error() = " + \
        str(perf_stack_validation_frame.mean_per_class_error()) + \
        ", stack_mean_per_class_error_test was " + str(stack_mean_per_class_error_test)
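

# A minimal runner sketch (an assumption based on the usual h2o-3 pyunit
# convention, not shown in the source): each of these tests is normally
# driven through pyunit_utils.standalone_test when executed directly.
if __name__ == "__main__":
    pyunit_utils.standalone_test(stackedensemble_multinomial_test)
else:
    stackedensemble_multinomial_test()
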
def infer_mixed_family_and_dist_helper(family,
                                       expected_family,
                                       first_glm,
                                       expected_link=None,
                                       kwargs_glm=None,
                                       kwargs_gbm=None,
                                       metalearner_params=None):
    kwargs_glm = dict() if kwargs_glm is None else kwargs_glm
    kwargs_gbm = dict() if kwargs_gbm is None else kwargs_gbm
    metalearner_params = dict() if metalearner_params is None else metalearner_params

    distribution = family if not family == "binomial" else "bernoulli"
    expected_distribution = expected_family if not expected_family == "binomial" else "bernoulli"

    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    if family == "multinomial":
        y = "species"
    elif family == "binomial":
        train["response"] = (train["species"] == "Iris-versicolor").asfactor()
        test["response"] = (test["species"] == "Iris-versicolor").asfactor()
        y = "response"
    elif family == "quasibinomial" or family == "fractionalbinomial":
        train["response"] = (train["species"] == "Iris-versicolor") / 2
        test["response"] = (test["species"] == "Iris-versicolor") / 2
        y = "response"
    elif family == "ordinal":
        y = "response"
        train[y] = (train["species"] == "Iris-versicolor")
        test[y] = (test["species"] == "Iris-versicolor")
        train[(train["species"] == "Iris-setosa"), y] = 2
        test[(test["species"] == "Iris-setosa"), y] = 2
        train[y] = train[y].asfactor()
        test[y] = test[y].asfactor()
    else:
        y = "petal_wid"

    x = train.columns
    x.remove(y)

    if "family" not in kwargs_glm:
        kwargs_glm["family"] = family

    if "distribution" not in kwargs_gbm:
        kwargs_gbm["distribution"] = distribution

    nfolds = 2
    glm = H2OGeneralizedLinearEstimator(nfolds=nfolds,
                                        fold_assignment="Modulo",
                                        keep_cross_validation_predictions=True,
                                        **kwargs_glm)
    glm.train(x=x, y=y, training_frame=train)

    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True,
                                       **kwargs_gbm)
    gbm.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[glm, gbm] if first_glm else [gbm, glm],
        metalearner_algorithm="glm",
        metalearner_params={
            k: v
            for k, v in metalearner_params.items() if k != "distribution"
        })
    se.train(x, y, train)
    assert se.metalearner().actual_params.get("family") == expected_family, \
        "Expected family {} but got {}".format(expected_family, se.metalearner().actual_params.get("family"))
    if expected_link:
        assert se.metalearner().actual_params.get("link") == expected_link, \
            "Expected link {} but got {}".format(expected_link, se.metalearner().actual_params.get("link"))

    se_auto = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[glm, gbm] if first_glm else [gbm, glm],
        metalearner_algorithm="auto",
        metalearner_params={
            k: v
            for k, v in metalearner_params.items() if k != "distribution"
        })
    se_auto.train(x, y, train)
    assert se_auto.metalearner().actual_params.get("family") == expected_family, \
        "Expected family {} but got {}".format(expected_family, se_auto.metalearner().actual_params.get("family"))
    if expected_link:
        assert se_auto.metalearner().actual_params.get("link") == expected_link, \
            "Expected link {} but got {}".format(expected_link, se_auto.metalearner().actual_params.get("link"))
    se_gbm = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[glm, gbm] if first_glm else [gbm, glm],
        metalearner_algorithm="gbm",
        metalearner_params={
            k: v
            for k, v in metalearner_params.items()
            if k != "family" and k != "link"
        })
    se_gbm.train(x, y, train)
    assert se_gbm.metalearner().actual_params.get("distribution") == expected_distribution, \
        "Expected distribution {} but got {}".format(expected_distribution,
                                                     se_gbm.metalearner().actual_params.get("distribution"))
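
# A hedged usage sketch for the helper above (the test name and expected
# values are illustrative, not from the source): a binomial response should
# make both the explicit-GLM and AUTO metalearners infer the logit link.
def infer_binomial_family_test():
    infer_mixed_family_and_dist_helper(family="binomial",
                                       expected_family="binomial",
                                       first_glm=True,
                                       expected_link="logit")
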
Example #22

# The start of this snippet is truncated in the source; the constructor head
# below is an inferred reconstruction (the model_id must match the base_models
# list further down, and the CV settings mirror the GradientBoost model's).
RandomForest = H2ORandomForestEstimator(model_id='RandomForest',
                                        nfolds=5,
                                        seed=1111,
                                        binomial_double_trees=True,
                                        keep_cross_validation_predictions=True)
RandomForest.train(x=x, y=y, training_frame=train)
# Eval performance:
RFperf = RandomForest.model_performance()

GradientBoost = H2OGradientBoostingEstimator(model_id = 'GradientBoost',
                                             nfolds=5,
                                             seed=1111,
                                             keep_cross_validation_predictions=True)
GradientBoost.train(x=x, y=y, training_frame=train)
GBperf = GradientBoost.model_performance()


Ensemble = H2OStackedEnsembleEstimator(model_id="Ensemble",
                                       base_models=['DeepLearn', 'RandomForest',
                                                    'GradientBoost'])
Ensemble.train(x=x, y=y, training_frame=train)

Performance = Ensemble.model_performance()


# Score the ensemble on the validation frame and pull predictions client-side
preds = Ensemble.predict(valid).as_data_frame()
yhat = np.array(preds).reshape(-1, 1)
ytrue = np.array(Test['LogReturn']).reshape(-1, 1)
# Back-transform the log returns before computing the squared correlation (R^2)
yy = np.concatenate((np.exp(yhat), np.exp(ytrue)), axis=1)
yy = yy[:99, :]
R2 = np.corrcoef(yy.T)[0, 1] ** 2

h2o.cluster().shutdown()
Example #23
def stackedensemble_nfolds_test():
    """This test checks the following:
    1) That H2OStackedEnsembleEstimator `metalearner_nfolds` works correctly
    2) That H2OStackedEnsembleEstimator `metalearner_fold_assignment` works correctly
    3) That Stacked Ensemble cross-validation metrics are correctly copied from metalearner
    """

    # Import training set
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")
    # Add a fold_column
    fold_column = "fold_id"
    train[fold_column] = train.kfold_column(n_folds=3, seed=1)

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)
    x.remove(fold_column)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Check that not setting nfolds still produces correct results
    stack0 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf])
    stack0.train(x=x, y=y, training_frame=train)
    assert (stack0.params['metalearner_nfolds']['actual'] == 0)
    meta0 = h2o.get_model(stack0.metalearner()['name'])
    assert (meta0.params['nfolds']['actual'] == 0)

    # Train a stacked ensemble & check that metalearner_nfolds works
    # Also test that the xval metrics from metalearner & ensemble are equal
    stack1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                         metalearner_nfolds=3)
    stack1.train(x=x, y=y, training_frame=train)
    # Check that metalearner_nfolds is correctly stored in model output
    assert (stack1.params['metalearner_nfolds']['actual'] == 3)
    # Check that the metalearner was cross-validated with the correct number of folds
    meta1 = h2o.get_model(stack1.metalearner()['name'])
    assert (meta1.params['nfolds']['actual'] == 3)
    # Check that metalearner fold_assignment is NULL/"AUTO"
    assert (meta1.params['fold_assignment']['actual'] == "AUTO")
    # Check that validation metrics are NULL
    assert (stack1.mse(valid=True) is None)
    # Check that xval metrics from metalearner and ensemble are equal (use mse as proxy)
    assert (stack1.mse(xval=True) == meta1.mse(xval=True))

    # Train a new ensemble, also passing a validation frame
    ss = test.split_frame(ratios=[0.5], seed=1)
    stack2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                         metalearner_nfolds=3)
    stack2.train(x=x, y=y, training_frame=train, validation_frame=ss[0])
    # Check that valid & xval metrics from metalearner and ensemble are equal (use mse as proxy)
    meta2 = h2o.get_model(stack2.metalearner()['name'])
    assert (stack2.mse(valid=True) == meta2.mse(valid=True))
    # Check that xval metrics from metalearner and ensemble are equal (use mse as proxy)
    assert (stack2.mse(xval=True) == meta2.mse(xval=True))

    # Check that metalearner_fold_assignment works
    stack3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                         metalearner_nfolds=3,
                                         metalearner_fold_assignment="Modulo")
    stack3.train(x=x, y=y, training_frame=train)
    # Check that metalearner_fold_assignment is correctly stored in model output
    assert (stack3.params['metalearner_fold_assignment']['actual'] == "Modulo")
    # Check that the metalearner was cross-validated with the correct number of folds
    meta3 = h2o.get_model(stack3.metalearner()['name'])
    assert (meta3.params['fold_assignment']['actual'] == "Modulo")

    # Check that metalearner_fold_column works
    stack4 = H2OStackedEnsembleEstimator(
        base_models=[my_gbm, my_rf],
        metalearner_fold_column=fold_column,
        metalearner_params=dict(keep_cross_validation_models=True))
    stack4.train(x=x, y=y, training_frame=train)
    # Check that metalearner_fold_column is correctly stored in model output
    assert (stack4.params['metalearner_fold_column']['actual']['column_name']
            == fold_column)
    # Check that metalearner_fold_column is passed through to metalearner
    meta4 = h2o.get_model(stack4.metalearner()['name'])
    assert (
        meta4.params['fold_column']['actual']['column_name'] == fold_column)
    assert (meta4.params['nfolds']['actual'] == 0)
    assert (len(meta4.cross_validation_models()) == 3)
Example #24
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      nfolds=10,
                                      ntrees=5,
                                      keep_cross_validation_predictions=True,
                                      seed=1)

my_gbm.train(y=-1, training_frame=d)

my_rf = H2ORandomForestEstimator(nfolds=10,
                                 ntrees=5,
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(y=-1, training_frame=d)

# Train a stacked ensemble using the GBM and RF above
ensemble = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf])

ensemble.train(y=-1, training_frame=d)

anytime_model = ensemble

# Prequential evaluation: repeatedly test the current model on the next
# data batch before that batch is folded into training

for i in range(3, 10):

    # Test on the next batch for accuracy
    test = B[i]

    # Convert the batch to a SciPy sparse matrix, then to an H2OFrame
    # (assumes scipy.sparse was imported as `sp` in the truncated preamble)
    spm = sp.csr_matrix(test.values)
    d = h2o.H2OFrame(spm)
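
    # A hedged sketch of how the (truncated) loop body might continue: score
    # the current "anytime" ensemble on the fresh batch before any further
    # training on it; the metric choice here is an assumption.
    # perf = anytime_model.model_performance(test_data=d)
    # print("batch {0} performance: {1}".format(i, perf))
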
def stackedensemble_grid_gaussian():
    """This test check the following (for guassian regression):
    1) That H2OStackedEnsembleEstimator executes w/o erros on a random-grid-based ensemble.
    2) That .predict() works on a stack.
    3) That .model_performance() works on a stack.
    4) That the training and test performance is better on ensemble vs the base learners.
    5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly.
    """

    # Import train and test datasets
    dat = h2o.import_file(
        path=pyunit_utils.locate("smalldata/extdata/australia.csv"),
        destination_frame="australia_hex")
    train, test = dat.split_frame(ratios=[.75], seed=1)

    print(train.summary())

    # Identify predictors and response
    x = [
        "premax", "salmax", "minairtemp", "maxairtemp", "maxsst",
        "maxsoilmoist", "Max_czcs"
    ]
    y = "runoffnew"

    # Set number of folds
    nfolds = 5

    # Specify GBM hyperparameters for the grid
    hyper_params = {
        "learn_rate": [0.01, 0.03],
        "max_depth": [3, 4, 5, 6, 9],
        "sample_rate": [0.7, 0.8, 0.9, 1.0],
        "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    }
    search_criteria = {
        "strategy": "RandomDiscrete",
        "max_models": 3,
        "seed": 1
    }

    # Train the grid
    grid = H2OGridSearch(model=H2OGradientBoostingEstimator(
        ntrees=10,
        seed=1,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True),
                         hyper_params=hyper_params,
                         search_criteria=search_criteria,
                         grid_id="gbm_grid_guassian")

    grid.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM grid
    stack = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_gbm_grid_gaussian", base_models=grid.model_ids)
    stack.train(x=x, y=y, training_frame=train, validation_frame=test)

    # Check that predictions work
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 1, "expected " + str(
        pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Best (smallest) training RMSE across the base learners
    baselearner_best_rmse_train = min(
        [h2o.get_model(model).rmse(train=True) for model in grid.model_ids])
    stack_rmse_train = perf_stack_train.rmse()
    print("Best Base-learner Training RMSE:  {0}".format(
        baselearner_best_rmse_train))
    print("Ensemble Training RMSE:  {0}".format(stack_rmse_train))
    assert stack_rmse_train < baselearner_best_rmse_train, \
        "expected stack_rmse_train to be less than baselearner_best_rmse_train, but it wasn't"

    # Check that stack perf is better (smaller) than the best (smallest) base learner perf:
    # Best (smallest) test RMSE across the base learners
    baselearner_best_rmse_test = min([
        h2o.get_model(model).model_performance(test_data=test).rmse()
        for model in grid.model_ids
    ])
    stack_rmse_test = perf_stack_test.rmse()
    print(
        "Best Base-learner Test RMSE:  {0}".format(baselearner_best_rmse_test))
    print("Ensemble Test RMSE:  {0}".format(stack_rmse_test))
    assert stack_rmse_test < baselearner_best_rmse_test, \
        "expected stack_rmse_test to be less than baselearner_best_rmse_test, " \
        "but it wasn't: baselearner_best_rmse_test = " + \
        str(baselearner_best_rmse_test) + ", stack_rmse_test = " + str(stack_rmse_test)

    # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test)
    # since the metrics object is not exactly the same, we can just test that RMSE is the same
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_rmse_test == perf_stack_validation_frame.rmse(), \
        "expected stack_rmse_test to be the same as " \
        "perf_stack_validation_frame.rmse(), but they were not: " \
        "perf_stack_validation_frame.rmse() = " + \
        str(perf_stack_validation_frame.rmse()) + \
        ", stack_rmse_test was " + str(stack_rmse_test)
Example #26
def stackedensemble_validation_frame_test():
    """This test checks the following:
    1) That passing in a validation_frame to H2OStackedEnsembleEstimator has an effect (validation metrics exist).
    2) It should hopefully produce a better model (in the metalearning step).
    """

    # Import training set
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/higgs/higgs_train_5k.csv"),
                         destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/higgs/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = df.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    df[y] = df[y].asfactor()
    test[y] = test[y].asfactor()

    # Split off a validation_frame
    ss = df.split_frame(seed=1)
    train = ss[0]
    valid = ss[1]

    # Set number of folds
    nfolds = 5

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          fold_assignment="Modulo",
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble & check that validation metrics are missing
    stack1 = H2OStackedEnsembleEstimator(base_models=[my_gbm.model_id, my_rf.model_id])
    stack1.train(x=x, y=y, training_frame=train)
    assert(stack1.model_performance(valid=True) is None)

    # Train a stacked ensemble with a validation_frame & check that validation metrics exist & are correct type
    stack2 = H2OStackedEnsembleEstimator(base_models=[my_gbm.model_id, my_rf.model_id])
    stack2.train(x=x, y=y, training_frame=train, validation_frame=valid)
    assert(type(stack2.model_performance(valid=True)) == h2o.model.metrics_base.H2OBinomialModelMetrics)
    assert(type(stack2.auc(valid=True)) == float)


    # Compare test AUC (ensemble with validation_frame should not be worse)
    perf1 = stack1.model_performance(test_data=test)
    perf2 = stack2.model_performance(test_data=test)
    assert perf2.auc() >= perf1.auc()
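
    # Hedged follow-up sketch (reuses stack2 from above): with a validation
    # frame supplied, the metalearner scores it during metalearning, which
    # its scoring history makes visible. The metalearner()['name'] access
    # pattern follows the convention used elsewhere in these tests.
    meta2 = h2o.get_model(stack2.metalearner()['name'])
    print(meta2.scoring_history())
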
def airline_gbm_random_grid():
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin", "Dest"]

    # create hyperparameter and search criteria lists (ranges are inclusive..exclusive)
    hyper_params_tune = {
        'max_depth':
        list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows':
        [2**x for x in range(0,
                             int(math.log(air_hex.nrow, 2) - 1) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }
    # search_criteria directs how the grid search is run:
    # 1) grid search can stop early if the early-stopping conditions specified
    #    by stopping_metric/stopping_tolerance/stopping_rounds are met
    # 2) grid search will stop if it takes longer than max_runtime_secs
    # 3) grid search will stop once it has collected max_models in its array
    #
    # grid search stops correctly if any of the three conditions is satisfied
    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs':
        600,  # limit the runtime to 10 minutes to hit more stopping conditions
        'max_models': 5,  # build no more than 5 models
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "AUC",
        'stopping_tolerance': 1e-3
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_params_tune,
                             search_criteria=search_criteria_tune)
    starttime = time.time()
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=1234)
    runtime = time.time() - starttime

    # check stopping condition 3), max_models
    correct_stopping_condition = len(
        air_grid.get_grid()) == search_criteria_tune["max_models"]

    # if false, check stopping condition 2), max_runtime_secs
    if not correct_stopping_condition:
        correct_stopping_condition = runtime >= search_criteria_tune[
            "max_runtime_secs"]

    # if false, check stopping condition 1), early stopping has occurred.
    if not correct_stopping_condition:
        for eachModel in air_grid.models:
            metric_list = pyunit_utils.extract_scoring_history_field(
                eachModel, "training_auc")
            if pyunit_utils.evaluate_early_stopping(
                    metric_list, search_criteria_tune["stopping_rounds"],
                    search_criteria_tune["stopping_tolerance"], True):
                correct_stopping_condition = True
                break

    assert correct_stopping_condition, "Grid search did not find a model that fits the search_criteria_tune."
    print(air_grid.get_grid("logloss"))

    stacker = H2OStackedEnsembleEstimator(selection_strategy="choose_all",
                                          base_models=air_grid.model_ids)
    stacker.train(model_id="my_ensemble",
                  y="IsDepDelayed",
                  training_frame=air_hex)
    predictions = stacker.predict(air_hex)  # training data
    print("preditions for ensemble are in: " + predictions.frame_id)
Example #28
def compute_stack_ensemble(self):
    self.ensemble = H2OStackedEnsembleEstimator(
        model_id="ensemble_" + str(random.sample(list(range(100)), 1)[0]),
        base_models=self.all_ids)
    self.ensemble.train(x=self.X, y=self.y, training_frame=self.train)
def stackedensemble_grid_binomial():
    """This test check the following (for binomial classification):
    1) That H2OStackedEnsembleEstimator executes w/o errors on a random-grid-based ensemble.
    2) That .predict() works on a stack.
    3) That .model_performance() works on a stack.
    4) That the training and test performance is better on ensemble vs the base learners.
    5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly.
    """

    # Import train and test datasets
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Encode the response as categorical
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds
    nfolds = 5

    # Specify GBM hyperparameters for the grid
    hyper_params = {"learn_rate": [0.01, 0.03],
                    "max_depth": [3, 4, 5, 6, 9],
                    "sample_rate": [0.7, 0.8, 0.9, 1.0],
                    "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
    search_criteria = {"strategy": "RandomDiscrete", "max_models": 3, "seed": 1}

    # Train the grid
    grid = H2OGridSearch(model=H2OGradientBoostingEstimator(ntrees=10, seed=1,
                                                            nfolds=nfolds, fold_assignment="Modulo",
                                                            keep_cross_validation_predictions=True),
                         hyper_params=hyper_params,
                         search_criteria=search_criteria,
                         grid_id="gbm_grid_binomial")

    grid.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM grid
    stack = H2OStackedEnsembleEstimator(model_id="my_ensemble_gbm_grid_binomial", 
                                        base_models=grid.model_ids)
    stack.train(x=x, y=y, training_frame=train, validation_frame=test)


    # check that prediction works
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 3, "expected " + str(pred.ncol) + " to be equal to 3 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Training AUC for each base learner
    baselearner_best_auc_train = max([h2o.get_model(model).auc(train=True) for model in grid.model_ids])
    stack_auc_train = perf_stack_train.auc()
    print("Best Base-learner Training AUC:  {0}".format(baselearner_best_auc_train))
    print("Ensemble Training AUC:  {0}".format(stack_auc_train))
    # this does not pass, but that's okay for training error
    # assert stack_auc_train > baselearner_best_auc_train, \
    #     "expected stack_auc_train to be greater than baselearner_best_auc_train, but it wasn't"

    # Test AUC
    baselearner_best_auc_test = max([h2o.get_model(model).model_performance(test_data=test).auc() for model in grid.model_ids])
    stack_auc_test = perf_stack_test.auc()
    print("Best Base-learner Test AUC:  {0}".format(baselearner_best_auc_test))
    print("Ensemble Test AUC:  {0}".format(stack_auc_test))
    assert stack_auc_test > baselearner_best_auc_test, \
        "expected stack_auc_test to be greater than baselearner_best_auc_test, " \
        "but it wasn't: baselearner_best_auc_test = " + \
        str(baselearner_best_auc_test) + ", stack_auc_test = " + str(stack_auc_test)

    # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test)
    # since the metrics object is not exactly the same, we can just test that AUC is the same
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_auc_test == perf_stack_validation_frame.auc(), \
        "expected stack_auc_test to be the same as " \
        "perf_stack_validation_frame.auc(), but they were not: " \
        "perf_stack_validation_frame.auc() = " + \
        str(perf_stack_validation_frame.auc()) + \
        ", stack_auc_test was " + str(stack_auc_test)
Example #30
def stackedensemble_binary_test():
    # Import a sample binary outcome train/test set into H2O
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/higgs/higgs_train_10k.csv"))
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"))

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # For binary classification, response should be a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 5

    # 1. Generate a 2-model ensemble (GBM + RF)

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM and DRF above
    ensemble = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_binomial",
        base_models=[my_gbm.model_id, my_rf.model_id])
    ensemble.train(x=x, y=y, training_frame=train)

    # Predict with the ensemble in the Py client
    preds_py = ensemble.predict(test)

    # Load the binary model and predict
    bin_model = h2o.load_model(
        pyunit_utils.locate(
            "smalldata/binarymodels/stackedensemble/ensemble_higgs"))
    preds_bin = bin_model.predict(test)

    # Predictions from the Py model and the binary model should be the same
    pred_diff = preds_bin - preds_py
    assert pred_diff["p0"].max() < 1e-11
    assert pred_diff["p1"].max() < 1e-11
    assert pred_diff["p0"].min() > -1e-11
    assert pred_diff["p1"].min() > -1e-11