fold_assignment="Modulo", keep_cross_validation_predictions=True) my_GLM.train(x, y, train, validation_frame=valid) my_GLM.model_performance(valid) ## Save models h2o.save_model(model=my_GBM, force=True) h2o.save_model(model=my_NN, force=True) h2o.save_model(model=my_RF, force=True) h2o.save_model(model=my_GLM, force=True) ## Stacked ensemble - 21'' to run from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator models = [my_GBM.model_id, my_NN.model_id, my_RF.model_id, my_GLM.model_id] my_SE = H2OStackedEnsembleEstimator(model_id="SE_glm_gbm_rf_nn", base_models=models, metalearner_algorithm='deeplearning', seed=12) my_SE.train(x, y, train, validation_frame=valid) h2o.save_model(model=my_SE, force=True) # Compare models performance all_models = [my_GBM, my_NN, my_RF, my_GLM, my_SE] names = ["GBM", "NN", "RF", "GLM", "SE"] test_perf = list(map(lambda x: x.model_performance(valid), all_models)) pd.Series(map(lambda p: p.rmse(), test_perf), names) # Better performance on the Stacked Ensemble! ## Performance on the test data my_SE.model_performance(test) # Performance reached on the test data : RMSE = 120575 < 123000
def stackedensemble_metalearner_seed_test():
    """Check that the ensemble ``seed`` controls the GBM metalearner.

    Two stacked ensembles built from the same base models with the same
    seed must report identical metalearner training RMSE; two ensembles
    that differ only in seed must report different values.
    """
    # Load the Higgs train/test splits into named H2O frames.
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")

    # Every column except the response is used as a predictor.
    y = "response"
    x = train.columns
    x.remove(y)

    # The response is converted to a factor in both frames.
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Number of cross-validation folds for the base learners.
    nfolds = 3

    # Parameters forwarded to the GBM metalearner.
    gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Base learner 1: cross-validated GBM.
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=nfolds,
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Base learner 2: cross-validated random forest.
    my_rf = H2ORandomForestEstimator(
        ntrees=10,
        nfolds=nfolds,
        keep_cross_validation_predictions=True,
        seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    def _gbm_stack(seed):
        # Untrained ensemble over both base learners with a GBM metalearner.
        return H2OStackedEnsembleEstimator(
            base_models=[my_gbm, my_rf],
            metalearner_algorithm="gbm",
            metalearner_params=gbm_params,
            seed=seed)

    def _metalearner_of(stack):
        # Fetch the metalearner model of a trained ensemble by its name.
        return h2o.get_model(stack.metalearner()['name'])

    # Same seed twice -> metalearner training RMSE must be identical.
    stack_gbm1 = _gbm_stack(55555)
    stack_gbm2 = _gbm_stack(55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = _metalearner_of(stack_gbm1)
    meta_gbm2 = _metalearner_of(stack_gbm2)
    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(train=True), \
        "RMSE should match if same seed"

    # Different seeds -> metalearner training RMSE must differ.
    stack_gbm3 = _gbm_stack(55555)
    stack_gbm4 = _gbm_stack(98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = _metalearner_of(stack_gbm3)
    meta_gbm4 = _metalearner_of(stack_gbm4)
    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(train=True), \
        "RMSE should NOT match if diff seed"
# Take the first model of the GBM grid table and plot its performance on
# each data split.
gbm_final_model = gbm_grid_table.models[0]

train_performance = gbm_final_model.model_performance(train)
train_performance.plot()

valid_performance = gbm_final_model.model_performance(valid)
valid_performance.plot()

test_performance = gbm_final_model.model_performance(test)
test_performance.plot()

import time

# Whole-second timestamp used to make the ensemble's model_id unique.
ts = int(time.time())

gbm_model = gbm_final_model
rf_model = rf_final_model

# Stack the GBM and RF models.
ensemble = H2OStackedEnsembleEstimator(
    base_models=[gbm_model, rf_model],
    model_id="my_ensemble_binomial_" + str(ts))
ensemble.train(x=predictors,
               y=target,
               training_frame=train,
               validation_frame=valid)

# Plot the ensemble's performance on the same three splits.
train_performance = ensemble.model_performance(train)
train_performance.plot()

valid_performance = ensemble.model_performance(valid)
valid_performance.plot()

test_performance = ensemble.model_performance(test)
test_performance.plot()
# Eval ensemble performance on the test data
def basic_inference_works_for_DRF_and_NB_test():
    """Check the metalearner is multinomial for NB/GBM/DRF base models.

    Trains cross-validated Naive Bayes, GBM and DRF models on the iris
    training frame, then builds stacked ensembles over them in three
    different base-model orderings. With an explicit GBM metalearner the
    metalearner's ``distribution`` must be ``"multinomial"``; with no
    metalearner algorithm given, its ``family`` must be ``"multinomial"``.
    """
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_test.csv"))

    # All columns except the response are predictors.
    y_class = "species"
    x_class = train.columns
    x_class.remove(y_class)

    nfolds = 2
    # Cross-validation settings shared by the three base learners.
    cv_settings = dict(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    )

    nb_class = H2ONaiveBayesEstimator(**cv_settings)
    nb_class.train(x=x_class, y=y_class, training_frame=train)

    gbm_class = H2OGradientBoostingEstimator(**cv_settings)
    gbm_class.train(x=x_class, y=y_class, training_frame=train)

    drf_class = H2ORandomForestEstimator(**cv_settings)
    drf_class.train(x=x_class, y=y_class, training_frame=train)

    # The same three base models, rotated so each one comes first once.
    base_model_orders = (
        [nb_class, gbm_class, drf_class],
        [gbm_class, drf_class, nb_class],
        [drf_class, nb_class, gbm_class],
    )

    # Explicit GBM metalearner: inspect its "distribution" parameter.
    for base_models in base_model_orders:
        se_class = H2OStackedEnsembleEstimator(
            training_frame=train,
            validation_frame=test,
            base_models=base_models,
            metalearner_algorithm="gbm")
        se_class.train(x_class, y_class, train)
        distribution = se_class.metalearner().actual_params.get("distribution")
        assert distribution == "multinomial", \
            "Expected distribution {} but got {}".format("multinomial",
                                                         distribution)

    # No metalearner algorithm given: inspect its "family" parameter.
    for base_models in base_model_orders:
        se_class = H2OStackedEnsembleEstimator(
            training_frame=train,
            validation_frame=test,
            base_models=base_models)
        se_class.train(x_class, y_class, train)
        family = se_class.metalearner().actual_params.get("family")
        assert family == "multinomial", \
            "Expected family {} but got {}".format("multinomial", family)
def airline_gbm_random_grid():
    """Random GBM grid on the airlines data, stacked, plus error checks.

    Runs a RandomDiscrete grid search with cross-validation, stacks the
    resulting models, checks three golden predictions, re-runs the grid
    without cross-validation and fetches it back via ``h2o.get_grid``.
    Finally verifies the error messages raised for an unparsable
    ``hyper_params`` dictionary (PUBDEV-5145) and an unparsable
    ``search_criteria`` dictionary.
    """
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["DayofMonth", "DayOfWeek"]

    hyper_parameters = {
        'learn_rate': [0.1, 0.2],
        'max_depth': [2, 3, 4],
        'ntrees': [5, 10, 15]
    }

    search_crit = {
        'strategy': "RandomDiscrete",
        'max_models': 5,
        'seed': 1234,
        'stopping_rounds': 3,
        'stopping_metric': "AUTO",
        'stopping_tolerance': 1e-2
    }

    def _train_cv_grid():
        # Build and train a cross-validated grid from the *current* contents
        # of hyper_parameters / search_crit (both are mutated further down).
        grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
        grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=5678)
        return grid

    def _expect_parse_error(message_fragment):
        # Train a grid that is expected to be rejected and check the error.
        got_exception = False
        exc = None
        try:
            _train_cv_grid()
        except H2OResponseError as e:
            got_exception = True
            exc = e
        assert (type(exc) == H2OResponseError)
        print("Got an H2OResponseError, as expected with 3.x")
        assert (message_fragment in str(exc))
        assert (got_exception)

    air_grid = _train_cv_grid()
    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    stacker = H2OStackedEnsembleEstimator(base_models=air_grid.model_ids)
    print("created H2OStackedEnsembleEstimator")
    stacker.train(model_id="my_ensemble",
                  y="IsDepDelayed",
                  training_frame=air_hex)
    print("trained H2OStackedEnsembleEstimator")
    predictions = stacker.predict(air_hex)  # training data
    print("predictions for ensemble are in: " + predictions.frame_id)

    # Check that the model can be retrieved
    assert stacker.model_id == "my_ensemble"
    modelcopy = h2o.get_model(stacker.model_id)
    assert modelcopy is not None
    assert modelcopy.model_id == "my_ensemble"

    # golden test for ensemble predictions:
    for row, expected in enumerate((0.4327, 0.5214, 0.4666)):
        actual = round(predictions[row, "YES"], 4)
        assert actual == expected, \
            "Expected prediction for row: {0} to be: {1}; got: {2} instead.".format(
                row, expected, actual)

    # Same search again, this time without cross-validation settings.
    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   distribution="bernoulli")
    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    # added this part to check h2o.get_grid is working properly
    fetch_grid = h2o.get_grid(str(air_grid.grid_id))
    assert len(air_grid.get_grid()) == len(fetch_grid.get_grid())

    ############################################################################
    # PUBDEV-5145: make sure we give a good error message for JSON parse
    # failures, like range() under 3.6
    hyper_parameters['max_depth'] = range(2, 4)
    search_crit['max_models'] = 1

    if sys.version_info[0] < 3:
        # no exception
        air_grid = _train_cv_grid()
    else:
        # MalformedJsonException in Java; check for the right error message
        # in Python
        _expect_parse_error(
            "Error: Can't parse the hyper_parameters dictionary")

    hyper_parameters['max_depth'] = 1
    search_crit['max_models'] = [1, 3]  # expecting an int

    # IllegalStateException in Java; check for the right error message in
    # Python
    _expect_parse_error("Error: Can't parse the search_criteria dictionary")
fold_assignment="Modulo") glm_model.train(x=features, y=target, training_frame=train, model_id="glm_model") glm_model.show() # Deep Water Model dw_model = H2ODeepWaterEstimator(epochs=3, network="lenet", ignore_const_cols=False, image_shape=[28, 28], channels=1, standardize=False, seed=1234, nfolds=nfolds, keep_cross_validation_predictions=True, fold_assignment="Modulo") dw_model.train(x=features, y=target, training_frame=train, model_id="dw_model") dw_model.show() # Stacked Ensemble stack_all = H2OStackedEnsembleEstimator( base_models=[gbm_model.model_id, glm_model.model_id, dw_model.model_id]) stack_all.train(x=features, y=target, training_frame=train, validation_frame=valid, model_id="stack_all") stack_all.model_performance()
def infer_family_helper(family,
                        expected_family,
                        link,
                        expected_link,
                        kwargs1=None,
                        kwargs2=None):
    """Check that a stacked ensemble's metalearner gets the expected family/link.

    Trains two cross-validated GLM base models on the iris training frame,
    stacks them once with ``metalearner_algorithm="glm"`` and once with
    ``"auto"``, and asserts on the metalearner's actual ``family`` (and
    ``link``, when ``link`` is truthy).

    :param family: selects how the response column is built; also used as
        the base models' ``family`` unless a kwargs dict already sets one.
    :param expected_family: value the metalearner's ``family`` must equal.
    :param link: ``link`` for the base models unless a kwargs dict already
        sets one; when falsy, no link is set and no link is asserted.
    :param expected_link: value the metalearner's ``link`` must equal.
    :param kwargs1: extra keyword arguments for the first base GLM.
    :param kwargs2: extra keyword arguments for the second base GLM.
    :raises AssertionError: if a metalearner's family or link differs from
        the expected value.
    """
    # Work on copies: the family/link defaults filled in below must not leak
    # into dictionaries owned by the caller.
    kwargs1 = {} if kwargs1 is None else dict(kwargs1)
    kwargs2 = {} if kwargs2 is None else dict(kwargs2)

    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_test.csv"))

    # Build a response column matching the requested family.
    if family == "multinomial":
        y = "species"
    elif family == "binomial":
        train["response"] = (train["species"] == "Iris-versicolor").asfactor()
        test["response"] = (test["species"] == "Iris-versicolor").asfactor()
        y = "response"
    elif family == "quasibinomial" or family == "fractionalbinomial":
        # Comparison result divided by two, so the response is non-integer.
        train["response"] = (train["species"] == "Iris-versicolor") / 2
        test["response"] = (test["species"] == "Iris-versicolor") / 2
        y = "response"
    elif family == "ordinal":
        # Three levels: the versicolor comparison result, with setosa rows
        # overwritten by 2, then converted to a factor.
        y = "response"
        train[y] = (train["species"] == "Iris-versicolor")
        test[y] = (test["species"] == "Iris-versicolor")
        train[(train["species"] == "Iris-setosa"), y] = 2
        test[(test["species"] == "Iris-setosa"), y] = 2
        train[y] = train[y].asfactor()
        test[y] = test[y].asfactor()
    else:
        # Any other family uses the numeric petal width column.
        y = "petal_wid"

    x = train.columns
    x.remove(y)

    # Explicit entries in the kwargs win; otherwise fall back to the
    # function-level family/link.
    for kwargs in (kwargs1, kwargs2):
        if link:
            kwargs.setdefault("link", link)
        kwargs.setdefault("family", family)

    nfolds = 2
    glm = H2OGeneralizedLinearEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        **kwargs1)
    glm.train(x=x, y=y, training_frame=train)

    glm2 = H2OGeneralizedLinearEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        **kwargs2)
    glm2.train(x=x, y=y, training_frame=train)

    def _check_metalearner(ensemble):
        # Assert on the family (and link, if requested) of the metalearner.
        assert ensemble.metalearner().actual_params.get("family") == expected_family, \
            "Expected family {} but got {}".format(
                expected_family,
                ensemble.metalearner().actual_params.get("family"))
        if link:
            assert ensemble.metalearner().actual_params.get("link") == expected_link, \
                "Expected link {} but got {}".format(
                    expected_link,
                    ensemble.metalearner().actual_params.get("link"))

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[glm, glm2],
                                     metalearner_algorithm="glm")
    se.train(x, y, train)
    _check_metalearner(se)

    se_auto = H2OStackedEnsembleEstimator(training_frame=train,
                                          validation_frame=test,
                                          base_models=[glm, glm2],
                                          metalearner_algorithm="auto")
    se_auto.train(x, y, train)
    _check_metalearner(se_auto)
ntrees=200, sample_rate=0.7, col_sample_rate=0.4) model_gbm.train(x=training_columns, y=predict_column, training_frame=train) model_rf = H2ORandomForestEstimator(model_id="model_rf", nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True, ntrees=200) model_rf.train(x=training_columns, y=predict_column, training_frame=train) model_glm = H2OGeneralizedLinearEstimator( model_id="model_glm", nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True) model_glm.train(x=training_columns, y=predict_column, training_frame=train) models = [model_gbm, model_rf, model_glm] final_se = H2OStackedEnsembleEstimator(model_id="se_model", base_models=models) final_se.train(x=training_columns, y=predict_column, training_frame=train) #The best rmse was obtained using gbm #RMSE: 116191.98025712594 model_gbm.model_performance(test)
#deep_water.train(x=pred_columns, y='target', training_frame=df)

# Cross-validated deep learning base model with nine hidden layers of 10 units.
deep_learn = H2ODeepLearningEstimator(
    hidden=[10, 10, 10, 10, 10, 10, 10, 10, 10],
    activation="Tanh",
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True)
deep_learn.train(x=pred_columns, y='target', training_frame=df)

# Cross-validated GLM base model with otherwise default settings.
lin = H2OGeneralizedLinearEstimator(
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True)
lin.train(x=pred_columns, y='target', training_frame=df)

# Stack the four base models by model id.
stack = H2OStackedEnsembleEstimator(
    model_id="my_ensemble",
    training_frame=df,
    base_models=[
        my_gbm.model_id,
        my_rf.model_id,
        deep_learn.model_id,
        lin.model_id,
    ])
#stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=df, base_models=[my_gbm.model_id, my_rf.model_id, deep_water.model_id,deep_learn.model_id,lin.model_id])
#stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=df, base_models=[my_gbm.model_id, my_rf.model_id])
stack.train(x=pred_columns, y='target', training_frame=df)
stack.model_performance()

# Score the test frame and download the predictions as CSV.
predictions = stack.predict(test)
h2o.download_csv(predictions, "../output/predictions_BIG.h2o")

#gbm_grid = H2OGradientBoostingEstimator(
#        ## more trees is better if the learning rate is small enough
#        ## here, use "more than enough" trees - we have early stopping
#        ntrees=10000,
#        ## smaller learning rate is better
def stackedensemble_gaussian():
    """Train and predict with stacked ensembles on three regression datasets.

    For each of australia.csv, ecology_model.csv and insurance.csv this
    trains cross-validated GBM, RF, deep learning and GLM models, prints
    their performance on the training frame, stacks GBM + RF + GLM under
    the model id ``"my_ensemble"`` and predicts on the training frame.
    """
    # Cross-validation settings shared by every base learner below.
    cv = dict(nfolds=5,
              fold_assignment="Modulo",
              keep_cross_validation_predictions=True)
    # Tree settings shared by the GBM and RF learners.
    trees = dict(ntrees=10, max_depth=3, min_rows=2)

    def _report(label, model, frame):
        # Print a heading, then show the model's performance on `frame`.
        print(label + " performance: ")
        model.model_performance(frame).show()

    #
    # australia.csv: Gaussian
    #
    australia_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/extdata/australia.csv"),
        destination_frame="australia.hex")
    predictors = [
        "premax", "salmax", "minairtemp", "maxairtemp", "maxsst",
        "maxsoilmoist", "Max_czcs"
    ]
    # myXSmaller = ["premax", "salmax","minairtemp", "maxairtemp", "maxsst", "maxsoilmoist"]
    # dependent = "runoffnew"

    gbm = H2OGradientBoostingEstimator(learn_rate=0.2,
                                       distribution="AUTO",
                                       **trees,
                                       **cv)
    gbm.train(y="runoffnew", x=predictors, training_frame=australia_hex)
    _report("GBM", gbm, australia_hex)

    rf = H2ORandomForestEstimator(**trees, **cv)
    rf.train(y="runoffnew", x=predictors, training_frame=australia_hex)
    _report("RF", rf, australia_hex)

    dl = H2ODeepLearningEstimator(**cv)
    dl.train(y="runoffnew", x=predictors, training_frame=australia_hex)
    _report("DL", dl, australia_hex)

    # NOTE: don't specify family
    glm = H2OGeneralizedLinearEstimator(**cv)
    # This GLM is trained without an explicit predictor list.
    glm.train(y="runoffnew", training_frame=australia_hex)
    # my_glm.train(y = "runoffnew", x = myX, training_frame = australia_hex)
    # my_glm.train(y = "runoffnew", x = myXSmaller, training_frame = australia_hex)  # test parameter error-checking
    _report("GLM", glm, australia_hex)

    stacker = H2OStackedEnsembleEstimator(
        selection_strategy="choose_all",
        base_models=[gbm.model_id, rf.model_id, glm.model_id])
    stacker.train(model_id="my_ensemble",
                  x=predictors,
                  y="runoffnew",
                  training_frame=australia_hex)
    # test ignore_columns parameter checking
    # stacker.train(model_id="my_ensemble", y="runoffnew", training_frame=australia_hex, ignored_columns=["premax"])

    predictions = stacker.predict(australia_hex)  # training data
    print("Predictions for australia ensemble are in: " +
          predictions.frame_id)

    #
    # ecology.csv: Gaussian
    #
    ecology_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"),
        destination_frame="ecology_train")
    predictors = [
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"
    ]
    # myXSmaller = ["SegSumT", "SegTSeas", "SegLowFlow"]

    gbm = H2OGradientBoostingEstimator(learn_rate=0.2,
                                       distribution="AUTO",
                                       **trees,
                                       **cv)
    gbm.train(y="Angaus", x=predictors, training_frame=ecology_train)
    _report("GBM", gbm, ecology_train)

    rf = H2ORandomForestEstimator(**trees, **cv)
    rf.train(y="Angaus", x=predictors, training_frame=ecology_train)
    _report("RF", rf, ecology_train)

    dl = H2ODeepLearningEstimator(**cv)
    dl.train(y="Angaus", x=predictors, training_frame=ecology_train)
    _report("DL", dl, ecology_train)

    # NOTE: don't specify family
    glm = H2OGeneralizedLinearEstimator(**cv)
    glm.train(y="Angaus", x=predictors, training_frame=ecology_train)
    _report("GLM", glm, ecology_train)

    stacker = H2OStackedEnsembleEstimator(
        selection_strategy="choose_all",
        base_models=[gbm.model_id, rf.model_id, glm.model_id])
    print("created H2OStackedEnsembleEstimator: " + str(stacker))
    stacker.train(model_id="my_ensemble",
                  y="Angaus",
                  training_frame=ecology_train)
    print("trained H2OStackedEnsembleEstimator: " + str(stacker))
    print("trained H2OStackedEnsembleEstimator via get_model: " +
          str(h2o.get_model("my_ensemble")))

    predictions = stacker.predict(ecology_train)  # training data
    print("predictions for ensemble are in: " + predictions.frame_id)

    #
    # insurance.csv: Poisson
    #
    insurance_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/insurance.csv"),
        destination_frame="insurance_train")
    insurance_train["offset"] = insurance_train["Holders"].log()
    # Predictors are given as the column indices 0, 1, 2.
    predictors = list(range(3))

    gbm = H2OGradientBoostingEstimator(learn_rate=0.2,
                                       distribution='poisson',
                                       **trees,
                                       **cv)
    gbm.train(y="Claims", x=predictors, training_frame=insurance_train)
    _report("GBM", gbm, insurance_train)

    rf = H2ORandomForestEstimator(**trees, **cv)
    rf.train(y="Claims", x=predictors, training_frame=insurance_train)
    _report("RF", rf, insurance_train)

    dl = H2ODeepLearningEstimator(distribution='poisson', **cv)
    dl.train(y="Claims", x=predictors, training_frame=insurance_train)
    _report("DL", dl, insurance_train)

    # Unlike the two sections above, this GLM sets the family explicitly.
    glm = H2OGeneralizedLinearEstimator(family='poisson', **cv)
    glm.train(y="Claims", x=predictors, training_frame=insurance_train)
    _report("GLM", glm, insurance_train)

    stacker = H2OStackedEnsembleEstimator(
        selection_strategy="choose_all",
        base_models=[gbm.model_id, rf.model_id, glm.model_id])
    print("created H2OStackedEnsembleEstimator: " + str(stacker))
    stacker.train(model_id="my_ensemble",
                  y="Claims",
                  training_frame=insurance_train)
    print("trained H2OStackedEnsembleEstimator: " + str(stacker))

    print("metalearner: ")
    print(h2o.get_model(stacker.metalearner()['name']))

    predictions = stacker.predict(insurance_train)  # training data
    print("preditions for ensemble are in: " + predictions.frame_id)
def airline_gbm_random_grid():
    """Random GBM grid on the airlines data, stacked and spot-checked.

    Runs a RandomDiscrete grid search with cross-validation, stacks the
    resulting models under the id ``"my_ensemble"``, checks that the model
    can be retrieved and that three golden predictions match, then re-runs
    the grid without cross-validation and fetches it via ``h2o.get_grid``.
    """
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["DayofMonth", "DayOfWeek"]
    response = "IsDepDelayed"

    hyper_parameters = {
        'learn_rate': [0.1, 0.2],
        'max_depth': [2, 3, 4],
        'ntrees': [5, 10, 15]
    }

    search_crit = {
        'strategy': "RandomDiscrete",
        'max_models': 5,
        'seed': 1234,
        'stopping_rounds': 3,
        'stopping_metric': "AUTO",
        'stopping_tolerance': 1e-2
    }

    # First pass: cross-validated grid whose holdout predictions are kept.
    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
    air_grid.train(x=myX,
                   y=response,
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=5678)

    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    # Stack every model produced by the grid.
    stacker = H2OStackedEnsembleEstimator(base_models=air_grid.model_ids)
    print("created H2OStackedEnsembleEstimator")
    stacker.train(model_id="my_ensemble",
                  y=response,
                  training_frame=air_hex)
    print("trained H2OStackedEnsembleEstimator")
    predictions = stacker.predict(air_hex)  # training data
    print("predictions for ensemble are in: " + predictions.frame_id)

    # Check that the model can be retrieved
    assert stacker.model_id == "my_ensemble"
    modelcopy = h2o.get_model(stacker.model_id)
    assert modelcopy is not None
    assert modelcopy.model_id == "my_ensemble"

    # golden test for ensemble predictions:
    golden = [(0, 0.4327), (1, 0.5214), (2, 0.4666)]
    for row, expected in golden:
        observed = round(predictions[row, "YES"], 4)
        assert observed == expected, \
            "Expected prediction for row: {0} to be: {1}; got: {2} instead.".format(
                row, expected, observed)

    # Second pass: same search without cross-validation settings.
    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
    air_grid.train(x=myX,
                   y=response,
                   training_frame=air_hex,
                   distribution="bernoulli")
    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    # added this part to check h2o.get_grid is working properly
    fetch_grid = h2o.get_grid(str(air_grid.grid_id))
    assert len(air_grid.get_grid()) == len(fetch_grid.get_grid())
# ) gbm1.train(x=features, y=t, training_frame=train_hf) gbm2.train(x=features, y=t, training_frame=train_hf) # # glm1.train(x = features, # y = t, # training_frame = train_hf) # glm2.train(x = features, # y = t, # training_frame = train_hf) all_ids = ['gbm1', 'gbm2'] ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble", base_models=all_ids) ensemble.train(x=features, y=t, training_frame=train_hf) print(gbm1.model_performance(val_hf).auc()) print(gbm2.model_performance(val_hf).auc()) # print(glm1.model_performance(val_hf).auc()) # print(glm2.model_performance(val_hf).auc()) print(ensemble.model_performance(val_hf).auc()) print() time_elapsed = time.time() - since print('[timer]: complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) '''0.6477481772734077 0.6477481772734077 0.6477510858544782
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Cross-validated GBM with default hyper-parameters.
mGBM = H2OGradientBoostingEstimator(
    nfolds=folds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True)
mGBM.train(X, y, train)
print(mGBM.model_performance(test))

#XGBoost model, This showed the most optimal result and also achieves below $123000 RMSE
from h2o.estimators import H2OXGBoostEstimator

xgb = H2OXGBoostEstimator(
    ntrees=60,
    learn_rate=0.2,
    nfolds=folds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True)
xgb.train(X, y, train)
print(xgb.model_performance(test))

print("##################################### Check Ensemble Model #####################")
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# Train a stacked ensemble using above models
ensemble = H2OStackedEnsembleEstimator(base_models=[glm, rFm, mGBM, xgb])
ensemble.train(X, y, train)

# Eval ensemble performance on the test data
#performance is worst among all
print(ensemble.model_performance(test))

print("##################################### Start Training Deep Learning Model #####################")
#split train set in train and validation
#Running Deep learning using cross validation is extremely slow, so just use validation set
#Also, best parameters were identified by creating models with
#epoch ranging from 50, 100, 200
# Hidden layers as [200.200] and [200,200,200] and [50,50]
#Activation functions as tanh, rectifier and tanh_with_dropout
# Rectifier with dropout using [0.1,0.1] and [0.25, 0.25] and [0.35, 0.35] and [0.5,0.5]
r_rf.train(predictors, target, training_frame=rh_data)

# Deep learning base model: six hidden layers, 6-fold cross-validation with
# the holdout predictions kept.
r_deep = H2ODeepLearningEstimator(
    hidden=[200, 100, 100, 10, 10, 10],
    epochs=1000,
    rate=0.001,
    nfolds=6,
    keep_cross_validation_predictions=True,
    seed=1)
r_deep.train(predictors, target, training_frame=rh_data)

# Stack GBM, RF and deep learning under a deep-learning metalearner.
r_stack = H2OStackedEnsembleEstimator(
    model_id="ensemble4",
    training_frame=rh_data,
    base_models=[r_gbm, r_rf, r_deep],
    metalearner_algorithm="deeplearning",
    metalearner_nfolds=6,
    metalearner_params={
        "hidden": [200, 100, 100, 10, 10, 10],
        "epochs": 100,
        "nesterov_accelerated_gradient": True
    })
r_stack.train(predictors, target, training_frame=rh_data)
print(r_stack)

# Save the trained stacked ensemble.
h2o.save_model(model=r_stack, path="stack_red3", force=True)

# saving trained RF red model
h2o.save_model(model=r_rf, path="rf_red2", force=True)

# printing key red RF model info
b_models = aml.leader.full_parameters['base_models'] base = [ b_models['actual_value'][i]['name'] for i in range(0, len(b_models['actual_value'])) ] #Here we retrain base models before calling stack ensemble print("stacked") for b_model in base: if 'GLM' in b_model: #GLM is giving error with re-training base.remove(b_model) else: m = h2o.get_model(b_model) m.train(y=-1, training_frame=d) ensemble = H2OStackedEnsembleEstimator(base_models=base) ensemble.train(y=-1, training_frame=d) anytime_model = ensemble else: aml.leader.train(y=-1, training_frame=d) anytime_model = aml.leader # In[27]: from h2o.estimators.gbm import H2OGradientBoostingEstimator m = h2o.get_model('GBM_grid__1_AutoML_20200518_140119_model_4') print(m) # In[28]:
nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True) m1.train(x, y, train) m2.train(x, y, train) m3.train(x, y, train) m4.train(x, y, train) base_models = [m1, m2, m3, m4] m5 = H2OStackedEnsembleEstimator(model_id="SE", base_models=base_models, metalearner_algorithm="deeplearning", metalearner_params={ 'epochs': 20, 'hidden': [200, 200, 200], 'l1': 1e-5, }, metalearner_nfolds=nfolds, validation_frame=valid) m5.train(x, y, train) for m in [m1, m2, m3, m4, m5]: print("%3s RMSE: %.2f" % (m.model_id, m.model_performance(test).rmse())) best_model = sorted(set(base_models + [m5]), key=lambda x: x.model_performance(test).rmse())[0] print(best_model.model_performance(test))
rf = H2ORandomForestEstimator(nfolds=3, keep_cross_validation_predictions=True) rf.train(x=x, y=y, training_frame=train, validation_frame=valid) rf.cross_validation_models() rf.cross_validation_metrics_summary() rf.varimp_plot() rf.varimp() rf.mse(train=True, valid=True, xval=True) rf.r2(train=True, valid=True, xval=True) rf.mae(train=True, valid=True, xval=True) #♦burası hata alıyor bir bak!!!!!! from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=train, validation_frame=test, base_models=[gbm.model_id, rf.model_id]) stack.train(x=x, y=y, training_frame=train, validation_frame=valid) stack.model_performance() y2 = data2['clicks'] X = data2.drop('clicks', axis=1) from sklearn.model_selection import train_test_split X_train, X_val, y_train, y_val = train_test_split(X, y2, test_size=0.25, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3,
def stackedensemble_binomial_test():
    """Check a manually constructed two-model stacked ensemble on a binomial task.

    The test verifies that:
    1) H2OStackedEnsembleEstimator trains without errors on a GBM + RF ensemble.
    2) .predict() works on the stack.
    3) .model_performance() works on the stack.
    4) Training and test AUC of the ensemble beat the best base learner.
    5) Passing a validation_frame to train() yields the same AUC as scoring
       that same frame through model_performance(test_data=...).
    """
    # Import train and test datasets
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")
    print(train.summary())

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor so the models treat it as classification
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds
    nfolds = 5

    # Train and cross-validate a GBM (CV predictions are kept for stacking)
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Evaluate the GBM performance
    perf_gbm_train = my_gbm.model_performance(train=True)
    perf_gbm_test = my_gbm.model_performance(test_data=test)
    print("GBM training performance: ")
    print(perf_gbm_train)
    print("GBM test performance: ")
    print(perf_gbm_test)

    # Train and cross-validate a RF with the same fold layout as the GBM
    my_rf = H2ORandomForestEstimator(
        ntrees=50,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Evaluate the RF performance
    perf_rf_train = my_rf.model_performance(train=True)
    perf_rf_test = my_rf.model_performance(test_data=test)
    print("RF training performance: ")
    print(perf_rf_train)
    print("RF test performance: ")
    print(perf_rf_test)

    # Train a stacked ensemble using the GBM and RF above
    # NOTE(review): `selection_strategy` appears to be rejected by newer h2o
    # releases -- confirm against the h2o version this test targets.
    stack = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_binomial",
        base_models=[my_gbm.model_id, my_rf.model_id],
        selection_strategy="choose_all")
    # Passing `test` as validation_frame also exercises the validation path
    stack.train(x=x, y=y, training_frame=train, validation_frame=test)

    # Check that prediction works
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 3, \
        "expected pred.ncol to be equal to 3 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Check that stack perf is better (bigger) than the best (biggest) base learner perf:
    # Training AUC
    baselearner_best_auc_train = max(perf_gbm_train.auc(), perf_rf_train.auc())
    stack_auc_train = perf_stack_train.auc()
    print("Best Base-learner Training AUC:  {0}".format(
        baselearner_best_auc_train))
    print("Ensemble Training AUC:  {0}".format(stack_auc_train))
    assert stack_auc_train > baselearner_best_auc_train, \
        ("expected stack_auc_train would be greater than "
         "baselearner_best_auc_train, found it wasn't: "
         "baselearner_best_auc_train = {0}, stack_auc_train = {1}").format(
            baselearner_best_auc_train, stack_auc_train)

    # Test AUC
    baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc())
    stack_auc_test = perf_stack_test.auc()
    print("Best Base-learner Test AUC:  {0}".format(baselearner_best_auc_test))
    print("Ensemble Test AUC:  {0}".format(stack_auc_test))
    assert stack_auc_test > baselearner_best_auc_test, \
        ("expected stack_auc_test would be greater than "
         "baselearner_best_auc_test, found it wasn't: "
         "baselearner_best_auc_test = {0}, stack_auc_test = {1}").format(
            baselearner_best_auc_test, stack_auc_test)

    # Check that passing `test` as a validation_frame produces the same metric
    # as stack.model_performance(test). The metrics objects are not identical,
    # so only the AUC values are compared.
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_auc_test == perf_stack_validation_frame.auc(), \
        ("expected stack_auc_test to be the same as "
         "perf_stack_validation_frame.auc(), found they were not: "
         "perf_stack_validation_frame.auc() = {0}, stack_auc_test = {1}").format(
            perf_stack_validation_frame.auc(), stack_auc_test)
def stackedensemble_guassian_test():
    """Exercise a hand-built three-model stacked ensemble on a gaussian regression task.

    Checked behaviour:
    1) H2OStackedEnsembleEstimator trains without errors on GBM + RF + XRF.
    2) .predict() works on the stack, including when called twice.
    3) .model_performance() works on the stack, including when called twice.
    4) The ensemble's test RMSE is lower than that of every base learner
       (the training RMSE comparison is printed but not asserted).
    5) The validation_frame passed to train() yields the same RMSE as scoring
       that frame via model_performance(test_data=...).
    """
    column_types = ["numeric"] * 3 + ["enum"] * 2 + ["numeric"] * 4
    prostate = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/extdata/prostate.csv"),
        destination_frame="prostate_hex",
        col_types=column_types)
    train, test = prostate.split_frame(ratios=[.8], seed=1)
    print(train.summary())

    # Predictors and response
    predictors = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    response = "AGE"

    # Cross-validation settings shared by every base learner (5 folds)
    cv_options = dict(
        nfolds=5,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)

    # Base learner 1: GBM
    gbm_model = H2OGradientBoostingEstimator(
        distribution="gaussian", max_depth=3, learn_rate=0.2, **cv_options)
    gbm_model.train(x=predictors, y=response, training_frame=train)
    gbm_train_perf = gbm_model.model_performance(train=True)
    gbm_test_perf = gbm_model.model_performance(test_data=test)
    for heading, metrics in (("GBM training performance: ", gbm_train_perf),
                             ("GBM test performance: ", gbm_test_perf)):
        print(heading)
        print(metrics)

    # Base learner 2: random forest
    rf_model = H2ORandomForestEstimator(ntrees=30, **cv_options)
    rf_model.train(x=predictors, y=response, training_frame=train)
    rf_train_perf = rf_model.model_performance(train=True)
    rf_test_perf = rf_model.model_performance(test_data=test)
    for heading, metrics in (("RF training performance: ", rf_train_perf),
                             ("RF test performance: ", rf_test_perf)):
        print(heading)
        print(metrics)

    # Base learner 3: extremely-randomized random forest
    xrf_model = H2ORandomForestEstimator(
        ntrees=50, histogram_type="Random", **cv_options)
    xrf_model.train(x=predictors, y=response, training_frame=train)
    xrf_train_perf = xrf_model.model_performance(train=True)
    xrf_test_perf = xrf_model.model_performance(test_data=test)
    for heading, metrics in (("XRF training performance: ", xrf_train_perf),
                             ("XRF test performance: ", xrf_test_perf)):
        print(heading)
        print(metrics)

    # Stack the three base learners; `test` doubles as the validation frame
    stack = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_guassian",
        base_models=[gbm_model.model_id, rf_model.model_id, xrf_model.model_id])
    stack.train(x=predictors, y=response, training_frame=train,
                validation_frame=test)

    # Predict twice: the second pass checks predict() has no ugly side effects
    for _ in range(2):
        pred = stack.predict(test_data=test)
        assert pred.nrow == test.nrow, "expected " + str(
            pred.nrow) + " to be equal to " + str(test.nrow)
        assert pred.ncol == 1, "expected " + str(
            pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

    # Score twice as well: the first pass is discarded, the second is used,
    # which checks model_performance() has no ugly side effects
    stack.model_performance()
    stack.model_performance(test_data=test)
    stack_train_perf = stack.model_performance()
    stack_test_perf = stack.model_performance(test_data=test)

    # Training RMSE: reported only (the comparison is deliberately not asserted)
    best_base_rmse_train = min(
        perf.rmse() for perf in (gbm_train_perf, rf_train_perf, xrf_train_perf))
    ensemble_rmse_train = stack_train_perf.rmse()
    print("Best Base-learner Training RMSE:  {0}".format(best_base_rmse_train))
    print("Ensemble Training RMSE:  {0}".format(ensemble_rmse_train))

    # Test RMSE: the ensemble must be better (smaller) than the best base learner
    best_base_rmse_test = min(
        perf.rmse() for perf in (gbm_test_perf, rf_test_perf, xrf_test_perf))
    ensemble_rmse_test = stack_test_perf.rmse()
    print("Best Base-learner Test RMSE:  {0}".format(best_base_rmse_test))
    print("Ensemble Test RMSE:  {0}".format(ensemble_rmse_test))
    assert ensemble_rmse_test < best_base_rmse_test, \
        "expected stack_rmse_test would be less than " \
        " baselearner_best_rmse_test, found it wasn't  " \
        "baselearner_best_rmse_test = " + str(best_base_rmse_test) + \
        ",stack_rmse_test " \
        " = " + str(ensemble_rmse_test)

    # Scoring `test` as the validation frame must give the same RMSE as
    # scoring it through model_performance(test_data=test)
    validation_perf = stack.model_performance(valid=True)
    assert ensemble_rmse_test == validation_perf.rmse(), \
        "expected stack_rmse_test to be the same as " \
        "perf_stack_validation_frame.rmse() found they were not " \
        "perf_stack_validation_frame.rmse() = " + \
        str(validation_perf.rmse()) + \
        "stack_rmse_test was " + str(ensemble_rmse_test)
def stackedensemble_multinomial_test():
    """Check a manually constructed six-model stacked ensemble on a multinomial task.

    The test verifies that:
    1) H2OStackedEnsembleEstimator trains without errors on a 6-model
       ensemble (GBM, RF, XGBoost, Naive Bayes, Deep Learning, GLM).
    2) .predict() works on the stack.
    3) .model_performance() works on the stack and returns multinomial metrics.
    4) Test mean-per-class error of the ensemble is no worse than that of the
       best base learner.
    5) Passing a validation_frame to train() yields the same metric as scoring
       that same frame through model_performance(test_data=...).
    """
    df = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    y = "C785"
    x = list(range(784))
    df[y] = df[y].asfactor()
    train = df[0:5000, :]
    test = df[5000:10000, :]

    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 2

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="multinomial",
        nfolds=nfolds,
        ntrees=10,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Evaluate the GBM performance
    perf_gbm_train = my_gbm.model_performance()
    perf_gbm_test = my_gbm.model_performance(test_data=test)
    print("GBM training performance: ")
    print(perf_gbm_train)
    print("GBM test performance: ")
    print(perf_gbm_test)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Evaluate the RF performance
    perf_rf_train = my_rf.model_performance()
    perf_rf_test = my_rf.model_performance(test_data=test)
    print("RF training performance: ")
    print(perf_rf_train)
    print("RF test performance: ")
    print(perf_rf_test)

    # Train and cross-validate an XGBoost GBM
    my_xgb = H2OXGBoostEstimator(
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_xgb.train(x=x, y=y, training_frame=train)

    # Evaluate the XGBoost performance
    perf_xgb_train = my_xgb.model_performance()
    perf_xgb_test = my_xgb.model_performance(test_data=test)
    print("XGB training performance: ")
    print(perf_xgb_train)
    print("XGB test performance: ")
    print(perf_xgb_test)

    # Train and cross-validate a Naive Bayes model
    my_nb = H2ONaiveBayesEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_nb.train(x=x, y=y, training_frame=train)

    # Evaluate the Naive Bayes performance
    perf_nb_train = my_nb.model_performance()
    perf_nb_test = my_nb.model_performance(test_data=test)
    print("NB training performance: ")
    print(perf_nb_train)
    print("NB test performance: ")
    print(perf_nb_test)

    # Train and cross-validate a Deep Learning model
    my_dnn = H2ODeepLearningEstimator(
        hidden=[10, 10],
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_dnn.train(x=x, y=y, training_frame=train)

    # Evaluate the Deep Learning performance
    perf_dnn_train = my_dnn.model_performance()
    perf_dnn_test = my_dnn.model_performance(test_data=test)
    print("DNN training performance: ")
    print(perf_dnn_train)
    print("DNN test performance: ")
    print(perf_dnn_test)

    # Train and cross-validate a GLM model
    my_glm = H2OGeneralizedLinearEstimator(
        family="multinomial",
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_glm.train(x=x, y=y, training_frame=train)

    # Evaluate the GLM performance
    perf_glm_train = my_glm.model_performance()
    perf_glm_test = my_glm.model_performance(test_data=test)
    print("GLM training performance: ")
    print(perf_glm_train)
    print("GLM test performance: ")
    print(perf_glm_test)

    # Train a stacked ensemble using all six base learners above
    stack = H2OStackedEnsembleEstimator(base_models=[
        my_gbm.model_id, my_rf.model_id, my_xgb.model_id, my_nb.model_id,
        my_dnn.model_id, my_glm.model_id
    ])
    # Passing `test` as validation_frame also exercises the validation path
    stack.train(x=x, y=y, training_frame=train, validation_frame=test)
    assert isinstance(
        stack, h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator)
    assert stack.type == "classifier"

    # Check that prediction works
    pred = stack.predict(test_data=test)
    print(pred)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    # 11 columns are expected; the message previously claimed 1
    assert pred.ncol == 11, \
        "expected pred.ncol to be equal to 11 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    assert isinstance(perf_stack_train,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)
    perf_stack_valid = stack.model_performance(valid=True)
    assert isinstance(perf_stack_valid,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)
    perf_stack_test = stack.model_performance(test_data=test)
    assert isinstance(perf_stack_test,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)

    # Check that stack perf is better (smaller) than the best (smallest) base learner perf:
    # Test mean-per-class error for each base learner
    baselearner_best_mean_per_class_error_test = min(
        perf_gbm_test.mean_per_class_error(),
        perf_rf_test.mean_per_class_error(),
        perf_xgb_test.mean_per_class_error(),
        perf_nb_test.mean_per_class_error(),
        perf_dnn_test.mean_per_class_error(),
        perf_glm_test.mean_per_class_error())
    stack_mean_per_class_error_test = perf_stack_test.mean_per_class_error()
    print("Best Base-learner Test Mean Per Class Error:  {0}".format(
        baselearner_best_mean_per_class_error_test))
    print("Ensemble Test Mean Per Class Error:  {0}".format(
        stack_mean_per_class_error_test))
    # The message used to start with a stray unary `+` on a string, which
    # raised TypeError (not AssertionError) whenever this check failed.
    assert stack_mean_per_class_error_test <= baselearner_best_mean_per_class_error_test, \
        ("expected stack_mean_per_class_error_test would be no greater than "
         "baselearner_best_mean_per_class_error_test, found it wasn't: "
         "baselearner_best_mean_per_class_error_test = {0}, "
         "stack_mean_per_class_error_test = {1}").format(
            baselearner_best_mean_per_class_error_test,
            stack_mean_per_class_error_test)

    # Check that passing `test` as a validation_frame produces the same metric
    # as stack.model_performance(test). The metrics objects are not identical,
    # so only the mean-per-class error values are compared.
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_mean_per_class_error_test == perf_stack_validation_frame.mean_per_class_error(), \
        ("expected stack_mean_per_class_error_test to be the same as "
         "perf_stack_validation_frame.mean_per_class_error(), found it wasn't: "
         "perf_stack_validation_frame.mean_per_class_error() = {0}, "
         "stack_mean_per_class_error_test = {1}").format(
            perf_stack_validation_frame.mean_per_class_error(),
            stack_mean_per_class_error_test)
def infer_mixed_family_and_dist_helper(family,
                                       expected_family,
                                       first_glm,
                                       expected_link=None,
                                       kwargs_glm=None,
                                       kwargs_gbm=None,
                                       metalearner_params=None):
    """Train a GLM + GBM pair on iris and check what the metalearner infers.

    Three stacked ensembles are built on the same base models, with a "glm",
    an "auto" and a "gbm" metalearner. For the first two the metalearner's
    actual `family` (and `link`, when `expected_link` is truthy) is asserted;
    for the third its actual `distribution` is asserted.

    :param family: family given to the GLM base model unless `kwargs_glm`
        already has a "family" key; it also selects how the response column
        is prepared. The GBM gets the same value as its distribution, with
        "binomial" mapped to "bernoulli", unless `kwargs_gbm` already has a
        "distribution" key.
    :param expected_family: family expected on the glm/auto metalearners;
        mapped the same way to get the distribution expected on the gbm one.
    :param first_glm: when truthy the base models are ordered [glm, gbm],
        otherwise [gbm, glm].
    :param expected_link: link expected on the glm/auto metalearners; the
        check is skipped when falsy.
    :param kwargs_glm: extra constructor arguments for the GLM base model.
        A caller-supplied dict is updated in place with a default "family".
    :param kwargs_gbm: extra constructor arguments for the GBM base model.
        A caller-supplied dict is updated in place with a default "distribution".
    :param metalearner_params: metalearner parameters; "distribution" is
        dropped for the glm/auto metalearners, "family" and "link" for gbm.
    """
    if kwargs_glm is None:
        kwargs_glm = dict()
    if kwargs_gbm is None:
        kwargs_gbm = dict()
    if metalearner_params is None:
        metalearner_params = dict()

    # GBM names the binomial family "bernoulli"; other names pass through
    distribution = "bernoulli" if family == "binomial" else family
    expected_distribution = "bernoulli" if expected_family == "binomial" else expected_family

    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    frames = (train, test)

    # Prepare a response column that fits the requested family
    if family == "multinomial":
        y = "species"
    elif family == "binomial":
        y = "response"
        for frame in frames:
            frame["response"] = (frame["species"] == "Iris-versicolor").asfactor()
    elif family in ("quasibinomial", "fractionalbinomial"):
        y = "response"
        for frame in frames:
            frame["response"] = (frame["species"] == "Iris-versicolor") / 2
    elif family == "ordinal":
        y = "response"
        for frame in frames:
            frame[y] = (frame["species"] == "Iris-versicolor")
            frame[(frame["species"] == "Iris-setosa"), y] = 2
            frame[y] = frame[y].asfactor()
    else:
        y = "petal_wid"

    x = train.columns
    x.remove(y)

    # Explicit caller settings win over the values derived from `family`
    kwargs_glm.setdefault("family", family)
    kwargs_gbm.setdefault("distribution", distribution)

    cv_options = dict(nfolds=2,
                      fold_assignment="Modulo",
                      keep_cross_validation_predictions=True)
    glm = H2OGeneralizedLinearEstimator(**dict(cv_options, **kwargs_glm))
    glm.train(x=x, y=y, training_frame=train)
    gbm = H2OGradientBoostingEstimator(**dict(cv_options, **kwargs_gbm))
    gbm.train(x=x, y=y, training_frame=train)

    def _base_models():
        # Fresh list per ensemble, ordered as requested by `first_glm`
        return [glm, gbm] if first_glm else [gbm, glm]

    def _params_without(*dropped):
        # Metalearner params minus keys the chosen algorithm should not receive
        return {
            key: value
            for key, value in metalearner_params.items()
            if key not in dropped
        }

    def _check_metalearner(ensemble, param, expected, template):
        # Compare one actual metalearner parameter with its expected value
        actual = ensemble.metalearner().actual_params.get(param)
        assert actual == expected, template.format(expected, actual)

    # GLM and AUTO metalearners: the family (and optionally link) is checked
    for algorithm in ("glm", "auto"):
        ensemble = H2OStackedEnsembleEstimator(
            training_frame=train,
            validation_frame=test,
            base_models=_base_models(),
            metalearner_algorithm=algorithm,
            metalearner_params=_params_without("distribution"))
        ensemble.train(x, y, train)
        _check_metalearner(ensemble, "family", expected_family,
                           "Expected family {} but got {}")
        if expected_link:
            _check_metalearner(ensemble, "link", expected_link,
                               "Expected link {} but got {}")

    # GBM metalearner: the distribution is checked instead
    se_gbm = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=_base_models(),
        metalearner_algorithm="gbm",
        metalearner_params=_params_without("family", "link"))
    se_gbm.train(x, y, train)
    _check_metalearner(se_gbm, "distribution", expected_distribution,
                       "Expected distribution {} but got {}")
binomial_double_trees=True, keep_cross_validation_predictions=True) RandomForest.train(x=x, y=y, training_frame=train) # Eval performance: RFperf = RandomForest.model_performance() GradientBoost = H2OGradientBoostingEstimator(model_id = 'GradientBoost', nfolds=5, seed=1111, keep_cross_validation_predictions=True) GradientBoost.train(x=x, y=y, training_frame=train) GBperf = GradientBoost.model_performance() Ensemble = H2OStackedEnsembleEstimator(model_id="Ensemble", base_models=['DeepLearn', 'RandomForest', 'GradientBoost']) Ensemble.train(x=x, y=y, training_frame=train) Performance = Ensemble.model_performance() predic = Ensemble.predict(valid).as_data_frame() yhat = np.array(predic).reshape(-1,1) ytrue = np.array(Test['LogReturn']).reshape(-1,1) yy = np.concatenate((np.exp(yhat),np.exp(ytrue)),axis=1) yy = yy[:99,:] R2 = np.corrcoef(yy.T)[0,1]**2 h2o.cluster().shutdown()
def stackedensemble_nfolds_test():
    """Verify the metalearner cross-validation options of H2OStackedEnsembleEstimator.

    Covered:
    1) `metalearner_nfolds` is stored on the ensemble and passed to the metalearner.
    2) `metalearner_fold_assignment` and `metalearner_fold_column` are stored
       and passed through likewise.
    3) Cross-validation (and validation) metrics of the ensemble equal those
       of its metalearner, using MSE as a proxy.
    """
    # Load the training and test sets
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")

    # Add a fold column, used later by the metalearner_fold_column check
    fold_column = "fold_id"
    train[fold_column] = train.kfold_column(n_folds=3, seed=1)

    # Predictors are all columns except the response and the fold column
    response = "response"
    predictors = train.columns
    predictors.remove(response)
    predictors.remove(fold_column)

    # The response must be categorical
    train[response] = train[response].asfactor()
    test[response] = test[response].asfactor()

    # Cross-validation settings shared by both base learners (3 folds)
    cv_options = dict(
        nfolds=3,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    fit_args = dict(x=predictors, y=response, training_frame=train)

    def _metalearner_of(ensemble):
        # Fetch the metalearner model that backs a trained ensemble
        return h2o.get_model(ensemble.metalearner()['name'])

    # Base learners: GBM and RF
    gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli", ntrees=10, **cv_options)
    gbm.train(**fit_args)
    drf = H2ORandomForestEstimator(ntrees=50, **cv_options)
    drf.train(**fit_args)

    # Not setting metalearner_nfolds must leave it at 0 on both levels
    stack0 = H2OStackedEnsembleEstimator(base_models=[gbm, drf])
    stack0.train(**fit_args)
    assert stack0.params['metalearner_nfolds']['actual'] == 0
    meta0 = _metalearner_of(stack0)
    assert meta0.params['nfolds']['actual'] == 0

    # metalearner_nfolds=3 without a validation frame
    stack1 = H2OStackedEnsembleEstimator(base_models=[gbm, drf],
                                         metalearner_nfolds=3)
    stack1.train(**fit_args)
    # ... is stored in the ensemble's output
    assert stack1.params['metalearner_nfolds']['actual'] == 3
    # ... and the metalearner was cross-validated with that many folds
    meta1 = _metalearner_of(stack1)
    assert meta1.params['nfolds']['actual'] == 3
    # The metalearner's fold assignment stays at "AUTO"
    assert meta1.params['fold_assignment']['actual'] == "AUTO"
    # No validation frame was given, so validation metrics are missing
    assert stack1.mse(valid=True) is None
    # Ensemble and metalearner report the same xval MSE
    assert stack1.mse(xval=True) == meta1.mse(xval=True)

    # Same again, now also passing a validation frame (half of `test`)
    test_parts = test.split_frame(ratios=[0.5], seed=1)
    stack2 = H2OStackedEnsembleEstimator(base_models=[gbm, drf],
                                         metalearner_nfolds=3)
    stack2.train(x=predictors, y=response, training_frame=train,
                 validation_frame=test_parts[0])
    meta2 = _metalearner_of(stack2)
    # Validation and xval MSE must agree between ensemble and metalearner
    assert stack2.mse(valid=True) == meta2.mse(valid=True)
    assert stack2.mse(xval=True) == meta2.mse(xval=True)

    # metalearner_fold_assignment is stored and passed to the metalearner
    stack3 = H2OStackedEnsembleEstimator(base_models=[gbm, drf],
                                         metalearner_nfolds=3,
                                         metalearner_fold_assignment="Modulo")
    stack3.train(**fit_args)
    assert stack3.params['metalearner_fold_assignment']['actual'] == "Modulo"
    meta3 = _metalearner_of(stack3)
    assert meta3.params['fold_assignment']['actual'] == "Modulo"

    # metalearner_fold_column is stored and passed to the metalearner
    stack4 = H2OStackedEnsembleEstimator(
        base_models=[gbm, drf],
        metalearner_fold_column=fold_column,
        metalearner_params=dict(keep_cross_validation_models=True))
    stack4.train(**fit_args)
    assert stack4.params['metalearner_fold_column']['actual']['column_name'] == fold_column
    meta4 = _metalearner_of(stack4)
    assert meta4.params['fold_column']['actual']['column_name'] == fold_column
    # Folds come from the column: nfolds stays 0, yet 3 CV models were kept
    assert meta4.params['nfolds']['actual'] == 0
    assert len(meta4.cross_validation_models()) == 3
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", nfolds=10, ntrees=5, keep_cross_validation_predictions=True, seed=1) my_gbm.train(y=-1, training_frame=d) my_rf = H2ORandomForestEstimator(nfolds=10, ntrees=5, keep_cross_validation_predictions=True, seed=1) my_rf.train(y=-1, training_frame=d) # Train a stacked ensemble using the GBM and GLM above ensemble = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf]) ensemble.train(y=-1, training_frame=d) anytime_model = ensemble #Prequential evaluation for i in range(3, 10): #Test on next batch for accuracy test = B[i] spm = sp.csr_matrix(test.values) d = h2o.H2OFrame(spm)
def stackedensemble_grid_gaussian():
    """Check a stacked ensemble built from a random GBM grid on a gaussian task.

    The test verifies that:
    1) H2OStackedEnsembleEstimator trains without errors on a
       random-grid-based ensemble.
    2) .predict() works on the stack.
    3) .model_performance() works on the stack.
    4) Training and test RMSE of the ensemble are lower than those of the
       best base learner.
    5) Passing a validation_frame to train() yields the same RMSE as scoring
       that same frame through model_performance(test_data=...).
    """
    # Import the dataset and split it into train and test
    dat = h2o.import_file(
        path=pyunit_utils.locate("smalldata/extdata/australia.csv"),
        destination_frame="australia_hex")
    train, test = dat.split_frame(ratios=[.75], seed=1)
    print(train.summary())

    # Identify predictors and response
    x = [
        "premax", "salmax", "minairtemp", "maxairtemp", "maxsst",
        "maxsoilmoist", "Max_czcs"
    ]
    y = "runoffnew"

    # Set number of folds
    nfolds = 5

    # Specify GBM hyperparameters for the grid
    hyper_params = {
        "learn_rate": [0.01, 0.03],
        "max_depth": [3, 4, 5, 6, 9],
        "sample_rate": [0.7, 0.8, 0.9, 1.0],
        "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    }
    search_criteria = {
        "strategy": "RandomDiscrete",
        "max_models": 3,
        "seed": 1
    }

    # Train the grid (CV predictions are kept so the models can be stacked)
    grid = H2OGridSearch(
        model=H2OGradientBoostingEstimator(
            ntrees=10,
            seed=1,
            nfolds=nfolds,
            fold_assignment="Modulo",
            keep_cross_validation_predictions=True),
        hyper_params=hyper_params,
        search_criteria=search_criteria,
        grid_id="gbm_grid_guassian")
    grid.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM grid
    stack = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_gbm_grid_guassian", base_models=grid.model_ids)
    stack.train(x=x, y=y, training_frame=train, validation_frame=test)

    # Check that predictions work
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, "expected " + str(
        pred.nrow) + " to be equal to " + str(test.nrow)
    assert pred.ncol == 1, \
        "expected pred.ncol to be equal to 1 but it was equal to " + str(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Check that stack perf is better (smaller) than the best (smallest) base learner perf.
    # For RMSE lower is better, so the best base learner is the minimum;
    # taking the maximum would compare against the worst grid model.
    # Training RMSE for each base learner
    baselearner_best_rmse_train = min(
        [h2o.get_model(model).rmse(train=True) for model in grid.model_ids])
    stack_rmse_train = perf_stack_train.rmse()
    print("Best Base-learner Training RMSE:  {0}".format(
        baselearner_best_rmse_train))
    print("Ensemble Training RMSE:  {0}".format(stack_rmse_train))
    assert stack_rmse_train < baselearner_best_rmse_train, \
        ("expected stack_rmse_train would be less than "
         "baselearner_best_rmse_train, found it wasn't: "
         "baselearner_best_rmse_train = {0}, stack_rmse_train = {1}").format(
            baselearner_best_rmse_train, stack_rmse_train)

    # Test RMSE for each base learner
    baselearner_best_rmse_test = min([
        h2o.get_model(model).model_performance(test_data=test).rmse()
        for model in grid.model_ids
    ])
    stack_rmse_test = perf_stack_test.rmse()
    print("Best Base-learner Test RMSE:  {0}".format(baselearner_best_rmse_test))
    print("Ensemble Test RMSE:  {0}".format(stack_rmse_test))
    assert stack_rmse_test < baselearner_best_rmse_test, \
        ("expected stack_rmse_test would be less than "
         "baselearner_best_rmse_test, found it wasn't: "
         "baselearner_best_rmse_test = {0}, stack_rmse_test = {1}").format(
            baselearner_best_rmse_test, stack_rmse_test)

    # Check that passing `test` as a validation_frame produces the same metric
    # as stack.model_performance(test). The metrics objects are not identical,
    # so only the RMSE values are compared.
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_rmse_test == perf_stack_validation_frame.rmse(), \
        ("expected stack_rmse_test to be the same as "
         "perf_stack_validation_frame.rmse(), found they were not: "
         "perf_stack_validation_frame.rmse() = {0}, stack_rmse_test = {1}").format(
            perf_stack_validation_frame.rmse(), stack_rmse_test)
def stackedensemble_validation_frame_test():
    """Check the effect of passing a validation_frame to a stacked ensemble.

    The test verifies that:
    1) Validation metrics are absent without a validation_frame and present,
       with the expected types, when one is supplied.
    2) The ensemble trained with a validation_frame has a test AUC that is
       not worse than the one trained without it.
    """
    # Import training and test sets
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/higgs/higgs_train_5k.csv"),
        destination_frame="higgs_train_5k")
    test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/higgs/higgs_test_5k.csv"),
        destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = df.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    df[y] = df[y].asfactor()
    test[y] = test[y].asfactor()

    # Split off a validation_frame
    ss = df.split_frame(seed=1)
    train = ss[0]
    valid = ss[1]

    # Set number of folds
    nfolds = 5

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble & check that validation metrics are missing
    stack1 = H2OStackedEnsembleEstimator(
        base_models=[my_gbm.model_id, my_rf.model_id])
    stack1.train(x=x, y=y, training_frame=train)
    assert stack1.model_performance(valid=True) is None

    # Train a stacked ensemble with a validation_frame & check that
    # validation metrics exist & are of the correct type
    stack2 = H2OStackedEnsembleEstimator(
        base_models=[my_gbm.model_id, my_rf.model_id])
    stack2.train(x=x, y=y, training_frame=train, validation_frame=valid)
    # isinstance (rather than an exact type() comparison) also accepts subclasses
    assert isinstance(stack2.model_performance(valid=True),
                      h2o.model.metrics_base.H2OBinomialModelMetrics)
    assert isinstance(stack2.auc(valid=True), float)

    # Compare test AUC (ensemble with validation_frame should not be worse)
    perf1 = stack1.model_performance(test_data=test)
    perf2 = stack2.model_performance(test_data=test)
    assert perf2.auc() >= perf1.auc()
def airline_gbm_random_grid():
    """Random-discrete GBM grid search on the airlines data, then stack it.

    Steps:
      1) run an H2OGridSearch over GBM hyper-parameters with a
         "RandomDiscrete" strategy;
      2) assert that the search ended for one of the configured reasons
         (max_models reached, max_runtime_secs exceeded, or early stopping);
      3) build a stacked ensemble from the grid's models and predict on the
         training frame.
    """
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin", "Dest"]

    def hundredths(lo, hi):
        # Fresh list of lo/100 .. (hi - 1)/100 on every call.
        return [pct / 100. for pct in range(lo, hi)]

    def powers_of_two(lo, hi):
        # Fresh list of 2**lo .. 2**(hi - 1).
        return [2**exp for exp in range(lo, hi)]

    # Largest exponent tried for min_rows, derived from the frame's row count.
    max_min_rows_exp = int(math.log(air_hex.nrow, 2) - 1)

    # Hyper-parameter value lists (range bounds are inclusive..exclusive).
    hyper_params_tune = {
        'max_depth': list(range(1, 11)),
        'sample_rate': hundredths(20, 101),
        'col_sample_rate': hundredths(20, 101),
        'col_sample_rate_per_tree': hundredths(20, 101),
        'col_sample_rate_change_per_level': hundredths(90, 111),
        'min_rows': powers_of_two(0, max_min_rows_exp + 1),
        'nbins': powers_of_two(4, 11),
        'nbins_cats': powers_of_two(4, 13),
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"],
    }

    # search_criteria directs how the grid search is run. It may end because:
    #   1) the early-stopping conditions given by
    #      stopping_metric/stopping_tolerance/stopping_rounds are met;
    #   2) it has run longer than max_runtime_secs;
    #   3) it has collected max_models models.
    # The search is considered correct if any one of these holds.
    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  # 10 minutes, to hit more stopping conditions
        'max_models': 5,  # build no more than 5 models
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "AUC",
        'stopping_tolerance': 1e-3,
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_params_tune,
                             search_criteria=search_criteria_tune)

    # Time the grid training so the runtime criterion can be checked below.
    started_at = time.time()
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=1234)
    runtime = time.time() - started_at

    def stopped_early(model):
        # Feed the model's "training_auc" scoring history to the pyunit
        # early-stopping evaluator (final positional argument is True, as in
        # the original call; its meaning is defined by pyunit_utils).
        history = pyunit_utils.extract_scoring_history_field(model, "training_auc")
        return pyunit_utils.evaluate_early_stopping(
            history,
            search_criteria_tune["stopping_rounds"],
            search_criteria_tune["stopping_tolerance"],
            True)

    # Check condition 3) first, then 2), then 1); `or`/`any` short-circuit so
    # later checks only run when the earlier ones did not hold.
    reached_max_models = len(air_grid.get_grid()) == search_criteria_tune["max_models"]
    correct_stopping_condition = (
        reached_max_models
        or runtime >= search_criteria_tune["max_runtime_secs"]
        or any(stopped_early(model) for model in air_grid.models)
    )

    assert correct_stopping_condition, "Grid search did not find a model that fits the search_criteria_tune."

    print(air_grid.get_grid("logloss"))

    # Stack every model of the grid and score the training frame.
    stacker = H2OStackedEnsembleEstimator(selection_strategy="choose_all",
                                          base_models=air_grid.model_ids)
    stacker.train(model_id="my_ensemble", y="IsDepDelayed", training_frame=air_hex)
    predictions = stacker.predict(air_hex)  # training data
    print("preditions for ensemble are in: " + predictions.frame_id)
def compute_stack_ensemble(self):
    """Create a stacked ensemble over ``self.all_ids`` and train it.

    The estimator is stored on ``self.ensemble``. Its model id is
    "ensemble_" followed by one integer sampled from 0..99. Training uses
    ``self.X``, ``self.y`` and ``self.train``.
    """
    # Draw the numeric suffix exactly as before: one sample from 0..99.
    suffix = random.sample(list(range(100)), 1)[0]
    ensemble_id = "ensemble_" + str(suffix)

    stack = H2OStackedEnsembleEstimator(model_id=ensemble_id,
                                        base_models=self.all_ids)
    # Publish the estimator before training, so it is set even if train() raises.
    self.ensemble = stack
    stack.train(x=self.X, y=self.y, training_frame=self.train)
def stackedensemble_grid_binomial():
    """Exercise a stacked ensemble built from a random GBM grid (binomial).

    This test checks the following:
      1) That H2OStackedEnsembleEstimator executes w/o errors on a
         random-grid-based ensemble.
      2) That .predict() works on a stack.
      3) That .model_performance() works on a stack.
      4) That the test AUC of the ensemble is better than that of the best
         base learner (training AUC is only printed, not asserted).
      5) That the validation_frame arg on H2OStackedEnsembleEstimator yields
         the same AUC as scoring the same frame via model_performance().
    """
    # Import train and test datasets
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Encode the response as categorical
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds
    nfolds = 5

    # Specify GBM hyperparameters for the grid
    hyper_params = {"learn_rate": [0.01, 0.03],
                    "max_depth": [3, 4, 5, 6, 9],
                    "sample_rate": [0.7, 0.8, 0.9, 1.0],
                    "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
    search_criteria = {"strategy": "RandomDiscrete", "max_models": 3, "seed": 1}

    # Train the grid
    grid = H2OGridSearch(model=H2OGradientBoostingEstimator(ntrees=10,
                                                            seed=1,
                                                            nfolds=nfolds,
                                                            fold_assignment="Modulo",
                                                            keep_cross_validation_predictions=True),
                         hyper_params=hyper_params,
                         search_criteria=search_criteria,
                         grid_id="gbm_grid_binomial")
    grid.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM grid
    stack = H2OStackedEnsembleEstimator(model_id="my_ensemble_gbm_grid_binomial",
                                        base_models=grid.model_ids)
    stack.train(x=x, y=y, training_frame=train, validation_frame=test)

    # Check that prediction works
    pred = stack.predict(test_data=test)
    assert pred.nrow == test.nrow, \
        "expected pred.nrow to be equal to {0} but it was equal to {1}".format(test.nrow, pred.nrow)
    # The previous message printed the actual column count in the "expected"
    # position; name the quantity and report only the actual value instead.
    assert pred.ncol == 3, \
        "expected pred.ncol to be equal to 3 but it was equal to {0}".format(pred.ncol)

    # Evaluate ensemble performance
    perf_stack_train = stack.model_performance()
    perf_stack_test = stack.model_performance(test_data=test)

    # Training AUC for each base learner
    baselearner_best_auc_train = max(h2o.get_model(model).auc(train=True) for model in grid.model_ids)
    stack_auc_train = perf_stack_train.auc()
    print("Best Base-learner Training AUC:  {0}".format(baselearner_best_auc_train))
    print("Ensemble Training AUC:  {0}".format(stack_auc_train))
    # NOTE: stack_auc_train > baselearner_best_auc_train is deliberately not
    # asserted; it does not hold, which is acceptable for training error.

    # Test AUC
    baselearner_best_auc_test = max(h2o.get_model(model).model_performance(test_data=test).auc()
                                    for model in grid.model_ids)
    stack_auc_test = perf_stack_test.auc()
    print("Best Base-learner Test AUC:  {0}".format(baselearner_best_auc_test))
    print("Ensemble Test AUC:  {0}".format(stack_auc_test))
    assert stack_auc_test > baselearner_best_auc_test, \
        "expected stack_auc_test would be greater than baselearner_best_auc_test, found it wasn't: " \
        "baselearner_best_auc_test = {0}, stack_auc_test = {1}".format(baselearner_best_auc_test,
                                                                      stack_auc_test)

    # Check that passing `test` as a validation_frame produces the same metric
    # as stack.model_performance(test). Since the metrics object is not
    # exactly the same, only the AUC values are compared.
    perf_stack_validation_frame = stack.model_performance(valid=True)
    validation_auc = perf_stack_validation_frame.auc()
    assert stack_auc_test == validation_auc, \
        "expected stack_auc_test to be the same as perf_stack_validation_frame.auc(), found they were not: " \
        "perf_stack_validation_frame.auc() = {0}, stack_auc_test = {1}".format(validation_auc,
                                                                              stack_auc_test)
def stackedensemble_binary_test():
    """Compare a freshly trained ensemble with a saved binary ensemble.

    Trains a GBM + DRF stacked ensemble on the higgs data, loads the stored
    binary model "smalldata/binarymodels/stackedensemble/ensemble_higgs",
    and asserts that the "p0"/"p1" predictions of both models on the test
    frame differ by less than 1e-11 in either direction.
    """
    # Import a sample binary outcome train/test set into H2O
    train_path = pyunit_utils.locate("smalldata/higgs/higgs_train_10k.csv")
    train = h2o.import_file(train_path)
    test_path = pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv")
    test = h2o.import_file(test_path)

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # For binary classification, response should be a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 5

    # 1. Generate a 2-model ensemble (GBM + RF)

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        max_depth=3,
        min_rows=2,
        learn_rate=0.2,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(
        ntrees=50,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM and DRF above
    base_ids = [my_gbm.model_id, my_rf.model_id]
    ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                           base_models=base_ids)
    ensemble.train(x=x, y=y, training_frame=train)

    # Predict with the ensemble trained in this Python session
    preds_py = ensemble.predict(test)

    # Load the stored binary model and predict with it
    bin_model_path = pyunit_utils.locate(
        "smalldata/binarymodels/stackedensemble/ensemble_higgs")
    bin_model = h2o.load_model(bin_model_path)
    preds_bin = bin_model.predict(test)

    # Predictions from both models should agree within the tolerance
    pred_diff = preds_bin - preds_py
    tolerance = 1e-11
    probability_columns = ("p0", "p1")
    # Upper bounds first, then lower bounds (same order as the original checks).
    for column in probability_columns:
        assert pred_diff[column].max() < tolerance
    for column in probability_columns:
        assert pred_diff[column].min() > -tolerance