def kolmogorov_smirnov():
    """Exercise the Kolmogorov-Smirnov metric under several ``gainslift_bins`` settings.

    Four models are trained on the airlines training set:

    * GBM, 20 bins: handed to ``verify_ks`` together with the frame.
    * GBM, 5 bins: the reported KS must differ (to 5 decimals) from ``ks_metric``.
    * XGBoost, 10 bins: KS must be present and strictly between 0 and 1.
    * GBM, 0 bins: no gains/lift table may be returned.
    """
    predictors = ["Origin", "Distance"]
    response = "IsDepDelayed"
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))

    def fit(estimator):
        # Every model in this test is trained on the same columns and frame.
        estimator.train(x=predictors, y=response, training_frame=airlines)
        return estimator

    # 20 bins: delegate the check to the shared verification helper.
    fine_gbm = fit(H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=20))
    verify_ks(fine_gbm, airlines)

    # 5 bins: the model's KS is expected NOT to match the helper's value.
    coarse_gbm = fit(H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=5))
    reported_ks = coarse_gbm.kolmogorov_smirnov()
    print(reported_ks)
    reference_ks = ks_metric(coarse_gbm, airlines)
    print(reference_ks)
    assert round(reported_ks, 5) != round(reference_ks, 5)

    # XGBoost: only sanity-check that the metric exists and is in range.
    xgb = fit(H2OXGBoostEstimator(gainslift_bins=10))
    print(xgb.gains_lift())
    xgb_ks = xgb.kolmogorov_smirnov()
    assert xgb_ks is not None
    assert 0 < xgb_ks < 1

    # gainslift_bins=0: the gains/lift table must be absent.
    no_bins_gbm = fit(H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=0))
    assert no_bins_gbm.gains_lift() is None
def gbm_monotone_quantile_test():
    """Check that quantile GBMs honour a monotone constraint on a single feature.

    Random data is generated from ``f`` (defined elsewhere) plus Gaussian noise.
    One model is trained with constraint ``+1`` on ``x`` and one with ``-1``; the
    predictions, ordered by ``x``, must be non-decreasing and non-increasing
    respectively.
    """
    # Generate data. The order of the random draws is kept as-is.
    x = np.atleast_1d(np.random.uniform(0, 10.0, size=100)).T
    y = f(x).ravel()
    noise_scale = 1.5 + 1.0 * np.random.random(y.shape)
    y += np.random.normal(0, noise_scale)
    train = h2o.H2OFrame({'x': x.tolist(), 'y': y.tolist()})

    def predictions_ordered_by_x(direction):
        # Train with the given constraint and return the first prediction
        # column, ordered by the matching x value.
        model = H2OGradientBoostingEstimator(seed=42, distribution="quantile",
                                             monotone_constraints={"x": direction})
        model.train(y='y', training_frame=train)
        predicted = model.predict(train).as_data_frame().iloc[:, 0].tolist()
        return [pred for _, pred in sorted(zip(x, predicted))]

    # Constraint +1: predictions may never decrease as x grows.
    increasing = predictions_ordered_by_x(1)
    assert all(prev <= nxt for prev, nxt in zip(increasing, increasing[1:])), \
        "The predictions should be monotone."

    # Constraint -1: predictions may never increase as x grows.
    decreasing = predictions_ordered_by_x(-1)
    assert all(prev >= nxt for prev, nxt in zip(decreasing, decreasing[1:])), \
        "The predictions should be monotone."
def checkpointing_test():
    """Verify that ``checkpoint`` accepts both a model object and a model id.

    A one-tree GBM is trained first. A two-tree GBM is then trained from it
    twice, once passing the model itself and once passing its ``model_id``.
    Each time the estimator's ``checkpoint`` attribute must equal what was
    passed in.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    train_args = dict(x=["Origin", "Dest"], y="Distance",
                      training_frame=airlines, validation_frame=airlines)

    base_gbm = H2OGradientBoostingEstimator(ntrees=1)
    base_gbm.train(**train_args)

    # Same scenario for both accepted checkpoint representations.
    for checkpoint in (base_gbm, base_gbm.model_id):
        resumed_gbm = H2OGradientBoostingEstimator(ntrees=2, checkpoint=checkpoint)
        resumed_gbm.train(**train_args)
        assert resumed_gbm.checkpoint == checkpoint
def test_frames_can_be_passed_to_constructor():
    """Train GBM, DRF and a Stacked Ensemble with frames given only to the constructors.

    ``train`` receives nothing but the response column. The ensemble must
    still end up with a positive AUC.
    """
    ds = import_dataset()
    target = ds['target']
    frames = dict(training_frame=ds['train'], validation_frame=ds['valid'])

    gbm = H2OGradientBoostingEstimator(ntrees=10, nfolds=0, seed=seed, **frames)
    gbm.train(y=target)

    rf = H2ORandomForestEstimator(ntrees=10, nfolds=0, seed=seed, **frames)
    rf.train(y=target)

    # The ensemble additionally takes its blending frame through the constructor.
    se = H2OStackedEnsembleEstimator(base_models=[gbm, rf], seed=seed,
                                     blending_frame=ds['blend'], **frames)
    se.train(y=target)
    assert se.auc() > 0
def mojo_conveniece():
    """Round-trip a GBM through MOJO via ``import_mojo`` and ``upload_mojo``.

    A one-tree GBM is trained on the airlines data. Its MOJO is saved and
    imported, then downloaded and uploaded. Each resulting model must be an
    ``H2OGenericEstimator`` and must score the 24421-row airlines frame.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    def assert_generic_and_scores(candidate):
        # The reloaded model must be generic and able to score the full frame.
        assert isinstance(candidate, H2OGenericEstimator)
        scored = candidate.predict(airlines)
        assert scored is not None
        assert scored.nrows == 24421

    # MOJO import test: save into a fresh temporary directory, then import.
    mojo_location = tempfile.mkdtemp()
    mojo_location = gbm.save_mojo(mojo_location)
    assert_generic_and_scores(h2o.import_mojo(mojo_location))

    # MOJO upload test: download_mojo is handed the path returned by
    # save_mojo, exactly as in the original flow.
    mojo_location = gbm.download_mojo(mojo_location)
    assert_generic_and_scores(h2o.upload_mojo(mojo_location))
def partial_plots():
    """Check that partial-dependence plots of a GBM can be saved to disk.

    A GBM is trained on the prostate data (``CAPSULE`` and ``RACE`` turned into
    factors). Two partial plots for ``AGE`` at ``row_index=1`` are produced,
    one without and one with ``save_plot_path``, and both are handed to
    ``test_plot_result_saving`` along with their target paths.
    """
    predictors = ['AGE', 'RACE']
    response = 'CAPSULE'

    data = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate.csv'))
    data[response] = data[response].asfactor()
    data['RACE'] = data['RACE'].asfactor()

    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05)
    gbm_model.train(x=predictors, y=response, training_frame=data)

    # test saving:
    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)
        # The plot built without a save path is created first, as before.
        unsaved_plot = gbm_model.partial_plot(data=data, cols=['AGE'], server=True,
                                              plot=True, row_index=1)
        saved_plot = gbm_model.partial_plot(data=data, cols=['AGE'], server=True,
                                            plot=True, row_index=1, save_plot_path=path1)
        test_plot_result_saving(unsaved_plot, path2, saved_plot, path1)
def tree_test():
    """Fetch one tree each from a GBM, a DRF and an Isolation Forest and validate it.

    Every tree is passed to ``check_tree``. For the GBM, which is trained on
    categorical predictors only, the root node must also expose left and
    right categorical levels.
    """
    # GBM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Tree indices start at 0 and exactly one tree was built.
    gbm_tree = H2OTree(gbm, 0, "NO")
    check_tree(gbm_tree, 0, "NO")
    # Only categoricals in the model, so the root is guaranteed to be a categorical split.
    assert gbm_tree.root_node.left_levels is not None
    assert gbm_tree.root_node.right_levels is not None

    # DRF
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_nice_header.csv"))
    drf = H2ORandomForestEstimator(ntrees=2)
    drf.train(x=["power", "acceleration"], y="cylinders", training_frame=cars)
    check_tree(H2OTree(drf, 1, None), 1)

    # ISOFOR
    ecg_discord = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))
    isofor = H2OIsolationForestEstimator(ntrees=3, seed=12, sample_size=5)
    isofor.train(training_frame=ecg_discord)
    check_tree(H2OTree(isofor, 2), 2)
def download_model():
    """Download a trained GBM binary model and reload it two ways.

    A bernoulli GBM is trained on the prostate data and downloaded into the
    ``results`` directory. The downloaded file must exist, and both
    ``h2o.load_model`` and ``h2o.upload_model`` must give back an
    ``H2OGradientBoostingEstimator``.
    """
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10,
                                                max_depth=8, min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=prostate)

    path = pyunit_utils.locate("results")
    downloaded_model_path = prostate_gbm.download_model(path=path)
    assert os.path.isfile(downloaded_model_path), \
        "Expected load file {0} to exist, but it does not.".format(downloaded_model_path)

    # On failure, report the type that was actually returned. The messages
    # previously printed the file path, which says nothing about the mismatch.
    loaded_model = h2o.load_model(downloaded_model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator), \
        "Expected an H2OGradientBoostingEstimator, but got {0}".format(type(loaded_model))

    uploaded_model = h2o.upload_model(downloaded_model_path)
    assert isinstance(uploaded_model, H2OGradientBoostingEstimator), \
        "Expected an H2OGradientBoostingEstimator, but got {0}".format(type(uploaded_model))
def test_custom_metric(self):
    """Train a weighted GBM with an uploaded custom metric and check what it reports.

    The loan data is imported with ``bad_loan`` as a factor response.
    ``int_rate`` is excluded from the predictors and ``loan_amnt`` is copied
    into a ``weight`` column used as the weights column. The model performance
    must report the custom metric under its uploaded name with the pinned
    value.
    """
    from custom_metric_class import WeightedFalseNegativeLossMetric

    train_path = "file://" + unit_test_utils.locate("smalldata/loan.csv")
    train = h2o.import_file(train_path, destination_frame="loan_train")
    train["bad_loan"] = train["bad_loan"].asfactor()

    y = "bad_loan"
    # The predictor list is taken before the weight column is added, so
    # "weight" never becomes a predictor.
    x = train.col_names
    x.remove(y)
    x.remove("int_rate")

    train["weight"] = train["loan_amnt"]

    weighted_false_negative_loss_func = h2o.upload_custom_metric(
        WeightedFalseNegativeLossMetric,
        func_name="WeightedFalseNegativeLoss",
        func_file="weighted_false_negative_loss.py")

    from h2o.estimators import H2OGradientBoostingEstimator
    gbm = H2OGradientBoostingEstimator(
        model_id="gbm.hex",
        custom_metric_func=weighted_false_negative_loss_func)
    gbm.train(y=y, x=x, training_frame=train, weights_column="weight")

    perf = gbm.model_performance()
    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists in Python 3.12+. The comparison stays exact, as before.
    self.assertEqual(perf.custom_metric_name(), "WeightedFalseNegativeLoss")
    self.assertEqual(perf.custom_metric_value(), 0.24579011595430142)
def test_frames_can_be_overridden_in_train_method():
    """Construct estimators with a dummy frame and pass the real frames to ``train``.

    GBM, DRF and the Stacked Ensemble are all built with a three-value dummy
    frame in every frame slot. The real frames are supplied to ``train``; the
    ensemble must end up with a positive AUC.
    """
    ds = import_dataset()
    dummy_frame = h2o.H2OFrame([1, 2, 3])

    placeholder_frames = dict(training_frame=dummy_frame, validation_frame=dummy_frame)
    actual_frames = dict(training_frame=ds['train'], validation_frame=ds['valid'])

    gbm = H2OGradientBoostingEstimator(ntrees=10, nfolds=0, seed=seed, **placeholder_frames)
    gbm.train(y=ds['target'], **actual_frames)

    rf = H2ORandomForestEstimator(ntrees=10, nfolds=0, seed=seed, **placeholder_frames)
    rf.train(y=ds['target'], **actual_frames)

    # The ensemble also overrides its blending frame at train time.
    se = H2OStackedEnsembleEstimator(base_models=[gbm, rf], seed=seed,
                                     blending_frame=dummy_frame, **placeholder_frames)
    se.train(y=ds['target'], blending_frame=ds['blend'], **actual_frames)
    assert se.auc() > 0
def test_frames_can_be_passed_as_key():
    """Check that frames may be referenced by ``frame_id`` instead of by object.

    Three keyword combinations are tried, each with one frame given as a key.
    They are fed first to the Stacked Ensemble constructor, then to ``train``
    of freshly built base models and ensemble.
    """
    ds = import_dataset()
    frame_variants = [
        {'training_frame': ds['train'].frame_id},
        {'training_frame': ds['train'], 'validation_frame': ds['valid'].frame_id},
        {'training_frame': ds['train'], 'blending_frame': ds['blend'].frame_id},
    ]

    # Constructor validation
    for variant in frame_variants:
        H2OStackedEnsembleEstimator(base_models=[], **variant)

    # train method validation
    base_model_params = {'ntrees': 3, 'nfolds': 3, 'seed': seed,
                         'keep_cross_validation_predictions': True}
    for variant in frame_variants:
        # Base models are trained without the blending frame.
        base_training_args = dict(variant)
        base_training_args.pop('blending_frame', None)
        base_training_args['y'] = ds['target']

        gbm = H2OGradientBoostingEstimator(**base_model_params)
        gbm.train(**base_training_args)
        rf = H2ORandomForestEstimator(**base_model_params)
        rf.train(**base_training_args)

        se = H2OStackedEnsembleEstimator(base_models=[gbm, rf])
        se.train(y=ds['target'], **variant)
def gbm_model_build():
    """Train a bernoulli GBM on the prostate training data.

    ``CAPSULE`` is converted to a factor and used as the response. Columns
    from index 1 up to the last one are passed as predictors.

    :returns: tuple of (trained model, training frame)
    """
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(
        ntrees=100,
        learn_rate=0.1,
        max_depth=5,
        min_rows=10,
        distribution="bernoulli",
    )
    predictor_indices = list(range(1, prostate_train.ncol))
    gbm_h2o.train(x=predictor_indices, y="CAPSULE", training_frame=prostate_train)

    # On PFI with test vs. training data: training data shows how much the
    # model relies on each feature for its predictions; test data shows how
    # much each feature contributes to performance on unseen data. The
    # original author notes they know of no research settling which to prefer.
    return gbm_h2o, prostate_train
def demo_body(go):
    """
    Demo of H2O's Gradient Boosting estimator.

    This demo loads the prostate dataset into h2o and shows a description.
    Then it divides the dataset into training and test sets, builds a GBM
    from the training set, and makes predictions for the test set.
    A tree of the model is then fetched and inspected.
    Finally, default performance metrics are displayed.

    :param go: callable invoked with no arguments before each step of the demo.
        NOTE(review): presumably supplied by the demo runner to pace or echo
        the steps - confirm against the caller.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Load the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    go()
    # Print a description of the prostate data
    prostate.describe()

    go()
    # Randomly split the dataset into ~70/30, training/test sets
    train, test = prostate.split_frame(ratios=[0.70])

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) GBM
    from h2o.estimators import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE", training_frame=train)

    go()
    # Show the model
    prostate_gbm.show()

    go()
    # Predict on the test set and show the predictions
    predictions = prostate_gbm.predict(test)
    predictions.show()

    go()
    # Fetch a tree, print number of tree nodes, show root node description
    from h2o.tree import H2OTree, H2ONode
    tree = H2OTree(prostate_gbm, 0, "0")
    # The bare expressions below have no effect when run as plain code.
    # NOTE(review): they look intended for display by the demo runner - confirm.
    len(tree)
    tree.left_children
    tree.right_children
    tree.root_node.show()

    go()
    # Show default performance metrics
    performance = prostate_gbm.model_performance(test)
    performance.show()
def generic_blank_constructor():
    """Load a MOJO through an ``H2OGenericEstimator`` created without arguments.

    A one-tree GBM is trained on the airlines data and its MOJO downloaded.
    A blank generic estimator then gets its ``path`` set to that MOJO and is
    trained. It must identify the original algorithm as GBM and score the
    24421-row airlines frame.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Download the MOJO into a fresh temporary directory.
    mojo_path = tempfile.mkdtemp()
    mojo_path = gbm.download_mojo(mojo_path)

    # Empty constructor: the MOJO location is set afterwards through `path`.
    generic = H2OGenericEstimator()
    generic.path = mojo_path
    generic.train()
    assert isinstance(generic, H2OGenericEstimator)

    model_output = generic._model_json["output"]
    assert model_output["original_model_identifier"] == "gbm"
    assert model_output["original_model_full_name"] == "Gradient Boosting Machine"

    # Scoring must be available on the generic model.
    scored = generic.predict(airlines)
    assert scored is not None
    assert scored.nrows == 24421
def gradient_boosting(name):
    """
    Build an untrained Gradient Boosting estimator.

    The hyper-parameters come from ``get_params("gradient_boosting")`` and are
    forwarded to the estimator as keyword arguments.

    :param name: value used as the estimator's ``model_id``
    :return: a new ``H2OGradientBoostingEstimator``
    """
    return H2OGradientBoostingEstimator(model_id=name, **get_params("gradient_boosting"))
def demo_body(go):
    """
    Demo of H2O's Gradient Boosting estimator.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a GBM
    from the training set, and makes predictions for the test set.
    Finally, default performance metrics are displayed.

    :param go: callable invoked with no arguments before each step of the demo.
        NOTE(review): presumably supplied by the demo runner to pace or echo
        the steps - confirm against the caller.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.upload_file(data_file("h2o_data/prostate.csv"))

    go()
    # Print a description of the prostate data
    prostate.summary()

    go()
    # Randomly split the dataset into ~70/30, training/test sets
    # (one uniform random value per row; rows below 0.70 go to training)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) GBM
    from h2o.estimators import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE", training_frame=train)

    go()
    # Show the model
    prostate_gbm.show()

    go()
    # Predict on the test set and show the predictions
    predictions = prostate_gbm.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_gbm.model_performance(test)
    performance.show()
def bake(self) -> H2OGradientBoostingEstimator:
    """Train and return a 100-tree gaussian GBM predicting ``distance``.

    The training frame comes from ``stars_frame()``. Its ``distance`` column
    must be of type ``int``; the columns ``name1`` and ``name2`` are ignored
    during training.
    """
    stars = stars_frame()
    assert stars.type("distance") == "int"

    gbm = H2OGradientBoostingEstimator(ntrees=100, distribution="gaussian")
    gbm.train(
        y="distance",
        training_frame=stars,
        ignored_columns=["name1", "name2"],
    )
    return gbm
def bake(self) -> H2OGradientBoostingEstimator:
    """Train and return a 100-tree bernoulli GBM predicting ``sex``.

    Only the first 5000 rows of ``names_frame()`` are used. The ``name``
    column is rebuilt as a factor and must have more than 256 but fewer than
    500 levels.
    """
    names = names_frame()[:5000, :]
    # Going through character and back to factor trims nlevels() to the
    # levels left after slicing.
    names["name"] = names["name"].ascharacter().asfactor()
    assert 256 < names["name"].nlevels()[0] < 500

    gbm = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli")
    gbm.train(y="sex", training_frame=names)
    return gbm
def main():
    """Train a GBM on the bundled prostate data and report results via ``client``.

    Connects to the H2O instance given by the parsed ``host``/``port``
    arguments, splits the data ~70/30, trains a bernoulli GBM, then shows the
    model, its predictions, the root node of its first tree and its test
    performance. Run settings and metrics are pushed through
    ``client.update_task_info``.
    """
    args = parse_args()
    h2o.init(ip=args.host, port=args.port)

    # Load the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")
    # Print a description of the prostate data
    prostate.describe()

    # Record the run settings, then randomly split ~70/30 into training/test sets
    client.update_task_info({
        'test_train': 0.7,
        'learn_rate': 0.2,
    })
    train, test = prostate.split_frame(ratios=[0.70])

    # Convert the response column to a factor (binary classification)
    for frame in (train, test):
        frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    # Build a (classification) GBM
    from h2o.estimators import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        max_depth=8,
        min_rows=10,
        learn_rate=0.2,
    )
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE", training_frame=train)

    # Show the model
    prostate_gbm.show()

    # Predict on the test set and show the predictions
    prostate_gbm.predict(test).show()

    # Fetch the first tree for class "0" and show its root node
    from h2o.tree import H2OTree, H2ONode
    H2OTree(prostate_gbm, 0, "0").root_node.show()

    # Show default performance metrics and report them in a fixed order
    performance = prostate_gbm.model_performance(test)
    performance.show()
    metric_names = ('mse', 'rmse', 'auc', 'gini', 'logloss')
    client.update_task_info({metric: getattr(performance, metric)() for metric in metric_names})
def tree_test():
    """Fetch trees from a GBM, a DRF and an Isolation Forest and validate their structure.

    * GBM (categorical predictors only): the root node must carry categorical
      levels, every non-leaf node must have both categorical splits set, and
      every leaf node (both children ``-1``) must have neither.
    * DRF: the second tree is passed to ``check_tree``.
    * Isolation Forest: no node may carry a categorical split and every node
      must have a threshold entry.
    """
    # GBM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Indexing from 0 in Python. There is exactly one tree built.
    tree = H2OTree(gbm, 0, "NO")
    check_tree(tree, 0, "NO")
    # Only categoricals in the model, guaranteed to have categorical split
    assert tree.root_node.left_levels is not None
    assert tree.root_node.right_levels is not None
    assert tree.left_cat_split is not None and tree.right_cat_split is not None
    assert len(tree.left_cat_split) == len(tree.right_cat_split)

    # There are categorical splits only: every non-leaf node must have both.
    for i in range(len(tree.left_cat_split)):
        is_leaf = tree.left_children[i] == -1 and tree.right_children[i] == -1
        if is_leaf:
            # Leaf nodes have no split at all.
            assert tree.left_cat_split[i] is None
            assert tree.right_cat_split[i] is None
        else:
            assert tree.left_cat_split[i] is not None
            assert tree.right_cat_split[i] is not None

    # DRF
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_nice_header.csv"))
    drf = H2ORandomForestEstimator(ntrees=2)
    drf.train(x=["power", "acceleration"], y="cylinders", training_frame=cars)
    drf_tree = H2OTree(drf, 1, None)
    check_tree(drf_tree, 1)

    # ISOFOR
    ecg_discord = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))
    isofor = H2OIsolationForestEstimator(ntrees=3, seed=12, sample_size=5)
    isofor.train(training_frame=ecg_discord)
    if_tree = H2OTree(isofor, 2)

    # There are no categorical splits, and every node has a threshold entry.
    # The earlier leaf-specific branch was removed for two reasons: it looked up
    # right children in the GBM `tree` instead of `if_tree`, and it asserted
    # `if_tree.thresholds is None`, which cannot hold together with the
    # per-node threshold check that ran for every index anyway.
    for i in range(len(if_tree.node_ids)):
        assert if_tree.left_cat_split[i] is None
        assert if_tree.right_cat_split[i] is None
        assert if_tree.thresholds[i] is not None

    check_tree(if_tree, 2)
def gbm_monotone_tweedie_test():
    """Check how a monotone constraint on ``MVR_PTS`` affects tweedie GBM variable importance.

    Three models are trained on the auto-claims data (80/20 split, seed 123):

    * unconstrained: ``MVR_PTS`` must be among the top three variables;
    * constraint ``+1``: the top three variables must be unchanged;
    * constraint ``-1``: ``MVR_PTS`` must become the least important variable.
    """
    data = h2o.import_file(
        "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/autoclaims.csv")
    data = data.drop(['POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_FLAG', 'IN_YY'])
    train, test = data.split_frame([0.8], seed=123)

    response = "CLM_AMT5"
    monotone_constraints = {
        "MVR_PTS": 1,
    }

    gbm_regular = H2OGradientBoostingEstimator(seed=42, distribution="tweedie")
    gbm_regular.train(y=response, training_frame=train, validation_frame=test)
    print(gbm_regular.varimp(use_pandas=True))
    # `.loc` replaces `DataFrame.ix`, which no longer exists in pandas >= 1.0.
    top_3_vars_regular = gbm_regular.varimp(use_pandas=True).loc[:, 'variable'].head(3).tolist()
    assert "MVR_PTS" in top_3_vars_regular

    gbm_mono = H2OGradientBoostingEstimator(monotone_constraints=monotone_constraints,
                                            seed=42, distribution="tweedie")
    gbm_mono.train(y=response, training_frame=train, validation_frame=test)
    # Print the constrained model's importances; the unconstrained model's
    # table was printed here a second time by mistake.
    print(gbm_mono.varimp(use_pandas=True))
    top_3_vars_mono = gbm_mono.varimp(use_pandas=True).loc[:, 'variable'].head(3).tolist()

    # monotone constraints didn't affect the variable importance
    assert top_3_vars_mono == top_3_vars_regular

    # train a model with opposite constraint on MVR_PTS
    gbm_adverse = H2OGradientBoostingEstimator(seed=42, distribution="tweedie",
                                               monotone_constraints={"MVR_PTS": -1})
    gbm_adverse.train(y=response, training_frame=train, validation_frame=test)

    # variable becomes least important to the model
    assert ["MVR_PTS"] == gbm_adverse.varimp(use_pandas=True).loc[:, 'variable'].tail(1).tolist()
def download_model_filename():
    """Check the path returned by ``download_model`` for each path/filename combination.

    Covered cases: default path and filename, custom path with default
    filename, custom path with three custom filenames (one inside a
    sub-folder), and default path with a custom filename. After each download
    the file must load back as an ``H2OGradientBoostingEstimator``.
    """
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    def assert_loads_as_gbm(path):
        # Every downloaded artifact must be loadable as a GBM.
        reloaded = h2o.load_model(path)
        assert isinstance(reloaded, H2OGradientBoostingEstimator)

    # Default path, default filename: the file is named after the model id.
    model_path = model.download_model()
    assert model_path.endswith(model.model_id), "Not expected path: {0}".format(model_path)
    assert_loads_as_gbm(model_path)

    # Custom path, default filename: still named after the model id.
    tmpdir = tempfile.mkdtemp()
    model_path = model.download_model(tmpdir)
    assert_equals(os.path.join(tmpdir, model.model_id), model_path, "Not expected path")
    assert_loads_as_gbm(model_path)

    # Custom path, custom filename: plain name, name with extension, and a
    # name inside a sub-folder called "not-existing-folder".
    custom_filenames = (
        "gbm_prostate",
        "gbm_prostate.model",
        os.path.join("not-existing-folder", "gbm_prostate.model"),
    )
    for filename in custom_filenames:
        model_path = model.download_model(tmpdir, filename=filename)
        assert_equals(os.path.join(tmpdir, filename), model_path, "Not expected path")
        assert_loads_as_gbm(model_path)

    # Default path, custom filename.
    model_path = model.download_model(filename="gbm_prostate2.model")
    assert model_path.endswith("gbm_prostate2.model"), "Not expected path: {0}".format(model_path)
    assert_loads_as_gbm(model_path)
def test_train_returns_the_trained_model():
    """Check that ``train`` returns the very estimator it was called on.

    The returned object must be a ``ModelBase``, be identical to the
    estimator, and be usable for prediction.
    """
    target = "CAPSULE"
    prostate = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    prostate[target] = prostate[target].asfactor()

    estimator = H2OGradientBoostingEstimator(model_id="py_gbm_train_result", seed=42)
    trained = estimator.train(y=target, training_frame=prostate)

    assert isinstance(trained, ModelBase)
    # Identity, not mere equality: train() must hand back the same object.
    assert trained is estimator
    trained.predict(prostate)
def test_mojo_ids():
    """Check the ``model_id`` of MOJO models created by every import route.

    A one-tree GBM is trained and its MOJO saved. With an explicit
    ``model_id``, ``import_mojo``, ``upload_mojo`` and
    ``H2OGenericEstimator.from_file`` must keep that id. Without an explicit
    id, ``upload_mojo`` and ``import_mojo`` are asserted to produce the
    original model id as well.
    """
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines, verbose=False)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)
    original_model_id = model.model_id
    print(original_model_id)

    # Import MOJO from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename, model_id=original_model_id)
    print(mojo_model.model_id)
    assert_equals(mojo_model.model_id, original_model_id, "Ids should be the same.")

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Upload MOJO from the temporary file
    mojo_model_up = h2o.upload_mojo(original_model_filename, model_id=original_model_id)
    print(mojo_model_up.model_id)
    assert_equals(mojo_model_up.model_id, original_model_id, "Ids should be the same.")

    # Load MOJO model from file
    mojo_model_from_file = H2OGenericEstimator.from_file(original_model_filename, original_model_id)
    print(mojo_model_from_file.model_id)
    assert_equals(mojo_model_from_file.model_id, original_model_id, "Ids should be the same.")

    # Test initialize model_id from path: no explicit id is passed, yet the
    # resulting id is asserted to equal the original one. The messages used to
    # say "should not be the same", contradicting the assertion.
    mojo_model_up_wid = h2o.upload_mojo(original_model_filename)
    print(mojo_model_up_wid.model_id)
    assert_equals(mojo_model_up_wid.model_id, original_model_id, "Ids should be the same.")

    mojo_model_im_wid = h2o.import_mojo(original_model_filename)
    print(mojo_model_im_wid.model_id)
    assert_equals(mojo_model_im_wid.model_id, original_model_id, "Ids should be the same.")
def train_gbm(train, valid):
    """Train a GBM with early stopping driven by a custom metric.

    ``train`` and ``valid`` are first passed through ``convert_frames``. The
    model (id ``Ayaya_gbm``) uses up to 500 trees and stops on the custom
    metric ``mape_func`` with 10 stopping rounds and a tolerance of 0.001.

    :param train: training data, passed to ``convert_frames``
    :param valid: validation data, passed to ``convert_frames``
    :return: the trained ``H2OGradientBoostingEstimator``
    """
    training_frame, validation_frame = convert_frames(train, valid)

    estimator = H2OGradientBoostingEstimator(
        model_id="Ayaya_gbm",
        seed=1337,
        ntrees=500,
        stopping_metric="custom",
        stopping_rounds=10,
        stopping_tolerance=0.001,
        custom_metric_func=mape_func,
    )
    estimator.train(
        training_frame=training_frame,
        y="final_rinse_total_turbidity_liter",
        validation_frame=validation_frame,
    )
    return estimator
def retain_keys_test():
    """Check that ``h2o.remove_all`` keeps exactly the keys it is told to retain.

    First both the frame and the model are retained and must still be
    retrievable. Then only the frame is retained, and fetching the model must
    fail with a "not found" response error.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    train_args = dict(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Both keys retained: frame and model survive the cleanup.
    retained_gbm = H2OGradientBoostingEstimator(ntrees=1)
    retained_gbm.train(**train_args)
    h2o.remove_all([airlines.frame_id, retained_gbm.model_id])
    assert h2o.get_frame(airlines.frame_id) is not None
    assert h2o.get_model(retained_gbm.model_id) is not None

    # Only the frame retained: the model must be gone afterwards.
    dropped_gbm = H2OGradientBoostingEstimator(ntrees=1)
    dropped_gbm.train(**train_args)
    h2o.remove_all([airlines.frame_id])
    h2o.ls()
    try:
        h2o.get_model(dropped_gbm.model_id)
    except h2o.exceptions.H2OResponseError as err:
        assert "not found for argument: key" in err.args[0].dev_msg
    else:
        # The lookup succeeded although the model should have been removed.
        assert False
def mojo_conveniece():
    """Round-trip a GBM through MOJO import/upload and through a downloaded POJO.

    A one-tree GBM is trained on the airlines data. Its MOJO is saved and
    imported, then downloaded and uploaded. Both generic models must score the
    24421-row frame. Finally the POJO is downloaded, passed to
    ``import_mojo``, and its predictions must equal those of the uploaded
    MOJO model.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    def score_all_rows(candidate):
        # Scoring must work and cover every row of the airlines frame.
        scored = candidate.predict(airlines)
        assert scored is not None
        assert scored.nrows == 24421
        return scored

    # MOJO import test: save into a fresh temporary directory, then import.
    mojo_location = tempfile.mkdtemp()
    mojo_location = gbm.save_mojo(mojo_location)
    imported = h2o.import_mojo(mojo_location)
    assert isinstance(imported, H2OGenericEstimator)
    score_all_rows(imported)

    # MOJO upload test: download_mojo is handed the path returned by
    # save_mojo, exactly as in the original flow.
    mojo_location = gbm.download_mojo(mojo_location)
    uploaded = h2o.upload_mojo(mojo_location)
    assert isinstance(uploaded, H2OGenericEstimator)
    mojo_predictions = score_all_rows(uploaded)

    # POJO re-import test: download the POJO and import it through import_mojo.
    pojo_target = os.path.join(pyunit_utils.locate("results"), gbm.model_id + ".java")
    pojo_path = gbm.download_pojo(path=pojo_target)
    pojo_predictions = score_all_rows(h2o.import_mojo(pojo_path))

    assert_frame_equal(mojo_predictions.as_data_frame(), pojo_predictions.as_data_frame())
def test_checkpointing_gives_equal_model_summary():
    """Compare a checkpoint-resumed GBM with one trained in a single run.

    A 50-tree GBM is extended to 100 trees through ``checkpoint`` and compared
    with a reference 100-tree GBM (same seed). The model sizes may differ by
    at most 20 bytes; mean depth and min/max/mean leaves must be equal.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    train_args = dict(
        x=["ID", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        y="CAPSULE",
        training_frame=prostate,
    )

    first_half = H2OGradientBoostingEstimator(ntrees=50, seed=1111)
    first_half.train(**train_args)

    resumed = H2OGradientBoostingEstimator(ntrees=100, seed=1111, checkpoint=first_half.model_id)
    resumed.train(**train_args)

    reference = H2OGradientBoostingEstimator(ntrees=100, seed=1111)
    reference.train(**train_args)

    assert resumed.checkpoint == first_half.model_id

    checkpoint_summary = resumed._model_json["output"]["model_summary"]
    expected_summary = reference._model_json["output"]["model_summary"]
    print(checkpoint_summary)
    print(expected_summary)

    # Model size is allowed a small tolerance; the tree statistics are not.
    size_gap = abs(expected_summary["model_size_in_bytes"][0] - checkpoint_summary["model_size_in_bytes"][0])
    assert size_gap <= 20, "Not expected size of model created from checkpoint"
    for field in ("mean_depth", "min_leaves", "max_leaves", "mean_leaves"):
        assert_equals(expected_summary[field][0], checkpoint_summary[field][0])
def trainAndTestH2OPythonGbm(hc, dataset):
    """Train a GBM with the H2O Python API and score ``dataset`` through its MOJO.

    ``dataset`` is converted with ``hc.asH2OFrame`` and a GBM (seed 42) is
    trained with ``CAPSULE`` as the response. The MOJO is downloaded into a
    temporary directory, loaded as an ``H2OMOJOModel`` with detailed
    prediction columns enabled, and applied to ``dataset``. The temporary
    directory is always removed.

    :param hc: object providing ``asH2OFrame``
    :param dataset: data to train on and to transform
    :return: the transformed data restricted to the prediction column and the
        two detailed class-probability columns
    """
    selected_columns = (
        "prediction",
        "detailed_prediction.probabilities.0",
        "detailed_prediction.probabilities.1",
    )

    training_frame = hc.asH2OFrame(dataset)
    gbm = H2OGradientBoostingEstimator(seed=42)
    gbm.train(y="CAPSULE", training_frame=training_frame)

    mojo_dir = tempfile.mkdtemp(prefix="")
    try:
        mojo_path = gbm.download_mojo(mojo_dir)
        mojo_settings = H2OMOJOSettings(withDetailedPredictionCol=True)
        mojo_model = H2OMOJOModel.createFromMojo("file://" + mojo_path, mojo_settings)
        return mojo_model.transform(dataset).select(*selected_columns)
    finally:
        # Clean up the downloaded MOJO whether or not scoring succeeded.
        shutil.rmtree(mojo_dir)
def stackedensemble_mojo_model_test():
    """Export a Stacked Ensemble as MOJO and reload it as a generic model.

    GBM and DRF base models are cross-validated on the iris training data and
    combined into a Stacked Ensemble. Its MOJO is downloaded and loaded twice:
    through ``lazy_import`` plus ``model_key``, and through
    ``H2OGenericEstimator.from_file``. Both generic models must predict on the
    test frame, and a MOJO re-downloaded from the generic model must have the
    same file size as the original.
    """
    train = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    cv_params = dict(nfolds=2, fold_assignment="Modulo", keep_cross_validation_predictions=True)

    gbm = H2OGradientBoostingEstimator(**cv_params)
    gbm.train(x=x, y=y, training_frame=train)
    rf = H2ORandomForestEstimator(**cv_params)
    rf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train, validation_frame=test,
                                     base_models=[gbm.model_id, rf.model_id])
    se.train(x=x, y=y, training_frame=train)
    print(se)
    # show() runs inside Capturing; what it captures is not inspected.
    with Capturing():
        se.show()

    # Download the ensemble's MOJO into a fresh temporary directory.
    ensemble_mojo_path = tempfile.mkdtemp()
    ensemble_mojo_path = se.download_mojo(ensemble_mojo_path)

    # Route 1: lazy import of the file, then a generic model built from its key.
    imported_keys = h2o.lazy_import(ensemble_mojo_path)
    mojo_frame = h2o.get_frame(imported_keys[0])
    generic_from_key = H2OGenericEstimator(model_key=mojo_frame)
    generic_from_key.train()
    compare_params(se, generic_from_key)
    assert generic_from_key.predict(test) is not None

    # Route 2: constructor generating the model from the existing MOJO file.
    generic_from_file = H2OGenericEstimator.from_file(ensemble_mojo_path)
    assert generic_from_file is not None
    assert generic_from_file.predict(test) is not None

    # The MOJO re-exported from the generic model must match the original size.
    reexported_path = tempfile.mkdtemp("zip", "genericMojo")
    reexported_path = generic_from_file.download_mojo(path=reexported_path)
    assert os.path.getsize(reexported_path) == os.path.getsize(ensemble_mojo_path)