def glm_grid_search_on_weights():
    train = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    train = train.drop("ID")
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    response = "CAPSULE"
    features = list(train.col_names)
    features.remove(response)
    train["wt_2"] = (train["CAPSULE"] == "1").ifelse(2, 1)
    train["wt_100"] = (train["CAPSULE"] == "1").ifelse(100, 1)

    hyper_parameters = OrderedDict()
    hyper_parameters["weights_column"] = ["wt_2", "wt_100"]
    print("GLM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=hyper_parameters)
    gs.train(x=features, y=response, training_frame=train)

    for m in gs.get_grid().models:
        # materialize the map so both membership checks below work in Python 3
        used_features = [x[1] for x in m.varimp()]
        assert "wt_2" not in used_features
        assert "wt_100" not in used_features

    loglosses = gs.sorted_metric_table()["logloss"]
    assert loglosses.nunique() == 2  # models are not identical (=> weights are considered)
def model_build():
    bc_data_set1 = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/data.csv"
    bc_data_train_dataset = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/uncorrected_train.csv"
    bc_data_validate_dataset = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/validate.csv"
    bc_data_test_dataset = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/test.csv"

    train_data = h2o.import_file(bc_data_train_dataset)
    validate_data = h2o.import_file(bc_data_validate_dataset)
    test_data = h2o.import_file(bc_data_test_dataset)

    #
    # Train deep autoencoder learning model on "normal"
    # training data, y ignored
    #
    hyper_parameters = {
        'hidden': list(range(10, 30)),
        'activation': [
            "tanh", "tanh_with_dropout", "rectifier",
            "rectifier_with_dropout", "maxout", "maxout_with_dropout"
        ]
    }
    grid_search = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters)
    grid_search.train(x=train_data.names,
                      training_frame=train_data,
                      validation_frame=validate_data)
    grid_search.show()
    v_frame = grid_search.varimp(True)
    print(v_frame)
def test_stackedensemble_respects_the_max_runtime_secs():
    max_runtime_secs = 1
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 2, 3, 4, 5]
    params = dict(
        fold_assignment="modulo",
        nfolds=3
    )
    data = prepare_data()
    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    big_blending_frame = data.train
    for i in range(15):
        big_blending_frame = big_blending_frame.rbind(big_blending_frame)

    se = H2OStackedEnsembleEstimator(
        base_models=gs1.model_ids,
        max_runtime_secs=max_runtime_secs,
        blending_frame=big_blending_frame)
    try:
        se.train(data.x, data.y, data.train)
        assert False, "This should have failed due to time out."
    except H2OResponseError:
        pass
def test_stackedensemble_propagates_the_max_runtime_secs():
    max_runtime_secs = 5
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 3, 5]
    params = dict(
        fold_assignment="modulo",
        nfolds=3,
        keep_cross_validation_predictions=True
    )
    data = prepare_data()
    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    se = H2OStackedEnsembleEstimator(base_models=[gs1], max_runtime_secs=max_runtime_secs)
    se.train(data.x, data.y, data.train)
    metalearner = h2o.get_model(se.metalearner()["name"])

    # metalearner inherits the configured max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] <= max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] > 0
    # stacked ensemble keeps the configured max_runtime_secs
    assert se.max_runtime_secs == max_runtime_secs
def _prepare_test_env():
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 3, 5]
    params = dict(fold_assignment="modulo",
                  nfolds=3,
                  keep_cross_validation_predictions=True)
    data = prepare_data()
    drf = H2ORandomForestEstimator(**params)
    drf.train(data.x, data.y, data.train, validation_frame=data.train)
    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)
    gs2 = H2OGridSearch(H2ORandomForestEstimator(**params), hyper_params=hyper_parameters)
    gs2.train(data.x, data.y, data.train, validation_frame=data.train)
    return dict(data=data, drf=drf, gs1=gs1, gs2=gs2)
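# Note: prepare_data(), used by the three stacked-ensemble tests above, is not defined
# in these snippets. Below is a minimal sketch of what such a helper might look like;
# the dataset path, the "response" column name, and the Data namedtuple are illustrative
# assumptions, not the original helper.
from collections import namedtuple

def prepare_data():
    # Hypothetical stand-in: returns an object exposing .x (feature names),
    # .y (target name) and .train (an H2OFrame), as the tests above expect.
    Data = namedtuple("Data", ["x", "y", "train"])
    train = h2o.import_file(pyunit_utils.locate("smalldata/higgs/higgs_train_10k.csv"))  # assumed dataset
    y = "response"
    train[y] = train[y].asfactor()
    x = [c for c in train.columns if c != y]
    return Data(x, y, train)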
def test_mean_per_class_error_grid():
    gbm = H2OGradientBoostingEstimator(nfolds=3, fold_assignment="Random", seed=1234)

    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "multinomial"

    ## Early stopping
    gbm.stopping_rounds = 2
    gbm.stopping_metric = "mean_per_class_error"
    gbm.ntrees = 10000
    gbm.max_depth = 3
    gbm.min_rows = 1
    gbm.learn_rate = 0.01
    gbm.score_tree_interval = 1
    gbm.nfolds = None
    gbm.fold_assignment = None
    gbm.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    print(gbm)
    print(gbm.scoring_history())

    ## Grid search
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows': [2 ** x for x in range(0, int(math.log(train.nrow, 2) - 2) + 1)],
        'nbins': [2 ** x for x in range(4, 11)],
        'nbins_cats': [2 ** x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }
    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 10,
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "mean_per_class_error",
        'stopping_tolerance': 1e-3
    }
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_params_tune,
                         search_criteria=search_criteria_tune)
    grid.train(x=predictors, y=response_col,
               training_frame=train, validation_frame=valid,
               distribution="multinomial", seed=1234,
               stopping_rounds=10, stopping_metric="mean_per_class_error",
               stopping_tolerance=1e-3)
    print(grid)  ## sorted by logloss
    print(grid.get_grid("mean_per_class_error"))
def search(self, score_cutoff, param_space, rand_seed, n_models, const_params,
           cv_folds, training_frame, model_directory, predictors, response):
    if ("Windows" in platform()) and (self.estimator == H2OXGBoostEstimator):
        incompatible_message = "Windows currently doesn't support H2OXGBoostEstimator. " \
                               "No xgboost models will be trained."
        self.logger.info(incompatible_message)
        print(incompatible_message)
        return

    criteria = {
        'strategy': 'RandomDiscrete',
        'max_models': n_models,
        'seed': rand_seed,
        # limit the runtime to max_minutes minutes
        'max_runtime_secs': self.max_minutes * 60,
        # early stopping once the leaderboard of the top 5 models has converged
        # to a 0.1% relative difference
        'stopping_rounds': 5,
        'stopping_metric': self.eval_metric,
        'stopping_tolerance': 1e-3
    }

    # Required for H2OStackedEnsembleEstimator
    const_params.update({
        'nfolds': cv_folds,
        'keep_cross_validation_predictions': True,
        'fold_assignment': "Modulo",
        'seed': rand_seed
    })

    grid = H2OGridSearch(model=self.estimator(**const_params),
                         grid_id=self.name + '_grid',
                         hyper_params=param_space,
                         search_criteria=criteria)

    self.logger.info("Training {} models ...".format(self.name))
    # grid.train(x=X, y=Y, nfolds=configuration.CV_FOLDS, seed=rand_seed, training_frame=credit_data)
    try:
        grid.train(x=predictors, y=response, training_frame=training_frame)
    except H2OResponseError:
        self.logger.error('Encountered server error. Skipping ' + self.name)
        return
    self.logger.info("Finished training {} models.".format(self.name))

    # Get the grid results, sorted by the evaluation metric
    results = grid.get_grid(sort_by=self.eval_metric, decreasing=True)
    for x in results:
        print(get_model_cv_metric(x, self.eval_metric))
    high_scoring = [model for model in results
                    if get_model_cv_metric(model, self.eval_metric) > score_cutoff]
    if not high_scoring:
        self.logger.info('Failed to find models that meet the cut off.')
        return
    self.log_training_results(results=results, search_grid=param_space)
    self.save_model_list(model_lst=high_scoring, seed=rand_seed, directory=model_directory)
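# Note: get_model_cv_metric(), used above, is not defined in this snippet. A minimal
# sketch under the assumption that it reads the cross-validated value of the named
# metric off a trained model via H2O's per-metric accessors (e.g. model.auc(xval=True)):
def get_model_cv_metric(model, metric_name):
    # Hypothetical helper: H2O models expose each metric as a method that accepts
    # train/valid/xval flags, so look the method up by name and ask for the CV value.
    return getattr(model, metric_name.lower())(xval=True)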
def test_train_returns_the_trained_models():
    fr = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    target = "CAPSULE"
    fr[target] = fr[target].asfactor()
    grid = H2OGridSearch(
        H2OGradientBoostingEstimator,
        dict(
            ntrees=[5, 10],
            learn_rate=[0.1, 0.5]
        )
    )
    result = grid.train(y=target, training_frame=fr)
    assert isinstance(result, H2OGridSearch)
    assert result is grid
    result.predict(fr)
def test_make_leaderboard_with_leaderboard_frame():
    train = h2o.upload_file(pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    aml2 = H2OAutoML(seed=134, max_models=5)
    aml2.train(y=y, training_frame=train)

    grid = H2OGridSearch(H2OGradientBoostingEstimator(), hyper_params={"ntrees": [1, 2, 3]})
    grid.train(y=y, training_frame=train)

    # with leaderboard frame
    expected_cols = ("model_id", "rmse", "mse", "mae", "rmsle",
                     "mean_residual_deviance", "training_time_ms",
                     "predict_time_per_row_ms", "algo")
    ldb = h2o.make_leaderboard(aml, train, extra_columns="ALL")
    for c in expected_cols:
        assert c in ldb.columns

    for score_data in ("AUTO", "xval", "valid", "train"):
        assert h2o.make_leaderboard(aml, train, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard([aml, aml2], train, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard(grid, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard([aml, grid, aml2.leader], train, scoring_data=score_data).nrow > 0

    # extra columns
    for ec in ("training_time_ms", "predict_time_per_row_ms", "algo"):
        assert ec in h2o.make_leaderboard(grid, train, extra_columns=ec).columns

    # extra columns without leaderboard frame
    for ec in ("training_time_ms", "algo"):
        assert ec in h2o.make_leaderboard(grid, extra_columns=ec).columns

    # sort metrics
    for sm in ("rmse", "mse", "mae", "rmsle", "mean_residual_deviance"):
        assert h2o.make_leaderboard(grid, train, sort_metric=sm).columns[1] == sm
def grid_search(training_df, attribute_property_length):
    h2o.init()
    h2o.connect()
    training_array = training_df.values
    x = training_array[:, 0:attribute_property_length]
    y = training_array[:, attribute_property_length - 1]
    tr_df = h2o.H2OFrame(x)
    training_columns = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']
    response_column = 'C11'
    hyper_parameters = {'ntrees': [15, 20, 25], 'max_depth': [15, 20]}
    random_plus_manual = H2OGridSearch(
        H2ORandomForestEstimator(nfolds=n_splits), hyper_parameters)
    random_plus_manual.train(x=training_columns, y=response_column, training_frame=tr_df)
    random_plus_manual.show()
def train_base_models(data):
    grid = H2OGridSearch(
        H2OGradientBoostingEstimator,
        search_criteria=dict(
            strategy='RandomDiscrete',
            max_models=5,
            seed=seed,
        ),
        hyper_params=dict(
            learn_rate=[0.5, 0.8, 1.0],
            max_depth=[2, 3, 4, 5],
            ntrees=[5, 10, 15],
        ),
    )
    grid.train(data.x, data.y, data.train,
               nfolds=5,
               fold_assignment='Modulo',
               keep_cross_validation_predictions=True)
    return grid.models
def test_make_leaderboard_without_leaderboard_frame():
    train = h2o.upload_file(pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    aml2 = H2OAutoML(seed=134, max_models=5)
    aml2.train(y=y, training_frame=train)

    grid = H2OGridSearch(H2OGradientBoostingEstimator(), hyper_params={"ntrees": [1, 2, 3]})
    grid.train(y=y, training_frame=train)

    assert h2o.make_leaderboard(aml).nrow > 0
    # creating the same leaderboard doesn't end up with duplicate models
    assert h2o.make_leaderboard(aml).nrow == h2o.make_leaderboard(aml).nrow
    assert h2o.make_leaderboard(grid).nrow > 0
    assert h2o.make_leaderboard([aml, aml2, grid, aml.leader]).nrow > 0

    # without leaderboard frame
    for score_data in ("AUTO", "xval", "valid", "train"):
        assert h2o.make_leaderboard(aml, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard([aml, aml2], scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard(grid, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard([aml, grid, aml2.leader], scoring_data=score_data).nrow > 0

    try:
        print(h2o.make_leaderboard(aml, extra_columns="predict_time_per_row_ms"))
        assert False, "Should fail - Cannot calculate the predict time without leaderboard frame"
    except h2o.exceptions.H2OResponseError:
        pass
def grid_metric_accessors():
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]

    # regression
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random")
    gbm_grid = H2OGridSearch(gbm, hyper_params=dict(ntrees=[1, 2, 3]))
    gbm_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    # using list from http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#regression
    for metric in ['r2', 'mse', 'rmse', 'rmsle', 'mae']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v, float), "expected a float for metric {} but got {}".format(metric, v)

    # binomial
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random")
    gbm_grid = H2OGridSearch(gbm, hyper_params=dict(ntrees=[1, 2, 3]))
    gbm_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    # using list from http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#classification
    # + common ones
    for metric in ['gini', 'logloss', 'auc', 'aucpr', 'mse', 'rmse']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v, float), "expected a float for metric {} but got {}".format(metric, v)

    for metric in ['mcc', 'F1', 'F0point5', 'F2', 'accuracy', 'mean_per_class_error']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v[0][1], float), "expected a float for metric {} but got {}".format(metric, v)

    # multinomial
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    distribution = "multinomial"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    gbm = H2OGradientBoostingEstimator(nfolds=3, distribution=distribution, fold_assignment="Random")
    gbm_grid = H2OGridSearch(gbm, hyper_params=dict(ntrees=[1, 2, 3]))
    gbm_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    # using list from http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#classification
    # + common ones
    for metric in ['logloss', 'mse', 'rmse', 'mean_per_class_error']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v, float), "expected a float for metric {} but got {}".format(metric, v)
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'activation': [
        'Tanh', 'TanhWithDropout', 'Rectifier', 'RectifierWithDropout',
        'Maxout', 'MaxoutWithDropout'
    ],
    'epochs': [10, 50, 100],
    'hidden': [32, 64, 128, 256, 512, 1024]
}
grid_search = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters)
grid_search.train(x=training_columns, y='RUL',
                  training_frame=hTrain, validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'distribution': [
        'auto', 'bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma',
        'tweedie', 'laplace', 'quantile', 'huber'
    ],
    'fold_assignment': ['auto', 'random', 'modulo', 'stratified'],
    'histogram_type': ['auto', 'uniform_adaptive', 'random',
                       'quantiles_global', 'round_robin']
}
grid_search = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
grid_search.train(x=training_columns, y='RUL',
                  training_frame=hTrain, validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)
# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'ntrees': [50, 75, 100],
    'max_depth': [20, 50],
    'nbins': [100, 250]
}
grid_search = H2OGridSearch(H2ORandomForestEstimator, hyper_params=hyper_parameters)
grid_search.train(x=training_columns, y='RUL',
                  training_frame=hTrain, validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)
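# A possible follow-up for the grid above (assumes the hTesting frame from earlier):
# pick the best model by validation MSE and score the held-out test set. get_grid()
# is the currently supported way to sort a grid; sort_by() is the older accessor.
best_model = grid_search.get_grid(sort_by="mse", decreasing=False).models[0]
predictions = best_model.predict(hTesting)     # predicted RUL for the test units
print(best_model.model_performance(hTesting))  # full regression metrics on test data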
def test_pubdev_6416(self):
    # Attempt to add a model to the grid by specifying an invalid hyperparameter
    # search range. Training should fail and report an error.
    data = h2o.import_file(pyunit_utils.locate('smalldata/iris/iris_train.csv'))
    hyper_params = {
        'max_depth': [8],
        'sample_rate': [.9],
        'col_sample_rate': [.9],
        'col_sample_rate_per_tree': [.9],
        'col_sample_rate_change_per_level': [.9],
        'min_rows': [5000000],  # invalid hyperparameter value
        'min_split_improvement': [1e-4],
        'histogram_type': ["UniformAdaptive"]
    }
    search_criteria = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 3600,
        'max_models': 1,
        'seed': 12345,
        'stopping_rounds': 5,
        'stopping_metric': "MSE",
        'stopping_tolerance': 1e-3
    }
    gbm = H2OGradientBoostingEstimator(distribution='multinomial',
                                       ntrees=5,
                                       learn_rate=0.05,
                                       score_tree_interval=5,
                                       seed=1,
                                       stopping_rounds=5,
                                       stopping_metric="MSE",
                                       stopping_tolerance=1e-4)
    grid = H2OGridSearch(gbm,
                         hyper_params=hyper_params,
                         grid_id="grid_pubdev6416",
                         search_criteria=search_criteria)
    with self.assertRaises(ValueError) as err:
        grid.train(x=["sepal_len", "sepal_wid"], y="species",
                   max_runtime_secs=3600, training_frame=data)

    # During the first search, the error should be present
    assert "Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=5000000.0: must have at least 1.0E7 (weighted) rows" \
           in str(err.exception)
    assert len(grid.models) == 0

    hyper_params = {
        'max_depth': [8],
        'sample_rate': [.9],
        'col_sample_rate': [.9],
        'col_sample_rate_per_tree': [.9],
        'col_sample_rate_change_per_level': [.9],
        'min_rows': [10],
        'min_split_improvement': [1e-4],
        'histogram_type': ["UniformAdaptive"]
    }
    gbm = H2OGradientBoostingEstimator(distribution='multinomial',
                                       ntrees=5,
                                       learn_rate=0.05,
                                       learn_rate_annealing=0.99,
                                       score_tree_interval=5,
                                       seed=1,
                                       stopping_rounds=5,
                                       stopping_metric="MSE",
                                       stopping_tolerance=1e-4)
    grid = H2OGridSearch(gbm,
                         hyper_params=hyper_params,
                         grid_id="grid_pubdev6416",
                         search_criteria=search_criteria)
    grid.train(x=["sepal_len", "sepal_wid"], y="species",
               max_runtime_secs=3600, training_frame=data)

    # Assert the model is actually trained and added to the grid,
    # unaffected by the previous exception
    assert len(grid.models) == 1
# Grid search param options:
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/grid-search.html#xgboost-hyperparameters
gbm_params2 = {
    'learn_rate': [i * 0.01 for i in range(1, 14)],
    'max_depth': list(range(6, 11)),
    'sample_rate': [i * 0.1 for i in range(4, 11)],
    'col_sample_rate': [i * 0.1 for i in range(4, 11)]
}

# Train and validate a random grid of GBMs
gbm_grid2 = H2OGridSearch(model=H2OGradientBoostingEstimator(
                              ntrees=max_trees,
                              stopping_rounds=3,
                              stopping_tolerance=stop_tol,
                              keep_cross_validation_predictions=True),
                          hyper_params=gbm_params2,
                          search_criteria=search_criteria)
print("grid searching gbm")
gbm_grid2.train(x=train_cols, y='Dry_Yield',
                training_frame=df, fold_column='YearID_KFold')
print(gbm_grid2)
gbm_best = gbm_grid2.get_grid(sort_by='mae').models[0]
print(gbm_best)
gbm_ids = [m.model_id for m in gbm_grid2.models[:stack_top_n_grid_results]]

# random forest
drf_params = {
def pyunit_mean_per_class_error():
    gbm = H2OGradientBoostingEstimator(nfolds=3, fold_assignment="Random", seed=1234)

    ## Binomial
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "bernoulli"
    gbm.train(y=response_col, x=predictors, validation_frame=valid, training_frame=train)
    print(gbm)

    mpce = gbm.mean_per_class_error([0.5, 0.8])  ## different thresholds
    assert abs(mpce[0][1] - 0.004132231404958664) < 1e-5
    assert abs(mpce[1][1] - 0.021390374331550777) < 1e-5

    ## score on train first
    print(gbm.model_performance(train).mean_per_class_error(thresholds=[0.3, 0.5]))

    ## Multinomial
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "multinomial"
    gbm.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    print(gbm)

    mpce = gbm.mean_per_class_error(train=True)
    assert mpce == 0
    mpce = gbm.mean_per_class_error(valid=True)
    # assert(abs(mpce - 0.207142857143) < 1e-5)
    assert abs(mpce - 0.407142857143) < 1e-5
    mpce = gbm.mean_per_class_error(xval=True)
    # assert(abs(mpce - 0.350071715433) < 1e-5)
    assert abs(mpce - 0.35127653471) < 1e-5

    ## Early stopping
    gbm.stopping_rounds = 2
    gbm.stopping_metric = "mean_per_class_error"
    gbm.ntrees = 10000
    gbm.max_depth = 3
    gbm.min_rows = 1
    gbm.learn_rate = 0.01
    gbm.score_tree_interval = 1
    gbm.nfolds = None
    gbm.fold_assignment = None
    gbm.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    print(gbm)
    print(gbm.scoring_history())

    ## Grid search
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows': [2 ** x for x in range(0, int(math.log(train.nrow, 2) - 2) + 1)],
        'nbins': [2 ** x for x in range(4, 11)],
        'nbins_cats': [2 ** x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }
    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 10,
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "mean_per_class_error",
        'stopping_tolerance': 1e-3
    }
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_params_tune,
                         search_criteria=search_criteria_tune)
    grid.train(x=predictors, y=response_col,
               training_frame=train, validation_frame=valid,
               distribution="multinomial", seed=1234,
               stopping_rounds=10, stopping_metric="mean_per_class_error",
               stopping_tolerance=1e-3)
    print(grid)  ## sorted by logloss
    print(grid.get_grid("mean_per_class_error"))
# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)

response_column = 'RUL'

hyper_parameters = {
    'activation': [
        'Tanh', 'TanhWithDropout', 'Rectifier', 'RectifierWithDropout',
        'Maxout', 'MaxoutWithDropout'
    ],
    'hidden': [4, 6, 8, 10, 12, 14, 16, 18, 20],
    'epochs': [50, 100, 150],
    'loss': ['Quadratic', 'Absolute', 'Huber'],
    'distribution': [
        'AUTO', 'bernoulli', 'multinomial', 'poisson', 'gamma', 'tweedie',
        'laplace', 'huber', 'quantile', 'gaussian'
    ]
}
grid_search = H2OGridSearch(H2OAutoEncoderEstimator, hyper_params=hyper_parameters)
grid_search.train(x=selected_columns, training_frame=hTrain, validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)