def glrm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Launch a GLRM grid search through ``grid.start`` and return the grid.

    :param grid_id: id given to the grid.
    :param export_dir: directory passed to the grid as ``recovery_dir``.
    :param train: training frame; all of its columns (``train.names``) are used as ``x``.
    :param params: extra keyword arguments forwarded to ``grid.start``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :return: the ``H2OGridSearch`` object on which ``start`` was called.
    """
    grid_options = dict(
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
        parallelism=2,
    )
    glrm_grid = H2OGridSearch(H2OGeneralizedLowRankEstimator, **grid_options)
    glrm_grid.start(x=train.names, training_frame=train, **params)
    return glrm_grid
def setup_grid():
    """Clear the H2O cluster and return an untrained GBM grid search.

    The hyper-parameter space is an ``OrderedDict`` with ``learn_rate`` first and
    ``ntrees`` second.

    :return: an ``H2OGridSearch`` over ``H2OGradientBoostingEstimator``.
    """
    h2o.remove_all()
    search_space = OrderedDict([
        ("learn_rate", [0.1, 0.05, 0.01]),
        ("ntrees", [1, 3, 5]),
    ])
    return H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=search_space)
def benign_grid():
    """Grid-search a binomial GLM on the benign dataset and exercise the grid API.

    Steps, in order:
      1. train a grid whose ``alpha`` list deliberately contains a non-float value;
      2. check every model type, show the grid, rank by F1 and predict with the
         top model and with the grid itself;
      3. re-fetch the grid through ``H2OGridSearch.get_grid`` using its grid id;
      4. train a ``RandomDiscrete`` grid with ``max_models`` of 3 and assert that
         exactly three models were built.
    """
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = [col for col in range(11) if col != 3]  # columns 0..10 without the response

    # NOTE: this tests bad parameter value handling; 'a' is not a float:
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}

    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
    gs.train(x=X, y=Y, training_frame=benign)

    for built_model in gs:
        assert isinstance(built_model, H2OGeneralizedLinearEstimator)

    gs.show()
    print(gs.sort_by('F1', False))
    top_id = gs.sort_by('F1', False)['Model Id'][0]
    top_model = h2o.get_model(top_id)
    top_model.predict(benign)
    gs.predict(benign)
    print(gs.get_hyperparams(top_id))
    print(gs.grid_id)

    # fetch the same grid again by id
    fetched = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
    fetched.show()
    print(fetched.grid_id)
    print(fetched.sort_by('F1', False))

    assert top_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing
    search_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    capped = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters,
        search_criteria=search_criteria)
    capped.train(x=X, y=Y, training_frame=benign)

    capped.show()
    print(capped.grid_id)
    print(capped.sort_by('F1', False))

    assert len(capped.models) == 3, "expected 3 models, got: {}".format(
        len(capped.models))
def train_models(self):
    """Train the gaussian GAM grid search, then every manually built GAM model.

    The grid is stored on ``self.h2o_model``. The grid and the models in
    ``self.manual_gam_models`` are all trained with ``self.myX``, ``self.myY``
    and ``self.h2o_data``.
    """
    base_gam = H2OGeneralizedAdditiveEstimator(family="gaussian", keep_gam_cols=True)
    self.h2o_model = H2OGridSearch(
        base_gam,
        hyper_params=self.hyper_parameters,
        search_criteria=self.search_criteria,
    )
    self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    for manual_gam in self.manual_gam_models:
        manual_gam.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    print("done")
def h2o_randomsearch_nn(frames, y_name, x_names, dname):
    """Tune an H2O deep-learning model with a random discrete grid search.

    Both frames are mean-imputed in place and then standardised with
    ``h2o_stdize`` before training.

    Args:
        frames: sequence whose element 0 is the H2O training frame and whose
            element 1 is the H2O validation frame.
        y_name: Target name.
        x_names: List of input names.
        dname: Name of data file.

    Returns:
        List of modeling results: model tag, description, ``dname``, number of
        training rows, number of inputs, and the first three error measures.
    """
    train_fr = frames[0]
    valid_fr = frames[1]

    # impute numeric columns; a categorical 'NA' is treated as a valid level
    train_fr.impute(method='mean')
    valid_fr.impute(method='mean')

    # standardise
    train_fr = h2o_stdize(train_fr, y_name, x_names)
    valid_fr = h2o_stdize(valid_fr, y_name, x_names)

    # random grid search space and stopping criteria
    hidden_layouts = [[17, 32], [8, 19], [32, 16, 8], [100], [10, 10, 10, 10]]
    search_space = {
        "hidden": hidden_layouts,
        "l1": [s/1e6 for s in range(1, 1001)],
        "l2": [s/1e4 for s in range(1, 101)],
        "input_dropout_ratio": [s/1e2 for s in range(1, 21)],
    }
    criteria = {
        "strategy": "RandomDiscrete",
        "max_models": 20,
        "max_runtime_secs": 600,
        "seed": SEED,
    }

    search = H2OGridSearch(H2ODeepLearningEstimator,
                           hyper_params=search_space,
                           search_criteria=criteria)
    search.train(x=x_names,
                 y=y_name,
                 training_frame=train_fr,
                 validation_frame=valid_fr)

    # a factor target gets classification measures, anything else regression measures
    if train_fr[y_name].isfactor()[0]:
        measures = h2o_cla_err_measures(search, train_fr, valid_fr, y_name)
    else:
        measures = h2o_reg_err_measures(search)

    return [
        'NN',
        'NN w/ random hyperparameter search',
        dname,
        train_fr.nrow,
        len(x_names),
        measures[0],
        measures[1],
        measures[2],
    ]
def test3_glm_random_grid_search_max_runtime_secs(self):
    """
    Test the max_runtime_secs stopping criterion of the randomized grid search.

    A random budget is drawn from [0, max_grid_runtime * allowed_scaled_overtime]
    and used as max_runtime_secs. The grid's total run time, as reported by
    pyunit_utils.find_grid_runtime, is then compared with the budget. Because
    each algorithm checks its elapsed time at its own granularity, the budget may
    be exceeded by the fraction self.allowed_diff and the test still passes. A
    grid holding exactly one model also passes, since one model is always built.

    :return: None
    """
    print(
        "*******************************************************************************************"
    )
    print("test3_glm_random_grid_search_max_runtime_secs for GLM " + self.family)
    h2o.cluster_info()

    # set up our stopping condition here
    max_run_time_secs = random.uniform(
        0, self.max_grid_runtime * self.allowed_scaled_overtime)
    search_criteria = {
        'strategy': 'RandomDiscrete',
        'max_runtime_secs': max_run_time_secs,
        "seed": round(time.time())
    }
    # search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': 1/1e8}

    print("GLM Binomial grid search_criteria: {0}".format(search_criteria))

    # fire off random grid-search
    base_glm = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    grid_model = H2OGridSearch(base_glm,
                               hyper_params=self.hyper_params,
                               search_criteria=search_criteria)
    grid_model.train(x=self.x_indices,
                     y=self.y_index,
                     training_frame=self.training1_data)

    actual_run_time_secs = pyunit_utils.find_grid_runtime(grid_model)

    # within budget, or only the single model that is always generated
    if (actual_run_time_secs <= search_criteria["max_runtime_secs"] * (1 + self.allowed_diff)
            or len(grid_model) == 1):
        print("test3_glm_random_grid_search_max_runtime_secs: passed!")
    else:
        self.test_failed += 1
        self.test_failed_array[self.test_num] = 1
        print(
            "test3_glm_random_grid_search_max_runtime_secs: failed.  Model takes time {0}"
            " seconds which exceeds allowed time {1}".format(
                actual_run_time_secs,
                max_run_time_secs * (1 + self.allowed_diff)))

    self.test_num += 1
    sys.stdout.flush()
def h2o_randomsearch_gbm(frames, y_name, x_names, dname):
    """Tune an H2O GBM with a random discrete grid search.

    Both frames are mean-imputed in place and then standardised with
    ``h2o_stdize`` before training.

    Args:
        frames: sequence whose element 0 is the H2O training frame and whose
            element 1 is the H2O validation frame.
        y_name: Target name.
        x_names: List of input names.
        dname: Name of data file.

    Returns:
        List of modeling results: model tag, description, ``dname``, number of
        training rows, number of inputs, and the first three error measures.
    """
    train_fr = frames[0]
    valid_fr = frames[1]

    # impute numeric columns; a categorical 'NA' is treated as a valid level
    train_fr.impute(method='mean')
    valid_fr.impute(method='mean')

    # standardise
    train_fr = h2o_stdize(train_fr, y_name, x_names)
    valid_fr = h2o_stdize(valid_fr, y_name, x_names)

    # random grid search space and stopping criteria
    # NOTE(review): both ranges start at 0, so ntrees=0 and max_depth=0 can be
    # sampled -- confirm that this is intended.
    tenths = [s / 10.0 for s in range(1, 11)]
    search_space = {
        "ntrees": range(100),
        "max_depth": range(20),
        "sample_rate": tenths,
        "col_sample_rate": list(tenths),
    }
    criteria = {
        "strategy": "RandomDiscrete",
        "max_models": 20,
        "max_runtime_secs": 600,
        "seed": SEED,
    }

    search = H2OGridSearch(H2OGradientBoostingEstimator,
                           hyper_params=search_space,
                           search_criteria=criteria)
    search.train(x=x_names,
                 y=y_name,
                 training_frame=train_fr,
                 validation_frame=valid_fr)

    # a factor target gets classification measures, anything else regression measures
    if train_fr[y_name].isfactor()[0]:
        measures = h2o_cla_err_measures(search, train_fr, valid_fr, y_name)
    else:
        measures = h2o_reg_err_measures(search)

    return [
        'GBM',
        'GBM w/ random hyperparameter search',
        dname,
        train_fr.nrow,
        len(x_names),
        measures[0],
        measures[1],
        measures[2],
    ]
def benign_grid():
    """Grid-search a binomial GLM on the benign dataset with three search setups.

    1. A grid whose ``alpha`` list deliberately contains a non-float value; the
       models are type-checked, ranked by F1 and used for prediction.
    2. A ``RandomDiscrete`` search with ``max_models`` of 3, asserting that
       exactly three models are built.
    3. A seeded ``RandomDiscrete`` search with metric-based early stopping and
       5-fold cross-validation, asserting that exactly five models are built.
    """
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = list(range(4, 12))

    # NOTE: this tests bad parameter value handling; 'a' is not a float:
    hyper_parameters = {
        'alpha': [0.01, 0.3, 0.5, 'a'],
        'lambda': [1e-5, 1e-6, 1e-7, 1e-8],
    }

    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
    gs.train(x=X, y=Y, training_frame=benign)

    for built_model in gs:
        assert isinstance(built_model, H2OGeneralizedLinearEstimator)

    gs.show()
    print(gs.sort_by('F1', False))
    top_id = gs.sort_by('F1', False)['Model Id'][0]
    top_model = h2o.get_model(top_id)
    top_model.predict(benign)
    gs.predict(benign)
    print(gs.get_hyperparams(top_id))
    print(gs.grid_id)

    assert top_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing and max_models
    search_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    capped = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                           hyper_parameters,
                           search_criteria=search_criteria)
    capped.train(x=X, y=Y, training_frame=benign)

    capped.show()
    print(capped.grid_id)
    print(capped.sort_by('F1', False))

    assert len(capped.models) == 3, "expected 3 models, got: {}".format(len(capped.models))
    print(capped.sorted_metric_table())

    # test search_criteria plumbing and asymptotic stopping
    search_criteria = {
        'strategy': "RandomDiscrete",
        'seed': 42,
        'stopping_metric': "AUTO",
        'stopping_tolerance': 0.1,
        'stopping_rounds': 2,
    }
    early_stopped = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial', nfolds=5),
        hyper_parameters,
        search_criteria=search_criteria)
    early_stopped.train(x=X, y=Y, training_frame=benign)

    early_stopped.show()
    print(early_stopped.grid_id)
    print(early_stopped.sort_by('F1', False))

    assert len(early_stopped.models) == 5, "expected 5 models, got: {}".format(
        len(early_stopped.models))
def create_grid(self):
    """Build (without training) a GBM grid search from this object's settings.

    The base estimator is created from ``self.model_params``. The grid uses
    ``self.hyper_params``, ``self.grid_id`` and ``self.search_params``.

    :return: the configured ``H2OGridSearch`` object.
    """
    base_estimator = H2OGradientBoostingEstimator(**self.model_params)
    return H2OGridSearch(
        model=base_estimator,
        hyper_params=self.hyper_params,
        grid_id=self.grid_id,
        search_criteria=self.search_params,
    )
def optimize_hyperparam(self,
                        hyper_params,
                        X,
                        y,
                        sample_weight=None,
                        X_valid=None,
                        y_valid=None,
                        sample_weight_valid=None,
                        h2o_train_params=None,
                        **kwargs):
    """Hyperparameter optimization & fitting model in H2O.

    Runs an ``H2OGridSearch`` over ``hyper_params``, re-fetches the grid sorted
    by ascending ``residual_deviance``, stores the hyperparameters of its first
    row in ``self.best_params`` and replaces ``self.model`` with the first model
    of the sorted grid.

    Args:
        hyper_params: Hyperparameter space passed to ``H2OGridSearch`` as
            ``hyper_params``.
        X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
        y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
        sample_weight (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional):
            Training sample weights.
        X_valid (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional):
            Validation data (only h2o supported).
        y_valid (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional):
            Validation target values (only h2o supported).
        sample_weight_valid (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional):
            Validation sample weights.
        h2o_train_params (:obj:`dict`, optional): Parameters passed to
            `H2OGridSearch.train()`.
        **kwargs: Other parameters passed to H2OGridSearch.

    Returns:
        dict: {`hyperparameter_name`: `optimal_choice`}, Dictionary
        containing optimal hyperparameter choice.

    Raises:
        NotImplementedError: If the backend is not ``'h2o'`` or the model is
            not an ``H2OEstimator``.
    """
    h2o_ready = self.backend == 'h2o' and isinstance(self.model, H2OEstimator)
    if not h2o_ready:
        raise NotImplementedError(
            f'Error with the backend choice. Supported backends: {self._backends}'
        )

    train_kwargs = h2o_train_params if h2o_train_params is not None else dict()
    features, target, train_set, train_kwargs = self._x_y_to_h2o_frame(
        X, y, sample_weight, train_kwargs, X_valid, y_valid, sample_weight_valid)

    model_grid = H2OGridSearch(model=self.model, hyper_params=hyper_params, **kwargs)
    model_grid.train(y=target, x=features, training_frame=train_set, **train_kwargs)
    ranked_grid = model_grid.get_grid(sort_by='residual_deviance', decreasing=False)

    # first row of the metric table, columns up to 'model_ids' with that column dropped
    top_row = ranked_grid.sorted_metric_table().loc[0, :'model_ids']
    self.best_params = top_row.drop('model_ids').to_dict()

    # strip the square brackets from the values and skip the empty-named column
    self.best_params = {
        name: raw.replace('[', '').replace(']', '')
        for name, raw in self.best_params.items()
        if name != ''
    }

    # numeric-looking strings become floats, everything else stays a string
    self.best_params = {
        name: float(text) if is_number(text) else text
        for name, text in self.best_params.items()
    }

    self.model = ranked_grid.models[0]
    return self.best_params
def train_models(self):
    """Train the gaussian GAM grid (fixed gam columns), then the manual GAM models.

    The grid is stored on ``self.h2o_model``. The grid and the models in
    ``self.manual_gam_models`` are all trained with ``self.myX``, ``self.myY``
    and ``self.h2o_data``.
    """
    base_gam = H2OGeneralizedAdditiveEstimator(
        family="gaussian",
        gam_columns=["C11", "C12", "C13"],
        keep_gam_cols=True,
        scale=[1, 1, 1],
        num_knots=[5, 6, 7],
        bs=[0, 2, 0],
    )
    self.h2o_model = H2OGridSearch(base_gam, self.hyper_parameters)
    self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    for manual_gam in self.manual_gam_models:
        manual_gam.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
def kmeans_start(grid_id, export_dir, train, params, hyper_parameters):
    """Launch a K-Means grid search through ``grid.start`` and return the grid.

    :param grid_id: id given to the grid.
    :param export_dir: directory passed to the grid as ``recovery_dir``.
    :param train: training frame; columns 0-3 are used as ``x``.
    :param params: extra keyword arguments forwarded to ``grid.start``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :return: the ``H2OGridSearch`` object on which ``start`` was called.
    """
    grid_options = dict(
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
    )
    kmeans_grid = H2OGridSearch(H2OKMeansEstimator(), **grid_options)
    kmeans_grid.start(x=list(range(4)), training_frame=train, **params)
    return kmeans_grid
def dl_start(grid_id, export_dir, train, params, hyper_parameters):
    """Launch a deep-learning grid search through ``grid.start`` and return the grid.

    :param grid_id: id given to the grid.
    :param export_dir: directory passed to the grid as ``recovery_dir``.
    :param train: training frame; columns 0-3 are ``x`` and column 4 is ``y``.
    :param params: extra keyword arguments forwarded to ``grid.start``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :return: the ``H2OGridSearch`` object on which ``start`` was called.
    """
    grid_options = dict(
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
    )
    dl_grid = H2OGridSearch(H2ODeepLearningEstimator, **grid_options)
    dl_grid.start(x=list(range(4)), y=4, training_frame=train, **params)
    return dl_grid
def gbm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Launch a GBM grid search through ``grid.start`` and return the grid.

    :param grid_id: id given to the grid.
    :param export_dir: directory passed to the grid as ``recovery_dir``.
    :param train: training frame; columns from index 2 onwards are ``x`` and
        the ``"Angaus"`` column is ``y``.
    :param params: extra keyword arguments forwarded to ``grid.start``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :return: the ``H2OGridSearch`` object on which ``start`` was called.
    """
    grid_options = dict(
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
    )
    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator, **grid_options)
    predictor_idx = list(range(2, train.ncol))
    gbm_grid.start(x=predictor_idx, y="Angaus", training_frame=train, **params)
    return gbm_grid
def airline_gbm_random_grid():
    """Random GBM grid search on the airlines data, followed by a stacked ensemble.

    Builds a ``RandomDiscrete`` GBM grid capped at five models, asserts that
    five models were built, prints the grid sorted by logloss, then trains an
    ``H2OStackedEnsembleEstimator`` on the grid's models and predicts on the
    training frame.
    """
    def _hundredths(start, stop):
        # start/100 .. (stop-1)/100
        return [x / 100. for x in range(start, stop)]

    def _powers_of_two(start, stop):
        # 2**start .. 2**(stop-1)
        return [2**x for x in range(start, stop)]

    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin", "Dest"]

    # create hyperparameter and search criteria lists (ranges are inclusive..exclusive)
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': _hundredths(20, 101),
        'col_sample_rate': _hundredths(20, 101),
        'col_sample_rate_per_tree': _hundredths(20, 101),
        'col_sample_rate_change_per_level': _hundredths(90, 111),
        'min_rows': _powers_of_two(0, int(math.log(air_hex.nrow, 2) - 1) + 1),
        'nbins': _powers_of_two(4, 11),
        'nbins_cats': _powers_of_two(4, 13),
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"],
    }

    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  # limit the runtime to 10 minutes
        'max_models': 5,  # build no more than 5 models
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "AUC",
        'stopping_tolerance': 1e-3,
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_params_tune,
                             search_criteria=search_criteria_tune)
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=1234)

    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))

    # stack every model of the grid
    stacker = H2OStackedEnsembleEstimator(selection_strategy="choose_all",
                                          base_models=air_grid.model_ids)
    stacker.train(model_id="my_ensemble", y="IsDepDelayed", training_frame=air_hex)
    predictions = stacker.predict(air_hex)  # training data
    print("preditions for ensemble are in: " + predictions.frame_id)
def glm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Launch a GLM grid search through ``grid.start`` and return the grid.

    The response is the ``"cylinders"`` column; the predictors are the list
    returned by ``train.names`` with that name removed.

    :param grid_id: id given to the grid.
    :param export_dir: directory passed to the grid as ``recovery_dir``.
    :param train: training frame.
    :param params: extra keyword arguments forwarded to ``grid.start``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :return: the ``H2OGridSearch`` object on which ``start`` was called.
    """
    response = "cylinders"
    predictors = train.names
    predictors.remove(response)

    glm_grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator,
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
    )
    glm_grid.start(x=predictors, y=response, training_frame=train, **params)
    return glm_grid
def grid_quasar_pca():
    """Build PCA grids on the SDSS quasar data and validate them.

    Checks performed:
      * every model of the grid is an ``H2OPCA``;
      * the grid holds one model per combination of the random grid space;
      * doubling every value list of the space does not change the grid size;
      * the hyper-parameters passed to the grid appear on the built models.
    """
    quasar = h2o.import_file(
        path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"), header=1)
    grid_space = pyunit_utils.make_random_grid_space(
        algo="pca", ncols=quasar.ncol, nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    quasar_pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
    quasar_pca_grid.train(x=list(range(1, 23)), training_frame=quasar)

    for pca_model in quasar_pca_grid:
        assert isinstance(pca_model, H2OPCA)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for candidates in grid_space.values():
        # a value that is not a list counts as a single candidate
        if type(candidates) != list:
            candidates = [candidates]
        size_of_grid_space *= len(candidates)
    actual_size = len(quasar_pca_grid)
    assert size_of_grid_space == actual_size, \
        "Expected size of grid to be {0}, but got {1}".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name, candidates in grid_space.items():
        new_grid_space[name] = candidates + candidates
    print("The new search space: {0}".format(new_grid_space))

    print("Constructing the new grid of nb models...")
    quasar_pca_grid2 = H2OGridSearch(H2OPCA, hyper_params=new_grid_space)
    quasar_pca_grid2.train(x=list(range(1, 23)), training_frame=quasar)
    actual_size2 = len(quasar_pca_grid2)
    assert actual_size == actual_size2, \
        "Expected duplicates to be ignored. Without dups grid size: {0}. With dups size: {1}".format(
            actual_size, actual_size2)

    for pca_model in quasar_pca_grid2:
        assert isinstance(pca_model, H2OPCA)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in grid_space:
        print(name)
        pyunit_utils.expect_model_param(quasar_pca_grid, name, grid_space[name])
def train_models(self):
    """Train the seeded binomial GAM grid search, then the manual GAM models.

    The grid is stored on ``self.h2o_model``. The grid and the models in
    ``self.manual_gam_models`` are all trained with ``self.myX``, ``self.myY``
    and ``self.h2o_data``.
    """
    base_gam = H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True, seed=1234)
    self.h2o_model = H2OGridSearch(
        base_gam,
        hyper_params=self.hyper_parameters,
        search_criteria=self.search_criteria,
    )
    self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    for manual_gam in self.manual_gam_models:
        manual_gam.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
def validacion_r(modelo, hyper_parameters, datos, variables, semilla=1234):
    """Grid-search ``modelo`` on a 70% split of ``datos[0]`` and return the results.

    H2O is initialised with ``max_mem_size=14``. The first element of ``datos``
    is converted to an H2OFrame, passed through ``tipificar_h2o`` and split with
    ratio 0.7. The grid is trained on the first split with ``"Tendencia"`` as
    the response. All H2O objects are removed before returning.

    :param modelo: estimator handed to ``H2OGridSearch``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :param datos: sequence of datasets; element 0 is the training data, and the
        whole sequence is forwarded to ``procesamiento_resultados_binario``.
    :param variables: predictor columns passed as ``x``.
    :param semilla: seed used for the frame split.
    :return: whatever ``procesamiento_resultados_binario`` returns for the grid,
        the splits and ``datos``.
    """
    h2o.init(max_mem_size=14)

    frame = h2o.H2OFrame(datos[0])
    tipificar_h2o(frame)
    splits = frame.split_frame(ratios=[0.7], seed=semilla)

    grid = H2OGridSearch(modelo, hyper_params=hyper_parameters)
    grid.train(x=variables, y="Tendencia", training_frame=splits[0])

    resultados = procesamiento_resultados_binario(grid, splits, datos)
    h2o.remove_all()
    return resultados
def test4_glm_random_grid_search_metric(self, metric_name, bigger_is_better):
    """
    Test the metric-based early stopping condition of the randomized grid search.

    A random stopping tolerance and a random number of stopping rounds are drawn,
    the grid is trained, and pyunit_utils.evaluate_metrics_stopping decides
    whether the search stopped correctly. Note that this method adds a
    max_runtime_secs entry to self.hyper_params.

    :param metric_name: metric we want to use to test the last stopping condition
    :param bigger_is_better: higher metric value indicates better model performance

    :return: None
    """
    print(
        "*******************************************************************************************"
    )
    print("test4_glm_random_grid_search_metric using " + metric_name +
          " for family " + self.family)
    h2o.cluster_info()

    search_criteria = {
        "strategy": "RandomDiscrete",
        "stopping_metric": metric_name,
        "stopping_tolerance": random.uniform(1e-8, self.max_tolerance),
        "stopping_rounds": random.randint(1, self.max_stopping_rounds),
        "seed": round(time.time())
    }
    print("GLM Gaussian grid search_criteria: {0}".format(search_criteria))

    # add max_runtime_secs back into hyper-parameters to limit model runtime.
    self.hyper_params["max_runtime_secs"] = [0.3]  # arbitrarily set to 0.3 second

    # fire off random grid-search
    base_glm = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    grid_model = H2OGridSearch(base_glm,
                               hyper_params=self.hyper_params,
                               search_criteria=search_criteria)
    grid_model.train(x=self.x_indices,
                     y=self.y_index,
                     training_frame=self.training1_data)

    # bool indicating if randomized grid search has calculated the early stopping condition correctly
    stopped_correctly = pyunit_utils.evaluate_metrics_stopping(
        grid_model.models, metric_name, bigger_is_better, search_criteria,
        self.possible_number_models)

    if not stopped_correctly:
        self.test_failed += 1
        self.test_failed_array[self.test_num] = 1
        print("test4_glm_random_grid_search_metric " + metric_name + ": failed. ")
    else:
        print("test4_glm_random_grid_search_metric " + metric_name + ": passed. ")

    self.test_num += 1
def grid_search(estimator, hparams=None, search_criteria=None):
    """
    Create a grid search estimator.

    The estimator and the hyper-parameter dict are printed before the grid is
    built.

    :param estimator: a specific estimator for grid search
    :param hparams: a hyper parameters dict for grid search
    :param search_criteria: criteria for grid search
    :return: a constructed grid search estimator
    """
    # print(...) with a single argument gives the same output as the Python 2
    # print statement and is also valid Python 3.
    print(estimator)
    print(hparams)
    return H2OGridSearch(model=estimator,
                         hyper_params=hparams,
                         search_criteria=search_criteria)
def glm_grid(X, y, train, valid, should_submit = False):
    """ Wrapper function for penalized GLM with alpha and lambda search.

    Trains a gaussian GLM grid over ``alpha`` with ``lambda_search=True``, takes
    the first model returned by ``grid.get_grid()``, prints a preview of its
    predictions on ``valid`` and logs a ranked-predictions plot through the
    ``logger`` object.

    :param X: List of inputs.
    :param y: Name of target variable.
    :param train: Name of training H2OFrame.
    :param valid: Name of validation H2OFrame.
    :param should_submit: Currently has no effect; the submission step it was
        meant to guard is commented out below.
    :return: Best H2Omodel from H2OGeneralizedLinearEstimator (the first model
        of ``grid.get_grid()`` -- presumably H2O's default ordering; confirm).
    """
    alpha_opts = [0.01, 0.25, 0.5, 0.99]  # always keep some L2
    hyper_parameters = {"alpha": alpha_opts}

    # initialize grid search
    grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(
            family="gaussian",
            lambda_search=True,
            seed=12345),
        hyper_params=hyper_parameters)

    # train grid
    grid.train(y=y,
               x=X,
               training_frame=train,
               validation_frame=valid)

    # show grid search results
    print(grid.show())

    best = grid.get_grid()[0]
    print(best)

    # TODO: the submission step is disabled; re-enable it behind should_submit.
    # if should_submit:
    #     sub_frame = testHF[ID_VAR].cbind(best.predict(testHF))
    #     print(sub_frame.col_names)
    #     print('Submission frame preview:')
    #     print(sub_frame[0:10, [ID_VAR, 'predict']])
    #     upload_submission(sub_frame,'predict')

    # preview of the target next to the predictions on the validation frame
    print('yhat_frame')
    yhat_frame = valid.cbind(best.predict(valid))
    print(yhat_frame[0:10, [y, 'predict']])

    # plot sorted predictions
    yhat_frame_df = yhat_frame[[y, 'predict']].as_data_frame()
    yhat_frame_df.sort_values(by='predict', inplace=True)
    yhat_frame_df.reset_index(inplace=True, drop=True)
    # not named `plt`, so the conventional matplotlib.pyplot alias is not shadowed
    ranked_plot = yhat_frame_df.plot(title='Ranked Predictions Plot')
    logger.log_string('Ranked Predictions Plot')
    logger.log_matplotlib_plot(ranked_plot)

    # select best model
    return best
def xgboost_start(grid_id, export_dir, train, params, hyper_parameters):
    """Launch an XGBoost grid search through ``grid.start`` and return the grid.

    :param grid_id: id given to the grid.
    :param export_dir: directory passed to the grid as ``recovery_dir``.
    :param train: training frame; columns from index 2 onwards are ``x`` and
        the ``"Angaus"`` column is ``y``.
    :param params: extra keyword arguments forwarded to ``grid.start``.
    :param hyper_parameters: hyper-parameter space of the grid.
    :return: the ``H2OGridSearch`` object on which ``start`` was called.
    """
    grid_options = dict(
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
        parallelism=2,
    )
    xgb_grid = H2OGridSearch(H2OXGBoostEstimator, **grid_options)
    predictor_idx = list(range(2, train.ncol))
    xgb_grid.start(x=predictor_idx, y="Angaus", training_frame=train, **params)
    return xgb_grid
def kmeans_grid_iris():
    """Build K-Means grids on the iris data and validate them.

    Checks performed:
      * the grid holds one model per combination of the random grid space;
      * doubling every value list of the space does not change the grid size;
      * every model of the second grid is an ``H2OKMeansEstimator``;
      * the hyper-parameters passed to the grid appear on the built models.
    """
    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    grid_space = pyunit_utils.make_random_grid_space(algo="km")
    print("Grid space: {0}".format(grid_space))

    print("Constructing grid of Kmeans models")
    iris_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
    iris_grid.train(x=list(range(4)), training_frame=iris_h2o)

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
    size_of_grid_space = 1
    for candidates in grid_space.values():
        size_of_grid_space *= len(candidates)
    actual_size = len(iris_grid)
    assert size_of_grid_space == actual_size, \
        "Expected size of grid to be {0}, but got {1}".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name, candidates in grid_space.items():
        new_grid_space[name] = candidates + candidates
    print("The new search space: {0}".format(new_grid_space))

    print("Constructing the new grid of glm models...")
    iris_grid2 = H2OGridSearch(H2OKMeansEstimator, hyper_params=new_grid_space)
    iris_grid2.train(x=list(range(4)), training_frame=iris_h2o)
    actual_size2 = len(iris_grid2)
    assert actual_size == actual_size2, \
        "Expected duplicates to be ignored. Without dups grid size: {0}. With dups size: {1}".format(
            actual_size, actual_size2)

    for km_model in iris_grid2:
        assert isinstance(km_model, H2OKMeansEstimator)

    print(
        "Check that the hyper_params that were passed to grid, were used to construct the models..."
    )
    for name in grid_space:
        print(name)
        pyunit_utils.expect_model_param(iris_grid, name, grid_space[name])
def init_model(self):
    """Create the four random-search grids (GBM, DRF, GLM, deep learning).

    Each grid uses ``self.search_criteria`` and its own ``self.hyper_p_*``
    space. The GBM, DRF and deep-learning base estimators get a ``model_id``
    made of a fixed prefix plus a random integer from 0 to 100. The GLM base
    estimator has no explicit ``model_id``.
    """
    def _tagged(prefix):
        # one draw from 0..100 appended to the prefix
        return prefix + str(random.sample(list(range(101)), 1)[0])

    gbm_base = H2OGradientBoostingEstimator(
        model_id=_tagged('gbm_rand_grid'),
        nfolds=5,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        stopping_rounds=10,
        score_tree_interval=1)
    self.gbm_rand_grid = H2OGridSearch(
        gbm_base,
        search_criteria=self.search_criteria,
        hyper_params=self.hyper_p_gbm)

    drf_base = H2ORandomForestEstimator(
        model_id=_tagged('drf_rand_grid'),
        seed=1234,
        nfolds=5,
        fold_assignment="Modulo",
        balance_classes=True,
        keep_cross_validation_predictions=True)
    self.drf_rand_grid = H2OGridSearch(
        drf_base,
        search_criteria=self.search_criteria,
        hyper_params=self.hyper_p_drf)

    glm_base = H2OGeneralizedLinearEstimator(
        family="gaussian",
        nfolds=5,
        seed=1234,
        max_iterations=30,
        keep_cross_validation_predictions=True,
        compute_p_values=False)
    self.glm_rand_grid = H2OGridSearch(
        glm_base,
        search_criteria=self.search_criteria,
        hyper_params=self.hyper_p_glm)

    dnn_base = H2ODeepLearningEstimator(
        model_id=_tagged('dnn_rand_grid'),
        seed=1234,
        nfolds=5,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True)
    self.dnn_rand_grid = H2OGridSearch(
        dnn_base,
        search_criteria=self.search_criteria,
        hyper_params=self.hyper_p_dnn)
def grid_parallel():
    """Train a GBM grid with ``parallelism=1`` on iris and check the model count.

    The grid must produce exactly one model per ``ntrees`` option.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    ntrees_opts = [1, 5]
    hyper_parameters = OrderedDict([("ntrees", ntrees_opts)])
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             parallelism=1)
    gbm_grid.train(x=list(range(4)), y=4, training_frame=iris)

    assert gbm_grid is not None
    assert len(gbm_grid.model_ids) == len(ntrees_opts)
def test1_glm_random_grid_search_model_number(self, metric_name):
    """
    Check that a randomized grid search without a stopping condition builds
    every model of the hyper-parameter space.

    The number of models in the trained grid is compared with
    self.possible_number_models. Afterwards the total grid run time is stored in
    self.max_grid_runtime.

    :param metric_name: string to denote what grid search model should be sort by
        (not used in the body of this method)

    :return: None
    """
    print(
        "*******************************************************************************************"
    )
    print("test1_glm_random_grid_search_model_number for GLM " + self.family)
    h2o.cluster_info()

    # set up our stopping condition here, random discrete and find all models
    search_criteria = {
        'strategy': 'RandomDiscrete',
        "stopping_rounds": 0,
        "seed": round(time.time())
    }
    print("GLM Binomial grid search_criteria: {0}".format(search_criteria))

    # fire off random grid-search
    base_glm = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    random_grid_model = H2OGridSearch(base_glm,
                                      hyper_params=self.hyper_params,
                                      search_criteria=search_criteria)
    random_grid_model.train(x=self.x_indices,
                            y=self.y_index,
                            training_frame=self.training1_data)

    # compare the number of models built with the number of possible models
    if len(random_grid_model) != self.possible_number_models:
        self.test_failed += 1
        self.test_failed_array[self.test_num] = 1
        print(
            "test1_glm_random_grid_search_model_number for GLM: failed, number of models generated"
            "possible model number {0} and randomized gridsearch model number {1} are not "
            "equal.".format(self.possible_number_models, len(random_grid_model)))

    if self.test_failed_array[self.test_num] == 0:
        print("test1_glm_random_grid_search_model_number for GLM: passed!")

    self.test_num += 1
    sys.stdout.flush()

    # set max_grid_runtime to the total run time needed to build all models
    self.max_grid_runtime = pyunit_utils.find_grid_runtime(random_grid_model.models)
def test_rf_gridsearch_sorting_metrics(self):
    """
    test_rf_gridsearch_sorting_metrics performs the following:
    b. build H2O random forest models using grid search.  No model is built for bad hyper-parameters
       values.  We should instead get a warning/error message printed out.
    c. Check and make sure that the models are returned sorted with the correct cross-validation metrics.

    The test only runs when self.possible_number_models > 0.  For every model of the grid, the value in
    the grid summary table (column self.training_metric) is compared with the model's cross-validation
    logloss and with its training logloss.  self.test_failed is set to 1 when the check fails.
    """
    if self.possible_number_models > 0:
        print("*******************************************************************************************")
        print("test_rf_gridsearch_sorting_metrics for random forest ")
        h2o.cluster_info()

        print("Hyper-parameters used here is {0}".format(self.final_hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2ORandomForestEstimator(nfolds=self.nfolds, seed=self.seed,
                                                            score_tree_interval=0),
                                   hyper_params=self.final_hyper_params)
        grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        # summary table of the grid; indexed below by metric column, then by model position
        result_table = grid_model._grid_json["summary_table"]
        model_index = 0
        grid_model_metrics = []  # summary-table metric of every model, in grid order

        diff = 0  # calculate difference between gridsearch model metrics and manually extracted model.
        diff_train = 0  # calculate difference between training and cross-validation metrics

        # grab performance metric for each model of grid_model and collect correct sorting metrics by hand
        for each_model in grid_model:
            grid_model_metric = float(result_table[self.training_metric][model_index])
            grid_model_metrics.append(grid_model_metric)

            manual_metric = each_model._model_json["output"]["cross_validation_metrics"]._metric_json["logloss"]
            # NOTE(review): the first operand tests the type of the list grid_model_metrics, not of the
            # scalar grid_model_metric, so it is always true -- confirm which one was intended.
            # NOTE(review): `unicode` is not a Python 3 builtin; presumably it is provided by a
            # compatibility import elsewhere in the file -- verify.
            if not(type(grid_model_metrics) == unicode) and not(type(manual_metric)==unicode):
                diff += abs(grid_model_metric - manual_metric)

            manual_training_metric = each_model._model_json["output"]["training_metrics"]._metric_json["logloss"]
            if not(type(grid_model_metrics) == unicode) and not(type(manual_training_metric)==unicode):
                diff_train += abs(grid_model_metric-manual_training_metric)

            print("grid model logloss: {0}, grid model training logloss: "
                  "{1}".format(grid_model_metric, manual_training_metric))

            model_index += 1

        # Fail when the table disagrees with the cross-validation logloss, when the table values are not
        # in ascending order, or when the accumulated gap to the training logloss is below self.diff.
        # NOTE(review): the last condition presumably guards against the table reporting training
        # metrics instead of cross-validation metrics -- confirm.
        if (diff > self.diff) or not(grid_model_metrics == sorted(grid_model_metrics)) or (diff_train < self.diff):
            self.test_failed = 1
            print("test_rf_gridsearch_sorting_metrics for random forest has failed!")

        if self.test_failed == 0:
            print("test_rf_gridsearch_sorting_metrics for random forest has passed!")
def train_models(self):
    """Train the multinomial GAM grid (fixed gam columns), then the manual GAM models.

    The grid is stored on ``self.h2o_model``. The grid and the models in
    ``self.manual_gam_models`` are all trained with ``self.myX``, ``self.myY``
    and ``self.h2o_data``.
    """
    base_gam = H2OGeneralizedAdditiveEstimator(
        family="multinomial",
        gam_columns=["C6", "C7", "C8"],
        keep_gam_cols=True,
        scale=[1, 1, 1],
        num_knots=[5, 5, 5],
    )
    self.h2o_model = H2OGridSearch(base_gam, self.hyper_parameters)
    self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    for manual_gam in self.manual_gam_models:
        manual_gam.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
def test_gridsearch():
    """Check that two notations for ``gam_columns`` give identical GAM grids.

    Two Cartesian grids are trained with hyper-parameter spaces that differ
    only in the second subspace: the first wraps every gam column in its own
    list (``["c_0"]``), the second gives single columns as bare strings
    (``"c_0"``). The models at the same position in the two grids must have
    equal coefficients (tolerance 1e-6).
    """
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    for factor_col in ('response', 'C3', 'C7', 'C8', 'C10'):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()

    myY = "response"
    # list.remove() works in place and returns None, so `names.remove(myY)`
    # must not be used as the value of myX; build the predictor list instead.
    myX = [name for name in h2o_data.names if name != myY]

    search_criteria = {'strategy': 'Cartesian'}

    def _space(three_smoother_gam_columns):
        # Both spaces are identical except for the gam_columns of the second subspace.
        return {'lambda': [1, 2],
                'subspaces': [{'scale': [[0.001], [0.0002]],
                               'num_knots': [[5], [10]],
                               'bs': [[1], [0]],
                               'gam_columns': [[["c_0"]], [["c_1"]]]},
                              {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
                               'bs': [[1, 1, 1], [0, 1, 1]],
                               'num_knots': [[5, 10, 12], [6, 11, 13]],
                               'gam_columns': three_smoother_gam_columns}]}

    hyper_parameters = _space(
        [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
         [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]])
    hyper_parameters2 = _space(
        [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
         ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]])

    h2o_model = H2OGridSearch(
        H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
        hyper_params=hyper_parameters, search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)

    h2o_model2 = H2OGridSearch(
        H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
        hyper_params=hyper_parameters2, search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)

    # the pairwise comparison below is only meaningful for grids of equal size
    assert len(h2o_model) == len(h2o_model2), \
        "expected both grids to have the same number of models, got {0} and {1}".format(
            len(h2o_model), len(h2o_model2))

    # compare two models by checking their coefficients.  They should be the same
    for index in range(len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(), model2.coef(), tol=1e-6)