def benign_grid():
    """Basic GLM grid-search workflow on the benign dataset.

    Trains a binomial GLM grid over alpha/lambda, inspects and sorts the
    results, predicts with the best model and with the grid itself, and
    re-fetches the grid from the back end by its id.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    # Wrap in list(): in Python 3 range() is lazy and does not support `+`.
    X = list(range(3)) + list(range(4, 11))

    # 'a' is deliberately not a float — exercises bad-value handling.
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()

    # Sort descending by F1 and pull the best model's id.
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    # Round-trip: fetch the same grid back from the server by id.
    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters, gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))

    assert best_model.params['family']['actual'] == 'binomial'
def benign_grid():
    """GLM grid search over alpha/lambda on benign.csv.

    Verifies training, show/sort, best-model retrieval and prediction,
    grid prediction, hyperparameter lookup, and re-fetching the grid
    from the back end via H2OGridSearch.get_grid.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    # list(range(...)) so the concatenation works on Python 3, where
    # range objects cannot be added.
    X = list(range(3)) + list(range(4, 11))

    # 'a' is deliberately not a float — exercises bad-value handling.
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()

    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    # Round-trip: the grid should be retrievable by its id.
    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters, gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))

    assert best_model.params['family']['actual'] == 'binomial'
def iris_gbm_grid():
    """Cartesian GBM grid on iris: check grid contents and back-end sorting.

    Builds a 2x3 hyperparameter space, confirms every combination was
    trained exactly once, then verifies that server-side sorting by r2
    (desc and asc) agrees with the locally sorted grid.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM over the full Cartesian hyperparameter space.
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.01, .05]
    size_of_hyper_space = len(ntrees_opts) * len(learn_rate_opts)
    hyper_parameters = OrderedDict([("learn_rate", learn_rate_opts),
                                    ("ntrees", ntrees_opts)])
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=iris)
    print("\nsorted by mse: ")
    print(gs.sort_by("mse"))
    #print gs.hit_ratio_table()

    # Iterating the grid must yield GBM models, one per combination.
    for fitted in gs:
        assert isinstance(fitted, H2OGradientBoostingEstimator)
    assert len(gs) == size_of_hyper_space

    # Each trained model's (learn_rate, ntrees) combo appears exactly once:
    # remove combos as they are seen so duplicates would fail the `in` check.
    total_grid_space = list(map(list, itertools.product(*list(hyper_parameters.values()))))
    print(str(total_grid_space))
    for fitted in gs.models:
        combo = [fitted.parms['learn_rate']['actual_value'],
                 fitted.parms['ntrees']['actual_value']]
        assert combo in total_grid_space, "combo: " + str(combo) + "; total_grid_space=" + str(total_grid_space)
        total_grid_space.remove(combo)

    # test back-end sorting of model metrics:
    locally_sorted = gs.sort_by("r2", H2OGridSearch.DESC)
    remotely_sorted_desc = H2OGridSearch.get_grid(
        H2OGradientBoostingEstimator(distribution='multinomial'),
        hyper_parameters, gs.grid_id, sort_by='r2', sort_order='desc')
    assert len(locally_sorted.cell_values) == len(remotely_sorted_desc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
    for i in range(len(remotely_sorted_desc.model_ids)):
        assert locally_sorted.cell_values[i][0] == remotely_sorted_desc.model_ids[i], "Expected back-end sort by r2 to be the same as locally-sorted: " + str(i)

    # Ascending server-side order must be the exact reverse of descending.
    remotely_sorted_asc = H2OGridSearch.get_grid(
        H2OGradientBoostingEstimator(distribution='multinomial'),
        hyper_parameters, gs.grid_id, sort_by='r2', sort_order='asc')
    for fitted in remotely_sorted_asc:
        assert isinstance(fitted, H2OGradientBoostingEstimator)
    assert len(locally_sorted.cell_values) == len(remotely_sorted_asc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
    length = len(remotely_sorted_asc.model_ids)
    for i in range(length):
        assert locally_sorted.cell_values[i][0] == remotely_sorted_asc.model_ids[length - i - 1], "Expected back-end sort by r2, ascending, to be the reverse as locally-sorted ascending: " + str(i)
def benign_grid():
    """Exercise GLM grid search on the benign dataset.

    Covers bad hyperparameter values, grid iteration, F1 sorting,
    prediction from both the best model and the grid, retrieval of the
    grid by id, and RandomDiscrete search_criteria with max_models.
    """
    frame = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    response = 3
    predictors = list(range(3)) + list(range(4, 11))

    # NOTE: this tests bad parameter value handling; 'a' is not a float:
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                         hyper_parameters)
    grid.train(x=predictors, y=response, training_frame=frame)

    # Iterating a grid yields the fitted estimator objects.
    for fitted in grid:
        assert isinstance(fitted, H2OGeneralizedLinearEstimator)

    grid.show()
    print(grid.sort_by('F1', False))
    top_id = grid.sort_by('F1', False)['Model Id'][0]
    top_model = h2o.get_model(top_id)
    top_model.predict(frame)
    grid.predict(frame)
    print(grid.get_hyperparams(top_id))
    print(grid.grid_id)

    # Round-trip: fetch the same grid back from the server by id.
    fetched = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters, grid.grid_id)
    fetched.show()
    print(fetched.grid_id)
    print(fetched.sort_by('F1', False))

    assert top_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing
    search_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    max_models_g = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters, search_criteria=search_criteria)
    max_models_g.train(x=predictors, y=response, training_frame=frame)
    max_models_g.show()
    print(max_models_g.grid_id)
    print(max_models_g.sort_by('F1', False))
    assert len(max_models_g.models) == 3, "expected 3 models, got: {}".format(
        len(max_models_g.models))
def benign_grid():
    """GLM grid search on benign.csv, including search_criteria plumbing.

    Trains a binomial GLM grid, checks iteration/sorting/prediction and
    grid retrieval by id, then verifies that a RandomDiscrete search with
    max_models=3 stops after exactly 3 models.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    X = list(range(3)) + list(range(4, 11))

    # NOTE: this tests bad parameter value handling; 'a' is not a float:
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    for model in gs:
        assert isinstance(model, H2OGeneralizedLinearEstimator)
    gs.show()
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    # Round-trip: fetch the same grid back from the server by id.
    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters, gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))
    assert best_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing
    # FIX: H2O's grid search strategies are "Cartesian" and "RandomDiscrete";
    # "Random" is not a valid strategy name (sibling tests use RandomDiscrete).
    search_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    max_models_g = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters, search_criteria=search_criteria)
    max_models_g.train(x=X, y=Y, training_frame=training_data)
    max_models_g.show()
    print(max_models_g.grid_id)
    print(max_models_g.sort_by('F1', False))
    assert len(max_models_g.models) == 3, "expected 3 models, got: {}".format(
        len(max_models_g.models))