# Consolidated imports for the snippets below. Each function originally lived
# in its own module; import paths (notably `tests.pyunit_utils`) follow the
# h2o-3 test-suite layout and may differ in other projects.
import glob
import itertools
import json
import os
import shutil
import tempfile
import zipfile
from collections import OrderedDict

import numpy as np
import pandas as pd
from past.utils import old_div
from sklearn import ensemble
from sklearn.datasets import load_diabetes
from sklearn.metrics import roc_auc_score

import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.exceptions import H2OResponseError
from h2o.grid.grid_search import H2OGridSearch
from h2o.targetencoder import TargetEncoder  # deprecated TE API used by one snippet
from tests import pyunit_utils

pu = pyunit_utils  # one snippet uses this alias
# Snippet-specific helpers (e.g. split_data, columns_data_type,
# reset_model_threshold, assert_equals, _train_classifier, H2OMojoWrapper,
# dump_data_and_model, _make_mojo, _train_test_split_as_frames) are defined
# in their original modules and are not reproduced here.


def infer_uses_defaults_when_base_model_doesnt_support_distributions_test():
    train = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x_reg = train.columns
    y_reg = "petal_wid"
    x_reg.remove(y_reg)
    nfolds = 2

    glm_reg = H2OGeneralizedLinearEstimator(nfolds=nfolds,
                                            fold_assignment="Modulo",
                                            keep_cross_validation_predictions=True,
                                            family="tweedie")
    glm_reg.train(x=x_reg, y=y_reg, training_frame=train)

    gbm_reg = H2OGradientBoostingEstimator(nfolds=nfolds,
                                           fold_assignment="Modulo",
                                           keep_cross_validation_predictions=True,
                                           distribution="tweedie")
    gbm_reg.train(x=x_reg, y=y_reg, training_frame=train)

    drf_reg = H2ORandomForestEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    drf_reg.train(x=x_reg, y=y_reg, training_frame=train)

    # Both base models use tweedie => the metalearner should infer tweedie.
    se_reg_0 = H2OStackedEnsembleEstimator(training_frame=train,
                                           validation_frame=test,
                                           base_models=[glm_reg, gbm_reg],
                                           metalearner_algorithm="gbm")
    se_reg_0.train(x_reg, y_reg, train)
    assert se_reg_0.metalearner().actual_params.get("distribution") == "tweedie", \
        "Expected distribution {} but got {}".format(
            "tweedie", se_reg_0.metalearner().actual_params.get("distribution"))

    # DRF does not support distributions => the metalearner falls back to gaussian.
    se_reg_1 = H2OStackedEnsembleEstimator(training_frame=train,
                                           validation_frame=test,
                                           base_models=[glm_reg, gbm_reg, drf_reg],
                                           metalearner_algorithm="gbm")
    se_reg_1.train(x_reg, y_reg, train)
    assert se_reg_1.metalearner().actual_params.get("distribution") == "gaussian", \
        "Expected distribution {} but got {}".format(
            "gaussian", se_reg_1.metalearner().actual_params.get("distribution"))

    # The same fallback applies regardless of base-model ordering.
    se_reg_2 = H2OStackedEnsembleEstimator(training_frame=train,
                                           validation_frame=test,
                                           base_models=[drf_reg, glm_reg, gbm_reg],
                                           metalearner_algorithm="gbm")
    se_reg_2.train(x_reg, y_reg, train)
    assert se_reg_2.metalearner().actual_params.get("distribution") == "gaussian", \
        "Expected distribution {} but got {}".format(
            "gaussian", se_reg_2.metalearner().actual_params.get("distribution"))

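# H2O pyunit tests are normally registered with a small runner stanza at the
# end of each test file; a minimal sketch for the test above, assuming the
# standard h2o-3 harness (each snippet in this section would carry its own):
if __name__ == "__main__":
    pyunit_utils.standalone_test(
        infer_uses_defaults_when_base_model_doesnt_support_distributions_test)
else:
    infer_uses_defaults_when_base_model_doesnt_support_distributions_test()
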
def plot_test():
    air = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Construct train and validation sets by sampling (80/20)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier",
           "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                           ntrees=100,
                                           max_depth=3,
                                           learn_rate=0.01)
    air_gbm.train(x=myX, y=myY, training_frame=air_train, validation_frame=air_valid)

    # Plot ROC and PR curves for the train set
    perf_train = air_gbm.model_performance(train=True)
    perf_train.plot(type="roc", server=True)
    perf_train.plot(type="pr", server=True)

    # Plot ROC and PR curves for the validation set
    perf_valid = air_gbm.model_performance(valid=True)
    perf_valid.plot(type="roc", server=True)
    perf_valid.plot(type="pr", server=True)

    # Plot ROC and PR curves for the test set
    air_test = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf_test = air_gbm.model_performance(air_test)
    perf_test.plot(type="roc", server=True)
    perf_test.plot(type="pr", server=True)

    # Test saving plots to a file
    fn = "curve_plot.png"
    perf_test.plot(type="roc", server=False, save_to_file=fn)
    if os.path.isfile(fn):
        os.remove(fn)
    perf_test.plot(type="pr", server=False, save_to_file=fn)
    if os.path.isfile(fn):
        os.remove(fn)

    # Test plot=False, which returns the raw curve data instead of plotting
    (fprs, tprs) = perf_test.plot(type="roc", server=True, plot=False)
    assert len(fprs) == len(tprs), \
        "Expected fprs and tprs to have the same shape but they do not."
    (recalls, precisions) = perf_test.plot(type="pr", server=True, plot=False)
    assert len(recalls) == len(precisions), \
        "Expected recalls and precisions to have the same shape but they do not."

def GBM_impute(data, columnsArray, rm_cols):
    # Drop metadata rows whose (sanitized) display name is not an actual column.
    # Note: the pattern is written as a character class here; the original used
    # the bare sequence r'!@#$%^&*.', which only matched that literal run.
    columnsArray_ind = []
    c = list(filter(
        lambda x: x not in list(data.columns),
        list(columnsArray['columnDisplayName'].replace(r'[!@#$%^&*.]', " ", regex=True))))
    for j in c:
        a = columnsArray.index[
            columnsArray['columnName'].replace(r'[!@#$%^&*.]', " ", regex=True) == j].values
        columnsArray.drop(columnsArray.index[a], inplace=True)
    columnsArray.index = range(columnsArray.shape[0])

    for i in columnsArray['columnName']:
        if i in rm_cols:
            columnsArray_ind.append(list(columnsArray[columnsArray['columnName'] == i].index)[0])
    columnsArray_ind1 = set(columnsArray.index) - set(columnsArray_ind)
    columnsArray_edit = columnsArray.iloc[list(columnsArray_ind1)]

    # Select observations without NAs, build an H2O frame, and split for model training
    data_clean = data.dropna()
    hf = h2o.H2OFrame(data_clean)
    train, valid, test = hf.split_frame(ratios=[.8, .1])

    # Select observations with NAs
    data_na_index = list(set(data.index) - set(data_clean.index))
    data_na = data.iloc[data_na_index]
    model_accuracy = []
    print("Number of rows with missing values: " + str(len(data_na)))

    gbm = H2OGradientBoostingEstimator()
    for i in range(len(data_na)):
        # Columns that are missing in this row become prediction targets
        y_set = set(data_na.iloc[i].index) - set(data_na.iloc[i].dropna().index)
        for yValue in y_set:
            xValues = set(data_na.columns) - y_set
            gbm.train(x=list(xValues), y=yValue,
                      training_frame=train, validation_frame=valid)
            model_accuracy.append(gbm.r2())
            print(yValue)

            # Predict the missing values with the GBM model
            test_na = data_na
            test_na, columnsArray_edit = columns_data_type(test_na, columnsArray_edit)
            test_na = test_na.drop(yValue, axis=1)
            test_na = h2o.H2OFrame(test_na)
            predicted = gbm.predict(test_na).as_data_frame()

            # Fill only the rows where the target column is actually missing
            for j in range(data_na.shape[0]):
                if np.isnan(data_na[yValue].iloc[j]):
                    data_na.iloc[j, data_na.columns.get_loc(yValue)] = predicted['predict'][j]

    acc = np.mean(model_accuracy)
    df = pd.concat([data_clean, data_na], axis=0)
    df.sort_index(axis=0, inplace=True)
    return df, acc

def test_h2o_classifier_multi_int(self):
    gbm = H2OGradientBoostingEstimator(ntrees=9, max_depth=5)
    mojo_path, test_data = _train_classifier(gbm, 9, is_str=False)
    onnx_model = _convert_mojo(mojo_path)
    self.assertIsNot(onnx_model, None)
    dump_data_and_model(
        test_data,
        H2OMojoWrapper(mojo_path),
        onnx_model,
        basename="H2OClassMultiBin")

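# A plausible sketch of the `_convert_mojo` helper shared by the ONNX tests in
# this section, assuming onnxmltools' H2O converter; the real helper lives in
# the original test module and may differ.
import onnxmltools

def _convert_mojo(mojo_path):
    # convert_h2o accepts a path to a MOJO zip and returns an ONNX model
    return onnxmltools.convert_h2o(mojo_path)
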
def frameslice_gbm():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate = prostate[1:9]

    model = H2OGradientBoostingEstimator()
    # list(...) needed on Python 3, where range() is not a list
    model.train(x=list(range(1, 8)), y=0, training_frame=prostate)

def test_h2o_classifier_multi_2class(self):
    gbm = H2OGradientBoostingEstimator(ntrees=7,
                                       max_depth=5,
                                       distribution="multinomial")
    mojo_path, test_data = _train_classifier(gbm, 2, is_str=True)
    with self.assertRaises(ValueError) as err:
        _convert_mojo(mojo_path)
    self.assertRegexpMatches(err.exception.args[0], "not supported")

def checkpoint_new_category_in_response():
    sv = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    m1 = H2OGradientBoostingEstimator(ntrees=100)
    m1.train(x=[0, 1, 2, 3], y=4, training_frame=sv)

    # Attempt to continue building the model, but with an expanded categorical
    # response domain. This should fail.
    try:
        m2 = H2OGradientBoostingEstimator(ntrees=200, checkpoint=m1.model_id)
        m2.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
        assert False, "Expected continued model-building to fail with new categories introduced in response"
    except EnvironmentError:
        pass

def iris_gbm_grid():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run a GBM grid search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.01, .05]
    size_of_hyper_space = len(ntrees_opts) * len(learn_rate_opts)

    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    print("\nsorted by mse: ")
    print(gs.sort_by("mse"))
    # print(gs.hit_ratio_table())

    for model in gs:
        assert isinstance(model, H2OGradientBoostingEstimator)
    assert len(gs) == size_of_hyper_space

    # Verify that every hyper-parameter combination was built exactly once
    total_grid_space = list(map(list, itertools.product(*list(hyper_parameters.values()))))
    print(str(total_grid_space))
    for model in gs.models:
        combo = [model.parms['learn_rate']['actual_value'], model.parms['ntrees']['actual_value']]
        assert combo in total_grid_space, \
            "combo: " + str(combo) + "; total_grid_space=" + str(total_grid_space)
        total_grid_space.remove(combo)

    # Test back-end sorting of model metrics:
    locally_sorted = gs.sort_by("r2", H2OGridSearch.DESC)
    remotely_sorted_desc = H2OGridSearch.get_grid(
        H2OGradientBoostingEstimator(distribution='multinomial'),
        hyper_parameters, gs.grid_id, sort_by='r2', sort_order='desc')
    assert len(locally_sorted.cell_values) == len(remotely_sorted_desc.model_ids), \
        "Expected locally sorted and remotely sorted grids to have the same number of models"
    for i in range(len(remotely_sorted_desc.model_ids)):
        assert locally_sorted.cell_values[i][0] == remotely_sorted_desc.model_ids[i], \
            "Expected back-end sort by r2 to be the same as locally-sorted: " + str(i)

    remotely_sorted_asc = H2OGridSearch.get_grid(
        H2OGradientBoostingEstimator(distribution='multinomial'),
        hyper_parameters, gs.grid_id, sort_by='r2', sort_order='asc')
    for model in remotely_sorted_asc:
        assert isinstance(model, H2OGradientBoostingEstimator)
    assert len(locally_sorted.cell_values) == len(remotely_sorted_asc.model_ids), \
        "Expected locally sorted and remotely sorted grids to have the same number of models"
    length = len(remotely_sorted_asc.model_ids)
    for i in range(length):
        assert locally_sorted.cell_values[i][0] == remotely_sorted_asc.model_ids[length - i - 1], \
            "Expected back-end sort by r2, ascending, to be the reverse of locally-sorted descending: " + str(i)

def test_h2o_classifier_bin_int(self):
    gbm = H2OGradientBoostingEstimator(ntrees=8, max_depth=5)
    mojo_path, test_data = _train_classifier(gbm, 2, is_str=False, force_y_numeric=True)
    onnx_model = _convert_mojo(mojo_path)
    self.assertIsNot(onnx_model, None)
    dump_data_and_model(
        test_data,
        H2OMojoWrapper(mojo_path),
        onnx_model,
        basename="H2OClassBinInt")

def constant_col_gbm():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    train["constantCol"] = 1

    # GBM should run successfully with a constant response when
    # check_constant_response is set to False
    my_gbm = H2OGradientBoostingEstimator(check_constant_response=False)
    my_gbm.train(x=list(range(1, 5)), y="constantCol", training_frame=train)

def test_reset_threshold():
    """
    Test that the model threshold can be reset. The performance metric should be
    recalculated, and predictions should change based on the new threshold.
    """
    # import data
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/modified_airlines.csv"))

    # convert columns to factors
    airlines["Year"] = airlines["Year"].asfactor()
    airlines["Month"] = airlines["Month"].asfactor()
    airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
    airlines["Cancelled"] = airlines["Cancelled"].asfactor()
    airlines['FlightNum'] = airlines['FlightNum'].asfactor()

    # set the predictor names and the response column name
    predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
                  "DayOfWeek", "Month", "Distance", "FlightNum"]
    response = "IsDepDelayed"

    # split into train and validation sets
    train, valid = airlines.split_frame(ratios=[.8], seed=1234)

    # initialize and train the estimator
    model = H2OGradientBoostingEstimator(seed=1234, ntrees=5)
    model.train(x=predictors, y=response, training_frame=train)
    old_threshold = model._model_json['output']['default_threshold']

    # predict with the original threshold
    preds = model.predict(airlines)

    # reset the threshold and get the old one back
    new_threshold = 0.6917189903082518
    old_returned = reset_model_threshold(model, new_threshold)
    reset_model = h2o.get_model(model.model_id)
    reset_threshold = reset_model._model_json['output']['default_threshold']

    # predict with the reset model
    preds_reset = reset_model.predict(airlines)

    # compare thresholds
    assert old_threshold == old_returned
    assert new_threshold == reset_threshold
    assert reset_threshold != old_threshold

    # compare predictions: only rows whose p1 falls between the two thresholds
    # should change label
    print("old threshold:", old_threshold, "new_threshold:", new_threshold)
    for i in range(airlines.nrow):
        if old_threshold <= preds[i, 2] < new_threshold:
            assert preds[i, 0] != preds_reset[i, 0]
        else:
            assert preds[i, 0] == preds_reset[i, 0]

def titanic_with_te_kfoldstrategy(frame=None, seeds=None):
    sum_of_aucs = 0
    for current_seed in seeds:
        ds = split_data(frame, current_seed)

        targetColumnName = "survived"
        foldColumnName = "kfold_column"
        ds['train'][foldColumnName] = ds['train'].kfold_column(n_folds=5, seed=current_seed)

        teColumns = ["home.dest", "cabin", "embarked"]
        targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                      fold_column=foldColumnName, blending_avg=True,
                                      inflection_point=3, smoothing=1)
        targetEncoder.fit(frame=ds['train'])

        encodedTrain = targetEncoder.transform(frame=ds['train'], holdout_type="kfold", seed=1234)
        encodedValid = targetEncoder.transform(frame=ds['valid'], holdout_type="none", noise=0.0)
        encodedTest = targetEncoder.transform(frame=ds['test'], holdout_type="none", noise=0.0)

        myX = ["pclass", "sex", "age", "sibsp", "parch", "fare",
               "cabin_te", "embarked_te", "home.dest_te"]
        air_model = H2OGradientBoostingEstimator(
            ntrees=1000,
            learn_rate=0.1,
            score_tree_interval=10,
            stopping_rounds=5,
            stopping_metric="AUC",
            stopping_tolerance=0.001,
            distribution="multinomial",  # why is AUC different for quasibinomial and multinomial?
            seed=1234)
        air_model.train(x=myX, y=targetColumnName,
                        training_frame=encodedTrain, validation_frame=encodedValid)
        variable_importance = air_model._model_json['output']['variable_importances'].as_data_frame()
        # print(variable_importance)

        my_gbm_metrics = air_model.model_performance(encodedTest)
        auc = my_gbm_metrics.auc()
        sum_of_aucs += auc
        print("AUC with kfold for seed: " + str(current_seed) + " = " + str(auc))
    return sum_of_aucs / len(seeds)

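# A hypothetical invocation sketch for the function above. `split_data` is an
# external helper (not shown here) assumed to return a dict with 'train',
# 'valid', and 'test' H2OFrames; the titanic path matches the one used by
# another test in this section.
titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
titanic["survived"] = titanic["survived"].asfactor()
mean_auc = titanic_with_te_kfoldstrategy(frame=titanic, seeds=[1234, 5678])
print("mean AUC over seeds:", mean_auc)
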
def _run_builder(key, algorithm, training_dataset_key, y, x, model_type):
    try:
        client = _get_memcached_client()
        contents = get_dataset_contents(training_dataset_key)
        with tempfile.TemporaryDirectory() as tmpdir:
            dataset_path = os.path.join(tmpdir, TRAINING_FILE)
            with open(dataset_path, 'w') as training_file:
                training_file.write(contents)
            h2o.init()
            training_frame = h2o.import_file(dataset_path)

            if algorithm == MLAlgorithm.NAIVE_BAYES:
                # Naive Bayes expects the prediction response to be categorical
                training_frame[y] = training_frame[y].asfactor()
                estimator = H2ONaiveBayesEstimator()
            elif algorithm == MLAlgorithm.GRADIENT_BOOSTING_MACHINE:
                estimator = H2OGradientBoostingEstimator()

            kwargs = {'training_frame': training_frame, 'y': y}
            if x is not None:
                kwargs['x'] = x
            estimator.train(**kwargs)

            temp_folder = os.path.join(os.path.abspath(os.sep), 'tmp')
            if model_type.upper() == 'POJO':
                model_file = estimator.download_pojo(path=temp_folder,
                                                     get_genmodel_jar=True,
                                                     genmodel_name='h2o-genmodel.jar')
            else:
                model_file = estimator.download_mojo(path=temp_folder,
                                                     get_genmodel_jar=True,
                                                     genmodel_name='h2o-genmodel.jar')
            model_performance = estimator.model_performance()
            details = {'mse': model_performance.mse()}

            # Package the model together with the genmodel jar
            # (named zip_file to avoid shadowing the zip builtin)
            with zipfile.ZipFile(os.path.join(temp_folder, key), 'w') as zip_file:
                zip_file.write(model_file, os.path.basename(model_file))
                zip_file.write(os.path.join(temp_folder, 'h2o-genmodel.jar'),
                               'h2o-genmodel.jar')

            client.set(key, json.dumps({
                'status': 'COMPLETE',
                'description': 'Model has been built',
                'details': details,
                'path': model_file
            }))
    except Exception as ex:
        client.set(key, json.dumps({
            'status': 'FAILED',
            'description': str(ex)
        }))
        logger.exception("Building model failed")

def test_gbm_bulk_train():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    # a model will be built for each segment
    segment_col = "RACE"
    # segment 0 is too small and will not produce a model
    bad_segment = 0

    segments = prostate[segment_col].unique()
    segments.rename({'C1': segment_col})

    params = {"min_rows": 2, "ntrees": 4, "seed": 42}
    prostate_gbm = H2OGradientBoostingEstimator(**params)
    models = prostate_gbm.bulk_train(y="CAPSULE", ignored_columns=["ID"],
                                     training_frame=prostate, segments=segments)
    models_list = models.as_frame()
    assert models_list.names == [u'RACE', u'Status', u'Model', u'Errors', u'Warnings']
    assert models_list.nrow == 3

    # check the failed model
    expected_error = 'ERRR on field: _min_rows: The dataset size is too small to split for min_rows=2.0: ' \
                     'must have at least 4.0 (weighted) rows, but have only 3.0.\n'
    assert (models_list["Errors"][models_list[segment_col] == bad_segment]
            ).as_data_frame()["Errors"][0] == expected_error

    mp = models_list.as_data_frame()
    # check the built models: each segment model should match a model trained
    # manually on just that segment
    for i in range(mp.shape[0]):
        segment = int(mp.iloc[i][segment_col])
        if segment != bad_segment:
            model_id = mp.iloc[i]["Model"]
            model = h2o.get_model(model_id)
            prostate_segment = prostate[prostate[segment_col] == segment]
            prostate_gbm_segment = H2OGradientBoostingEstimator(**params)
            prostate_gbm_segment.train(y="CAPSULE", ignored_columns=["ID"],
                                       training_frame=prostate_segment)
            pyunit_utils.check_models(model, prostate_gbm_segment)

def spaces_in_column_names():
    train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/jira/spaces_in_column_names.csv"))
    train_data.show()
    train_data.describe()
    train_data["r e s p o n s e"] = train_data["r e s p o n s e"].asfactor()
    # The spaces inside these column names are intentional; that is what the test exercises.
    X = ["p r e d i c t o r 1", "predictor2", "p r e d i ctor3", "pre d ictor4", "predictor5"]
    gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1)
    gbm.train(x=X, y="r e s p o n s e", training_frame=train_data)
    gbm.show()

def test_stacked_ensemble_is_able_to_use_imported_base_models():
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    drf = H2ORandomForestEstimator(nfolds=nfolds,
                                   fold_assignment="Modulo",
                                   keep_cross_validation_predictions=True)
    drf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, drf.model_id])
    se.train(x=x, y=y, training_frame=train)
    assert len(se.base_models) == 2

    TMP_DIR = tempfile.mkdtemp()
    try:
        # Save the base models and their CV holdout predictions, then wipe the cluster
        h2o.save_model(gbm, TMP_DIR + "/gbm.model")
        h2o.save_model(drf, TMP_DIR + "/drf.model")
        gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id
        drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id
        h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout")
        h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout")
        h2o.remove_all()

        # Re-import everything, keeping the original holdout frame ids
        h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id)
        h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id)
        gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0])
        drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0])

        train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"),
                                "some_other_name_of_training_frame")
        test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"),
                               "some_other_name_of_test_frame")
        x = train.columns
        y = "species"
        x.remove(y)

        # The ensemble should train fine on the re-imported base models
        se_loaded = H2OStackedEnsembleEstimator(training_frame=train,
                                                validation_frame=test,
                                                base_models=[gbm.model_id, drf.model_id])
        se_loaded.train(x=x, y=y, training_frame=train)
        assert len(se_loaded.base_models) == 2
    finally:
        shutil.rmtree(TMP_DIR)

def gbm_predict_contributions_sorting_large():
    fr = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/creditcardfraud/creditcardfraud.csv"))
    m = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    m.train(x=list(range(0, fr.ncol)), y=30, training_frame=fr)
    contributions = m.predict_contributions(fr, top_n=-1, bottom_n=0, compare_abs=False)
    assert_equals(61, contributions.shape[1], "Wrong number of columns")
    assert_equals(284807, contributions.shape[0], "Wrong number of rows")

def create_grid(self):
    """Returns an H2O grid search object."""
    gbm_model = H2OGradientBoostingEstimator(**self.model_params)
    gbm_grid = H2OGridSearch(model=gbm_model,
                             hyper_params=self.hyper_params,
                             grid_id=self.grid_id,
                             search_criteria=self.search_params)
    return gbm_grid

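# A hypothetical wrapper sketching the attributes the method above assumes
# (model_params, hyper_params, grid_id, search_params); everything except the
# H2O APIs is illustrative only.
class GBMGridBuilder:
    def __init__(self, model_params, hyper_params, grid_id, search_params):
        self.model_params = model_params
        self.hyper_params = hyper_params
        self.grid_id = grid_id
        self.search_params = search_params

    create_grid = create_grid  # reuse the function above as a method

# Example: a small random grid over depth and learning rate
builder = GBMGridBuilder(
    model_params={"ntrees": 50, "seed": 42},
    hyper_params={"max_depth": [3, 5, 7], "learn_rate": [0.01, 0.1]},
    grid_id="gbm_grid_demo",
    search_params={"strategy": "RandomDiscrete", "max_models": 4})
grid = builder.create_grid()
# grid.train(x=predictors, y=response, training_frame=train) would then launch the search
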
def pub_445_long_request_uri():
    mnistTrain = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnistTest = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))

    mnistTrain.set_name(col=784, name="label")
    mnistTest.set_name(col=784, name="label")

    mnistModel = H2OGradientBoostingEstimator(ntrees=2, max_depth=2)
    mnistModel.train(x=list(range(784)), y="label",
                     training_frame=mnistTrain, validation_frame=mnistTest)

def offset_init_train_gbm():
    # Connect to a pre-existing cluster
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame([[.5]] * 398)
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    # offset_column passed in the train method
    gbm_train = H2OGradientBoostingEstimator(ntrees=1, max_depth=1, min_rows=1, learn_rate=1)
    gbm_train.train(x=list(range(2, 8)), y="economy_20mpg",
                    training_frame=cars, offset_column="x1")
    predictions_train = gbm_train.predict(cars)

    # offset_column passed in the estimator constructor
    gbm_init = H2OGradientBoostingEstimator(ntrees=1, max_depth=1, min_rows=1,
                                            learn_rate=1, offset_column="x1")
    gbm_init.train(x=list(range(2, 8)), y="economy_20mpg", training_frame=cars)
    predictions_init = gbm_init.predict(cars)

    # when both offset_column parameters are set, the one passed to train() wins
    gbm_init_train = H2OGradientBoostingEstimator(ntrees=1, max_depth=1, min_rows=1,
                                                  learn_rate=1, offset_column="x1")
    gbm_init_train.train(x=list(range(2, 8)), y="economy_20mpg",
                         training_frame=cars, offset_column="x1")
    predictions_init_train = gbm_init_train.predict(cars)

    assert predictions_train == predictions_init, \
        "Expected predictions of a model with offset_column set in train() to match " \
        "predictions of a model with offset_column set in the constructor."
    assert predictions_train == predictions_init_train, \
        "Expected predictions of a model with offset_column set in train() to match " \
        "predictions of a model with offset_column set in both the constructor and train()."

def algo_pr_auc_test():
    '''
    This pyunit test is written to make sure we can call pr_auc() on all binomial models.
    '''
    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1,
                                           max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE",
                  training_frame=prostate_train)
    print("*************************** Printing GBM model")
    print(gbm_h2o)
    print("pr_auc for GBM model is {0}".format(gbm_h2o.pr_auc()))

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE",
                  training_frame=prostate_train)
    print("*************************** Printing GLM model")
    print(glm_h2o)
    # GLM scoring history does not contain AUC, and hence no pr_auc
    print("pr_auc for GLM model is {0}".format(glm_h2o.pr_auc()))

    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE",
                 training_frame=prostate_train)
    print("*************************** Printing random forest model")
    print(rf_h2o)
    print("pr_auc for Random Forest model is {0}".format(rf_h2o.pr_auc()))

    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2, 2])
    dl_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE",
                 training_frame=prostate_train)
    print("*************************** Printing deeplearning model")
    print(dl_h2o)
    print("pr_auc for deeplearning model is {0}".format(dl_h2o.pr_auc()))

    assert abs(gbm_h2o.pr_auc() - glm_h2o.pr_auc()) < 0.9, "problem with pr_auc values"
    assert abs(rf_h2o.pr_auc() - dl_h2o.pr_auc()) < 0.9, "problem with pr_auc values"
    assert abs(rf_h2o.pr_auc() - glm_h2o.pr_auc()) < 0.9, "problem with pr_auc values"

    # Calling pr_auc() on a regression model should raise an error.
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity",
                                              alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    try:
        print(h2o_model.pr_auc())
        assert False, "pr_auc() should raise an error for regression models but did not."
    except Exception:
        pass

def bernoulli_gbm():
    # Import prostate_train.csv and convert CAPSULE to a factor
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Import prostate_train.csv as a numpy array for the scikit-learn comparison
    trainData = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate_train.csv"),
                           delimiter=',', skiprows=1)
    trainDataResponse = trainData[:, 0]
    trainDataFeatures = trainData[:, 1:]

    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees, learn_rate=learning_rate,
                                           max_depth=depth, min_rows=min_rows,
                                           distribution="bernoulli")
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE",
                  training_frame=prostate_train)

    # Build scikit-learn GBM classification model with the same parameters
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learning_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(trainDataFeatures, trainDataResponse)

    # Import prostate_test.csv and convert CAPSULE to a factor
    prostate_test = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_test.csv"))
    prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()

    # Import prostate_test.csv as a numpy array for the scikit-learn comparison
    testData = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate_test.csv"),
                          delimiter=',', skiprows=1)
    testDataResponse = testData[:, 0]
    testDataFeatures = testData[:, 1:]

    # Score on the test data and compare results
    auc_sci = roc_auc_score(testDataResponse, gbm_sci.predict_proba(testDataFeatures)[:, 1])
    gbm_perf = gbm_h2o.model_performance(prostate_test)
    auc_h2o = gbm_perf.auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"

def mojo_predict_api_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data[1, 2:], input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    # test that we can predict using default paths
    h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                         verbose=True)
    h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                         genmodel_jar_path=genmodel_path, verbose=True)
    assert os.path.isfile(output_csv)

    os.remove(model_zip_path)
    os.remove(genmodel_path)
    os.remove(output_csv)

    # test that we can predict using a custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(model, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)
        try:
            h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                                 verbose=True)
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        except RuntimeError:
            pass
        assert not os.path.isfile(output_csv)

        h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path, verbose=True)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        # test that we can predict into a custom output path
        output_csv = "%s/out.prediction" % other_sandbox_dir
        h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path, verbose=True,
                             output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        os.remove(model_zip_path)
        os.remove(genmodel_path)
        os.remove(output_csv)
    finally:
        shutil.rmtree(other_sandbox_dir)

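# A minimal sketch of the `download_mojo` helper assumed above, built on the
# real model.download_mojo API; the rename step matches the paths the test
# expects, but the original helper may differ.
def download_mojo(model, mojo_zip_path, genmodel_path=None):
    mojo_zip_path = os.path.abspath(mojo_zip_path)
    parent_dir = os.path.dirname(mojo_zip_path)
    if genmodel_path is None:
        genmodel_path = os.path.join(parent_dir, 'h2o-genmodel.jar')
    # download_mojo returns the path of the downloaded mojo zip (named after
    # the model id), so move it to the path the caller asked for
    mojo_file = model.download_mojo(path=parent_dir, get_genmodel_jar=True,
                                    genmodel_name=genmodel_path)
    os.rename(mojo_file, mojo_zip_path)
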
def test_h2o_regressor_unsupported_dists(self):
    diabetes = load_diabetes()
    train, test = _train_test_split_as_frames(diabetes.data, diabetes.target)
    not_supported_dists = ["poisson", "gamma", "tweedie"]
    for d in not_supported_dists:
        gbm = H2OGradientBoostingEstimator(ntrees=7, max_depth=5, distribution=d)
        mojo_path = _make_mojo(gbm, train)
        with self.assertRaises(ValueError) as err:
            _convert_mojo(mojo_path)
        self.assertRegexpMatches(err.exception.args[0], "not supported")

def gbm_demo():
    # `df` is assumed to be an already-imported H2OFrame (the prostate dataset
    # in the original demo context)
    df[1] = df[1].asfactor()
    m = H2OGradientBoostingEstimator(ntrees=10, max_depth=5)
    m.train(x=df.names[2:], y='CAPSULE', training_frame=df)
    print('m.type:', m.type)

def test_weights_column_not_in_train():
    try:
        df = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
        gbm = H2OGradientBoostingEstimator(seed=1234, weights_column='foo')
        gbm.train(y=-1, training_frame=df)
        assert False, "Model building should fail."
    except H2OResponseError as e:
        assert "ERRR on field: _weights_column" in str(e), \
            "Model building should fail with this message."

def test_binomial_response_warning():
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
    y = "survived"
    features = ["name", "sex"]
    expected_warning = 'We have detected that your response column has only 2 unique values (0/1). ' \
                       'If you wish to train a binary model instead of a regression model, ' \
                       'convert your target column to categorical before training.'

    with pyunit_utils.catch_warnings() as ws:
        model = H2OGradientBoostingEstimator(ntrees=1)
        model.train(x=features, y=y, training_frame=training_data)
        assert pyunit_utils.contains_warning(ws, expected_warning)

    # The warning should also fire for a -1/1 encoded response
    training_data[training_data[y] == 0, y] = -1
    with pyunit_utils.catch_warnings() as ws:
        model = H2OGradientBoostingEstimator(ntrees=1)
        model.train(x=features, y=y, training_frame=training_data)
        assert pyunit_utils.contains_warning(ws, expected_warning)

def pubdev_1696():
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    try:
        H2OGradientBoostingEstimator(nfolds=-99).train(x=[0, 1, 2], y=3, training_frame=iris)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

def nfold_predict():
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    m = H2OGradientBoostingEstimator(nfolds=10, ntrees=10)
    m.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)
    xval_models = m.get_xval_models()
    fr["weights"] = 1
    preds = [model.predict(fr) for model in xval_models]
    # average the per-fold predictions
    (old_div(sum(preds), 10)).show()

def test_gbm_train_segments_parallel():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_gbm = H2OGradientBoostingEstimator(min_rows=2, ntrees=4, seed=42)
    models = prostate_gbm.train_segments(y="CAPSULE", ignored_columns=["ID"],
                                         training_frame=prostate,
                                         segments=["RACE"], parallelism=2)
    models_list = models.as_frame()
    assert models_list.nrow == 3

# Fragment of a Spark + H2O (Sparkling Water) script; `df` is a Spark
# DataFrame, `hc` an H2OContext, and `column_names` a list of predictor names
# followed by the response, all defined earlier in the original script.
df.columns

# Optionally print column data types. Note that Spark intelligently identifies
# that the predictor columns are double because I had made all of the rdd
# elements double (above). This saved me from having to write really ugly
# Spark casting code.
df.schema.fields

# Convert the Spark DataFrame to something that H2O can ingest
df_h2o = hc.as_h2o_frame(df, "df_h2o")

predictors = column_names[:-1]
response = column_names[-1]

# 60/20/20 split; note the validation frame is the third split here
ratios = [0.6, 0.2]
h2o_frame_splits = df_h2o.split_frame(ratios, seed=12345)
train = h2o_frame_splits[0]
train.frame_id = "Train"
valid = h2o_frame_splits[2]
valid.frame_id = "Validation"
test = h2o_frame_splits[1]
test.frame_id = "Test"

# `GBM` is assumed to alias the GBM estimator, e.g.:
# from h2o.estimators.gbm import H2OGradientBoostingEstimator as GBM
model = GBM(ntrees=50, max_depth=6, learn_rate=0.1, distribution="multinomial")
model.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

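# A minimal follow-up sketch, assuming the frames above: score the trained
# model on the held-out test split. model_performance and predict are the
# standard H2O APIs.
performance = model.model_performance(test_data=test)
print(performance.mse())           # mean squared error on the test split
predictions = model.predict(test)  # per-class probabilities plus predicted label
predictions.head()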