def test_gbm_bulk_cv():
    """Compare a segment-trained GBM (with CV) against a standalone GBM.

    GBMs are trained per ``pclass`` segment through ``train_segments``; a
    separate GBM is trained on the ``pclass == 1`` rows only. The model stored
    for segment 1 is then compared with the standalone one through
    ``pyunit_utils.check_models`` with cross-validation metrics enabled.
    """
    segment_models_key = "titanic_by_pclass"
    response = "survived"
    # NOTE(review): the response is also listed among the predictors; kept as in
    # the original test -- confirm whether that is intentional.
    predictors = ["survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin"]

    titanic = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
    titanic[response] = titanic[response].asfactor()
    train, valid = titanic.split_frame(ratios=[.8], seed=1234)

    # Segmented training: one model per distinct value of "pclass".
    segmented_gbm = H2OGradientBoostingEstimator(seed=1234, nfolds=2, build_tree_one_node=True)
    segmented_gbm.train_segments(
        segments=["pclass"],
        x=predictors,
        y=response,
        training_frame=train,
        validation_frame=valid,
        segment_models_id=segment_models_key,
    )

    # Reference model trained directly on the first-class rows.
    first_class_train = train[train["pclass"] == 1]
    first_class_valid = valid[valid["pclass"] == 1]
    reference_gbm = H2OGradientBoostingEstimator(seed=1234, nfolds=2)
    reference_gbm.train(
        x=predictors,
        y=response,
        training_frame=first_class_train,
        validation_frame=first_class_valid,
    )

    # Look up the model that the segmented run produced for pclass == 1.
    segment_table = H2OSegmentModels(segment_models_id=segment_models_key).as_frame()
    segment_model_id = segment_table[segment_table["pclass"] == 1]["model"]
    segment_gbm = h2o.get_model(segment_model_id.flatten())

    pyunit_utils.check_models(reference_gbm, segment_gbm, use_cross_validation=True)
def demo_xval_with_validation_frame():
    """Show that the validation frame does not drive early stopping under CV.

    Three GBMs are trained on the prostate data:

    1. 5-fold cross-validation, no validation frame;
    2. no cross-validation, validation frame whose response is inverted;
    3. 5-fold cross-validation plus the inverted validation frame.

    The test asserts that (2) stops at 6 trees and earlier than (1), and that
    (1) and (3) are equivalent and built the same number of trees.
    """
    data_path = "smalldata/logreg/prostate.csv"

    prostate = h2o.import_file(path=pyunit_utils.locate(data_path))
    prostate[1] = prostate[1].asfactor()
    print(prostate.summary())

    # Second copy of the data with the response replaced by (1 - response).
    prostate_inverse = h2o.import_file(path=pyunit_utils.locate(data_path))
    inverted_response = 1 - prostate_inverse[1]
    prostate_inverse[1] = inverted_response.asfactor()
    print(prostate_inverse.summary())

    # The original author notes 50 is the default; it is set explicitly anyway.
    ntrees = 50
    X = list(range(2, 9))
    y = 1

    # Settings shared by all three models.
    shared_params = dict(
        ntrees=ntrees,
        distribution="bernoulli",
        seed=1,
        score_each_iteration=True,
        stopping_rounds=3,
    )

    # 1. 5-fold xval, no validation frame -- expected to stop early.
    prostate_gbm = H2OGradientBoostingEstimator(nfolds=5, **shared_params)
    prostate_gbm.train(x=X, y=y, training_frame=prostate)
    prostate_gbm.show()
    trees_xval = get_ntrees(prostate_gbm)
    assert trees_xval < ntrees

    # 2. No xval, inverted validation frame: the validation data contradicts
    #    the training data, so early stopping should kick in almost at once.
    prostate_gbm_noxval = H2OGradientBoostingEstimator(**shared_params)
    prostate_gbm_noxval.train(x=X, y=y, training_frame=prostate, validation_frame=prostate_inverse)
    prostate_gbm_noxval.show()
    assert get_ntrees(prostate_gbm_noxval) == 6
    # ...and earlier than in case (1)
    assert get_ntrees(prostate_gbm_noxval) < get_ntrees(prostate_gbm)

    # 3. 5-fold xval again, this time with the inverted validation frame.
    prostate_gbm_v = H2OGradientBoostingEstimator(nfolds=5, **shared_params)
    prostate_gbm_v.train(x=X, y=y, training_frame=prostate, validation_frame=prostate_inverse)
    prostate_gbm_v.show()

    # Models (1) and (3) match, so the validation frame cannot have been used
    # for early stopping in the xval setting; otherwise (3) would have stopped
    # as early as (2) did.
    pyunit_utils.check_models(prostate_gbm, prostate_gbm_v)
    assert get_ntrees(prostate_gbm) == get_ntrees(prostate_gbm_v)
def model_comparator(frame1, frame2, col_ind, rows1, numElements):
    """Compare, row by row, the models whose ids are stored in two frames.

    :param frame1: frame holding model ids in column ``col_ind``.
    :param frame2: frame holding the model ids to compare against.
    :param col_ind: column selector applied to both frames.
    :param rows1: number of leading rows to compare.
    :param numElements: must be 0, otherwise an AssertionError is raised.
    """
    assert numElements == 0

    ids_first = frame1[col_ind].as_data_frame()
    ids_second = frame2[col_ind].as_data_frame()

    for row in range(rows1):
        first_model = h2o.get_model(str(ids_first.iloc[row][0]))
        second_model = h2o.get_model(str(ids_second.iloc[row][0]))
        message = "###### Comparing model {0} and model {1}.".format(
            first_model.model_id, second_model.model_id)
        print(message)
        pyunit_utils.check_models(first_model, second_model)
def test_if_train_segments():
    """Check Isolation Forest segment training against per-segment models.

    One model is requested per distinct ``RACE`` value. The segment equal to
    ``bad_segment`` is expected to fail with a ``min_rows`` error; every other
    segment must succeed and match (model metrics and predictions) a model
    trained directly on that segment's rows.
    """
    segment_col = "RACE"  # a model is built for each value of this column
    bad_segment = 0       # too small a segment, will not produce a model

    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    segments = prostate[segment_col].unique()
    segments.rename({'C1': segment_col})

    params = {"min_rows": 2, "ntrees": 4, "seed": 42, "sample_size": 10}
    segmented_if = H2OIsolationForestEstimator(**params)
    models = segmented_if.train_segments(ignored_columns=["ID"],
                                         training_frame=prostate,
                                         segments=segments)
    models_list = models.as_frame()
    print(models_list)

    expected_names = [u'RACE', u'model', u'status', u'errors', u'warnings']
    assert models_list.names == expected_names
    assert models_list.nrow == 3

    # The failed segment must report the min_rows validation error.
    expected_error = 'ERRR on field: _min_rows: The dataset size is too small to split for min_rows=2.0: ' \
                     'must have at least 4.0 (weighted) rows, but have only 3.0.\n'
    failed_errors = models_list["errors"][models_list[segment_col] == bad_segment].as_data_frame()
    assert failed_errors["errors"][0] == expected_error

    # Every other segment must match a model trained on that segment alone.
    mp = models_list.as_data_frame()
    for row_idx in range(mp.shape[0]):
        row = mp.iloc[row_idx]
        segment = int(row[segment_col])
        status = str(row["status"])

        if segment == bad_segment:
            assert status == "FAILED"
            continue

        assert status == "SUCCEEDED"
        segment_model = h2o.get_model(row["model"])

        prostate_segment = prostate[prostate[segment_col] == segment]
        reference_if = H2OIsolationForestEstimator(**params)
        reference_if.train(ignored_columns=["ID", segment_col],
                           training_frame=prostate_segment)
        pyunit_utils.check_models(segment_model, reference_if)

        preds_actual = segment_model.predict(prostate_segment)
        preds_expected = reference_if.predict(prostate_segment)
        assert_frame_equal(preds_actual.as_data_frame(True),
                           preds_expected.as_data_frame(True))
def test_gbm_bulk_train():
    """Check GBM bulk training against models trained per segment.

    One model is requested per distinct ``RACE`` value through ``bulk_train``.
    The segment equal to ``bad_segment`` is expected to report a ``min_rows``
    error; for every other segment the bulk model is compared with a GBM
    trained directly on that segment's rows.
    """
    segment_col = "RACE"  # a model is built for each value of this column
    bad_segment = 0       # too small a segment, will not produce a model

    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    segments = prostate[segment_col].unique()
    segments.rename({'C1': segment_col})

    params = {"min_rows": 2, "ntrees": 4, "seed": 42}
    bulk_gbm = H2OGradientBoostingEstimator(**params)
    models = bulk_gbm.bulk_train(y="CAPSULE",
                                 ignored_columns=["ID"],
                                 training_frame=prostate,
                                 segments=segments)
    models_list = models.as_frame()

    expected_names = [u'RACE', u'Status', u'Model', u'Errors', u'Warnings']
    assert models_list.names == expected_names
    assert models_list.nrow == 3

    # The failed segment must report the min_rows validation error.
    expected_error = 'ERRR on field: _min_rows: The dataset size is too small to split for min_rows=2.0: ' \
                     'must have at least 4.0 (weighted) rows, but have only 3.0.\n'
    failed_errors = models_list["Errors"][models_list[segment_col] == bad_segment].as_data_frame()
    assert failed_errors["Errors"][0] == expected_error

    # Every successfully built model must match a per-segment reference model.
    mp = models_list.as_data_frame()
    for row_idx in range(mp.shape[0]):
        row = mp.iloc[row_idx]
        segment = int(row[segment_col])
        if segment == bad_segment:
            continue

        bulk_model = h2o.get_model(row["Model"])
        prostate_segment = prostate[prostate[segment_col] == segment]
        reference_gbm = H2OGradientBoostingEstimator(**params)
        reference_gbm.train(y="CAPSULE",
                            ignored_columns=["ID"],
                            training_frame=prostate_segment)
        pyunit_utils.check_models(bulk_model, reference_gbm)
def cv_carsRF():
    """Exercise the cross-validation options of H2ORandomForestEstimator.

    Covers seeded "Modulo" and unseeded "Random" fold assignment, a
    user-supplied fold column, ``keep_cross_validation_predictions``, boundary
    values of ``nfolds``, and argument combinations that are expected to raise
    ``EnvironmentError``.
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Problem type: 0 = regression, 1 = binomial, 2 = multinomial.
    # The random draw is kept because it advances the RNG, but its result is
    # immediately overridden so the test always runs the multinomial case.
    problem = random.sample(list(range(3)), 1)[0]
    problem = 2

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem in (1, 2):
        response_col = "economy_20mpg" if problem == 1 else "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3, 10)
    rf1 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Modulo", seed=1234)
    rf1.train(x=predictors, y=response_col, training_frame=cars)
    rf2 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Modulo", seed=1234)
    rf2.train(x=predictors, y=response_col, training_frame=cars)
    pyunit_utils.check_models(rf1, rf2, True)

    # 2. cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    rf1 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Random")
    rf1.train(x=predictors, y=response_col, training_frame=cars)
    rf2 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Random")
    rf2.train(x=predictors, y=response_col, training_frame=cars)
    # NOTE(review): the `assert False` below is swallowed by the
    # `except AssertionError`, so this check can never fail. Behaviour is
    # preserved here; tightening it requires confirming how
    # pyunit_utils.check_models behaves for this model type.
    try:
        pyunit_utils.check_models(rf1, rf2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        pass

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_rows = [[random.randint(0, num_folds - 1)] for _ in range(cars.nrow)]
    fold_assignments = h2o.H2OFrame(fold_rows)
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = H2ORandomForestEstimator(keep_cross_validation_predictions=True)
    rf.train(y=response_col, x=predictors, training_frame=cars, fold_column="fold_assignments")

    cv_models = rf._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models)
    assert num_cv_models == num_folds, \
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models)
    # The first two CV models must be retrievable by name.
    cv_model1 = h2o.get_model(rf._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(rf._model_json['output']['cross_validation_models'][1]['name'])

    # 4. keep_cross_validation_predictions
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, \
        "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, \
        "Expected the same number of cross-validation predictions as folds, but got {0}".format(
            len(cv_predictions))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    rf = H2ORandomForestEstimator(nfolds=cars.nrow, fold_assignment="Modulo")
    rf.train(y=response_col, x=predictors, training_frame=cars)

    # 2. nfolds = 0 must be equivalent to not passing nfolds at all
    rf1 = H2ORandomForestEstimator(nfolds=0, seed=1234)
    rf1.train(y=response_col, x=predictors, training_frame=cars)
    rf2 = H2ORandomForestEstimator(seed=1234)
    rf2.train(y=response_col, x=predictors, training_frame=cars)
    pyunit_utils.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    rf = H2ORandomForestEstimator(nfolds=random.randint(3, 10))
    rf.train(y=response_col, x=predictors, training_frame=cars, validation_frame=cars)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        rf = H2ORandomForestEstimator(nfolds=random.sample([-1, 1], 1)[0])
        rf.train(y=response_col, x=predictors, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        rf = H2ORandomForestEstimator(nfolds=cars.nrow + 1, fold_assignment="Modulo")
        rf.train(y=response_col, x=predictors, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        rf = H2ORandomForestEstimator(nfolds=3)
        rf.train(y=response_col, x=predictors, fold_column="fold_assignments", training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
def cv_cars_glm():
    """Exercise the cross-validation options of H2OGeneralizedLinearEstimator.

    A problem type (gaussian, binomial or poisson) is drawn at random. The test
    then covers "Modulo" and "Random" fold assignment, a user-supplied fold
    column, ``keep_cross_validation_predictions``, ``nfolds=0``, and argument
    combinations that are expected to raise ``EnvironmentError``.
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Problem type: 0 = regression (gaussian), 1 = binomial, 2 = poisson.
    problem = random.sample(list(range(3)), 1)[0]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        family, response_col = "binomial", "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        family, response_col = "poisson", "cylinders"
    else:
        family, response_col = "gaussian", "economy"
    print("Distribution: {0}".format(family))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
    glm1.train(x=predictors, y=response_col, training_frame=cars)
    glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
    glm2.train(x=predictors, y=response_col, training_frame=cars)
    pyunit_utils.check_models(glm1, glm2, True)

    # 2. cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
    glm1.train(x=predictors, y=response_col, training_frame=cars)
    glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
    glm2.train(x=predictors, y=response_col, training_frame=cars)
    # NOTE(review): the `assert False` below is swallowed by the
    # `except AssertionError`, so this check can never fail. Behaviour is
    # preserved here; tightening it requires confirming how
    # pyunit_utils.check_models behaves for each model type.
    try:
        pyunit_utils.check_models(glm1, glm2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        pass

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_rows = [[random.randint(0, num_folds - 1)] for _ in range(cars.nrow)]
    fold_assignments = h2o.H2OFrame(fold_rows)
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    glm = H2OGeneralizedLinearEstimator(family=family, keep_cross_validation_predictions=True)
    glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")

    num_cv_models = len(glm._model_json['output']['cross_validation_models'])
    assert num_cv_models == num_folds, \
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models)
    # The first two CV models must be retrievable by name.
    cv_model1 = h2o.get_model(glm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(glm._model_json['output']['cross_validation_models'][1]['name'])

    # 4. keep_cross_validation_predictions
    cv_predictions = glm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, \
        "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = glm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, \
        "Expected the same number of cross-validation predictions as folds, but got {0}".format(
            len(cv_predictions))

    ## boundary cases
    # 2. nfolds = 0 must be equivalent to not passing nfolds at all
    glm1 = H2OGeneralizedLinearEstimator(nfolds=0, family=family)
    glm1.train(x=predictors, y=response_col, training_frame=cars)
    glm2 = H2OGeneralizedLinearEstimator(family=family)
    glm2.train(x=predictors, y=response_col, training_frame=cars)
    pyunit_utils.check_models(glm1, glm2)

    # 3. cross-validation and regular validation attempted
    glm = H2OGeneralizedLinearEstimator(nfolds=random.randint(3, 10), family=family)
    glm.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        glm = H2OGeneralizedLinearEstimator(nfolds=random.sample([-1, 1], 1)[0], family=family)
        glm.train(x=predictors, y=response_col, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        glm = H2OGeneralizedLinearEstimator(nfolds=cars.nrow + 1, family=family,
                                            fold_assignment="Modulo")
        glm.train(x=predictors, y=response_col, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family)
        glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
def cv_carsDL():
    """Exercise the cross-validation options of ``h2o.deeplearning``.

    A problem type (regression, binomial or multinomial) is drawn at random.
    The test then covers "Modulo" and "Random" fold assignment, a
    user-supplied fold column, ``keep_cross_validation_predictions``, boundary
    values of ``nfolds``, and argument combinations that are expected to raise
    ``EnvironmentError``.

    Changes relative to the previous version: the Python 2 ``print`` statement
    is replaced by a ``print(...)`` call (valid on both Python 2 and 3, and
    consistent with the other tests in this file), and the ``zip`` result
    passed to ``H2OFrame`` is materialised with ``list`` so it is a concrete
    sequence on Python 3 as well.
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Problem type: 0 = regression, 1 = binomial, 2 = multinomial.
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. basic
    dl = h2o.deeplearning(
        y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10), fold_assignment="Modulo"
    )

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    dl1 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    dl2 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    # NOTE(review): the `assert False` below is swallowed by the
    # `except AssertionError`, so this check can never fail. Left unchanged;
    # tightening it requires confirming how pyunit_utils.check_models behaves
    # for each model type.
    try:
        pyunit_utils.check_models(dl1, dl2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        assert True

    # 3. folds_column
    num_folds = random.randint(2, 5)
    # list(...) keeps the Python 2 value (zip already returned a list there)
    # while avoiding a one-shot iterator on Python 3.
    fold_data = list(zip(*[[random.randint(0, num_folds - 1)] for f in range(cars.nrow)]))
    fold_assignments = h2o.H2OFrame(python_obj=fold_data)
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    dl = h2o.deeplearning(
        y=cars[response_col],
        x=cars[predictors],
        training_frame=cars,
        fold_column="fold_assignments",
        keep_cross_validation_predictions=True,
    )
    num_cv_models = len(dl._model_json["output"]["cross_validation_models"])
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got "
        "{1}".format(num_folds, num_cv_models)
    )
    cv_model1 = h2o.get_model(dl._model_json["output"]["cross_validation_models"][0]["name"])
    cv_model2 = h2o.get_model(dl._model_json["output"]["cross_validation_models"][1]["name"])
    assert isinstance(cv_model1, type(dl)), (
        "Expected cross-validation model to be the same model type as the "
        "constructed model, but got {0} and {1}".format(type(cv_model1), type(dl))
    )
    assert isinstance(cv_model2, type(dl)), (
        "Expected cross-validation model to be the same model type as the "
        "constructed model, but got {0} and {1}".format(type(cv_model2), type(dl))
    )

    # 4. keep_cross_validation_predictions
    cv_predictions = dl1._model_json["output"]["cross_validation_predictions"]
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions
    )

    cv_predictions = dl._model_json["output"]["cross_validation_predictions"]
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions))
    )

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow, fold_assignment="Modulo")

    # 2. nfolds = 0
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    # 3. cross-validation and regular validation attempted
    dl = h2o.deeplearning(
        y=cars[response_col],
        x=cars[predictors],
        nfolds=random.randint(3, 10),
        validation_y=cars[response_col],
        validation_x=cars[predictors],
    )

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1, 1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow + 1, fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        dl = h2o.deeplearning(
            y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", training_frame=cars
        )
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
def cv_cars_dw():
    """Exercise the cross-validation options of H2ODeepWaterEstimator.

    Returns immediately when ``H2ODeepWaterEstimator.available()`` is false.
    Otherwise a classification problem (binomial or multinomial) is drawn at
    random and the test covers "Modulo" and "Random" fold assignment, a
    user-supplied fold column, boundary values of ``nfolds``, and argument
    combinations that are expected to raise ``EnvironmentError``.

    Fix relative to the previous version: in the "Random" comparison the
    second model (``dl2``) was constructed but never trained before being
    passed to ``pyunit_utils.check_models``; it is now trained like ``dl1``,
    matching the sibling cross-validation tests.
    """
    if not H2ODeepWaterEstimator.available():
        return

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Problem type: 1 = binomial, 2 = multinomial (only classification is run).
    problem = random.sample(list(range(2)), 1)[0] + 1

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        # Not reachable with the draw above; kept for parity with sibling tests.
        response_col = "economy"

    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. basic
    dl = H2ODeepWaterEstimator(nfolds=random.randint(3, 10), fold_assignment="Modulo",
                               hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    dl1 = H2ODeepWaterEstimator(nfolds=nfolds, fold_assignment="Random", hidden=[20, 20], epochs=10)
    dl1.train(x=predictors, y=response_col, training_frame=cars)
    dl2 = H2ODeepWaterEstimator(nfolds=nfolds, fold_assignment="Random", hidden=[20, 20], epochs=10)
    # Train the second model too, so two trained models are compared.
    dl2.train(x=predictors, y=response_col, training_frame=cars)
    # NOTE(review): the `assert False` below is swallowed by the
    # `except AssertionError`, so this check can never fail. Left unchanged;
    # tightening it requires confirming how pyunit_utils.check_models behaves
    # for each model type.
    try:
        pyunit_utils.check_models(dl1, dl2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        assert True

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame([[random.randint(0, num_folds - 1)] for _ in range(cars.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    dl = H2ODeepWaterEstimator(keep_cross_validation_predictions=True, hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
    num_cv_models = len(dl._model_json['output']['cross_validation_models'])
    assert num_cv_models == num_folds, \
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models)
    # The first two CV models must be retrievable by name.
    cv_model1 = h2o.get_model(dl._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(dl._model_json['output']['cross_validation_models'][1]['name'])

    # 4. keep_cross_validation_predictions
    # Only the presence of the key is exercised here; no assertion on its value.
    cv_predictions = dl1._model_json['output']['cross_validation_predictions']

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    dl = H2ODeepWaterEstimator(nfolds=cars.nrow, fold_assignment="Modulo", hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)

    # 2. nfolds = 0
    dl = H2ODeepWaterEstimator(nfolds=0, hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)

    # 3. cross-validation and regular validation attempted
    dl = H2ODeepWaterEstimator(nfolds=random.randint(3, 10), hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        dl = H2ODeepWaterEstimator(nfolds=random.sample([-1, 1], 1)[0], hidden=[20, 20], epochs=10)
        dl.train(x=predictors, y=response_col, training_frame=cars)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        dl = H2ODeepWaterEstimator(nfolds=cars.nrow + 1, fold_assignment="Modulo",
                                   hidden=[20, 20], epochs=10)
        dl.train(x=predictors, y=response_col, training_frame=cars)
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        dl = H2ODeepWaterEstimator(nfolds=3, hidden=[20, 20], epochs=10)
        dl.train(x=predictors, y=response_col, fold_column="fold_assignments", training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
def cv_carsGBM():
    """Exercise the cross-validation options of ``h2o.gbm`` on the cars data.

    A problem type (gaussian, bernoulli or multinomial) is drawn at random.
    The test then covers "Modulo" and "Random" fold assignment, a
    user-supplied fold column, ``keep_cross_validation_predictions``, boundary
    values of ``nfolds``, and argument combinations that are expected to raise
    ``EnvironmentError``.
    """
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Problem type: 0 = regression, 1 = binomial, 2 = multinomial.
    problem = random.sample(list(range(3)), 1)[0]

    # Response column and distribution per classification problem type;
    # anything else falls back to gaussian regression on "economy".
    classification_setups = {
        1: ("economy_20mpg", "bernoulli"),
        2: ("cylinders", "multinomial"),
    }
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col, distribution = classification_setups.get(problem, ("economy", "gaussian"))
    if problem in classification_setups:
        cars[response_col] = cars[response_col].asfactor()

    print("Distribution: {0}".format(distribution))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Modulo")
    pyunit_utils.check_models(gbm1, gbm2, True)

    # 2. cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Random")
    # NOTE(review): the `assert False` below is swallowed by the
    # `except AssertionError`, so this check can never fail. Behaviour is
    # preserved here; tightening it requires confirming how
    # pyunit_utils.check_models behaves for each model type.
    try:
        pyunit_utils.check_models(gbm1, gbm2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        pass

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_rows = [[random.randint(0, num_folds - 1)] for _ in range(cars.nrow)]
    fold_assignments = h2o.H2OFrame(fold_rows)
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], training_frame=cars,
                  distribution=distribution, ntrees=5, fold_column="fold_assignments",
                  keep_cross_validation_predictions=True)

    num_cv_models = len(gbm._model_json['output']['cross_validation_models'])
    assert num_cv_models == num_folds, \
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models)
    # The first two CV models must be retrievable by name.
    cv_model1 = h2o.get_model(
        gbm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(
        gbm._model_json['output']['cross_validation_models'][1]['name'])

    # 4. keep_cross_validation_predictions
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, \
        "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, \
        "Expected the same number of cross-validation predictions as folds, but got {0}".format(
            len(cv_predictions))

    # # 5. manually construct models (disabled in the original test)
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow,
                  distribution=distribution, ntrees=5, fold_assignment="Modulo")

    # 2. nfolds = 0 must be equivalent to not passing nfolds at all
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=0,
                   distribution=distribution, ntrees=5)
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors],
                   distribution=distribution, ntrees=5)
    pyunit_utils.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
                  validation_y=cars[response_col], ntrees=5,
                  validation_x=cars[predictors], distribution=distribution)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors],
                      nfolds=random.sample([-1, 1], 1)[0], ntrees=5,
                      distribution=distribution)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow + 1,
                      distribution=distribution, ntrees=5, fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3,
                      fold_column="fold_assignments", ntrees=5,
                      distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"

    # 4. fold_column and fold_assignment both specified
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], fold_assignment="Random",
                      fold_column="fold_assignments", ntrees=5,
                      distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
def cv_cars_glm():
    """Exercise GLM cross-validation options on the cars dataset.

    A problem type (gaussian, binomial or poisson) is picked at random. The
    test then checks repeated "Modulo" fold runs, repeated "Random" fold
    runs, a user-supplied fold column, the keep_cross_validation_predictions
    flag, nfolds=0, cross-validation combined with a validation frame, and
    argument combinations for which the model build is expected to fail.
    """
    # read in the dataset
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise. 0: regression (gaussian), 1: binomial, 2: poisson
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and response column, along with the matching family
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        family = "binomial"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        family = "poisson"
        response_col = "cylinders"
    else:
        family = "gaussian"
        response_col = "economy"

    print("Distribution: {0}".format(family))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
    glm1.train(x=predictors, y=response_col, training_frame=cars)
    glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
    glm2.train(x=predictors, y=response_col, training_frame=cars)
    pyunit_utils.check_models(glm1, glm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
    glm1.train(x=predictors, y=response_col, training_frame=cars)
    glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
    glm2.train(x=predictors, y=response_col, training_frame=cars)
    try:
        pyunit_utils.check_models(glm1, glm2, True)
    except AssertionError:
        # the comparison failed, i.e. the models differ, which is what we want
        pass
    else:
        # The failure is raised outside the `try` so that the handler above
        # cannot swallow it; otherwise this check could never fail.
        assert False, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1) for f in range(cars.nrow)]])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    glm = H2OGeneralizedLinearEstimator(family=family, keep_cross_validation_predictions=True)
    glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
    cv_models = glm._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models)
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got "
        "{1}".format(num_folds, num_cv_models))
    # look up the first two cross-validation models by name
    for cv_info in cv_models[:2]:
        h2o.get_model(cv_info['name'])

    # 4. keep_cross_validation_predictions
    # glm1 was built without the flag, glm with it
    cv_predictions = glm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, (
        "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions))

    cv_predictions = glm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions)))

    ## boundary cases
    # nfolds = 0
    glm1 = H2OGeneralizedLinearEstimator(nfolds=0, family=family)
    glm1.train(x=predictors, y=response_col, training_frame=cars)
    # check that this is equivalent to no nfolds
    glm2 = H2OGeneralizedLinearEstimator(family=family)
    glm2.train(x=predictors, y=response_col, training_frame=cars)
    pyunit_utils.check_models(glm1, glm2)

    # cross-validation and regular validation attempted
    glm = H2OGeneralizedLinearEstimator(nfolds=random.randint(3, 10), family=family)
    glm.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        glm = H2OGeneralizedLinearEstimator(nfolds=random.sample([-1, 1], 1)[0], family=family)
        glm.train(x=predictors, y=response_col, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        glm = H2OGeneralizedLinearEstimator(nfolds=cars.nrow + 1, family=family,
                                            fold_assignment="Modulo")
        glm.train(x=predictors, y=response_col, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family)
        glm.train(x=predictors, y=response_col, training_frame=cars,
                  fold_column="fold_assignments")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
def cv_carsGBM():
    """Exercise GBM cross-validation options on the cars dataset.

    A problem type (gaussian, bernoulli or multinomial) is picked at random.
    The test then checks repeated "Modulo" fold runs, repeated "Random" fold
    runs, a user-supplied fold column, the keep_cross_validation_predictions
    flag, boundary values of nfolds, and argument combinations for which the
    model build is expected to fail.
    """
    # read in the dataset
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise. 0: regression, 1: binomial, 2: multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and response column, along with the matching distribution
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("Distribution: {0}".format(distribution))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Modulo")
    pyunit_utils.check_models(gbm1, gbm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5, fold_assignment="Random")
    try:
        pyunit_utils.check_models(gbm1, gbm2, True)
    except AssertionError:
        # the comparison failed, i.e. the models differ, which is what we want
        pass
    else:
        # The failure is raised outside the `try` so that the handler above
        # cannot swallow it; otherwise this check could never fail.
        assert False, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1)] for f in range(cars.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], training_frame=cars,
                  distribution=distribution, ntrees=5, fold_column="fold_assignments",
                  keep_cross_validation_predictions=True)
    cv_models = gbm._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models)
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got "
        "{1}".format(num_folds, num_cv_models))
    # the first two cross-validation models must have the same type as the main model
    for cv_info in cv_models[:2]:
        cv_model = h2o.get_model(cv_info['name'])
        assert isinstance(cv_model, type(gbm)), (
            "Expected cross-validation model to be the same model type as the "
            "constructed model, but got {0} and {1}".format(type(cv_model), type(gbm)))

    # 4. keep_cross_validation_predictions
    # gbm1 was built without the flag, gbm with it
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, (
        "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions))

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions)))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow,
            distribution=distribution, ntrees=5, fold_assignment="Modulo")

    # 2. nfolds = 0
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=0,
                   distribution=distribution, ntrees=5)
    # check that this is equivalent to no nfolds
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors],
                   distribution=distribution, ntrees=5)
    pyunit_utils.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
            validation_y=cars[response_col], ntrees=5, validation_x=cars[predictors],
            distribution=distribution)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors],
                nfolds=random.sample([-1, 1], 1)[0], ntrees=5, distribution=distribution)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow + 1,
                distribution=distribution, ntrees=5, fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3,
                fold_column="fold_assignments", ntrees=5, distribution=distribution,
                training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"

    # 4. fold_column and fold_assignment both specified
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], fold_assignment="Random",
                fold_column="fold_assignments", ntrees=5, distribution=distribution,
                training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
def cv_carsDL():
    """Exercise deep learning cross-validation options on the cars dataset.

    A response column (regression, binomial or multinomial) is picked at
    random. The test then runs a basic "Modulo" cross-validation, checks
    repeated "Random" fold runs, a user-supplied fold column, the
    keep_cross_validation_predictions flag, boundary values of nfolds, and
    argument combinations for which the model build is expected to fail.
    """
    # read in the dataset
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise. 0: regression, 1: binomial, 2: multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the matching response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. basic
    h2o.deeplearning(y=cars[response_col], x=cars[predictors],
                     nfolds=random.randint(3, 10), fold_assignment="Modulo")

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    dl1 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                           fold_assignment="Random")
    dl2 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                           fold_assignment="Random")
    try:
        pyunit_utils.check_models(dl1, dl2, True)
    except AssertionError:
        # the comparison failed, i.e. the models differ, which is what we want
        pass
    else:
        # The failure is raised outside the `try` so that the handler above
        # cannot swallow it; otherwise this check could never fail.
        assert False, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    # list(...) is a no-op on Python 2 (zip already returns a list) and hands
    # the frame constructor a concrete list instead of an iterator on Python 3
    fold_assignments = h2o.H2OFrame(
        list(zip(*[[random.randint(0, num_folds - 1)] for f in range(cars.nrow)])))
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], training_frame=cars,
                          fold_column="fold_assignments",
                          keep_cross_validation_predictions=True)
    cv_models = dl._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models)
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got "
        "{1}".format(num_folds, num_cv_models))
    # the first two cross-validation models must have the same type as the main model
    for cv_info in cv_models[:2]:
        cv_model = h2o.get_model(cv_info['name'])
        assert isinstance(cv_model, type(dl)), (
            "Expected cross-validation model to be the same model type as the "
            "constructed model, but got {0} and {1}".format(type(cv_model), type(dl)))

    # 4. keep_cross_validation_predictions
    # dl1 was built without the flag, dl with it
    cv_predictions = dl1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, (
        "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions))

    cv_predictions = dl._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions)))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow,
                     fold_assignment="Modulo")

    # 2. nfolds = 0
    h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    # 3. cross-validation and regular validation attempted
    h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
                     validation_y=cars[response_col], validation_x=cars[predictors])

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.deeplearning(y=cars[response_col], x=cars[predictors],
                         nfolds=random.sample([-1, 1], 1)[0])
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow + 1,
                         fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=3,
                         fold_column="fold_assignments", training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"