def kmeans_grid_iris():
    """Run a randomized K-means grid search on the iris data and verify the grid.

    Checks that the grid cardinality matches the hyper-parameter space, that
    duplicate hyper-parameter entries are ignored, and that the requested
    hyper-parameters were actually used to build the models.
    """
    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    grid_space = pyunit_utils.make_random_grid_space(algo="km")
    print("Grid space: {0}".format(grid_space))

    print("Constructing grid of Kmeans models")
    iris_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
    iris_grid.train(x=list(range(4)), training_frame=iris_h2o)

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    expected_models = 1
    for hyper_values in grid_space.values():
        expected_models *= len(hyper_values)
    actual_size = len(iris_grid)
    assert expected_models == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                           "".format(expected_models, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space:
        # Doubling each value list must not change the number of models built.
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of glm models...")
    iris_grid2 = H2OGridSearch(H2OKMeansEstimator, hyper_params=new_grid_space)
    iris_grid2.train(x=list(range(4)), training_frame=iris_h2o)
    actual_size2 = len(iris_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in grid_space:
        print(name)
        pyunit_utils.expect_model_param(iris_grid, name, grid_space[name])
def grid_cars_GBM():
    """Run a randomized GBM grid search on the cars data and verify the grid.

    A validation scheme (none / cross-validation / validation frame) is chosen
    at random; the response column depends on the randomly drawn distribution.
    Checks grid cardinality, duplicate-entry handling, and hyper-parameter use.
    """
    # FIX: this variant used Python-2 `print` statements, which are syntax
    # errors under Python 3 and inconsistent with the other tests in the file;
    # converted to print() calls and py3-safe dict iteration.
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="gbm")
    print("Grid space: {0}".format(grid_space))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if grid_space['distribution'][0] == 'bernoulli':
        response_col = "economy_20mpg"
    elif grid_space['distribution'][0] == 'gaussian':
        response_col = "economy"
    else:
        response_col = "cylinders"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if grid_space['distribution'][0] in ['bernoulli', 'multinomial']:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print("Constructing the grid of gbm models...")
    cars_gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_gbm_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_gbm_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_gbm_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_gbm_grid)
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                              "".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        # 'distribution' is categorical and must stay a single-valued list.
        if not name == "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of gbm models...")
    cars_gbm_grid2 = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_gbm_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_gbm_grid2.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_gbm_grid2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    actual_size2 = len(cars_gbm_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        pyunit_utils.expect_model_param(cars_gbm_grid, name, grid_space[name])
def kmeans_grid_iris():
    """Run a randomized K-means grid search on the iris data and verify the grid.

    Checks grid cardinality, duplicate-entry handling, and that the requested
    hyper-parameters were used to build the models.
    """
    # FIX: this variant used Python-2 `print` statements and a bare `range(4)`
    # for x; converted to print() calls and list(range(4)) for Python 3
    # compatibility, matching the other kmeans_grid_iris variant in the file.
    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    grid_space = pyunit_utils.make_random_grid_space(algo="km")
    print("Grid space: {0}".format(grid_space))

    print("Constructing grid of Kmeans models")
    iris_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
    iris_grid.train(x=list(range(4)), training_frame=iris_h2o)

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(iris_grid)
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                              "".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of glm models...")
    iris_grid2 = H2OGridSearch(H2OKMeansEstimator, hyper_params=new_grid_space)
    iris_grid2.train(x=list(range(4)), training_frame=iris_h2o)
    actual_size2 = len(iris_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(iris_grid, name, grid_space[name])
def grid_quasar_pca():
    """Run a randomized PCA grid search on the SDSS quasar data and verify the grid.

    Checks model types, grid cardinality (scalar hyper-values count as one
    option), duplicate-entry handling, and hyper-parameter use.
    """
    quasar = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"),
                             header=1)
    grid_space = pyunit_utils.make_random_grid_space(algo="pca",
                                                     ncols=quasar.ncol,
                                                     nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    quasar_pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
    quasar_pca_grid.train(x=list(range(1, 23)), training_frame=quasar)
    for model in quasar_pca_grid:
        assert isinstance(model, H2OPCA)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    expected_models = 1
    for hyper_values in grid_space.values():
        # A bare scalar contributes a single option to the grid.
        options = hyper_values if type(hyper_values) is list else [hyper_values]
        expected_models *= len(options)
    actual_size = len(quasar_pca_grid)
    assert expected_models == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                           "".format(expected_models, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space:
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    quasar_pca_grid2 = H2OGridSearch(H2OPCA, hyper_params=new_grid_space)
    quasar_pca_grid2.train(x=list(range(1, 23)), training_frame=quasar)
    actual_size2 = len(quasar_pca_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)
    for model in quasar_pca_grid2:
        assert isinstance(model, H2OPCA)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in grid_space:
        print(name)
        pyunit_utils.expect_model_param(quasar_pca_grid, name, grid_space[name])
def grid_cars_NB():
    """Build a randomized Naive Bayes grid on the cars data (training only).

    A validation scheme (none / cross-validation / validation frame) is chosen
    at random; metrics computation is disabled via compute_metrics=[False].
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="naiveBayes")
    print("Grid space: {0}".format(grid_space))

    # FIX: random.sample([...], 1) returns a one-element *list*, so the
    # comparison `problem == "binomial"` below was always False and the
    # binomial branch was unreachable; random.choice draws a single string.
    problem = random.choice(["binomial", "multinomial"])
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    print("Converting the response column to a factor...")
    train[response_col] = train[response_col].asfactor()
    if validation_scheme == 3:
        valid[response_col] = valid[response_col].asfactor()

    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of nb models...")
    grid_space["compute_metrics"] = [False]
    cars_nb_grid = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
def grid_quasar_pca():
    """Randomized PCA grid search over the SDSS quasar data with grid sanity checks."""
    quasar = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"), header=1)
    grid_space = pyunit_utils.make_random_grid_space(algo="pca", ncols=quasar.ncol, nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
    feature_cols = list(range(1, 23))
    pca_grid.train(x=feature_cols, training_frame=quasar)
    for fitted in pca_grid:
        assert isinstance(fitted, H2OPCA)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    n_expected = 1
    for vals in grid_space.values():
        if type(vals) is not list:
            # Treat a bare scalar as a single grid option.
            vals = [vals]
        n_expected = n_expected * len(vals)
    actual_size = len(pca_grid)
    assert n_expected == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                      "".format(n_expected, actual_size)

    print("Duplicate-entries-in-grid-space check")
    doubled_space = copy.deepcopy(grid_space)
    for key in grid_space:
        doubled_space[key] = grid_space[key] + grid_space[key]
    print("The new search space: {0}".format(doubled_space))
    print("Constructing the new grid of nb models...")
    pca_grid2 = H2OGridSearch(H2OPCA, hyper_params=doubled_space)
    pca_grid2.train(x=feature_cols, training_frame=quasar)
    actual_size2 = len(pca_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)
    for fitted in pca_grid2:
        assert isinstance(fitted, H2OPCA)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for key in grid_space:
        print(key)
        pyunit_utils.expect_model_param(pca_grid, key, grid_space[key])
def grid_cars_RF():
    """Run a randomized Random Forest grid search on the cars data and verify the grid.

    A validation scheme (none / cross-validation / validation frame) and a
    problem type (binomial / regression / multinomial) are drawn at random.
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    grid_space = pyunit_utils.make_random_grid_space(algo="rf", ncols=len(predictors))
    print("Grid space: {0}".format(grid_space))

    problem = random.randint(1, 3)
    if problem == 1:
        response_col = "economy_20mpg"
    elif problem == 2:
        response_col = "economy"
    else:
        response_col = "cylinders"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if problem in [1, 3]:
        # Classification problems need a categorical response.
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print("Constructing the grid of RF models...")
    cars_rf_grid = H2OGridSearch(H2ORandomForestEstimator, hyper_params=grid_space)
    # Assemble the train() arguments once so both grids use the same scheme.
    train_kwargs = {"x": predictors, "y": response_col, "training_frame": train}
    if validation_scheme == 2:
        train_kwargs["nfolds"] = nfolds
    elif validation_scheme == 3:
        train_kwargs["validation_frame"] = valid
    cars_rf_grid.train(**train_kwargs)
    for model in cars_rf_grid:
        assert isinstance(model, H2ORandomForestEstimator)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    expected_models = 1
    for hyper_values in grid_space.values():
        expected_models *= len(hyper_values)
    actual_size = len(cars_rf_grid)
    assert expected_models == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                           "".format(expected_models, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space:
        if name != "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of RF models...")
    cars_rf_grid2 = H2OGridSearch(H2ORandomForestEstimator, hyper_params=new_grid_space)
    cars_rf_grid2.train(**train_kwargs)
    actual_size2 = len(cars_rf_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)
    for model in cars_rf_grid2:
        assert isinstance(model, H2ORandomForestEstimator)

    print(grid_space)
    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in grid_space:
        pyunit_utils.expect_model_param(cars_rf_grid, name, grid_space[name])
def grid_cars_GLM():
    """Run a randomized GLM grid search on the cars data and verify the grid.

    The GLM family is drawn into the random grid space, then popped out and
    passed directly to train(); the response column and expected model type
    follow from the drawn family.
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="glm")
    print("Grid space: {0}".format(grid_space))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    glm_family = grid_space['family'][0]
    if glm_family == 'binomial':
        response_col = "economy_20mpg"
        true_model_type = "classifier"
    elif glm_family == 'gaussian':
        response_col = "economy"
        true_model_type = "regressor"
    else:
        response_col = "cylinders"
        true_model_type = "regressor"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if glm_family in ['binomial', 'multinomial']:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    # 'family' is a train() argument, not a searched hyper-parameter.
    family = grid_space.pop('family')[0]
    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of glm models...")
    print("family = ", family)
    cars_glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=grid_space)
    # Assemble the train() arguments once so both grids use the same scheme.
    train_kwargs = {"x": predictors, "y": response_col, "training_frame": train, "family": family}
    if validation_scheme == 2:
        train_kwargs["nfolds"] = nfolds
    elif validation_scheme == 3:
        train_kwargs["validation_frame"] = valid
    cars_glm_grid.train(**train_kwargs)
    for model in cars_glm_grid:
        assert isinstance(model, H2OGeneralizedLinearEstimator)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    expected_models = 1
    for hyper_values in grid_space.values():
        expected_models *= len(hyper_values)
    actual_size = len(cars_glm_grid)
    assert expected_models == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                           "".format(expected_models, actual_size)

    print("Check correct type value....")
    model_type = cars_glm_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(
        model_type, true_model_type)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space:
        if name != "family":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of glm models...")
    cars_glm_grid2 = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=new_grid_space)
    cars_glm_grid2.train(**train_kwargs)
    actual_size2 = len(cars_glm_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in grid_space:
        print(name)
        pyunit_utils.expect_model_param(cars_glm_grid, name, grid_space[name])
def grid_cars_GLM():
    """Randomized GLM grid search over the cars data; checks size, type, and params."""
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(scheme))
    if scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="glm")
    print("Grid space: {0}".format(grid_space))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    fam = grid_space['family'][0]
    # Map the drawn family to (response column, expected model type);
    # anything other than binomial/gaussian falls through to multinomial-style.
    response_col, true_model_type = {
        'binomial': ("economy_20mpg", "classifier"),
        'gaussian': ("economy", "regressor"),
    }.get(fam, ("cylinders", "regressor"))
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if fam in ['binomial', 'multinomial']:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    # 'family' is forwarded to train() rather than searched over.
    family = grid_space.pop('family')[0]
    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of glm models...")
    print("family = ", family)
    glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=grid_space)
    if scheme == 1:
        glm_grid.train(x=predictors, y=response_col, training_frame=train, family=family)
    elif scheme == 2:
        glm_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds, family=family)
    else:
        glm_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid, family=family)
    for fitted in glm_grid:
        assert isinstance(fitted, H2OGeneralizedLinearEstimator)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    n_expected = 1
    for vals in grid_space.values():
        n_expected = n_expected * len(vals)
    actual_size = len(glm_grid)
    assert n_expected == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                      "".format(n_expected, actual_size)

    print("Check correct type value....")
    model_type = glm_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(model_type, true_model_type)

    print("Duplicate-entries-in-grid-space check")
    doubled_space = copy.deepcopy(grid_space)
    for key in grid_space:
        if key != "family":
            doubled_space[key] = grid_space[key] + grid_space[key]
    print("The new search space: {0}".format(doubled_space))
    print("Constructing the new grid of glm models...")
    glm_grid2 = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=doubled_space)
    if scheme == 1:
        glm_grid2.train(x=predictors, y=response_col, training_frame=train, family=family)
    elif scheme == 2:
        glm_grid2.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds, family=family)
    else:
        glm_grid2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid, family=family)
    actual_size2 = len(glm_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for key in grid_space:
        print(key)
        pyunit_utils.expect_model_param(glm_grid, key, grid_space[key])
def grid_cars_RF():
    """Randomized Random Forest grid search over the cars data with grid checks."""
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(scheme))
    if scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if scheme == 3:
        valid = cars[r <= .2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    grid_space = pyunit_utils.make_random_grid_space(algo="rf", ncols=len(predictors))
    # reduce the magnitude of nbins_cats, run was too long.
    if 'nbins_cats' in grid_space:
        grid_space['nbins_cats'] = random.sample(list(range(2, 200)), random.randint(2, 3))
    print("Grid space: {0}".format(grid_space))

    problem = random.randint(1, 3)
    # Map the drawn problem id to (response column, expected model type).
    response_col, true_model_type = {
        1: ("economy_20mpg", "classifier"),
        2: ("economy", "regressor"),
    }.get(problem, ("cylinders", "classifier"))
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if problem in [1, 3]:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print("Constructing the grid of RF models...")
    rf_grid = H2OGridSearch(H2ORandomForestEstimator, hyper_params=grid_space)
    if scheme == 1:
        rf_grid.train(x=predictors, y=response_col, training_frame=train)
    elif scheme == 2:
        rf_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        rf_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    for fitted in rf_grid:
        assert isinstance(fitted, H2ORandomForestEstimator)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    n_expected = 1
    for vals in grid_space.values():
        n_expected = n_expected * len(vals)
    actual_size = len(rf_grid)
    assert n_expected == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                      "".format(n_expected, actual_size)

    print("Check correct type value....")
    model_type = rf_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(model_type, true_model_type)

    print("Duplicate-entries-in-grid-space check")
    doubled_space = copy.deepcopy(grid_space)
    for key in grid_space:
        if key != "distribution":
            doubled_space[key] = grid_space[key] + grid_space[key]
    print("The new search space: {0}".format(doubled_space))
    print("Constructing the new grid of RF models...")
    rf_grid2 = H2OGridSearch(H2ORandomForestEstimator, hyper_params=doubled_space)
    if scheme == 1:
        rf_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif scheme == 2:
        rf_grid2.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        rf_grid2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    actual_size2 = len(rf_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)
    for fitted in rf_grid2:
        assert isinstance(fitted, H2ORandomForestEstimator)

    print(grid_space)
    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for key in grid_space:
        pyunit_utils.expect_model_param(rf_grid, key, grid_space[key])
def grid_cars_NB():
    """Run a randomized Naive Bayes grid search on the cars data and verify the grid.

    A validation scheme (none / cross-validation / validation frame) and a
    problem type (binomial / multinomial) are drawn at random. Checks grid
    cardinality, duplicate-entry handling, and hyper-parameter use.
    """
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="naiveBayes")
    print("Grid space: {0}".format(grid_space))

    # FIX: random.sample([...], 1) returns a one-element *list*, so the
    # comparison `problem == "binomial"` below was always False and the
    # binomial branch was unreachable; random.choice draws a single string.
    problem = random.choice(["binomial", "multinomial"])
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    print("Converting the response column to a factor...")
    train[response_col] = train[response_col].asfactor()
    if validation_scheme == 3:
        valid[response_col] = valid[response_col].asfactor()

    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of nb models...")
    cars_nb_grid = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    for model in cars_nb_grid:
        assert isinstance(model, H2ONaiveBayesEstimator)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    print(grid_space)
    for v in list(grid_space.values()):
        # A bare scalar counts as a single grid option.
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(cars_nb_grid)
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                              "".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    cars_nb_grid2 = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    actual_size2 = len(cars_nb_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)
    for model in cars_nb_grid2:
        assert isinstance(model, H2ONaiveBayesEstimator)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(cars_nb_grid, name, grid_space[name])
def grid_cars_NB():
    """Run a randomized Naive Bayes grid search on the cars data and verify the grid.

    A validation scheme (none / cross-validation / validation frame) and a
    problem type (binomial / multinomial) are drawn at random. Checks grid
    cardinality, duplicate-entry handling, and hyper-parameter use.
    """
    # FIX 1: this variant used Python-2 `print` statements, which are syntax
    # errors under Python 3; converted to print() calls and py3-safe dict
    # iteration, matching the other tests in the file.
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="naiveBayes")
    print("Grid space: {0}".format(grid_space))

    # FIX 2: random.sample([...], 1) returns a one-element *list*, so the
    # comparison `problem == "binomial"` below was always False and the
    # binomial branch was unreachable; random.choice draws a single string.
    problem = random.choice(["binomial", "multinomial"])
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    print("Converting the response column to a factor...")
    train[response_col] = train[response_col].asfactor()
    if validation_scheme == 3:
        valid[response_col] = valid[response_col].asfactor()

    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of nb models...")
    cars_nb_grid = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    print(grid_space)
    for v in list(grid_space.values()):
        # A bare scalar counts as a single grid option.
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(cars_nb_grid)
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                              "".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    cars_nb_grid2 = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    actual_size2 = len(cars_nb_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(cars_nb_grid, name, grid_space[name])