# Imports shared by the grid-search examples below. The module paths assume the
# h2o-3 test layout (pyunit_utils lives in the repo's tests package); exact
# locations may differ slightly between h2o versions.
import sys
sys.path.insert(1, "../../../")  # make "from tests import pyunit_utils" resolve in the h2o-3 repo layout

import copy
import random

import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.kmeans import H2OKMeansEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
from h2o.transforms.decomposition import H2OPCA  # location of H2OPCA varies across h2o versions
from tests import pyunit_utils


def kmeans_grid_iris():

  iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
  grid_space = pyunit_utils.make_random_grid_space(algo="km")
  print("Grid space: {0}".format(grid_space))
  print("Constructing grid of Kmeans models")
  iris_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
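  # k-means is unsupervised: only the four iris measurement columns (indices 0-3) are passed, with no response column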
  iris_grid.train(x=list(range(4)), training_frame=iris_h2o)

  print("Check cardinality of grid, that is, the correct number of models have been created...")
  size_of_grid_space = 1
  for v in list(grid_space.values()):
      size_of_grid_space = size_of_grid_space * len(v)
  actual_size = len(iris_grid)
  assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                             "".format(size_of_grid_space,actual_size)

  print("Duplicate-entries-in-grid-space check")
  new_grid_space = copy.deepcopy(grid_space)
  for name in list(grid_space.keys()):
      new_grid_space[name] = grid_space[name] + grid_space[name]
  print("The new search space: {0}".format(new_grid_space))
  print("Constructing the new grid of glm models...")
  iris_grid2 = H2OGridSearch(H2OKMeansEstimator, hyper_params=new_grid_space)
  iris_grid2.train(x=list(range(4)), training_frame=iris_h2o)
  actual_size2 = len(iris_grid2)
  assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                      "size: {1}".format(actual_size, actual_size2)

  print("Check that the hyper_params that were passed to grid, were used to construct the models...")
  for name in list(grid_space.keys()):
      print(name)
      pyunit_utils.expect_model_param(iris_grid, name, grid_space[name])
def grid_cars_GBM():

    cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1,3) # 1:none, 2:cross-validation, 3:validation set
    print "Validation scheme: {0}".format(validation_scheme)
    if validation_scheme == 2:
        nfolds = 2
        print "Nfolds: 2"
    if validation_scheme == 3: valid = cars[r <= .2]
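    # nfolds and valid are only defined under their respective schemes; the train() calls below branch on validation_scheme accordingly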

    grid_space = pyunit_utils.make_random_grid_space(algo="gbm")
    print "Grid space: {0}".format(grid_space)

    predictors = ["displacement","power","weight","acceleration","year"]
    if grid_space['distribution'][0] == 'bernoulli': response_col = "economy_20mpg"
    elif grid_space['distribution'][0] == 'gaussian': response_col = "economy"
    else: response_col = "cylinders"

    print "Predictors: {0}".format(predictors)
    print "Response: {0}".format(response_col)

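    # bernoulli and multinomial are classification distributions, so H2O needs the response encoded as a factor (categorical) column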
    if grid_space['distribution'][0] in ['bernoulli', 'multinomial']:
        print "Converting the response column to a factor..."
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3: valid[response_col] = valid[response_col].asfactor()

    print "Constructing the grid of gbm models..."
    cars_gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=grid_space)
    if validation_scheme == 1: cars_gbm_grid.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2: cars_gbm_grid.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else: cars_gbm_grid.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    print "Performing various checks of the constructed grid..."

    print "Check cardinality of grid, that is, the correct number of models have been created..."
    size_of_grid_space = 1
    for v in grid_space.values(): size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_gbm_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print "Duplicate-entries-in-grid-space check"
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        if not name == "distribution": new_grid_space[name] = grid_space[name] + grid_space[name]
    print "The new search space: {0}".format(new_grid_space)
    print "Constructing the new grid of gbm models..."
    cars_gbm_grid2 = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1: cars_gbm_grid2.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2: cars_gbm_grid2.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else: cars_gbm_grid2.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)
    actual_size2 = len(cars_gbm_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print "Check that the hyper_params that were passed to grid, were used to construct the models..."
    for name in grid_space.keys(): pyunit_utils.expect_model_param(cars_gbm_grid, name, grid_space[name])
def kmeans_grid_iris():

    iris_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    grid_space = pyunit_utils.make_random_grid_space(algo="km")
    print "Grid space: {0}".format(grid_space)
    print "Constructing grid of Kmeans models"
    iris_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
    iris_grid.train(x=range(4), training_frame=iris_h2o)

    print "Check cardinality of grid, that is, the correct number of models have been created..."
    size_of_grid_space = 1
    for v in grid_space.values():
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(iris_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print "Duplicate-entries-in-grid-space check"
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print "The new search space: {0}".format(new_grid_space)
    print "Constructing the new grid of glm models..."
    iris_grid2 = H2OGridSearch(H2OKMeansEstimator, hyper_params=new_grid_space)
    iris_grid2.train(x=range(4), training_frame=iris_h2o)
    actual_size2 = len(iris_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print "Check that the hyper_params that were passed to grid, were used to construct the models..."
    for name in grid_space.keys():
        print name
        pyunit_utils.expect_model_param(iris_grid, name, grid_space[name])
def grid_quasar_pca():

    quasar = h2o.import_file(
        path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"),
        header=1)
    grid_space = pyunit_utils.make_random_grid_space(algo="pca",
                                                     ncols=quasar.ncol,
                                                     nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    quasar_pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
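    # column 0 is left out of the feature set (presumably an ID column); columns 1-22 are the numeric inputs to PCA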
    quasar_pca_grid.train(x=list(range(1, 23)), training_frame=quasar)

    for model in quasar_pca_grid:
        assert isinstance(model, H2OPCA)

    print("Performing various checks of the constructed grid...")

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
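    # a hyperparameter may map to a single scalar rather than a list; wrap it so it counts as one choice in the product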
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(quasar_pca_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    quasar_pca_grid2 = H2OGridSearch(H2OPCA, hyper_params=new_grid_space)
    quasar_pca_grid2.train(x=list(range(1, 23)), training_frame=quasar)
    actual_size2 = len(quasar_pca_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    for model in quasar_pca_grid2:
        assert isinstance(model, H2OPCA)

    print(
        "Check that the hyper_params that were passed to grid, were used to construct the models..."
    )
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(quasar_pca_grid, name,
                                        grid_space[name])
def grid_cars_NB():

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1,3) # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="naiveBayes")
    print("Grid space: {0}".format(grid_space))

    problem = random.sample(["binomial","multinomial"],1)[0]  # [0] so the comparisons below see the string, not a one-element list
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    print("Converting the response column to a factor...")
    train[response_col] = train[response_col].asfactor()
    if validation_scheme == 3:
        valid[response_col] = valid[response_col].asfactor()

    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of nb models...")

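    # compute_metrics is pinned to a single value (False) for every model in this grid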
    grid_space["compute_metrics"] = [False]

    cars_nb_grid = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_nb_grid.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else:
        cars_nb_grid.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)
def grid_quasar_pca():

    quasar = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"), header=1)
    grid_space = pyunit_utils.make_random_grid_space(algo="pca", ncols=quasar.ncol, nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    quasar_pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
    quasar_pca_grid.train(x=list(range(1,23)), training_frame=quasar)

    for model in quasar_pca_grid:
      assert isinstance(model, H2OPCA)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(quasar_pca_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    quasar_pca_grid2 = H2OGridSearch(H2OPCA, hyper_params=new_grid_space)
    quasar_pca_grid2.train(x=list(range(1,23)), training_frame=quasar)
    actual_size2 = len(quasar_pca_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    for model in quasar_pca_grid2:
      assert isinstance(model, H2OPCA)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(quasar_pca_grid, name, grid_space[name])
def grid_cars_RF():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(
        1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    grid_space = pyunit_utils.make_random_grid_space(algo="rf",
                                                     ncols=len(predictors))
    print("Grid space: {0}".format(grid_space))

    problem = random.randint(1, 3)
    if problem == 1:
        response_col = "economy_20mpg"
    elif problem == 2:
        response_col = "economy"
    else:
        response_col = "cylinders"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if problem in [1, 3]:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print("Constructing the grid of RF models...")
    cars_rf_grid = H2OGridSearch(H2ORandomForestEstimator,
                                 hyper_params=grid_space)
    if validation_scheme == 1:
        cars_rf_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_rf_grid.train(x=predictors,
                           y=response_col,
                           training_frame=train,
                           nfolds=nfolds)
    else:
        cars_rf_grid.train(x=predictors,
                           y=response_col,
                           training_frame=train,
                           validation_frame=valid)

    for model in cars_rf_grid:
        assert isinstance(model, H2ORandomForestEstimator)

    print("Performing various checks of the constructed grid...")

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_rf_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        if not name == "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of RF models...")
    cars_rf_grid2 = H2OGridSearch(H2ORandomForestEstimator,
                                  hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_rf_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_rf_grid2.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            nfolds=nfolds)
    else:
        cars_rf_grid2.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            validation_frame=valid)
    actual_size2 = len(cars_rf_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    for model in cars_rf_grid2:
        assert isinstance(model, H2ORandomForestEstimator)

    print(grid_space)
    print(
        "Check that the hyper_params that were passed to grid, were used to construct the models..."
    )
    for name in list(grid_space.keys()):
        pyunit_utils.expect_model_param(cars_rf_grid, name, grid_space[name])
def grid_cars_GLM():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(
        1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="glm")
    print("Grid space: {0}".format(grid_space))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if grid_space['family'][0] == 'binomial':
        response_col = "economy_20mpg"
        true_model_type = "classifier"
    elif grid_space['family'][0] == 'gaussian':
        response_col = "economy"
        true_model_type = "regressor"
    else:
        response_col = "cylinders"
        true_model_type = "regressor"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if grid_space['family'][0] in ['binomial', 'multinomial']:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    #grid_space.update({"lambda":[0.1,0.05,0.01]})
    family = grid_space.pop('family')[0]
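    # family is pulled out of the search space and passed to train() as a fixed argument, since the response column was already chosen to match it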
    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of glm models...")
    print("family = ", family)
    cars_glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator,
                                  hyper_params=grid_space)
    if validation_scheme == 1:
        cars_glm_grid.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            family=family)
    elif validation_scheme == 2:
        cars_glm_grid.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            nfolds=nfolds,
                            family=family)
    else:
        cars_glm_grid.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            validation_frame=valid,
                            family=family)

    for model in cars_glm_grid:
        assert isinstance(model, H2OGeneralizedLinearEstimator)

    print("Performing various checks of the constructed grid...")

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_glm_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Check correct type value....")
    model_type = cars_glm_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(
        model_type, true_model_type)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        if not name == "family":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of glm models...")
    cars_glm_grid2 = H2OGridSearch(H2OGeneralizedLinearEstimator,
                                   hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_glm_grid2.train(x=predictors,
                             y=response_col,
                             training_frame=train,
                             family=family)
    elif validation_scheme == 2:
        cars_glm_grid2.train(x=predictors,
                             y=response_col,
                             training_frame=train,
                             nfolds=nfolds,
                             family=family)
    else:
        cars_glm_grid2.train(x=predictors,
                             y=response_col,
                             training_frame=train,
                             validation_frame=valid,
                             family=family)
    actual_size2 = len(cars_glm_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print(
        "Check that the hyper_params that were passed to grid, were used to construct the models..."
    )
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(cars_glm_grid, name, grid_space[name])
def grid_cars_GLM():

    cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1,3) # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="glm")
    print("Grid space: {0}".format(grid_space))

    predictors = ["displacement","power","weight","acceleration","year"]
    if grid_space['family'][0] == 'binomial':
        response_col = "economy_20mpg"
        true_model_type = "classifier"
    elif grid_space['family'][0] == 'gaussian':
        response_col = "economy"
        true_model_type = "regressor"
    else:
        response_col = "cylinders"
        true_model_type = "regressor"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if grid_space['family'][0] in ['binomial', 'multinomial']:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    #grid_space.update({"lambda":[0.1,0.05,0.01]})
    family = grid_space.pop('family')[0]
    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of glm models...")
    print("family = ",family)
    cars_glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_glm_grid.train(x=predictors,y=response_col,training_frame=train, family=family)
    elif validation_scheme == 2:
        cars_glm_grid.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds, family=family)
    else:
        cars_glm_grid.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid, family=family)

    for model in cars_glm_grid:
      assert isinstance(model, H2OGeneralizedLinearEstimator)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_glm_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Check correct type value....")
    model_type = cars_glm_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(model_type, true_model_type)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        if not name == "family":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of glm models...")
    cars_glm_grid2 = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_glm_grid2.train(x=predictors,y=response_col,training_frame=train, family=family)
    elif validation_scheme == 2:
        cars_glm_grid2.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds, family=family)
    else:
        cars_glm_grid2.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid, family=family)
    actual_size2 = len(cars_glm_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(cars_glm_grid, name, grid_space[name])
def grid_cars_RF():

    cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1,3) # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    predictors = ["displacement","power","weight","acceleration","year"]
    grid_space = pyunit_utils.make_random_grid_space(algo="rf", ncols=len(predictors))

    # reduce the magnitude of nbins_cats, run was too long.
    if 'nbins_cats' in list(grid_space):
        grid_space['nbins_cats'] = random.sample(list(range(2, 200)), random.randint(2, 3))

    print("Grid space: {0}".format(grid_space))

    problem = random.randint(1,3)
    if problem == 1:
        response_col = "economy_20mpg"
        true_model_type = "classifier"
    elif problem == 2:
        response_col = "economy"
        true_model_type = "regressor"
    else:
        response_col = "cylinders"
        true_model_type = "classifier"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if problem in [1,3]:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print("Constructing the grid of RF models...")
    cars_rf_grid = H2OGridSearch(H2ORandomForestEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_rf_grid.train(x=predictors,y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_rf_grid.train(x=predictors,y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_rf_grid.train(x=predictors,y=response_col, training_frame=train, validation_frame=valid)

    for model in cars_rf_grid:
      assert isinstance(model, H2ORandomForestEstimator)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_rf_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Check correct type value....")
    model_type = cars_rf_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(model_type, true_model_type)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        if not name == "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of RF models...")
    cars_rf_grid2 = H2OGridSearch(H2ORandomForestEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_rf_grid2.train(x=predictors,y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_rf_grid2.train(x=predictors,y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_rf_grid2.train(x=predictors,y=response_col, training_frame=train, validation_frame=valid)
    actual_size2 = len(cars_rf_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    for model in cars_rf_grid2:
      assert isinstance(model, H2ORandomForestEstimator)

    print(grid_space)
    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        pyunit_utils.expect_model_param(cars_rf_grid, name, grid_space[name])
def grid_cars_NB():

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1,3) # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="naiveBayes")
    print("Grid space: {0}".format(grid_space))

    problem = random.sample(["binomial","multinomial"],1)[0]  # [0] so the comparisons below see the string, not a one-element list
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

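    # H2O naive Bayes is strictly a classifier, so the response is always converted to a categorical (factor) column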
    print("Converting the response column to a factor...")
    train[response_col] = train[response_col].asfactor()
    if validation_scheme == 3:
        valid[response_col] = valid[response_col].asfactor()

    print("Grid space: {0}".format(grid_space))
    print("Constructing the grid of nb models...")
    cars_nb_grid = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_nb_grid.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else:
        cars_nb_grid.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    for model in cars_nb_grid:
      assert isinstance(model, H2ONaiveBayesEstimator)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    print(grid_space)
    for v in list(grid_space.values()):
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(cars_nb_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    cars_nb_grid2 = H2OGridSearch(H2ONaiveBayesEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_nb_grid2.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid2.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else:
        cars_nb_grid2.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)
    actual_size2 = len(cars_nb_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    for model in cars_nb_grid2:
      assert isinstance(model, H2ONaiveBayesEstimator)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(cars_nb_grid, name, grid_space[name])
def grid_cars_NB():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(
        1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print "Validation scheme: {0}".format(validation_scheme)
    if validation_scheme == 2:
        nfolds = 2
        print "Nfolds: 2"
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="naiveBayes")
    print "Grid space: {0}".format(grid_space)

    problem = random.sample(["binomial", "multinomial"], 1)
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"

    print "Predictors: {0}".format(predictors)
    print "Response: {0}".format(response_col)

    print "Converting the response column to a factor..."
    train[response_col] = train[response_col].asfactor()
    if validation_scheme == 3:
        valid[response_col] = valid[response_col].asfactor()

    print "Grid space: {0}".format(grid_space)
    print "Constructing the grid of nb models..."
    cars_nb_grid = H2OGridSearch(H2ONaiveBayesEstimator,
                                 hyper_params=grid_space)
    if validation_scheme == 1:
        cars_nb_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid.train(x=predictors,
                           y=response_col,
                           training_frame=train,
                           nfolds=nfolds)
    else:
        cars_nb_grid.train(x=predictors,
                           y=response_col,
                           training_frame=train,
                           validation_frame=valid)

    print "Performing various checks of the constructed grid..."

    print "Check cardinality of grid, that is, the correct number of models have been created..."
    size_of_grid_space = 1
    print grid_space
    for v in grid_space.values():
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(cars_nb_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print "Duplicate-entries-in-grid-space check"
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print "The new search space: {0}".format(new_grid_space)
    print "Constructing the new grid of nb models..."
    cars_nb_grid2 = H2OGridSearch(H2ONaiveBayesEstimator,
                                  hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_nb_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_nb_grid2.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            nfolds=nfolds)
    else:
        cars_nb_grid2.train(x=predictors,
                            y=response_col,
                            training_frame=train,
                            validation_frame=valid)
    actual_size2 = len(cars_nb_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print "Check that the hyper_params that were passed to grid, were used to construct the models..."
    for name in grid_space.keys():
        print name
        pyunit_utils.expect_model_param(cars_nb_grid, name, grid_space[name])
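

# A minimal sketch of how one of these functions is typically driven, assuming the
# h2o-3 pyunit harness: pyunit_utils.standalone_test starts a local H2O cluster and
# then invokes the test when the file is executed directly.
if __name__ == "__main__":
    pyunit_utils.standalone_test(grid_cars_NB)
else:
    grid_cars_NB()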