def benign_grid():
    """Grid-search a binomial GLM over the benign dataset and sanity-check it.

    Trains a GLM grid, sorts it by F1, retrieves and predicts with the best
    model, then re-fetches the same grid by id and checks it is still
    addressable.  The 'a' alpha value deliberately exercises bad
    hyper-parameter handling.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # Python 3: range() is lazy and does not support `+`, so build explicit
    # lists (the original `range(3) + range(4, 11)` is Python-2-only).
    X = list(range(3)) + list(range(4, 11))

    # NOTE: 'a' is not a float; this tests bad parameter value handling.
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    # Re-fetch the grid by id and confirm it round-trips.
    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters,
        gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))

    assert best_model.params['family']['actual'] == 'binomial'
def benign_grid():
  """Grid-search a binomial GLM over the benign dataset (2-space variant).

  Same flow as the other benign_grid copies: train a GLM grid, sort by F1,
  predict with the best model, and re-fetch the grid by id.
  """
  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

  Y = 3
  # Python 3 compatibility: range() objects cannot be concatenated with `+`,
  # so materialize them as lists first.
  X = list(range(3)) + list(range(4, 11))

  # NOTE: 'a' is not a float; this tests bad parameter value handling.
  hyper_parameters = {'alpha': [0.01,0.5,'a'], 'lambda': [1e-5,1e-6]}
  gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
  gs.train(x=X,y=Y, training_frame=training_data)
  gs.show()
  print(gs.sort_by('F1', False))
  best_model_id = gs.sort_by('F1', False)['Model Id'][0]
  best_model = h2o.get_model(best_model_id)
  best_model.predict(training_data)
  gs.predict(training_data)
  print(gs.get_hyperparams(best_model_id))
  print(gs.grid_id)

  # Re-fetch the grid by id and confirm it round-trips.
  new_g = H2OGridSearch.get_grid(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
  new_g.show()
  print(new_g.grid_id)
  print(new_g.sort_by('F1', False))

  assert best_model.params['family']['actual'] == 'binomial'
# Example #3
# 0
def iris_gbm_grid():
  """Grid-search a GBM over the iris dataset and validate the resulting grid.

  Checks that the grid covers the full Cartesian hyper-parameter space exactly
  once per model, and that back-end (remote) sorting by r2 matches locally
  sorted results in both descending and ascending order.
  """
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # Hyper-parameter space for the GBM grid (insertion order matters for the
  # Cartesian-product check below).
  learn_rate_opts = [0.1, 0.01, .05]
  ntrees_opts = [1, 3]
  hyper_parameters = OrderedDict([("learn_rate", learn_rate_opts),
                                  ("ntrees", ntrees_opts)])
  size_of_hyper_space = len(ntrees_opts) * len(learn_rate_opts)
  print("GBM grid with the following hyper_parameters:", hyper_parameters)

  gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
  gs.train(x=list(range(4)), y=4, training_frame=train)
  print("\nsorted by mse: ")
  print(gs.sort_by("mse"))

  # Every member of the grid must be a GBM model.
  for member in gs:
    assert isinstance(member, H2OGradientBoostingEstimator)

  assert len(gs) == size_of_hyper_space

  # Each trained model must consume exactly one combo of the full space.
  total_grid_space = [list(combo) for combo in itertools.product(*hyper_parameters.values())]
  print(str(total_grid_space))
  for member in gs.models:
    combo = [member.parms['learn_rate']['actual_value'], member.parms['ntrees']['actual_value']]
    assert combo in total_grid_space, "combo: " + str(combo) + "; total_grid_space=" + str(total_grid_space)
    total_grid_space.remove(combo)

  # test back-end sorting of model metrics:
  locally_sorted = gs.sort_by("r2", H2OGridSearch.DESC)
  remotely_sorted_desc = H2OGridSearch.get_grid(
      H2OGradientBoostingEstimator(distribution='multinomial'),
      hyper_parameters, gs.grid_id, sort_by='r2', sort_order='desc')

  assert len(locally_sorted.cell_values) == len(remotely_sorted_desc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
  for i, model_id in enumerate(remotely_sorted_desc.model_ids):
    assert locally_sorted.cell_values[i][0] == model_id, "Expected back-end sort by r2 to be the same as locally-sorted: " + str(i)

  remotely_sorted_asc = H2OGridSearch.get_grid(
      H2OGradientBoostingEstimator(distribution='multinomial'),
      hyper_parameters, gs.grid_id, sort_by='r2', sort_order='asc')
  for member in remotely_sorted_asc:
    assert isinstance(member, H2OGradientBoostingEstimator)

  assert len(locally_sorted.cell_values) == len(remotely_sorted_asc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
  length = len(remotely_sorted_asc.model_ids)
  # Ascending remote order must be the exact reverse of local descending order.
  for i in range(length):
    assert locally_sorted.cell_values[i][0] == remotely_sorted_asc.model_ids[length - i - 1], "Expected back-end sort by r2, ascending, to be the reverse as locally-sorted ascending: " + str(i)
# Example #4
# 0
def benign_grid():
    """Grid-search a binomial GLM on the benign dataset.

    Verifies grid iteration, F1-based sorting, best-model retrieval and
    prediction, re-fetching the grid by id, and that RandomDiscrete
    search_criteria with max_models=3 trains exactly 3 models.
    """
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    response = 3
    predictors = list(range(3)) + list(range(4, 11))

    # NOTE: this tests bad parameter value handling; 'a' is not a float:
    params = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                         params)
    grid.train(x=predictors, y=response, training_frame=frame)

    # Every member of the grid must be a GLM.
    for member in grid:
        assert isinstance(member, H2OGeneralizedLinearEstimator)

    grid.show()
    print(grid.sort_by('F1', False))
    best_id = grid.sort_by('F1', False)['Model Id'][0]
    best = h2o.get_model(best_id)
    best.predict(frame)
    grid.predict(frame)
    print(grid.get_hyperparams(best_id))
    print(grid.grid_id)

    # Re-fetch the grid by id; it must still be addressable.
    refetched = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'), params,
        grid.grid_id)
    refetched.show()
    print(refetched.grid_id)
    print(refetched.sort_by('F1', False))

    assert best.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing
    criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    capped = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial'),
        params,
        search_criteria=criteria)
    capped.train(x=predictors, y=response, training_frame=frame)

    capped.show()
    print(capped.grid_id)
    print(capped.sort_by('F1', False))

    assert len(capped.models) == 3, "expected 3 models, got: {}".format(
        len(capped.models))
def benign_grid():
  """Grid-search a binomial GLM on the benign dataset (2-space variant).

  Trains a GLM grid, sorts by F1, predicts with the best model, re-fetches
  the grid by id, then verifies a random search capped at max_models=3
  trains exactly 3 models.
  """
  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

  Y = 3
  X = list(range(3)) + list(range(4,11))

  # NOTE: this tests bad parameter value handling; 'a' is not a float:
  hyper_parameters = {'alpha': [0.01,0.5,'a'], 'lambda': [1e-5,1e-6]}
  gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
  gs.train(x=X,y=Y, training_frame=training_data)
  for model in gs:
    assert isinstance(model, H2OGeneralizedLinearEstimator)
  gs.show()
  print(gs.sort_by('F1', False))
  best_model_id = gs.sort_by('F1', False)['Model Id'][0]
  best_model = h2o.get_model(best_model_id)
  best_model.predict(training_data)
  gs.predict(training_data)
  print(gs.get_hyperparams(best_model_id))
  print(gs.grid_id)

  new_g = H2OGridSearch.get_grid(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
  new_g.show()
  print(new_g.grid_id)
  print(new_g.sort_by('F1', False))

  assert best_model.params['family']['actual'] == 'binomial'

  # test search_criteria plumbing.  H2O's random strategy is spelled
  # "RandomDiscrete" ("Random" is not a recognized strategy name).
  search_criteria = { 'strategy': "RandomDiscrete", 'max_models': 3 }
  max_models_g = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, search_criteria=search_criteria)
  max_models_g.train(x=X,y=Y, training_frame=training_data)

  max_models_g.show()
  print(max_models_g.grid_id)
  print(max_models_g.sort_by('F1', False))

  assert len(max_models_g.models) == 3, "expected 3 models, got: {}".format(len(max_models_g.models))