Code Example #1
def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.05]
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)
    
    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    # Modify the hyperspace - should add new models to the grid
    hyper_parameters["ntrees"] = [2,5]
    grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, grid_id = grid.grid_id)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly grained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == 2 * old_grid_model_count
    
    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None
Code Example #2
def kmeans_grid_iris():

    iris_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    grid_space = pyunit_utils.make_random_grid_space(algo="km")
    print "Grid space: {0}".format(grid_space)
    print "Constructing grid of Kmeans models"
    iris_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
    iris_grid.train(x=list(range(4)), training_frame=iris_h2o)

    print "Check cardinality of grid, that is, the correct number of models have been created..."
    size_of_grid_space = 1
    for v in grid_space.values():
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(iris_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)

    print "Duplicate-entries-in-grid-space check"
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print "The new search space: {0}".format(new_grid_space)
    print "Constructing the new grid of glm models..."
    iris_grid2 = H2OGridSearch(H2OKMeansEstimator, hyper_params=new_grid_space)
    iris_grid2.train(x=range(4), training_frame=iris_h2o)
    actual_size2 = len(iris_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print "Check that the hyper_params that were passed to grid, were used to construct the models..."
    for name in grid_space.keys():
        print name
        pyunit_utils.expect_model_param(iris_grid, name, grid_space[name])
Code Example #3
def benign_grid():
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = list(range(3)) + list(range(4, 11))

    # NOTE: 'a' is deliberately not a float; it exercises bad-parameter handling (see Code Example #33)
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters,
        gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))

    assert best_model.params['family']['actual'] == 'binomial'
Code Example #4
def test_gridsearch():
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    h2o_data['response'] = h2o_data['response'].asfactor()
    h2o_data['C3'] = h2o_data['C3'].asfactor()
    h2o_data['C7'] = h2o_data['C7'].asfactor()
    h2o_data['C8'] = h2o_data['C8'].asfactor()
    h2o_data['C10'] = h2o_data['C10'].asfactor()
    names = h2o_data.names
    myY = "response"
    myX = [name for name in names if name != myY]  # list.remove() returns None, so build the predictor list explicitly
    search_criteria = {'strategy': 'Cartesian'}
    hyper_parameters = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    hyper_parameters2 = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True),
                              hyper_params=hyper_parameters,
                              search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True),
                               hyper_params=hyper_parameters2,
                               search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)
    # compare two models by checking their coefficients.  They should be the same
    for index in range(0, len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(),
                                           model2.coef(),
                                           tol=1e-6)
Code Example #5
def grid_quasar_pca():

    quasar = h2o.import_file(
        path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"),
        header=1)
    grid_space = pyunit_utils.make_random_grid_space(algo="pca",
                                                     ncols=quasar.ncol,
                                                     nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    quasar_pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
    quasar_pca_grid.train(x=list(range(1, 23)), training_frame=quasar)

    for model in quasar_pca_grid:
        assert isinstance(model, H2OPCA)

    print("Performing various checks of the constructed grid...")

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        v2 = [v] if type(v) != list else v
        size_of_grid_space = size_of_grid_space * len(v2)
    actual_size = len(quasar_pca_grid)
    assert size_of_grid_space == actual_size, \
        "Expected size of grid to be {0}, but got {1}".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in list(grid_space.keys()):
        new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of nb models...")
    quasar_pca_grid2 = H2OGridSearch(H2OPCA, hyper_params=new_grid_space)
    quasar_pca_grid2.train(x=list(range(1, 23)), training_frame=quasar)
    actual_size2 = len(quasar_pca_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    for model in quasar_pca_grid2:
        assert isinstance(model, H2OPCA)

    print(
        "Check that the hyper_params that were passed to grid, were used to construct the models..."
    )
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(quasar_pca_grid, name,
                                        grid_space[name])
Code Example #6
def test_frame_reload():
    work_dir = tempfile.mkdtemp()
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    df_key = iris.key
    df_pd_orig = iris.as_data_frame()
    iris.save(work_dir)
    try:
        iris.save(work_dir, force=False)  # fails because file exists
    except H2OResponseError as e:
        assert e.args[0].exception_msg.startswith("File already exists")
    try:
        h2o.load_frame(df_key, work_dir,
                       force=False)  # fails because frame exists
    except H2OResponseError as e:
        assert e.args[0].exception_msg == "Frame Key<Frame> iris_wheader.hex already exists."
    df_loaded_force = h2o.load_frame(df_key, work_dir)
    h2o.remove(iris)
    df_loaded = h2o.load_frame(df_key, work_dir, force=False)
    df_pd_loaded_force = df_loaded_force.as_data_frame()
    df_pd_loaded = df_loaded.as_data_frame()
    assert df_pd_orig.equals(df_pd_loaded_force)
    assert df_pd_orig.equals(df_pd_loaded)

    # try running grid search on the frame
    h2o.remove_all()
    df_loaded = h2o.load_frame(df_key, work_dir)
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [5, 10, 20, 30]
    grid_small = H2OGridSearch(H2OGradientBoostingEstimator,
                               hyper_params=hyper_parameters)
    grid_small.train(x=list(range(4)), y=4, training_frame=df_loaded)
    assert len(grid_small.models) == 4
Code Example #7
def iris_dl_grid():
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # Run DL

  hidden_opts = [[20, 20], [50, 50, 50]]
  loss_opts = ["Quadratic", "CrossEntropy"]
  size_of_hyper_space = len(hidden_opts) * len(loss_opts)
  hyper_parameters = OrderedDict()
  hyper_parameters["loss"] = loss_opts
  hyper_parameters["hidden"] = hidden_opts
  print("DL grid with the following hyper_parameters:", hyper_parameters)

  gs = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters, grid_id="mygrid")
  gs.train(x=list(range(4)), y=4, training_frame=train)
  print(gs.get_grid(sort_by="mse"))

  for model in gs:
    assert isinstance(model, H2ODeepLearningEstimator)

  assert len(gs) == size_of_hyper_space
  total_grid_space = list(map(list, itertools.product(*list(hyper_parameters.values()))))
  for model in gs.models:
    combo = [model.parms['loss']['actual_value']] + [model.parms['hidden']['actual_value']]
    assert combo in total_grid_space
    total_grid_space.remove(combo)

  print("Check correct type value....")
  model_type = gs[0].type
  true_model_type = "classifier"
  assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}.".format(model_type, true_model_type)
Code Example #8
def train_grid():
    # Import a sample binary outcome dataset into H2O
    data = h2o.import_file(pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"))
    
    # Identify predictors and response
    x = data.columns
    y = "response"
    x.remove(y)
    # For binary classification, response should be a factor
    data[y] = data[y].asfactor()
    test[y] = test[y].asfactor()
    # Split data into train & validation
    ss = data.split_frame(seed=1)
    train = ss[0]
    valid = ss[1]
    # GBM hyperparameters
    gbm_params1 = {'learn_rate': [0.01],
                   'max_depth': [3],
                   'sample_rate': [0.8],
                   'col_sample_rate': [0.2, 0.5, 1.0]}
    # Train and validate a cartesian grid of GBMs
    gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator,
                              grid_id='gbm_grid1',
                              hyper_params=gbm_params1)
    gbm_grid1.train(x=x, y=y,
                    training_frame=train,
                    validation_frame=valid,
                    ntrees=100,
                    seed=1)
    return gbm_grid1
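A short follow-up sketch, not part of the original example: since train_grid() returns the H2OGridSearch object, the models can be ranked by validation AUC with get_grid() and the best one pulled out.

grid = train_grid()
# Rank models by validation AUC, best model first (higher AUC is better)
sorted_grid = grid.get_grid(sort_by='auc', decreasing=True)
best_gbm = sorted_grid.models[0]
print(best_gbm.auc(valid=True))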
Code Example #9
def test_grid_search():
    '''This function tests whether passing an unknown argument to H2O GridSearch
        with XGBoostEstimator can crash the H2O instance
    '''
    assert H2OXGBoostEstimator.available(), 'H2O XGBoost is not available! Please check machine env!'

    data = init_data()
    # The `col_sample_rate_change_per_level` parameter can be set in other estimators, but NOT in XGBoost,
    # so it should be an unknown parameter for XGBoost
    hyper_parameters = {
        'ntrees': 1,
        'seed': 1,
        'col_sample_rate_change_per_level': [.9, .3, .2, .4]
    }
    raised = False
    try:
        grid_search = H2OGridSearch(H2OXGBoostEstimator, hyper_params=hyper_parameters)
        grid_search.train(
            x=data['predictors'],
            y=data['response'],
            training_frame=data['train'],
            validation_frame=data['test']
        )
    except H2OResponseError:
        raised = True

    assert raised is True, \
        'H2O should throw an exception if unknown parameter is passed to GridSearch with XGBoostEstimator!'
Code Example #10
def grid_search_eif():
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))

    grid_space = {
        'sample_size': random.sample(list(range(128, 256)),
                                     random.randint(2, 3)),
        'extension_level': [0, 1]
    }
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of IF models...")
    eif_grid = H2OGridSearch(H2OExtendedIsolationForestEstimator,
                             hyper_params=grid_space)
    eif_grid.train(training_frame=train)

    print("Check correct type value....")
    model_type = eif_grid[0].type
    assert model_type == 'unsupervised', "Type of model ({0}) is incorrect, expected value is 'unsupervised'.".format(
        model_type)

    print("Performing various checks of the constructed grid...")

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
    size_of_grid_space = 1
    for v in list(grid_space.values()):
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(eif_grid)

    print("Expected size of grid space: {0}".format(size_of_grid_space))
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}".format(
        size_of_grid_space, actual_size)
    print(eif_grid)
Code Example #11
File: test_grid_reload.py Project: sergeiten/h2o-3
    def test_frame_reload(self):
        name_node = pyunit_utils.hadoop_namenode()
        work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_resume"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters,
                                 recovery_dir=work_dir)
            print("starting initial grid and sleeping...")
            grid.start(x=list(range(4)), y=4, training_frame=train)
            grid_in_progress = None
            times_waited = 0
            while (times_waited < 20) and (grid_in_progress is None or
                                           len(grid_in_progress.model_ids) == 0):
                time.sleep(5)  # give it time to train some models
                times_waited += 1
                try:
                    grid_in_progress = h2o.get_grid(grid_id)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name)
            h2o.connect(url=cluster_2)
            loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id),
                                   load_params_references=True)
            print("models after first run:")
            for x in sorted(loaded.model_ids):
                print(x)
            loaded.resume()
            print("models after second run:")
            for x in sorted(loaded.model_ids):
                print(x)
            print("Newly grained grid has %d models" % len(loaded.model_ids))
            self.assertEqual(len(loaded.model_ids), grid_size,
                             "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_2_name)
Code Example #12
def h2oget_grid():
    """
    Python API test: h2o.get_grid(grid_id)

    Copy from pyunit_gbm_random_grid.py
    """
    try:
        air_hex = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"), destination_frame="air.hex")
        myX = ["DayofMonth","DayOfWeek"]

        hyper_parameters = {
            'learn_rate':[0.1,0.2],
            'max_depth':[2,3],
            'ntrees':[5,10]
        }

        search_crit = {'strategy': "RandomDiscrete",
                       'max_models': 5,
                       'seed' : 1234,
                       'stopping_rounds' : 3,
                       'stopping_metric' : "AUTO",
                       'stopping_tolerance': 1e-2
                       }

        air_grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, search_criteria=search_crit)
        air_grid.train(x=myX, y="IsDepDelayed", training_frame=air_hex, distribution="bernoulli")

        fetched_grid = h2o.get_grid(str(air_grid.grid_id))
        assert_is_type(fetched_grid, H2OGridSearch)
        assert len(air_grid.get_grid()) == 5, "h2o.get_grid() command is not working.  " \
                                              "It returned the wrong number of models."
        assert len(air_grid.get_grid()) == len(fetched_grid.get_grid()), "h2o.get_grid() command is not working."
    except Exception as e:
        assert False, "h2o.get_grid() command is not working: {0}".format(e)
Code Example #13
def grid_re_run_hyper_serialization():
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    params = {
        "k": 2,
        "init": "User",
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative"
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }

    # train grid
    grid = H2OGridSearch(H2OGeneralizedLowRankEstimator,
                         hyper_params=hyper_params)
    grid.train(x=train.names, training_frame=train, **params)
    print(grid)
    assert len(grid.model_ids) == 2

    # load from back-end and train again
    grid = h2o.get_grid(grid.grid_id)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print(grid)
    assert len(grid.model_ids) == 4
Code Example #14
def kmeans_start(grid_id, export_dir, train, params, hyper_parameters):
    grid = H2OGridSearch(H2OKMeansEstimator(),
                         grid_id=grid_id,
                         hyper_params=hyper_parameters,
                         recovery_dir=export_dir)
    grid.start(x=list(range(4)), training_frame=train, **params)
    return grid
Code Example #15
def dl_start(grid_id, export_dir, train, params, hyper_parameters):
    grid = H2OGridSearch(H2ODeepLearningEstimator,
                         grid_id=grid_id,
                         hyper_params=hyper_parameters,
                         recovery_dir=export_dir)
    grid.start(x=list(range(4)), y=4, training_frame=train, **params)
    return grid
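A minimal driver sketch for the two start helpers above (assumptions on my part: `train` is an iris-style H2OFrame with the response in column 4, and the hyperspace below is invented for illustration). grid.start() is asynchronous, so the caller waits for completion, e.g. with join():

import tempfile

export_dir = tempfile.mkdtemp()
hyper_parameters = {"epochs": [1, 5]}  # hypothetical hyperspace
grid = dl_start("dl_recovery_grid", export_dir, train, {}, hyper_parameters)
grid.join()  # block until the asynchronous grid search finishes
print("Trained %d models" % len(grid.model_ids))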
Code Example #16
def h2o_grid():
    h2o.init()
    data = h2o.import_file('output/diamonds_PCA.csv')
    splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
    train = splits[0]
    valid = splits[1]
    test = splits[2]
    y = 'price'
    x = list(data.columns)

    x.remove(y)

    hyper_parameters = {'learn_rate': [0.01, 0.1],
                        'max_depth': [3, 5, 9],
                        'sample_rate': [0.8, 1.0],
                        'col_sample_rate': [0.2, 0.5, 1.0]}

    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_parameters)

    gs.train(x=x, y=y, training_frame=train, validation_frame=valid)
    gs1 = gs.get_grid(sort_by='rmse', decreasing=False)  # ascending: lowest RMSE first, so models[0] is the best
    best_m = gs1.models[0]
    best_mp = best_m.model_performance(test)
    print(best_mp.rmse())
    test = h2o.import_file('output/diamonds_test_PCA.csv')
    predict = best_m.predict(test)
    predict = h2o.as_list(predict)
    predict.to_csv('output/pred_h2o.csv')
Code Example #17
def iris_gbm_grid():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM

    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.01, 0.05]
    size_of_hyper_space = len(ntrees_opts) * len(learn_rate_opts)
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(y=4, training_frame=train)
    print("\nsorted by mse: ")
    print(gs.get_grid(sort_by="mse"))
    #print gs.hit_ratio_table()

    for model in gs:
        assert isinstance(model, H2OGradientBoostingEstimator)

    assert len(gs) == size_of_hyper_space
    total_grid_space = list(map(list, itertools.product(*list(hyper_parameters.values()))))
    print( str(total_grid_space) )
    for model in gs.models:
        combo = [model.parms['learn_rate']['actual_value'], model.parms['ntrees']['actual_value']]
        assert combo in total_grid_space, "combo: " + str(combo) + "; total_grid_space=" + str(total_grid_space)
        total_grid_space.remove(combo)
Code Example #18
def iris_dl_grid():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run DL

    hidden_opts = [[20, 20], [50, 50, 50]]
    loss_opts = ["Quadratic", "CrossEntropy"]
    size_of_hyper_space = len(hidden_opts) * len(loss_opts)
    hyper_parameters = OrderedDict()
    hyper_parameters["loss"] = loss_opts
    hyper_parameters["hidden"] = hidden_opts
    print("DL grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    print(gs.sort_by("mse"))

    for model in gs:
        assert isinstance(model, H2ODeepLearningEstimator)

    assert len(gs) == size_of_hyper_space
    total_grid_space = list(
        map(list, itertools.product(*list(hyper_parameters.values()))))
    for model in gs.models:
        combo = [model.parms['loss']['actual_value']
                 ] + [model.parms['hidden']['actual_value']]
        assert combo in total_grid_space
        total_grid_space.remove(combo)
Code Example #19
def get_hyperparams_dict_return_correct_params():
    prostate_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame([[random.randint(0, num_folds - 1)]
                                     for _ in range(prostate_train.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    prostate_train = prostate_train.cbind(fold_assignments)

    x_features = list(range(1, prostate_train.ncol))
    y_target = "CAPSULE"
    h2o_data_frame = prostate_train

    grid = H2OGridSearch(model=H2OGradientBoostingEstimator,
                         hyper_params={
                             'fold_assignment': ['Stratified'],
                             'sample_rate_per_class': [[1.0, 0.6]]
                         },
                         search_criteria={
                             'strategy': 'RandomDiscrete',
                             'max_models': 1
                         })
    grid.train(x=x_features,
               y=y_target,
               training_frame=h2o_data_frame,
               nfolds=num_folds)

    print(grid.get_grid())
    hyperparams_dict = grid.get_hyperparams_dict(0)
    assert hyperparams_dict['fold_assignment'] == 'Stratified'
    assert hyperparams_dict['sample_rate_per_class'] == [1.0, 0.6]
Code Example #20
def grid_export_with_cv():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [1, 2]

    # train with CV
    gs = H2OGridSearch(H2OGradientBoostingEstimator(nfolds=2, keep_cross_validation_predictions=True, seed=42),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)

    # materialize the list now: the lazy map would otherwise be consumed only after h2o.remove_all()
    holdout_frame_ids = list(map(lambda m: m.cross_validation_holdout_predictions().frame_id, gs.models))

    export_dir = pyunit_utils.locate("results")
    saved_path = h2o.save_grid(export_dir, gs.grid_id, export_cross_validation_predictions=True)

    h2o.remove_all()

    grid = h2o.load_grid(saved_path)

    assert grid is not None
    for holdout_frame_id in holdout_frame_ids:
        assert h2o.get_frame(holdout_frame_id) is not None

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    stack = H2OStackedEnsembleEstimator(base_models=grid.model_ids)
    stack.train(x=list(range(4)), y=4, training_frame=train)

    predicted = stack.predict(train)
    assert predicted.nrow == train.nrow
Code Example #21
def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    # Run GBM Grid Search
    ntrees_opts = [1, 5]
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters,
                       export_checkpoints_dir=export_dir)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    h2o.remove_all()

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(export_dir + "/" + grid_id)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    grid.train(x=list(range(4)), y=4, training_frame=train)
    assert len(grid.model_ids) == old_grid_model_count
    print("Newly grained grid has %d models" % len(grid.model_ids))
    
    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None
Code Example #22
    def train_models(self):
        self.h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
            family="gaussian", gam_columns=["C11", "C12", "C13"],
            keep_gam_cols=True), self.hyper_parameters)
        self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
        for model in self.manual_gam_models:
            model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
Code Example #23
def benign_grid():
  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

  Y = 3
  X = list(range(3)) + list(range(4, 11))

  # NOTE: 'a' is deliberately not a float; it exercises bad-parameter handling (see Code Example #33)
  hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
  gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
  gs.train(x=X,y=Y, training_frame=training_data)
  gs.show()
  print(gs.sort_by('F1', False))
  best_model_id = gs.sort_by('F1', False)['Model Id'][0]
  best_model = h2o.get_model(best_model_id)
  best_model.predict(training_data)
  gs.predict(training_data)
  print(gs.get_hyperparams(best_model_id))
  print(gs.grid_id)
  
  new_g = H2OGridSearch.get_grid(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
  new_g.show()
  print(new_g.grid_id)
  print(new_g.sort_by('F1', False))

  assert best_model.params['family']['actual'] == 'binomial'
Code Example #24
def gbm_grid_search_max_depth(selected, target, train, test, nfolds):
    """
    Performs grid search on a GBM model to find the optimal max_depth parameter that maximizes the AUC on the test frame.
    Returns the best model and the grid table.

    :param selected: list of selected variables
    :param target: target variable
    :param train: h2o frame with train data
    :param test: h2o frame with test data
    :param nfolds: the number of folds to use for cross-validation
    """
    hyperparameters = {'max_depth': [3, 4, 5, 6, 7, 8]}
    search_criteria = {'strategy': "Cartesian"}
    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator(seed=1234,
                                                          balance_classes=True,
                                                          nfolds=nfolds),
                             hyperparameters,
                             search_criteria=search_criteria)

    gbm_grid.train(x=selected,
                   y=target,
                   training_frame=train,
                   validation_frame=test)

    gbm_grid_table = gbm_grid.get_grid(sort_by='auc', decreasing=True)
    gbm_best_model = gbm_grid_table.models[0]  # first model of the AUC-sorted grid, i.e. the best one
    gbm_best_model.name = 'best GBM - max_depth'
    gbm_grid_table = gbm_grid_table.sorted_metric_table().drop('model_ids',
                                                               axis=1)

    return gbm_best_model, gbm_grid_table
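A hypothetical usage sketch (the frames and column names below are invented for illustration, not part of the original example):

# `train_hf` and `test_hf` are assumed H2OFrames whose binary target
# "response" has already been converted with asfactor()
best_model, grid_table = gbm_grid_search_max_depth(
    selected=["age", "income", "balance"],  # hypothetical predictor names
    target="response",
    train=train_hf,
    test=test_hf,
    nfolds=5)
print(grid_table)  # AUC-ranked table of max_depth candidates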
Code Example #25
    def train_models(self):
        self.h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="gaussian",
                                                                       keep_gam_cols=True),
                                       hyper_params=self.hyper_parameters,
                                       search_criteria=self.search_criteria)
        self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
        for model in self.manual_gam_models:
            model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
        print("done")
Code Example #26
def gbm_grid_search(selected, target, train, test, nfolds, max_depth,
                    ntrees_list, min_rows_list, show_top):
    """
    Performs grid search on a GBM model to find the optimal parameters that maximize the AUC on the test frame.
    Returns the best model and the grid table.

    :param selected: list of selected variables
    :param target: target variable
    :param train: h2o frame with train data
    :param test: h2o frame with test data
    :param nfolds: the number of folds to use for cross-validation
    :param max_depth: specifies the maximum depth to which each tree will be built
    :param ntrees_list: list of number of trees to build in the model
    :param min_rows_list: list of the minimum number of observations for a leaf in order to split
    """
    hyperparameters = {
        'ntrees': ntrees_list,
        'min_rows': min_rows_list,
        'min_split_improvement': [1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8],
        'learn_rate': [0.05, 0.08, 0.1, 0.25, 0.35, 0.5, 1],
        'learn_rate_annealing': [0.9, 0.93, 0.95, 0.99, 0.1],
        'col_sample_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'sample_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'col_sample_rate_change_per_level':
            [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 1, 1.5, 1.8, 2],
        'col_sample_rate_per_tree':
            [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "Random"]
    }

    search_criteria = {
        'strategy': "RandomDiscrete",
        'stopping_metric': "lift_top_group",
        'stopping_tolerance': 0.01,
        'stopping_rounds': 3,
        'max_runtime_secs': 800,
        'max_models': 50
    }

    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator(seed=1234,
                                                          balance_classes=True,
                                                          max_depth=max_depth,
                                                          nfolds=nfolds),
                             hyperparameters,
                             search_criteria=search_criteria)

    gbm_grid.train(x=selected,
                   y=target,
                   training_frame=train,
                   validation_frame=test)

    gbm_grid_table = gbm_grid.get_grid(sort_by='auc', decreasing=True)
    gbm_best_model = gbm_grid_table.models[0]  # first model of the AUC-sorted grid, i.e. the best one
    gbm_best_model.name = 'best GBM - grid search'
    gbm_grid_table = gbm_grid_table.sorted_metric_table().drop(
        'model_ids', axis=1)[0:show_top]

    return gbm_best_model, gbm_grid_table
Code Example #27
def neural_net_grid(X, y, train, valid):
    # define random grid search parameters
    hyper_parameters = {'hidden': [[170, 320], [80, 190], [320, 160, 80], [100], [50, 50, 50, 50]],
                        'l1':[s/1e4 for s in range(0, 1000, 100)],
                        'l2':[s/1e5 for s in range(0, 1000, 100)],
                        'input_dropout_ratio':[s/1e2 for s in range(0, 20, 2)]}

    # define search strategy
    search_criteria = {'strategy':'RandomDiscrete',
                       'max_models':100,
                       'max_runtime_secs':60*60*2,  #2 hours
                       }

    # initialize grid search
    gsearch = H2OGridSearch(H2ODeepLearningEstimator,
                            hyper_params=hyper_parameters,
                            search_criteria=search_criteria)

    # execute training w/ grid search
    gsearch.train(x=X,
                  y=y,
                  training_frame=train,
                  validation_frame=valid,
                  activation='TanhWithDropout',
                  epochs=2000,
                  stopping_rounds=20,
                  sparse=True, # handles data w/ many zeros more efficiently
                  ignore_const_cols=True,
                  adaptive_rate=True)
    best_model = gsearch.get_grid()[0]

    return best_model
Code Example #28
def airline_gbm_random_grid():
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["DayofMonth", "DayOfWeek"]

    hyper_parameters = {
        'learn_rate': [0.1, 0.2],
        'max_depth': [2, 3, 4],
        'ntrees': [5, 10, 15]
    }

    search_crit = {
        'strategy': "RandomDiscrete",
        'max_models': 5,
        'seed': 1234,
        'stopping_rounds': 3,
        'stopping_metric': "AUTO",
        'stopping_tolerance': 1e-2
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_parameters,
                             search_criteria=search_crit)
    air_grid.train(x=myX,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   distribution="bernoulli")

    assert (len(air_grid.get_grid()) == 5)
    print(air_grid.get_grid("logloss"))
Code Example #29
def grid_glrm_iris():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()
    transform_opts = ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]
    k_opts = random.sample(list(range(1, 8)), 3)
    size_of_hyper_space = len(transform_opts) * len(k_opts)
    hyper_parameters = OrderedDict()
    hyper_parameters["k"] = k_opts
    hyper_parameters["transform"] = transform_opts
    gx = random.uniform(0, 1)
    gy = random.uniform(0, 1)
    print("H2O GLRM with , gamma_x = " + str(gx) + ", gamma_y = " + str(gy) +\
          ", hyperparameters = " + str(hyper_parameters))

    gs = H2OGridSearch(H2OGeneralizedLowRankEstimator(loss="Quadratic",
                                                      gamma_x=gx,
                                                      gamma_y=gy),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=irisH2O)
    for model in gs:
        assert isinstance(model, H2OGeneralizedLowRankEstimator)
    print(gs.get_grid(sort_by="mse"))
    #print gs.hit_ratio_table()

    assert len(gs) == size_of_hyper_space
    total_grid_space = list(
        map(list, itertools.product(*list(hyper_parameters.values()))))
    for model in gs.models:
        combo = [model.parms['k']['actual_value']
                 ] + [model.parms['transform']['actual_value']]
        assert combo in total_grid_space
        total_grid_space.remove(combo)
Code Example #30
    def test_kmeans_grid_search_over_validation_datasets(self):
        """
        test_kmeans_grid_search_over_validation_datasets performs the following:
        a. build H2O kmeans models using grid search.
        b. For each model built using grid search, print out the total_sum_squares errors.
        c. If an exception was thrown, mark the test as failed.
        """
        print(
            "*******************************************************************************************"
        )
        print("test_kmeans_grid_search_over_validation_datasets for kmeans ")
        h2o.cluster_info()

        print("Hyper-parameters used here is {0}".format(self.hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2OKMeansEstimator(),
                                   hyper_params=self.hyper_params)
        grid_model.train(x=self.x_indices, training_frame=self.training1_data)

        for each_model in grid_model:
            summary_list = each_model._model_json["output"][
                "validation_metrics"]
            if (summary_list is not None) and (summary_list._metric_json
                                               is not None):
                grid_model_metrics = summary_list._metric_json['totss']
                print("total sum of squares of a model is: {0}".format(
                    grid_model_metrics))
            else:
                print(
                    'model._model_json["output"]["validation_metrics"] of a model is None for some reason....'
                )
Code Example #31
File: spark_main.py Project: HuipingZhu/big-data
def gboosting_grid(X, y, train, valid):
    # define random grid search parameters
    hyper_parameters = {
        'ntrees': list(range(0, 500, 50)),
        'max_depth': list(range(0, 20, 2)),
        'sample_rate': [s / float(10) for s in range(1, 11)],
        'col_sample_rate': [s / float(10) for s in range(1, 11)]
    }

    # define search strategy
    search_criteria = {
        'strategy': 'RandomDiscrete',
        'max_models': 100,
        'max_runtime_secs': 60 * 60 * 2,  #2 hours
    }

    # initialize grid search
    gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                            hyper_params=hyper_parameters,
                            search_criteria=search_criteria)

    # execute training w/ grid search
    gsearch.train(x=X, y=y, training_frame=train, validation_frame=valid)

    best_model = gsearch.get_grid()[0]

    return best_model
Code Example #32
File: pyunit_gbm_grid.py Project: josh-whitney/h2o-3
def iris_gbm_grid():
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # Run GBM

  ntrees_opts = [1, 3]
  learn_rate_opts = [0.1, 0.01, 0.05]
  size_of_hyper_space = len(ntrees_opts) * len(learn_rate_opts)
  hyper_parameters = OrderedDict()
  hyper_parameters["learn_rate"] = learn_rate_opts
  hyper_parameters["ntrees"] = ntrees_opts
  print("GBM grid with the following hyper_parameters:", hyper_parameters)

  gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
  gs.train(x=list(range(4)), y=4, training_frame=train)
  print("\nsorted by mse: ")
  print(gs.sort_by("mse"))
  #print gs.hit_ratio_table()

  for model in gs:
    assert isinstance(model, H2OGradientBoostingEstimator)

  assert len(gs) == size_of_hyper_space
  total_grid_space = list(map(list, itertools.product(*list(hyper_parameters.values()))))
  print( str(total_grid_space) )
  for model in gs.models:
    combo = [model.parms['learn_rate']['actual_value'], model.parms['ntrees']['actual_value']]
    assert combo in total_grid_space, "combo: " + str(combo) + "; total_grid_space=" + str(total_grid_space)
    total_grid_space.remove(combo)

  # test back-end sorting of model metrics:
  locally_sorted = gs.sort_by("r2", H2OGridSearch.DESC)
  remotely_sorted_desc = H2OGridSearch.get_grid(H2OGradientBoostingEstimator(distribution='multinomial'), hyper_parameters, gs.grid_id, sort_by='r2', sort_order='desc')

  assert len(locally_sorted.cell_values) == len(remotely_sorted_desc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
  for i in range(len(remotely_sorted_desc.model_ids)):
    assert locally_sorted.cell_values[i][0] == remotely_sorted_desc.model_ids[i], "Expected back-end sort by r2 to be the same as locally-sorted: " + str(i)

  remotely_sorted_asc = H2OGridSearch.get_grid(H2OGradientBoostingEstimator(distribution='multinomial'), hyper_parameters, gs.grid_id, sort_by='r2', sort_order='asc')
  for model in remotely_sorted_asc:
    assert isinstance(model, H2OGradientBoostingEstimator)

  assert len(locally_sorted.cell_values) == len(remotely_sorted_asc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
  length = len(remotely_sorted_asc.model_ids)
  for i in range(length):
    assert locally_sorted.cell_values[i][0] == remotely_sorted_asc.model_ids[length - i - 1], "Expected back-end sort by r2, ascending, to be the reverse as locally-sorted ascending: " + str(i)
Code Example #33
def benign_grid():
  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

  Y = 3
  X = list(range(3)) + list(range(4,11))

  # NOTE: this tests bad parameter value handling; 'a' is not a float:
  hyper_parameters = {'alpha': [0.01,0.5,'a'], 'lambda': [1e-5,1e-6]}
  gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
  gs.train(x=X,y=Y, training_frame=training_data)
  for model in gs:
    assert isinstance(model, H2OGeneralizedLinearEstimator)
  gs.show()
  print(gs.sort_by('F1', False))
  best_model_id = gs.sort_by('F1', False)['Model Id'][0]
  best_model = h2o.get_model(best_model_id)
  best_model.predict(training_data)
  gs.predict(training_data)
  print(gs.get_hyperparams(best_model_id))
  print(gs.grid_id)
  
  new_g = H2OGridSearch.get_grid(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
  new_g.show()
  print(new_g.grid_id)
  print(new_g.sort_by('F1', False))

  assert best_model.params['family']['actual'] == 'binomial'

  # test search_criteria plumbing
  search_criteria = { 'strategy': "Random", 'max_models': 3 }
  max_models_g = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, search_criteria=search_criteria)
  max_models_g.train(x=X,y=Y, training_frame=training_data)

  max_models_g.show()
  print(max_models_g.grid_id)
  print(max_models_g.sort_by('F1', False))

  assert len(max_models_g.models) == 3, "expected 3 models, got: {}".format(len(max_models_g.models))