コード例 #1
0
def glrm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Create a GLRM grid search and launch it with ``grid.start``.

    The grid is built over ``H2OGeneralizedLowRankEstimator`` with
    ``parallelism=2`` and ``recovery_dir=export_dir``.  All columns of
    ``train`` are used as ``x`` and ``params`` is forwarded to
    ``grid.start`` as extra keyword arguments.

    Returns:
        The started ``H2OGridSearch`` object.
    """
    search_options = {
        "grid_id": grid_id,
        "hyper_params": hyper_parameters,
        "recovery_dir": export_dir,
        "parallelism": 2,
    }
    glrm_grid = H2OGridSearch(H2OGeneralizedLowRankEstimator, **search_options)
    glrm_grid.start(x=train.names, training_frame=train, **params)
    return glrm_grid
コード例 #2
0
def setup_grid():
    """Call ``h2o.remove_all()`` and return an untrained GBM grid search.

    The search space covers ``learn_rate`` and ``ntrees`` (three values
    each), kept in an ``OrderedDict`` so the key order is fixed.
    """
    h2o.remove_all()
    gbm_space = OrderedDict([
        ("learn_rate", [0.1, 0.05, 0.01]),
        ("ntrees", [1, 3, 5]),
    ])
    return H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=gbm_space)
コード例 #3
0
def benign_grid():
    """Exercise a binomial GLM grid search on the benign.csv sample data.

    Steps, in order:
      1. train a cartesian grid whose ``alpha`` list deliberately contains
         an invalid value (``'a'``) to test bad-parameter handling;
      2. inspect the grid (show, sort by F1, predict, hyper-parameters);
      3. re-fetch the same grid through ``H2OGridSearch.get_grid``;
      4. train a ``RandomDiscrete`` grid limited to three models and
         assert that exactly three models were built.
    """
    benign = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    response_idx = 3
    # every column index from 0 to 10 except the response
    predictor_idx = [col for col in range(11) if col != response_idx]

    # NOTE: 'a' is intentionally not a float -- bad parameter value handling.
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}

    glm_grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
    glm_grid.train(x=predictor_idx, y=response_idx, training_frame=benign)
    for trained in glm_grid:
        assert isinstance(trained, H2OGeneralizedLinearEstimator)

    glm_grid.show()
    print(glm_grid.sort_by('F1', False))
    top_id = glm_grid.sort_by('F1', False)['Model Id'][0]
    top_model = h2o.get_model(top_id)
    top_model.predict(benign)
    glm_grid.predict(benign)
    print(glm_grid.get_hyperparams(top_id))
    print(glm_grid.grid_id)

    # fetch the grid again by id and inspect it the same way
    refetched = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters,
        glm_grid.grid_id)
    refetched.show()
    print(refetched.grid_id)
    print(refetched.sort_by('F1', False))

    assert top_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing
    random_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    capped_grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial'),
        hyper_parameters,
        search_criteria=random_criteria)
    capped_grid.train(x=predictor_idx, y=response_idx, training_frame=benign)

    capped_grid.show()
    print(capped_grid.grid_id)
    print(capped_grid.sort_by('F1', False))

    built = len(capped_grid.models)
    assert built == 3, "expected 3 models, got: {}".format(built)
コード例 #4
0
 def train_models(self):
     """Train the gaussian GAM grid search and every manually built GAM.

     The grid is stored on ``self.h2o_model``; each model in
     ``self.manual_gam_models`` is trained on the same predictors,
     response and frame.  Prints "done" when finished.
     """
     gam_template = H2OGeneralizedAdditiveEstimator(family="gaussian",
                                                    keep_gam_cols=True)
     self.h2o_model = H2OGridSearch(gam_template,
                                    hyper_params=self.hyper_parameters,
                                    search_criteria=self.search_criteria)
     self.h2o_model.train(x=self.myX, y=self.myY,
                          training_frame=self.h2o_data)
     for manual_model in self.manual_gam_models:
         manual_model.train(x=self.myX, y=self.myY,
                            training_frame=self.h2o_data)
     print("done")
コード例 #5
0
def h2o_randomsearch_nn(frames, y_name, x_names, dname):

    """ H2o neural network tuned with a random hyperparameter search.

    Args:
        frames: Sequence whose first item is the H2o training frame and
            whose second item is the H2o validation frame.
        y_name: Target name.
        x_names: List of input names.
        dname: Name of data file.

    Returns:
        List of modeling results: model tag, description, ``dname``,
        number of training rows, number of inputs, and the first three
        error measures.

    """
    ### assign partitions
    train_part, valid_part = frames[0], frames[1]

    ### impute numeric
    ### categorical 'NA' treated as valid level
    for part in (train_part, valid_part):
        part.impute(method='mean')

    ### stdize
    train_part = h2o_stdize(train_part, y_name, x_names)
    valid_part = h2o_stdize(valid_part, y_name, x_names)

    ### define random grid search parameters and criteria
    hidden_layouts = [[17, 32], [8, 19], [32, 16, 8], [100],
                      [10, 10, 10, 10]]
    nn_space = {"hidden": hidden_layouts,
                "l1": [s/1e6 for s in range(1, 1001)],
                "l2": [s/1e4 for s in range(1, 101)],
                "input_dropout_ratio": [s/1e2 for s in range(1, 21)]}

    random_criteria = {"strategy": "RandomDiscrete",
                       "max_models": 20,
                       "max_runtime_secs": 600,
                       "seed": SEED}

    nn_search = H2OGridSearch(H2ODeepLearningEstimator,
                              hyper_params=nn_space,
                              search_criteria=random_criteria)

    ### execute training w/ grid search
    nn_search.train(x=x_names,
                    y=y_name,
                    training_frame=train_part,
                    validation_frame=valid_part)

    ### collect error measures: classification if the target is a factor
    target_is_factor = train_part[y_name].isfactor()[0]
    measures = (h2o_cla_err_measures(nn_search, train_part, valid_part, y_name)
                if target_is_factor
                else h2o_reg_err_measures(nn_search))

    ### return appropriate list
    return ['NN', 'NN w/ random hyperparameter search', dname,
            train_part.nrow, len(x_names),
            measures[0], measures[1], measures[2]]
コード例 #6
0
    def test3_glm_random_grid_search_max_runtime_secs(self):
        """
        Check that a random grid search honours the max_runtime_secs stopping criterion.

        A runtime budget is drawn uniformly from [0, max_grid_runtime * allowed_scaled_overtime] and a
        RandomDiscrete GLM grid is trained with it.  The grid runtime reported by
        pyunit_utils.find_grid_runtime is then compared with the budget scaled by (1 + allowed_diff).
        The test also passes when the grid contains exactly one model, since one model is always built.

        On failure, self.test_failed is incremented and this test's slot in self.test_failed_array is set.
        self.test_num is advanced in every case.

        :return: None
        """
        print(
            "*******************************************************************************************"
        )
        print("test3_glm_random_grid_search_max_runtime_secs for GLM " +
              self.family)
        h2o.cluster_info()

        # draw the runtime budget that acts as the stopping condition
        max_run_time_secs = random.uniform(
            0, self.max_grid_runtime * self.allowed_scaled_overtime)
        search_criteria = {
            'strategy': 'RandomDiscrete',
            'max_runtime_secs': max_run_time_secs,
            "seed": round(time.time())
        }

        print("GLM Binomial grid search_criteria: {0}".format(search_criteria))

        # fire off random grid-search
        glm_template = H2OGeneralizedLinearEstimator(family=self.family,
                                                     nfolds=self.nfolds)
        grid_model = H2OGridSearch(glm_template,
                                   hyper_params=self.hyper_params,
                                   search_criteria=search_criteria)
        grid_model.train(x=self.x_indices,
                         y=self.y_index,
                         training_frame=self.training1_data)

        actual_run_time_secs = pyunit_utils.find_grid_runtime(grid_model)

        within_budget = actual_run_time_secs <= search_criteria[
            "max_runtime_secs"] * (1 + self.allowed_diff)
        # a grid search will always generate 1 model, so a single-model grid is accepted too
        if within_budget or len(grid_model) == 1:
            print("test3_glm_random_grid_search_max_runtime_secs: passed!")
        else:
            self.test_failed += 1
            self.test_failed_array[self.test_num] = 1
            print(
                "test3_glm_random_grid_search_max_runtime_secs: failed.  Model takes time {0}"
                " seconds which exceeds allowed time {1}".format(
                    actual_run_time_secs,
                    max_run_time_secs * (1 + self.allowed_diff)))
        self.test_num += 1
        sys.stdout.flush()
コード例 #7
0
def h2o_randomsearch_gbm(frames, y_name, x_names, dname):

    """ H2o GBM with parameter tuning.

    Args:
        frames: Sequence whose first item is the H2o training frame and
            whose second item is the H2o validation frame.
        y_name: Target name.
        x_names: List of input names.
        dname: Name of data file.

    Returns:
        List of modeling results: model tag, description, ``dname``,
        number of training rows, number of inputs, and the first three
        error measures.

    """

    ### assign partitions
    tr_frame, v_frame = frames[0], frames[1]

    ### impute numeric
    ### categorical 'NA' treated as valid level
    tr_frame.impute(method='mean')
    v_frame.impute(method='mean')

    ### stdize
    tr_frame = h2o_stdize(tr_frame, y_name, x_names)
    v_frame = h2o_stdize(v_frame, y_name, x_names)

    ### define random grid search parameters and criteria
    # Hyperparameter values are materialised as lists: on Python 3 a bare
    # range() is a lazy object, not a list, so it would not be passed on as
    # a plain list of candidate values.
    hyper_parameters = {"ntrees": list(range(0, 100, 1)),
                        "max_depth": list(range(0, 20, 1)),
                        "sample_rate": [s/float(10) for s in range(1, 11)],
                        "col_sample_rate": [s/float(10) for s in range(1, 11)]}

    search_criteria = {"strategy": "RandomDiscrete",
                       "max_models": 20,
                       "max_runtime_secs": 600,
                       "seed": SEED}

    gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                            hyper_params=hyper_parameters,
                            search_criteria=search_criteria)

    ### execute training w/ grid search
    gsearch.train(x=x_names,
                  y=y_name,
                  training_frame=tr_frame,
                  validation_frame=v_frame)

    ### collect error measures: classification if the target is a factor
    if tr_frame[y_name].isfactor()[0]:
        measures = h2o_cla_err_measures(gsearch, tr_frame, v_frame, y_name)
    else:
        measures = h2o_reg_err_measures(gsearch)

    ### return appropriate list
    return ['GBM', 'GBM w/ random hyperparameter search', dname,
            tr_frame.nrow, len(x_names), measures[0], measures[1], measures[2]]
コード例 #8
0
def benign_grid():
    """Exercise binomial GLM grid searches on the benign.csv sample data.

    Three grids are trained over the same hyper-parameter space, whose
    ``alpha`` list deliberately contains an invalid value (``'a'``):
      1. a default grid, inspected via show/sort_by/predict;
      2. a ``RandomDiscrete`` grid capped at three models (asserted);
      3. a ``RandomDiscrete`` grid with metric-based stopping and 5-fold
         cross-validation, asserted to contain five models.
    """
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    response_idx = 3
    predictor_idx = list(range(4, 12))

    # NOTE: 'a' is intentionally not a float -- bad parameter value handling.
    hyper_parameters = {'alpha': [0.01, 0.3, 0.5, 'a'],
                        'lambda': [1e-5, 1e-6, 1e-7, 1e-8]}

    glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                             hyper_parameters)
    glm_grid.train(x=predictor_idx, y=response_idx, training_frame=benign)
    for trained in glm_grid:
        assert isinstance(trained, H2OGeneralizedLinearEstimator)

    glm_grid.show()
    print(glm_grid.sort_by('F1', False))
    top_id = glm_grid.sort_by('F1', False)['Model Id'][0]
    top_model = h2o.get_model(top_id)
    top_model.predict(benign)
    glm_grid.predict(benign)
    print(glm_grid.get_hyperparams(top_id))
    print(glm_grid.grid_id)

    assert top_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing and max_models
    capped_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    capped_grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                                hyper_parameters,
                                search_criteria=capped_criteria)
    capped_grid.train(x=predictor_idx, y=response_idx, training_frame=benign)

    capped_grid.show()
    print(capped_grid.grid_id)
    print(capped_grid.sort_by('F1', False))

    capped_count = len(capped_grid.models)
    assert capped_count == 3, "expected 3 models, got: {}".format(capped_count)
    print(capped_grid.sorted_metric_table())

    # test search_criteria plumbing and asymptotic stopping
    stopping_criteria = {'strategy': "RandomDiscrete",
                         'seed': 42,
                         'stopping_metric': "AUTO",
                         'stopping_tolerance': 0.1,
                         'stopping_rounds': 2}
    asymp_grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family='binomial', nfolds=5),
        hyper_parameters,
        search_criteria=stopping_criteria)
    asymp_grid.train(x=predictor_idx, y=response_idx, training_frame=benign)

    asymp_grid.show()
    print(asymp_grid.grid_id)
    print(asymp_grid.sort_by('F1', False))

    asymp_count = len(asymp_grid.models)
    assert asymp_count == 5, "expected 5 models, got: {}".format(asymp_count)
0
 def create_grid(self):
     """Build an H2O grid search around a GBM estimator.

     The estimator is constructed from ``self.model_params``; the grid
     takes ``self.hyper_params``, ``self.grid_id`` and
     ``self.search_params`` (passed as ``search_criteria``).

     Returns:
         The untrained ``H2OGridSearch`` object.
     """
     return H2OGridSearch(
         model=H2OGradientBoostingEstimator(**self.model_params),
         hyper_params=self.hyper_params,
         grid_id=self.grid_id,
         search_criteria=self.search_params,
     )
コード例 #10
0
    def optimize_hyperparam(self,
                            hyper_params,
                            X,
                            y,
                            sample_weight=None,
                            X_valid=None,
                            y_valid=None,
                            sample_weight_valid=None,
                            h2o_train_params=None,
                            **kwargs):
        """Hyperparameter optimization & fitting model in H2O.

        Runs an `H2OGridSearch` over `hyper_params`, sorts the resulting grid by
        increasing `residual_deviance`, stores the best hyperparameters in
        `self.best_params` and replaces `self.model` with the best model.

        Args:
            hyper_params: Hyperparameter search space, passed to `H2OGridSearch` as `hyper_params`.
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            sample_weight (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional): Training sample weights.
            X_valid (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional): Validation data (only h2o supported).
            y_valid (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional): Validation target values (only h2o supported).
            sample_weight_valid (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional): Validation sample weights.
            h2o_train_params (:obj:`dict`, optional): Parameters passed to `H2OGridSearch.train()`.
            **kwargs: Other parameters passed to H2OGridSearch.

        Returns:
            dict: {`hyperparameter_name`: `optimal_choice`}, Dictionary containing optimal hyperparameter choice.

        Raises:
            NotImplementedError: If the backend is not 'h2o' or the model is not an `H2OEstimator`.
        """
        # Short-circuit `and` instead of bitwise `&`: the estimator type check is
        # only evaluated when the h2o backend is actually selected.
        if not (self.backend == 'h2o' and isinstance(self.model, H2OEstimator)):
            raise NotImplementedError(
                f'Error with the backend choice. Supported backends: {self._backends}'
            )

        params = dict() if h2o_train_params is None else h2o_train_params
        features, target, train_set, params = self._x_y_to_h2o_frame(
            X, y, sample_weight, params, X_valid, y_valid,
            sample_weight_valid)
        model_grid = H2OGridSearch(model=self.model,
                                   hyper_params=hyper_params,
                                   **kwargs)
        model_grid.train(y=target,
                         x=features,
                         training_frame=train_set,
                         **params)
        sorted_grid = model_grid.get_grid(sort_by='residual_deviance',
                                          decreasing=False)

        # Row 0 of the sorted table is the best model; the columns up to
        # 'model_ids' (exclusive) hold the hyperparameter values.
        best_row = sorted_grid.sorted_metric_table().loc[
            0, :'model_ids'].drop('model_ids').to_dict()

        best_params = {}
        for name, raw_value in best_row.items():
            if name == '':
                continue
            # values are rendered like "[0.5]"; strip the brackets
            value = raw_value.replace('[', '').replace(']', '')
            best_params[name] = float(value) if is_number(value) else value

        self.best_params = best_params
        self.model = sorted_grid.models[0]
        return self.best_params
コード例 #11
0
 def train_models(self):
     """Train the gaussian GAM grid search and every manually built GAM.

     The grid's base estimator uses columns C11, C12 and C13 as GAM
     columns.  The grid is stored on ``self.h2o_model``; each model in
     ``self.manual_gam_models`` is trained on the same data.
     """
     gam_template = H2OGeneralizedAdditiveEstimator(
         family="gaussian",
         gam_columns=["C11", "C12", "C13"],
         keep_gam_cols=True,
         scale=[1, 1, 1],
         num_knots=[5, 6, 7],
         bs=[0, 2, 0])
     self.h2o_model = H2OGridSearch(gam_template, self.hyper_parameters)
     self.h2o_model.train(x=self.myX, y=self.myY,
                          training_frame=self.h2o_data)

     for manual_model in self.manual_gam_models:
         manual_model.train(x=self.myX, y=self.myY,
                            training_frame=self.h2o_data)
コード例 #12
0
def kmeans_start(grid_id, export_dir, train, params, hyper_parameters):
    """Create a K-Means grid search and launch it with ``grid.start``.

    The grid wraps an ``H2OKMeansEstimator()`` instance and uses
    ``export_dir`` as its ``recovery_dir``.  Column indices 0-3 of
    ``train`` are the predictors; ``params`` is forwarded to
    ``grid.start`` as extra keyword arguments.

    Returns:
        The started ``H2OGridSearch`` object.
    """
    km_grid = H2OGridSearch(H2OKMeansEstimator(),
                            grid_id=grid_id,
                            hyper_params=hyper_parameters,
                            recovery_dir=export_dir)
    predictor_idx = list(range(4))
    km_grid.start(x=predictor_idx, training_frame=train, **params)
    return km_grid
コード例 #13
0
def dl_start(grid_id, export_dir, train, params, hyper_parameters):
    """Create a Deep Learning grid search and launch it with ``grid.start``.

    Column indices 0-3 of ``train`` are the predictors and column index 4
    is the response.  ``export_dir`` is used as the grid's
    ``recovery_dir``; ``params`` is forwarded to ``grid.start``.

    Returns:
        The started ``H2OGridSearch`` object.
    """
    dl_grid = H2OGridSearch(H2ODeepLearningEstimator,
                            grid_id=grid_id,
                            hyper_params=hyper_parameters,
                            recovery_dir=export_dir)
    predictor_idx = list(range(4))
    dl_grid.start(x=predictor_idx, y=4, training_frame=train, **params)
    return dl_grid
コード例 #14
0
def gbm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Create a GBM grid search and launch it with ``grid.start``.

    Predictors are the column indices from 2 up to ``train.ncol``
    (exclusive); the response is the ``"Angaus"`` column.  ``export_dir``
    is used as the grid's ``recovery_dir``; ``params`` is forwarded to
    ``grid.start``.

    Returns:
        The started ``H2OGridSearch`` object.
    """
    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             grid_id=grid_id,
                             hyper_params=hyper_parameters,
                             recovery_dir=export_dir)
    predictor_idx = list(range(2, train.ncol))
    gbm_grid.start(x=predictor_idx, y="Angaus", training_frame=train, **params)
    return gbm_grid
コード例 #15
0
def airline_gbm_random_grid():
    """Random GBM grid search on the airlines sample, followed by stacking.

    A ``RandomDiscrete`` search capped at five models is trained with
    5-fold cross-validation (predictions kept), the grid size is
    asserted to be five, and the grid's models are then combined in an
    ``H2OStackedEnsembleEstimator`` that is used to predict on the
    training data.
    """
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    predictors = ["Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin",
                  "Dest"]

    def hundredths(start, stop):
        # start/100, (start+1)/100, ..., (stop-1)/100
        return [x / 100. for x in range(start, stop)]

    def powers_of_two(start, stop):
        # 2**start, ..., 2**(stop-1)
        return [2**x for x in range(start, stop)]

    # hyperparameter and search-criteria definitions
    # (ranges are inclusive..exclusive)
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': hundredths(20, 101),
        'col_sample_rate': hundredths(20, 101),
        'col_sample_rate_per_tree': hundredths(20, 101),
        'col_sample_rate_change_per_level': hundredths(90, 111),
        'min_rows': powers_of_two(0, int(math.log(air_hex.nrow, 2) - 1) + 1),
        'nbins': powers_of_two(4, 11),
        'nbins_cats': powers_of_two(4, 13),
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }

    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  # limit the runtime to 10 minutes
        'max_models': 5,  # build no more than 5 models
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "AUC",
        'stopping_tolerance': 1e-3
    }

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             hyper_params=hyper_params_tune,
                             search_criteria=search_criteria_tune)

    air_grid.train(x=predictors,
                   y="IsDepDelayed",
                   training_frame=air_hex,
                   nfolds=5,
                   fold_assignment='Modulo',
                   keep_cross_validation_predictions=True,
                   distribution="bernoulli",
                   seed=1234)

    assert len(air_grid.get_grid()) == 5
    print(air_grid.get_grid("logloss"))

    # stack every model of the grid into one ensemble
    ensemble = H2OStackedEnsembleEstimator(selection_strategy="choose_all",
                                           base_models=air_grid.model_ids)
    ensemble.train(model_id="my_ensemble",
                   y="IsDepDelayed",
                   training_frame=air_hex)
    ensemble_preds = ensemble.predict(air_hex)  # training data
    print("preditions for ensemble are in: " + ensemble_preds.frame_id)
コード例 #16
0
def glm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Create a GLM grid search and launch it with ``grid.start``.

    The response is the ``"cylinders"`` column; the predictors are
    ``train.names`` with that column removed.  ``export_dir`` is used as
    the grid's ``recovery_dir``; ``params`` is forwarded to ``grid.start``.

    Returns:
        The started ``H2OGridSearch`` object.
    """
    response = "cylinders"
    predictors = train.names
    predictors.remove(response)
    glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator,
                             grid_id=grid_id,
                             hyper_params=hyper_parameters,
                             recovery_dir=export_dir)
    glm_grid.start(x=predictors, y=response, training_frame=train, **params)
    return glm_grid
コード例 #17
0
def grid_quasar_pca():
    """Grid-search PCA on the SDSS quasar data and sanity-check the grid.

    Checks performed:
      * every model in the grid is an ``H2OPCA``;
      * the grid size equals the product of the hyper-parameter list lengths;
      * a second grid built from a search space with every value list
        duplicated has the same size (duplicates are ignored);
      * the hyper-parameters passed in were used to construct the models.
    """
    quasar = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"), header=1)
    grid_space = pyunit_utils.make_random_grid_space(algo="pca", ncols=quasar.ncol, nrows=quasar.nrow)
    print("Grid space: {0}".format(grid_space))

    print("Constructing the grid of PCA models...")
    pca_grid = H2OGridSearch(H2OPCA, hyper_params=grid_space)
    pca_grid.train(x=list(range(1, 23)), training_frame=quasar)

    for pca_model in pca_grid:
        assert isinstance(pca_model, H2OPCA)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    expected_size = 1
    for candidates in list(grid_space.values()):
        # a non-list entry counts as a single candidate value
        as_list = candidates if type(candidates) == list else [candidates]
        expected_size *= len(as_list)
    actual_size = len(pca_grid)
    assert expected_size == actual_size, \
        "Expected size of grid to be {0}, but got {1}".format(expected_size, actual_size)

    print("Duplicate-entries-in-grid-space check")
    doubled_space = copy.deepcopy(grid_space)
    for name, candidates in list(grid_space.items()):
        doubled_space[name] = candidates + candidates
    print("The new search space: {0}".format(doubled_space))
    print("Constructing the new grid of nb models...")
    pca_grid_dups = H2OGridSearch(H2OPCA, hyper_params=doubled_space)
    pca_grid_dups.train(x=list(range(1, 23)), training_frame=quasar)
    actual_size_dups = len(pca_grid_dups)
    assert actual_size == actual_size_dups, \
        ("Expected duplicates to be ignored. Without dups grid size: {0}. With dups "
         "size: {1}").format(actual_size, actual_size_dups)

    for pca_model in pca_grid_dups:
        assert isinstance(pca_model, H2OPCA)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(pca_grid, name, grid_space[name])
コード例 #18
0
 def train_models(self):
     """Train the binomial GAM grid search and every manually built GAM.

     The grid (seeded with 1234) is stored on ``self.h2o_model``; each
     model in ``self.manual_gam_models`` is trained on the same
     predictors, response and frame.
     """
     gam_template = H2OGeneralizedAdditiveEstimator(family="binomial",
                                                    keep_gam_cols=True,
                                                    seed=1234)
     self.h2o_model = H2OGridSearch(gam_template,
                                    hyper_params=self.hyper_parameters,
                                    search_criteria=self.search_criteria)
     self.h2o_model.train(x=self.myX, y=self.myY,
                          training_frame=self.h2o_data)
     for manual_model in self.manual_gam_models:
         manual_model.train(x=self.myX,
                            y=self.myY,
                            training_frame=self.h2o_data)
コード例 #19
0
def validacion_r(modelo, hyper_parameters, datos, variables, semilla=1234):
    """Grid-search ``modelo`` on a 70% split of ``datos[0]`` and return the results.

    Starts H2O with ``max_mem_size=14``, loads ``datos[0]`` into an
    H2OFrame, passes it through ``tipificar_h2o``, splits it with ratio
    0.7 using ``semilla`` as seed and trains the grid on the first split
    with ``"Tendencia"`` as the response.  The trained grid, the splits
    and ``datos`` are handed to ``procesamiento_resultados_binario``;
    all H2O objects are removed before returning its result.
    """
    h2o.init(max_mem_size=14)
    frame = h2o.H2OFrame(datos[0])
    tipificar_h2o(frame)
    partitions = frame.split_frame(ratios=[0.7], seed=semilla)
    grid = H2OGridSearch(modelo, hyper_params=hyper_parameters)
    grid.train(x=variables, y="Tendencia", training_frame=partitions[0])
    resultados = procesamiento_resultados_binario(grid, partitions, datos)
    h2o.remove_all()
    return resultados
コード例 #20
0
    def test4_glm_random_grid_search_metric(self, metric_name,
                                            bigger_is_better):
        """
        Check the metric-based early-stopping condition of a random grid search.

        A RandomDiscrete search is configured with a random stopping tolerance and a random number of
        stopping rounds, the grid is trained, and pyunit_utils.evaluate_metrics_stopping decides whether
        the search stopped correctly.  On failure, self.test_failed is incremented and this test's slot
        in self.test_failed_array is set.  self.test_num is advanced in every case.

        Note that this method writes a max_runtime_secs entry into self.hyper_params.

        :param metric_name: metric we want to use to test the last stopping condition
        :param bigger_is_better: higher metric value indicates better model performance

        :return: None
        """
        print(
            "*******************************************************************************************"
        )
        print("test4_glm_random_grid_search_metric using " + metric_name +
              " for family " + self.family)
        h2o.cluster_info()

        search_criteria = {
            "strategy": "RandomDiscrete",
            "stopping_metric": metric_name,
            "stopping_tolerance": random.uniform(1e-8, self.max_tolerance),
            "stopping_rounds": random.randint(1, self.max_stopping_rounds),
            "seed": round(time.time())
        }

        print("GLM Gaussian grid search_criteria: {0}".format(search_criteria))

        # add max_runtime_secs back into hyper-parameters to limit model runtime;
        # the per-model cap is arbitrarily set to 0.3 seconds
        self.hyper_params["max_runtime_secs"] = [0.3]

        # fire off random grid-search
        glm_template = H2OGeneralizedLinearEstimator(family=self.family,
                                                     nfolds=self.nfolds)
        grid_model = H2OGridSearch(glm_template,
                                   hyper_params=self.hyper_params,
                                   search_criteria=search_criteria)
        grid_model.train(x=self.x_indices,
                         y=self.y_index,
                         training_frame=self.training1_data)

        # bool indicating if randomized grid search has calculated the early stopping condition correctly
        stopped_correctly = pyunit_utils.evaluate_metrics_stopping(
            grid_model.models, metric_name, bigger_is_better, search_criteria,
            self.possible_number_models)

        if not stopped_correctly:
            self.test_failed += 1
            self.test_failed_array[self.test_num] = 1
            print("test4_glm_random_grid_search_metric " + metric_name +
                  ": failed. ")
        else:
            print("test4_glm_random_grid_search_metric " + metric_name +
                  ": passed. ")

        self.test_num += 1
0
ファイル: Modeling.py プロジェクト: msw1535540/db-h2o-spark
def grid_search(estimator, hparams=None, search_criteria=None):
    """
    create a grid search estimator

    The estimator and the hyper parameters are printed before the grid
    search object is built.

    :param estimator: a specific estimator for grid search
    :param hparams: a hyper parameters dict for grid search
    :param search_criteria: criteria for grid search
    :return: a constructed grid search estimator
    """
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former `print x` statement form is a SyntaxError on Python 3.
    print(estimator)
    print(hparams)
    return H2OGridSearch(model=estimator, hyper_params=hparams, search_criteria=search_criteria)
コード例 #22
0
def glm_grid(X, y, train, valid, should_submit = False):
    """ Wrapper function for penalized GLM with alpha and lambda search.

    Trains a gaussian GLM grid over several ``alpha`` values with
    ``lambda_search`` enabled, prints the grid and the selected model,
    previews the predictions on ``valid`` and logs a ranked-predictions
    plot through the module-level ``logger``.

    :param X: List of inputs.
    :param y: Name of target variable.
    :param train: Name of training H2OFrame.
    :param valid: Name of validation H2OFrame.
    :param should_submit: Currently unused; the submission step is disabled.
    :return: First model of ``grid.get_grid()`` (presumably the best-ranked
        H2OGeneralizedLinearEstimator model -- confirm the grid's sort order).
    """

    alpha_opts = [0.01, 0.25, 0.5, 0.99] # always keep some L2
    hyper_parameters = {"alpha": alpha_opts}

    # initialize grid search
    grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(
            family="gaussian",
            lambda_search=True,
            seed=12345),
        hyper_params=hyper_parameters)

    # train grid
    grid.train(y=y,
               x=X,
               training_frame=train,
               validation_frame=valid)

    # show grid search results
    print(grid.show())

    best = grid.get_grid()[0]
    print(best)

    # NOTE: uploading a submission frame when should_submit is set is not
    # implemented here; the flag is accepted for interface compatibility.

    # preview the top rows of target vs. prediction
    print('yhat_frame')
    yhat_frame = valid.cbind(best.predict(valid))
    print(yhat_frame[0:10, [y, 'predict']])

    # plot sorted predictions
    yhat_frame_df = yhat_frame[[y, 'predict']].as_data_frame()
    yhat_frame_df.sort_values(by='predict', inplace=True)
    yhat_frame_df.reset_index(inplace=True, drop=True)
    # named ranked_plot rather than plt to avoid shadowing the usual pyplot alias
    ranked_plot = yhat_frame_df.plot(title='Ranked Predictions Plot')
    logger.log_string('Ranked Predictions Plot')
    logger.log_matplotlib_plot(ranked_plot)

    # select best model
    return best
コード例 #23
0
def xgboost_start(grid_id, export_dir, train, params, hyper_parameters):
    """Create an XGBoost grid search and launch it with ``grid.start``.

    The grid uses ``parallelism=2`` and ``export_dir`` as its
    ``recovery_dir``.  Predictors are the column indices from 2 up to
    ``train.ncol`` (exclusive); the response is the ``"Angaus"`` column.
    ``params`` is forwarded to ``grid.start``.

    Returns:
        The started ``H2OGridSearch`` object.
    """
    search_options = {
        "grid_id": grid_id,
        "hyper_params": hyper_parameters,
        "recovery_dir": export_dir,
        "parallelism": 2,
    }
    xgb_grid = H2OGridSearch(H2OXGBoostEstimator, **search_options)
    predictor_idx = list(range(2, train.ncol))
    xgb_grid.start(x=predictor_idx, y="Angaus", training_frame=train, **params)
    return xgb_grid
コード例 #24
0
def kmeans_grid_iris():
    """Grid-search K-Means on the iris data and sanity-check the grid.

    Checks performed:
      * the grid size equals the product of the hyper-parameter list lengths;
      * a second grid built from a search space with every value list
        duplicated has the same size (duplicates are ignored);
      * every model of the second grid is an ``H2OKMeansEstimator``;
      * the hyper-parameters passed in were used to construct the models.
    """
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    grid_space = pyunit_utils.make_random_grid_space(algo="km")
    print("Grid space: {0}".format(grid_space))
    print("Constructing grid of Kmeans models")
    km_grid = H2OGridSearch(H2OKMeansEstimator, hyper_params=grid_space)
    km_grid.train(x=list(range(4)), training_frame=iris)

    print(
        "Check cardinality of grid, that is, the correct number of models have been created..."
    )
    expected_size = 1
    for candidates in list(grid_space.values()):
        expected_size *= len(candidates)
    actual_size = len(km_grid)
    assert expected_size == actual_size, \
        "Expected size of grid to be {0}, but got {1}".format(expected_size, actual_size)

    print("Duplicate-entries-in-grid-space check")
    doubled_space = copy.deepcopy(grid_space)
    for name, candidates in list(grid_space.items()):
        doubled_space[name] = candidates + candidates
    print("The new search space: {0}".format(doubled_space))
    print("Constructing the new grid of glm models...")
    km_grid_dups = H2OGridSearch(H2OKMeansEstimator, hyper_params=doubled_space)
    km_grid_dups.train(x=list(range(4)), training_frame=iris)
    actual_size_dups = len(km_grid_dups)
    assert actual_size == actual_size_dups, \
        ("Expected duplicates to be ignored. Without dups grid size: {0}. With dups "
         "size: {1}").format(actual_size, actual_size_dups)

    for km_model in km_grid_dups:
        assert isinstance(km_model, H2OKMeansEstimator)

    print(
        "Check that the hyper_params that were passed to grid, were used to construct the models..."
    )
    for name in list(grid_space.keys()):
        print(name)
        pyunit_utils.expect_model_param(km_grid, name, grid_space[name])
コード例 #25
0
    def init_model(self):
        """Create the four random grid searches (GBM, DRF, GLM, DNN).

        Each grid uses ``self.search_criteria`` and its own hyper-parameter
        space (``self.hyper_p_gbm``, ``self.hyper_p_drf``, ``self.hyper_p_glm``,
        ``self.hyper_p_dnn``).  The GBM, DRF and DNN base estimators get a
        ``model_id`` made of a fixed prefix plus a random integer in 0..100;
        all base estimators use 5-fold cross-validation and keep their
        cross-validation predictions.
        """

        def random_id(prefix):
            # prefix followed by one value sampled from 0..100
            return prefix + str(random.sample(list(range(101)), 1)[0])

        gbm_base = H2OGradientBoostingEstimator(
            model_id=random_id('gbm_rand_grid'),
            nfolds=5,
            fold_assignment="Modulo",
            keep_cross_validation_predictions=True,
            stopping_rounds=10,
            score_tree_interval=1)
        self.gbm_rand_grid = H2OGridSearch(
            gbm_base,
            search_criteria=self.search_criteria,
            hyper_params=self.hyper_p_gbm)

        drf_base = H2ORandomForestEstimator(
            model_id=random_id('drf_rand_grid'),
            seed=1234,
            nfolds=5,
            fold_assignment="Modulo",
            balance_classes=True,
            keep_cross_validation_predictions=True)
        self.drf_rand_grid = H2OGridSearch(
            drf_base,
            search_criteria=self.search_criteria,
            hyper_params=self.hyper_p_drf)

        glm_base = H2OGeneralizedLinearEstimator(
            family="gaussian",
            nfolds=5,
            seed=1234,
            max_iterations=30,
            keep_cross_validation_predictions=True,
            compute_p_values=False)
        self.glm_rand_grid = H2OGridSearch(
            glm_base,
            search_criteria=self.search_criteria,
            hyper_params=self.hyper_p_glm)

        dnn_base = H2ODeepLearningEstimator(
            model_id=random_id('dnn_rand_grid'),
            seed=1234,
            nfolds=5,
            fold_assignment="Modulo",
            keep_cross_validation_predictions=True)
        self.dnn_rand_grid = H2OGridSearch(
            dnn_base,
            search_criteria=self.search_criteria,
            hyper_params=self.hyper_p_dnn)
コード例 #26
0
def grid_parallel():
    """
    Run a small GBM grid search on the iris data with ``parallelism=1``.

    The grid varies only ``ntrees``; the test asserts that one model id is
    produced for every ``ntrees`` option.
    """
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    train = h2o.import_file(path=iris_path)

    # Hyper-parameter space for the GBM grid: a single dimension, ntrees.
    ntrees_opts = [1, 5]
    hyper_parameters = OrderedDict([("ntrees", ntrees_opts)])
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(
        H2OGradientBoostingEstimator,
        hyper_params=hyper_parameters,
        parallelism=1,
    )
    # The first four columns are the predictors, column 4 is the response.
    gs.train(x=list(range(4)), y=4, training_frame=train)

    assert gs is not None
    assert len(gs.model_ids) == len(ntrees_opts)
コード例 #27
0
    def test1_glm_random_grid_search_model_number(self, metric_name):
        """
        This test is used to make sure the randomized gridsearch will generate all models specified in the
        hyperparameters if no stopping condition is given in the search criterion.

        A RandomDiscrete grid search is run over self.hyper_params with stopping_rounds set to 0 and no
        model-count or runtime limit.  The number of models it builds is compared with
        self.possible_number_models; a mismatch is recorded in self.test_failed and
        self.test_failed_array[self.test_num].  Before returning, self.test_num is incremented and
        self.max_grid_runtime is set from the models that were built.

        :param metric_name: string to denote what grid search model should be sort by (not used by this test)

        :return: None
        """
        print(
            "*******************************************************************************************"
        )
        print("test1_glm_random_grid_search_model_number for GLM " +
              self.family)
        h2o.cluster_info()

        # set up our stopping condition here: random discrete, and find all models
        search_criteria = {
            'strategy': 'RandomDiscrete',
            "stopping_rounds": 0,
            "seed": round(time.time())
        }
        print("GLM Binomial grid search_criteria: {0}".format(search_criteria))

        # fire off random grid-search
        random_grid_model = H2OGridSearch(
            H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds),
            hyper_params=self.hyper_params,
            search_criteria=search_criteria)
        random_grid_model.train(x=self.x_indices,
                                y=self.y_index,
                                training_frame=self.training1_data)

        # compare the number of models built with the number of possible models
        if len(random_grid_model) != self.possible_number_models:
            self.test_failed += 1
            self.test_failed_array[self.test_num] = 1
            # note the trailing space after "generated": adjacent literals are concatenated as-is
            print(
                "test1_glm_random_grid_search_model_number for GLM: failed, number of models generated "
                "possible model number {0} and randomized gridsearch model number {1} are not "
                "equal.".format(self.possible_number_models,
                                len(random_grid_model)))

        if self.test_failed_array[self.test_num] == 0:
            print("test1_glm_random_grid_search_model_number for GLM: passed!")

        self.test_num += 1
        sys.stdout.flush()

        # record the grid runtime reported by pyunit_utils.find_grid_runtime for the models just built
        self.max_grid_runtime = pyunit_utils.find_grid_runtime(
            random_grid_model.models)
コード例 #28
0
    def test_rf_gridsearch_sorting_metrics(self):
        """
        test_rf_gridsearch_sorting_metrics performs the following when self.possible_number_models > 0:
        a. build H2O random forest models using grid search.  No model is built for bad hyper-parameters
           values.  We should instead get a warning/error message printed out.
        b. Check and make sure that the models are returned sorted with the correct cross-validation metrics:
           the metric column self.training_metric of the grid summary table must agree with each model's
           cross-validation logloss (accumulated difference at most self.diff), must be in ascending order,
           and must differ from the training logloss (accumulated difference at least self.diff).

        On failure self.test_failed is set to 1.

        :return: None
        """
        if not (self.possible_number_models > 0):
            return

        print("*******************************************************************************************")
        print("test_rf_gridsearch_sorting_metrics for random forest ")
        h2o.cluster_info()

        print("Hyper-parameters used here is {0}".format(self.final_hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2ORandomForestEstimator(nfolds=self.nfolds, seed=self.seed,
                                                            score_tree_interval=0),
                                   hyper_params=self.final_hyper_params)
        grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        result_table = grid_model._grid_json["summary_table"]
        grid_model_metrics = []

        # Text type on both Python 2 (unicode) and Python 3 (str); a metric that comes back as
        # text instead of a number is left out of the accumulated differences.
        text_type = type(u"")

        diff = 0    # calculate difference between gridsearch model metrics and manually extracted model.
        diff_train = 0  # calculate difference between training and cross-validation metrics

        # grab performance metric for each model of grid_model and collect correct sorting metrics by hand
        for model_index, each_model in enumerate(grid_model):
            # float() guarantees the grid metric is numeric, so only the manual metrics need a type guard
            grid_model_metric = float(result_table[self.training_metric][model_index])
            grid_model_metrics.append(grid_model_metric)

            model_output = each_model._model_json["output"]

            manual_metric = model_output["cross_validation_metrics"]._metric_json["logloss"]
            if not isinstance(manual_metric, text_type):
                diff += abs(grid_model_metric - manual_metric)

            manual_training_metric = model_output["training_metrics"]._metric_json["logloss"]
            if not isinstance(manual_training_metric, text_type):
                diff_train += abs(grid_model_metric - manual_training_metric)

            print("grid model logloss: {0}, grid model training logloss: "
                  "{1}".format(grid_model_metric, manual_training_metric))

        cv_mismatch = diff > self.diff
        not_sorted = grid_model_metrics != sorted(grid_model_metrics)
        # presumably a near-zero gap to the training logloss means the grid was sorted by training
        # metrics rather than cross-validation metrics -- TODO confirm
        same_as_training = diff_train < self.diff

        if cv_mismatch or not_sorted or same_as_training:
            self.test_failed = 1
            print("test_rf_gridsearch_sorting_metrics for random forest has failed!")

        if self.test_failed == 0:
            print("test_rf_gridsearch_sorting_metrics for random forest has passed!")
コード例 #29
0
 def train_models(self):
     """
     Train the GAM grid search and every manually built GAM model.

     The grid is built around a multinomial GAM estimator with three gam
     columns and self.hyper_parameters, stored in self.h2o_model and trained
     on self.h2o_data.  Each model in self.manual_gam_models is then trained
     with the same predictors, response and training frame.
     """
     base_estimator = H2OGeneralizedAdditiveEstimator(
         family="multinomial",
         gam_columns=["C6", "C7", "C8"],
         keep_gam_cols=True,
         scale=[1, 1, 1],
         num_knots=[5, 5, 5],
     )
     self.h2o_model = H2OGridSearch(base_estimator, self.hyper_parameters)
     self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

     # The manual models use exactly the same training inputs as the grid.
     for manual_model in self.manual_gam_models:
         manual_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
def test_gridsearch():
    """
    Check that two spellings of ``gam_columns`` in GAM grid-search subspaces give the same models.

    Two Cartesian grids are trained on the same frame with the same seed.  Their hyper-parameters are
    identical except for the second subspace's ``gam_columns``: the first grid writes a single-column
    gam group as a one-element list (``["c_0"]``), the second writes it as a bare string (``"c_0"``).
    Models at the same position in both grids must have equal coefficients (tolerance 1e-6).
    """
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    for factor_col in ("response", "C3", "C7", "C8", "C10"):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()

    myY = "response"
    # list.remove() returns None, so the previous `myX = names.remove(myY)` always passed x=None.
    # Build the predictor list explicitly: every column except the response.
    myX = [name for name in h2o_data.names if name != myY]

    search_criteria = {'strategy': 'Cartesian'}
    # gam_columns given as lists of lists everywhere, including single-column groups
    hyper_parameters = {'lambda': [1, 2],
                        'subspaces': [{'scale': [[0.001], [0.0002]], 'num_knots': [[5], [10]], 'bs': [[1], [0]],
                                       'gam_columns': [[["c_0"]], [["c_1"]]]},
                                      {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
                                       'bs': [[1, 1, 1], [0, 1, 1]],
                                       'num_knots': [[5, 10, 12], [6, 11, 13]],
                                       'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                                                       [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]}]}
    # same space, but single-column groups in the second subspace are bare strings
    hyper_parameters2 = {'lambda': [1, 2],
                         'subspaces': [{'scale': [[0.001], [0.0002]], 'num_knots': [[5], [10]], 'bs': [[1], [0]],
                                        'gam_columns': [[["c_0"]], [["c_1"]]]},
                                       {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
                                        'bs': [[1, 1, 1], [0, 1, 1]],
                                        'num_knots': [[5, 10, 12], [6, 11, 13]],
                                        'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                                                        ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]}]}

    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
                              hyper_params=hyper_parameters, search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
                               hyper_params=hyper_parameters2, search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)

    # compare two models by checking their coefficients.  They should be the same
    for index in range(len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(), model2.coef(), tol=1e-6)