Example #1
def nb_iris():


  print("Importing iris_wheader.csv data...\n")
  iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  iris.describe()

  iris_nbayes = H2ONaiveBayesEstimator()
  iris_nbayes.train(x=list(range(4)), y=4, training_frame=iris, validation_frame=iris)
  iris_nbayes.show()

  iris_nbayes = H2ONaiveBayesEstimator(nfolds=3)
  iris_nbayes.train(x=list(range(4)), y=4, training_frame=iris, validation_frame=iris, seed=1234)
  iris_nbayes.show()

  iris_nbayes = H2ONaiveBayesEstimator(nfolds=3)
  iris_nbayes.train(x=list(range(4)), y=4, training_frame=iris, seed=1234)
  iris_nbayes.show()

  iris_nbayes = H2ONaiveBayesEstimator(nfolds=3,fold_assignment="Modulo")
  iris_nbayes.train(x=list(range(4)), y=4, training_frame=iris)
  iris_nbayes.show()

  print("Cross-validation metrics summary:")
  print(iris_nbayes.cross_validation_metrics_summary())
  print(iris_nbayes.cross_validation_metrics_summary().as_data_frame())
  print(iris_nbayes.cross_validation_metrics_summary().as_data_frame()['mean'])
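A hedged follow-up sketch, continuing the function above: pull a single metric row out of the pandas frame returned by as_data_frame(). The assumption that the first column holds the metric names may vary across h2o-py versions.
  summary_df = iris_nbayes.cross_validation_metrics_summary().as_data_frame()
  metric_col = summary_df.columns[0]  # first column is assumed to hold the metric names
  print(summary_df.loc[summary_df[metric_col] == "logloss", "mean"])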
Example #2
def nb_init_err():

    print("Importing iris_wheader.csv data...\n")
    iris = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris.describe()

    print("Laplace smoothing parameter is negative")

    try:
        H2ONaiveBayesEstimator(laplace=-1).train(x=list(range(4)),
                                                 y=4,
                                                 training_frame=iris)
        assert False, "Expected naive bayes algo to fail on negative laplace training parameter"
    except:
        pass

    print("Minimum standard deviation is zero")
    try:
        H2ONaiveBayesEstimator(min_sdev=0).train(x=list(range(4)),
                                                 y=4,
                                                 training_frame=iris)
        assert False, "Expected naive bayes algo to fail on min_sdev = 0"
    except:
        pass

    print("Response column is not categorical")
    try:
        H2ONaiveBayesEstimator().train(x=list(range(3)), y=3, training_frame=iris)
        assert False, "Expected naive bayes algo to fail on response not categorical"
    except:
        pass
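For contrast, a minimal sketch (not part of the original test) showing the same estimator trained with values inside the legal ranges; the laplace/min_sdev/eps_sdev values are illustrative only.
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator

iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
nb_valid = H2ONaiveBayesEstimator(laplace=1, min_sdev=0.001, eps_sdev=0)  # all legal values
nb_valid.train(x=list(range(4)), y=4, training_frame=iris)
nb_valid.show()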
Example #3
def naive_bayes(xval=None, sample_size=None, nfolds=None, hparams=None, for_stacking=None):
    """
    Create a naive bayes estimator.
    :param xval: whether to use cross-validation
    :param sample_size: number of rows in the training set
    :param nfolds: k value for k-fold cross-validation
    :param hparams: hyper-parameters for grid search
    :param for_stacking: whether the estimator will be used for stacking
    :return: a constructed naive bayes estimator and a hyper-parameter dict for grid search
    """
    if sample_size <= 50000:
        if sample_size <= 10000:
            default_nfolds = 3
        else:
            default_nfolds = 5
        laplace_opts = [0.1, 1, 5, 10]
        min_sdev_opts = [0.001, 0.005, 0.1]
        eps_sdev_opts = [0, 0.001, 0.01]

    elif 50000 < sample_size <= 500000:
        default_nfolds = 3
        laplace_opts = [0.1, 1, 5]
        min_sdev_opts = [0.001, 0.1]
        eps_sdev_opts = [0, 0.01]

    else:
        default_nfolds = 2
        laplace_opts = [0.1, 5]
        min_sdev_opts = [0.001, 0.005]
        eps_sdev_opts = [0, 0.01]

    default_hparams = dict({'laplace': laplace_opts,
                            'min_sdev': min_sdev_opts,
                            'eps_sdev': eps_sdev_opts})

    if nfolds is None:
        nfolds = default_nfolds
    if hparams is None:
        hparams = default_hparams

    if xval:
        if for_stacking:
            nb_estimator = H2ONaiveBayesEstimator(nfolds=nfolds,
                                                  fold_assignment="Modulo",
                                                  seed=1,
                                                  keep_cross_validation_predictions=True)
        else:
            nb_estimator = H2ONaiveBayesEstimator(nfolds=nfolds)
    else:
        nb_estimator = H2ONaiveBayesEstimator()

    return nb_estimator, hparams
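A hedged usage sketch for the factory above: feed the returned estimator and hyper-parameter dict into H2OGridSearch. The frame train_frame, the response name "label", and the predictor names are placeholders, not part of the original code.
from h2o.grid.grid_search import H2OGridSearch

nb_estimator, hparams = naive_bayes(xval=True, sample_size=20000, for_stacking=False)
grid = H2OGridSearch(model=nb_estimator, hyper_params=hparams)
grid.train(x=["f1", "f2", "f3"], y="label", training_frame=train_frame)  # placeholder names
best_nb = grid.get_grid(sort_by="logloss", decreasing=False).models[0]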
Example #4
def nb_baddata():

    rawdata = [[random.gauss(0, 1) for r in range(100)] for c in range(10)]

    print("Training data with all NA's")
    train = [["NA" for r in range(100)] for c in range(10)]
    train_h2o = h2o.H2OFrame(python_obj=train)
    from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
    try:
        H2ONaiveBayesEstimator().train(x=list(range(1, 10)),
                                       y=0,
                                       training_frame=train_h2o)
        assert False, "Expected naive bayes algo to fail on training data of all NA's"
    except:
        pass

    # Response column must be categorical
    print("Training data with a numeric response column")
    train_h2o = h2o.H2OFrame(python_obj=rawdata)
    try:
        H2ONaiveBayesEstimator().train(x=list(range(1, 10)),
                                       y=0,
                                       training_frame=train_h2o)
        assert False, "Expected naive bayes algo to fail on training data with a numeric response column"
    except:
        pass

    # Constant response dropped before model building
    print("Training data with a constant response: drop and throw error")
    rawdata[0] = 100 * ["A"]
    train_h2o = h2o.H2OFrame(python_obj=rawdata)
    try:
        H2ONaiveBayesEstimator().train(x=list(range(1, 10)),
                                       y=0,
                                       training_frame=train_h2o)
        assert False, "Expected naive bayes algo to fail on training data with a constant response: drop and throw error"
    except:
        pass

    # Predictors with constant value automatically dropped
    print("Training data with 1 col of all 5's: drop automatically")
    rawdata = [[random.gauss(0, 1) for r in range(100)] for c in range(10)]
    rawdata[4] = 100 * [5]
    rawdata[0] = [random.choice(string.ascii_letters) for _ in range(100)]
    train_h2o = h2o.H2OFrame(python_obj=rawdata)
    model = H2ONaiveBayesEstimator()
    model.train(x=list(range(10)), y=0, training_frame=train_h2o)
    assert len(model._model_json['output']['pcond']) == 8, "Expected 8 predictors, but got {0}" \
                                                           "".format(len(model._model_json['output']['pcond']))
Example #5
def naive_bayes_export():
    print("###### NAIVE BAYES ######")
    frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    model = H2ONaiveBayesEstimator(laplace=0.25)
    model.train(x=list(range(4)), y=4, training_frame=frame)
    h2o.download_pojo(model, path=RESULT_DIR)
    expect_error(model.download_mojo, model="Naive Bayes", format='MOJO')
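A hedged expansion of what the expect_error helper presumably asserts here, written as a continuation of the function above (RESULT_DIR comes from the surrounding test module):
    mojo_failed = False
    try:
        model.download_mojo(path=RESULT_DIR)
    except Exception as err:
        mojo_failed = True
        print("download_mojo failed as expected: {0}".format(err))
    assert mojo_failed, "MOJO export was expected to fail for Naive Bayes"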
Example #6
    def test_naivebayes_grid_search_over_params(self):
        """
        test_naivebayes_grid_search_over_params performs the following:
        run gridsearch model and then build each model manually and see if we receive the same error messages.
        """
        print(
            "*******************************************************************************************"
        )
        print("test_naivebayes_grid_search_over_params for naivebayes ")
        h2o.cluster_info()

        print("Hyper-parameters used here are {0}".format(
            self.final_hyper_params))

        # # start grid search
        # grid_model = H2OGridSearch(H2ONaiveBayesEstimator(nfolds=self.nfolds),
        #                            hyper_params=self.final_hyper_params)
        # grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        # add parameters into params_dict.  Use this to manually build model, one at a time
        params_dict = dict()
        params_dict["nfolds"] = self.nfolds

        manual_model = [None] * self.possible_number_models

        model_index = 0
        for fold_v in self.final_hyper_params["fold_assignment"]:
            for max_t in self.final_hyper_params["max_runtime_secs"]:
                for laplace_v in self.final_hyper_params["laplace"]:
                    params_list = dict()
                    params_list["fold_assignment"] = fold_v
                    params_list["max_runtime_secs"] = max_t
                    params_list["laplace"] = laplace_v

                    print("Hyper-parameters used here are {0}\n".format(
                        params_list))

                    params_list.update(params_dict)

                    model_params = dict()

                    # need to take max_runtime_secs out of the model parameters; it is now set in .train()
                    if "max_runtime_secs" in params_list:
                        model_params["max_runtime_secs"] = params_list[
                            "max_runtime_secs"]
                        max_runtime = params_list["max_runtime_secs"]
                        del params_list["max_runtime_secs"]
                    else:
                        max_runtime = 0

                    manual_model[model_index] = H2ONaiveBayesEstimator(
                        **params_list)
                    manual_model[model_index].train(
                        x=self.x_indices,
                        y=self.y_index,
                        training_frame=self.training1_data,
                        **model_params)
                    model_index += 1
Example #7
def _run_builder(key, algorithm, training_dataset_key, y, x, model_type):
    # create the client before the try block so the except handler can report failures
    client = _get_memcached_client()
    try:
        contents = get_dataset_contents(training_dataset_key)
        with tempfile.TemporaryDirectory() as tmpdir:
            dataset_path = os.path.join(tmpdir, TRAINING_FILE)
            with open(dataset_path, 'w') as training_file:
                training_file.write(contents)
            h2o.init()
            training_frame = h2o.import_file(dataset_path)

            if algorithm == MLAlgorithm.NAIVE_BAYES:
                # naivebayes expects the prediction response to be categorical
                training_frame[y] = training_frame[y].asfactor()
                estimator = H2ONaiveBayesEstimator()
            elif algorithm == MLAlgorithm.GRADIENT_BOOSTING_MACHINE:
                estimator = H2OGradientBoostingEstimator()

            kwargs = {'training_frame': training_frame, 'y': y}
            if x is not None:
                kwargs['x'] = x

            estimator.train(**kwargs)

            temp_folder = os.path.join(os.path.abspath(os.sep), 'tmp')
            if model_type.upper() == 'POJO':
                model_file = estimator.download_pojo(
                    path=temp_folder,
                    get_genmodel_jar=True,
                    genmodel_name='h2o-genmodel.jar')
            else:
                model_file = estimator.download_mojo(
                    path=temp_folder,
                    get_genmodel_jar=True,
                    genmodel_name='h2o-genmodel.jar')

            model_performance = estimator.model_performance()
            details = {'mse': model_performance.mse()}

            with zipfile.ZipFile(os.path.join(temp_folder, key), 'w') as zip:
                zip.write(model_file, os.path.basename(model_file))
                zip.write(os.path.join(temp_folder, 'h2o-genmodel.jar'),
                          'h2o-genmodel.jar')

            client.set(
                key,
                json.dumps({
                    'status': 'COMPLETE',
                    'description': 'Model has been built',
                    'details': details,
                    'path': model_file
                }))
    except Exception as ex:
        client.set(key, json.dumps({
            'status': 'FAILED',
            'description': str(ex)
        }))
        logger.exception("Building model failed")
Example #8
    def test_naivebayes_grid_search_over_params(self):
        """
        test_naivebayes_grid_search_over_params performs the following:
        run gridsearch model and then build each model manually and see if we receive the same error messages.
        """
        print(
            "*******************************************************************************************"
        )
        print("test_naivebayes_grid_search_over_params for naivebayes ")
        h2o.cluster_info()

        print("Hyper-parameters used here are {0}".format(
            self.final_hyper_params))

        # # start grid search
        # grid_model = H2OGridSearch(H2ONaiveBayesEstimator(nfolds=self.nfolds),
        #                            hyper_params=self.final_hyper_params)
        # grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        # add parameters into params_dict.  Use this to manually build model, one at a time
        params_dict = dict()
        params_dict["nfolds"] = self.nfolds
        params_list = dict()
        params_list["fold_assignment"] = self.final_hyper_params[
            "fold_assignment"][0]
        #       params_list["max_runtime_secs"] = self.final_hyper_params["max_runtime_secs"][1]
        params_list["max_runtime_secs"] = 10  # this will return full NB model

        # the field manual_model._model_json['output']['cross_validation_metrics_summary'].cell_values will be empty
        params_list[
            "max_runtime_secs"] = 0.001  # this will not return full NB model
        params_list["laplace"] = self.final_hyper_params["laplace"][0]

        print("Hyper-parameters used here are {0}\n".format(params_list))

        params_list.update(params_dict)

        model_params = dict()

        # need to take max_runtime_secs out of the model parameters; it is now set in .train()
        if "max_runtime_secs" in params_list:
            model_params["max_runtime_secs"] = params_list["max_runtime_secs"]
            max_runtime = params_list["max_runtime_secs"]
            del params_list["max_runtime_secs"]
        else:
            max_runtime = 0

        manual_model = H2ONaiveBayesEstimator(**params_list)
        manual_model.train(x=self.x_indices,
                           y=self.y_index,
                           training_frame=self.training1_data,
                           **model_params)

        print("Done!")
Example #9
def nb_iris():

    print("Importing iris_wheader.csv data...\n")
    iris = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris.describe()

    laplace_range = [0, 1, 0.25]
    for i in laplace_range:
        print("H2O Naive Bayes with Laplace smoothing = {0}".format(i))
        iris_nbayes = H2ONaiveBayesEstimator(laplace=i)
        iris_nbayes.train(x=list(range(4)), y=4, training_frame=iris)
        iris_nbayes.show()
Example #10
def grid_cars_NB():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    problem = random.sample(["binomial", "multinomial"], 1)[0]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == "binomial":
        response_col = "economy_20mpg"
    else:
        response_col = "cylinders"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    print("Converting the response column to a factor...")
    train[response_col] = train[response_col].asfactor()

    max_runtime_secs = 10  # this will return full NB model
    # the field manual_model._model_json['output']['cross_validation_metrics_summary'].cell_values will be empty
    max_runtime_secs = 0.001

    model_params = {
        'compute_metrics': True,
        'fold_assignment': 'AUTO',
        'laplace': 8.3532975,
        'nfolds': 2
    }

    cars_nb = H2ONaiveBayesEstimator(**model_params)
    cars_nb.train(x=predictors,
                  y=response_col,
                  training_frame=train,
                  max_runtime_secs=max_runtime_secs)

    if len(cars_nb._model_json['output']
           ['cross_validation_metrics_summary'].cell_values) > 0:
        print("Pass test.  Complete metrics returned.")
    else:
        print("Failed test.  Model metrics are missing.")
Example #11
def nb_prostate():

    print("Importing prostate.csv data...")
    prostate = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DCAPS, and DPROS to categorical")
    prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
    prostate['RACE'] = prostate['RACE'].asfactor()
    prostate['DCAPS'] = prostate['DCAPS'].asfactor()
    prostate['DPROS'] = prostate['DPROS'].asfactor()

    print("Compare with Naive Bayes when x = 3:9, y = 2")
    from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
    prostate_nb = H2ONaiveBayesEstimator(laplace=0)
    prostate_nb.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    prostate_nb.show()

    print("Predict on training data")
    prostate_pred = prostate_nb.predict(prostate)
    prostate_pred.head()
Example #12
    def test_naivebayes_grid_search_over_params(self):
        """
        test_naivebayes_grid_search_over_params performs the following:
        a. build H2O naivebayes models using grid search.  Count and make sure models
           are only built for hyper-parameters set to legal values.  No model is built for bad hyper-parameters
           values.  We should instead get a warning/error message printed out.
        b. For each model built using grid search, we will extract the parameters used in building
           that model and manually build an H2O naivebayes model.  Logloss is calculated from a test set
           to compare the performance of the grid search model and our manually built model.  If their metrics
           are close, declare test success.  Otherwise, declare test failure.
        c. we will also check and make sure the models are built within the max_runtime_secs time limit that was
           set for them.  If max_runtime_secs was exceeded, declare test failure.
        """
        print(
            "*******************************************************************************************"
        )
        print("test_naivebayes_grid_search_over_params for naivebayes ")
        h2o.cluster_info()

        try:
            print("Hyper-parameters used here are {0}".format(
                self.final_hyper_params))

            # start grid search
            grid_model = H2OGridSearch(
                H2ONaiveBayesEstimator(nfolds=self.nfolds),
                hyper_params=self.final_hyper_params)
            grid_model.train(x=self.x_indices,
                             y=self.y_index,
                             training_frame=self.training1_data)

            self.correct_model_number = len(
                grid_model)  # store number of models built

            # make sure the correct number of models are built by gridsearch
            if not (self.correct_model_number
                    == self.possible_number_models):  # wrong grid model number
                self.test_failed += 1
                print(
                    "test_naivebayes_grid_search_over_params for naivebayes failed: number of models built by "
                    "gridsearch does not equal the number of all possible hyper-parameter combinations"
                )
            else:
                # add parameters into params_dict.  Use this to manually build model
                params_dict = dict()
                params_dict["nfolds"] = self.nfolds
                params_dict["score_tree_interval"] = 0
                total_run_time_limits = 0.0  # calculate upper bound of max_runtime_secs
                true_run_time_limits = 0.0
                manual_run_runtime = 0.0

                # compare performance metric of model built by gridsearch with manually built model
                for each_model in grid_model:

                    params_list = grid_model.get_hyperparams_dict(
                        each_model._id)
                    params_list.update(params_dict)

                    model_params = dict()

                    # need to take max_runtime_secs out of the model parameters; it is now set in .train()
                    if "max_runtime_secs" in params_list:
                        model_params["max_runtime_secs"] = params_list[
                            "max_runtime_secs"]
                        max_runtime = params_list["max_runtime_secs"]
                        del params_list["max_runtime_secs"]
                    else:
                        max_runtime = 0

                    if "validation_frame" in params_list:
                        model_params["validation_frame"] = params_list[
                            "validation_frame"]
                        del params_list["validation_frame"]

                    if "score_tree_interval" in params_list:
                        model_params["score_tree_interval"] = params_list[
                            "score_tree_interval"]
                        del params_list["score_tree_interval"]

                    if "eps_prob" in params_list:
                        model_params["eps_prob"] = params_list["eps_prob"]
                        del params_list["eps_prob"]

                    if "min_prob" in params_list:
                        model_params["min_prob"] = params_list["min_prob"]
                        del params_list["min_prob"]

                    # make sure manual model was provided the same max_runtime_secs as the grid model
                    each_model_runtime = pyunit_utils.find_grid_runtime(
                        [each_model])

                    manual_model = H2ONaiveBayesEstimator(**params_list)
                    manual_model.train(x=self.x_indices,
                                       y=self.y_index,
                                       training_frame=self.training1_data,
                                       **model_params)

                    # collect the time taken to manually build all models
                    model_runtime = pyunit_utils.find_grid_runtime(
                        [manual_model])  # time taken to build this model
                    manual_run_runtime += model_runtime

                    if max_runtime > 0:
                        # shortest possible time it takes to build this model
                        if (max_runtime < self.model_run_time):
                            total_run_time_limits += model_runtime
                        else:
                            total_run_time_limits += max_runtime

                    true_run_time_limits += max_runtime

                    # compute and compare test metrics between the two models
                    test_grid_model_metrics = \
                        each_model.model_performance(test_data=self.training1_data)._metric_json[self.training_metric]
                    test_manual_model_metrics = \
                        manual_model.model_performance(test_data=self.training1_data)._metric_json[self.training_metric]

                    # just compare the mse in this case within tolerance:
                    if (each_model_runtime > 0) and \
                            (abs(model_runtime - each_model_runtime)/each_model_runtime < self.allowed_runtime_diff) \
                            and (abs(test_grid_model_metrics - test_manual_model_metrics) > self.allowed_diff):
                        self.test_failed += 1  # count total number of tests that have failed
                        print(
                            "test_naivebayes_grid_search_over_params for naivebayes failed: grid search model and manually "
                            "built H2O model differ too much in test MSE!")
                        break

                total_run_time_limits = max(
                    total_run_time_limits,
                    true_run_time_limits) * (1 + self.extra_time_fraction)

                # make sure the max_runtime_secs is working to restrict model build time
                if not (manual_run_runtime <= total_run_time_limits):
                    self.test_failed += 1
                    print(
                        "test_naivebayes_grid_search_over_params for naivebayes failed: time taken to manually build models is {0}."
                        "  Maximum allowed time is {1}".format(
                            manual_run_runtime, total_run_time_limits))

                if self.test_failed == 0:
                    print(
                        "test_naivebayes_grid_search_over_params for naivebayes has passed!"
                    )
        except:
            if self.possible_number_models > 0:
                print(
                    "test_naivebayes_grid_search_over_params for naivebayes failed: exception was thrown for no reason."
                )
                self.test_failed += 1
Example #13
    def test_naivebayes_grid_search_over_params(self):
        """
        test_naivebayes_grid_search_over_params performs the following:
        run gridsearch model and then build each model manually and see if we receive the same error messages.
        """
        print(
            "*******************************************************************************************"
        )
        print("test_naivebayes_grid_search_over_params for naivebayes ")
        h2o.cluster_info()

        print("Hyper-parameters used here are {0}".format(
            self.final_hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2ONaiveBayesEstimator(nfolds=self.nfolds),
                                   hyper_params=self.final_hyper_params)
        grid_model.train(x=self.x_indices,
                         y=self.y_index,
                         training_frame=self.training1_data)

        # add parameters into params_dict.  Use this to manually build model, one at a time
        params_dict = dict()
        params_dict["nfolds"] = self.nfolds

        manual_model = [None] * len(grid_model)

        model_index = 0
        for each_model in grid_model:

            params_list = grid_model.get_hyperparams_dict(each_model._id)
            params_list.update(params_dict)

            model_params = dict()

            # need to take max_runtime_secs out of the model parameters; it is now set in .train()
            if "max_runtime_secs" in params_list:
                model_params["max_runtime_secs"] = params_list[
                    "max_runtime_secs"]
                max_runtime = params_list["max_runtime_secs"]
                del params_list["max_runtime_secs"]
            else:
                max_runtime = 0

            if "validation_frame" in params_list:
                model_params["validation_frame"] = params_list[
                    "validation_frame"]
                del params_list["validation_frame"]

            if "eps_prob" in params_list:
                model_params["eps_prob"] = params_list["eps_prob"]
                del params_list["eps_prob"]

            if "min_prob" in params_list:
                model_params["min_prob"] = params_list["min_prob"]
                del params_list["min_prob"]

            manual_model[model_index] = H2ONaiveBayesEstimator(**params_list)
            manual_model[model_index].train(x=self.x_indices,
                                            y=self.y_index,
                                            training_frame=self.training1_data,
                                            **model_params)
            model_index += 1
Example #14
# In[167]:

cred_train.shape

# In[168]:

cred_test.shape

# In[169]:

from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator

# In[170]:

cred_model = H2ONaiveBayesEstimator(model_id='cred_model', laplace=1)

# In[171]:

# train once on the training set; pass the test set as a validation frame rather than re-training on it
cred_model.train(x=features, y=results, training_frame=cred_train, validation_frame=cred_test)

# In[172]:

cred_model.model_performance(cred_train)

# In[173]:

cred_model.model_performance(cred_test)

# In[174]:
Example #15
        firm_period_predictions = pd.read_csv(os.path.join(model_save_dir,"firm_period_predictions20171228.csv"))
        avg_predictions = firm_period_predictions.groupby('DealerTIN').mean()
        betas = predictions2betas_curve(avg_predictions,model_score_col,label_var,return_betas=True,plot_betas=False)
        betas.to_csv(os.path.join(model_save_dir,"betas20171231.csv"))
        fs_name = feature_sets_names[i]
        axes = predictions2betas_curve(avg_predictions,model_score_col,label_var,return_betas=False,plot_betas=True,label=fs_name,axes=axes)
    axes.set_title('betas curves for various feature sets'.title())
    plt.legend()

CLASSIFIER_DATE_STR = '20180415'
CLASSIFIERS = {
'RandomForest': H2ORandomForestEstimator(ntrees=200, keep_cross_validation_predictions=True, stopping_rounds=2, score_each_iteration=True, model_id="rf_cv_all_folds_"+CLASSIFIER_DATE_STR, seed=1000000),
'RandomForest_depth6': H2ORandomForestEstimator(ntrees=200, max_depth=6,keep_cross_validation_predictions=True, stopping_rounds=2, score_each_iteration=True, model_id="rf_cv_all_folds_"+CLASSIFIER_DATE_STR, seed=1000000),
'GLM': H2OGeneralizedLinearEstimator(family= "binomial", lambda_ = 0, compute_p_values = True, remove_collinear_columns=True, keep_cross_validation_predictions=True, model_id="glm_cv_all_folds_"+CLASSIFIER_DATE_STR, seed=1000000), # todo: regularization? http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/glm.html#regularization-parameters-in-glm
'GBM': H2OGradientBoostingEstimator(ntrees=200, learn_rate=0.2, max_depth=20, stopping_tolerance=0.01, stopping_rounds=2, score_each_iteration=True, keep_cross_validation_predictions=True, model_id="gbm_cv_all_folds_"+CLASSIFIER_DATE_STR, seed=1000000),
'NaiveBayes': H2ONaiveBayesEstimator(keep_cross_validation_predictions=True,model_id="naive_bayes_cv_all_folds_"+CLASSIFIER_DATE_STR, seed=1000000)
# 'XGBOOST': H2OXGBoostEstimator(ntrees=200, learn_rate=0.2, max_depth=20, stopping_tolerance=0.01, stopping_rounds=2, score_each_iteration=True, keep_cross_validation_predictions=True, model_id="gbm_cv_all_folds_"+CLASSIFIER_DATE_STR, seed=1000000)
}
# todo: categorical_encoding='one_hot_internal'? No. Works for GLM automatically, impossible for tree methods.

def different_classifiers_save_predictions(ffemq12,model_score_col='model_score_bogus_online',label_var='bogus_online'):
    """
    run different classification algorithms on the data and save CV predictions & models
    """
    for algo_name,model in CLASSIFIERS.items():
        print(algo_name)
        print('-' * 30)
        print(time.ctime())
        try:
            model_save_dir=r'D:\shekhar_code_github\BogusFirmCatching\Models\diff_classifiers\{}\classifier_{}'.format(CLASSIFIER_DATE_STR,algo_name)
            if not os.path.exists(model_save_dir):
Example #16

# In[98]:


cols = ['fac_type', 'risk','service_code','month','day','year','code_islarge']
_ = rf_model.partial_plot(data=model_data, cols=cols, nbins=200, figsize=(18, 20))


# In[99]:


#NaiveBayes
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
train, test = model_data.split_frame(ratios=[0.7])
naive_bayes =  H2ONaiveBayesEstimator(nfolds=10, seed=42)
naive_bayes.train(x=feature, y=target, training_frame=train, validation_frame=test)


# In[100]:


#Make predictions
train_true = train.as_data_frame()['grade'].values
test_true = test.as_data_frame()['grade'].values
train_pred = naive_bayes.predict(train).as_data_frame()['p1'].values
test_pred = naive_bayes.predict(test).as_data_frame()['p1'].values

train_fpr, train_tpr, _ = roc_curve(train_true, train_pred)
test_fpr, test_tpr, _ = roc_curve(test_true, test_pred)
train_auc = np.round(auc(train_fpr, train_tpr), 3)
Example #17
def javapredict(algo,
                equality,
                train,
                test,
                x,
                y,
                compile_only=False,
                **kwargs):
    print("Creating model in H2O")
    if algo == "gbm": model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest": model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning": model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm": model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes": model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans": model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca": model = H2OPCA(**kwargs)
    else: raise ValueError("algo {0} is not supported".format(algo))
    if algo == "kmeans" or algo == "pca":
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g",
        "-J-XX:MaxPermSize=256m", java_file
    ]
    subprocess.check_call(javac_cmd)

    if not compile_only:
        print("Predicting in H2O")
        predictions = model.predict(test)
        predictions.summary()
        predictions.head()
        out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
        h2o.download_csv(predictions, out_h2o_csv)
        assert os.path.exists(
            out_h2o_csv
        ), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("H2O Predictions saved in {0}".format(out_h2o_csv))

        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir, "in.csv")
        h2o.download_csv(test[x], in_csv)

        # hack: the PredictCsv driver can't handle quoted strings, so remove them
        f = open(in_csv, 'r+')
        csv = f.read()
        csv = re.sub('\"', '', csv)
        f.seek(0)
        f.write(csv)
        f.truncate()
        f.close()
        assert os.path.exists(
            in_csv), "Expected file {0} to exist, but it does not.".format(
                in_csv)
        print("Input CSV to PredictCsv saved in {0}".format(in_csv))

        print("Running PredictCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = [
            "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir,
            "-Xmx12g", "-XX:MaxPermSize=2g", "-XX:ReservedCodeCacheSize=256m",
            "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
            "--input", in_csv, "--output", out_pojo_csv
        ]
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        o, e = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(
            out_pojo_csv
        ), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        predictions2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo predictions saved in {0}".format(out_pojo_csv))

        print("Comparing predictions between H2O and Java POJO")
        # Dimensions
        hr, hc = predictions.dim
        pr, pc = predictions2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(
            hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(
            hc, pc)

        # Value
        for r in range(hr):
            hp = predictions[r, 0]
            if equality == "numeric":
                pp = float.fromhex(predictions2[r, 0])
                assert abs(
                    hp - pp
                ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                    r, hp, pp)
            elif equality == "class":
                pp = predictions2[r, 0]
                assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                    r, hp, pp)
            else:
                raise ValueError(
                    "equality type {0} is not supported".format(equality))
    def test_naivebayes_grid_search_over_params(self):
        """
        test_naivebayes_grid_search_over_params performs the following:
        a. grab all truly griddable parameters and randomly or manually set the parameter values.
        b. Next, build H2O naivebayes models using grid search.  Count and make sure models
           are only built for hyper-parameters set to legal values.  No model is built for bad hyper-parameters
           values.  We should instead get a warning/error message printed out.
        c. For each model built using grid search, we will extract the parameters used in building
           that model and manually build a H2O naivebayes model.  Training metrics are calculated from the
           gridsearch model and the manually built model.  If their metrics
           differ by too much, print a warning message but don't fail the test.
        d. we will check and make sure the models are built within the max_runtime_secs time limit that was set
           for it as well.  If max_runtime_secs was exceeded, declare test failure.
        """
        print("*******************************************************************************************")
        print("test_naivebayes_grid_search_over_params for naivebayes ")
        h2o.cluster_info()

        try:
            print("Hyper-parameters used here are {0}".format(self.final_hyper_params))

            # start grid search
            grid_model = H2OGridSearch(H2ONaiveBayesEstimator(nfolds=self.nfolds),
                                   hyper_params=self.final_hyper_params)
            grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

            self.correct_model_number = len(grid_model)     # store number of models built

            # make sure the correct number of models are built by gridsearch
            if not (self.correct_model_number == self.possible_number_models):  # wrong grid model number
                self.test_failed += 1
                print("test_naivebayes_grid_search_over_params for naivebayes failed: number of models built by "
                      "gridsearch {0} does not equal the number of all possible hyper-parameter combinations "
                      "{1}".format(self.correct_model_number, self.possible_number_models))
            else:
                # add parameters into params_dict.  Use this to manually build model
                params_dict = dict()
                params_dict["nfolds"] = self.nfolds
                total_run_time_limits = 0.0   # calculate upper bound of max_runtime_secs
                true_run_time_limits = 0.0
                manual_run_runtime = 0.0
                gridsearch_runtime = 0.0

                # compare performance metric of model built by gridsearch with manually built model
                for each_model in grid_model:

                    params_list = grid_model.get_hyperparams_dict(each_model._id)
                    params_list.update(params_dict)

                    model_params = dict()

                    # need to take max_runtime_secs out of the model parameters; it is now set in .train()
                    if "max_runtime_secs" in params_list:
                        model_params["max_runtime_secs"] = params_list["max_runtime_secs"]
                        max_runtime = params_list["max_runtime_secs"]
                        del params_list["max_runtime_secs"]
                    else:
                        max_runtime = 0

                    if "validation_frame" in params_list:
                        model_params["validation_frame"] = params_list["validation_frame"]
                        del params_list["validation_frame"]

                    if "eps_prob" in params_list:
                        model_params["eps_prob"] = params_list["eps_prob"]
                        del params_list["eps_prob"]

                    if "min_prob" in params_list:
                        model_params["min_prob"] = params_list["min_prob"]
                        del params_list["min_prob"]

                    # make sure manual model was provided the same max_runtime_secs as the grid model
                    each_model_runtime = pyunit_utils.find_grid_runtime([each_model])
                    gridsearch_runtime += each_model_runtime

                    manual_model = H2ONaiveBayesEstimator(**params_list)
                    manual_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data,
                                       **model_params)

                    # collect the time taken to manually build all models
                    model_runtime = pyunit_utils.find_grid_runtime([manual_model])  # time taken to build this model
                    manual_run_runtime += model_runtime

                    if max_runtime > 0:
                        # shortest possible time it takes to build this model
                        if (max_runtime < self.model_run_time):
                            total_run_time_limits += model_runtime
                        else:
                            total_run_time_limits += max_runtime

                    true_run_time_limits += max_runtime

                    # compute and compare test metrics between the two models
                    grid_model_metrics = \
                        each_model.model_performance(test_data=self.training1_data)._metric_json[self.training_metric]
                    manual_model_metrics = \
                        manual_model.model_performance(test_data=self.training1_data)._metric_json[self.training_metric]

                    # just compare the mse in this case within tolerance:
                    if not((type(grid_model_metrics) == str) or (type(manual_model_metrics) == str)):
                        if (abs(grid_model_metrics) > 0) \
                            and (abs(grid_model_metrics - manual_model_metrics)/grid_model_metrics > self.allowed_diff):
                            print("test_naivebayes_grid_search_over_params for naivebayes WARNING\ngrid search model "
                                  "{0}: {1}, time taken to build (secs): {2}\n and manually built H2O model {3}: {4}, "
                                  "time taken to build (secs): {5}\ndiffer too much!"
                                  "".format(self.training_metric, grid_model_metrics, each_model_runtime,
                                            self.training_metric, manual_model_metrics, model_runtime))

                print("Time taken for gridsearch to build all models (sec): {0}\n Time taken to manually build all "
                      "models (sec): {1}, total run time limits (sec): "
                      "{2}".format(gridsearch_runtime, manual_run_runtime, total_run_time_limits))
                total_run_time_limits = max(total_run_time_limits, true_run_time_limits) * (1+self.extra_time_fraction)


                # make sure the max_runtime_secs is working to restrict model build time
                if not(manual_run_runtime <= total_run_time_limits):
                    self.test_failed += 1
                    print("test_naivebayes_grid_search_over_params for naivebayes failed: time taken to manually build "
                          "models is {0}.  Maximum allowed time "
                          "is {1}".format(manual_run_runtime, total_run_time_limits))

                if self.test_failed == 0:
                    print("test_naivebayes_grid_search_over_params for naivebayes has passed!")
        except Exception as e:
            if self.possible_number_models > 0:
                print("test_naivebayes_grid_search_over_params for naivebayes failed: exception ({0}) was thrown for "
                      "no reason.".format(e))
                self.test_failed += 1
    def setup_model(self):
        """
        This function sets up the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by naivebayes.
        2. It will find the intersection of parameters that are both griddable and used by naivebayes.
        3. There are several extra parameters that are used by naivebayes that are denoted as griddable but actually
        are not.  These parameters have to be discovered manually and they are captured in
        self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameters.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2ONaiveBayesEstimator(nfolds=self.nfolds, compute_metrics=True)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters, including values outside the legal range
        # (e.g. alpha values outside the legal range of 0 to 1)
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameter and others as well to make sure they make sense
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        final_hyper_params_keys = list(self.final_hyper_params)
        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in final_hyper_params_keys) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        # need to check that min_prob >= 1e-10
        if "min_prob" in final_hyper_params_keys:
            old_len_prob = len([x for x in self.final_hyper_params["min_prob"] if (x >= 0)])
            good_len_prob = len([x for x in self.final_hyper_params["min_prob"] if (x >= 1e-10)])
            self.possible_number_models = self.possible_number_models*good_len_prob/old_len_prob

        if "laplace" in final_hyper_params_keys:
            self.final_hyper_params["laplace"] = [self.laplace_scale * x for x
                                                  in self.hyper_params["laplace"]]

            # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that column names and column types are returned in the model
      output for every algorithm supported by H2O.  See PUBDEV-5801.
    '''
    seed = 12345
    print("Checking GLM.....")
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    model = H2OGeneralizedLinearEstimator(family="binomial",
                                          alpha=1.0,
                                          lambda_search=False,
                                          max_iterations=2,
                                          seed=seed)
    checkColumnNamesTypesReturned(
        training1_data,
        model, ["displacement", "power", "weight", "acceleration", "year"],
        y_index="economy_20mpg")

    print("Checking GLRM.....")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=3,
                                              loss="Quadratic",
                                              gamma_x=0.5,
                                              gamma_y=0.5,
                                              transform="STANDARDIZE")
    checkColumnNamesTypesReturned(irisH2O, glrm_h2o, irisH2O.names)

    print("Checking NaiveBayes......")
    model = H2ONaiveBayesEstimator(laplace=0.25)
    x_indices = irisH2O.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    checkColumnNamesTypesReturned(irisH2O, model, x_indices, y_index=y_index)

    # deeplearning
    print("Checking deeplearning.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    x_indices = training1_data.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    checkColumnNamesTypesReturned(training1_data,
                                  model,
                                  x_indices,
                                  y_index=y_index)

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Test done in pyunit_stackedensemble_regression.py."
    )

    # GBM run
    print("Checking GBM.....")
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    x_indices = training1_data.names
    y_index = x_indices[-1]
    x_indices.remove(y_index)
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    checkColumnNamesTypesReturned(training1_data,
                                  model,
                                  x_indices,
                                  y_index=y_index)

    # random forest
    print("Checking Random Forest.....")
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    checkColumnNamesTypesReturned(training1_data,
                                  model,
                                  x_indices,
                                  y_index=y_index)

    # PCA
    print("Checking PCA.....")
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = training1_data.names
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    checkColumnNamesTypesReturned(training1_data, model, x_indices)

    # kmeans
    print("Checking kmeans....")
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = training1_data.names
    model = H2OKMeansEstimator(k=10)
    checkColumnNamesTypesReturned(training1_data, model, x_indices)

    # word2vec
    print("Checking word2vec....")
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    checkColumnNamesTypesReturned(train, w2v_model, [], 0)
def basic_inference_works_for_DRF_and_NB_test():
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))

    x_class = train.columns
    y_class = "species"
    x_class.remove(y_class)

    nfolds = 2

    nb_class = H2ONaiveBayesEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    )
    nb_class.train(x=x_class, y=y_class, training_frame=train)

    gbm_class = H2OGradientBoostingEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    )
    gbm_class.train(x=x_class, y=y_class, training_frame=train)

    drf_class = H2ORandomForestEstimator(
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True)
    drf_class.train(x=x_class, y=y_class, training_frame=train)

    se_class_0 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[nb_class, gbm_class, drf_class],
        metalearner_algorithm="gbm")
    se_class_0.train(x_class, y_class, train)

    assert se_class_0.metalearner().actual_params.get("distribution") == "multinomial", \
        "Expected distribution {} but got {}".format("multinomial",
                                                     se_class_0.metalearner().actual_params.get("distribution"))

    se_class_1 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[gbm_class, drf_class, nb_class],
        metalearner_algorithm="gbm")
    se_class_1.train(x_class, y_class, train)

    assert se_class_1.metalearner().actual_params.get("distribution") == "multinomial", \
        "Expected distribution {} but got {}".format("multinomial",
                                                     se_class_1.metalearner().actual_params.get("distribution"))

    se_class_2 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[drf_class, nb_class, gbm_class],
        metalearner_algorithm="gbm")
    se_class_2.train(x_class, y_class, train)

    assert se_class_2.metalearner().actual_params.get("distribution") == "multinomial", \
        "Expected distribution {} but got {}".format("multinomial",
                                                     se_class_2.metalearner().actual_params.get("distribution"))

    se_class_3 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[nb_class, gbm_class, drf_class])
    se_class_3.train(x_class, y_class, train)

    assert se_class_3.metalearner().actual_params.get("family") == "multinomial", \
        "Expected family {} but got {}".format("multinomial",
                                               se_class_3.metalearner().actual_params.get("family"))

    se_class_4 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[gbm_class, drf_class, nb_class])
    se_class_4.train(x_class, y_class, train)

    assert se_class_4.metalearner().actual_params.get("family") == "multinomial", \
        "Expected family {} but got {}".format("multinomial",
                                               se_class_4.metalearner().actual_params.get("family"))

    se_class_5 = H2OStackedEnsembleEstimator(
        training_frame=train,
        validation_frame=test,
        base_models=[drf_class, nb_class, gbm_class])
    se_class_5.train(x_class, y_class, train)

    assert se_class_5.metalearner().actual_params.get("family") == "multinomial", \
        "Expected family {} but got {}".format("multinomial",
                                               se_class_5.metalearner().actual_params.get("family"))


def stackedensemble_multinomial_test():
    """This test checks the following (for multinomial classification):
    1) That H2OStackedEnsembleEstimator executes w/o errors on a manually constructed 6-model ensemble.
    2) That .predict() works on a stack.
    3) That .model_performance() works on a stack.
    4) That the ensemble's test performance is at least as good as that of the best base learner.
    5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly.
    """

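    # The MNIST test file holds 10,000 rows; the first 5,000 become the training frame and the
    # remaining 5,000 are held out as a test frame.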
    df = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    y = "C785"
    x = list(range(784))
    df[y] = df[y].asfactor()
    train = df[0:5000, :]
    test = df[5000:10000, :]
    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 2

    # train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="multinomial",
        nfolds=nfolds,
        ntrees=10,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # evaluate the performance
    perf_gbm_train = my_gbm.model_performance()
    perf_gbm_test = my_gbm.model_performance(test_data=test)
    print("GBM training performance: ")
    print(perf_gbm_train)
    print("GBM test performance: ")
    print(perf_gbm_test)

    # train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_rf_train = my_rf.model_performance()
    perf_rf_test = my_rf.model_performance(test_data=test)
    print("RF training performance: ")
    print(perf_rf_train)
    print("RF test performance: ")
    print(perf_rf_test)

    # Train and cross-validate an XGBoost GBM
    my_xgb = H2OXGBoostEstimator(ntrees=10,
                                 nfolds=nfolds,
                                 fold_assignment="Modulo",
                                 keep_cross_validation_predictions=True,
                                 seed=1)
    my_xgb.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_xgb_train = my_xgb.model_performance()
    perf_xgb_test = my_xgb.model_performance(test_data=test)
    print("XGB training performance: ")
    print(perf_xgb_train)
    print("XGB test performance: ")
    print(perf_xgb_test)

    # Train and cross-validate a Naive Bayes model
    my_nb = H2ONaiveBayesEstimator(nfolds=nfolds,
                                   fold_assignment="Modulo",
                                   keep_cross_validation_predictions=True,
                                   seed=1)
    my_nb.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_nb_train = my_nb.model_performance()
    perf_nb_test = my_nb.model_performance(test_data=test)
    print("NB training performance: ")
    print(perf_nb_train)
    print("NB test performance: ")
    print(perf_nb_test)

    # Train and cross-validate a Deep Learning model
    my_dnn = H2ODeepLearningEstimator(hidden=[10, 10],
                                      nfolds=nfolds,
                                      fold_assignment="Modulo",
                                      keep_cross_validation_predictions=True,
                                      seed=1)
    my_dnn.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_dnn_train = my_dnn.model_performance()
    perf_dnn_test = my_dnn.model_performance(test_data=test)
    print("DNN training performance: ")
    print(perf_dnn_train)
    print("DNN test performance: ")
    print(perf_dnn_test)

    # Train and cross-validate a GLM model
    my_glm = H2OGeneralizedLinearEstimator(
        family="multinomial",
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1)
    my_glm.train(x=x, y=y, training_frame=train)

    # evaluate performance
    perf_glm_train = my_glm.model_performance()
    perf_glm_test = my_glm.model_performance(test_data=test)
    print("GLM training performance: ")
    print(perf_glm_train)
    print("GLM test performance: ")
    print(perf_glm_test)

    # Train a stacked ensemble using the GBM and GLM above
    stack = H2OStackedEnsembleEstimator(base_models=[
        my_gbm.model_id, my_rf.model_id, my_xgb.model_id, my_nb.model_id,
        my_dnn.model_id, my_glm.model_id
    ])
    stack.train(
        x=x, y=y, training_frame=train,
        validation_frame=test)  # also test that validation_frame is working
    assert isinstance(
        stack, h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator)
    assert stack.type == "classifier"

    # Check that prediction works
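    # With 10 digit classes the prediction frame has 11 columns: the predicted label plus one
    # probability column per class.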
    pred = stack.predict(test_data=test)
    print(pred)
    assert pred.nrow == test.nrow, "expected pred.nrow (" + str(
        pred.nrow) + ") to equal test.nrow (" + str(test.nrow) + ")"
    assert pred.ncol == 11, "expected pred.ncol to equal 11 but it was " + str(pred.ncol)

    # Evaluate ensemble performance
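    # model_performance() with no arguments reports training metrics, valid=True reports metrics
    # on the validation_frame passed to train(), and test_data=... scores a new frame.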
    perf_stack_train = stack.model_performance()
    assert isinstance(perf_stack_train,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)
    perf_stack_valid = stack.model_performance(valid=True)
    assert isinstance(perf_stack_valid,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)
    perf_stack_test = stack.model_performance(test_data=test)
    assert isinstance(perf_stack_test,
                      h2o.model.metrics_base.H2OMultinomialModelMetrics)

    # Check that stack perf is better (smaller) than the best (smaller) base learner perf:
    # Test Mean Per Class Error for each base learner
    baselearner_best_mean_per_class_error_test = min(
        perf_gbm_test.mean_per_class_error(),
        perf_rf_test.mean_per_class_error(),
        perf_xgb_test.mean_per_class_error(),
        perf_nb_test.mean_per_class_error(),
        perf_dnn_test.mean_per_class_error(),
        perf_glm_test.mean_per_class_error())
    stack_mean_per_class_error_test = perf_stack_test.mean_per_class_error()
    print("Best Base-learner Test Mean Per Class Error:  {0}".format(
        baselearner_best_mean_per_class_error_test))
    print("Ensemble Test Mean Per Class Error:  {0}".format(
        stack_mean_per_class_error_test))
    assert stack_mean_per_class_error_test <= baselearner_best_mean_per_class_error_test, \
        "expected stack_mean_per_class_error_test to be less than or equal to " \
        "baselearner_best_mean_per_class_error_test, but found " \
        "baselearner_best_mean_per_class_error_test = " + \
        str(baselearner_best_mean_per_class_error_test) + \
        ", stack_mean_per_class_error_test = " + \
        str(stack_mean_per_class_error_test)

    # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test)
    # Since the metrics objects are not identical, just check that the mean per class error is the same
    perf_stack_validation_frame = stack.model_performance(valid=True)
    assert stack_mean_per_class_error_test == perf_stack_validation_frame.mean_per_class_error(), \
        "expected stack_mean_per_class_error_test to be the same as " \
        "perf_stack_validation_frame.mean_per_class_error(), but found " \
        "perf_stack_validation_frame.mean_per_class_error() = " + \
        str(perf_stack_validation_frame.mean_per_class_error()) + \
        " and stack_mean_per_class_error_test = " + \
        str(stack_mean_per_class_error_test)
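

# The snippet above is shown without a test runner. A minimal pyunit-style entry point
# (an assumption, not part of the original example) would look like this, using the
# pyunit_utils.standalone_test helper that the other H2O tests rely on:
if __name__ == "__main__":
    pyunit_utils.standalone_test(stackedensemble_multinomial_test)
else:
    stackedensemble_multinomial_test()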