def h2o_H2OFrame_stats():
    """
    Python API test: h2o.frame.H2OFrame.max(), h2o.frame.H2OFrame.mean(), h2o.frame.H2OFrame.median(),
    and h2o.frame.H2OFrame.min().
    """
    row_num = randrange(1, 10)
    col_num = randrange(1, 10)
    python_lists = np.random.randint(-5, 5, (row_num, col_num))
    h2oframe = h2o.H2OFrame(python_obj=python_lists)
    assert abs(h2oframe.max() - np.ndarray.max(python_lists)) < 1e-12, "h2o.H2OFrame.max() command is not working."
    assert abs(h2oframe.min() - np.ndarray.min(python_lists)) < 1e-12, "h2o.H2OFrame.min() command is not working."

    h2oMean = h2oframe.mean(skipna=False, axis=0)
    assert_is_type(h2oMean, H2OFrame)
    numpmean = list(np.mean(python_lists, axis=0))
    h2omean = h2oMean.as_data_frame(use_pandas=True, header=False)
    assert pyunit_utils.equal_two_arrays(
        numpmean,
        h2omean.values.tolist()[0], 1e-12,
        1e-6), "h2o.H2OFrame.mean() command is not working."

    h2oMedian = h2oframe.median(na_rm=True)
    assert_is_type(h2oMedian, list)
    numpmedian = list(np.median(python_lists, axis=0))
    assert pyunit_utils.equal_two_arrays(
        numpmedian, h2oMedian, 1e-12,
        1e-6), "h2o.H2OFrame.median() command is not working."
def test_load_glrm():
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  g_model = H2OGeneralizedLowRankEstimator(k=3)
  g_model.train(x=irisH2O.names, training_frame=irisH2O)
  yarch_old = g_model.archetypes()
  x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
  predOld = g_model.predict(irisH2O)
  TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))

  try:
    TMPDIR = pyunit_utils.locate("results")    # find directory path to results folder
  except Exception:
    os.makedirs(TMPDIR)    # results folder does not exist yet; create it
  h2o.save_model(g_model, path=TMPDIR, force=True)       # save model
  full_path_filename = os.path.join(TMPDIR, g_model._id)

  h2o.remove(g_model)
  model_reloaded = h2o.load_model(full_path_filename)
  pred = model_reloaded.predict(irisH2O)
  yarch = model_reloaded.archetypes()
  x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

  # assert difference between old and new are close, archetypes should be the same
  pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
  pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
  for k in range(3):
    pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps = 1e-4, tolerance=1e-10)

  print("glrm model successfully loaded...")
def h2o_H2OFrame_categories():
    """
    Python API test: h2o.frame.H2OFrame.categories()
    """
    python_lists = np.random.randint(4, size=(10, 1))
    h2oframe = h2o.H2OFrame(python_obj=python_lists, column_types=['enum'])
    alllevels = h2oframe.categories()
    alllevels = [int(i) for i in alllevels]  # convert strings into integers for comparison
    truelevels = np.unique(python_lists).tolist()  # categorical levels calculated from Python
    assert alllevels == truelevels, "h2o.H2OFrame.categories() command is not working."
    pyunit_utils.equal_two_arrays(alllevels, truelevels, 1e-10, 0)
def test_modelselection_backward_binomial():
    predictor_elimination_order = ['C33', 'C24', 'C164', 'C66', 'C15']
    eliminated_p_values = [0.9711, 0.0694, 0.0388, 0.0127, 0.0009]
    tst_data = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    predictors = tst_data.columns[0:-1]
    response_col = 'response'
    weight = 'wt'
    tst_data['wt'] = 1
    tst_data[tst_data['response'] == 1, 'wt'] = 100
    tst_data['response'] = tst_data['response'].asfactor()

    min_predictor_num = 200 - len(predictor_elimination_order)
    model_backward = modelSelection(family='binomial',
                                    weights_column=weight,
                                    mode='backward',
                                    min_predictor_number=min_predictor_num)
    model_backward.train(training_frame=tst_data, x=predictors, y=response_col)
    # check predictor deletion order same as in predictor_elimination_order
    predictor_orders = model_backward._model_json['output'][
        'best_model_predictors']
    num_models = len(predictor_orders)
    counter = 0
    pred_ele = []
    pred_pvalue = []
    for ind in list(range(num_models - 1, 0, -1)):
        pred_large = model_backward._model_json["output"][
            "best_model_predictors"][ind]
        pred_small = model_backward._model_json["output"][
            "best_model_predictors"][ind - 1]
        predictor_removed = set(pred_large).symmetric_difference(
            pred_small).pop()
        pred_ele.append(predictor_removed)
        predictor_removed_index = model_backward._model_json["output"][
            "coefficient_names"][ind].index(predictor_removed)
        pred_pvalue.append(
            round(
                model_backward._model_json["output"]["coef_p_values"][ind]
                [predictor_removed_index], 4))
        counter += 1
        coefs = model_backward.coef(
            len(pred_large))  # check coefficients result correct length
        assert len(coefs) == len(pred_large), \
            "Expected coef length: {0}, Actual: {1}".format(len(pred_large), len(coefs))
    common_elimination = list(set(predictor_elimination_order) & set(pred_ele))
    assert len(common_elimination) == len(pred_ele)
    pyunit_utils.equal_two_arrays(pred_pvalue,
                                  eliminated_p_values,
                                  tolerance=1e-6)
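# Self-contained illustration of how the eliminated predictor is recovered in the
# loop above: consecutive best-model predictor sets differ by exactly one name,
# so their symmetric difference is a single-element set (names here are made up).
pred_large_demo = ["C33", "C24", "C15"]
pred_small_demo = ["C24", "C15"]
removed_demo = set(pred_large_demo).symmetric_difference(pred_small_demo).pop()
assert removed_demo == "C33"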
def test_HGLM_R():
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/semiconductor.csv"))
    y = "y"
    x = ["x1", "x3", "x5", "x6"]
    z = [0]
    tot = 1e-4
    h2o_data[0] = h2o_data[0].asfactor()
    start_vals = [
        0.001929687, 0.002817188, -0.001707812, -0.003889062, 0.010685937, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0.1
    ]
    h2o_glm = H2OGeneralizedLinearEstimator(HGLM=True,
                                            family="gaussian",
                                            rand_family=["gaussian"],
                                            random_columns=z,
                                            calc_like=True)
    h2o_glm.train(x=x, y=y, training_frame=h2o_data)
    modelMetrics = h2o_glm.training_model_metrics()

    h2o_glm_start_val = H2OGeneralizedLinearEstimator(HGLM=True,
                                                      family="gaussian",
                                                      rand_family=["gaussian"],
                                                      random_columns=z,
                                                      calc_like=True,
                                                      startval=start_vals)
    h2o_glm_start_val.train(x=x, y=y, training_frame=h2o_data)
    modelMetricsSV = h2o_glm_start_val.training_model_metrics()

    # compare model metrics from both models and they should be the same
    metricsNames = [
        "hlik", "pvh", "dfrefe", "varfix", "pbvh", "convergence", "caic",
        "sumetadiffsquare"
    ]
    metricsNamesArrays = [
        "summvc1",
        "sefe",
        "varranef",
        "ranef",
        "sere",
        "fixef",
    ]

    for ind in range(len(metricsNames)):
        assert abs(modelMetrics[metricsNames[ind]]-modelMetricsSV[metricsNames[ind]]) < tot, "expected {0}: {1}, " \
                                                                                             "actual {0}: {2}".format(metricsNames[ind], modelMetrics[metricsNames[ind]], modelMetricsSV[metricsNames[ind]])
    for ind in range(len(metricsNamesArrays)):
        pyunit_utils.equal_two_arrays(modelMetrics[metricsNamesArrays[ind]],
                                      modelMetricsSV[metricsNamesArrays[ind]],
                                      1e-10, tot)
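# A plausible pure-Python sketch of the check pyunit_utils.equal_two_arrays
# performs in the loops above (an assumption for illustration, not the actual
# pyunit_utils source): values are compared elementwise within a tolerance.
def equal_two_arrays_sketch(a, b, eps=1e-6, tolerance=1e-6, throw_error=True):
    if len(a) != len(b):
        if throw_error:
            raise AssertionError("arrays differ in length")
        return False
    for x, y in zip(a, b):
        if abs(x) < eps and abs(y) < eps:
            continue  # both values effectively zero; skip the comparison
        if abs(x - y) > tolerance:
            if throw_error:
                raise AssertionError("arrays differ: {0} vs {1}".format(x, y))
            return False
    return True

assert equal_two_arrays_sketch([1.0, 2.0], [1.0, 2.0 + 1e-8])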
def glrm_grid_user_y():
    export_dir = tempfile.mkdtemp()
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y_data = np.random.rand(10, 100)
    initial_y_h2o = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    params = {
        "k": 10,
        "init": "User",
        "user_y": initial_y_h2o,
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative"
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }
    
    # train grid
    grid = H2OGridSearch(
        H2OGeneralizedLowRankEstimator,
        hyper_params=hyper_params
    )
    grid.train(x=train.names, training_frame=train, **params)
    print("first grid")
    print(grid)
    assert len(grid.model_ids) == 2
    archetypes1 = grid.models[0].archetypes()
    archetypes2 = grid.models[1].archetypes()
    grid_path = h2o.save_grid(export_dir, grid.grid_id)
    h2o.remove_all()
    
    # reimport and train some more
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    grid = h2o.load_grid(grid_path)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print("second grid")
    print(grid)
    assert len(grid.model_ids) == 4
    # the reloaded models should reproduce the archetypes saved before export
    assert grid.models[0].archetypes() == archetypes1
    assert grid.models[1].archetypes() == archetypes2
    # instead of a full comparison, just check that the first archetype rows of the newly trained models differ
    assert not(pyunit_utils.equal_two_arrays(grid.models[1].archetypes()[0], grid.models[2].archetypes()[0], throw_error=False))
    assert not(pyunit_utils.equal_two_arrays(grid.models[2].archetypes()[0], grid.models[3].archetypes()[0], throw_error=False))
def partial_plot_row_index():
    data = h2o.import_file(
        pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50,
                                             learn_rate=0.05,
                                             seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # Generate Partial Dependence for row index -1 and row index 0, they should differ
    pdp = gbm_model.partial_plot(data=data,
                                 cols=['RACE'],
                                 plot=False,
                                 plot_stddev=False,
                                 row_index=-1)
    pdp0 = gbm_model.partial_plot(data=data,
                                  cols=['RACE'],
                                  plot=False,
                                  plot_stddev=False,
                                  row_index=0)
    assert not (pyunit_utils.equal_two_arrays(
        pdp[0][1], pdp0[0][1], throw_error=False))
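# Note (based on the h2o partial_plot documentation): row_index=-1 requests the
# partial dependence averaged over the whole frame, while row_index=0 computes the
# individual conditional expectation for the first row only, which is why the two
# mean-response arrays compared above are expected to differ.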
def h2o_H2OFrame_categories():
    """
    Python API test: h2o.frame.H2OFrame.categories()
    """
    python_lists = np.random.randint(4, size=(10,1))
    h2oframe = h2o.H2OFrame(python_obj=python_lists, column_types=['enum'])
    alllevels = h2oframe.categories()
    alllevels = [int(i) for i in alllevels]     # convert string into integers for comparison
    truelevels = np.unique(python_lists).tolist()   # categorical levels calculated from Python
    assert alllevels==truelevels, "h2o.H2OFrame.categories() command is not working."
    assert pyunit_utils.equal_two_arrays(alllevels, truelevels, 1e-10, 0), "h2o.H2OFrame.categories() command is" \
                                                                           " not working."
def compare_weightedStats(model,
                          datafile,
                          xlist,
                          xname,
                          weightV,
                          pdpTDTable,
                          tol=1e-6):
    weightStat = manual_partial_dependence(
        model, datafile, xlist, xname,
        weightV)  # calculate theoretical weighted stats
    wMean = pyunit_utils.extract_col_value_H2OTwoDimTable(
        pdpTDTable, "mean_response")  # stats for age predictor
    wStd = pyunit_utils.extract_col_value_H2OTwoDimTable(
        pdpTDTable, "stddev_response")
    wStdErr = pyunit_utils.extract_col_value_H2OTwoDimTable(
        pdpTDTable, "std_error_mean_response")
    pyunit_utils.equal_two_arrays(weightStat[0],
                                  wMean,
                                  tol,
                                  tol,
                                  throw_error=True)
    pyunit_utils.equal_two_arrays(weightStat[1],
                                  wStd,
                                  tol,
                                  tol,
                                  throw_error=True)
    pyunit_utils.equal_two_arrays(weightStat[2],
                                  wStdErr,
                                  tol,
                                  tol,
                                  throw_error=True)
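# Hedged usage note for the helper above: in h2o, model.partial_plot(..., plot=False)
# returns a list of H2OTwoDimTables (one per requested column) whose
# "mean_response", "stddev_response" and "std_error_mean_response" columns are
# what compare_weightedStats expects in its pdpTDTable argument.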
def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..",
                     "results"))

    try:
        TMPDIR = pyunit_utils.locate(
            "results")  # find directory path to results folder
    except Exception:
        os.makedirs(TMPDIR)  # results folder does not exist yet; create it
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(
        model_reloaded._model_json["output"]["representation_name"])

    # assert difference between old and new are close, archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k],
                                      yarch[k],
                                      eps=1e-4,
                                      tolerance=1e-10)

    print("glrm model successfully loaded...")
def h2o_H2OFrame_stats():
    """
    Python API test: h2o.frame.H2OFrame.max(), h2o.frame.H2OFrame.mean(), h2o.frame.H2OFrame.median(),
    and h2o.frame.H2OFrame.min().
    """
    row_num = randrange(1,10)
    col_num = randrange(1,10)
    python_lists = np.random.randint(-5,5, (row_num, col_num))
    h2oframe = h2o.H2OFrame(python_obj=python_lists)
    assert abs(h2oframe.max()-np.ndarray.max(python_lists)) < 1e-12, "h2o.H2OFrame.max() command is not working."
    assert abs(h2oframe.min()-np.ndarray.min(python_lists)) < 1e-12, "h2o.H2OFrame.min() command is not working."

    h2oMean = h2oframe.mean(skipna=False, axis=0)
    assert_is_type(h2oMean, H2OFrame)
    numpmean = list(np.mean(python_lists, axis=0))
    h2omean = h2oMean.as_data_frame(use_pandas=True, header=False)
    assert pyunit_utils.equal_two_arrays(numpmean, h2omean.values.tolist()[0], 1e-12, 1e-6), "h2o.H2OFrame.mean() command is not working."

    h2oMedian = h2oframe.median(na_rm=True)
    assert_is_type(h2oMedian, list)
    numpmedian = list(np.median(python_lists, axis=0))
    assert pyunit_utils.equal_two_arrays(numpmedian, h2oMedian, 1e-12, 1e-6), "h2o.H2OFrame.median() command is not working."
def compare_weightedStats(model, datafile, xlist, xname, weightV, pdpTDTable, tol=1e-6):
    weightStat = manual_partial_dependence(model, datafile, xlist, xname, weightV)  # calculate theoretical weighted stats
    wMean = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, "mean_response") # stats for age predictor
    wStd = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, "stddev_response")
    wStdErr = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, "std_error_mean_response")
    pyunit_utils.equal_two_arrays(weightStat[0], wMean, tol, tol, throw_error=True)
    pyunit_utils.equal_two_arrays(weightStat[1], wStd, tol, tol, throw_error=True)
    pyunit_utils.equal_two_arrays(weightStat[2], wStdErr, tol, tol, throw_error=True)
def test_maxrglm_cross_validation():

    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11",
        "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20"
    ]
    factorX = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factorX:
        d[x] = d[x].asfactor()
    n_folds = 3
    fold_numbers = d.modulo_kfold_column(n_folds=n_folds)
    fold_numbers.set_names(["fold_numbers_modulo"])
    fold_numbers2 = d.kfold_column(n_folds=n_folds, seed=12345)
    fold_numbers2.set_names(["fold_numbers_kfold"])

    # append the fold columns to the dataset
    d = d.cbind(fold_numbers)
    d = d.cbind(fold_numbers2)

    # cv model with fold assignment
    maxrglm_model_fa = maxrglm(seed=12345,
                               max_predictor_number=3,
                               fold_column="fold_numbers_modulo")
    maxrglm_model_fa.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_fa = maxrglm_model_fa.get_best_R2_values()

    maxrglm_model_fk = maxrglm(seed=12345,
                               max_predictor_number=3,
                               fold_column="fold_numbers_kfold")
    maxrglm_model_fk.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_fk = maxrglm_model_fk.get_best_R2_values()

    # both models should provide same best R2 values
    pyunit_utils.equal_two_arrays(best_r2_value_fa, best_r2_value_fk, eps=1e-6)
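# Hedged sketch of what modulo fold assignment is assumed to do (based on the
# method name): row i lands in fold i % n_folds, giving deterministic folds.
n_folds_demo = 3
fold_assignment_demo = [i % n_folds_demo for i in range(7)]
assert fold_assignment_demo == [0, 1, 2, 0, 1, 2, 0]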
def h2o_H2OFrame_sum():
    """
    Python API test: h2o.frame.H2OFrame.sum(skipna=True, axis=0, **kwargs)
    """
    row_num = randrange(1,10)
    col_num = randrange(1,10)
    python_lists = np.random.randint(-5,5, (row_num, col_num))
    h2oframe = h2o.H2OFrame(python_obj=python_lists)

    # axis = 0
    h2oSum = h2oframe.sum(skipna=False, axis=0)
    assert_is_type(h2oSum, H2OFrame)
    numpsum = list(np.sum(python_lists, axis=0))
    h2osum = h2oSum.as_data_frame(use_pandas=True, header=False)
    assert pyunit_utils.equal_two_arrays(numpsum, h2osum.values.tolist()[0], 1e-12, 1e-6), "h2o.H2OFrame.sum()" \
                                                                                           " command is not working."

    # axis = 1
    h2oSum = h2oframe.sum(skipna=False, axis=1)
    assert_is_type(h2oSum, H2OFrame)
    numpsum = list(np.sum(python_lists, axis=1))
    h2osum = h2oSum.as_data_frame(use_pandas=True, header=False)
    assert pyunit_utils.equal_two_arrays(numpsum, h2osum.values.T.tolist()[0], 1e-12, 1e-6), "h2o.H2OFrame.sum()" \
                                                                                             " command is not working."
def test_modelselection_cross_validation():

    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11",
        "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20"
    ]
    factorX = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factorX:
        d[x] = d[x].asfactor()
    n_folds = 3

    maxr_model_r = modelSelection(seed=12345,
                                  max_predictor_number=3,
                                  nfolds=n_folds,
                                  fold_assignment="random",
                                  mode="maxr")
    maxr_model_r.train(training_frame=d, x=my_x, y=my_y)
    best_r2_maxr_r = maxr_model_r.get_best_R2_values()

    maxrglm_model_a = modelSelection(seed=12345,
                                     max_predictor_number=3,
                                     nfolds=n_folds,
                                     fold_assignment="auto",
                                     mode="maxr")
    maxrglm_model_a.train(training_frame=d, x=my_x, y=my_y)
    best_r2_maxr_a = maxrglm_model_a.get_best_R2_values()

    # both models should provide same best R2 values
    pyunit_utils.equal_two_arrays(best_r2_maxr_r, best_r2_maxr_a, eps=1e-6)

    allsubsets_model_r = modelSelection(seed=12345,
                                        max_predictor_number=3,
                                        nfolds=n_folds,
                                        fold_assignment="random",
                                        mode="allsubsets")
    allsubsets_model_r.train(training_frame=d, x=my_x, y=my_y)
    best_r2_allsubsets_r = allsubsets_model_r.get_best_R2_values()
    pyunit_utils.equal_two_arrays(
        best_r2_allsubsets_r, best_r2_maxr_r,
        eps=1e-6)  # maxr and allsubsets best R2 values should be equal

    allsubsets_model_a = modelSelection(seed=12345,
                                        max_predictor_number=3,
                                        nfolds=n_folds,
                                        fold_assignment="auto",
                                        mode="allsubsets")
    allsubsets_model_a.train(training_frame=d, x=my_x, y=my_y)
    best_r2_allsubsets_a = allsubsets_model_a.get_best_R2_values()
    pyunit_utils.equal_two_arrays(
        best_r2_allsubsets_a, best_r2_maxr_a,
        eps=1e-6)  # maxr and allsubsets best R2 values should be equal
def glrm_pubdev_3756_arrest():
    print("Importing prostate.csv data...")

    # binary columns in this frame are read in as enums.  Let's see if it runs.
    prostateF = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    prostateF_num = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    prostateF_num[0] = prostateF_num[0].asnumeric()
    prostateF_num[4] = prostateF_num[4].asnumeric()

    loss_all = ["Hinge", "Quadratic", "Categorical", "Categorical", "Hinge", "Quadratic", "Quadratic", "Quadratic"]

    print("check with init = plusplus")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                              seed=12345)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF)
    glrm_h2o.show()

    # exercise binary (hinge) loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                                  seed=12345)
    glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num)
    glrm_h2o_num.show()

    print("check with init = random")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                              seed=12345, init="random")
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF)
    glrm_h2o.show()

    # exercise binary (hinge) loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                                  seed=12345, init="random")
    glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num)
    glrm_h2o_num.show()

    print("check with init = SVD")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                              seed=12345, init="SVD")
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF)
    glrm_h2o.show()

    # exercise binary (hinge) loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                                  seed=12345, init="SVD")
    glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num)
    glrm_h2o_num.show()

    print("check with init = user")
    initial_y = [[-1.27675647831893E-15,64.87421383647799,2.0,1.0,2.0816681711721685E-16,8.533270440251574,
                9.380440251572328,5.886792452830188],
               [0.7297297297297298,66.05405405405405,2.0,0.0,1.0,23.270270270270274,9.589189189189193,7.27027027027027],
               [0.01754385964912314,70.35087719298245,2.0,1.0,-1.3877787807814457E-17,10.078947368421053,
                42.37543859649123,6.157894736842105],
               [0.9,65.95,2.0,0.0,0.2,81.94500000000001,16.375,7.4],
               [0.9999999999999989,65.48598130841121,2.0,3.0,1.3877787807814457E-16,13.3092523364486,
                13.268411214953275,6.747663551401869]]
    initial_y_h2o = h2o.H2OFrame(list(initial_y))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                              seed=12345, init="User", user_y=initial_y_h2o)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF)
    glrm_h2o.show()

    # exercise binary (hinge) loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                                  seed=12345, init="User", user_y=initial_y_h2o)
    glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num)
    glrm_h2o_num.show()

    # singular values from glrm models should equal if binary columns with binary loss are read in as either
    # categorical or numerics.  If not, something is wrong.
    assert pyunit_utils.equal_two_arrays(glrm_h2o._model_json["output"]["singular_vals"],
                                         glrm_h2o_num._model_json["output"]["singular_vals"], 1e-6, 1e-4), \
        "Singular values obtained from binary loss with column type as enum and numeric do not agree.  Fix it now."

    sys.stdout.flush()
def random_grid_model_seeds_PUBDEV_4090():
    '''
    This test is written to verify that I have implemented the model seed determination properly when
    random grid search is enabled.  Basically, there are four cases:
    1. Neither the model nor the search_criteria sets the seed value.  The seed values are either not set or
      set to default values.  In this case, random grid search and the models each use their own random seeds,
      independent of each other;
    2. Both the model and search_criteria set their seeds to non-default values.  Random grid search will use
      the seed set in search_criteria and models will be built using the seed set in the model parameter;
    3. The search_criteria seed is set to a non-default value while the model parameter seed is the default value.
      Random grid search will use the search_criteria seed while the models will be built using the following
      sequence of seeds:
      - model 0: search_criteria seed;
      - model 1: search_criteria seed+1;
      - ...
      - model n: search_criteria seed+n;
    4. The model parameter seed is set but the search seed is left at its default.  In this case, gridsearch will
      use a random seed while all models are built using the one model seed.

    Current code already supports cases 1/2/4.  The code changes were made to enable case 3, and that is the
    case tested here.
    '''
    air_hex = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
                              destination_frame="air.hex")
    myX = ["Year","Month","CRSDepTime","UniqueCarrier","Origin","Dest"]
    grid_max_models = 8
    # create hyperparameter and search criteria lists (ranges are inclusive..exclusive)
    hyper_params_tune = {'max_depth' : list(range(1,10+1,1)),
                         'sample_rate': [x/100. for x in range(20,101)],
                         'col_sample_rate' : [x/100. for x in range(20,101)],
                         'col_sample_rate_per_tree': [x/100. for x in range(20,101)],
                         'col_sample_rate_change_per_level': [x/100. for x in range(90,111)],
                         'min_rows': [2**x for x in range(0,int(math.log(air_hex.nrow,2)-1)+1)],
                         'nbins': [2**x for x in range(4,11)],
                         'nbins_cats': [2**x for x in range(4,13)],
                         'min_split_improvement': [0,1e-8,1e-6,1e-4],
                         'histogram_type': ["UniformAdaptive","QuantilesGlobal","RoundRobin"]}


    search_criteria_tune1 = {'strategy': "RandomDiscrete",
                             'max_models': grid_max_models ,   # limit the runtime
                             'seed' : 1234,
                             }
    search_criteria_tune2 = {'strategy': "RandomDiscrete",
                             'max_models': grid_max_models ,   # limit the runtime
                             'seed' : 1234,
                             }

    # case 3, search criteria seed is set but model parameter seed is not:
    air_grid1 = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_params_tune,
                              search_criteria=search_criteria_tune1)
    air_grid2 = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_params_tune,
                              search_criteria=search_criteria_tune2)
    air_grid1.train(x=myX, y="IsDepDelayed", training_frame=air_hex, distribution="bernoulli")
    air_grid2.train(x=myX, y="IsDepDelayed", training_frame=air_hex, distribution="bernoulli")

    # expect both models to render the same metrics as they use the same model seed, search criteria seed
    model_seeds1 = pyunit_utils.model_seed_sorted(air_grid1)
    model_seeds2 = pyunit_utils.model_seed_sorted(air_grid2)
    # check model seeds are set as gridseed+model number where model number = 0, 1, ..., ...
    model_len = min(len(air_grid1), len(air_grid2))

    model1Seeds = ','.join(str(x) for x in model_seeds1[0:model_len])
    model2Seeds = ','.join(str(x) for x in model_seeds2[0:model_len])
    assert model1Seeds==model2Seeds, "Model seeds are not equal: gridsearch 1 seeds %s;" \
                                     " gridsearch 2 seeds %s" % (model1Seeds, model2Seeds)

    # compare training_rmse from scoring history
    model1seed = air_grid1.models[0].full_parameters['seed']['actual_value']
    index2 = 0  # find the model in grid2 with the same seed
    for ind in range(0, len(air_grid2.models)):
        if air_grid2.models[ind].full_parameters['seed']['actual_value']==model1seed:
            index2=ind
            break

    metric_list1 = pyunit_utils.extract_scoring_history_field(air_grid1.models[0], "training_rmse", False)
    metric_list2 = pyunit_utils.extract_scoring_history_field(air_grid2.models[index2], "training_rmse", False)
    print(metric_list1)
    print(metric_list2)

    assert pyunit_utils.equal_two_arrays(metric_list1, metric_list2, 1e-5, 1e-6, False), \
                "Training_rmse values are different between the two grid search models.  Tests are supposed to be " \
                "repeatable in this case.  Make sure model seeds are actually set correctly in the Java backend."
def random_grid_model_seeds_PUBDEV_4090():
    '''
    This test is written to verify that I have implemented the model seed determination properly when
    random grid search is enabled.  Basically, there are four cases:
    1. Neither the model nor the search_criteria sets the seed value.  The seed values are either not set or
      set to default values.  In this case, random grid search and the models each use their own random seeds,
      independent of each other;
    2. Both the model and search_criteria set their seeds to non-default values.  Random grid search will use
      the seed set in search_criteria and models will be built using the seed set in the model parameter;
    3. The search_criteria seed is set to a non-default value while the model parameter seed is the default value.
      Random grid search will use the search_criteria seed while the models will be built using the following
      sequence of seeds:
      - model 0: search_criteria seed;
      - model 1: search_criteria seed+1;
      - ...
      - model n: search_criteria seed+n;
    4. The model parameter seed is set but the search seed is left at its default.  In this case, gridsearch will
      use a random seed while all models are built using the one model seed.

    Current code already supports cases 1/2/4.  The code changes were made to enable case 3, and that is the
    case tested here.
    '''
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")
    myX = ["Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin", "Dest"]
    grid_max_runtime_secs = 20
    # create hyperparameter and search criteria lists (ranges are inclusive..exclusive)
    hyper_params_tune = {
        'max_depth':
        list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows':
        [2**x for x in range(0,
                             int(math.log(air_hex.nrow, 2) - 1) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }

    search_criteria_tune1 = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': grid_max_runtime_secs,  # limit the runtime
        'seed': 1234,
    }
    search_criteria_tune2 = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': grid_max_runtime_secs,  # limit the runtime
        'seed': 1234,
    }

    # case 3, search criteria seed is set but model parameter seed is not:
    air_grid1 = H2OGridSearch(H2OGradientBoostingEstimator,
                              hyper_params=hyper_params_tune,
                              search_criteria=search_criteria_tune1)
    air_grid2 = H2OGridSearch(H2OGradientBoostingEstimator,
                              hyper_params=hyper_params_tune,
                              search_criteria=search_criteria_tune2)
    air_grid1.train(x=myX,
                    y="IsDepDelayed",
                    training_frame=air_hex,
                    distribution="bernoulli")
    air_grid2.train(x=myX,
                    y="IsDepDelayed",
                    training_frame=air_hex,
                    distribution="bernoulli")

    # expect both models to render the same metrics as they use the same model seed, search criteria seed
    model_seeds1 = pyunit_utils.model_seed_sorted_by_time(air_grid1)
    model_seeds2 = pyunit_utils.model_seed_sorted_by_time(air_grid2)
    # check model seeds are set as gridseed+model number where model number = 0, 1, ..., ...
    model_len = min(len(air_grid1), len(air_grid2))
    correct_model_seeds = list(
        range(search_criteria_tune1["seed"],
              search_criteria_tune1["seed"] + model_len))

    expectedSeeds = ','.join(str(x) for x in correct_model_seeds)
    model1Seeds = ','.join(str(x) for x in model_seeds1[0:model_len])
    model2Seeds = ','.join(str(x) for x in model_seeds2[0:model_len])
    assert model_seeds1[0:model_len]==correct_model_seeds, "Model seeds are not set correctly: expected %s; " \
                                                           "got %s" % (expectedSeeds, model1Seeds)
    assert model_seeds2[0:model_len]==correct_model_seeds, "Model seeds are not set correctly: expected %s; " \
                                                           "got %s" % (expectedSeeds, model2Seeds)

    # compare training_rmse from scoring history
    metric_list1 = pyunit_utils.extract_scoring_history_field(
        air_grid1.models[0], "training_rmse")
    metric_list2 = pyunit_utils.extract_scoring_history_field(
        air_grid2.models[0], "training_rmse")
    assert pyunit_utils.equal_two_arrays(metric_list1[0:model_len], metric_list2[0:model_len], 1e-5, 1e-6, False), \
        "Training_rmse values are different between the two grid search models.  Tests are supposed to be repeatable " \
        "in this case.  Make sure model seeds are actually set correctly in the Java backend."
def group_by_all():
    """
    I am testing the groupby median function in PUBDEV_4727.
    """

    # generate random dataset with factor column and real columns
    row_num_max = 100000  # 100000
    row_num_min = 100
    enumLow = 5
    enumHigh = 30
    enumVals = randint(enumLow, enumHigh)  # total number of groupby class
    pIndex = []
    pNum = []
    numpMedian = []   # python medians per groupby class for the first numeric column
    numpMean = []     # python means per groupby class for the first numeric column
    numpMedian2 = []  # same, for the second numeric column
    numpMean2 = []
    tot = 1e-10
    colFac = 1.1
    for index in range(enumVals):
        rowNum = randint(row_num_min, row_num_max)
        indexList = [index] * rowNum
        numList = np.random.rand(rowNum, 1)

        numpMedian.append(list(np.median(numList, axis=0))[0])
        numpMean.append(list(np.mean(numList, axis=0))[0])
        numpMedian2.append(list(np.median(numList * colFac, axis=0))[0])
        numpMean2.append(list(np.mean(numList * colFac, axis=0))[0])

        pIndex.extend(indexList)
        pNum.extend(numList)

    # generate random H2OFrame
    newOrder = np.random.permutation(len(pIndex))
    python_lists = []
    for index in range(len(pIndex)):
        temp = [
            pIndex[newOrder[index]], pNum[newOrder[index]][0],
            pNum[newOrder[index]][0] * colFac
        ]
        python_lists.append(temp)
    h2oframe = h2o.H2OFrame(python_obj=python_lists,
                            column_types=["enum", "real", "real"],
                            column_names=["factors", "numerics", "numerics2"])

    # generate h2o groupby medians and other groupby functions
    groupedMedianF = h2oframe.group_by(["factors"]).median(na='rm').mean(na='all').sum(na="all").count(na="rm").get_frame()

    groupbyMedian = [0] * len(numpMedian)    # extract groupby medians to compare with python medians
    groupbyMean = [0] * len(numpMean)
    groupbyMedian2 = [0] * len(numpMedian2)
    groupbyMean2 = [0] * len(numpMean2)
    for rowIndex in range(enumVals):
        group = int(groupedMedianF[rowIndex, 0])
        groupbyMedian[group] = groupedMedianF[rowIndex, 'median_numerics']
        groupbyMean[group] = groupedMedianF[rowIndex, 'mean_numerics']
        groupbyMedian2[group] = groupedMedianF[rowIndex, 'median_numerics2']
        groupbyMean2[group] = groupedMedianF[rowIndex, 'mean_numerics2']

    # print out groupby/numpy median and mean
    print(groupedMedianF.as_data_frame(use_pandas=True, header=False))
    print("H2O Groupby median is for numerics {0}".format(groupbyMedian))
    print("Numpy median is numerics {0}".format(numpMedian))
    print("H2O Groupby median is for numerics {0}".format(groupbyMedian))
    print("Numpy median is numerics {0}".format(numpMedian))
    print("H2O Groupby mean is for numerics2 {0}".format(groupbyMean2))
    print("Numpy mean is numerics2 {0}".format(numpMean2))
    print("H2O Groupby mean is for numerics2 {0}".format(groupbyMean2))
    print("Numpy mean is numerics2 {0}".format(numpMean2))

    # compare the h2o groupby medians, means with numpy medians and means.
    assert pyunit_utils.equal_two_arrays(groupbyMedian, numpMedian, tot, tot), "H2O groupby median and numpy " \
                                                                               "median differ."
    assert pyunit_utils.equal_two_arrays(groupbyMean, numpMean, tot, tot), "H2O groupby mean and numpy " \
                                                                           "mean differ."
    assert pyunit_utils.equal_two_arrays(groupbyMedian2, numpMedian2, tot, tot), "H2O groupby median and numpy " \
                                                                                 "median differ."
    assert pyunit_utils.equal_two_arrays(groupbyMean2, numpMean2, tot, tot), "H2O groupby mean and numpy " \
                                                                             "mean differ."
def group_by_all():
    """
    I am testing the groupby median function in PUBDEV_4727.
    """

    # generate random dataset with factor column and real columns
    row_num_max = 100000 # 100000
    row_num_min = 100
    enumLow = 5
    enumHigh = 30
    enumVals = randint(enumLow, enumHigh)   # total number of groupby class
    pIndex = []
    pNum = []
    numpMedian = []   # python medians per groupby class for the first numeric column
    numpMean = []     # python means per groupby class for the first numeric column
    numpMedian2 = []  # same, for the second numeric column
    numpMean2 = []
    tot = 1e-10
    colFac = 1.1
    for index in range(enumVals):
        rowNum = randint(row_num_min, row_num_max)
        indexList = [index]*rowNum
        numList = np.random.rand(rowNum,1)

        numpMedian.append(list(np.median(numList, axis=0))[0])
        numpMean.append(list(np.mean(numList, axis=0))[0])
        numpMedian2.append(list(np.median(numList*colFac, axis=0))[0])
        numpMean2.append(list(np.mean(numList*colFac, axis=0))[0])

        pIndex.extend(indexList)
        pNum.extend(numList)

    # generate random H2OFrame
    newOrder = np.random.permutation(len(pIndex))
    python_lists = []
    for index in range(len(pIndex)):
        temp = [pIndex[newOrder[index]], pNum[newOrder[index]][0], pNum[newOrder[index]][0]*colFac]
        python_lists.append(temp)
    h2oframe= h2o.H2OFrame(python_obj=python_lists, column_types=["enum","real","real"], column_names=["factors", "numerics", "numerics2"])

    # generate h2o groupby medians and other groupby functions
    groupedMedianF = h2oframe.group_by(["factors"]).median(na='rm').mean(na='all').sum(na="all").count(na="rm").get_frame()

    groupbyMedian = [0]*len(numpMedian) # extract groupby median to compare with python median
    groupbyMean = [0]*len(numpMean)
    groupbyMedian2 = [0]*len(numpMedian2) # extract groupby median to compare with python median
    groupbyMean2 = [0]*len(numpMean2)
    for rowIndex in range(enumVals):
        groupbyMedian[int(groupedMedianF[rowIndex,0])] = groupedMedianF[rowIndex,'median_numerics']
        groupbyMean[int(groupedMedianF[rowIndex,0])] = groupedMedianF[rowIndex,'mean_numerics']
        groupbyMedian2[int(groupedMedianF[rowIndex,0])] = groupedMedianF[rowIndex,'median_numerics2']
        groupbyMean2[int(groupedMedianF[rowIndex,0])] = groupedMedianF[rowIndex,'mean_numerics2']

    # print out groupby/numpy median and mean
    print(groupedMedianF.as_data_frame(use_pandas=True, header=False))
    print("H2O Groupby median is for numerics {0}".format(groupbyMedian))
    print("Numpy median is numerics {0}".format(numpMedian))
    print("H2O Groupby median is for numerics {0}".format(groupbyMedian))
    print("Numpy median is numerics {0}".format(numpMedian))
    print("H2O Groupby mean is for numerics2 {0}".format(groupbyMean2))
    print("Numpy mean is numerics2 {0}".format(numpMean2))
    print("H2O Groupby mean is for numerics2 {0}".format(groupbyMean2))
    print("Numpy mean is numerics2 {0}".format(numpMean2))

    # compare the h2o groupby medians, means with numpy medians and means.
    assert pyunit_utils.equal_two_arrays(groupbyMedian, numpMedian, tot, tot), "H2O groupby median and numpy " \
                                                                               "median differ."
    assert pyunit_utils.equal_two_arrays(groupbyMean, numpMean, tot, tot), "H2O groupby mean and numpy " \
                                                                           "mean differ."
    assert pyunit_utils.equal_two_arrays(groupbyMedian2, numpMedian2, tot, tot), "H2O groupby median and numpy " \
                                                                                 "median differ."
    assert pyunit_utils.equal_two_arrays(groupbyMean2, numpMean2, tot, tot), "H2O groupby mean and numpy " \
                                                                             "mean differ."
def glrm_pubdev_3728_arrest():
    print("Importing prostate.csv data...")

    # binary columns in this frame are read in as enums.  Let's see if it runs.
    prostateF = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    prostateF_num = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    prostateF_num[0] = prostateF_num[0].asnumeric()
    prostateF_num[4] = prostateF_num[4].asnumeric()

    loss_all = [
        "Logistic", "Quadratic", "Categorical", "Categorical", "Logistic",
        "Quadratic", "Quadratic", "Quadratic"
    ]

    print("check with init = plusplus")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5,
                                              loss_by_col=loss_all,
                                              recover_svd=True,
                                              transform="STANDARDIZE",
                                              seed=12345)
    glrm_h2o.train(x=prostateF.names,
                   training_frame=prostateF,
                   validation_frame=prostateF)
    glrm_h2o.show()

    # exercise logistic loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5,
                                                  loss_by_col=loss_all,
                                                  recover_svd=True,
                                                  transform="STANDARDIZE",
                                                  seed=12345)
    glrm_h2o_num.train(x=prostateF_num.names,
                       training_frame=prostateF_num,
                       validation_frame=prostateF_num)
    glrm_h2o_num.show()

    print("check with init = random")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5,
                                              loss_by_col=loss_all,
                                              recover_svd=True,
                                              transform="DEMEAN",
                                              seed=12345,
                                              init="random")
    glrm_h2o.train(x=prostateF.names,
                   training_frame=prostateF,
                   validation_frame=prostateF)
    glrm_h2o.show()

    # exercise logistic loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5,
                                                  loss_by_col=loss_all,
                                                  recover_svd=True,
                                                  transform="DEMEAN",
                                                  seed=12345,
                                                  init="random")
    glrm_h2o_num.train(x=prostateF_num.names,
                       training_frame=prostateF_num,
                       validation_frame=prostateF_num)
    glrm_h2o_num.show()

    print("check with init = SVD")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5,
                                              loss_by_col=loss_all,
                                              recover_svd=True,
                                              seed=12345,
                                              init="SVD")
    glrm_h2o.train(x=prostateF.names,
                   training_frame=prostateF,
                   validation_frame=prostateF)
    glrm_h2o.show()

    # exercise logistic loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5,
                                                  loss_by_col=loss_all,
                                                  recover_svd=True,
                                                  seed=12345,
                                                  init="SVD")
    glrm_h2o_num.train(x=prostateF_num.names,
                       training_frame=prostateF_num,
                       validation_frame=prostateF_num)
    glrm_h2o_num.show()

    print("check with init = user")
    initial_y = [[
        -1.27675647831893E-15, 64.87421383647799, 2.0, 1.0,
        2.0816681711721685E-16, 8.533270440251574, 9.380440251572328,
        5.886792452830188
    ],
                 [
                     0.7297297297297298, 66.05405405405405, 2.0, 0.0, 1.0,
                     23.270270270270274, 9.589189189189193, 7.27027027027027
                 ],
                 [
                     0.01754385964912314, 70.35087719298245, 2.0, 1.0,
                     -1.3877787807814457E-17, 10.078947368421053,
                     42.37543859649123, 6.157894736842105
                 ],
                 [0.9, 65.95, 2.0, 0.0, 0.2, 81.94500000000001, 16.375, 7.4],
                 [
                     0.9999999999999989, 65.48598130841121, 2.0, 3.0,
                     1.3877787807814457E-16, 13.3092523364486,
                     13.268411214953275, 6.747663551401869
                 ]]
    initial_y_h2o = h2o.H2OFrame(list(initial_y))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5,
                                              loss_by_col=loss_all,
                                              recover_svd=True,
                                              transform="STANDARDIZE",
                                              seed=12345,
                                              init="User",
                                              user_y=initial_y_h2o)
    glrm_h2o.train(x=prostateF.names,
                   training_frame=prostateF,
                   validation_frame=prostateF)
    glrm_h2o.show()

    # exercise logistic loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5,
                                                  loss_by_col=loss_all,
                                                  recover_svd=True,
                                                  transform="STANDARDIZE",
                                                  seed=12345,
                                                  init="User",
                                                  user_y=initial_y_h2o)
    glrm_h2o_num.train(x=prostateF_num.names,
                       training_frame=prostateF_num,
                       validation_frame=prostateF_num)
    glrm_h2o_num.show()

    # singular values from glrm models should equal if binary columns with binary loss are read in as either
    # categorical or numerics.  If not, something is wrong.
    assert pyunit_utils.equal_two_arrays(glrm_h2o._model_json["output"]["singular_vals"],
                                         glrm_h2o_num._model_json["output"]["singular_vals"], 1e-6, 1e-4), \
        "Singular values obtained from logistic loss with column type as enum and numeric do not agree.  Fix it now."
def glrm_grid_user_y():
    export_dir = tempfile.mkdtemp()
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y_data = np.random.rand(10, 100)
    initial_y_h2o = h2o.H2OFrame(initial_y_data.tolist(),
                                 destination_frame="glrm_initial_y")
    numArchetypes = 10
    params = {
        "k": numArchetypes,
        "init": "User",
        "user_y": initial_y_h2o,
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative",
        "seed": 12345
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }

    # train grid
    grid = H2OGridSearch(H2OGeneralizedLowRankEstimator,
                         hyper_params=hyper_params)
    grid.train(x=train.names, training_frame=train, **params)
    print("first grid")
    print(grid)
    assert len(grid.model_ids) == 2
    if (grid.models[0].actual_params['transform'] == 'STANDARDIZE'):
        archetypes0p1Standardize = grid.models[0].archetypes()
        archetypes0p1None = grid.models[1].archetypes()
    else:
        archetypes0p1Standardize = grid.models[1].archetypes()
        archetypes0p1None = grid.models[0].archetypes()
    grid_path = h2o.save_grid(export_dir, grid.grid_id)
    h2o.remove_all()

    # reimport and train some more
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y = h2o.H2OFrame(initial_y_data.tolist(),
                             destination_frame="glrm_initial_y")
    grid = h2o.load_grid(grid_path)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print("second grid")
    print(grid)
    assert len(grid.model_ids) == 4
    # check actual training occurred and results are different
    for oneGridModel in grid.models:
        gamma_x = oneGridModel.actual_params['gamma_x']
        transform = oneGridModel.actual_params['transform']
        if gamma_x == 0.1 and transform == 'STANDARDIZE':
            assert oneGridModel.archetypes() == archetypes0p1Standardize
        if gamma_x == 0.1 and transform == 'NONE':
            assert oneGridModel.archetypes() == archetypes0p1None
        if gamma_x == 1 and transform == 'STANDARDIZE':
            archetypes1Standardize = oneGridModel.archetypes()
        if gamma_x == 1 and transform == 'NONE':
            archetypes1None = oneGridModel.archetypes()

    archetypesNotEqual12 = not all([
        pyunit_utils.equal_two_arrays(
            archetypes1None[i], archetypes0p1None[i], throw_error=False)
        for i in range(numArchetypes)
    ])
    assert archetypesNotEqual12
    archetypesNotEqual23 = not all([
        pyunit_utils.equal_two_arrays(archetypes1Standardize[i],
                                      archetypes0p1Standardize[i],
                                      throw_error=False)
        for i in range(numArchetypes)
    ])
    assert archetypesNotEqual23
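# Self-contained sketch of the "archetypes differ" pattern used above: the two
# settings are considered different as soon as any archetype row fails the
# elementwise comparison, hence the not all(...) over per-row results.
row_matches_demo = [True, True, False]   # hypothetical per-row comparison results
assert not all(row_matches_demo)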
def random_grid_model_seeds_early_stopping_case():
    air_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
        destination_frame="air.hex")

    features = [
        "Year", "Month", "CRSDepTime", "UniqueCarrier", "Origin", "Dest"
    ]

    gbm1 = H2OGradientBoostingEstimator(
        nfolds=0,
        keep_cross_validation_models=True,
        keep_cross_validation_predictions=False,
        keep_cross_validation_fold_assignment=False,
        score_each_iteration=False,
        score_tree_interval=1,  # has to be set to reproduce early stopping
        ignore_const_cols=True,
        balance_classes=False,
        max_after_balance_size=5.0,
        max_confusion_matrix_size=20,
        ntrees=50,
        max_depth=10,
        min_rows=2.0,
        nbins=16,
        nbins_top_level=1024,
        nbins_cats=64,
        r2_stopping=1.7976931348623157e+308,
        stopping_rounds=0,
        stopping_tolerance=0.001,
        max_runtime_secs=1.7976931348623157e+308,
        seed=1249,
        build_tree_one_node=False,
        learn_rate=0.1,
        learn_rate_annealing=1.0,
        distribution='bernoulli',
        quantile_alpha=0.5,
        tweedie_power=1.5,
        huber_alpha=0.9,
        sample_rate=0.75,
        col_sample_rate=0.31,
        col_sample_rate_change_per_level=0.94,
        col_sample_rate_per_tree=0.65,
        min_split_improvement=0.0001,
        histogram_type='QuantilesGlobal',
        max_abs_leafnode_pred=1.7976931348623157e+308,
        pred_noise_bandwidth=0.0,
        calibrate_model=False,
        check_constant_response=True)
    gbm1.train(x=features, y="IsDepDelayed", training_frame=air_hex)

    gbm2 = H2OGradientBoostingEstimator(
        nfolds=0,
        keep_cross_validation_models=True,
        keep_cross_validation_predictions=False,
        keep_cross_validation_fold_assignment=False,
        score_each_iteration=False,
        score_tree_interval=1,  # has to be set to reproduce early stopping
        ignore_const_cols=True,
        balance_classes=False,
        max_after_balance_size=5.0,
        max_confusion_matrix_size=20,
        ntrees=50,
        max_depth=10,
        min_rows=2.0,
        nbins=16,
        nbins_top_level=1024,
        nbins_cats=64,
        r2_stopping=1.7976931348623157e+308,
        stopping_rounds=0,
        stopping_tolerance=0.001,
        max_runtime_secs=1.7976931348623157e+308,
        seed=1249,
        build_tree_one_node=False,
        learn_rate=0.1,
        learn_rate_annealing=1.0,
        distribution='bernoulli',
        quantile_alpha=0.5,
        tweedie_power=1.5,
        huber_alpha=0.9,
        sample_rate=0.75,
        col_sample_rate=0.31,
        col_sample_rate_change_per_level=0.94,
        col_sample_rate_per_tree=0.65,
        min_split_improvement=0.0001,
        histogram_type='QuantilesGlobal',
        max_abs_leafnode_pred=1.7976931348623157e+308,
        pred_noise_bandwidth=0.0,
        calibrate_model=False,
        check_constant_response=True)
    gbm2.train(x=features, y="IsDepDelayed", training_frame=air_hex)

    rmse1 = pyunit_utils.extract_from_twoDimTable(
        gbm1._model_json["output"]["scoring_history"], "training_rmse", False)
    rmse2 = pyunit_utils.extract_from_twoDimTable(
        gbm2._model_json["output"]["scoring_history"], "training_rmse", False)
    print(rmse1)
    print(rmse2)

    assert pyunit_utils.equal_two_arrays(rmse1, rmse2, 1e-5, 1e-6, False), \
        "Training_rmse values are different between the two GBM models.  Tests are supposed to be repeatable in " \
        "this case.  Make sure model seeds are actually set correctly in the Java backend."