def iris_nfolds():



  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

  model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
  model.show()

  # Can specify both nfolds >= 2 and a validation frame at once



  try:
      H2ORandomForestEstimator(ntrees=50, nfolds=5).train(y=4, x=list(range(4)), training_frame=iris, validation_frame=iris)
      assert True
  except EnvironmentError:
      assert False, "unexpected error: nfolds and a validation frame should be allowed together"



if __name__ == "__main__":
  pyunit_utils.standalone_test(iris_nfolds)
else:
  iris_nfolds()
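
# A hedged sketch (not from the original test) of the same check through the
# estimator API: nfolds >= 2 and a validation frame can be combined on one model.
def iris_nfolds_with_validation(iris):
  from h2o.estimators.random_forest import H2ORandomForestEstimator
  rf = H2ORandomForestEstimator(ntrees=50, nfolds=5)
  rf.train(x=list(range(4)), y=4, training_frame=iris, validation_frame=iris)
  rf.show()
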
Example #2
                               y=iris_train[4],
                               validation_x=iris_valid[["C1","C2","C3"]],
                               validation_y=iris_valid[4],
                               ntrees=5,
                               distribution="multinomial",
                               weights_column="C2",
                               training_frame=iris_train,
                               validation_frame=iris_valid)

    # validation_frame not specified, weights not part of validation_x
    try:
        gbm4_multinomial = h2o.gbm(x=iris_train[["C1","C2","C3"]],
                                   y=iris_train[4],
                                   validation_x=iris_valid[["C1","C2","C3"]],
                                   validation_y=iris_valid[4],
                                   ntrees=5,
                                   distribution="multinomial",
                                   weights_column="C4")

        assert False, "expected an error"
    except EnvironmentError:
        assert True




if __name__ == "__main__":
    pyunit_utils.standalone_test(weights_api)
else:
    weights_api()
Example #3


def bigcatRF():

  # Training set has 100 categories from cat001 to cat100
  # Categories cat001, cat003, ... are perfect predictors of y = 1
  # Categories cat002, cat004, ... are perfect predictors of y = 0

  #Log.info("Importing bigcat_5000x2.csv data...\n")
  bigcat = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
  bigcat["y"] = bigcat["y"].asfactor()

  #Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
  #bigcat.summary()



  # Train H2O DRF Model:
  #Log.info("H2O DRF (Naive Split) with parameters:\nclassification = TRUE, ntree = 1, depth = 1, nbins = 100, nbins_cats=10\n")
  model = H2ORandomForestEstimator(ntrees=1, max_depth=1, nbins=100, nbins_cats=10)
  model.train(x="X", y="y", training_frame=bigcat)
  model.show()



if __name__ == "__main__":
  pyunit_utils.standalone_test(bigcatRF)
else:
  bigcatRF()
               [0.7297297297297298,66.05405405405405,2.0,0.0,1.0,23.270270270270274,9.589189189189193,7.27027027027027],
               [0.01754385964912314,70.35087719298245,2.0,1.0,-1.3877787807814457E-17,10.078947368421053,
                42.37543859649123,6.157894736842105],
               [0.9,65.95,2.0,0.0,0.2,81.94500000000001,16.375,7.4],
               [0.9999999999999989,65.48598130841121,2.0,3.0,1.3877787807814457E-16,13.3092523364486,
                13.268411214953275,6.747663551401869]]
    initial_y_h2o = h2o.H2OFrame(list(initial_y))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                              seed=12345, init="User", user_y=initial_y_h2o)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF)
    glrm_h2o.show()

    # exercise logistic loss with numeric columns
    glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE",
                                                  seed=12345, init="User", user_y=initial_y_h2o)
    glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num)
    glrm_h2o_num.show()

    # singular values from glrm models should equal if binary columns with binary loss are read in as either
    # categorical or numerics.  If not, something is wrong.
    assert pyunit_utils.equal_two_arrays(glrm_h2o._model_json["output"]["singular_vals"],
                                         glrm_h2o_num._model_json["output"]["singular_vals"], 1e-6, 1e-4), \
        "Singular values obtained from logistic loss with column type as enum and numeric do not agree.  Fix it now."

    sys.stdout.flush()

if __name__ == "__main__":
    pyunit_utils.standalone_test(glrm_pubdev_3756_arrest)
else:
    glrm_pubdev_3756_arrest()
    cls_bias = mx.sym.Variable('cls_bias')

    fc = mx.sym.FullyConnected(data=h_drop, weight=cls_weight, bias=cls_bias, num_hidden=num_label)

    # softmax output
    sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')
    return sm



def deepwater_tweets():
  if not H2ODeepWaterEstimator.available(): return

  tweets = h2o.import_file(pyunit_utils.locate("/home/arno/tweets.txt"), col_names=["text"], sep="|")
  labels = h2o.import_file(pyunit_utils.locate("/home/arno/labels.txt"), col_names=["label"])
  frame = tweets.cbind(labels)
  print(frame.head(5))

#  cnn = make_text_cnn(sentence_size=100, num_embed=300, batch_size=32,
#            vocab_size=100000, dropout=dropout, with_embedding=with_embedding)
  model = H2ODeepWaterEstimator(epochs=50000, learning_rate=1e-3, hidden=[100,100,100,100,100])
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  assert error < 0.1, "mean classification error is too high : " + str(error)

if __name__ == "__main__":
  pyunit_utils.standalone_test(deepwater_tweets)
else:
  deepwater_tweets()
from __future__ import print_function
import sys, os
sys.path.insert(1, os.path.join("..",".."))
import h2o
from tests import pyunit_utils
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

def deeplearning_multi():
  print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

  prostate = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))

  prostate[4] = prostate[4].asfactor()

  hh = H2ODeepLearningEstimator(loss="CrossEntropy")
  hh.train(x=[0,1],y=4, training_frame=prostate, validation_frame=prostate)
  hh.show()

if __name__ == "__main__":
  pyunit_utils.standalone_test(deeplearning_multi)
else:
  deeplearning_multi()
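
# Hedged follow-up sketch (not part of the test above): score the frame
# explicitly and read the multiclass metrics; the model and frame come from
# deeplearning_multi.
def multiclass_metrics_demo(hh, prostate):
  perf = hh.model_performance(prostate)
  print(perf.logloss())
  print(perf.confusion_matrix())
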
  ytrain = trainDataResponse.tolist()
  trainData = h2o.H2OFrame.fromPython([ytrain]+xtrain)

  trainData[0] = trainData[0].asfactor()

  print("Run model on 3250 columns of Arcene with strong rules off.")
  model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=False, alpha=1)
  model.train(x=range(1,3250), y=0, training_frame=trainData)

  print("Test model on validation set.")
  validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
  validDataResponse = np.where(validDataResponse == -1, 0, 1)
  validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
  xvalid = np.transpose(validDataFeatures).tolist()
  yvalid = validDataResponse.tolist()
  validData = h2o.H2OFrame.fromPython([yvalid]+xvalid)
  prediction = model.predict(validData)

  print("Check performance of predictions.")
  performance = model.model_performance(validData)

  print("Check that prediction AUC better than guessing (0.5).")
  assert performance.auc() > 0.5, "predictions should be better than pure chance"



if __name__ == "__main__":
  pyunit_utils.standalone_test(wide_dataset_large)
else:
  wide_dataset_large()
                print("check unsuccessful! h2o computed {0} and numpy computed {1}".format(h2o_val, num_val))
        return success

    h2o_val = h2o_data.min()
    num_val = np.min(np_data)
    assert abs(h2o_val - num_val) < 1e-06, (
        "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal min values between h2o and "
        "numpy".format(h2o_val, num_val)
    )
    h2o_val = h2o_data.max()
    num_val = np.max(np_data)
    assert abs(h2o_val - num_val) < 1e-06, (
        "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal max values between h2o and "
        "numpy".format(h2o_val, num_val)
    )
    h2o_val = h2o_data.sum()
    num_val = np.sum(np_data)
    assert abs(h2o_val - num_val) < 1e-06, (
        "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal sum values between h2o and "
        "numpy".format(h2o_val, num_val)
    )
    assert pyunit_utils.np_comparison_check(
        h2o_data.var(), np.cov(np_data, rowvar=0, ddof=1), 10
    ), "expected equal var values between h2o and numpy"


if __name__ == "__main__":
    pyunit_utils.standalone_test(expr_reducers)
else:
    expr_reducers()
import h2o
from h2o.estimators import H2OXGBoostEstimator

from tests import pyunit_utils


# Create many small models
def models_stress_test():
    data = h2o.import_file(
        pyunit_utils.locate("smalldata/testng/airlines_train.csv"))

    for i in range(0, 1000):
        xgb = H2OXGBoostEstimator(ntrees=1, max_depth=2)
        xgb.train(x=["Origin", "Distance"],
                  y="IsDepDelayed",
                  training_frame=data)


if __name__ == "__main__":
    pyunit_utils.standalone_test(models_stress_test)
else:
    models_stress_test()
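
# Hedged variant (an assumption, not part of the stress test above): removing
# each model right after training keeps the cluster's key-value store flat
# across the 1000 iterations.
def models_stress_test_with_cleanup():
    data = h2o.import_file(
        pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    for _ in range(1000):
        xgb = H2OXGBoostEstimator(ntrees=1, max_depth=2)
        xgb.train(x=["Origin", "Distance"], y="IsDepDelayed",
                  training_frame=data)
        h2o.remove(xgb)  # free the model's DKV entry before the next iteration
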
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 0.1, tol=1e-10
    )  # make sure operation sequence is preserved from Tomk
    h2o.save_model(glmOrdinalModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)


def set_params():
    global PROBLEM
    #missingValues = ['Skip', 'MeanImputation']
    missingValues = ['MeanImputation']
    PROBLEM = "multinomial"
    print("PROBLEM is {0}".format(PROBLEM))
    missing_values = missingValues[randint(0, len(missingValues) - 1)]
    reg = 1.0 / 250000.0
    params = {
        'missing_values_handling': missing_values,
        'family': "ordinal",
        'alpha': [0.5],
        'lambda_': [reg],
        'obj_reg': reg
    }
    print(params)
    return params
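
# Hedged usage sketch: the dict returned by set_params() can be splatted
# straight into the GLM constructor (the import mirrors the estimator this
# test exercises).
def build_ordinal_glm():
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    return H2OGeneralizedLinearEstimator(**set_params())
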


if __name__ == "__main__":
    pyunit_utils.standalone_test(glm_ordinal_mojo_pojo)
else:
    glm_ordinal_mojo_pojo()
Example #11
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights[0]):
        if w == 2: doubled_data.append(doubled_data[idx])
    doubled_data = list(zip(*doubled_data))
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.set_names(list(colnames))

    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled[
        "economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights[
        "economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights[
        "cylinders"].asfactor()

    print "Checking that doubling some weights is equivalent to doubling those observations:"
    print
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)

    # TODO: random weights

    # TODO: all zero weights???

    # TODO: negative weights???


if __name__ == "__main__":
    pyunit_utils.standalone_test(weights_check)
else:
    weights_check()
Example #12
    x = ["C1", "C2"]
    y = "C11"
    gam_cols1 = ["C6", ["C7", "C8"], "C9", "C10"]
    gam_cols2 = [["C6"], ["C7", "C8"], ["C9"], ["C10"]]
    h2o_model1 = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                 gam_columns=gam_cols1,
                                                 bs=[1, 1, 0, 0],
                                                 max_iterations=2)
    h2o_model1.train(x=x, y=y, training_frame=train, validation_frame=test)
    h2o_model2 = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                 gam_columns=gam_cols2,
                                                 bs=[1, 1, 0, 0],
                                                 max_iterations=2)
    h2o_model2.train(x=x, y=y, training_frame=train, validation_frame=test)
    # check that both models produce the same coefficients
    print(h2o_model1.coef())
    print(h2o_model2.coef())
    pyunit_utils.assertCoefDictEqual(h2o_model1.coef()['coefficients'],
                                     h2o_model2.coef()['coefficients'],
                                     tol=1e-6)
    # check both models produce the same validation metrics
    assert abs(h2o_model1.logloss(valid=True) - h2o_model2.logloss(valid=True)) < 1e-6,\
        "Expected validation logloss: {0}, Actual validation logloss: {1}".format(h2o_model1.logloss(valid=True),
                                                                                  h2o_model2.logloss(valid=True))


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_gam_dual_mode_multinomial)
else:
    test_gam_dual_mode_multinomial()
Example #13
import sys
import os
sys.path.insert(1, os.path.join("../../../h2o-py"))
from tests import pyunit_utils
import h2o
from h2o.exceptions import H2OServerError


def trace_request():
    err = None
    try:
        h2o.api("TRACE /3/Cloud")
    except H2OServerError as e:
        err = e

    assert err is not None
    msg = str(err.args[0])
    print("<Error message>")
    print(msg)
    print("</Error Message>")

    # exact message depends on Jetty Version and security settings
    assert msg.startswith("HTTP 500") or msg.startswith(
        "HTTP 405 Method Not Allowed")


if __name__ == "__main__":
    pyunit_utils.standalone_test(trace_request)
else:
    trace_request()
Example #14
  # 2. more folds than observations
  try:
    rf = H2ORandomForestEstimator(nfolds=cars.nrow+1, fold_assignment="Modulo")
    rf.train(y=response_col, x=predictors, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    rf = H2ORandomForestEstimator(nfolds=3)
    rf.train(y=response_col, x=predictors, fold_column="fold_assignments", training_frame=cars)
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True

  # # 4. fold_column and fold_assignment both specified
  # try:
  #     rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], fold_assignment="Random",
  #                            fold_column="fold_assignments", training_frame=cars)
  #     assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
  # except EnvironmentError:
  #     assert True



if __name__ == "__main__":
  pyunit_utils.standalone_test(cv_carsRF)
else:
  cv_carsRF()
Example #15
                                          max_iterations=7,
                                          solver=solver)
    model.train(training_frame=split_frames[0],
                x=x_indices,
                y=y_index,
                validation_frame=split_frames[1])
    modelCheckpoint = H2OGeneralizedLinearEstimator(family=family,
                                                    checkpoint=model.model_id,
                                                    solver=solver)
    modelCheckpoint.train(training_frame=split_frames[0],
                          x=x_indices,
                          y=y_index,
                          validation_frame=split_frames[1])

    modelLong = H2OGeneralizedLinearEstimator(
        family=family, solver=solver)  # allow to run to completion
    modelLong.train(training_frame=split_frames[0],
                    x=x_indices,
                    y=y_index,
                    validation_frame=split_frames[1])

    pyunit_utils.assertEqualCoeffDicts(modelCheckpoint.coef(),
                                       modelLong.coef(),
                                       tol=5e-2)


if __name__ == "__main__":
    pyunit_utils.standalone_test(testGLMCheckpointBinomial)
else:
    testGLMCheckpointBinomial()
  # bernoulli - offset not supported
  #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
  #                       training_frame=cars)
  #predictions = dl.predict(cars)



  # gamma
  dl = H2ODeepLearningEstimator(distribution="gamma")
  dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset")
  predictions = dl.predict(insurance)

  # gaussian
  dl = H2ODeepLearningEstimator(distribution="gaussian")
  dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset")
  predictions = dl.predict(insurance)

  # poisson
  dl = H2ODeepLearningEstimator(distribution="poisson")
  dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset")
  predictions = dl.predict(insurance)

  # tweedie
  dl = H2ODeepLearningEstimator(distribution="tweedie")
  dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset")
  predictions = dl.predict(insurance)

if __name__ == "__main__":
  pyunit_utils.standalone_test(offsets_and_distributions)
else:
  offsets_and_distributions()
Example #17
    except H2OValueError: # as designed
      pass
        
    compare_frames(badFrame, badClone)

    originalAfterOp = H2OFrame.get_frame(badFrame.frame_id)
    compare_frames(badFrame, originalAfterOp)

    goodFrame = H2OFrame({"one": [4, 6, 1], "two": ["a", "b", "cde"]})
    goodClone = H2OFrame({"one": [4, 6, 1], "two": ["a", "b", "cde"]})
    compare_frames(goodFrame, goodClone)

    factoredFrame = goodFrame.asfactor()

    originalAfterOp = H2OFrame.get_frame(goodFrame.frame_id)
    compare_frames(goodFrame, originalAfterOp)

    expectedFactoredFrame = H2OFrame({"one": [4, 6, 1], "two": ["a", "b", "cde"]}, column_types={"one":"categorical", "two": "enum"})

    compare_frames(expectedFactoredFrame, factoredFrame)

    refactoredFrame = expectedFactoredFrame.asfactor()
    factoredAfterOp = H2OFrame.get_frame(refactoredFrame.frame_id)
    compare_frames(expectedFactoredFrame, factoredAfterOp)

if __name__ == "__main__":
    pyunit_utils.standalone_test(test1)
else:
    test1()

import sys
sys.path.insert(1, "../../")
import h2o
from tests import pyunit_utils


def vec_as_list():

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    res = h2o.as_list(iris[0], use_pandas=False)
    assert abs(float(res[0][4]) - 4.6) < 1e-10 and abs(float(res[0][6]) - 5.4) < 1e-10 and \
           abs(float(res[0][10]) - 4.9) < 1e-10, "incorrect values"

    res = 2 - iris
    res = h2o.as_list(res[0], use_pandas=False)
    assert abs(float(res[0][4]) - -2.6) < 1e-10 and abs(float(res[0][18]) - -3.1) < 1e-10 and \
           abs(float(res[0][25]) - -2.8) < 1e-10, "incorrect values"


if __name__ == "__main__":
    pyunit_utils.standalone_test(vec_as_list)
else:
    vec_as_list()
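
# Hedged variant (an assumption, not part of vec_as_list): with use_pandas=True,
# h2o.as_list returns a pandas DataFrame instead of the list-of-lists used above.
def vec_as_pandas():
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    df = h2o.as_list(iris[0], use_pandas=True)
    print(df.head())
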
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=model1._id)

    model4 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])


    assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True))
    assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))
    assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))



if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_1829)
else:
    pubdev_1829()
Example #20
import h2o

from h2o.exceptions import H2OResponseError
from tests import pyunit_utils


def pubdev_4863():

    try:
        h2o.rapids("(tmp= digi_temp (cols_py 123STARTSWITHDIGITS 'a'))")
        assert False
    except H2OResponseError as error:
        print(error)
        assert 'Error: Name lookup of \'123STARTSWITHDIGITS\' failed' in str(
            error)


if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_4863)
else:
    pubdev_4863()
    def test_property_disabled():
        print("\n=== disabling "+kcvm+" ===")
        grid_search = setup_grid()
        train = prepare_data()
        grid_search.train(x=range(4), y=4, training_frame=train, nfolds=nfolds,
                 keep_cross_validation_models=False)
        keys = list_keys_in_memory()
        tot, cv = len(keys['models']), len(keys['cv_models'])
        print("total grid models in memory = {tot}, among which {cv} CV models".format(tot=tot, cv=cv))
        assert tot > 0, "no grid models left in memory"
        assert cv == 0, "{cv} CV models were not cleaned from memory".format(cv=cv)
        for m in grid_search.models:
            assert not m.cross_validation_models(), "unexpected cv models for model " + str(m)


    test_defaults()
    test_property_enabled()
    test_property_disabled()


def test_all():
    test_keep_cross_validation_predictions_on_gbm_grid()
    test_keep_cross_validation_models_on_gbm_grid()

if __name__ == "__main__":
    pyunit_utils.standalone_test(test_all)
else:
    test_all()


Example #22
                model_index += 1

            if (diff > self.diff) or not(grid_model_metrics == sorted(grid_model_metrics)) or (diff_train < self.diff):
                self.test_failed = 1
                print("test_rf_gridsearch_sorting_metrics for random forest has failed!")

            if self.test_failed == 0:
                print("test_rf_gridsearch_sorting_metrics for random forest has passed!")



def test_gridsearch_sorting_metrics():
    """
    Create and instantiate class and perform tests specified for random forest

    :return: None
    """
    test_rf_grid = Test_rf_gridsearch_sorting_metrics()
    test_rf_grid.test_rf_gridsearch_sorting_metrics()

    sys.stdout.flush()

    if test_rf_grid.test_failed:  # exit with error if any tests have failed
        sys.exit(1)


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_gridsearch_sorting_metrics)
else:
    test_gridsearch_sorting_metrics()
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis
    np.apply_along_axis(is_basis, 1, fit_x_np)

    print "Check final objective function value"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"



if __name__ == "__main__":
    pyunit_utils.standalone_test(glrm_unitonesparse)
else:
    glrm_unitonesparse()
Example #24
                    "test3_duplicated_parameter_specification failed: Java error exception ({0}) should not "
                    "have been thrown! ".format(e))
            else:
                print(
                    "test3_duplicated_parameter_specification passed: Java error exception ({0}) should "
                    "have been thrown and did.".format(e))


def test_grid_search_for_glm_over_all_params():
    """
    Create and instantiate class and perform tests specified for GLM

    :return: None
    """
    test_glm_grid = Test_glm_grid_search()
    test_glm_grid.test1_glm_grid_search_over_params()
    test_glm_grid.test2_illegal_name_value()
    test_glm_grid.test3_duplicated_parameter_specification()
    sys.stdout.flush()

    if test_glm_grid.test_failed:  # exit with error if any tests have failed
        sys.exit(1)
    else:  # remove json files if everything passes
        test_glm_grid.tear_down()


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_grid_search_for_glm_over_all_params)
else:
    test_grid_search_for_glm_over_all_params()
        assert True

    # Log.info("Number of rows exceeds training set's")
    start = [[random.gauss(0,1) for c in range(numcol)] for r in range(numrow+2)]
    try:
        h2o.kmeans(x=benign_h2o, k=numrow+2, user_points=h2o.H2OFrame(start))
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Nones are replaced with mean of a column in H2O. Not sure about Inf.
    # Log.info("Any entry is NA, NaN, or Inf")
    start = [[random.gauss(0,1) for c in range(numcol)] for r in range(3)]
    for x in ["NA", "NaN", "Inf", "-Inf"]:
        start_err = start[:]
        start_err[1][random.randint(0,numcol-1)] = x
        h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start_err))

    # Duplicates will affect sampling probability during initialization.
    # Log.info("Duplicate initial clusters specified")
    start = [[random.gauss(0,1) for c in range(numcol)] for r in range(3)]
    start[2] = start[0]
    h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start))
  


if __name__ == "__main__":
    pyunit_utils.standalone_test(init_err_casesKmeans)
else:
    init_err_casesKmeans()
Example #26
            result_frame_allsubsets["model_id"][ind, 0])
        pred_allsubsets = one_model_allsubsets.predict(d)
        print("last element of predictor frame: {0}".format(
            pred_allsubsets[pred_allsubsets.nrows - 1,
                            pred_allsubsets.ncols - 1]))
        assert pred_allsubsets.nrows == d.nrows, "expected dataset row: {0}, actual dataset row: " \
                                                 "{1}".format(pred_allsubsets.nrows, d.nrows)
        best_r2_value_maxr = best_r2_maxr[ind]
        one_model_maxr = h2o.get_model(result_frame_maxr["model_id"][ind, 0])
        pred_maxr = one_model_maxr.predict(d)
        pyunit_utils.compare_frames_local(
            pred_maxr, pred_allsubsets, prob=1,
            tol=1e-6)  # compare allsubsets and maxr results
        # r2 from result frame
        frame_r2_allsubsets = result_frame_allsubsets["best_r2_value"][ind, 0]
        # r2 from model
        model_r2_allsubsets = one_model_allsubsets.r2()
        # make sure all r2 are equal
        assert abs(best_r2_value_allsubsets-frame_r2_allsubsets) < 1e-6, "expected best r2: {0}, actual best r2: " \
                                                                   "{1}".format(best_r2_value_allsubsets, frame_r2_allsubsets)
        assert abs(frame_r2_allsubsets-model_r2_allsubsets) < 1e-6, "expected best r2: {0}, actual best r2: " \
                                                                    "{1}".format(model_r2_allsubsets, frame_r2_allsubsets)
        assert abs(best_r2_value_maxr-model_r2_allsubsets) < 1e-6, "expected best r2: {0}, maxr best r2: {1}" \
                                                             "".format(best_r2_value_maxr, model_r2_allsubsets)


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_gaussian_result_frame_model_id)
else:
    test_gaussian_result_frame_model_id()
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_


if __name__ == "__main__":
    pyunit_utils.standalone_test(get_modelKmeans)
else:
    get_modelKmeans()
    assert set(['a', 'b', 'c']) == set(levels), \
        "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels)
    assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(
        nlevels)

    iris[4] = iris[4].set_level(level='b')
    levels = iris.levels(col=4)
    nlevels = iris.nlevels(col=4)
    assert set(['a', 'b', 'c']) == set(levels), \
        "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels)
    assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(
        nlevels)
    assert iris[0, 4] == 'b'

    levels = iris[1].levels()
    nlevels = iris[1].nlevels()
    assert levels is None, "Expected levels to be None, but got {0}".format(
        levels)
    assert nlevels == 0, "Expected nlevels to be 0, but got {0}".format(
        nlevels)

    one_column_frame = iris[4]
    one_column_frame = one_column_frame.set_level(level='c')
    assert one_column_frame[0, 0] == 'c'


if __name__ == "__main__":
    pyunit_utils.standalone_test(levels_nlevels_setlevel_setLevels_test)
else:
    levels_nlevels_setlevel_setLevels_test()
Example #29
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
import os


def remove_obj_client():

  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
  
  Y = 3
  X = list(range(3)) + list(range(4, 11))
  
  from h2o.estimators.glm import H2OGeneralizedLinearEstimator
  model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
  print(model.model_id)
  print(model)
  model.train(x=X,y=Y, training_frame=training_data)
  print(model)
  h2o.remove(model)
  print(model)

  h2o.remove(training_data)
  print(training_data)


if __name__ == "__main__":
  pyunit_utils.standalone_test(remove_obj_client)
else:
  remove_obj_client()
                               max_iterations=miters)
        print(cross1_km)

        print(
            "Run k-means with init = final cluster centers and max_iterations = 1"
        )
        init_centers = h2o.H2OFrame(cross1_km.centers())
        cross2_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               user_points=init_centers,
                               max_iterations=1)
        print(cross2_km)

        print("Check k-means converged or maximum iterations reached")
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = old_div(((c1 - c2)**2).sum(), ncent)
        iters = cross1_km._model_json['output']['model_summary'].cell_values[
            0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        raise EnvironmentError


if __name__ == "__main__":
    pyunit_utils.standalone_test(hdfs_kmeans_converge)
else:
    hdfs_kmeans_converge()
  h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

  doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
  colnames = doubled_data.pop(0)
  for idx, w in enumerate(doubled_weights[0]):
    if w == 2: doubled_data.append(doubled_data[idx])
  h2o_data_doubled = h2o.H2OFrame(doubled_data)
  h2o_data_doubled.set_names(list(colnames))

  h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
  h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
  h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
  h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()

  print("Checking that doubling some weights is equivalent to doubling those observations:")
  print()
  check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)

  # TODO: random weights

  # TODO: all zero weights???

  # TODO: negative weights???



if __name__ == "__main__":
  pyunit_utils.standalone_test(weights_check)
else:
  weights_check()
Example #32
                                           reproducible=True,
                                           seed=1234)
    hh_balanced.train(x=range(54), y=54, training_frame=covtype)
    print(hh_balanced)

    #compare overall logloss
    class_6_err_imbalanced = hh_imbalanced.logloss()
    class_6_err_balanced = hh_balanced.logloss()

    if class_6_err_imbalanced < class_6_err_balanced:
        print "--------------------"
        print ""
        print "FAIL, balanced error greater than imbalanced error"
        print ""
        print ""
        print "class_6_err_imbalanced"
        print class_6_err_imbalanced
        print ""
        print "class_6_err_balanced"
        print class_6_err_balanced
        print ""
        print "--------------------"

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"


if __name__ == "__main__":
    pyunit_utils.standalone_test(imbalance)
else:
    imbalance()
                                     max_depth=1,
                                     min_rows=1,
                                     learn_rate=0.1,
                                     distribution="gaussian")
  gbm.train(x=range(3), y="Claims", training_frame=insurance, offset_column="offset")

  predictions = gbm.predict(insurance)

  # Comparison result generated from R's gbm:
  #	fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
  #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
  #   data = Insurance, distribution ="gaussian", n.trees = 600)
  #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
  #   pr = pg - - log(Insurance$Holders)
  assert abs(44.33016 - gbm._model_json['output']['init_f']) < 1e-5, "expected init_f to be {0}, but got {1}". \
    format(44.33016, gbm._model_json['output']['init_f'])
  assert abs(1491.135 - gbm.mse()) < 1e-2, "expected mse to be {0}, but got {1}".format(1491.135, gbm.mse())
  assert abs(49.23438 - predictions.mean()) < 1e-2, "expected prediction mean to be {0}, but got {1}". \
    format(49.23438, predictions.mean())
  assert abs(-45.5720659304 - predictions.min()) < 1e-2, "expected prediction min to be {0}, but got {1}". \
    format(-45.5720659304, predictions.min())
  assert abs(207.387 - predictions.max()) < 1e-2, "expected prediction max to be {0}, but got {1}". \
    format(207.387, predictions.max())



if __name__ == "__main__":
  pyunit_utils.standalone_test(offset_gaussian)
else:
  offset_gaussian()
import sys, os
sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
from tests import pyunit_utils
from collections import OrderedDict
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def grid_parallel_cv():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    # Run GBM Grid Search using Cross Validation with parallelization enabled
    ntrees_opts = [1, 3, 5]
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2OGradientBoostingEstimator,
                       hyper_params=hyper_parameters,
                       parallelism=2)
    gs.train(x=list(range(4)), y=4, training_frame=train, nfolds=3)
    assert gs is not None
    assert len(gs.model_ids) == len(ntrees_opts)


if __name__ == "__main__":
    pyunit_utils.standalone_test(grid_parallel_cv)
else:
    grid_parallel_cv()
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    dataset_params['response_factors'] = random.randint(3,100)
    print "Dataset parameters: {0}".format(dataset_params)

    train = h2o.create_frame(**dataset_params)

    print "Training dataset:"
    print train

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"nb_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0,1): params['laplace'] = random.uniform(0,11)
    print "Parameter list: {0}".format(params)

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)

if __name__ == "__main__":
    pyunit_utils.standalone_test(javapredict_dynamic_data)
else:
    javapredict_dynamic_data()
    col_sample_rate_per_tree = 0.6
    nfolds = 2
    min_split_improvement = 1e-04
    response = "class"
    features = train.col_names
    features.remove(response)

    print("Train 100 GBM models to test if it fails.")
    for i in range(1, 100):
        seed = randint(1000, 2000)
        print(i, ": train model with random seed: ", seed)
        my_gbm = H2OGradientBoostingEstimator(
            ntrees=ntrees,
            max_depth=max_depth,
            min_rows=min_rows,
            learn_rate=learn_rate,
            sample_rate=sample_rate,
            col_sample_rate_per_tree=col_sample_rate_per_tree,
            nfolds=nfolds,
            min_split_improvement=min_split_improvement,
            seed=seed)
        my_gbm.train(x=features,
                     y=response,
                     training_frame=train,
                     validation_frame=train)


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_pubdev_3847)
else:
    test_pubdev_3847()
    #    pass

    # LHS: H2OFrame, RHS: H2OVec
    #try:
    #    res = iris + iris[0]
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    # LHS: H2OFrame, RHS: scalar
    # res = 1.2 + iris[2]
    # res2 = iris + res[21,:]
    # res2.show()

    # LHS: H2OFrame, RHS: scalar
    res = iris + 2
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    for x, y in zip([res[c].sum() for c in range(cols-1)], [469.9, 342.6, 266.9, 162.2]):
        assert abs(x - y) < 1e-1,  "expected same values"

    ###################################################################



if __name__ == "__main__":
    pyunit_utils.standalone_test(binop_plus)
else:
    binop_plus()
    
        contributions = m.predict_contributions(first_row, top_n=50, bottom_n=50, compare_abs=True, output_format=output_format)
        check_sorted_correctly(contributions, first_row_sorted_desc_abs)
    
        contributions = m.predict_contributions(first_row, top_n=4, bottom_n=4, compare_abs=True, output_format=output_format)
        check_sorted_correctly(contributions, first_row_sorted_desc_abs)


def check_sorted_correctly(contributions, python_sorted):
    assert_equals(15, contributions.shape[1], "Wrong number of columns")
    assert_equals(python_sorted[0][0], contributions[0, 0], "Not correctly sorted")
    assert_equals(python_sorted[1][0], contributions[0, 2], "Not correctly sorted")
    assert_equals(python_sorted[2][0], contributions[0, 4], "Not correctly sorted")
    assert_equals(python_sorted[3][0], contributions[0, 6], "Not correctly sorted")
    assert_equals(python_sorted[4][0], contributions[0, 8], "Not correctly sorted")
    assert_equals(python_sorted[5][0], contributions[0, 10], "Not correctly sorted")
    assert_equals(python_sorted[6][0], contributions[0, 12], "Not correctly sorted")


def check_sorted_correcty_first_two_last_two(contributions, python_sorted_desc, python_sorted_asc):
    assert_equals(python_sorted_desc[0][0], contributions[0, 0], "Not correctly sorted")
    assert_equals(python_sorted_desc[1][0], contributions[0, 2], "Not correctly sorted")
    assert_equals(python_sorted_asc[0][0], contributions[0, 4], "Not correctly sorted")
    assert_equals(python_sorted_asc[1][0], contributions[0, 6], "Not correctly sorted")
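
# A hedged usage sketch of the API the checks above exercise; `m`, `first_row`,
# and `output_format` are passed in because their definitions are truncated away.
def contributions_demo(m, first_row, output_format):
    # top_n/bottom_n select the largest and smallest contributions;
    # compare_abs=True would rank by magnitude instead of signed value.
    return m.predict_contributions(first_row, top_n=2, bottom_n=2,
                                   compare_abs=False, output_format=output_format)
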


if __name__ == "__main__":
    pyunit_utils.standalone_test(xgboost_predict_contributions_sorting)
else:
    xgboost_predict_contributions_sorting()
import h2o
from tests import pyunit_utils
from h2o.estimators.gbm import H2OGradientBoostingEstimator

def weights_gamma():

  htable  = h2o.upload_file(pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
  htable["premiekl"] = htable["premiekl"].asfactor()
  htable["moptva"] = htable["moptva"].asfactor()
  htable["zon"] = htable["zon"]

  hh = H2OGradientBoostingEstimator(distribution="gamma",
                                    ntrees=20,
                                    max_depth=1,
                                    min_rows=1,
                                    learn_rate=1)
  hh.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
  ph = hh.predict(htable)

  assert abs(8.804447-hh._model_json['output']['init_f']) < 1e-6*8.804447
  assert abs(3751.01-ph[0].min()) < 1e-4*3751.01
  assert abs(15298.87-ph[0].max()) < 1e-4*15298.87
  assert abs(8121.98-ph[0].mean()[0]) < 1e-4*8121.98



if __name__ == "__main__":
  pyunit_utils.standalone_test(weights_gamma)
else:
  weights_gamma()
Example #40
  mean_residual_deviance_history = extract_scoring_history_field(gbm, "training_deviance")
  print("History of training mean residual deviance during training is {0}".format(mean_residual_deviance_history))

  assert abs(mean_residual_deviance_history[-1]-gbm_mrd) < 1e-12, "mean_residual_deviance function is not working."

def extract_scoring_history_field(aModel, fieldOfInterest):
  """
  Given a fieldOfInterest that is found in the model scoring history, this function extracts the list
  of field values from the model.

  :param aModel: H2O model where you want to extract a list of fields from the scoring history
  :param fieldOfInterest: string representing a field of interest.
  :return: List of field values or None if it cannot be found
  """

  allFields = aModel._model_json["output"]["scoring_history"]._col_header
  if fieldOfInterest in allFields:
    cellValues = []
    fieldIndex = allFields.index(fieldOfInterest)
    for eachCell in aModel._model_json["output"]["scoring_history"].cell_values:
      cellValues.append(eachCell[fieldIndex])
    return cellValues
  else:
    return None
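
# A hedged usage sketch for the helper above; the GBM, frame, and response
# below are illustrative assumptions, not part of the original test.
def scoring_history_demo():
  from h2o.estimators.gbm import H2OGradientBoostingEstimator
  frame = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  gbm = H2OGradientBoostingEstimator(ntrees=5, distribution="gaussian")
  gbm.train(x=list(range(3, 9)), y="AGE", training_frame=frame)
  print(extract_scoring_history_field(gbm, "training_deviance"))
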


if __name__ == "__main__":
  pyunit_utils.standalone_test(gbm_residual_deviance)
else:
  gbm_residual_deviance()
Example #41
  hh_balanced = H2OGradientBoostingEstimator(ntrees=10,
                                             nfolds=3,
                                             distribution="multinomial",
                                             balance_classes=False)
  hh_balanced.train(x=range(54), y=54, training_frame=covtype)
  hh_balanced_perf = hh_balanced.model_performance(covtype)
  hh_balanced_perf.show()

  #compare error for class 6 (difficult minority)
  class_6_err_imbalanced = hh_imbalanced_perf.confusion_matrix().cell_values[5][7]
  class_6_err_balanced = hh_balanced_perf.confusion_matrix().cell_values[5][7]

  print("--------------------")
  print("")
  print("class_6_err_imbalanced")
  print(class_6_err_imbalanced)
  print("")
  print("class_6_err_balanced")
  print(class_6_err_balanced)
  print("")
  print("--------------------")

  assert class_6_err_imbalanced >= 0.90*class_6_err_balanced, "balance_classes makes it at least 10% worse!"



if __name__ == "__main__":
  pyunit_utils.standalone_test(imbalanced_gbm)
else:
  imbalanced_gbm()
Example #42
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile,
                                                        y,
                                                        enumCols=enumCols)
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_regression(h2oPredictD,
                                            nativePred,
                                            h2oTrainTimeD,
                                            nativeTrainTime,
                                            h2oPredictTimeD,
                                            nativeScoreTime,
                                            tolerance=testTol)


if __name__ == "__main__":
    pyunit_utils.standalone_test(comparison_test_dense)
else:
    comparison_test_dense()
    H2OKMeansEstimator(max_iterations=0).train(x = range(ozone_h2o.ncol), training_frame=ozone_h2o)
    assert False, "expected an error"
  except EnvironmentError:
    assert True

  centers = start
  for i in range(miters):
    rep_fit = H2OKMeansEstimator(k=ncent, user_points=centers, max_iterations=1)
    rep_fit.train(x = range(ozone_h2o.ncol), training_frame=ozone_h2o)
    centers = h2o.H2OFrame(rep_fit.centers())

  # Log.info(paste("Run k-means with max_iter=miters"))
  all_fit = H2OKMeansEstimator(k=ncent, user_points=start, max_iterations=miters)
  all_fit.train(x=range(ozone_h2o.ncol), training_frame=ozone_h2o)
  assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

  # Log.info("Check cluster centers have converged")
  all_fit2 = H2OKMeansEstimator(k=ncent, user_points=h2o.H2OFrame(all_fit.centers()),
                        max_iterations=1)
  all_fit2.train(x=range(ozone_h2o.ncol), training_frame= ozone_h2o)
  avg_change = sum([sum([pow((e1 - e2),2) for e1, e2 in zip(c1,c2)]) for c1, c2 in zip(all_fit.centers(),
                                                                                       all_fit2.centers())]) / ncent
  assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] == miters



if __name__ == "__main__":
  pyunit_utils.standalone_test(convergeKmeans)
else:
  convergeKmeans()
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          checkpoint=restored_model.model_id)
    model2.train(x=list(range(1, milsong_train.ncol)),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)

    model3 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution)

    model3.train(x=list(range(1, milsong_train.ncol)),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)
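
    # Hedged addition (the original assertions are truncated away): a checkpointed
    # continuation (model2) is expected to score like the single-shot model3.
    assert abs(model2.mse(valid=True) - model3.mse(valid=True)) < 1e-6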


if __name__ == "__main__":
    pyunit_utils.standalone_test(milsong_checkpoint)
else:
    milsong_checkpoint()
Example #45
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA


def screeplot_test():
    kwargs = {}
    kwargs['server'] = True
    australia = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv"))
    australia_pca = H2OPCA(k=4,transform="STANDARDIZE")
    australia_pca.train(x=list(range(8)), training_frame=australia)
    australia_pca.screeplot(type="barplot", **kwargs)
    australia_pca.screeplot(type="lines", **kwargs)

if __name__ == "__main__":
    pyunit_utils.standalone_test(screeplot_test)
else:
    screeplot_test()
Example #46
    print()
    print("======================================================================")
    print("============================== Gaussian ==============================")
    print("======================================================================")
    for i in range(10):
        attack("gaussian", cars_train, cars_valid,
               random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1)

    print()
    print("======================================================================")
    print("============================== Poisson  ==============================")
    print("======================================================================")
    for i in range(10):
        attack("poisson", cars_train, cars_valid,
               random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2)

    print()
    print("======================================================================")
    print("==============================  Gamma   ==============================")
    print("======================================================================")
    for i in range(10):
        attack("gamma", pros_train, pros_valid,
               random.sample([1, 2, 3, 5, 6, 7, 8], random.randint(1, 7)), 4)


if __name__ == "__main__":
    pyunit_utils.standalone_test(random_attack)
else:
    random_attack()
  # /99/Rapids, parms: {ast=(tmp= py_8 (append py_7 (| (== (cols_py py_7 "WeekDay") "Sun") (== (cols_py py_7 "WeekDay") "Sat")) "Weekend"))}
  # DELETE /3/DKV/(?<key>.*), parms: {key=py_7}
  # /3/Frames/(?<frameid>.*), parms: {frame_id=py_8, row_count=10}
  crimes["Weekend"] = (crimes["WeekDay"] == "Sun") | (crimes["WeekDay"] == "Sat")

  # /99/Rapids, parms: {ast=(tmp= py_9 (append py_8 (cut (cols_py py_8 "Month") [0 2 5 7 10 12] ["Winter" "Spring" "Summer" "Autumn" "Winter"] FALSE TRUE 3) "Season"))}
  # DELETE /3/DKV/(?<key>.*), parms: {key=py_8}
  # /3/Frames/(?<frameid>.*), parms: {frame_id=py_9, row_count=10}
  crimes["Season"]  = crimes["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])

  # /99/Rapids, parms: {ast=(tmp= py_10 (cols py_9 -3))}
  # DELETE /3/DKV/(?<key>.*), parms: {key=py_9}
  # /3/Frames/(?<frameid>.*), parms: {frame_id=py_10, row_count=10}
  crimes = crimes.drop("Date")

  crimes.describe()

  # DELETE /3/DKV/(?<key>.*), parms: {key=py_10}

  tmps1 = pyunit_utils.temp_ctr(); ntmps = tmps1-tmps0
  rest1 = pyunit_utils.rest_ctr(); nrest = rest1-rest0
  print(("Number of temps used: ",ntmps))
  print(("Number of RESTs used: ",nrest))
  assert ntmps <= 15
  assert nrest <= 20

if __name__ == "__main__":
  pyunit_utils.standalone_test(date_munge)
else:
  date_munge()
def pubdev_1431():

    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30],
                      y=airlines_billion[30],
                      ntrees=1,
                      distribution="bernoulli",
                      max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        os.remove(csv)
    else:
        raise EnvironmentError(
            "Not running on H2O internal network.  No access to HDFS.")


if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_1431)
else:
    pubdev_1431()
    expNum=expNum+1
    if (buildModel[expNum]):
        print("------  Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Randomized", seed=12345,
                               max_iterations=5)  # power
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 randomizedPCA._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1e-1, check_all=False)

        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        # compare singular vectors
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["names"], tolerance=1e-6,
                                                 check_sign=True, check_all=False)
    h2o.remove_all()


if __name__ == "__main__":
    pyunit_utils.standalone_test(pca_wideDataset_rotterdam)
else:
    pca_wideDataset_rotterdam()
Example #50
    # Without weights
    myX = ["Merit", "Class", "C1M3", "C4M3"]

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    dl = H2ODeepLearningEstimator(distribution="tweedie",
                                  hidden=[1],
                                  epochs=1000,
                                  train_samples_per_iteration=-1,
                                  reproducible=True,
                                  activation="Tanh",
                                  balance_classes=False,
                                  force_load_balance=False,
                                  seed=2353123,
                                  tweedie_power=1.5,
                                  score_training_samples=0,
                                  score_validation_samples=0)

    dl.train(x=myX, y="Loss", training_frame=cancar)

    mean_residual_deviance = dl.mean_residual_deviance()

    # With weights
    dl.train(x=myX, y="Loss", training_frame=cancar, weights_column="Insured")


if __name__ == "__main__":
    pyunit_utils.standalone_test(tweedie_weights)
else:
    tweedie_weights()
Example #51
    pyunit_utils.np_comparison_check(h2o_data3.expm1(), np.expm1(np_data3), 10)

    h2o_val = h2o_data3.gamma()[5,5]
    num_val = math.gamma(h2o_data3[5,5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal gamma values between h2o and " \
        "math".format(h2o_val, num_val)

    h2o_val = h2o_data3.lgamma()[5,5]
    num_val = math.lgamma(h2o_data3[5,5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal lgamma values between h2o and " \
        "math".format(h2o_val, num_val)

    h2o_val = h2o_data3.digamma()[5,5]
    num_val = scipy.special.polygamma(0, h2o_data3[5,5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal digamma values between h2o and " \
        "math".format(h2o_val, num_val)

    h2o_val = h2o_data3.trigamma()[5,5]
    num_val = float(scipy.special.polygamma(1, h2o_data3[5,5]))
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal trigamma values between h2o and " \
        "math".format(h2o_val, num_val)



if __name__ == "__main__":
    pyunit_utils.standalone_test(expr_math_ops)
else:
    expr_math_ops()
    # create a fold column for train
    fold_numbers = train.kfold_column(n_folds=5, seed=1234)
    # rename the column "fold_numbers"
    fold_numbers.set_names(["fold_numbers"])
    train = train.cbind(fold_numbers)

    # build the GAM model
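    # knot_ids references the keys of pre-built frames (frameKnots1..frameKnots3)
    # holding the knot locations for each of the three gam columns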
    h2o_model_fold_column = H2OGeneralizedAdditiveEstimator(
        family='multinomial',
        gam_columns=["C6", "C7", "C8"],
        scale=[1, 1, 1],
        num_knots=numKnots,
        knot_ids=[frameKnots1.key, frameKnots2.key, frameKnots3.key])

    h2o_model_fold_column.train(x=x,
                                y=y,
                                training_frame=train,
                                fold_column="fold_numbers")

    # both models should return the same coefficients since they use the same fold assignment
    coeff = h2o_model.coef()
    coeff_fold_column = h2o_model_fold_column.coef()
    pyunit_utils.assertCoefDictEqual(coeff['coefficients'],
                                     coeff_fold_column['coefficients'])


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_gam_cv_fold_columns)
else:
    test_gam_cv_fold_columns()
Example #53
  # 2. more folds than observations
  try:
    rf = H2ORandomForestEstimator(nfolds=cars.nrow+1, fold_assignment="Modulo")
    rf.train(y=response_col, x=predictors, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    rf = H2ORandomForestEstimator(nfolds=3)
    rf.train(y=response_col, x=predictors, fold_column="fold_assignments", training_frame=cars)
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True

  # 4. fold_column and fold_assignment both specified (left disabled in the
  # original; sketched here with the estimator API used by cases 2 and 3)
  # try:
  #   rf = H2ORandomForestEstimator(fold_assignment="Random")
  #   rf.train(y=response_col, x=predictors, fold_column="fold_assignments", training_frame=cars)
  #   assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
  # except EnvironmentError:
  #   assert True



if __name__ == "__main__":
  pyunit_utils.standalone_test(cv_carsRF)
else:
  cv_carsRF()
Example #54
    # print(t.cell_values)
    exp = [(u'', 1, 0.010526315789473684, 0.9656726046291464, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.026143790849673203, 148.36601307189542, 148.36601307189542), (u'', 2, 0.021052631578947368, 0.958934346136156, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.05228758169934641, 148.36601307189542, 148.36601307189542), (u'', 3, 0.031578947368421054, 0.9507825261794234, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.0784313725490196, 148.36601307189542, 148.36601307189542), (u'', 4, 0.042105263157894736, 0.9422672415967039, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.10457516339869281, 148.36601307189542, 148.36601307189542), (u'', 5, 0.05, 0.9301225958876777, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.0196078431372549, 0.12418300653594772, 148.36601307189542, 148.36601307189542), (u'', 6, 0.1, 0.9044146434092466, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.12418300653594772, 0.24836601307189543, 148.36601307189542, 148.36601307189542), (u'', 7, 0.15, 0.8446852887955882, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.12418300653594772, 0.37254901960784315, 148.36601307189542, 148.36601307189542), (u'', 8, 0.2, 0.7961432029967228, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.12418300653594772, 0.49673202614379086, 148.36601307189542, 148.36601307189542), (u'', 9, 0.3, 0.6723258370286895, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.24836601307189543, 0.7450980392156863, 148.36601307189542, 148.36601307189542), (u'', 10, 0.4, 0.4587689423764878, 1.6993464052287583, 2.287581699346405, 0.6842105263157895, 0.9210526315789473, 0.16993464052287582, 0.9150326797385621, 69.93464052287584, 128.7581699346405), (u'', 11, 0.5, 0.2941654296210501, 0.7843137254901961, 1.9869281045751637, 0.3157894736842105, 0.8, 0.0784313725490196, 0.9934640522875817, -21.568627450980394, 98.69281045751637), (u'', 12, 0.6, 0.19369580737381084, 0.06535947712418301, 1.6666666666666667, 0.02631578947368421, 0.6710526315789473, 0.006535947712418301, 1.0, -93.4640522875817, 66.66666666666667), (u'', 13, 0.7, 0.11690110696439827, 0.0, 1.4285714285714286, 0.0, 0.575187969924812, 0.0, 1.0, -100.0, 42.85714285714286), (u'', 14, 0.8, 0.08004746870641981, 0.0, 1.25, 0.0, 0.5032894736842105, 0.0, 1.0, -100.0, 25.0), (u'', 15, 0.9, 0.04735532042158167, 0.0, 1.1111111111111112, 0.0, 0.4473684210526316, 0.0, 1.0, -100.0, 11.111111111111116), (u'', 16, 1.0, 0.009748408811701144, 0.0, 1.0, 0.0, 0.4026315789473684, 0.0, 1.0, -100.0, 0.0)]
    mycomp(exp, t.cell_values)

    t = m.gains_lift(valid=True)
    mycomp(exp, t.cell_values)

    p = m.model_performance(df)
    t = p.gains_lift()
    mycomp(exp, t.cell_values)


    m = H2OGradientBoostingEstimator(nfolds=3, seed=1234)
    m.train(x=df.names,y="CAPSULE", training_frame=df, validation_frame=df)
    t = m.gains_lift(xval=True)
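    # xval=True pulls the gains/lift table computed on the cross-validation
    # holdout predictions rather than the training data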

    # print(t.cell_values)
    exp2 = [(u'', 1, 0.010526315789473684, 0.9677782562476234, 1.2418300653594772, 1.2418300653594772, 0.5, 0.5, 0.013071895424836602, 0.013071895424836602, 24.183006535947715, 24.183006535947715), (u'', 2, 0.021052631578947368, 0.9582846040782473, 1.8627450980392157, 1.5522875816993464, 0.75, 0.625, 0.0196078431372549, 0.032679738562091505, 86.27450980392157, 55.22875816993464), (u'', 3, 0.031578947368421054, 0.9458499103092155, 2.4836601307189543, 1.8627450980392157, 1.0, 0.75, 0.026143790849673203, 0.058823529411764705, 148.36601307189542, 86.27450980392157), (u'', 4, 0.042105263157894736, 0.9331874956273033, 1.8627450980392157, 1.8627450980392157, 0.75, 0.75, 0.0196078431372549, 0.0784313725490196, 86.27450980392157, 86.27450980392157), (u'', 5, 0.05, 0.9319212918270888, 2.4836601307189543, 1.9607843137254903, 1.0, 0.7894736842105263, 0.0196078431372549, 0.09803921568627451, 148.36601307189542, 96.07843137254903), (u'', 6, 0.1, 0.8704014317587268, 2.2222222222222223, 2.0915032679738563, 0.8947368421052632, 0.8421052631578947, 0.1111111111111111, 0.20915032679738563, 122.22222222222223, 109.15032679738563), (u'', 7, 0.15, 0.8022612148480965, 1.5686274509803921, 1.9172113289760349, 0.631578947368421, 0.7719298245614035, 0.0784313725490196, 0.2875816993464052, 56.86274509803921, 91.72113289760348), (u'', 8, 0.2, 0.7409640897307539, 1.6993464052287583, 1.8627450980392157, 0.6842105263157895, 0.75, 0.08496732026143791, 0.37254901960784315, 69.93464052287584, 86.27450980392157), (u'', 9, 0.3, 0.5840891361136157, 1.5686274509803921, 1.7647058823529413, 0.631578947368421, 0.7105263157894737, 0.1568627450980392, 0.5294117647058824, 56.86274509803921, 76.47058823529413), (u'', 10, 0.4, 0.4462887172671538, 1.3725490196078434, 1.6666666666666667, 0.5526315789473685, 0.6710526315789473, 0.13725490196078433, 0.6666666666666666, 37.25490196078434, 66.66666666666667), (u'', 11, 0.5, 0.3193859623494622, 0.9803921568627452, 1.5294117647058825, 0.39473684210526316, 0.6157894736842106, 0.09803921568627451, 0.7647058823529411, -1.9607843137254832, 52.941176470588246), (u'', 12, 0.6, 0.2340751507622484, 0.7843137254901961, 1.4052287581699348, 0.3157894736842105, 0.5657894736842105, 0.0784313725490196, 0.8431372549019608, -21.568627450980394, 40.522875816993476), (u'', 13, 0.7, 0.14629536699033518, 0.5882352941176471, 1.288515406162465, 0.23684210526315788, 0.518796992481203, 0.058823529411764705, 0.9019607843137255, -41.17647058823529, 28.851540616246506), (u'', 14, 0.8, 0.09247017777496397, 0.5228758169934641, 1.1928104575163399, 0.21052631578947367, 0.48026315789473684, 0.05228758169934641, 0.954248366013072, -47.71241830065359, 19.281045751633986), (u'', 15, 0.9, 0.04779416944259696, 0.19607843137254902, 1.0820624546114743, 0.07894736842105263, 0.43567251461988304, 0.0196078431372549, 0.9738562091503268, -80.3921568627451, 8.206245461147432), (u'', 16, 1.0, 0.009938670599098145, 0.26143790849673204, 1.0, 0.10526315789473684, 0.4026315789473684, 0.026143790849673203, 1.0, -73.8562091503268, 0.0)]
    mycomp(exp2, t.cell_values)


    p = m.model_performance(df)
    t = p.gains_lift()
    mycomp(exp, t.cell_values)


if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_2118)
else:
    pubdev_2118()
    fractions = dict()
    fractions["real_fraction"] = 0
    fractions["categorical_fraction"] = 1
    fractions["integer_fraction"] = 0
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # string columns are currently dropped, so no point in having them
    fractions["binary_fraction"] = 0
    
    # this used to raise an error; it should now succeed
    try:
        traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False, factors=9999999, seed=12345, **fractions)
    except Exception as ex:
        print(ex)
        sys.exit(1)  # unexpected failure

    # this should raise an error: factors above the 10,000,000 limit
    try:
        traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False, factors=19999999, seed=12345, **fractions)
        sys.exit(1) # should have thrown an error
    except Exception as ex: # expect an error here
        print(ex)
        if 'Number of factors must be <= 10,000,000' in ex.args[0].dev_msg:
            sys.exit(0) # correct error message
        else:
            sys.exit(1) # something else is wrong.


if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_6304)
else:
    pubdev_6304()
Example #56
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


# I copied this test from Jeff Plourde.  Thank you.
# This test just needs to run to completion without error; no assert statement is needed here.
def remove_collinear_columns_multinomial():
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_rcc.csv"))
    train[0] = train[0].asfactor()
    mdl = H2OGeneralizedLinearEstimator(solver='IRLSM',
                                        family='multinomial',
                                        link='family_default',
                                        seed=76,
                                        lambda_=[0],
                                        max_iterations=100000,
                                        beta_epsilon=1e-7,
                                        early_stopping=False,
                                        standardize=True,
                                        remove_collinear_columns=True)
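    # start() launches training asynchronously; join() below blocks until the
    # build finishes, so together they act like a blocking train() call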
    mdl.start(x=train.col_names[1:],
              y=train.col_names[0],
              training_frame=train)
    mdl.join()
    print("test completed.")


if __name__ == "__main__":
    pyunit_utils.standalone_test(remove_collinear_columns_multinomial)
else:
    remove_collinear_columns_multinomial()
import sys

sys.path.insert(1, "../../../")
import h2o
from tests import pyunit_utils
from tests.pyunit_utils import CustomOneFuncStr, \
    assert_all_metrics_equal, regression_model
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def test_custom_metric_from_str():
    custom_metric = h2o.upload_custom_metric(CustomOneFuncStr, class_name="CustomOneFunc", func_name="custom_mm")
    (model2, f_test2) = regression_model(H2OGradientBoostingEstimator, custom_metric)
    assert_all_metrics_equal(model2, f_test2, "custom_mm", 1)
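
# For orientation, a custom metric source string such as CustomOneFuncStr (whose
# real body lives in pyunit_utils) generally defines a class with map/reduce/metric
# methods; a minimal hypothetical sketch that always evaluates to 1 might look like:
#
# CustomOneFuncStr = '''class CustomOneFunc:
#     def map(self, pred, act, w, o, model):
#         return [1, 1]
#     def reduce(self, l, r):
#         return [l[0] + r[0], l[1] + r[1]]
#     def metric(self, l):
#         return l[0] / l[1]
# '''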


__TESTS__ = [
    test_custom_metric_from_str
]

if __name__ == "__main__":
    for func in __TESTS__:
        pyunit_utils.standalone_test(func)
else:
    for func in __TESTS__:
        func()
Example #58
sys.path.insert(1, "../../")
import h2o
from tests import pyunit_utils


def test_relevel_by_freq_topn():
    prostate_cat = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    dpros_levels_ordered = prostate_cat["DPROS"].table().as_data_frame()["DPROS"].tolist()
    assert dpros_levels_ordered == ["Both", "Left", "None", "Right"]

    prostate_cat_relevel = prostate_cat.relevel_by_frequency(top_n=1)

    dpros_relevel_levels = prostate_cat_relevel["DPROS"].table().as_data_frame()["DPROS"].tolist()
    assert dpros_relevel_levels == ['Left', 'Both', 'None', 'Right']

    top_dpros_level = prostate_cat["DPROS"].table().as_data_frame() \
        .sort_values(by="Count")["DPROS"].tolist()[-1]
    prostate_cat_relevel_manual = prostate_cat["DPROS"].relevel(y=top_dpros_level)
    assert prostate_cat_relevel_manual.levels() == [dpros_relevel_levels]


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_relevel_by_freq_topn)
else:
    test_relevel_by_freq_topn()
Example #59
    h2o_data.impute(column="C4", method="median", combine_method="high")
    c4_imputed = h2o_data[2,3]
    assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format(c4_imputed)

    # mode-categorical
    h2o_data = h2o.H2OFrame(list(zip(*data)))
    h2o_data.impute(column="C5", method="mode")
    c5_imputed = h2o_data[4,4]
    assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format(c5_imputed)

    # mode-numeric
    h2o_data = h2o.H2OFrame(list(zip(*data)))
    h2o_data.impute(column="C6", method="mode")
    c6_imputed = h2o_data[5,5]
    assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format(c6_imputed)

    # mean-group by C7
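    # (each missing C3 value is filled with the mean of its own C7 group)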
    h2o_data = h2o.H2OFrame(list(zip(*data)))
    h2o_data.impute(column="C3", method="mean", by="C7")
    imputed1 = h2o_data[2,2]
    imputed2 = h2o_data[3,2]
    assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format(imputed1)
    assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format(imputed2)



if __name__ == "__main__":
    pyunit_utils.standalone_test(impute2)
else:
    impute2()
    model1Seeds = ','.join(str(x) for x in model_seeds1[0:model_len])
    model2Seeds = ','.join(str(x) for x in model_seeds2[0:model_len])
    assert model1Seeds == model2Seeds, "Model seeds are not equal: gridsearch 1 seeds %s; " \
                                       "gridsearch 2 seeds %s" % (model1Seeds, model2Seeds)

    # compare training_rmse from scoring history
    model1seed = air_grid1.models[0].full_parameters['seed']['actual_value']
    index2 = 0  # find the model in grid2 with the same seed
    for ind in range(0, len(air_grid2.models)):
        if air_grid2.models[ind].full_parameters['seed']['actual_value'] == model1seed:
            index2 = ind
            break

    metric_list1 = pyunit_utils.extract_scoring_history_field(
        air_grid1.models[0], "training_rmse", False)
    metric_list2 = pyunit_utils.extract_scoring_history_field(
        air_grid2.models[index2], "training_rmse", False)
    print(metric_list1)
    print(metric_list2)

    assert pyunit_utils.equal_two_arrays(metric_list1, metric_list2, 1e-5, 1e-6, False), \
        "Training RMSE values differ between the two grid search models.  Tests are supposed to be repeatable " \
        "in this case.  Make sure model seeds are actually set correctly in the Java backend."


if __name__ == "__main__":
    pyunit_utils.standalone_test(random_grid_model_seeds_PUBDEV_4090)
else:
    random_grid_model_seeds_PUBDEV_4090()