def iris_nfolds():
    """Build random-forest models on the iris data with nfolds=5.

    First a model is built through ``h2o.random_forest`` and shown. Then an
    ``H2ORandomForestEstimator`` is trained with both ``nfolds`` and a
    ``validation_frame``; this combination is expected to be accepted, so an
    ``EnvironmentError`` from that call fails the test.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    # NOTE(review): no training_frame is passed to train() here -- confirm
    # that this is intended.
    try:
        H2ORandomForestEstimator(ntrees=50, nfolds=5).train(
            y=4, x=list(range(4)), validation_frame=iris)
    except EnvironmentError:
        # The previous message ("expected an error") contradicted this branch:
        # reaching it means an error occurred where none was expected.
        assert False, ("expected no error when both nfolds and "
                       "validation_frame are specified")


if __name__ == "__main__":
    pyunit_utils.standalone_test(iris_nfolds)
else:
    iris_nfolds()
y=iris_train[4], validation_x=iris_valid[["C1","C2","C3"]], validation_y=iris_valid[4], ntrees=5, distribution="multinomial", weights_column="C2", training_frame=iris_train, validation_frame=iris_valid) # validation_frame not specified, weights not part of validation_x try: gbm4_multinomial = h2o.gbm(x=iris_train[["C1","C2","C3"]], y=iris_train[4], validation_x=iris_valid[["C1","C2","C3"]], validation_y=iris_valid[4], ntrees=5, distribution="multinomial", weights_column="C4") assert False, "expected an error" except: assert True if __name__ == "__main__": pyunit_utils.standalone_test(weights_api) else: weights_api()
def bigcatRF():
    """Train a single-tree, depth-one random forest on bigcat_5000x2.csv and show it."""
    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0
    csv_path = pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    frame = h2o.import_file(path=csv_path)

    # Treat the response as categorical.
    frame["y"] = frame["y"].asfactor()

    # DRF (naive split): one tree, depth one, nbins = 100, nbins_cats = 10.
    drf = H2ORandomForestEstimator(
        ntrees=1,
        max_depth=1,
        nbins=100,
        nbins_cats=10,
    )
    drf.train(x="X", y="y", training_frame=frame)
    drf.show()


if __name__ == "__main__":
    pyunit_utils.standalone_test(bigcatRF)
else:
    bigcatRF()
[0.7297297297297298,66.05405405405405,2.0,0.0,1.0,23.270270270270274,9.589189189189193,7.27027027027027], [0.01754385964912314,70.35087719298245,2.0,1.0,-1.3877787807814457E-17,10.078947368421053, 42.37543859649123,6.157894736842105], [0.9,65.95,2.0,0.0,0.2,81.94500000000001,16.375,7.4], [0.9999999999999989,65.48598130841121,2.0,3.0,1.3877787807814457E-16,13.3092523364486, 13.268411214953275,6.747663551401869]] initial_y_h2o = h2o.H2OFrame(list(initial_y)) glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="User", user_y=initial_y_h2o) glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF) glrm_h2o.show() # exercise logistic loss with numeric columns glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="User", user_y=initial_y_h2o) glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num) glrm_h2o_num.show() # singular values from glrm models should equal if binary columns with binary loss are read in as either # categorical or numerics. If not, something is wrong. assert pyunit_utils.equal_two_arrays(glrm_h2o._model_json["output"]["singular_vals"], glrm_h2o_num._model_json["output"]["singular_vals"], 1e-6, 1e-4), \ "Singular values obtained from logistic loss with column type as enum and numeric do not agree. Fix it now." sys.stdout.flush() if __name__ == "__main__": pyunit_utils.standalone_test(glrm_pubdev_3756_arrest) else: glrm_pubdev_3756_arrest()
cls_bias = mx.sym.Variable('cls_bias') fc = mx.sym.FullyConnected(data=h_drop, weight=cls_weight, bias=cls_bias, num_hidden=num_label) # softmax output sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax') return sm def deepwater_tweets(): if not H2ODeepWaterEstimator.available(): return tweets = h2o.import_file(pyunit_utils.locate("/home/arno/tweets.txt"), col_names=["text"], sep="|") labels = h2o.import_file(pyunit_utils.locate("/home/arno/labels.txt"), col_names=["label"]) frame = tweets.cbind(labels) print(frame.head(5)) # cnn = make_text_cnn(sentence_size=100, num_embed=300, batch_size=32, # vocab_size=100000, dropout=dropout, with_embedding=with_embedding) model = H2ODeepWaterEstimator(epochs=50000, learning_rate=1e-3, hidden=[100,100,100,100,100]) model.train(x=[0],y=1, training_frame=frame) model.show() error = model.model_performance(train=True).mean_per_class_error() assert error < 0.1, "mean classification error is too high : " + str(error) if __name__ == "__main__": pyunit_utils.standalone_test(deepwater_tweets) else: deepwater_tweets()
from __future__ import print_function import sys, os sys.path.insert(1, os.path.join("..","..")) import h2o from tests import pyunit_utils from h2o.estimators.deeplearning import H2ODeepLearningEstimator def deeplearning_multi(): print("Test checks if Deep Learning works fine with a multiclass training and test dataset") prostate = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv")) prostate[4] = prostate[4].asfactor() hh = H2ODeepLearningEstimator(loss="CrossEntropy") hh.train(x=[0,1],y=4, training_frame=prostate, validation_frame=prostate) hh.show() if __name__ == "__main__": pyunit_utils.standalone_test(deeplearning_multi) else: deeplearning_multi()
ytrain = trainDataResponse.tolist() trainData = h2o.H2OFrame.fromPython([ytrain]+xtrain) trainData[0] = trainData[0].asfactor() print("Run model on 3250 columns of Arcene with strong rules off.") model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=False, alpha=1) model.train(x=range(1,3250), y=0, training_frame=trainData) print("Test model on validation set.") validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ') validDataResponse = np.where(validDataResponse == -1, 0, 1) validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ') xvalid = np.transpose(validDataFeatures).tolist() yvalid = validDataResponse.tolist() validData = h2o.H2OFrame.fromPython([yvalid]+xvalid) prediction = model.predict(validData) print("Check performance of predictions.") performance = model.model_performance(validData) print("Check that prediction AUC better than guessing (0.5).") assert performance.auc() > 0.5, "predictions should be better then pure chance" if __name__ == "__main__": pyunit_utils.standalone_test(wide_dataset_large) else: wide_dataset_large()
print("check unsuccessful! h2o computed {0} and numpy computed {1}".format(h2o_val, num_val)) return success h2o_val = h2o_data.min() num_val = np.min(np_data) assert abs(h2o_val - num_val) < 1e-06, ( "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal min values between h2o and " "numpy".format(h2o_val, num_val) ) h2o_val = h2o_data.max() num_val = np.max(np_data) assert abs(h2o_val - num_val) < 1e-06, ( "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal max values between h2o and " "numpy".format(h2o_val, num_val) ) h2o_val = h2o_data.sum() num_val = np.sum(np_data) assert abs(h2o_val - num_val) < 1e-06, ( "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal sum values between h2o and " "numpy".format(h2o_val, num_val) ) pyunit_utils.np_comparison_check( h2o_data.var(), np.cov(np_data, rowvar=0, ddof=1), 10 ), "expected equal var values between h2o and numpy" if __name__ == "__main__": pyunit_utils.standalone_test(expr_reducers) else: expr_reducers()
import h2o
from h2o.estimators import H2OXGBoostEstimator
from tests import pyunit_utils


# Create many small models
def models_stress_test():
    """Train 1000 one-tree, depth-two XGBoost models on the airlines training data."""
    csv_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(csv_path)

    predictors = ["Origin", "Distance"]
    for _ in range(1000):
        # A fresh estimator is built on every iteration.
        estimator = H2OXGBoostEstimator(ntrees=1, max_depth=2)
        estimator.train(x=predictors, y="IsDepDelayed", training_frame=airlines)


if __name__ == "__main__":
    pyunit_utils.standalone_test(models_stress_test)
else:
    models_stress_test()
pyunit_utils.compare_frames_local( pred_h2o, pred_mojo, 0.1, tol=1e-10 ) # make sure operation sequence is preserved from Tomk h2o.save_model(glmOrdinalModel, path=TMPDIR, force=True) # save model for debugging print("Comparing pojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10) def set_params(): global PROBLEM #missingValues = ['Skip', 'MeanImputation'] missingValues = ['MeanImputation'] PROBLEM = "multinomial" print("PROBLEM is {0}".format(PROBLEM)) missing_values = missingValues[randint(0, len(missingValues) - 1)] reg = 1.0 / 250000.0 params = { 'missing_values_handling': missing_values, 'family': "ordinal", 'alpha': [0.5], 'lambda_': [reg], 'obj_reg': reg } print(params) return params if __name__ == "__main__": pyunit_utils.standalone_test(glm_ordinal_mojo_pojo) else: glm_ordinal_mojo_pojo()
colnames = doubled_data.pop(0) for idx, w in enumerate(doubled_weights[0]): if w == 2: doubled_data.append(doubled_data[idx]) doubled_data = zip(*doubled_data) h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data) h2o_data_doubled.set_names(list(colnames)) h2o_data_doubled["economy_20mpg"] = h2o_data_doubled[ "economy_20mpg"].asfactor() h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor() h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights[ "economy_20mpg"].asfactor() h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights[ "cylinders"].asfactor() print "Checking that doubling some weights is equivalent to doubling those observations:" print check_same(h2o_data_doubled, h2o_data_doubled_weights, 1) # TODO: random weights # TODO: all zero weights??? # TODO: negative weights??? if __name__ == "__main__": pyunit_utils.standalone_test(weights_check) else: weights_check()
x = ["C1", "C2"] y = "C11" gam_cols1 = ["C6", ["C7", "C8"], "C9", "C10"] gam_cols2 = [["C6"], ["C7", "C8"], ["C9"], ["C10"]] h2o_model1 = H2OGeneralizedAdditiveEstimator(family='multinomial', gam_columns=gam_cols1, bs=[1, 1, 0, 0], max_iterations=2) h2o_model1.train(x=x, y=y, training_frame=train, validation_frame=test) h2o_model2 = H2OGeneralizedAdditiveEstimator(family='multinomial', gam_columns=gam_cols2, bs=[1, 1, 0, 0], max_iterations=2) h2o_model2.train(x=x, y=y, training_frame=train, validation_frame=test) # check that both models produce the same coefficients print(h2o_model1.coef()) print(h2o_model2.coef()) pyunit_utils.assertCoefDictEqual(h2o_model1.coef()['coefficients'], h2o_model2.coef()['coefficients'], tol=1e-6) # check both models product the same validation metrics assert abs(h2o_model1.logloss(valid=True) - h2o_model2.logloss(valid=True)) < 1e-6,\ "Expected validation logloss: {0}, Actual validation logloss: {1}".format(h2o_model1.logloss(valid=True), h2o_model2.logloss(valid=True)) if __name__ == "__main__": pyunit_utils.standalone_test(test_gam_dual_mode_multinomial) else: test_gam_dual_mode_multinomial()
import os
sys.path.insert(1, os.path.join("../../../h2o-py"))
from tests import pyunit_utils
import h2o
from h2o.exceptions import H2OServerError


def trace_request():
    """Check that a TRACE request to /3/Cloud is rejected with an H2OServerError.

    The request must raise ``H2OServerError`` and the error text must start
    with either "HTTP 500" or "HTTP 405 Method Not Allowed".
    """
    err = None
    try:
        h2o.api("TRACE /3/Cloud")
    except H2OServerError as e:
        err = e

    # Verify an error was captured *before* reading err.args; otherwise a
    # request that unexpectedly succeeds fails with AttributeError on None
    # instead of this assertion.
    assert err is not None, "expected H2OServerError from the TRACE request"
    msg = str(err.args[0])

    print("<Error message>")
    print(msg)
    print("</Error Message>")

    # exact message depends on Jetty Version and security settings
    assert msg.startswith("HTTP 500") or msg.startswith(
        "HTTP 405 Method Not Allowed")


if __name__ == "__main__":
    pyunit_utils.standalone_test(trace_request)
else:
    trace_request()
# 2. more folds than observations try: rf = H2ORandomForestEstimator(nfolds=cars.nrow+1, fold_assignment="Modulo") rf.train(y=response_col, x=predictors, training_frame=cars) assert False, "Expected model-build to fail when nfolds > nobs" except EnvironmentError: assert True # 3. fold_column and nfolds both specified try: rf = H2ORandomForestEstimator(nfolds=3) rf.train(y=response_col, x=predictors, fold_column="fold_assignments", training_frame=cars) assert False, "Expected model-build to fail when fold_column and nfolds both specified" except EnvironmentError: assert True # # 4. fold_column and fold_assignment both specified # try: # rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], fold_assignment="Random", # fold_column="fold_assignments", training_frame=cars) # assert False, "Expected model-build to fail when fold_column and fold_assignment both specified" # except EnvironmentError: # assert True if __name__ == "__main__": pyunit_utils.standalone_test(cv_carsRF) else: cv_carsRF()
max_iterations=7, solver=solver) model.train(training_frame=split_frames[0], x=x_indices, y=y_index, validation_frame=split_frames[1]) modelCheckpoint = H2OGeneralizedLinearEstimator(family=family, checkpoint=model.model_id, solver=solver) modelCheckpoint.train(training_frame=split_frames[0], x=x_indices, y=y_index, validation_frame=split_frames[1]) modelLong = H2OGeneralizedLinearEstimator( family=family, solver=solver) # allow to run to completion modelLong.train(training_frame=split_frames[0], x=x_indices, y=y_index, validation_frame=split_frames[1]) pyunit_utils.assertEqualCoeffDicts(modelCheckpoint.coef(), modelLong.coef(), tol=5e-2) if __name__ == "__main__": pyunit_utils.standalone_test(testGLMCheckpointBinomial) else: testGLMCheckpointBinomial()
# bernoulli - offset not supported #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1", # training_frame=cars) #predictions = dl.predict(cars) # gamma dl = H2ODeepLearningEstimator(distribution="gamma") dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset") predictions = dl.predict(insurance) # gaussian dl = H2ODeepLearningEstimator(distribution="gaussian") dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset") predictions = dl.predict(insurance) # poisson dl = H2ODeepLearningEstimator(distribution="poisson") dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset") predictions = dl.predict(insurance) # tweedie dl = H2ODeepLearningEstimator(distribution="tweedie") dl.train(x=list(range(3)),y="Claims", training_frame=insurance, offset_column="offset") predictions = dl.predict(insurance) if __name__ == "__main__": pyunit_utils.standalone_test(offsets_and_distributions) else: offsets_and_distributions()
except H2OValueError: # as designed pass compare_frames(badFrame, badClone) originalAfterOp = H2OFrame.get_frame(badFrame.frame_id) compare_frames(badFrame, originalAfterOp) goodFrame = H2OFrame({"one": [4, 6, 1], "two": ["a", "b", "cde"]}) goodClone = H2OFrame({"one": [4, 6, 1], "two": ["a", "b", "cde"]}) compare_frames(goodFrame, goodClone) factoredFrame = goodFrame.asfactor() originalAfterOp = H2OFrame.get_frame(goodFrame.frame_id) compare_frames(goodFrame, originalAfterOp) expectedFactoredFrame = H2OFrame({"one": [4, 6, 1], "two": ["a", "b", "cde"]}, column_types={"one":"categorical", "two": "enum"}) compare_frames(expectedFactoredFrame, factoredFrame) refactoredFrame = expectedFactoredFrame.asfactor() factoredAfterOp = H2OFrame.get_frame(refactoredFrame.frame_id) compare_frames(expectedFactoredFrame, factoredAfterOp) if __name__ == "__main__": pyunit_utils.standalone_test(test1) else: test1()
import sys
sys.path.insert(1, "../../")
import h2o
from tests import pyunit_utils


def vec_as_list():
    """Spot-check ``h2o.as_list`` on the first iris column, before and after ``2 - iris``.

    Three entries of the converted list are compared against fixed expected
    values with an absolute tolerance of 1e-10.
    """
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # First column of the untouched frame.
    listed = h2o.as_list(iris[0], use_pandas=False)
    for position, expected in ((4, 4.6), (6, 5.4), (10, 4.9)):
        assert abs(float(listed[0][position]) - expected) < 1e-10, "incorrect values"

    # First column after subtracting the whole frame from 2.
    flipped = 2 - iris
    listed = h2o.as_list(flipped[0], use_pandas=False)
    for position, expected in ((4, -2.6), (18, -3.1), (25, -2.8)):
        assert abs(float(listed[0][position]) - expected) < 1e-10, "incorrect values"


if __name__ == "__main__":
    pyunit_utils.standalone_test(vec_as_list)
else:
    vec_as_list()
ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution, score_each_iteration=True, validation_x=valid[predictors], validation_y=valid[response_col], checkpoint=model1._id) model4 = h2o.gbm(x=train[predictors], y=train[response_col], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution, score_each_iteration=True, validation_x=valid[predictors], validation_y=valid[response_col]) assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True)) assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True)) assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True)) if __name__ == "__main__": pyunit_utils.standalone_test(pubdev_1829) else: pubdev_1829()
import h2o
from h2o.exceptions import H2OResponseError
from tests import pyunit_utils


def pubdev_4863():
    """Check that a rapids expression using an identifier that starts with digits is rejected.

    The call must raise ``H2OResponseError`` whose text contains the
    name-lookup failure message; if no error is raised the test fails.
    """
    expression = "(tmp= digi_temp (cols_py 123STARTSWITHDIGITS 'a'))"
    try:
        h2o.rapids(expression)
    except H2OResponseError as exc:
        print(exc)
        rendered = str(exc)
        assert 'Error: Name lookup of \'123STARTSWITHDIGITS\' failed' in rendered
    else:
        # The rapids call returned normally, which is not expected.
        assert False


if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_4863)
else:
    pubdev_4863()
def test_property_disabled(): print("\n=== disabling "+kcvm+" ===") grid_search = setup_grid() train = prepare_data() grid_search.train(x=range(4), y=4, training_frame=train, nfolds=nfolds, keep_cross_validation_models=False) keys = list_keys_in_memory() tot, cv = len(keys['models']), len(keys['cv_models']) print("total grid models in memory = {tot}, among which {cv} CV models".format(tot=tot, cv=cv)) assert tot > 0, "no grid models left in memory" assert cv == 0, "{cv} CV models were not cleaned from memory".format(cv=cv) for m in grid_search.models: assert not m.cross_validation_models(), "unexpected cv models for model "+m test_defaults() test_property_enabled() test_property_disabled() def test_all(): test_keep_cross_validation_predictions_on_gbm_grid() test_keep_cross_validation_models_on_gbm_grid() if __name__ == "__main__": pyunit_utils.standalone_test(test_all) else: test_all()
model_index += 1 if (diff > self.diff) or not(grid_model_metrics == sorted(grid_model_metrics)) or (diff_train < self.diff): self.test_failed = 1 print("test_rf_gridsearch_sorting_metrics for random forest has failed!") if self.test_failed == 0: print("test_rf_gridsearch_sorting_metrics for random forest has passed!") def test_gridsearch_sorting_metrics(): """ Create and instantiate class and perform tests specified for random forest :return: None """ test_rf_grid = Test_rf_gridsearch_sorting_metrics() test_rf_grid.test_rf_gridsearch_sorting_metrics() sys.stdout.flush() if test_rf_grid.test_failed: # exit with error if any tests have failed sys.exit(1) if __name__ == "__main__": pyunit_utils.standalone_test(test_gridsearch_sorting_metrics) else: test_gridsearch_sorting_metrics()
ones = np.where(a == 1)[0].size basis = ones == 1 and (zeros + ones) == k assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1" return basis np.apply_along_axis(is_basis, 1, fit_x_np) print "Check final objective function value" fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y] fit_y_np = np.array(fit_y_np) fit_xy = np.dot(fit_x_np, fit_y_np) glrm_obj = glrm_h2o._model_json['output']['objective'] sse = np.sum(np.square(train.__sub__(fit_xy))) assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse) print "Impute XY and check error metrics" pred_h2o = glrm_h2o.predict(train_h2o) pred_np = np.array(h2o.as_list(pred_h2o)) assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product" glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr'] glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr'] assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj) assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero" if __name__ == "__main__": pyunit_utils.standalone_test(glrm_unitonesparse) else: glrm_unitonesparse()
"test3_duplicated_parameter_specification failed: Java error exception ({0}) should not " "have been thrown! ".format(e)) else: print( "test3_duplicated_parameter_specification passed: Java error exception ({0}) should " "have been thrown and did.".format(e)) def test_grid_search_for_glm_over_all_params(): """ Create and instantiate class and perform tests specified for GLM :return: None """ test_glm_grid = Test_glm_grid_search() test_glm_grid.test1_glm_grid_search_over_params() test_glm_grid.test2_illegal_name_value() test_glm_grid.test3_duplicated_parameter_specification() sys.stdout.flush() if test_glm_grid.test_failed: # exit with error if any tests have failed sys.exit(1) else: # remove json files if everything passes test_glm_grid.tear_down() if __name__ == "__main__": pyunit_utils.standalone_test(test_grid_search_for_glm_over_all_params) else: test_grid_search_for_glm_over_all_params()
assert True # Log.info("Number of rows exceeds training set's") start = [[random.gauss(0,1) for c in range(numcol)] for r in range(numrow+2)] try: h2o.kmeans(x=benign_h2o, k=numrow+2, user_points=h2o.H2OFrame(start)) assert False, "expected an error" except EnvironmentError: assert True # Nones are replaced with mean of a column in H2O. Not sure about Inf. # Log.info("Any entry is NA, NaN, or Inf") start = [[random.gauss(0,1) for c in range(numcol)] for r in range(3)] for x in ["NA", "NaN", "Inf", "-Inf"]: start_err = start[:] start_err[1][random.randint(0,numcol-1)] = x h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start_err)) # Duplicates will affect sampling probability during initialization. # Log.info("Duplicate initial clusters specified") start = [[random.gauss(0,1) for c in range(numcol)] for r in range(3)] start[2] = start[0] h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start)) if __name__ == "__main__": pyunit_utils.standalone_test(init_err_casesKmeans) else: init_err_casesKmeans()
result_frame_allsubsets["model_id"][ind, 0]) pred_allsubsets = one_model_allsubsets.predict(d) print("last element of predictor frame: {0}".format( pred_allsubsets[pred_allsubsets.nrows - 1, pred_allsubsets.ncols - 1])) assert pred_allsubsets.nrows == d.nrows, "expected dataset row: {0}, actual dataset row: " \ "{1}".format(pred_allsubsets.nrows, d.nrows) best_r2_value_maxr = best_r2_maxr[ind] one_model_maxr = h2o.get_model(result_frame_maxr["model_id"][ind, 0]) pred_maxr = one_model_maxr.predict(d) pyunit_utils.compare_frames_local( pred_maxr, pred_allsubsets, prob=1, tol=1e-6) # compare allsubsets and maxr results # r2 from result frame frame_r2_allsubsets = result_frame_allsubsets["best_r2_value"][ind, 0] # r2 from model model_r2_allsubsets = one_model_allsubsets.r2() # make sure all r2 are equal assert abs(best_r2_value_allsubsets-frame_r2_allsubsets) < 1e-6, "expected best r2: {0}, actual best r2: " \ "{1}".format(best_r2_value_allsubsets, frame_r2_allsubsets) assert abs(frame_r2_allsubsets-model_r2_allsubsets) < 1e-6, "expected best r2: {0}, actual best r2: " \ "{1}".format(model_r2_allsubsets, frame_r2_allsubsets) assert abs(best_r2_value_maxr-model_r2_allsubsets) < 1e-6, "expected best r2: {0}, maxr best r2: {1}" \ "".format(best_r2_value_maxr, model_r2_allsubsets) if __name__ == "__main__": pyunit_utils.standalone_test(test_gaussian_result_frame_model_id) else: test_gaussian_result_frame_model_id()
# Connect to a pre-existing cluster # connect to localhost:54321 # Log.info("Importing benign.csv data...\n") benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv")) # benign_h2o.summary() benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",") # Impute missing values with column mean imp = Imputer(missing_values="NaN", strategy="mean", axis=0) benign_sci = imp.fit_transform(benign_sci) for i in range(2, 7): # Log.info("H2O K-Means") km_h2o = H2OKMeansEstimator(k=i) km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o) km_h2o.show() model = h2o.get_model(km_h2o._id) model.show() km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1) km_sci.fit(benign_sci) print "sckit centers" print km_sci.cluster_centers_ if __name__ == "__main__": pyunit_utils.standalone_test(get_modelKmeans) else: get_modelKmeans()
assert set(['a', 'b', 'c']) == set(levels), \ "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels) assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format( nlevels) iris[4] = iris[4].set_level(level='b') levels = iris.levels(col=4) nlevels = iris.nlevels(col=4) assert set(['a', 'b', 'c']) == set(levels), \ "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels) assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format( nlevels) assert iris[0, 4] == 'b' levels = iris[1].levels() nlevels = iris[1].nlevels() assert levels == None, "Expected levels to be None, but got {0}".format( levels) assert nlevels == 0, "Expected nlevels to be 0, but got {0}".format( nlevels) one_column_frame = iris[4] one_column_frame = one_column_frame.set_level(level='c') assert one_column_frame[0, 0] == 'c' if __name__ == "__main__": pyunit_utils.standalone_test(levels_nlevels_setlevel_setLevels_test) else: levels_nlevels_setlevel_setLevels_test()
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
import os


def remove_obj_client():
    """Print a GLM model and its training frame before and after ``h2o.remove``.

    A binomial GLM is trained on benign.csv with column 3 as the response and
    columns 0-2 and 4-10 as predictors. The model and then the frame are
    removed, and each object is printed again after its removal.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # list(...) keeps the concatenation valid on Python 3, where range()
    # objects do not support "+"; on Python 2 the result is unchanged.
    X = list(range(3)) + list(range(4, 11))

    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)

    # Single-argument print(...) prints the same text on Python 2 and 3,
    # whereas the bare "print x" statement is a SyntaxError on Python 3.
    print(model.model_id)
    print(model)

    model.train(x=X,y=Y, training_frame=training_data)
    print(model)

    h2o.remove(model)
    print(model)

    h2o.remove(training_data)
    print(training_data)


if __name__ == "__main__":
    pyunit_utils.standalone_test(remove_obj_client)
else:
    remove_obj_client()
max_iterations=miters) print(cross1_km) print( "Run k-means with init = final cluster centers and max_iterations = 1" ) init_centers = h2o.H2OFrame(cross1_km.centers()) cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent, user_points=init_centers, max_iterations=1) print(cross2_km) print("Check k-means converged or maximum iterations reached") c1 = h2o.H2OFrame(cross1_km.centers()) c2 = h2o.H2OFrame(cross2_km.centers()) avg_change = old_div(((c1 - c2)**2).sum(), ncent) iters = cross1_km._model_json['output']['model_summary'].cell_values[ 0][3] assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \ "{0} and iterations = {1}".format(avg_change, iters) else: raise EnvironmentError if __name__ == "__main__": pyunit_utils.standalone_test(hdfs_kmeans_converge) else: hdfs_kmeans_converge()
h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights) doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False) colnames = doubled_data.pop(0) for idx, w in enumerate(doubled_weights[0]): if w == 2: doubled_data.append(doubled_data[idx]) h2o_data_doubled = h2o.H2OFrame(doubled_data) h2o_data_doubled.set_names(list(colnames)) h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor() h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor() h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor() h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor() print("Checking that doubling some weights is equivalent to doubling those observations:") print() check_same(h2o_data_doubled, h2o_data_doubled_weights, 1) # TODO: random weights # TODO: all zero weights??? # TODO: negative weights??? if __name__ == "__main__": pyunit_utils.standalone_test(weights_check) else: weights_check()
reproducible=True, seed=1234) hh_balanced.train(x=range(54), y=54, training_frame=covtype) print hh_balanced #compare overall logloss class_6_err_imbalanced = hh_imbalanced.logloss() class_6_err_balanced = hh_balanced.logloss() if class_6_err_imbalanced < class_6_err_balanced: print "--------------------" print "" print "FAIL, balanced error greater than imbalanced error" print "" print "" print "class_6_err_imbalanced" print class_6_err_imbalanced print "" print "class_6_err_balanced" print class_6_err_balanced print "" print "--------------------" assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!" if __name__ == "__main__": pyunit_utils.standalone_test(imbalance) else: imbalance()
max_depth=1, min_rows=1, learn_rate=0.1, distribution="gaussian") gbm.train(x=range(3), y="Claims", training_frame=insurance, offset_column="offset") predictions = gbm.predict(insurance) # Comparison result generated from R's gbm: # fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1, # shrinkage = .1,bag.fraction = 1,train.fraction = 1, # data = Insurance, distribution ="gaussian", n.trees = 600) # pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600) # pr = pg - - log(Insurance$Holders) assert abs(44.33016 - gbm._model_json['output']['init_f']) < 1e-5, "expected init_f to be {0}, but got {1}". \ format(44.33016, gbm._model_json['output']['init_f']) assert abs(1491.135 - gbm.mse()) < 1e-2, "expected mse to be {0}, but got {1}".format(1491.135, gbm.mse()) assert abs(49.23438 - predictions.mean()) < 1e-2, "expected prediction mean to be {0}, but got {1}". \ format(49.23438, predictions.mean()) assert abs(-45.5720659304 - predictions.min()) < 1e-2, "expected prediction min to be {0}, but got {1}". \ format(-45.5720659304, predictions.min()) assert abs(207.387 - predictions.max()) < 1e-2, "expected prediction max to be {0}, but got {1}". \ format(207.387, predictions.max()) if __name__ == "__main__": pyunit_utils.standalone_test(offset_gaussian) else: offset_gaussian()
sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
from tests import pyunit_utils
from collections import OrderedDict
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def grid_parallel_cv():
    """Run a GBM grid search over ``ntrees`` with nfolds=3 and parallelism=2.

    The grid must produce exactly one model id per ``ntrees`` option.
    """
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    train = h2o.import_file(path=iris_path)

    # Run GBM Grid Search using Cross Validation with parallelization enabled
    ntrees_opts = [1, 3, 5]
    hyper_parameters = OrderedDict([("ntrees", ntrees_opts)])
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(
        H2OGradientBoostingEstimator,
        hyper_params=hyper_parameters,
        parallelism=2,
    )
    gs.train(x=list(range(4)), y=4, training_frame=train, nfolds=3)

    assert gs is not None
    expected_count = len(ntrees_opts)
    assert len(gs.model_ids) == expected_count


if __name__ == "__main__":
    pyunit_utils.standalone_test(grid_parallel_cv)
else:
    grid_parallel_cv()
dataset_params['randomize'] = True dataset_params['factors'] = random.randint(2,2000) dataset_params['response_factors'] = random.randint(3,100) print "Dataset parameters: {0}".format(dataset_params) train = h2o.create_frame(**dataset_params) print "Training dataset:" print train # Save dataset to results directory results_dir = pyunit_utils.locate("results") h2o.download_csv(train,os.path.join(results_dir,"nb_dynamic_training_dataset.log")) # Generate random parameters params = {} params['laplace'] = 0 if random.randint(0,1): params['laplace'] = random.uniform(0,11) print "Parameter list: {0}".format(params) x = train.names x.remove("response") y = "response" pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params) if __name__ == "__main__": pyunit_utils.standalone_test(javapredict_dynamic_data) else: javapredict_dynamic_data()
col_sample_rate_per_tree = 0.6 nfolds = 2 min_split_improvement = 1e-04 response = "class" features = train.col_names.remove(response) print("Train 100 GBM models to test if it fails.") for i in range(1, 100): seed = randint(1000, 2000) print(i, ": train model with random seed: ", seed) my_gbm = H2OGradientBoostingEstimator( ntrees=ntrees, max_depth=max_depth, min_rows=min_rows, learn_rate=learn_rate, sample_rate=sample_rate, col_sample_rate_per_tree=col_sample_rate_per_tree, nfolds=nfolds, min_split_improvement=min_split_improvement, seed=seed) my_gbm.train(x=features, y=response, training_frame=train, validation_frame=train) if __name__ == "__main__": pyunit_utils.standalone_test(test_pubdev_3847) else: test_pubdev_3847()
# pass # LHS: H2OFrame, RHS: H2OVec #try: # res = iris + iris[0] # res.show() # assert False, "expected error. objects of different dimensions not supported." #except EnvironmentError: # pass # LHS: H2OFrame, RHS: scaler # res = 1.2 + iris[2] # res2 = iris + res[21,:] # res2.show() # LHS: H2OFrame, RHS: scaler res = iris + 2 res_rows, res_cols = res.dim assert res_rows == rows and res_cols == cols, "dimension mismatch" for x, y in zip([res[c].sum() for c in range(cols-1)], [469.9, 342.6, 266.9, 162.2]): assert abs(x - y) < 1e-1, "expected same values" ################################################################### if __name__ == "__main__": pyunit_utils.standalone_test(binop_plus) else: binop_plus()
contributions = m.predict_contributions(first_row, top_n=50, bottom_n=50, compare_abs=True, output_format=output_format) check_sorted_correctly(contributions, first_row_sorted_desc_abs) contributions = m.predict_contributions(first_row, top_n=4, bottom_n=4, compare_abs=True, output_format=output_format) check_sorted_correctly(contributions, first_row_sorted_desc_abs) def check_sorted_correctly(contributions, python_sorted): assert_equals(15, contributions.shape[1], "Wrong number of columns") assert_equals(python_sorted[0][0], contributions[0, 0], "Not correctly sorted") assert_equals(python_sorted[1][0], contributions[0, 2], "Not correctly sorted") assert_equals(python_sorted[2][0], contributions[0, 4], "Not correctly sorted") assert_equals(python_sorted[3][0], contributions[0, 6], "Not correctly sorted") assert_equals(python_sorted[4][0], contributions[0, 8], "Not correctly sorted") assert_equals(python_sorted[5][0], contributions[0, 10], "Not correctly sorted") assert_equals(python_sorted[6][0], contributions[0, 12], "Not correctly sorted") def check_sorted_correcty_first_two_last_two(contributions, python_sorted_desc, python_sorted_asc): assert_equals(python_sorted_desc[0][0], contributions[0, 0], "Not correctly sorted") assert_equals(python_sorted_desc[1][0], contributions[0, 2], "Not correctly sorted") assert_equals(python_sorted_asc[0][0], contributions[0, 4], "Not correctly sorted") assert_equals(python_sorted_asc[1][0], contributions[0, 6], "Not correctly sorted") if __name__ == "__main__": pyunit_utils.standalone_test(xgboost_predict_contributions_sorting) else: xgboost_predict_contributions_sorting()
import h2o
from tests import pyunit_utils
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def weights_gamma():
    """
    Train a gamma-distribution GBM on moppe.csv using "antskad" as the weights column and
    check the model's init_f and the min / max / mean of its predictions against fixed
    reference values (relative tolerances 1e-6 for init_f, 1e-4 for the predictions).
    """
    htable = h2o.upload_file(pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    # NOTE(review): this is a self-assignment (no asfactor() as on the two columns above);
    # kept unchanged because the reference values below were produced with it - confirm intent.
    htable["zon"] = htable["zon"]

    hh = H2OGradientBoostingEstimator(distribution="gamma", ntrees=20, max_depth=1, min_rows=1, learn_rate=1)
    hh.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
    ph = hh.predict(htable)

    assert abs(8.804447 - hh._model_json['output']['init_f']) < 1e-6 * 8.804447
    assert abs(3751.01 - ph[0].min()) < 1e-4 * 3751.01
    assert abs(15298.87 - ph[0].max()) < 1e-4 * 15298.87
    assert abs(8121.98 - ph[0].mean()[0]) < 1e-4 * 8121.98


if __name__ == "__main__":
    pyunit_utils.standalone_test(weights_gamma)
else:
    weights_gamma()
mean_residual_deviance_history = extract_scoring_history_field(gbm, "training_deviance") print("History of training mean residual deviance during training is {0}".format(mean_residual_deviance_history)) assert abs(mean_residual_deviance_history[-1]-gbm_mrd) < 1e-12, "mean_residual_deviance function is not working." def extract_scoring_history_field(aModel, fieldOfInterest): """ Given a fieldOfInterest that are found in the model scoring history, this function will extract the list of field values for you from the model. :param aModel: H2O model where you want to extract a list of fields from the scoring history :param fieldOfInterest: string representing a field of interest. :return: List of field values or None if it cannot be found """ allFields = aModel._model_json["output"]["scoring_history"]._col_header if fieldOfInterest in allFields: cellValues = [] fieldIndex = allFields.index(fieldOfInterest) for eachCell in aModel._model_json["output"]["scoring_history"].cell_values: cellValues.append(eachCell[fieldIndex]) return cellValues else: return None if __name__ == "__main__": pyunit_utils.standalone_test(gbm_residual_deviance) else: gbm_residual_deviance()
hh_balanced = H2OGradientBoostingEstimator(ntrees=10, nfolds=3, distribution="multinomial", balance_classes=False) hh_balanced.train(x=range(54), y=54, training_frame=covtype) hh_balanced_perf = hh_balanced.model_performance(covtype) hh_balanced_perf.show() #compare error for class 6 (difficult minority) class_6_err_imbalanced = hh_imbalanced_perf.confusion_matrix().cell_values[5][7] class_6_err_balanced = hh_balanced_perf.confusion_matrix().cell_values[5][7] print("--------------------") print("") print("class_6_err_imbalanced") print(class_6_err_imbalanced) print("") print("class_6_err_balanced") print(class_6_err_balanced) print("") print("--------------------") assert class_6_err_imbalanced >= 0.90*class_6_err_balanced, "balance_classes makes it at least 10% worse!" if __name__ == "__main__": pyunit_utils.standalone_test(imbalanced_gbm) else: imbalanced_gbm()
# gather, print and save performance numbers for h2o model h2oModelD.train(x=myX, y=y, training_frame=trainFile) h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"] time1 = time.time() h2oPredictD = h2oModelD.predict(trainFile) h2oPredictTimeD = time.time() - time1 # train the native XGBoost nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols) nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain) nativeTrainTime = time.time() - time1 time1 = time.time() nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees) nativeScoreTime = time.time() - time1 pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD, nativeScoreTime, tolerance=testTol) if __name__ == "__main__": pyunit_utils.standalone_test(comparison_test_dense) else: comparison_test_dense()
H2OKMeansEstimator(max_iterations=0).train(x = range(ozone_h2o.ncol), training_frame=ozone_h2o) assert False, "expected an error" except EnvironmentError: assert True centers = start for i in range(miters): rep_fit = H2OKMeansEstimator(k=ncent, user_points=centers, max_iterations=1) rep_fit.train(x = range(ozone_h2o.ncol), training_frame=ozone_h2o) centers = h2o.H2OFrame(rep_fit.centers()) # Log.info(paste("Run k-means with max_iter=miters")) all_fit = H2OKMeansEstimator(k=ncent, user_points=start, max_iterations=miters) all_fit.train(x=range(ozone_h2o.ncol), training_frame=ozone_h2o) assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same" # Log.info("Check cluster centers have converged") all_fit2 = H2OKMeansEstimator(k=ncent, user_points=h2o.H2OFrame(all_fit.centers()), max_iterations=1) all_fit2.train(x=range(ozone_h2o.ncol), training_frame= ozone_h2o) avg_change = sum([sum([pow((e1 - e2),2) for e1, e2 in zip(c1,c2)]) for c1, c2 in zip(all_fit.centers(), all_fit2.centers())]) / ncent assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] == miters if __name__ == "__main__": pyunit_utils.standalone_test(convergeKmeans) else: convergeKmeans()
max_depth2 = max_depth1 min_rows2 = min_rows1 print("ntrees model 2: {0}".format(ntrees2)) print("max_depth model 2: {0}".format(max_depth2)) print("min_rows model 2: {0}".format(min_rows2)) model2 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution, checkpoint=restored_model.model_id) model2.train(x=list(range(1, milsong_train.ncol)), y=0, training_frame=milsong_train, validation_frame=milsong_valid) model3 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution) model3.train(x=list(range(1, milsong_train.ncol)), y=0, training_frame=milsong_train, validation_frame=milsong_valid) if __name__ == "__main__": pyunit_utils.standalone_test(milsong_checkpoint) else: milsong_checkpoint()
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA


def screeplot_test():
    """
    Train a 4-component PCA (transform="STANDARDIZE") on the first eight columns of
    AustraliaCoast.csv and draw both supported scree plot types ("barplot" and "lines").

    The test has no assertions; it passes when both screeplot calls complete without raising.
    """
    # NOTE(review): server=True presumably keeps the plot from opening an interactive window
    # on a headless test machine - confirm against the screeplot documentation.
    kwargs = {'server': True}

    australia = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv"))
    australia_pca = H2OPCA(k=4, transform="STANDARDIZE")
    australia_pca.train(x=list(range(8)), training_frame=australia)

    for plot_type in ("barplot", "lines"):
        australia_pca.screeplot(type=plot_type, **kwargs)


if __name__ == "__main__":
    pyunit_utils.standalone_test(screeplot_test)
else:
    screeplot_test()
print print "======================================================================" print "============================== Gaussian ==============================" print "======================================================================" for i in range(10): attack("gaussian", cars_train, cars_valid, random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1) print print "======================================================================" print "============================== Poisson ==============================" print "======================================================================" for i in range(10): attack("poisson", cars_train, cars_valid, random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2) print print "======================================================================" print "============================== Gamma ==============================" print "======================================================================" for i in range(10): attack("gamma", pros_train, pros_valid, random.sample([1, 2, 3, 5, 6, 7, 8], random.randint(1, 7)), 4) if __name__ == "__main__": pyunit_utils.standalone_test(random_attack) else: random_attack()
# /99/Rapids, parms: {ast=(tmp= py_8 (append py_7 (| (== (cols_py py_7 "WeekDay") "Sun") (== (cols_py py_7 "WeekDay") "Sat")) "Weekend"))} # DELETE /3/DKV/(?<key>.*), parms: {key=py_7} # /3/Frames/(?<frameid>.*), parms: {frame_id=py_8, row_count=10} crimes["Weekend"] = (crimes["WeekDay"] == "Sun") | (crimes["WeekDay"] == "Sat") # /99/Rapids, parms: {ast=(tmp= py_9 (append py_8 (cut (cols_py py_8 "Month") [0 2 5 7 10 12] ["Winter" "Spring" "Summer" "Autumn" "Winter"] FALSE TRUE 3) "Season"))} # DELETE /3/DKV/(?<key>.*), parms: {key=py_8} # /3/Frames/(?<frameid>.*), parms: {frame_id=py_9, row_count=10} crimes["Season"] = crimes["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"]) # /99/Rapids, parms: {ast=(tmp= py_10 (cols py_9 -3))} # DELETE /3/DKV/(?<key>.*), parms: {key=py_9} # /3/Frames/(?<frameid>.*), parms: {frame_id=py_10, row_count=10} crimes = crimes.drop("Date") crimes.describe() # DELETE /3/DKV/(?<key>.*), parms: {key=py_10} tmps1 = pyunit_utils.temp_ctr(); ntmps = tmps1-tmps0 rest1 = pyunit_utils.rest_ctr(); nrest = rest1-rest0 print(("Number of temps used: ",ntmps)) print(("Number of RESTs used: ",nrest)) assert ntmps <= 15 assert nrest <= 20 if __name__ == "__main__": pyunit_utils.standalone_test(date_munge) else: date_munge()
def pubdev_1431():
    """
    Import the airlines-billion CSV from HDFS, train a one-tree, depth-one bernoulli GBM on
    column 30 (converted to a factor) using columns 0-29, predict on the same frame, download
    the predictions to "delete.csv" in the current working directory and remove that file.

    :raises EnvironmentError: if the Hadoop namenode is not accessible.
    """
    if not pyunit_utils.hadoop_namenode_is_accessible():
        # Raise a real exception instance: the previous `raise (EnvironmentError, "...")` raised
        # a tuple, which is a TypeError on Python 3 and silently drops the message on Python 2.
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    airlines_billion_file = "/datasets/airlinesbillion.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
    airlines_billion = h2o.import_file(url)
    airlines_billion[30] = airlines_billion[30].asfactor()

    gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli",
                  max_depth=1)
    predictions = gbm.predict(airlines_billion)

    # Named csv_path (not csv) so the local does not shadow the stdlib module name.
    csv_path = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(predictions, csv_path)
    os.remove(csv_path)


if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_1431)
else:
    pubdev_1431()
expNum=expNum+1 if (buildModel[expNum]): print("------ Testing Randomized PCA --------") gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345) gramSVD.train(x=x, training_frame=rotterdamH2O) randomizedPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Randomized", seed=12345, max_iterations=5) # power randomizedPCA.train(x=x, training_frame=rotterdamH2O) # compare singular values and stuff with GramSVD print("@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n") pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"], randomizedPCA._model_json["output"]["importance"], ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"], tolerance=1e-1, check_all=False) print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n") # compare singular vectors pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"], randomizedPCA._model_json["output"]["eigenvectors"], randomizedPCA._model_json["output"]["names"], tolerance=1e-6, check_sign=True, check_all=False) h2o.remove_all() if __name__ == "__main__": pyunit_utils.standalone_test(pca_wideDataset_rotterdam) else: pca_wideDataset_rotterdam()
# Without weights myX = ["Merit", "Class", "C1M3", "C4M3"] from h2o.estimators.deeplearning import H2ODeepLearningEstimator dl = H2ODeepLearningEstimator(distribution="tweedie", hidden=[1], epochs=1000, train_samples_per_iteration=-1, reproducible=True, activation="Tanh", balance_classes=False, force_load_balance=False, seed=2353123, tweedie_power=1.5, score_training_samples=0, score_validation_samples=0) dl.train(x=myX, y="Loss", training_frame=cancar) mean_residual_deviance = dl.mean_residual_deviance() # With weights dl.train(x=myX, y="Loss", training_frame=cancar, weights_column="Insured") if __name__ == "__main__": pyunit_utils.standalone_test(tweedie_weights) else: tweedie_weights()
pyunit_utils.np_comparison_check(h2o_data3.expm1(), np.expm1(np_data3), 10) h2o_val = h2o_data3.gamma()[5,5] num_val = math.gamma(h2o_data3[5,5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal gamma values between h2o and " \ "math".format(h2o_val,num_val) h2o_val = h2o_data3.lgamma()[5,5] num_val = math.lgamma(h2o_data3[5,5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal lgamma values between h2o and " \ "math".\ format(h2o_val,num_val) h2o_val = h2o_data3.digamma()[5,5] num_val = scipy.special.polygamma(0,h2o_data3[5,5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal digamma values between h2o and " \ "math"\ .format(h2o_val,num_val) h2o_val = h2o_data3.trigamma()[5,5] num_val = float(scipy.special.polygamma(1,h2o_data3[5,5])) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal trigamma values between h2o and " \ "math".format(h2o_val,num_val) if __name__ == "__main__": pyunit_utils.standalone_test(expr_math_ops) else: expr_math_ops()
# create a fold column for train fold_numbers = train.kfold_column(n_folds=5, seed=1234) # rename the column "fold_numbers" fold_numbers.set_names(["fold_numbers"]) train = train.cbind(fold_numbers) # build the GAM model h2o_model_fold_column = H2OGeneralizedAdditiveEstimator( family='multinomial', gam_columns=["C6", "C7", "C8"], scale=[1, 1, 1], num_knots=numKnots, knot_ids=[frameKnots1.key, frameKnots2.key, frameKnots3.key]) h2o_model_fold_column.train(x=x, y=y, training_frame=train, fold_column="fold_numbers") # both model should return the same coefficients since they use the same fold assignment coeff = h2o_model.coef() coeff_fold_column = h2o_model_fold_column.coef() pyunit_utils.assertCoefDictEqual(coeff['coefficients'], coeff_fold_column['coefficients']) if __name__ == "__main__": pyunit_utils.standalone_test(test_gam_cv_fold_columns) else: test_gam_cv_fold_columns()
#print t.cell_values exp = [(u'', 1, 0.010526315789473684, 0.9656726046291464, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.026143790849673203, 148.36601307189542, 148.36601307189542), (u'', 2, 0.021052631578947368, 0.958934346136156, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.05228758169934641, 148.36601307189542, 148.36601307189542), (u'', 3, 0.031578947368421054, 0.9507825261794234, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.0784313725490196, 148.36601307189542, 148.36601307189542), (u'', 4, 0.042105263157894736, 0.9422672415967039, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.026143790849673203, 0.10457516339869281, 148.36601307189542, 148.36601307189542), (u'', 5, 0.05, 0.9301225958876777, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.0196078431372549, 0.12418300653594772, 148.36601307189542, 148.36601307189542), (u'', 6, 0.1, 0.9044146434092466, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.12418300653594772, 0.24836601307189543, 148.36601307189542, 148.36601307189542), (u'', 7, 0.15, 0.8446852887955882, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.12418300653594772, 0.37254901960784315, 148.36601307189542, 148.36601307189542), (u'', 8, 0.2, 0.7961432029967228, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.12418300653594772, 0.49673202614379086, 148.36601307189542, 148.36601307189542), (u'', 9, 0.3, 0.6723258370286895, 2.4836601307189543, 2.4836601307189543, 1.0, 1.0, 0.24836601307189543, 0.7450980392156863, 148.36601307189542, 148.36601307189542), (u'', 10, 0.4, 0.4587689423764878, 1.6993464052287583, 2.287581699346405, 0.6842105263157895, 0.9210526315789473, 0.16993464052287582, 0.9150326797385621, 69.93464052287584, 128.7581699346405), (u'', 11, 0.5, 0.2941654296210501, 0.7843137254901961, 1.9869281045751637, 0.3157894736842105, 0.8, 0.0784313725490196, 0.9934640522875817, -21.568627450980394, 98.69281045751637), (u'', 12, 0.6, 
0.19369580737381084, 0.06535947712418301, 1.6666666666666667, 0.02631578947368421, 0.6710526315789473, 0.006535947712418301, 1.0, -93.4640522875817, 66.66666666666667), (u'', 13, 0.7, 0.11690110696439827, 0.0, 1.4285714285714286, 0.0, 0.575187969924812, 0.0, 1.0, -100.0, 42.85714285714286), (u'', 14, 0.8, 0.08004746870641981, 0.0, 1.25, 0.0, 0.5032894736842105, 0.0, 1.0, -100.0, 25.0), (u'', 15, 0.9, 0.04735532042158167, 0.0, 1.1111111111111112, 0.0, 0.4473684210526316, 0.0, 1.0, -100.0, 11.111111111111116), (u'', 16, 1.0, 0.009748408811701144, 0.0, 1.0, 0.0, 0.4026315789473684, 0.0, 1.0, -100.0, 0.0)] mycomp(exp, t.cell_values) t = m.gains_lift(valid=True) mycomp(exp, t.cell_values) p = m.model_performance(df) t = p.gains_lift() mycomp(exp, t.cell_values) m = H2OGradientBoostingEstimator(nfolds=3, seed=1234) m.train(x=df.names,y="CAPSULE", training_frame=df, validation_frame=df) t = m.gains_lift(xval=True) #print t.cell_values exp2 = [(u'', 1, 0.010526315789473684, 0.9677782562476234, 1.2418300653594772, 1.2418300653594772, 0.5, 0.5, 0.013071895424836602, 0.013071895424836602, 24.183006535947715, 24.183006535947715), (u'', 2, 0.021052631578947368, 0.9582846040782473, 1.8627450980392157, 1.5522875816993464, 0.75, 0.625, 0.0196078431372549, 0.032679738562091505, 86.27450980392157, 55.22875816993464), (u'', 3, 0.031578947368421054, 0.9458499103092155, 2.4836601307189543, 1.8627450980392157, 1.0, 0.75, 0.026143790849673203, 0.058823529411764705, 148.36601307189542, 86.27450980392157), (u'', 4, 0.042105263157894736, 0.9331874956273033, 1.8627450980392157, 1.8627450980392157, 0.75, 0.75, 0.0196078431372549, 0.0784313725490196, 86.27450980392157, 86.27450980392157), (u'', 5, 0.05, 0.9319212918270888, 2.4836601307189543, 1.9607843137254903, 1.0, 0.7894736842105263, 0.0196078431372549, 0.09803921568627451, 148.36601307189542, 96.07843137254903), (u'', 6, 0.1, 0.8704014317587268, 2.2222222222222223, 2.0915032679738563, 0.8947368421052632, 0.8421052631578947, 
0.1111111111111111, 0.20915032679738563, 122.22222222222223, 109.15032679738563), (u'', 7, 0.15, 0.8022612148480965, 1.5686274509803921, 1.9172113289760349, 0.631578947368421, 0.7719298245614035, 0.0784313725490196, 0.2875816993464052, 56.86274509803921, 91.72113289760348), (u'', 8, 0.2, 0.7409640897307539, 1.6993464052287583, 1.8627450980392157, 0.6842105263157895, 0.75, 0.08496732026143791, 0.37254901960784315, 69.93464052287584, 86.27450980392157), (u'', 9, 0.3, 0.5840891361136157, 1.5686274509803921, 1.7647058823529413, 0.631578947368421, 0.7105263157894737, 0.1568627450980392, 0.5294117647058824, 56.86274509803921, 76.47058823529413), (u'', 10, 0.4, 0.4462887172671538, 1.3725490196078434, 1.6666666666666667, 0.5526315789473685, 0.6710526315789473, 0.13725490196078433, 0.6666666666666666, 37.25490196078434, 66.66666666666667), (u'', 11, 0.5, 0.3193859623494622, 0.9803921568627452, 1.5294117647058825, 0.39473684210526316, 0.6157894736842106, 0.09803921568627451, 0.7647058823529411, -1.9607843137254832, 52.941176470588246), (u'', 12, 0.6, 0.2340751507622484, 0.7843137254901961, 1.4052287581699348, 0.3157894736842105, 0.5657894736842105, 0.0784313725490196, 0.8431372549019608, -21.568627450980394, 40.522875816993476), (u'', 13, 0.7, 0.14629536699033518, 0.5882352941176471, 1.288515406162465, 0.23684210526315788, 0.518796992481203, 0.058823529411764705, 0.9019607843137255, -41.17647058823529, 28.851540616246506), (u'', 14, 0.8, 0.09247017777496397, 0.5228758169934641, 1.1928104575163399, 0.21052631578947367, 0.48026315789473684, 0.05228758169934641, 0.954248366013072, -47.71241830065359, 19.281045751633986), (u'', 15, 0.9, 0.04779416944259696, 0.19607843137254902, 1.0820624546114743, 0.07894736842105263, 0.43567251461988304, 0.0196078431372549, 0.9738562091503268, -80.3921568627451, 8.206245461147432), (u'', 16, 1.0, 0.009938670599098145, 0.26143790849673204, 1.0, 0.10526315789473684, 0.4026315789473684, 0.026143790849673203, 1.0, -73.8562091503268, 0.0)] 
mycomp(exp2, t.cell_values) p = m.model_performance(df) t = p.gains_lift() mycomp(exp, t.cell_values) if __name__ == "__main__": pyunit_utils.standalone_test(pubdev_2118) else: pubdev_2118()
fractions = dict() fractions["real_fraction"] = 0 # Right now we are dropping string columns, so no point in having them. fractions["categorical_fraction"] = 1 fractions["integer_fraction"] = 0 fractions["time_fraction"] = 0 fractions["string_fraction"] = 0 # Right now we are dropping string columns, so no point in having them. fractions["binary_fraction"] = 0 # this used to get an error message try: traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False, factors=9999999, seed=12345, **fractions) except Exception as ex: sys.exit(1) # this get an error message try: traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False, factors=19999999, seed=12345, **fractions) sys.exit(1) # should have thrown an error except Exception as ex: # expect an error here print(ex) if 'Number of factors must be <= 10,000,000' in ex.args[0].dev_msg: sys.exit(0) # correct error message else: sys.exit(1) # something else is wrong. if __name__ == "__main__": pyunit_utils.standalone_test(pubdev_6304) else: pubdev_6304()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


# I copied this test from Jeff Plourde.  Thank you.
# This test just needs to run to completion without receiving any error.  There is no assert statement needed here.
def remove_collinear_columns_multinomial():
    """
    Build a multinomial GLM (IRLSM solver, lambda 0, remove_collinear_columns=True) on
    multinomial_rcc.csv, using column 0 (converted to a factor) as the response and all
    remaining columns as predictors.

    Training is launched with start() and awaited with join(); the test passes when the
    build finishes without raising.
    """
    train = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/multinomial_rcc.csv"))
    train[0] = train[0].asfactor()

    mdl = H2OGeneralizedLinearEstimator(
        solver='IRLSM',
        family='multinomial',
        link='family_default',
        seed=76,
        lambda_=[0],
        max_iterations=100000,
        beta_epsilon=1e-7,
        early_stopping=False,
        standardize=True,
        remove_collinear_columns=True,
    )
    mdl.start(x=train.col_names[1:], y=train.col_names[0], training_frame=train)
    mdl.join()
    print("test completed.")


if __name__ == "__main__":
    pyunit_utils.standalone_test(remove_collinear_columns_multinomial)
else:
    remove_collinear_columns_multinomial()
import sys
sys.path.insert(1, "../../../")
import h2o
from tests import pyunit_utils
from tests.pyunit_utils import (
    CustomOneFuncStr,
    assert_all_metrics_equal,
    regression_model,
)
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def test_custom_metric_from_str():
    """
    Upload the custom metric defined by CustomOneFuncStr (class "CustomOneFunc", function name
    "custom_mm"), build a GBM regression model with it via regression_model, and check with
    assert_all_metrics_equal that the "custom_mm" metric equals 1.
    """
    custom_metric = h2o.upload_custom_metric(CustomOneFuncStr, class_name="CustomOneFunc", func_name="custom_mm")
    model2, f_test2 = regression_model(H2OGradientBoostingEstimator, custom_metric)
    assert_all_metrics_equal(model2, f_test2, "custom_mm", 1)


# Tests run by the driver below, in order.
__TESTS__ = [
    test_custom_metric_from_str
]

if __name__ == "__main__":
    for func in __TESTS__:
        pyunit_utils.standalone_test(func)
else:
    for func in __TESTS__:
        func()
sys.path.insert(1, "../../")
import h2o
from tests import pyunit_utils


def test_relevel_by_freq_topn():
    """
    Check relevel_by_frequency(top_n=1) on the "DPROS" column of prostate_cat.csv.

    The level order reported by table() must be Both/Left/None/Right before releveling and
    Left/Both/None/Right afterwards, and must match the levels obtained by manually calling
    relevel() with the level that has the highest "Count".
    """
    prostate_cat = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    # Level order before releveling.
    dpros_table = prostate_cat["DPROS"].table().as_data_frame()
    dpros_levels_ordered = dpros_table["DPROS"].tolist()
    assert dpros_levels_ordered == ["Both", "Left", "None", "Right"]

    # Level order after moving the single most frequent level to the front.
    prostate_cat_relevel = prostate_cat.relevel_by_frequency(top_n=1)
    dpros_relevel_levels = prostate_cat_relevel["DPROS"].table().as_data_frame()["DPROS"].tolist()
    assert dpros_relevel_levels == ['Left', 'Both', 'None', 'Right']

    # Manual equivalent: sort ascending by Count, take the last (largest) level and relevel on it.
    counts_sorted = prostate_cat["DPROS"].table().as_data_frame().sort_values(by="Count")
    top_drops_level = counts_sorted["DPROS"].tolist()[-1]
    prostate_cat_relevel_manual = prostate_cat["DPROS"].relevel(y=top_drops_level)
    # levels() returns one list per column, hence the wrapping list on the right-hand side.
    assert prostate_cat_relevel_manual.levels() == [dpros_relevel_levels]


if __name__ == "__main__":
    pyunit_utils.standalone_test(test_relevel_by_freq_topn)
else:
    test_relevel_by_freq_topn()
h2o_data.impute(column="C4", method="median", combine_method="high") c4_imputed = h2o_data[2,3] assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format(c4_imputed) # mode-categorical h2o_data = h2o.H2OFrame(zip(*data)) h2o_data.impute(column="C5", method="mode") c5_imputed = h2o_data[4,4] assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format(c5_imputed) # mode-numeric h2o_data = h2o.H2OFrame(zip(*data)) h2o_data.impute(column="C6", method="mode") c6_imputed = h2o_data[5,5] assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format(c6_imputed) # mean-group by C7 h2o_data = h2o.H2OFrame(zip(*data)) h2o_data.impute(column="C3", method="mean", by="C7") imputed1 = h2o_data[2,2] imputed2 = h2o_data[3,2] assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format(imputed1) assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format(imputed2) if __name__ == "__main__": pyunit_utils.standalone_test(impute2) else: impute2()
model1Seeds = ','.join(str(x) for x in model_seeds1[0:model_len]) model2Seeds = ','.join(str(x) for x in model_seeds2[0:model_len]) assert model1Seeds==model2Seeds, "Model seeds are not equal: gridsearch 1 seeds %s; " \ " and gridsearch 2 seeds %s" % (model1Seeds, model2Seeds) # compare training_rmse from scoring history model1seed = air_grid1.models[0].full_parameters['seed']['actual_value'] index2 = 0 # find the model in grid2 with the same seed for ind in range(0, len(air_grid2.models)): if air_grid2.models[ind].full_parameters['seed'][ 'actual_value'] == model1seed: index2 = ind break metric_list1 = pyunit_utils.extract_scoring_history_field( air_grid1.models[0], "training_rmse", False) metric_list2 = pyunit_utils.extract_scoring_history_field( air_grid2.models[index2], "training_rmse", False) print(metric_list1) print(metric_list2) assert pyunit_utils.equal_two_arrays(metric_list1, metric_list2, 1e-5, 1e-6, False), \ "Training_rmse are different between the two grid search models. Tests are supposed to be repeatable in " \ "this case. Make sure model seeds are actually set correctly in the Java backend." if __name__ == "__main__": pyunit_utils.standalone_test(random_grid_model_seeds_PUBDEV_4090) else: random_grid_model_seeds_PUBDEV_4090()