def swpredsGBM():
    """Fit two bernoulli GBMs on swpreds_1000x3.csv: X1 alone, then X1 plus X2.

    Training set has two predictor columns
      X1: 10 categorical levels, 100 observations per level
      X2: Unif(0,1) noise
    Ratio of y = 1 per level: cat01 = 1.0 (strong predictor),
    cat02 to cat10 = 0.5 (weak predictors).

    Each model is shown, scored against the training frame and its AUC is
    read; no assertion is made on either value.
    """
    csv_path = tests.locate("smalldata/gbm_test/swpreds_1000x3.csv")
    swpreds = h2o.import_file(path=csv_path)
    swpreds["y"] = swpreds["y"].asfactor()

    # Hyper-parameters shared by both fits.
    settings = dict(distribution="bernoulli", ntrees=50, max_depth=20, nbins=500)

    # First fit: the noise column is left out.
    plain_model = h2o.gbm(x=swpreds[["X1"]], y=swpreds["y"], **settings)
    plain_model.show()
    h2o_auc1 = plain_model.model_performance(swpreds).auc()

    # Second fit: the noise column is included.
    noisy_model = h2o.gbm(x=swpreds[["X1", "X2"]], y=swpreds["y"], **settings)
    noisy_model.show()
    h2o_auc2 = noisy_model.model_performance(swpreds).auc()
def milsong_checkpoint(ip,port):
    """Build a gaussian GBM, save/restore it, then continue it from a checkpoint.

    A third model with the final tree count is also built in one shot.
    ``ip`` and ``port`` are accepted but not used in this function.
    """
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    def draw(low, high):
        # One value sampled from range(low, high).
        return random.sample(range(low, high), 1)[0]

    def fit(ntrees, max_depth, min_rows, **extra):
        # Column 0 is the response; the frames are sliced afresh for every call.
        return h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees,
                       max_depth=max_depth, min_rows=min_rows,
                       distribution=distribution,
                       validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                       **extra)

    # build first model
    ntrees1 = draw(50, 100)
    max_depth1 = draw(2, 6)
    min_rows1 = draw(10, 16)
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = fit(ntrees1, max_depth1, min_rows1)

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model: 50 more trees, same depth and min_rows
    ntrees2, max_depth2, min_rows2 = ntrees1 + 50, max_depth1, min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = fit(ntrees2, max_depth2, min_rows2, checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = fit(ntrees2, max_depth2, min_rows2)
def imbalancedGBM(ip,port):
    """Compare the class-6 error of multinomial GBMs with and without balance_classes.

    Asserts that the imbalanced error is at least 90% of the balanced error.
    ``ip`` and ``port`` are accepted but not used in this function.
    """
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    # Model trained without class balancing.
    hh_imbalanced = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10,
                            balance_classes=False, nfolds=3,
                            distribution="multinomial")
    hh_imbalanced_perf = hh_imbalanced.model_performance(covtype)
    hh_imbalanced_perf.show()

    # Model trained with class balancing (seeded).
    hh_balanced = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10,
                          balance_classes=True, seed=123, nfolds=3,
                          distribution="multinomial")
    hh_balanced_perf = hh_balanced.model_performance(covtype)
    hh_balanced_perf.show()

    # compare error for class 6 (difficult minority)
    row, col = 5, 7
    class_6_err_imbalanced = hh_imbalanced_perf.confusion_matrix().cell_values[row][col]
    class_6_err_balanced = hh_balanced_perf.confusion_matrix().cell_values[row][col]

    report = (
        "--------------------",
        "",
        "class_6_err_imbalanced",
        class_6_err_imbalanced,
        "",
        "class_6_err_balanced",
        class_6_err_balanced,
        "",
        "--------------------",
    )
    for entry in report:
        print(entry)

    assert class_6_err_imbalanced >= 0.90*class_6_err_balanced, "balance_classes makes it at least 10% worse!"
def get_model_test(ip,port):
    """Check that models fetched with h2o.get_model match the originals.

    For regression, binomial and multinomial models the first prediction
    column of the fetched model is compared row by row with the original's;
    for every model (including k-means) the reported model category is checked.
    """
    # Connect to h2o
    h2o.init(ip,port)

    def same_predictions(first, second, template):
        # Compare column 0 of both prediction frames, one row at a time.
        for row in range(first.nrow()):
            p1 = first[row,0]
            p2 = second[row,0]
            assert p1 == p2, template.format(row, p1, p2)

    def category(model):
        return model._model_json['output']['model_category']

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # NOTE(review): r >= 0.30 overlaps the training rows (r < 0.70) -- confirm this is intended.
    test = prostate[r >= 0.30]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)
    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert category(regression_gbm2) == "Regression"
    predictions2 = regression_gbm2.predict(test)
    same_predictions(predictions1, predictions2,
                     "expected regression predictions to be the same for row {0}, but got {1} and {2}")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)
    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert category(bernoulli_gbm2) == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)
    same_predictions(predictions1, predictions2,
                     "expected binomial predictions to be the same for row {0}, but got {1} and {2}")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert category(benign_km) == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)
    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert category(multinomial_dl2) == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)
    same_predictions(predictions1, predictions2,
                     "expected multinomial predictions to be the same for row {0}, but got {1} and {2}")
def pubdev_1696(ip, port):
    """Expect h2o.gbm to raise EnvironmentError when given nfolds=-99.

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    iris = h2o.import_file(h2o.locate("smalldata/iris/iris.csv"))

    try:
        h2o.gbm(x=iris[0:3], y=iris[3], nfolds=-99)
    except EnvironmentError:
        # The rejection is the expected outcome.
        pass
    else:
        assert False, "expected an error"
def check_same(data1, data2):
    """Assert that GBMs on data1 and on weighted data2 give the same metrics.

    A regression pair is compared on MSE and a bernoulli pair on AUC, each
    within 1e-6. The data2 models use the "weights" column as weights_column.
    """
    # Regression pair: response is column 1.
    gbm1_regression = h2o.gbm(x=data1[2:20], y=data1[1])
    gbm2_regression = h2o.gbm(x=data2[2:21], y=data2[1], weights_column="weights")

    # Binomial pair: response is column 0.
    gbm1_binomial = h2o.gbm(x=data1[1:20], y=data1[0], distribution="bernoulli")
    gbm2_binomial = h2o.gbm(x=data2[1:21], y=data2[0], weights_column="weights",
                            distribution="bernoulli")

    tolerance = 1e-6

    mse_plain = gbm1_regression.mse()
    mse_weighted = gbm2_regression.mse()
    assert abs(mse_plain - mse_weighted) < tolerance, \
        "Expected mse's to be the same, but got {0}, and {1}".format(mse_plain, mse_weighted)

    auc_plain = gbm1_binomial.auc()
    auc_weighted = gbm2_binomial.auc()
    assert abs(auc_plain - auc_weighted) < tolerance, \
        "Expected auc's to be the same, but got {0}, and {1}".format(auc_plain, auc_weighted)
def pub_444_spaces_in_filenames(ip,port):
    """Upload CSV files whose names contain spaces and train a GBM on each.

    Three small files are written to the current directory, each is uploaded
    and used to train a one-tree bernoulli GBM, and the files are removed
    again. File handles are closed via ``with`` and the files are deleted
    even when an upload or training step raises.

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    # if was okay to write to smalldata, it's okay to write to the current directory
    # probably don't want to, but can't find what the standard temp directory is supposed to be. no sandbox?
    tempdir = "./"

    # (file name, write a header line?, selector of the response column)
    # The third file has no header, so its response is addressed by index.
    specs = [
        ("foo .csv", True, "response"),
        ("b a r .csv", True, "response"),
        (" ba z.csv", False, 0),
    ]

    def write_csv(name, with_header):
        # 10 groups of: a "1, a" row, a "0, b" row and one randomly chosen row.
        with open(h2o.locate(tempdir) + name, "w") as out:
            if with_header:
                out.write("response, predictor\n")
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    try:
        # make a few files with spaces in the name
        for name, with_header, _ in specs:
            write_csv(name, with_header)

        for name, _, response in specs:
            train_data = h2o.upload_file(path=h2o.locate(tempdir + name))
            train_data.show()
            train_data.describe()
            gbm = h2o.gbm(x=train_data[1:], y=train_data[response].asfactor(),
                          ntrees=1, distribution="bernoulli", min_rows=1)
            gbm.show()
    finally:
        # Remove whatever was created, so a failure does not leave files behind.
        for name, _, _ in specs:
            leftover = h2o.locate(tempdir) + name
            if os.path.exists(leftover):
                os.remove(leftover)
def distribution_behaviorGBM():
    """Exercise h2o.gbm with default, gaussian, bernoulli and multinomial distributions.

    Valid response/distribution combinations are simply built; invalid ones
    are expected to raise EnvironmentError.
    """
    def expect_error(build):
        # ``build`` must raise EnvironmentError; anything else fails the test.
        try:
            build()
        except EnvironmentError:
            return
        assert False, "expected an error"

    # ==============================
    # Default Behavior - Gaussian
    # ==============================
    eco = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    # 0/1 response: expect gaussian
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"])
    # more than 2 integers for response: expect gaussian
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars.csv"))
    cars_model = h2o.gbm(x=cars[3:7], y=cars["cylinders"])

    # ==============================
    # Gaussian Behavior
    # ==============================
    # 0/1 response: expect gaussian
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"], distribution="gaussian")
    # character response: expect error
    expect_error(lambda: h2o.gbm(x=eco[1:8], y=eco["Method"], distribution="gaussian"))

    # ==============================
    # Bernoulli Behavior
    # ==============================
    # 0/1 response: expect bernoulli
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"].asfactor(), distribution="bernoulli")
    # 2 level character response: expect bernoulli
    tree = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/test_tree_minmax.csv"))
    tree_model = h2o.gbm(x=tree[0:3], y=tree["response"], distribution="bernoulli", min_rows=1)
    # more than two integers for response: expect error
    expect_error(lambda: h2o.gbm(x=cars[3:7], y=cars["cylinders"], distribution="bernoulli"))
    # more than two character levels for response: expect error
    expect_error(lambda: h2o.gbm(x=eco[0:8], y=eco["Method"], distribution="bernoulli"))

    # ==============================
    # Multinomial Behavior
    # ==============================
    # more than two integers for response: expect multinomial
    cars_model = h2o.gbm(x=cars[3:7], y=cars["cylinders"].asfactor(), distribution="multinomial")
    # more than two character levels for response: expect multinomial
    eco_model = h2o.gbm(x=eco[0:8], y=eco["Method"], distribution="multinomial")
def pubdev_1829():
    """Check that a checkpoint-continued GBM scores the same as a one-shot GBM.

    A 5-tree model is extended to 10 trees via ``checkpoint`` and compared,
    on validation AUC, Gini coefficient and log loss, with a 10-tree model
    built directly. The comparisons use exact equality.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_train.csv"))
    valid = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_valid.csv"))

    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    train[response_col] = train[response_col].asfactor()
    valid[response_col] = valid[response_col].asfactor()

    def fit(ntrees, max_depth, min_rows, **extra):
        # The frames are sliced afresh for every model.
        return h2o.gbm(x=train[predictors], y=train[response_col], ntrees=ntrees,
                       max_depth=max_depth, min_rows=min_rows,
                       distribution=distribution, score_each_iteration=True,
                       validation_x=valid[predictors],
                       validation_y=valid[response_col], **extra)

    # Starting model.
    model1 = fit(5, 5, 10)

    # Same settings with 10 trees: once continued from model1, once from scratch.
    model2 = fit(10, 5, 10, checkpoint=model1._id)
    model4 = fit(10, 5, 10)

    auc2, auc4 = model2.auc(valid=True), model4.auc(valid=True)
    assert auc2 == auc4, \
        "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(auc2, auc4)

    gini2, gini4 = model2.giniCoef(valid=True), model4.giniCoef(valid=True)
    assert gini2 == gini4, \
        "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(gini2, gini4)

    logloss2, logloss4 = model2.logloss(valid=True), model4.logloss(valid=True)
    assert logloss2 == logloss4, \
        "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(logloss2, logloss4)
def hexdev_394():
    """Train multinomial GBMs on two separate split_frame() results of covtype.

    Columns 10-12 are imported as enums. After both models are built the
    ``_keep`` attribute of the source frame is printed.
    """
    path = pyunit_utils.locate("smalldata/covtype/covtype.20k.data")

    c_types = [None] * 55
    for enum_col in (10, 11, 12):
        c_types[enum_col] = "enum"
    train = h2o.import_file(path, col_types=c_types)

    cols = train.col_names  # This returned space for first column name
    x_cols = [name for name in cols if name != "C55"]

    def fit(train_part, valid_part):
        # Predictors are every column except "C55"; column 54 is the response.
        newtrain_x = train_part[x_cols]
        newtrain_y = train_part[54].asfactor()
        newvalid_x = valid_part[x_cols]
        newvalid_y = valid_part[54].asfactor()
        return h2o.gbm(y=newtrain_y, validation_y=newvalid_y,
                       x=newtrain_x, validation_x=newvalid_x,
                       distribution="multinomial", ntrees=100,
                       learn_rate=0.1, max_depth=6)

    # First pass: index into the list returned by split_frame().
    splits = train.split_frame()
    my_gbm = fit(splits[0], splits[1])

    # Second pass: unpack a fresh split directly.
    split1, split2 = train.split_frame()
    my_gbm = fit(split1, split2)

    print("KEEPING FRAME???")
    print(train._keep)
def pubdev_random_cv():
    """Assert that two unseeded GBMs with Random fold assignment differ in xval MSE."""
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement","power","weight","acceleration","year"]

    def fit():
        # Identical arguments on every call; no seed is supplied.
        return h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3,
                       distribution=distribution, fold_assignment="Random")

    gbm1 = fit()
    gbm2 = fit()

    mse1 = gbm1.mse(xval=True)
    mse2 = gbm2.mse(xval=True)
    message = ("The first model has an MSE of {0} and the second model has an MSE of {1}. "
               "Expected the first to be different from the second.")
    assert mse1 != mse2, message.format(mse1, mse2)
def checkpoint_new_category_in_response():
    """Expect checkpoint continuation to fail when the response gains categories.

    A model is trained on setosa_versicolor.csv, then continued on iris.csv;
    the continuation must raise EnvironmentError.
    """
    sv = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))

    feature_cols = [0,1,2,3]
    m1 = h2o.gbm(x=sv[feature_cols], y=sv[4], ntrees=100)

    # attempt to continue building model, but with an expanded categorical response domain.
    # this should fail
    try:
        m2 = h2o.gbm(x=iris[feature_cols], y=iris[4], ntrees=200, checkpoint=m1.model_id)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in response"
def hexdev_394():
    """Train multinomial GBMs on two separate split_frame() results of covtype.

    The file is parsed via lazy_import / parse_setup / parse_raw with columns
    10-12 forced to ENUM. After both models are built the ``_keep`` attribute
    of the source frame is printed.
    """
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    for enum_col in (10, 11, 12):
        tsetup["column_types"][enum_col] = "ENUM"
    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # This returned space for first column name
    x_cols = [name for name in cols if name != "C55"]

    def fit(train_part, valid_part):
        # Predictors are every column except "C55"; column 54 is the response.
        newtrain_x = train_part[x_cols]
        newtrain_y = train_part[54].asfactor()
        newvalid_x = valid_part[x_cols]
        newvalid_y = valid_part[54].asfactor()
        return h2o.gbm(y=newtrain_y, validation_y=newvalid_y,
                       x=newtrain_x, validation_x=newvalid_x,
                       distribution = "multinomial", ntrees=100,
                       learn_rate=0.1, max_depth=6)

    # First pass: index into the list returned by split_frame().
    splits = train.split_frame()
    my_gbm = fit(splits[0], splits[1])

    # Second pass: unpack a fresh split directly.
    split1, split2 = train.split_frame()
    my_gbm = fit(split1, split2)

    print("KEEPING FRAME???")
    print(train._keep)
def pub_444_spaces_in_filenames(ip,port):
    """Upload CSV files whose names contain spaces and train a GBM on each.

    Three small files are written under smalldata/jira/, each is uploaded and
    used to train a one-tree bernoulli GBM, and the files are removed again.
    File handles are closed via ``with`` and the files are deleted even when
    an upload or training step raises.

    ip, port: address of the H2O instance passed to h2o.init.
    """
    # Connect to h2o
    h2o.init(ip,port)

    data_dir = "smalldata/jira/"

    # (file name, write a header line?, selector of the response column)
    # The third file has no header, so its response is addressed by index.
    specs = [
        ("foo .csv", True, "response"),
        ("b a r .csv", True, "response"),
        (" ba z.csv", False, 0),
    ]

    def write_csv(name, with_header):
        # 10 groups of: a "1, a" row, a "0, b" row and one randomly chosen row.
        with open(h2o.locate(data_dir) + name, "w") as out:
            if with_header:
                out.write("response, predictor\n")
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    try:
        # make a few files with spaces in the name
        for name, with_header, _ in specs:
            write_csv(name, with_header)

        for name, _, response in specs:
            train_data = h2o.upload_file(path=h2o.locate(data_dir + name))
            train_data.show()
            train_data.describe()
            gbm = h2o.gbm(x=train_data[1:], y=train_data[response].asfactor(),
                          ntrees=1, distribution="bernoulli", min_rows=1)
            gbm.show()
    finally:
        # Remove whatever was created, so a failure does not leave files behind.
        for name, _, _ in specs:
            leftover = h2o.locate(data_dir) + name
            if os.path.exists(leftover):
                os.remove(leftover)
def pubdev_1431(ip, port):
    """Round-trip GBM predictions on the airlines-billion HDFS file through a CSV download.

    Runs only when tests.is_running_internal_to_h2o() is true; otherwise a
    message is printed and nothing else happens. The dimensions of the
    re-imported CSV are asserted equal to those of the source frame.

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network. No access to HDFS.")
        return

    hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
    airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
    airlines_billion_1 = h2o.import_file(url)

    airlines_billion_1[30] = airlines_billion_1[30].asfactor()
    gbm = h2o.gbm(x=airlines_billion_1[0:30], y=airlines_billion_1[30], ntrees=1,
                  distribution="bernoulli", max_depth=1)

    predictions = gbm.predict(airlines_billion_1)

    csv = os.path.join(os.getcwd(),"delete.csv")
    h2o.download_csv(predictions,csv)
    try:
        airlines_billion_2 = h2o.import_file(csv)
    finally:
        # Delete the downloaded file even if the re-import fails.
        os.remove(csv)

    # NOTE(review): this compares the re-imported predictions with the source
    # frame rather than with `predictions` -- confirm that is the intent.
    r1, c1 = airlines_billion_1.dim
    r2, c2 = airlines_billion_2.dim
    # The column counts use {2}/{3}; reusing {0}/{1} would repeat the row counts.
    assert r1 == r2 and c1 == c2, \
        "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal c1: {2} " \
        "c2: {3}".format(r1,r2,c1,c2)
def fiftycatGBM(ip,port):
    """Train a bernoulli GBM on 50_cattest_train.csv and score it on 50_cattest_test.csv.

    Training set has only 45 categories cat1 through cat45; the test dataset
    has all 50 categories cat1 through cat50. Predictions, the confusion
    matrix and the AUC are computed; nothing is asserted.
    ``ip`` and ``port`` are accepted but not used in this function.
    """
    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train = h2o.import_file(path=train_path)
    train["y"] = train["y"].asfactor()

    # Train H2O GBM Model
    model = h2o.gbm(x=train[["x1","x2"]], y=train["y"], distribution="bernoulli",
                    ntrees=10, max_depth=5, nbins=20)
    model.show()

    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test = h2o.import_file(path=test_path)

    # Predict on test dataset with GBM model
    predictions = model.predict(test)
    predictions.show()

    # Get the confusion matrix and AUC
    performance = model.model_performance(test)
    test_cm = performance.confusion_matrix()
    test_auc = performance.auc()
def offset_gaussian(ip,port):
    """Check a gaussian GBM with an offset column against reference values.

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #   fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #               data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    def check(expected, actual, tolerance, template):
        # |expected - actual| must stay below the tolerance.
        assert abs(expected - actual) < tolerance, template.format(expected, actual)

    check(44.33016, gbm._model_json['output']['init_f'], 1e-5,
          "expected init_f to be {0}, but got {1}")
    check(1491.135, gbm.mse(), 1e-2,
          "expected mse to be {0}, but got {1}")
    check(49.23438, predictions.mean(), 1e-2,
          "expected prediction mean to be {0}, but got {1}")
    check(-45.5720659304, predictions.min(), 1e-2,
          "expected prediction min to be {0}, but got {1}")
    check(207.387, predictions.max(), 1e-2,
          "expected prediction max to be {0}, but got {1}")
def plot_test():
    """Plot ROC curves of a bernoulli GBM for training, validation and test data."""
    # Passed through to every plot() call.
    kwargs = {'server': True}

    air = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY],
                      validation_x=air_valid[myX], validation_y=air_valid[myY],
                      distribution="bernoulli", ntrees=100, max_depth=3,
                      learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    # Plot ROC for test set
    air_test = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)
    perf.plot(type="roc", **kwargs)
def offset_gamma():
    """Check a gamma GBM with an offset column against reference values."""
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gamma",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from harrysouthworth's gbm:
    #   fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #              data = Insurance, distribution ="gamma", n.trees = 600)
    #   pr = predict(fit2, Insurance)
    #   pr = exp(pr+log(Insurance$Holders))
    def check(expected, actual, tolerance, template):
        # |expected - actual| must stay below the tolerance.
        assert abs(expected - actual) < tolerance, template.format(expected, actual)

    check(-1.714958, gbm._model_json['output']['init_f'], 1e-5,
          "expected init_f to be {0}, but got {1}")
    check(50.1087, predictions.mean(), 1e-2,
          "expected prediction mean to be {0}, but got {1}")
    check(0.9133843, predictions.min(), 1e-4,
          "expected prediction min to be {0}, but got {1}")
    check(392.6667, predictions.max(), 0.1,
          "expected prediction max to be {0}, but got {1}")
def offset_tweedie(ip,port):
    """Check a tweedie GBM with an offset column against reference values.

    ip, port: address of the H2O instance passed to h2o.init.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    insurance = h2o.import_frame(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="tweedie",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from harrysouthworth's gbm:
    #   fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #              data = Insurance, distribution ="tweedie", n.trees = 600)
    #   pr = predict(fit2, Insurance)
    #   pr = exp(pr+log(Insurance$Holders))
    def check(expected, actual, tolerance, template):
        # |expected - actual| must stay below the tolerance.
        assert abs(expected - actual) < tolerance, template.format(expected, actual)

    check(-1.869702, gbm._model_json['output']['init_f'], 1e-5,
          "expected init_f to be {0}, but got {1}")
    check(49.21591, predictions.mean(), 1e-4,
          "expected prediction mean to be {0}, but got {1}")
    check(1.0258, predictions.min(), 1e-4,
          "expected prediction min to be {0}, but got {1}")
    check(392.4651, predictions.max(), 1e-2,
          "expected prediction max to be {0}, but got {1}")
def offset_poisson(ip,port):
    """Check a poisson GBM with an offset column against reference values.

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="poisson",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #   fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #              shrinkage = .1,bag.fraction = 1,train.fraction = 1, data = Insurance, distribution ="poisson",
    #              n.trees = 600)
    #   link = predict.gbm(fit2, Insurance, n.trees=600, type="link")
    #   link.offset = link + log(Insurance$Holders)
    #   for poisson
    #   pr = exp(link.offset)
    def check(expected, actual, tolerance, template):
        # |expected - actual| must stay below the tolerance.
        assert abs(expected - actual) < tolerance, template.format(expected, actual)

    check(-2.003262, gbm._model_json['output']['init_f'], 1e-5,
          "expected init_f to be {0}, but got {1}")
    check(49.23437, predictions.mean(), 1e-4,
          "expected prediction mean to be {0}, but got {1}")
    check(1.077275, predictions.min(), 1e-4,
          "expected prediction min to be {0}, but got {1}")
    check(398.0608, predictions.max(), 1e-2,
          "expected prediction max to be {0}, but got {1}")
def nfold_predict():
    """Predict with each cross-validation model of a 10-fold GBM and show the mean."""
    frame_path = h2o.locate("smalldata/logreg/prostate_train.csv")
    fr = h2o.import_file(path=frame_path)

    m = h2o.gbm(x=fr[2:], y=fr[1], nfolds=10, ntrees=10)
    xval_models = m.get_xval_models()

    # A constant "weights" column is added before predicting.
    fr["weights"]=1

    preds = [fold_model.predict(fr) for fold_model in xval_models]
    averaged = sum(preds)/10
    averaged.show()
def offset_bernoulli_cars():
    """Check a bernoulli GBM with a constant 0.5 offset column against reference values."""
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    # Keep only rows whose response is present, then make it categorical.
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()

    # Constant offset of 0.5 with 398 entries, bound to the frame as column "x1".
    offset = h2o.H2OFrame([[.5] * 398])
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    gbm = h2o.gbm(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli",
                  ntrees=1, max_depth=1, min_rows=1, learn_rate=1,
                  offset_column="x1", training_frame=cars)
    predictions = gbm.predict(cars)

    # Comparison result generated from R's gbm:
    #   gg = gbm(formula = economy_20mpg~cylinders+displacement+power+weight+acceleration+year+offset(rep(.5,398)),
    #            distribution = "bernoulli",data = df,n.trees = 1,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,
    #            train.fraction = 1,bag.fraction = 1)
    #   pr = predict.gbm(object = gg,newdata = df,n.trees = 1,type = "link")
    #   pr = 1/(1+exp(-df$x1 - pr))
    def check(expected, actual, template):
        # |expected - actual| must stay below 1e-6.
        assert abs(expected - actual) < 1e-6, template.format(expected, actual)

    check(-0.1041234, gbm._model_json['output']['init_f'],
          "expected init_f to be {0}, but got {1}")
    check(0.577326, predictions[:,2].mean()[0],
          "expected prediction mean to be {0}, but got {1}")
    check(0.1621461, predictions[:,2].min(),
          "expected prediction min to be {0}, but got {1}")
    check(0.8506528, predictions[:,2].max(),
          "expected prediction max to be {0}, but got {1}")
def cv_nfoldsGBM(ip,port):
    """Build a 5-fold bernoulli GBM, then expect nfolds plus validation data to fail.

    ip, port: address of the H2O instance passed to h2o.init.
    """
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    prostate_gbm = h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds = 5,
                           distribution="bernoulli")
    prostate_gbm.show()

    # Can't specify both nfolds >= 2 and validation data at once
    try:
        h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds=5,
                validation_y=prostate[1], validation_x=prostate[2:9],
                distribution="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"
def gbm_mean_residual_deviance(ip, port):
    """Assert that train, valid and xval mean residual deviance are floats.

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # Split on a uniform random column: > 0.2 trains, <= 0.2 validates.
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    gbm = h2o.gbm(x=train[predictors], y=train[response_col],
                  validation_x=valid[predictors], validation_y=valid[response_col],
                  nfolds=3)

    gbm_mrd = gbm.mean_residual_deviance(train=True, valid=True, xval=True)

    # (key in the returned mapping, failure message template)
    expectations = (
        ("train", "Expected training mean residual deviance to be a float, but got {0}"),
        ("valid", "Expected validation mean residual deviance to be a float, but got {0}"),
        ("xval", "Expected cross-validation mean residual deviance to be a float, but got {0}"),
    )
    for key, template in expectations:
        value = gbm_mrd[key]
        assert isinstance(value, float), template.format(type(value))
def smallcatGBM(ip,port):
    """Fit a one-tree, depth-one GBM on alphabet_cattest.csv with H2O and with scikit.

    Training set has 26 categories from A to Z
      Categories A, C, E, G, ... are perfect predictors of y = 1
      Categories B, D, F, H, ... are perfect predictors of y = 0
    Nothing is asserted on either model.
    ``ip`` and ``port`` are accepted but not used in this function.
    """
    alphabet = h2o.import_file(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()

    # Prepare data for scikit use
    def letter_code(field):
        # Character code of the text between the first pair of double quotes.
        return ord(field.split("\"")[1])

    trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                           delimiter=',', skiprows=1, converters={0: letter_code})
    trainDataResponse = trainData[:,1]
    trainDataFeatures = trainData[:,0]

    # Train H2O GBM Model
    gbm_h2o = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], distribution="bernoulli",
                      ntrees=1, max_depth=1, nbins=100)
    gbm_h2o.show()

    # Train scikit GBM Model
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1,
                                                  max_features=None)
    feature_column = trainDataFeatures[:,np.newaxis]
    gbm_sci.fit(feature_column,trainDataResponse)
def weights_gamma():
    """Check a weighted gamma GBM on moppe.csv against reference values."""
    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    for factor_col in ("premiekl", "moptva"):
        htable[factor_col] = htable[factor_col].asfactor()
    # NOTE(review): "zon" is assigned back to itself unchanged -- confirm whether a conversion was intended.
    htable["zon"] = htable["zon"]

    # Reference values come from:
    #   gg = gbm(formula = medskad ~ premiekl + moptva + zon,data = table.1.2,distribution = "gamma", weights = table.1.2$antskad ,
    #            n.trees = 20,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,bag.fraction = 1,train.fraction = 1)
    #   pr = predict(gg,newdata = table.1.2,type = "response")
    #   htable= as.h2o(table.1.2,destination_frame = "htable")
    hh = h2o.gbm(x=htable[0:3], y=htable["medskad"], training_frame=htable,
                 distribution="gamma", weights_column="antskad", ntrees=20,
                 max_depth=1, min_rows=1, learn_rate=1)
    ph = hh.predict(htable)

    # Each check allows a relative error: 1e-6 for init_f, 1e-4 for the predictions.
    expected_init_f = 8.804447
    assert abs(expected_init_f - hh._model_json["output"]["init_f"]) < 1e-6 * expected_init_f

    expected_min = 3751.01
    assert abs(expected_min - ph[0].min()) < 1e-4 * expected_min

    expected_max = 15298.87
    assert abs(expected_max - ph[0].max()) < 1e-4 * expected_max

    expected_mean = 8121.98
    assert abs(expected_mean - ph[0].mean()) < 1e-4 * expected_mean
def framesliceGBM():
    """Run a GBM on a column slice of the prostate frame."""
    csv_path = tests.locate("smalldata/logreg/prostate.csv")
    full_frame = h2o.import_file(path=csv_path)

    # Keep columns 1..8; in the slice, column 0 is the response.
    sliced = full_frame[1:9]

    model = h2o.gbm(x=sliced[1:8], y=sliced[0])
def cv_nfoldsGBM():
    """Check that GBM accepts n-fold cross-validation, with and without validation data.

    Builds a 5-fold bernoulli GBM on the prostate data, then verifies that
    supplying validation data together with ``nfolds >= 2`` does not raise.

    Raises:
        AssertionError: if the nfolds + validation combination raises
            ``EnvironmentError``.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()
    prostate.summary()

    prostate_gbm = h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds=5, distribution="bernoulli")
    prostate_gbm.show()

    # Can specify both nfolds >= 2 and validation data at once
    try:
        h2o.gbm(y=prostate[1],
                x=prostate[2:9],
                nfolds=5,
                validation_y=prostate[1],
                validation_x=prostate[2:9],
                distribution="bernoulli")
    except EnvironmentError:
        # The previous message ("expected an error") stated the opposite of
        # what this branch means: an error here is the failure case.
        raise AssertionError(
            "expected no error when specifying both nfolds >= 2 and validation data")
def ntrain():
    """Train GBM, deep learning and random forest models and write a blended submission.

    Connects to the H2O cluster at zurich.h2o.ai, builds the three models on
    the assembled training matrix, blends their test-set predictions with
    weights 0.35 / 0.3 / 0.35 (deep learning / random forest / GBM) and writes
    the result to ``wnvh.csv``.
    """
    h2o.init(ip="zurich.h2o.ai", strict_version_check=False)

    # Assemble the training matrix and labels.
    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)
    train_rows = [row.tolist() for row in X]
    y = np.asarray(y, dtype='bool_')

    xtr = H2OFrame(python_obj=train_rows)
    ytr = H2OFrame(python_obj=y.tolist())
    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39],
                 y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)

    dl = h2o.deeplearning(x=xtr[1:39],
                          y=ytr['C40'],
                          variable_importances=True,
                          balance_classes=True,
                          input_dropout_ratio=0.2,
                          rho=0.899,
                          hidden_dropout_ratios=[0.4, 0.4, 0.4, 0.4],
                          activation="Tanh",
                          hidden=[39, 325, 325, 1],
                          epochs=100)

    rf = h2o.random_forest(x=xtr[1:39],
                           y=ytr['C40'],
                           seed=1234,
                           ntrees=600,
                           max_depth=20,
                           balance_classes=False)

    # Assemble the test matrix, reusing the training mean/std.
    testing = load_testing()
    X_test = assemble_X(testing, weather)
    normalize(X_test, mean, std)
    test_rows = [row.tolist() for row in X_test]
    xts = H2OFrame(python_obj=test_rows)

    # Score with each model and blend.
    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)
    gp = dp * 0.35 + rp * 0.3 + gbp * 0.35
    gph = h2o.as_list(gp)

    # Ids run from 1 to the number of prediction rows, as a single column.
    Id = np.arange(gp.nrow() + 1)[1:].reshape(gp.nrow(), 1)
    df = pd.DataFrame(Id)
    # Attribute lookup of the column named "True", spelled via getattr.
    true_column = getattr(gph, "True")
    df_concat = pd.concat([df, true_column], axis=1)
    df_concat.columns = ['Id', 'WnvPresent']
    df_concat.to_csv("wnvh.csv", index=False)
def gbm_mean_residual_deviance(ip, port):
    """Check that mean residual deviance is a float for train, valid and xval.

    ``ip`` and ``port`` are accepted but not used by this function.
    """
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # Random split: rows with a draw above 0.2 train, the rest validate.
    draw = cars[0].runif()
    train = cars[draw > 0.2]
    valid = cars[draw <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    gbm = h2o.gbm(x=train[predictors],
                  y=train[response_col],
                  validation_x=valid[predictors],
                  validation_y=valid[response_col],
                  nfolds=3)

    gbm_mrd = gbm.mean_residual_deviance(train=True, valid=True, xval=True)
    checks = (("train", "training"),
              ("valid", "validation"),
              ("xval", "cross-validation"))
    for key, label in checks:
        assert isinstance(gbm_mrd[key], float), \
            "Expected {0} mean residual deviance to be a float, but got {1}".format(
                label, type(gbm_mrd[key]))
def split_fit_predict(data):
    """Split ``data`` three ways, fit a GBM and a GLM on "bikes", and print R2 scores.

    ``data`` must provide a 'Days' column (used to draw the random split) and
    a "bikes" column (the response). Nothing is returned; results are printed.
    """
    # Classic Test/Train split
    r = data['Days'].runif()  # Random UNIForm numbers, one per row
    # Below 0.6 trains, [0.6, 0.9) tests, 0.9 and above is held out.
    train = data[r < 0.6]
    test = data[(0.6 <= r) & (r < 0.9)]
    hold = data[0.9 <= r]
    print "Training data has", train.ncol(), "columns and", train.nrow(), "rows, test has", test.nrow(), "rows, holdout has", hold.nrow()

    # Run GBM
    gbm = h2o.gbm(x=train.drop("bikes"),
                  y=train["bikes"],
                  validation_x=test.drop("bikes"),
                  validation_y=test["bikes"],
                  ntrees=500,  # 500 works well
                  max_depth=6,
                  min_rows=10,
                  nbins=20,
                  learn_rate=0.1)
    #gbm.show()

    # Run GLM
    glm = h2o.glm(x=train.drop("bikes"),
                  y=train["bikes"],
                  validation_x=test.drop("bikes"),
                  validation_y=test["bikes"],
                  dropNA20Cols=True)
    #glm.show()

    # ----------
    # 4- Score on holdout set & report
    train_r2_gbm = gbm.model_performance(train).r2()
    test_r2_gbm = gbm.model_performance(test).r2()
    hold_r2_gbm = gbm.model_performance(hold).r2()
    print "GBM R2 TRAIN=", train_r2_gbm, ", R2 TEST=", test_r2_gbm, ", R2 HOLDOUT=", hold_r2_gbm

    train_r2_glm = glm.model_performance(train).r2()
    test_r2_glm = glm.model_performance(test).r2()
    hold_r2_glm = glm.model_performance(hold).r2()
    print "GLM R2 TRAIN=", train_r2_glm, ", R2 TEST=", test_r2_glm, ", R2 HOLDOUT=", hold_r2_glm
def pubdev_1431(ip, port):
    """Build a one-tree GBM on the airlines-billion HDFS file and download its predictions.

    When not running inside the H2O network, only a notice is printed.
    ``ip`` and ``port`` are accepted but not used by this function.
    """
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network. No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    dataset_path = "/datasets/airlinesbillion.csv"
    hdfs_url = "hdfs://{0}{1}".format(name_node, dataset_path)
    airlines_billion = h2o.import_file(hdfs_url)

    # Column 30 is the response; it is converted to a factor for bernoulli.
    airlines_billion[30] = airlines_billion[30].asfactor()
    gbm = h2o.gbm(x=airlines_billion[0:30],
                  y=airlines_billion[30],
                  ntrees=1,
                  distribution="bernoulli",
                  max_depth=1)
    predictions = gbm.predict(airlines_billion)

    # Download the predictions to a scratch file, then remove it.
    scratch_csv = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(predictions, scratch_csv)
    os.remove(scratch_csv)
def prep_airlines(ip,port):
    """Derive a delayed/not-delayed response on the airlines data and fit a GBM.

    Connects to the cluster at ``ip``:``port``, drops rows with a missing
    DepDelay, appends a "delay above 15 minutes" column as the response and
    trains a bernoulli GBM on it.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    air = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    # Only the column count is needed: it is the index of the appended column.
    _, numCols = air.dim()

    y_col = "SynthDepDelayed"
    x_cols = ["Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime",
              "UniqueCarrier", "CRSElapsedTime", "Origin", "Dest", "Distance"]

    noDepDelayedNAs = air[air["DepDelay"].isna() == 0]
    print("Dimensions of new dataset: {0}".format(noDepDelayedNAs.dim()))

    minutesOfDelayWeTolerate = 15
    # NOTE(review): the cbind result is not captured; presumably this API
    # version appends the column in place -- confirm.
    noDepDelayedNAs.cbind(noDepDelayedNAs["DepDelay"] > minutesOfDelayWeTolerate)
    noDepDelayedNAs[numCols] = noDepDelayedNAs[numCols].asfactor()
    noDepDelayedNAs._vecs[numCols].setName(y_col)

    gbm = h2o.gbm(x=noDepDelayedNAs[x_cols],
                  y=noDepDelayedNAs[y_col],
                  distribution="bernoulli")
    gbm.show()
def deepLearningDemo(ip, port):
    """Fit a GBM and a deep learning model on the ecology data and show both.

    Connects to the cluster at ``ip``:``port``. 'Angaus' is the response and is
    treated as a factor in both the training and the testing frame.
    """
    h2o.init(ip, port)

    # Training data ('Site' is dropped before modelling)
    train_path = h2o.locate("smalldata/gbm_test/ecology_model.csv")
    train_data = h2o.import_frame(path=train_path)
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print(train_data.describe())
    train_data.head()

    # Testing data
    test_path = h2o.locate("smalldata/gbm_test/ecology_eval.csv")
    test_data = h2o.import_frame(path=test_path)
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run GBM
    gbm = h2o.gbm(x=train_data[1:],
                  y=train_data['Angaus'],
                  validation_x=test_data[1:],
                  validation_y=test_data['Angaus'],
                  ntrees=100,
                  distribution="bernoulli")
    gbm.show()

    # Run DeepLearning
    dl = h2o.deeplearning(x=train_data[1:],
                          y=train_data['Angaus'],
                          validation_x=test_data[1:],
                          validation_y=test_data['Angaus'],
                          loss='CrossEntropy',
                          epochs=1000,
                          hidden=[20, 20, 20])
    dl.show()
def fiftycatGBM(ip, port):
    """Train a GBM on 45 categories and score it on a test set that has 50.

    The training set has only categories cat1 through cat45, while the test
    dataset has all 50 categories cat1 through cat50.
    """
    # Connect to h2o
    h2o.init(ip, port)

    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train = h2o.import_frame(path=train_path)
    train["y"] = train["y"].asfactor()

    # Train H2O GBM Model: ntrees = 10, max_depth = 5, nbins = 20
    model = h2o.gbm(x=train[["x1", "x2"]],
                    y=train["y"],
                    loss="bernoulli",
                    ntrees=10,
                    max_depth=5,
                    nbins=20)
    model.show()

    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test = h2o.import_frame(path=test_path)

    # Predict on test dataset with GBM model
    predictions = model.predict(test)
    predictions.show()

    # Get the confusion matrix and AUC
    performance = model.model_performance(test)
    test_cm = performance.confusion_matrices()
    test_auc = performance.auc()
def smallcatGBM(ip, port):
    """Fit a single depth-1 tree on the alphabet data with H2O GBM and scikit-learn.

    The training set has 26 categories from A to Z. Categories A, C, E, G, ...
    are perfect predictors of y = 1; categories B, D, F, H, ... are perfect
    predictors of y = 0.
    """
    def _letter_code(field):
        # The first CSV column holds a quoted letter; map it to its ordinal.
        return ord(field.split('"')[1])

    # Connect to h2o
    h2o.init(ip, port)

    alphabet = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()

    # Load the same file as a numeric array for scikit-learn.
    sk_table = np.loadtxt(
        h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        delimiter=',',
        skiprows=1,
        converters={0: _letter_code})
    sk_response = sk_table[:, 1]
    sk_features = sk_table[:, 0]

    # H2O GBM: one tree, depth one, 100 bins.
    gbm_h2o = h2o.gbm(x=alphabet[['X']],
                      y=alphabet["y"],
                      loss="bernoulli",
                      ntrees=1,
                      max_depth=1,
                      nbins=100)
    gbm_h2o.show()

    # scikit-learn GBM: one estimator, depth one.
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1,
                                                  max_depth=1,
                                                  max_features=None)
    # scikit-learn expects a 2-D feature matrix, hence the added axis.
    gbm_sci.fit(sk_features[:, np.newaxis], sk_response)
def plot_test(ip, port):
    """Train a GBM on the airlines data and plot ROC curves for train, valid and test."""
    # Connect to h2o
    h2o.init(ip, port)

    # Passed through to every plot call below.
    plot_opts = {'server': True}

    air = h2o.import_frame(
        h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    draw = air[0].runif()
    air_train = air[draw <= 0.8]
    air_valid = air[draw > 0.8]

    myY = "IsDepDelayed"
    myX = ["Origin", "Dest", "Distance", "UniqueCarrier",
           "fMonth", "fDayofMonth", "fDayOfWeek"]

    air_gbm = h2o.gbm(x=air_train[myX],
                      y=air_train[myY],
                      validation_x=air_valid[myX],
                      validation_y=air_valid[myY],
                      distribution="bernoulli",
                      ntrees=100,
                      max_depth=3,
                      learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **plot_opts)
    air_gbm.plot(type="roc", valid=True, **plot_opts)

    # Plot ROC for test set
    air_test = h2o.import_frame(
        h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)
    perf.plot(type="roc", **plot_opts)
def irisGBM(ip, port):
    """Train a multinomial GBM on the iris data and show its training metrics.

    ``ip`` and ``port`` are accepted but not used by this function. The same
    frame serves as both training and validation data.
    """
    # Import training data
    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    train = h2o.import_file(path=iris_path)
    train.describe()

    # Run GBM
    response = train["class"]
    features = train[1:4]
    my_gbm = h2o.gbm(y=response,
                     validation_y=train["class"],
                     x=features,
                     validation_x=train[1:4],
                     ntrees=50,
                     learn_rate=0.1,
                     distribution="multinomial")
    my_gbm.show()

    my_gbm_metrics = my_gbm.model_performance(train)
    my_gbm_metrics.show()

    # Bare expression left over from a disabled call; it has no effect.
    my_gbm_metrics  #.show(criterion=my_gbm_metrics.theCriteria.PRECISION)
def offset_gaussian(ip, port):
    """Train a gaussian GBM with an offset column on the insurance data and
    check it against fixed reference values.

    The offset is the log of the "Holders" column.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    insurance = h2o.import_frame(
        h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3],
                  y=insurance["Claims"],
                  distribution="gaussian",
                  ntrees=600,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=.1,
                  offset_column="offset",
                  training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #   fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #               data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    expected_init_f = 44.33016
    expected_mse = 1491.135
    expected_mean = 49.23438
    expected_min = -45.54382
    expected_max = 207.348

    assert abs(expected_init_f - gbm._model_json['output']['init_f']) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(
            expected_init_f, gbm._model_json['output']['init_f'])
    assert abs(expected_mse - gbm.mse()) < 1e-3, \
        "expected mse to be {0}, but got {1}".format(expected_mse, gbm.mse())
    assert abs(expected_mean - predictions.mean()) < 1e-3, \
        "expected prediction mean to be {0}, but got {1}".format(
            expected_mean, predictions.mean())
    assert abs(expected_min - predictions.min()) < 1e-1, \
        "expected prediction min to be {0}, but got {1}".format(
            expected_min, predictions.min())
    assert abs(expected_max - predictions.max()) < 1e-1, \
        "expected prediction max to be {0}, but got {1}".format(
            expected_max, predictions.max())
def pubdev_1431():
    """Build a one-tree GBM on the airlines-billion HDFS file and download its predictions.

    Raises:
        EnvironmentError: if the Hadoop namenode is not accessible.
    """
    if not pyunit_utils.hadoop_namenode_is_accessible():
        # Raise an exception instance. The previous form raised a
        # (class, message) tuple, which loses the message on Python 2 and is
        # a TypeError on Python 3.
        raise EnvironmentError(
            "Not running on H2O internal network. No access to HDFS.")

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    airlines_billion_file = "/datasets/airlinesbillion.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
    airlines_billion = h2o.import_file(url)

    # Column 30 is the response; it is converted to a factor for bernoulli.
    airlines_billion[30] = airlines_billion[30].asfactor()
    gbm = h2o.gbm(x=airlines_billion[0:30],
                  y=airlines_billion[30],
                  ntrees=1,
                  distribution="bernoulli",
                  max_depth=1)
    predictions = gbm.predict(airlines_billion)

    # Download the predictions to a scratch file, then remove it.
    csv = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(predictions, csv)
    os.remove(csv)
def confusion_matrices_check(ip, port):
    """Check that the confusion-matrix counts at the max-F1 threshold sum to the row count.

    Uses a 20-row frame: ten rows of (1, 'a') followed by ten rows of (0, 'b').
    ``ip`` and ``port`` are accepted but not used by this function.
    """
    positives = [[1, 'a'] for _ in range(10)]
    negatives = [[0, 'b'] for _ in range(10)]
    local_data = positives + negatives

    h2o_data = h2o.H2OFrame(python_obj=local_data)
    h2o_data.setNames(['response', 'predictor'])
    h2o_data.show()

    gbm = h2o.gbm(x=h2o_data[1:],
                  y=h2o_data["response"].asfactor(),
                  ntrees=1,
                  distribution="bernoulli")
    gbm.show()

    perf = gbm.model_performance()
    # Each count is read at the threshold that maximises F1.
    tps, tns, fps, fns = [
        perf.metric(name, [perf.find_threshold_by_max_metric("f1")])[0][1]
        for name in ("tps", "tns", "fps", "fns")
    ]

    total = tps + tns + fps + fns
    assert total == 20, \
        "incorrect confusion matrix computation: tps: {0}, fps: {1}, tns: {2}, fns: {3}. Should sum to 20.".format(
            tps, fps, tns, fns)
def prostateGBM():
    """Train a bernoulli GBM on the prostate data and show its training metrics.

    The same frame serves as both training and validation data.
    """
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    df = h2o.import_file(path=prostate_path)
    df.describe()

    # Remove ID from training frame
    train = df.drop("ID")

    # For VOL & GLEASON, a zero really means "missing"
    # NOTE(review): the assignments target the extracted columns; whether
    # they also change `train` depends on the frame API -- confirm.
    vol = train['VOL']
    vol[vol == 0] = None
    gle = train['GLEASON']
    gle[gle == 0] = None

    # Convert CAPSULE to a logical factor
    train['CAPSULE'] = train['CAPSULE'].asfactor()

    # See that the data is ready
    train.describe()

    # Run GBM
    response = train["CAPSULE"]
    my_gbm = h2o.gbm(y=response,
                     validation_y=train["CAPSULE"],
                     x=train[1:],
                     validation_x=train[1:],
                     ntrees=50,
                     learn_rate=0.1,
                     distribution="bernoulli")
    my_gbm.show()

    my_gbm_metrics = my_gbm.model_performance(train)
    my_gbm_metrics.show()

    # Bare expression left over from a disabled call; it has no effect.
    my_gbm_metrics  #.show(criterion=my_gbm_metrics.theCriteria.PRECISION)
def offset_bernoulli_cars():
    """Train a bernoulli GBM with a constant 0.5 offset on the cars data and
    check it against fixed reference values.
    """
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    # Keep only rows where the response is present, then make it a factor.
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()

    # Constant offset column of 398 values, named "x1" and bound to the frame.
    offset = h2o.H2OFrame([[.5] * 398])
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    gbm = h2o.gbm(x=cars[2:8],
                  y=cars["economy_20mpg"],
                  distribution="bernoulli",
                  ntrees=1,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=1,
                  offset_column="x1",
                  training_frame=cars)
    predictions = gbm.predict(cars)

    # Comparison result generated from R's gbm:
    #   gg = gbm(formula = economy_20mpg~cylinders+displacement+power+weight+acceleration+year+offset(rep(.5,398)),
    #            distribution = "bernoulli",data = df,n.trees = 1,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,
    #            train.fraction = 1,bag.fraction = 1)
    #   pr = predict.gbm(object = gg,newdata = df,n.trees = 1,type = "link")
    #   pr = 1/(1+exp(-df$x1 - pr))
    expected_init_f = -0.1041234
    expected_mean = 0.577326
    expected_min = 0.1621461
    expected_max = 0.8506528

    assert abs(expected_init_f - gbm._model_json['output']['init_f']) < 1e-6, \
        "expected init_f to be {0}, but got {1}".format(
            expected_init_f, gbm._model_json['output']['init_f'])
    assert abs(expected_mean - predictions[:,2].mean()[0]) < 1e-6, \
        "expected prediction mean to be {0}, but got {1}".format(
            expected_mean, predictions[:,2].mean()[0])
    assert abs(expected_min - predictions[:,2].min()) < 1e-6, \
        "expected prediction min to be {0}, but got {1}".format(
            expected_min, predictions[:,2].min())
    assert abs(expected_max - predictions[:,2].max()) < 1e-6, \
        "expected prediction max to be {0}, but got {1}".format(
            expected_max, predictions[:,2].max())
def offset_poisson():
    """Train a poisson GBM with an offset column on the insurance data and
    check it against fixed reference values.

    The offset is the log of the "Holders" column.
    """
    insurance = h2o.import_file(
        tests.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3],
                  y=insurance["Claims"],
                  distribution="poisson",
                  ntrees=600,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=.1,
                  offset_column="offset",
                  training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #   fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #              shrinkage = .1,bag.fraction = 1,train.fraction = 1, data = Insurance, distribution ="poisson",
    #              n.trees = 600)
    #   link = predict.gbm(fit2, Insurance, n.trees=600, type="link")
    #   link.offset = link + log(Insurance$Holders)  ##for poisson
    #   pr = exp(link.offset)
    expected_init_f = -2.003262
    expected_mean = 49.23437
    expected_min = 1.077275
    expected_max = 398.0608

    assert abs(expected_init_f - gbm._model_json['output']['init_f']) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(
            expected_init_f, gbm._model_json['output']['init_f'])
    assert abs(expected_mean - predictions.mean()) < 1e-4, \
        "expected prediction mean to be {0}, but got {1}".format(
            expected_mean, predictions.mean())
    assert abs(expected_min - predictions.min()) < 1e-4, \
        "expected prediction min to be {0}, but got {1}".format(
            expected_min, predictions.min())
    assert abs(expected_max - predictions.max()) < 1e-2, \
        "expected prediction max to be {0}, but got {1}".format(
            expected_max, predictions.max())
def bigcatGBM(ip, port):
    """Train a one-tree, depth-1 bernoulli GBM on bigcat_5000x2.csv and show its performance.

    ``ip`` and ``port`` are accepted but not used by this function.
    """
    bigcat_path = h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_file(path=bigcat_path)
    bigcat["y"] = bigcat["y"].asfactor()

    # Train H2O GBM Model: ntrees = 1, max_depth = 1, nbins = 100
    model = h2o.gbm(x=bigcat[["X"]],
                    y=bigcat["y"],
                    distribution="bernoulli",
                    ntrees=1,
                    max_depth=1,
                    nbins=100)
    model.show()

    # Score on the training frame itself.
    performance = model.model_performance(bigcat)
    performance.show()

    # Check AUC and overall prediction error
    test_auc = performance.auc()
def metric_accessors(ip, port): cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] # regression response_col = "economy" distribution = "gaussian" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in mse.keys() and "valid" in mse.keys( ), "expected training and validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in mse.keys() and "xval" in mse.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in mse.keys() and "valid" in mse.keys( ) and "xval" in mse.keys( ), 
"expected training, validation, and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ) and isinstance( mse["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in mse.keys() and "xval" in mse.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["valid"], float) and isinstance( mse["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["valid"]), type(mse["xval"])) # r2 r21 = gbm.r2(train=True, valid=False, xval=False) assert isinstance(r21, float) r22 = gbm.r2(train=False, valid=True, xval=False) assert isinstance(r22, float) r23 = gbm.r2(train=False, valid=False, xval=True) assert isinstance(r23, float) r2 = gbm.r2(train=True, valid=True, xval=False) assert "train" in r2.keys() and "valid" in r2.keys( ), "expected training and validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["train"], float) and isinstance( r2["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(r2["train"]), type(r2["valid"])) assert r2["valid"] == r22 r2 = gbm.r2(train=True, 
valid=False, xval=True) assert "train" in r2.keys() and "xval" in r2.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["train"], float) and isinstance( r2["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(r2["train"]), type(r2["xval"])) assert r2["xval"] == r23 r2 = gbm.r2(train=True, valid=True, xval=True) assert "train" in r2.keys() and "valid" in r2.keys() and "xval" in r2.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["train"], float) and isinstance( r2["valid"], float ) and isinstance( r2["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(r2["train"]), type(r2["valid"]), type(r2["xval"])) r2 = gbm.r2(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(r2, float) assert r2 == r21 r2 = gbm.r2(train=False, valid=True, xval=True) assert "valid" in r2.keys() and "xval" in r2.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["valid"], float) and isinstance( r2["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(r2["valid"]), type(r2["xval"])) # mean_residual_deviance mean_residual_deviance1 = gbm.mean_residual_deviance(train=True, valid=False, xval=False) assert isinstance(mean_residual_deviance1, float) 
mean_residual_deviance2 = gbm.mean_residual_deviance(train=False, valid=True, xval=False) assert isinstance(mean_residual_deviance2, float) mean_residual_deviance3 = gbm.mean_residual_deviance(train=False, valid=False, xval=True) assert isinstance(mean_residual_deviance3, float) mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=False) assert "train" in mean_residual_deviance.keys( ) and "valid" in mean_residual_deviance.keys( ), "expected training and validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"])) assert mean_residual_deviance["valid"] == mean_residual_deviance2 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=False, xval=True) assert "train" in mean_residual_deviance.keys( ) and "xval" in mean_residual_deviance.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["xval"])) assert mean_residual_deviance["xval"] == mean_residual_deviance3 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=True) assert "train" in 
mean_residual_deviance.keys( ) and "valid" in mean_residual_deviance.keys( ) and "xval" in mean_residual_deviance.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["valid"], float ) and isinstance( mean_residual_deviance["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) mean_residual_deviance = gbm.mean_residual_deviance( train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mean_residual_deviance, float) assert mean_residual_deviance == mean_residual_deviance1 mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=True, xval=True) assert "valid" in mean_residual_deviance.keys( ) and "xval" in mean_residual_deviance.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["valid"], float) and isinstance( mean_residual_deviance["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) # binomial cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] 
response_col = "economy_20mpg" distribution = "bernoulli" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # auc auc1 = gbm.auc(train=True, valid=False, xval=False) assert isinstance(auc1, float) auc2 = gbm.auc(train=False, valid=True, xval=False) assert isinstance(auc2, float) auc3 = gbm.auc(train=False, valid=False, xval=True) assert isinstance(auc3, float) auc = gbm.auc(train=True, valid=True, xval=False) assert "train" in auc.keys() and "valid" in auc.keys( ), "expected training and validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["train"], float) and isinstance( auc["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(auc["train"]), type(auc["valid"])) assert auc["valid"] == auc2 auc = gbm.auc(train=True, valid=False, xval=True) assert "train" in auc.keys() and "xval" in auc.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["train"], float) and isinstance( auc["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(auc["train"]), type(auc["xval"])) assert auc["xval"] == auc3 auc = gbm.auc(train=True, valid=True, xval=True) assert "train" in auc.keys() and "valid" in auc.keys( ) and "xval" in auc.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 3, "expected training, validation and cross 
validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["train"], float) and isinstance( auc["valid"], float ) and isinstance( auc["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(auc["train"]), type(auc["valid"]), type(auc["xval"])) auc = gbm.auc(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(auc, float) assert auc == auc1 auc = gbm.auc(train=False, valid=True, xval=True) assert "valid" in auc.keys() and "xval" in auc.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["valid"], float) and isinstance( auc["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(auc["valid"]), type(auc["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in logloss.keys() and "valid" in logloss.keys( ), "expected training and validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in 
logloss.keys() and "xval" in logloss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in logloss.keys() and "valid" in logloss.keys( ) and "xval" in logloss.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ) and isinstance( logloss["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in logloss.keys() and "xval" in logloss.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["valid"], float) and isinstance( logloss["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["valid"]), type(logloss["xval"])) # giniCoef 
giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False) assert isinstance(giniCoef1, float) giniCoef2 = gbm.giniCoef(train=False, valid=True, xval=False) assert isinstance(giniCoef2, float) giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True) assert isinstance(giniCoef3, float) giniCoef = gbm.giniCoef(train=True, valid=True, xval=False) assert "train" in giniCoef.keys() and "valid" in giniCoef.keys( ), "expected training and validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["train"]), type(giniCoef["valid"])) assert giniCoef["valid"] == giniCoef2 giniCoef = gbm.giniCoef(train=True, valid=False, xval=True) assert "train" in giniCoef.keys() and "xval" in giniCoef.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["train"]), type(giniCoef["xval"])) assert giniCoef["xval"] == giniCoef3 giniCoef = gbm.giniCoef(train=True, valid=True, xval=True) assert "train" in giniCoef.keys() and "valid" in giniCoef.keys( ) and "xval" in giniCoef.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["train"], float) and 
isinstance( giniCoef["valid"], float ) and isinstance( giniCoef["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(giniCoef["train"]), type(giniCoef["valid"]), type(giniCoef["xval"])) giniCoef = gbm.giniCoef(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(giniCoef, float) assert giniCoef == giniCoef1 giniCoef = gbm.giniCoef(train=False, valid=True, xval=True) assert "valid" in giniCoef.keys() and "xval" in giniCoef.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["valid"], float) and isinstance( giniCoef["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["valid"]), type(giniCoef["xval"])) # F1 F11 = gbm.F1(train=True, valid=False, xval=False) F12 = gbm.F1(train=False, valid=True, xval=False) F13 = gbm.F1(train=False, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=False) F1 = gbm.F1(train=True, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=True) F1 = gbm.F1(train=False, valid=False, xval=False) # default: return training metrics F1 = gbm.F1(train=False, valid=True, xval=True) # F0point5 F0point51 = gbm.F0point5(train=True, valid=False, xval=False) F0point52 = gbm.F0point5(train=False, valid=True, xval=False) F0point53 = gbm.F0point5(train=False, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=False) F0point5 = gbm.F0point5(train=True, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=True) F0point5 = gbm.F0point5(train=False, valid=False, xval=False) # default: return training metrics F0point5 = gbm.F0point5(train=False, valid=True, xval=True) # F2 F21 = 
gbm.F2(train=True, valid=False, xval=False) F22 = gbm.F2(train=False, valid=True, xval=False) F23 = gbm.F2(train=False, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=False) F2 = gbm.F2(train=True, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=True) F2 = gbm.F2(train=False, valid=False, xval=False) # default: return training metrics F2 = gbm.F2(train=False, valid=True, xval=True) # accuracy accuracy1 = gbm.accuracy(train=True, valid=False, xval=False) accuracy2 = gbm.accuracy(train=False, valid=True, xval=False) accuracy3 = gbm.accuracy(train=False, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=False) accuracy = gbm.accuracy(train=True, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=True) accuracy = gbm.accuracy(train=False, valid=False, xval=False) # default: return training metrics accuracy = gbm.accuracy(train=False, valid=True, xval=True) # error error1 = gbm.error(train=True, valid=False, xval=False) error2 = gbm.error(train=False, valid=True, xval=False) error3 = gbm.error(train=False, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=False) error = gbm.error(train=True, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=True) error = gbm.error(train=False, valid=False, xval=False) # default: return training metrics error = gbm.error(train=False, valid=True, xval=True) # precision precision1 = gbm.precision(train=True, valid=False, xval=False) precision2 = gbm.precision(train=False, valid=True, xval=False) precision3 = gbm.precision(train=False, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=False) precision = gbm.precision(train=True, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=True) precision = gbm.precision(train=False, valid=False, xval=False) # default: return training metrics precision = gbm.precision(train=False, valid=True, xval=True) # mcc mcc1 = 
gbm.mcc(train=True, valid=False, xval=False) mcc2 = gbm.mcc(train=False, valid=True, xval=False) mcc3 = gbm.mcc(train=False, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=False) mcc = gbm.mcc(train=True, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=True) mcc = gbm.mcc(train=False, valid=False, xval=False) # default: return training metrics mcc = gbm.mcc(train=False, valid=True, xval=True) # max_per_class_error max_per_class_error1 = gbm.max_per_class_error(train=True, valid=False, xval=False) max_per_class_error2 = gbm.max_per_class_error(train=False, valid=True, xval=False) max_per_class_error3 = gbm.max_per_class_error(train=False, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=False) max_per_class_error = gbm.max_per_class_error(train=True, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=True) max_per_class_error = gbm.max_per_class_error( train=False, valid=False, xval=False) # default: return training metrics max_per_class_error = gbm.max_per_class_error(train=False, valid=True, xval=True) # confusion_matrix confusion_matrix1 = gbm.confusion_matrix(train=True, valid=False, xval=False) confusion_matrix2 = gbm.confusion_matrix(train=False, valid=True, xval=False) confusion_matrix3 = gbm.confusion_matrix(train=False, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False) confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True) confusion_matrix = gbm.confusion_matrix( train=False, valid=False, xval=False) # default: return training metrics confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True) # # plot # plot1 = gbm.plot(train=True, valid=False, xval=False) # plot2 = gbm.plot(train=False, valid=True, xval=False) # plot3 = gbm.plot(train=False, valid=False, 
xval=True) # plot = gbm.plot(train=True, valid=True, xval=False) # plot = gbm.plot(train=True, valid=False, xval=True) # plot = gbm.plot(train=True, valid=True, xval=True) # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics # plot = gbm.plot(train=False, valid=True, xval=True) # # tpr # tpr1 = gbm.tpr(train=True, valid=False, xval=False) # tpr2 = gbm.tpr(train=False, valid=True, xval=False) # tpr3 = gbm.tpr(train=False, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=False) # tpr = gbm.tpr(train=True, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=True) # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics # tpr = gbm.tpr(train=False, valid=True, xval=True) # # # tnr # tnr1 = gbm.tnr(train=True, valid=False, xval=False) # tnr2 = gbm.tnr(train=False, valid=True, xval=False) # tnr3 = gbm.tnr(train=False, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=False) # tnr = gbm.tnr(train=True, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=True) # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics # tnr = gbm.tnr(train=False, valid=True, xval=True) # # # fnr # fnr1 = gbm.fnr(train=True, valid=False, xval=False) # fnr2 = gbm.fnr(train=False, valid=True, xval=False) # fnr3 = gbm.fnr(train=False, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=False) # fnr = gbm.fnr(train=True, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=True) # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics # fnr = gbm.fnr(train=False, valid=True, xval=True) # # # fpr # fpr1 = gbm.fpr(train=True, valid=False, xval=False) # fpr2 = gbm.fpr(train=False, valid=True, xval=False) # fpr3 = gbm.fpr(train=False, valid=False, xval=True) # fpr = gbm.fpr(train=True, valid=True, xval=False) # fpr = gbm.fpr(train=True, valid=False, xval=True) # 
fpr = gbm.fpr(train=True, valid=True, xval=True) # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics # fpr = gbm.fpr(train=False, valid=True, xval=True) # multinomial cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) cars["cylinders"] = cars["cylinders"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] response_col = "cylinders" distribution = "multinomial" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in mse.keys() and "valid" in mse.keys( ), "expected training and validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in mse.keys() and "xval" in mse.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and 
{1}".format( type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in mse.keys() and "valid" in mse.keys( ) and "xval" in mse.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ) and isinstance( mse["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in mse.keys() and "xval" in mse.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["valid"], float) and isinstance( mse["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["valid"]), type(mse["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in logloss.keys() and "valid" in logloss.keys( ), "expected training and validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and validation metrics to be 
returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in logloss.keys() and "xval" in logloss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in logloss.keys() and "valid" in logloss.keys( ) and "xval" in logloss.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ) and isinstance( logloss["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in logloss.keys() and "xval" in logloss.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( 
logloss.keys()) assert len( logloss ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["valid"], float) and isinstance( logloss["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["valid"]), type(logloss["xval"])) # hit_ratio_table hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False) hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False) hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True) hit_ratio_table = gbm.hit_ratio_table( train=False, valid=False, xval=False) # default: return training metrics hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True) # clustering iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv")) km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3) # betweenss betweenss1 = km.betweenss(train=True, valid=False, xval=False) assert isinstance(betweenss1, float) betweenss3 = km.betweenss(train=False, valid=False, xval=True) assert isinstance(betweenss3, float) betweenss = km.betweenss(train=True, valid=False, xval=True) assert "train" in betweenss.keys() and "xval" in betweenss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( betweenss.keys()) assert len( betweenss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( betweenss.keys()) assert isinstance(betweenss["train"], float) and isinstance( betweenss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(betweenss["train"]), type(betweenss["xval"])) assert betweenss["xval"] == betweenss3 
betweenss = km.betweenss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(betweenss, float) assert betweenss == betweenss1 # totss totss1 = km.totss(train=True, valid=False, xval=False) assert isinstance(totss1, float) totss3 = km.totss(train=False, valid=False, xval=True) assert isinstance(totss3, float) totss = km.totss(train=True, valid=False, xval=True) assert "train" in totss.keys() and "xval" in totss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( totss.keys()) assert len( totss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( totss.keys()) assert isinstance(totss["train"], float) and isinstance( totss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(totss["train"]), type(totss["xval"])) assert totss["xval"] == totss3 totss = km.totss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(totss, float) assert totss == totss1 # tot_withinss tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False) assert isinstance(tot_withinss1, float) tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True) assert isinstance(tot_withinss3, float) tot_withinss = km.tot_withinss(train=True, valid=False, xval=True) assert "train" in tot_withinss.keys() and "xval" in tot_withinss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( tot_withinss.keys()) assert len( tot_withinss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( tot_withinss.keys()) assert isinstance(tot_withinss["train"], float) and isinstance( tot_withinss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(tot_withinss["train"]), type(tot_withinss["xval"])) assert tot_withinss["xval"] == tot_withinss3 
tot_withinss = km.tot_withinss( train=False, valid=False, xval=False) # default: return training metrics assert isinstance(tot_withinss, float) assert tot_withinss == tot_withinss1 # withinss withinss1 = km.withinss(train=True, valid=False, xval=False) withinss3 = km.withinss(train=False, valid=False, xval=True) withinss = km.withinss(train=True, valid=False, xval=True) withinss = km.withinss(train=False, valid=False, xval=False) # default: return training metrics # centroid_stats centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False) centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True) centroid_stats = km.centroid_stats(train=True, valid=False, xval=True) centroid_stats = km.centroid_stats( train=False, valid=False, xval=False) # default: return training metrics # size size1 = km.size(train=True, valid=False, xval=False) size3 = km.size(train=False, valid=False, xval=True) size = km.size(train=True, valid=False, xval=True) size = km.size(train=False, valid=False, xval=False) # default: return training metrics
def cars_checkpoint():
    """Check that a GBM resumed from a saved checkpoint matches a one-shot GBM.

    Steps performed:
      1. Upload ``cars_20mpg.csv`` and split it into a training frame (rows
         whose uniform random draw is > 0.2) and a validation frame (the rest).
      2. Randomly choose the problem type: 0 = regression, 1 = binomial,
         2 = multinomial, and pick the matching response column/distribution.
      3. Build model 1 (5 trees), save it, load it back, and delete the
         saved copy from disk.
      4. Continue from the restored model to 10 trees (model 2) and to
         60 trees (model 3).
      5. Build model 4 from scratch with the same parameters as model 2.
      6. Assert that the validation metrics of model 2 and model 4 are equal.

    Raises:
        AssertionError: if model 2 and model 4 differ in type or in any of
            the compared validation metrics.
    """
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # Choose the type of model-building exercise.
    # 0: regression, 1: binomial, 2: multinomial
    problem = random.randrange(3)

    # Pick the predictors and response column, along with the matching
    # distribution. Classification responses must be converted to factors.
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    # Build the first model.
    ntrees1 = 5
    max_depth1 = random.randrange(2, 6)
    min_rows1 = random.randrange(10, 16)
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     score_each_iteration=True,
                     distribution=distribution,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])

    # Save the model, then load it back. The on-disk copy is removed even
    # when loading fails, so a broken run does not leave "delete_model" behind.
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    try:
        restored_model = h2o.load_model(model_path)
    finally:
        shutil.rmtree("delete_model")

    # Continue building the model from the checkpoint.
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 2) with the following parameters:"
    )
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=restored_model._id)

    # Continue building from the same checkpoint, but with a different
    # number of trees.
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 3) with the following parameters:"
    )
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))
    model3 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees3,
                     max_depth=max_depth3,
                     min_rows=min_rows3,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=restored_model._id)

    # Build the equivalent of model 2 in one shot (no checkpoint).
    print(
        "\n*** Building the equivalent of model 2 (called model 4) in one shot:"
    )
    model4 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.scoring_history())
    print("\n*** Score History for model 3:")
    print(model3.scoring_history())
    print("\n*** Score History for model 4:")
    print(model4.scoring_history())

    # Checks. Model 3 is only built and reported: it is not part of any
    # assertion below.
    assert isinstance(model2, type(model4))
    if problem == 0:
        assert model2.mse(valid=True) == model4.mse(
            valid=True
        ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
            model2.mse(valid=True), model4.mse(valid=True))
    elif problem == 1:
        assert model2.auc(valid=True) == model4.auc(
            valid=True
        ), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(
            model2.auc(valid=True), model4.auc(valid=True))
        assert model2.logloss(valid=True) == model4.logloss(
            valid=True
        ), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(
            model2.logloss(valid=True), model4.logloss(valid=True))
        assert model2.giniCoef(valid=True) == model4.giniCoef(
            valid=True
        ), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(
            model2.giniCoef(valid=True), model4.giniCoef(valid=True))
    else:
        assert model2.mse(valid=True) == model4.mse(
            valid=True
        ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
            model2.mse(valid=True), model4.mse(valid=True))
        assert model2.r2(valid=True) == model4.r2(
            valid=True
        ), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(
            model2.r2(valid=True), model4.r2(valid=True))
def ecologyGBM(ip, port):
    """Compare an H2O GBM against a scikit-learn GBM on the ecology data.

    Both models are trained on ``ecology_model.csv`` with the same settings
    (100 trees, depth 5, at least 10 rows per leaf, learning rate 0.1) and
    scored on ``ecology_eval.csv``. The test passes when the H2O AUC is at
    least as large as the scikit-learn AUC.

    Args:
        ip: not used inside this function.
        port: not used inside this function.

    Raises:
        AssertionError: if the H2O AUC is lower than the scikit-learn AUC.
    """
    model_csv = "smalldata/gbm_test/ecology_model.csv"
    eval_csv = "smalldata/gbm_test/ecology_eval.csv"

    # Predictor columns handed to scikit-learn; the same for train and test.
    feature_names = [
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"
    ]
    # The training file carries a leading "Site" column that the
    # evaluation file does not have.
    train_columns = ("Site", "Angaus") + tuple(feature_names)
    test_columns = ("Angaus",) + tuple(feature_names)

    h2o_train = h2o.import_file(path=h2o.locate(model_csv))

    # Hyper-parameters shared by the H2O and scikit-learn models.
    ntrees = 100
    max_depth = 5
    min_rows = 10
    learn_rate = 0.1

    # Prepare the training data for scikit-learn.
    sk_train = np.genfromtxt(h2o.locate(model_csv),
                             delimiter=',',
                             dtype=None,
                             names=train_columns,
                             skip_header=1,
                             missing_values='NA',
                             filling_values=np.nan)
    sk_train_y = sk_train["Angaus"]
    sk_train_x = sk_train[feature_names]

    # H2O needs a categorical response for a bernoulli model.
    h2o_train["Angaus"] = h2o_train["Angaus"].asfactor()

    # Train the H2O GBM.
    h2o_model = h2o.gbm(x=h2o_train[2:],
                        y=h2o_train["Angaus"],
                        ntrees=ntrees,
                        learn_rate=learn_rate,
                        max_depth=max_depth,
                        min_rows=min_rows,
                        distribution="bernoulli")

    # Train the scikit-learn GBM.
    sk_model = ensemble.GradientBoostingClassifier(learning_rate=learn_rate,
                                                   n_estimators=ntrees,
                                                   max_depth=max_depth,
                                                   min_samples_leaf=min_rows,
                                                   max_features=None)
    sk_model.fit(sk_train_x[:, np.newaxis], sk_train_y)

    # Load the evaluation data for H2O ...
    h2o_test = h2o.import_file(path=h2o.locate(eval_csv))

    # ... and for scikit-learn.
    sk_test = np.genfromtxt(h2o.locate(eval_csv),
                            delimiter=',',
                            dtype=None,
                            names=test_columns,
                            skip_header=1,
                            missing_values='NA',
                            filling_values=np.nan)
    sk_test_y = sk_test["Angaus"]
    sk_test_x = sk_test[feature_names]

    # Score both models on the evaluation data: scikit-learn first, then H2O.
    sk_probabilities = sk_model.predict_proba(sk_test_x[:, np.newaxis])
    auc_sci = roc_auc_score(sk_test_y, sk_probabilities[:, 1])

    h2o_perf = h2o_model.model_performance(h2o_test)
    auc_h2o = h2o_perf.auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def cv_carsGBM(ip, port):
    """Exercise GBM cross-validation options on the cars data set.

    A problem type (regression, binomial or multinomial) is chosen at
    random, then the test checks:

    * "Modulo" fold assignment gives identical models on repeated runs;
    * "Random" fold assignment gives different models on repeated runs;
    * a user-supplied fold column yields one CV model per fold, and CV
      predictions are kept only when requested;
    * boundary cases (leave-one-out, ``nfolds=0``, CV plus validation) build;
    * invalid combinations raise ``EnvironmentError``.

    :param ip: not referenced in this function body.
    :param port: not referenced in this function body.
    :raises AssertionError: when any of the expectations above is violated.
    """
    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or
    # regression). 0:regression, 1:binomial, 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and response column, along with the correct
    # distribution
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("Distribution: {0}".format(distribution))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    h2o.check_models(gbm1, gbm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                   distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    # The failure must be raised OUTSIDE the try body: an ``assert False``
    # placed inside it would be caught by ``except AssertionError`` and the
    # check could never fail.
    try:
        h2o.check_models(gbm1, gbm2, True)
    except AssertionError:
        pass  # expected: the two models differ
    else:
        raise AssertionError(
            "Expected models to be different over repeated Random runs")

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1)]
                    for _ in range(cars.nrow())])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors],
                  training_frame=cars, distribution=distribution, ntrees=5,
                  fold_column="fold_assignments",
                  keep_cross_validation_predictions=True)
    num_cv_models = len(gbm._model_json['output']['cross_validation_models'])
    assert num_cv_models == num_folds, "Expected {0} cross-validation models, but got " \
                                       "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(
        gbm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(
        gbm._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(gbm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1), type(gbm))
    assert isinstance(cv_model2, type(gbm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2), type(gbm))

    # 4. keep_cross_validation_predictions
    # gbm1 was built without the flag, gbm with it.
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, "Expected the same number of cross-validation predictions " \
                                             "as folds, but got {0}".format(len(cv_predictions))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow(),
            distribution=distribution, ntrees=5, fold_assignment="Modulo")

    # 2. nfolds = 0
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=0,
                   distribution=distribution, ntrees=5)
    # check that this is equivalent to no nfolds
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors],
                   distribution=distribution, ntrees=5)
    h2o.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    h2o.gbm(y=cars[response_col], x=cars[predictors],
            nfolds=random.randint(3, 10), validation_y=cars[response_col],
            ntrees=5, validation_x=cars[predictors],
            distribution=distribution)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors],
                nfolds=random.sample([-1, 1], 1)[0], ntrees=5,
                distribution=distribution)
    except EnvironmentError:
        pass
    else:
        raise AssertionError(
            "Expected model-build to fail when nfolds is 1 or < 0")

    # 2. more folds than observations
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors],
                nfolds=cars.nrow() + 1, distribution=distribution, ntrees=5,
                fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds > nobs")

    # 3. fold_column and nfolds both specified
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3,
                fold_column="fold_assignments", ntrees=5,
                distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        raise AssertionError(
            "Expected model-build to fail when fold_column and nfolds both specified")

    # 4. fold_column and fold_assignment both specified
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors],
                fold_assignment="Random", fold_column="fold_assignments",
                ntrees=5, distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        raise AssertionError(
            "Expected model-build to fail when fold_column and fold_assignment both specified")
def milsong_checkpoint():
    """Build a GBM, save and reload it, then continue training from it.

    Three gaussian models are built on the million-songs data: a first model
    with randomly drawn settings, a second that adds 50 trees starting from
    the reloaded checkpoint of the first, and a third with the same final
    settings built in one shot.  The only assertions concern the save
    directory and the path returned by ``h2o.save_model``.
    """
    milsong_train = h2o.upload_file(
        tests.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        tests.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    def build(ntrees, max_depth, min_rows, **extra):
        # Every model uses column 0 as the response and the remaining
        # columns as predictors, on both the training and validation frames.
        return h2o.gbm(x=milsong_train[1:],
                       y=milsong_train[0],
                       ntrees=ntrees,
                       max_depth=max_depth,
                       min_rows=min_rows,
                       distribution=distribution,
                       validation_x=milsong_valid[1:],
                       validation_y=milsong_valid[0],
                       **extra)

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = build(ntrees1, max_depth1, min_rows1)

    # save the model, then load the model
    this_dir = os.path.dirname(os.path.realpath(__file__))
    path = os.path.normpath(os.path.join(this_dir, "..", "..", "results"))
    assert os.path.isdir(path), \
        "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isdir(model_path), \
        "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = build(ntrees2, max_depth2, min_rows2,
                   checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = build(ntrees2, max_depth2, min_rows2)
def domain_check():
    """Check the response domain reported in binomial metrics of four models.

    DRF, GBM, Deeplearning and GLM models are trained on the airlines
    training frame.  For each one, the ``domain`` entry of the training
    metrics and of the metrics computed on the airlines test frame must not
    contain any level outside ``[u'YES', u'NO']``.

    :raises AssertionError: when a reported domain contains an unexpected
        level.
    """
    air_train = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_train.show()
    air_test = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test.show()

    actual_domain = [u'YES', u'NO']
    print("actual domain of the response: {0}".format(actual_domain))

    predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth",
        "fDayofMonth", "fDayOfWeek"
    ]

    def banner(title):
        # print("") emits a single blank line under both Python 2 and 3.
        print("")
        print("-------------- {0}:".format(title))
        print("")

    def assert_domain(computed_domain):
        # Only levels present in the computed domain but absent from the
        # expected one are reported (one-directional set difference).
        domain_diff = list(set(computed_domain) - set(actual_domain))
        assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                                "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    def check_model(model):
        # Training metrics first, then metrics scored on the test frame by
        # the SAME model (the GBM and Deeplearning sections used to score
        # the random forest here by mistake).
        assert_domain(
            model._model_json['output']['training_metrics']._metric_json['domain'])
        perf = model.model_performance(test_data=air_test)
        assert_domain(perf._metric_json['domain'])

    ### DRF ###
    banner("DRF")
    rf = h2o.random_forest(x=air_train[predictors],
                           y=air_train["IsDepDelayed"].asfactor(),
                           training_frame=air_train)
    check_model(rf)

    ### GBM ###
    banner("GBM")
    gbm = h2o.gbm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"].asfactor(),
                  training_frame=air_train,
                  distribution="bernoulli")
    check_model(gbm)

    ### Deeplearning ###
    banner("Deeplearning")
    dl = h2o.deeplearning(x=air_train[predictors],
                          y=air_train["IsDepDelayed"].asfactor(),
                          training_frame=air_train,
                          activation="Tanh",
                          hidden=[2, 2, 2],
                          epochs=10)
    check_model(dl)

    ### GLM ###
    banner("GLM")
    # The GLM is given the response column without an explicit asfactor().
    glm = h2o.glm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"],
                  training_frame=air_train,
                  family="binomial")
    check_model(glm)
def loss_behaviorGBM(ip,port):
    """Check which model type ``h2o.gbm`` returns for each ``loss`` setting.

    Covers the default loss, "gaussian", "bernoulli" and "multinomial" with
    numeric, two-level and many-level responses.  Combinations marked
    "expect error" must raise ``EnvironmentError``.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    regression_type = h2o.model.regression.H2ORegressionModel
    binomial_type = h2o.model.binomial.H2OBinomialModel
    multinomial_type = h2o.model.multinomial.H2OMultinomialModel

    # ------------------------------------------------------------------
    # Default behavior - gaussian
    # ------------------------------------------------------------------
    eco = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))

    # 0/1 response: expect gaussian
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"])
    assert isinstance(eco_model, regression_type)

    # more than 2 integers for response: expect gaussian
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    cars_model = h2o.gbm(x=cars[3:7], y=cars["cylinders"])
    assert isinstance(cars_model, regression_type)

    # AUTO loss works now - no longer dies here, so the former
    # "character response with default loss: expect error" check is disabled.

    # ------------------------------------------------------------------
    # Gaussian behavior
    # ------------------------------------------------------------------
    # 0/1 response: expect gaussian
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"], loss="gaussian")
    assert isinstance(eco_model, regression_type)

    # character response: expect error
    try:
        eco_model = h2o.gbm(x=eco[1:8], y=eco["Method"], loss="gaussian")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # ------------------------------------------------------------------
    # Bernoulli behavior
    # ------------------------------------------------------------------
    # 0/1 response: expect bernoulli
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"].asfactor(), loss="bernoulli")
    assert isinstance(eco_model, binomial_type)

    # 2 level character response: expect bernoulli
    tree = h2o.import_frame(path=h2o.locate("smalldata/junit/test_tree_minmax.csv"))
    tree_model = h2o.gbm(x=tree[0:3], y=tree["response"], loss="bernoulli", min_rows=1)
    assert isinstance(tree_model, binomial_type)

    # more than two integers for response: expect error
    try:
        cars_mod = h2o.gbm(x=cars[3:7], y=cars["cylinders"], loss="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # more than two character levels for response: expect error
    try:
        eco_model = h2o.gbm(x=eco[0:8], y=eco["Method"], loss="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # ------------------------------------------------------------------
    # Multinomial behavior
    # ------------------------------------------------------------------
    # more than two integers for response: expect multinomial
    cars_model = h2o.gbm(x=cars[3:7], y=cars["cylinders"].asfactor(), loss="multinomial")
    assert isinstance(cars_model, multinomial_type)

    # more than two character levels for response: expect multinomial
    eco_model = h2o.gbm(x=eco[0:8], y=eco["Method"], loss="multinomial")
    assert isinstance(eco_model, multinomial_type)
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Check that a model's generated Java POJO predicts like the H2O model.

    The model is trained in H2O, its POJO and ``h2o-genmodel.jar`` are
    downloaded into a per-model directory under ``../results``, the POJO is
    compiled with ``javac`` and run through ``hex.genmodel.tools.PredictCsv``,
    and the first prediction column is compared row by row with H2O's.

    :param algo: one of "gbm", "random_forest", "deeplearning", "glm".
    :param equality: "numeric" (values must agree within 1e-4; the POJO
        value is parsed with ``float.fromhex``) or "class" (values must be
        equal).
    :param train: training frame, indexed with ``x`` and ``y``.
    :param test: frame to predict on; ``test[x]`` is fed to the POJO.
    :param x: predictor column selector.
    :param y: response column selector.
    :param kwargs: forwarded unchanged to the H2O model builder.
    :raises ValueError: if ``algo`` or ``equality`` is not supported.  Both
        are validated before any model is built.
    :raises AssertionError: if an expected file is missing or the
        predictions disagree.
    """
    supported_algos = ("gbm", "random_forest", "deeplearning", "glm")
    if algo not in supported_algos:
        # Raise an exception instance; ``raise (ValueError, msg)`` raises a
        # tuple, which loses the message on Python 2 and is a TypeError on
        # Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    if equality not in ("numeric", "class"):
        raise ValueError("equality type {0} is not supported".format(equality))

    print("Creating model in H2O")
    # Each supported algo name is also the name of its builder function on
    # the h2o module (h2o.gbm, h2o.random_forest, h2o.deeplearning, h2o.glm).
    model = getattr(h2o, algo)(x=train[x], y=train[y], **kwargs)
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), \
        "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(java_file), \
        "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), \
        "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)
    assert os.path.exists(in_csv), \
        "Expected file {0} to exist, but it does not.".format(in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; truncate() drops the leftover tail
    # because the unquoted content is never longer than the original.
    with open(in_csv, 'r+') as csv_file:
        unquoted = csv_file.read().replace('"', '')
        csv_file.seek(0)
        csv_file.write(unquoted)
        csv_file.truncate()
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g",
        "-J-XX:MaxPermSize=256m", java_file
    ]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    # stderr is merged into stdout, so only the first element carries output.
    # The exit status is not inspected; a failed run shows up as a missing
    # output file below.
    java_process = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    java_output, _ = java_process.communicate()
    print("Java output: {0}".format(java_output))
    assert os.path.exists(out_pojo_csv), \
        "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value (first column only)
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:  # equality == "class", validated at the top
            pp = predictions2[r, 0]
            assert hp == pp, \
                "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
def bernoulliGBM(ip, port):
    """Compare H2O and scikit-learn bernoulli GBM test AUC on prostate data.

    Both models are trained on ``prostate_train.csv`` with 100 trees,
    learning rate 0.1, depth 5 and at least 10 rows per leaf, then scored on
    ``prostate_test.csv``.  The test passes when the H2O AUC is at least as
    large as the scikit-learn AUC.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    train_csv = "smalldata/logreg/prostate_train.csv"
    test_csv = "smalldata/logreg/prostate_test.csv"

    def load_for_scikit(relative_path):
        # Column 0 is used as the response, the remaining columns as
        # features; the header row is skipped.
        table = np.loadtxt(h2o.locate(relative_path),
                           delimiter=',',
                           skiprows=1)
        return table[:, 1:], table[:, 0]

    # Connect to h2o
    h2o.init(ip, port)

    # Training data: H2O frame with a categorical response ...
    prostate_train = h2o.import_frame(path=h2o.locate(train_csv))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()
    # ... and the same file as numpy arrays for scikit.
    trainDataFeatures, trainDataResponse = load_for_scikit(train_csv)

    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # Build H2O GBM classification model
    gbm_h2o = h2o.gbm(x=prostate_train[1:],
                      y=prostate_train["CAPSULE"],
                      ntrees=ntrees,
                      learn_rate=learning_rate,
                      max_depth=depth,
                      min_rows=min_rows,
                      distribution="bernoulli")

    # Build scikit GBM classification model with the same parameters
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learning_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(trainDataFeatures, trainDataResponse)

    # Test data, again in both representations.
    prostate_test = h2o.import_frame(path=h2o.locate(test_csv))
    prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()
    testDataFeatures, testDataResponse = load_for_scikit(test_csv)

    # Score on the test data and compare results: scikit first, then h2o.
    sci_probabilities = gbm_sci.predict_proba(testDataFeatures)
    auc_sci = roc_auc_score(testDataResponse, sci_probabilities[:, 1])

    auc_h2o = gbm_h2o.model_performance(prostate_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def gbm_demo(interactive, echo, test):
    """Run the H2O GBM demo on the bundled prostate data set.

    The demo prints a description, then alternates between echoing the next
    command(s) through ``echo_and_interact`` and executing them: connect,
    upload, summarise, split into training/test sets, build a bernoulli GBM,
    show it, predict and show performance.

    :param interactive: forwarded to ``echo_and_interact``.
    :param echo: forwarded to ``echo_and_interact``.
    :param test: when true, ``h2o.init()`` is skipped.
    """
    h2o_data_path = system_file("prostate.csv")

    demo_description = [
        '\n-----------------------------------------------------------------',
        'This is a demo of H2O\'s GBM function.',
        'It uploads a dataset to h2o, parses it, and shows a description.',
        'Then, it divides the dataset into training and test sets, ',
        'builds a GBM from the training set, and predicts on the test set.',
        'Finally, default performance metrics are displayed.',
        '-----------------------------------------------------------------'
    ]

    # The echoed commands mirror the statements executed below.  Each
    # echo_and_interact call consumes the next group of lines (``npop`` of
    # them when given), so the number and order of entries must stay in
    # step with the calls.
    demo_commands = [
        '# Connect to h2o',
        '>>> h2o.init()\n',
        '\n# Upload the prostate dataset that comes included in the h2o python package',
        '>>> prostate = h2o.upload_file(path="' + h2o_data_path + '")\n',
        '\n# Print a description of the prostate data',
        '>>> prostate.summary()\n',
        '\n# Randomly split the dataset into ~70/30, training/test sets',
        '>>> r = prostate[0].runif()',
        '>>> train = prostate[r < 0.70]',
        '>>> test = prostate[r >= 0.70]\n',
        '\n# Convert the response columns to factors (for binary classification problems)',
        '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()',
        '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n',
        '\n# Build a (classification) GBM',
        '>>> prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]], '
        'y=train["CAPSULE"], distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, '
        'learn_rate=0.2)\n',
        '\n# Show the model',
        '>>> prostate_gbm.show()\n',
        '\n# Predict on the test set and show the first ten predictions',
        '>>> predictions = prostate_gbm.predict(test)',
        '>>> predictions.show()\n',
        '\n# Show default performance metrics',
        '>>> performance = prostate_gbm.model_performance(test)',
        '>>> performance.show()\n'
    ]

    for line in demo_description:
        print(line)
    print("")  # single blank line under both Python 2 and 3

    echo_and_interact(demo_commands, interactive, echo)
    if not test:
        h2o.init()

    echo_and_interact(demo_commands, interactive, echo)
    prostate = h2o.upload_file(path=h2o_data_path)

    echo_and_interact(demo_commands, interactive, echo)
    prostate.summary()

    echo_and_interact(demo_commands, interactive, echo, npop=4)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # Complement of the training split.  The previous ``r >= 0.30`` put
    # every row with r in [0.30, 0.70) in both sets.  A separate name is
    # used so the ``test`` flag parameter is not shadowed by a frame.
    test_frame = prostate[r >= 0.70]

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test_frame["CAPSULE"] = test_frame["CAPSULE"].asfactor()

    echo_and_interact(demo_commands, interactive, echo)
    prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]],
                           y=train["CAPSULE"],
                           distribution="bernoulli",
                           ntrees=10,
                           max_depth=8,
                           min_rows=10,
                           learn_rate=0.2)

    echo_and_interact(demo_commands, interactive, echo)
    prostate_gbm.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    predictions = prostate_gbm.predict(test_frame)
    predictions.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    performance = prostate_gbm.model_performance(test_frame)
    performance.show()
def metric_json_check():
    """Check that metric JSON objects contain no unexpected keys.

    For regression and binomial models (GBM and GLM), a multinomial GBM and
    a k-means model, the keys of ``model_performance()._metric_json`` are
    compared with a list of desired keys.  The check is one-directional: it
    fails only when the metric JSON has a key that is not in the list.
    """

    def verify_keys(model, desired_keys, failure_template):
        # failure_template receives: keys found, keys desired, the extras.
        keys_have = model.model_performance()._metric_json.keys()
        unexpected = list(set(keys_have) - set(desired_keys))
        assert not unexpected, failure_template.format(keys_have,
                                                       desired_keys,
                                                       unexpected)

    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = h2o.gbm(y=df["CAPSULE"],
                      x=df[3:],
                      training_frame=df,
                      distribution="gaussian")
    verify_keys(reg_mod, [
        u'model_category', u'description', u'r2', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'predictions',
        u'model', u'duration_in_ms', u'frame_checksum',
        u'mean_residual_deviance'
    ], "There's a difference between the current ({0}) and the desired ({1}) regression "
       "metric json. The difference is {2}")

    # Regression metric json (GLM)
    reg_mod = h2o.glm(y=df["CAPSULE"],
                      x=df[3:],
                      training_frame=df,
                      family="gaussian")
    verify_keys(reg_mod, [
        u'model_category', u'description', u'r2',
        u'residual_degrees_of_freedom', u'frame', u'model_checksum', u'MSE',
        u'__meta', u'null_deviance', u'scoring_time',
        u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
        u'duration_in_ms', u'frame_checksum', u'residual_deviance',
        u'mean_residual_deviance'
    ], "There's a difference between the current ({0}) and the desired ({1}) glm-regression "
       "metric json. The difference is {2}")

    # Binomial metric json
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(),
                      x=df[3:],
                      training_frame=df,
                      distribution="bernoulli")
    verify_keys(bin_mod, [
        u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
        u'thresholds_and_metric_scores', u'predictions',
        u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
        u'frame_checksum', u'domain'
    ], "There's a difference between the current ({0}) and the desired ({1}) binomial "
       "metric json. The difference is {2}")

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(),
                      x=df[3:],
                      training_frame=df,
                      family="binomial")
    verify_keys(bin_mod, [
        u'frame', u'residual_deviance', u'max_criteria_and_metric_scores',
        u'MSE', u'frame_checksum', u'AIC', u'logloss', u'Gini',
        u'predictions', u'AUC', u'description', u'model_checksum',
        u'duration_in_ms', u'model_category', u'r2',
        u'residual_degrees_of_freedom', u'__meta', u'null_deviance',
        u'scoring_time', u'null_degrees_of_freedom', u'model',
        u'thresholds_and_metric_scores', u'domain'
    ], "There's a difference between the current ({0}) and the desired ({1}) glm-binomial "
       "metric json. The difference is {2}")

    # Multinomial metric json
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = [
        "Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance",
        "fDayofMonth", "fDayOfWeek"
    ]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX],
                      y=df[myY],
                      training_frame=df,
                      distribution="multinomial")
    verify_keys(mul_mod, [
        u'cm', u'model_category', u'description', u'r2', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
        u'predictions', u'hit_ratio_table', u'model', u'duration_in_ms',
        u'frame_checksum'
    ], "There's a difference between the current ({0}) and the desired ({1}) multinomial "
       "metric json. The difference is {2}")

    # Clustering metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    verify_keys(clus_mod, [
        u'tot_withinss', u'model_category', u'description', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
        u'predictions', u'totss', u'model', u'duration_in_ms',
        u'frame_checksum', u'centroid_stats'
    ], "There's a difference between the current ({0}) and the desired ({1}) clustering "
       "metric json. The difference is {2}")
def all_confusion_matrix_funcs():
    """Exercise the confusion-matrix accessors of GBM models and metrics.

    A binomial and a multinomial GBM are trained on the airlines data with
    the airlines test frame as validation data.  Binomial confusion matrices,
    requested by metric name or by explicit thresholds, must be 2x2, hold
    numeric entries and sum to the row count of the frame they describe.
    Multinomial confusion matrices must sum to the test-frame row count.

    :raises AssertionError: when a confusion matrix violates one of these
        expectations.
    """
    metrics = [
        "min_per_class_accuracy", "absolute_MCC", "precision", "accuracy",
        "f0point5", "f2", "f1"
    ]
    train = [True, False]
    valid = [True, False]

    print("PARSING TRAINING DATA")
    air_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    print("PARSING TESTING DATA")
    air_test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    bin_predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth",
        "fDayofMonth", "fDayOfWeek"
    ]
    mult_predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed",
        "fDayofMonth", "fMonth"
    ]

    print()
    print("RUNNING FIRST GBM: ")
    print()
    gbm_bin = h2o.gbm(x=air_train[bin_predictors],
                      y=air_train["IsDepDelayed"].asfactor(),
                      validation_x=air_test[bin_predictors],
                      validation_y=air_test["IsDepDelayed"].asfactor(),
                      distribution="bernoulli")

    print()
    print("RUNNING SECOND GBM: ")
    print()
    gbm_mult = h2o.gbm(x=air_train[mult_predictors],
                       y=air_train["fDayOfWeek"].asfactor(),
                       validation_x=air_test[mult_predictors],
                       validation_y=air_test["fDayOfWeek"].asfactor(),
                       distribution="multinomial")

    def dim_check(cm, m, t, v):
        # cm: list-of-lists confusion matrix; m: metric name or threshold;
        # t / v: the train / valid flags the matrix was requested with.
        assert len(cm) == 2 and len(cm[0]) == 2 and len(cm[1]) == 2, "incorrect confusion matrix dimensions " \
                                                                     "for metric/thresh: {0}, train: {1}, valid: " \
                                                                     "{2}".format(m, t, v)

    def type_check(cm, m, t, v):
        # All FOUR cells are checked (cm[1][1] used to be skipped because
        # cm[0][0] was tested twice).
        assert isinstance(cm[0][0], (int, float)) and isinstance(cm[0][1], (int, float)) and \
               isinstance(cm[1][0], (int, float)) and isinstance(cm[1][1], (int, float)), \
            "confusion matrix entries should be integers or floats but got {0}, {1}, {2}, {3}. metric/thresh: {4}, " \
            "train: {5}, valid: {6}".format(type(cm[0][0]), type(cm[0][1]), type(cm[1][0]), type(cm[1][1]), m, t, v)

    def count_check(cm, m, t, v):
        # Validation matrices describe the test frame, all others the
        # training frame.
        expected_rows = air_test.nrow if v else air_train.nrow
        assert cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1] == expected_rows, \
            "incorrect confusion matrix elements: {0}, {1}, {2}, {3}. Should sum " \
            "to {4}. metric/thresh: {5}, train: {6}, valid: {7}".format(cm[0][0], cm[0][1], cm[1][0], cm[1][1],
                                                                       expected_rows, m, t, v)

    def multinomial_total(cm):
        # Sums the leading 7x7 cells of the table; presumably one row and
        # column per fDayOfWeek level -- confirm against the data.
        return sum(cm.cell_values[r][c] for r in range(7) for c in range(7))

    # H2OBinomialModel.confusion_matrix() by metric name
    for m in metrics:
        for t in train:
            for v in valid:
                if t and v:
                    continue
                cm = gbm_bin.confusion_matrix(metrics=m, train=t, valid=v)
                if cm:
                    cm = cm.to_list()
                dim_check(cm, m, t, v)
                type_check(cm, m, t, v)
                count_check(cm, m, t, v)

    # H2OBinomialModel.confusion_matrix() by thresholds
    for x in range(10):
        for t in train:
            for v in valid:
                if t and v:
                    continue
                thresholds = [
                    gbm_bin.find_threshold_by_max_metric(m, t, v)
                    for m in random.sample(metrics,
                                           random.randint(1, len(metrics)))
                ]
                cms = gbm_bin.confusion_matrix(thresholds=thresholds,
                                               train=t,
                                               valid=v)
                if not isinstance(cms, list):
                    cms = [cms]
                for idx, cm in enumerate(cms):
                    cm = cm.to_list()
                    dim_check(cm, thresholds[idx], t, v)
                    type_check(cm, thresholds[idx], t, v)
                    count_check(cm, thresholds[idx], t, v)

    # H2OMultinomialModel.confusion_matrix()
    cm = gbm_mult.confusion_matrix(data=air_test)
    cm_count = multinomial_total(cm)
    assert cm_count == air_test.nrow, "incorrect confusion matrix elements. Should sum to {0}, but got {1}".\
        format(air_test.nrow, cm_count)

    # H2OBinomialModelMetrics.confusion_matrix() by metric name
    bin_perf = gbm_bin.model_performance(valid=True)
    for metric in metrics:
        cm = bin_perf.confusion_matrix(metrics=metric).to_list()
        dim_check(cm, metric, False, True)
        type_check(cm, metric, False, True)
        count_check(cm, metric, False, True)

    # H2OBinomialModelMetrics.confusion_matrix() by thresholds
    bin_perf = gbm_bin.model_performance(train=True)
    for x in range(10):
        # Thresholds are looked up on the training metrics explicitly; the
        # previous code passed whatever values the loop variables ``t`` and
        # ``v`` happened to hold after the earlier loops.
        thresholds = [
            gbm_bin.find_threshold_by_max_metric(m, True, False)
            for m in random.sample(metrics, random.randint(1, len(metrics)))
        ]
        cms = bin_perf.confusion_matrix(thresholds=thresholds)
        if not isinstance(cms, list):
            cms = [cms]
        for idx, cm in enumerate(cms):
            cm = cm.to_list()
            dim_check(cm, thresholds[idx], True, False)
            type_check(cm, thresholds[idx], True, False)
            count_check(cm, thresholds[idx], True, False)

    # H2OMultinomialModelMetrics.confusion_matrix()
    mult_perf = gbm_mult.model_performance(valid=True)
    cm = mult_perf.confusion_matrix()
    cm_count = multinomial_total(cm)
    assert cm_count == air_test.nrow, "incorrect confusion matrix elements. Should sum to {0}, but got {1}". \
        format(air_test.nrow, cm_count)
def _chisq_median_labels(X):
    # Label every row of X: +1 when the row's sum of squared entries exceeds
    # the chi-square median, -1 otherwise. The degrees of freedom are taken
    # from the number of columns of X, and the median is evaluated once for
    # the whole matrix instead of once per row.
    threshold = scipy.stats.chi2.ppf(0.5, X.shape[1])
    return np.where(np.square(X).sum(axis=1) > threshold, 1, -1)


def bernoulli_synthetic_data_mediumGBM(ip, port):
    """Compare H2O GBM against scikit-learn GBM on synthetic bernoulli data.

    Both libraries are trained on the same randomly generated training set
    and scored on the same randomly generated test set. The check passes
    when the two test-set AUCs differ by less than 5e-3 in either direction.

    Args:
        ip: address handed to ``h2o.init``.
        port: port handed to ``h2o.init``.

    Raises:
        AssertionError: if the absolute AUC difference is 5e-3 or larger.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Generate training dataset (adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R)
    train_rows = 10000
    train_cols = 10

    # Generate variables V1, ... V10
    X_train = np.random.randn(train_rows, train_cols)

    # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_train = _chisq_median_labels(X_train)

    # Train scikit gbm
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=max_depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Generate testing dataset. It is drawn only after the scikit fit so the
    # sequence of statements touching the random generator stays as it was.
    test_rows = 2000
    test_cols = 10

    # Generate variables V1, ... V10
    X_test = np.random.randn(test_rows, test_cols)

    # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_test = _chisq_median_labels(X_test)

    # Score (AUC) the scikit gbm model on the test data
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

    # Compare this result to H2O. The label is stacked in front of the
    # predictors, so the response is column "C1" and x is every later column.
    train_h2o = H2OFrame(np.column_stack((y_train, X_train)).tolist())
    test_h2o = H2OFrame(np.column_stack((y_test, X_test)).tolist())

    gbm_h2o = h2o.gbm(x=train_h2o[1:], y=train_h2o["C1"].asfactor(),
                      distribution=distribution, ntrees=ntrees,
                      min_rows=min_rows, max_depth=max_depth,
                      learn_rate=learn_rate, nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
    # NOTE: the tolerance is two-sided, so an H2O AUC that is better than
    # scikit's by 5e-3 or more fails this check as well.
    assert abs(auc_h2o - auc_sci) < 5e-3, "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} " \
                                          "scickit auc: {1}".format(auc_h2o, auc_sci)
def pubdev_1829():
    """Check that resuming a GBM from a checkpoint matches a one-shot build.

    A 5-tree GBM is trained, then continued to 10 trees through the
    ``checkpoint`` argument. A second 10-tree GBM is trained from scratch
    with the same settings. The validation AUC, Gini coefficient and log
    loss of the two 10-tree models must be exactly equal.

    Raises:
        AssertionError: if any of the three validation metrics differ.
    """
    train_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_train.csv"))
    valid_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_valid.csv"))

    features = ["displacement", "power", "weight", "acceleration", "year"]
    target = "economy_20mpg"

    # Convert the response column to a factor in both frames.
    for frame in (train_frame, valid_frame):
        frame[target] = frame[target].asfactor()

    def fit_gbm(ntrees, max_depth, min_rows, **extra):
        # Every model shares these settings; the frame slices are taken
        # afresh for each model that is built.
        return h2o.gbm(x=train_frame[features],
                       y=train_frame[target],
                       ntrees=ntrees,
                       max_depth=max_depth,
                       min_rows=min_rows,
                       score_each_iteration=True,
                       distribution="bernoulli",
                       validation_x=valid_frame[features],
                       validation_y=valid_frame[target],
                       **extra)

    def validation_metric(model, name):
        # Call the named metric method on the model for the validation data.
        return getattr(model, name)(valid=True)

    # Stage one: the short model that serves as the checkpoint.
    first_stage = fit_gbm(ntrees=5, max_depth=5, min_rows=10)

    # "Model 2": continue from the checkpoint up to 10 trees.
    resumed = fit_gbm(ntrees=10, max_depth=5, min_rows=10,
                      checkpoint=first_stage._id)

    # "Model 4": the same 10 trees built without a checkpoint.
    one_shot = fit_gbm(ntrees=10, max_depth=5, min_rows=10)

    comparisons = (
        ("auc",
         "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}"),
        ("giniCoef",
         "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}"),
        ("logloss",
         "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}"),
    )
    for metric, message in comparisons:
        assert validation_metric(resumed, metric) == validation_metric(one_shot, metric), \
            message.format(validation_metric(resumed, metric),
                           validation_metric(one_shot, metric))