def fiftycatGBM(ip,port):
    """Train a bernoulli GBM on the 50-category train file and score the test file.

    Connects to the H2O instance at ``ip``:``port``, fits a GBM on columns
    ``x1`` and ``x2`` against the factor response ``y``, shows the model and
    its predictions for the test file, and finally requests the confusion
    matrix and AUC from the test-set performance object.
    """
    h2o.init(ip, port)

    # Per the original author's notes, the training file only contains
    # categories cat1..cat45 while the test file contains cat1..cat50.
    training = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    training["y"] = training["y"].asfactor()

    gbm_model = h2o.gbm(
        x=training[["x1", "x2"]],
        y=training["y"],
        distribution="bernoulli",
        ntrees=10,
        max_depth=5,
        nbins=20,
    )
    gbm_model.show()

    holdout = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Score the hold-out frame and display the predictions.
    scored = gbm_model.predict(holdout)
    scored.show()

    # Request the metrics; the values themselves are not checked further.
    metrics = gbm_model.model_performance(holdout)
    test_cm = metrics.confusion_matrix()
    test_auc = metrics.auc()
def frame_slicing(ip,port):
    """Check the supported H2OFrame indexing forms via ``h2o.as_list``.

    Connects to the H2O instance at ``ip``:``port``, imports the iris,
    prostate and airlines data sets, shows them, and then asserts known cell
    values for each indexing form: ``[int]``, ``[int, int]``, ``[int, slice]``,
    ``[slice, int]`` and ``[slice, slice]``.

    Raises:
        AssertionError: if any sliced value differs from the expected one by
            1e-10 or more.
    """
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    ###################################################################

    # H2OFrame[int] (column slice)
    res1 = h2o.as_list(iris[0])
    assert abs(res1[8][0] - 4.4) < 1e-10, "incorrect values"

    # H2OFrame[int,int]
    res2 = h2o.as_list(prostate[13, 3])
    assert abs(res2[0][0] - 1) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res3 = h2o.as_list(airlines[12, 0:3])
    assert abs(res3[0][0] - 1987) < 1e-10 and abs(res3[0][1] - 10) < 1e-10 and abs(res3[0][2] - 29) < 1e-10, \
        "incorrect values"

    # H2OFrame[slice, int]
    # Four rows (5, 6, 7, 8) are inspected below, so the slice has to run to 9;
    # a 5:8 slice only yields three rows under Python slice semantics and the
    # last check would index past the end.
    res4 = h2o.as_list(iris[5:9, 1])
    assert abs(res4[0][0] - 3.9) < 1e-10 and abs(res4[1][0] - 3.4) < 1e-10 and abs(res4[2][0] - 3.4) < 1e-10 and \
        abs(res4[3][0] - 2.9) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res5 = h2o.as_list(prostate[5:8, 0:3])
    assert abs(res5[0][0] - 6) < 1e-10 and abs(res5[1][1] - 0) < 1e-10 and abs(res5[2][2] - 61) < 1e-10, \
        "incorrect values"
def fiftycatRF(ip, port):
    """Train a random forest on the 50-category train file and score the test file.

    Connects to the H2O instance at ``ip``:``port``, fits a random forest on
    columns ``x1`` and ``x2`` against the factor response ``y``, predicts on
    the test file, shows the test-set performance and prints its confusion
    matrix.
    """
    h2o.init(ip, port)

    # Per the original author's notes, the training file only contains
    # categories cat1..cat45 while the test file contains cat1..cat50.
    training = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    training["y"] = training["y"].asfactor()

    forest = h2o.random_forest(
        x=training[["x1", "x2"]],
        y=training["y"],
        ntrees=50,
        max_depth=20,
        nbins=500,
    )

    holdout = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Score the hold-out frame and peek at the first predictions.
    scored = forest.predict(holdout)
    scored.head()

    # Show the metrics and print the confusion matrix.
    metrics = forest.model_performance(holdout)
    metrics.show()
    confusion = metrics.confusion_matrix()
    print(confusion)
def anomaly(ip, port):
    """Run the MNIST autoencoder anomaly-detection flow.

    Connects to the H2O instance at ``ip``:``port``, restricts the MNIST train
    and test frames to columns 0..783, trains a Tanh autoencoder with one
    hidden layer of 50 units, then asks the model for the per-row anomaly
    result and for the reconstruction of the test frame.
    """
    h2o.init(ip, port)
    print("Deep Learning Anomaly Detection MNIST")

    mnist_train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_test = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = range(0, 784)
    resp = 784  # index of the response column (digit), which is dropped below

    # Unsupervised: keep only the predictor columns.
    mnist_train = mnist_train[predictors]
    mnist_test = mnist_test[predictors]

    # 1) Learn what is normal: fit the autoencoder on the training frame.
    autoencoder = h2o.deeplearning(
        x=mnist_train[predictors],
        training_frame=mnist_train,
        activation="Tanh",
        autoencoder=True,
        hidden=[50],
        l1=1e-5,
        ignore_const_cols=False,
        epochs=1,
    )

    # 2) Detect outliers: per the original notes, this yields the per-row
    #    reconstruction error of the test frame.
    test_rec_error = autoencoder.anomaly(mnist_test)

    # 3) Obtain the autoencoded representation of the test frame.
    test_recon = autoencoder.predict(mnist_test)
def frame_slicing(ip,port):
    """Check the supported H2OFrame indexing forms by direct cell access.

    Connects to the H2O instance at ``ip``:``port``, imports and shows the
    iris, prostate and airlines data sets, then asserts known values for the
    ``[int]``, ``[int, int]``, ``[int, slice]``, ``[slice, int]`` and
    ``[slice, slice]`` indexing forms.
    """
    h2o.init(ip, port)
    tol = 1e-10

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    # H2OFrame[int] (column slice)
    first_col = iris[0]
    assert abs(first_col[8] - 4.4) < tol, "incorrect values"

    # H2OFrame[int,int]
    single_cell = prostate[13, 3]
    assert abs(single_cell - 1) < tol, "incorrect values"

    # H2OFrame[int, slice]
    row_part = airlines[12, 0:3]
    assert all(
        abs(row_part[0, col] - want) < tol
        for col, want in enumerate([1987, 10, 29])
    ), "incorrect values"

    # H2OFrame[slice, int]
    col_part = iris[5:8, 1]
    assert all(
        abs(col_part[row] - want) < tol
        for row, want in enumerate([3.9, 3.4, 3.4])
    ), "incorrect values"

    # H2OFrame[slice, slice] -- checked along the diagonal of the sub-frame
    sub_frame = prostate[5:8, 0:3]
    assert all(
        abs(sub_frame[k, k] - want) < tol
        for k, want in enumerate([6, 0, 61])
    ), "incorrect values"
def asnumeric(ip,port):
    """Exercise ``h2o.asnumeric`` on a whole frame and on a single column.

    Connects to the H2O instance at ``ip``:``port``.  In each case the
    ``cylinders`` column of cars.csv is first converted with ``ascharacter()``
    and asserted to be a factor, then converted back with ``h2o.asnumeric``.
    Subtracting the column from itself and filtering on ``== 0`` must keep
    every row of the original frame.
    """
    h2o.init(ip, port)
    cars_path = "smalldata/junit/cars.csv"

    cars = h2o.import_frame(path=h2o.locate(cars_path))
    rows = cars.nrow()

    cars['cylinders'] = cars['cylinders'].ascharacter()
    assert cars["cylinders"].isfactor(), "expected the column to be a factor"

    # H2OFrame case
    cars = h2o.asnumeric(cars)
    cars['cylinders'] = cars['cylinders'] - cars['cylinders']
    cars = cars[cars['cylinders'] == 0]
    assert cars.nrow() == rows, \
        "expected the same number of rows as before {0}, but got {1}".format(rows, cars.nrow())

    cars = h2o.import_frame(path=h2o.locate(cars_path))

    cars['cylinders'] = cars['cylinders'].ascharacter()
    assert cars["cylinders"].isfactor(), "expected the column to be a factor"

    # H2OVec case
    cars['cylinders'] = h2o.asnumeric(cars['cylinders'])
    cars['cylinders'] = cars['cylinders'] - cars['cylinders']
    cars = cars[cars['cylinders'] == 0]
    assert cars.nrow() == rows, \
        "expected the same number of rows as before {0}, but got {1}".format(rows, cars.nrow())
def hit_ratio_test(ip, port):
    """Show the hit-ratio tables of a multinomial GBM on the airlines data.

    Connects to the H2O instance at ``ip``:``port``, trains a multinomial GBM
    predicting ``fDayOfWeek`` with a validation frame, and shows the
    training, validation and test hit-ratio tables.  The validation and test
    frames are both imported from AirlinesTest.csv.zip.
    """
    h2o.init(ip, port)

    predictor_names = ("Origin", "Dest", "Distance", "UniqueCarrier",
                       "IsDepDelayed", "fDayofMonth", "fMonth")

    air_train = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_valid = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    # Build the four model inputs in the same order the call consumes them.
    train_x = air_train[list(predictor_names)]
    train_y = air_train["fDayOfWeek"].asfactor()
    valid_x = air_valid[list(predictor_names)]
    valid_y = air_valid["fDayOfWeek"].asfactor()

    gbm_mult = h2o.gbm(x=train_x,
                       y=train_y,
                       validation_x=valid_x,
                       validation_y=valid_y,
                       distribution="multinomial")

    gbm_mult.hit_ratio_table(train=True).show()
    gbm_mult.hit_ratio_table(valid=True).show()

    test_perf = gbm_mult.model_performance(air_test)
    test_perf.hit_ratio_table().show()
def fiftycatGBM(ip,port):
    """Train a GBM (``loss="bernoulli"``) on the 50-category data and score it.

    Connects to the H2O instance at ``ip``:``port``, fits a GBM on columns
    ``x1`` and ``x2`` against the factor response ``y``, shows the model and
    the test-file predictions, and requests the confusion matrices and AUC
    from the test-set performance object.
    """
    h2o.init(ip, port)

    # Per the original author's notes, the training file only contains
    # categories cat1..cat45 while the test file contains cat1..cat50.
    train_frame = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train_frame["y"] = train_frame["y"].asfactor()

    settings = dict(loss="bernoulli", ntrees=10, max_depth=5, nbins=20)
    fitted = h2o.gbm(x=train_frame[["x1", "x2"]], y=train_frame["y"], **settings)
    fitted.show()

    test_frame = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Predict on the test frame and display the result.
    test_predictions = fitted.predict(test_frame)
    test_predictions.show()

    # Request the metrics; the values themselves are not checked further.
    test_perf = fitted.model_performance(test_frame)
    test_cm = test_perf.confusion_matrices()
    test_auc = test_perf.auc()
def asnumeric(ip, port):
    """Verify ``h2o.asnumeric`` for both the frame and the single-column case.

    Connects to the H2O instance at ``ip``:``port``.  For each case the
    ``cylinders`` column of cars.csv is converted with ``ascharacter()``,
    asserted to be a factor, converted back with ``h2o.asnumeric`` and then
    subtracted from itself; filtering on ``== 0`` must retain all rows.
    """
    h2o.init(ip, port)

    factor_msg = "expected the column to be a factor"
    rows_msg = "expected the same number of rows as before {0}, but got {1}"

    frame = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    rows = frame.nrow()

    frame['cylinders'] = frame['cylinders'].ascharacter()
    assert frame["cylinders"].isfactor(), factor_msg

    # H2OFrame case: convert the whole frame.
    frame = h2o.asnumeric(frame)
    frame['cylinders'] = frame['cylinders'] - frame['cylinders']
    frame = frame[frame['cylinders'] == 0]
    assert frame.nrow() == rows, rows_msg.format(rows, frame.nrow())

    frame = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))

    frame['cylinders'] = frame['cylinders'].ascharacter()
    assert frame["cylinders"].isfactor(), factor_msg

    # H2OVec case: convert only the one column.
    frame['cylinders'] = h2o.asnumeric(frame['cylinders'])
    frame['cylinders'] = frame['cylinders'] - frame['cylinders']
    frame = frame[frame['cylinders'] == 0]
    assert frame.nrow() == rows, rows_msg.format(rows, frame.nrow())
def _assert_same_predictions(first, second, label):
    """Assert that column 0 of two prediction frames matches row by row."""
    for row in range(first.nrow()):
        p1 = first[row, 0]
        p2 = second[row, 0]
        assert p1 == p2, "expected {0} predictions to be the same for row {1}, but got {2} and {3}" \
                         "".format(label, row, p1, p2)


def get_model_test(ip,port):
    """Check that ``h2o.get_model`` returns models equivalent to the originals.

    Connects to the H2O instance at ``ip``:``port`` and, for a regression GBM,
    a bernoulli GBM and a multinomial deep-learning model, fetches the model
    again by its key, asserts the reported model category and asserts that
    the re-fetched model predicts exactly what the original predicted on the
    test split.  A k-means model is also re-fetched and its category checked.

    Raises:
        AssertionError: if a model category is wrong or any prediction
            differs between the original and the re-fetched model.
    """
    h2o.init(ip, port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Disjoint 70/30 split.  The test threshold mirrors the train threshold so
    # that the two frames do not share rows.
    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    _assert_same_predictions(predictions1, predictions2, "regression")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    _assert_same_predictions(predictions1, predictions2, "binomial")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    _assert_same_predictions(predictions1, predictions2, "multinomial")
def bernoulliGBM(ip,port):
    """Compare the test AUC of an H2O bernoulli GBM against scikit-learn's GBM.

    Connects to the H2O instance at ``ip``:``port``.  Both libraries are
    trained on prostate_train.csv with the same tree count, learning rate,
    depth and minimum leaf size, and scored on prostate_test.csv.  The H2O AUC
    must be at least the scikit AUC.
    """
    h2o.init(ip, port)

    # Shared hyper-parameters for both implementations.
    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # --- training data -----------------------------------------------------
    h2o_train = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate_train.csv"))
    h2o_train["CAPSULE"] = h2o_train["CAPSULE"].asfactor()

    # Same file as a numpy array for scikit: column 0 is used as the response,
    # the remaining columns as features.
    np_train = np.loadtxt(h2o.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
    train_response = np_train[:, 0]
    train_features = np_train[:, 1:]

    # --- models ------------------------------------------------------------
    gbm_h2o = h2o.gbm(x=h2o_train[1:],
                      y=h2o_train["CAPSULE"],
                      ntrees=ntrees,
                      learn_rate=learning_rate,
                      max_depth=depth,
                      min_rows=min_rows,
                      distribution="bernoulli")

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learning_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(train_features, train_response)

    # --- test data ---------------------------------------------------------
    h2o_test = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate_test.csv"))
    h2o_test["CAPSULE"] = h2o_test["CAPSULE"].asfactor()

    np_test = np.loadtxt(h2o.locate("smalldata/logreg/prostate_test.csv"), delimiter=',', skiprows=1)
    test_response = np_test[:, 0]
    test_features = np_test[:, 1:]

    # --- scoring -----------------------------------------------------------
    positive_proba = gbm_sci.predict_proba(test_features)[:, 1]
    auc_sci = roc_auc_score(test_response, positive_proba)

    auc_h2o = gbm_h2o.model_performance(h2o_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def headers(ip,port):
    """Check that ``col_names`` taken from a header-only file are applied on import.

    Connects to the H2O instance at ``ip``:``port``, imports the header-only
    airlines file, imports allyears2k.zip passing that frame as ``col_names``,
    prints both name lists and asserts that they are equal.
    """
    h2o.init(ip, port)

    header_frame = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    data_frame = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k.zip"), col_names=header_frame)

    print(header_frame.names())
    print(data_frame.names())

    assert header_frame.names() == data_frame.names(), \
        "Expected the same column names but got {0} and {1}".format(header_frame.names(), data_frame.names())
def deeplearning_autoencoder(ip, port):
    """Use autoencoder deep features as inputs to a random forest on MNIST.

    Connects to the H2O instance at ``ip``:``port``.  Half of the MNIST
    training frame trains an unsupervised autoencoder with a 20-unit hidden
    layer; the other half is pushed through it to obtain 20 deep features
    that train a random forest.  The forest's confusion matrix on the
    deep-feature version of the test frame is shown and its
    ``cell_values[10][10]`` entry must be within 0.001 of 0.1057.
    """
    h2o.init(ip, port)

    resp = 784      # index of the label column
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # Seeded uniform column used to split the training frame in two.
    sid = train_hex[1].runif(1234)

    # Rows for the autoencoder (unsupervised).
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # Rows for the random forest (supervised).
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # Train the autoencoder.  reproducible=True is slow; the original note
    # says to turn it off for real problems.
    ae_model = h2o.deeplearning(
        x=train_unsupervised.drop(resp),
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,
        seed=1234,
    )

    # Convert the supervised rows to the lower-dimensional feature space.
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()
    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on the extracted features.
    drf_model = h2o.random_forest(
        x=train_supervised_features,
        y=train_supervised[resp].asfactor(),
        ntrees=10,
        seed=1234,
    )

    # Process the test frame through the same deep features.
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    # NOTE(review): the return value of cbind is not captured -- confirm that
    # cbind modifies test_features in place in this h2o version.
    test_features.cbind(test_hex[resp])

    # Confusion matrix and assertion (about 10% error, +/- 0.001).
    cm = drf_model.confusion_matrix(test_features)
    cm.show()
    assert abs(cm.cell_values[10][10] - 0.1057) < 0.001, "Error not as expected"
def pub_445_long_request_uri(ip,port):
    """Build a GBM on all MNIST pixel columns (test for issue PUB-445).

    Connects to the H2O instance at ``ip``:``port``, imports the MNIST
    training file twice (once as the training frame, once as the validation
    frame), names column 784 ``label`` on both and trains a GBM on every
    other column.  The test only needs the model build to complete.
    """
    h2o.init(ip, port)

    mnist_path = "bigdata/laptop/mnist/train.csv.gz"
    mnistTrain = h2o.import_frame(path=h2o.locate(mnist_path))
    mnistTest = h2o.import_frame(path=h2o.locate(mnist_path))

    # Rename the last column so it can be addressed by name below.
    mnistTrain[784]._name = "label"
    mnistTest[784]._name = "label"

    train_x = mnistTrain.drop("label")
    train_y = mnistTrain["label"]
    valid_x = mnistTest.drop("label")
    valid_y = mnistTest["label"]

    mnistModel = h2o.gbm(x=train_x,
                         y=train_y,
                         validation_x=valid_x,
                         validation_y=valid_y,
                         ntrees=100,
                         max_depth=10)
def hdfs_basic(ip, port):
    """Import the iris data from HDFS as a single file and as a file list.

    Connects to the H2O instance at ``ip``:``port``.  When
    ``h2o.is_running_internal_to_h2o()`` is false the test only prints a
    notice.  Otherwise it imports one iris file, then a test/train pair of
    files, and asserts in both cases that the result is an ``H2OFrame`` with
    150 rows.
    """
    h2o.init(ip, port)

    # HDFS is only reachable from inside the H2O network.
    if not h2o.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = h2o.get_h2o_internal_hdfs_name_node()
    iris_file = "/datasets/runit/iris_wheader.csv"
    iris_dir = "/datasets/runit/iris_test_train"

    #----------------------------------------------------------------------
    # Single file case.
    #----------------------------------------------------------------------
    print("Testing single file importHDFS")
    iris_h2o = h2o.import_frame("hdfs://{0}{1}".format(name_node, iris_file))
    iris_h2o.head()
    iris_h2o.tail()
    n = iris_h2o.nrow()
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
    assert isinstance(iris_h2o, h2o.H2OFrame), \
        "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
    print("Import worked")

    #----------------------------------------------------------------------
    # Multiple files case.
    #----------------------------------------------------------------------
    print("Testing directory importHDFS")
    part_urls = [
        "hdfs://{0}{1}/iris_test.csv".format(name_node, iris_dir),
        "hdfs://{0}{1}/iris_train.csv".format(name_node, iris_dir),
    ]
    iris_dir_h2o = h2o.import_frame(part_urls)
    iris_dir_h2o.head()
    iris_dir_h2o.tail()
    n = iris_dir_h2o.nrow()
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
    assert isinstance(iris_dir_h2o, h2o.H2OFrame), \
        "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_dir_h2o))
    print("Import worked")
def frame_show(ip,port):
    """Import the iris, prostate and airlines data sets and show each one.

    Connects to the H2O instance at ``ip``:``port``; all three frames are
    imported before the first one is shown.
    """
    h2o.init(ip, port)

    data_files = (
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv.zip",
        "smalldata/airlines/allyears2k.zip",
    )

    # Import everything first, then display, as the original sequence did.
    loaded = [h2o.import_frame(path=h2o.locate(name)) for name in data_files]
    for frame in loaded:
        frame.show()
def bernoulliGBM(ip,port):
    """Assert that H2O's bernoulli GBM matches or beats scikit-learn's test AUC.

    Connects to the H2O instance at ``ip``:``port``, trains both GBM
    implementations on prostate_train.csv with identical settings and
    compares their AUC on prostate_test.csv.
    """
    h2o.init(ip, port)

    train_csv = "smalldata/logreg/prostate_train.csv"
    test_csv = "smalldata/logreg/prostate_test.csv"

    # H2O training frame with CAPSULE as a factor response.
    prostate_train = h2o.import_frame(path=h2o.locate(train_csv))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Numpy copy of the training file for scikit (first column = response).
    trainData = np.loadtxt(h2o.locate(train_csv), delimiter=',', skiprows=1)
    trainDataResponse, trainDataFeatures = trainData[:, 0], trainData[:, 1:]

    ntrees, learning_rate, depth, min_rows = 100, 0.1, 5, 10

    # H2O GBM classification model.
    gbm_h2o = h2o.gbm(
        x=prostate_train[1:],
        y=prostate_train["CAPSULE"],
        ntrees=ntrees,
        learn_rate=learning_rate,
        max_depth=depth,
        min_rows=min_rows,
        distribution="bernoulli",
    )

    # scikit GBM classification model with the same parameters.
    gbm_sci = ensemble.GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=ntrees,
        max_depth=depth,
        min_samples_leaf=min_rows,
        max_features=None,
    )
    gbm_sci.fit(trainDataFeatures, trainDataResponse)

    # H2O test frame with CAPSULE as a factor response.
    prostate_test = h2o.import_frame(path=h2o.locate(test_csv))
    prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()

    # Numpy copy of the test file for scikit.
    testData = np.loadtxt(h2o.locate(test_csv), delimiter=',', skiprows=1)
    testDataResponse, testDataFeatures = testData[:, 0], testData[:, 1:]

    # Score both models on the test data.
    auc_sci = roc_auc_score(testDataResponse, gbm_sci.predict_proba(testDataFeatures)[:, 1])
    gbm_perf = gbm_h2o.model_performance(prostate_test)
    auc_h2o = gbm_perf.auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def col_names_check(ip,port):
    """Check ``col_names()`` for an iris file with a header and one without.

    Connects to the H2O instance at ``ip``:``port``.  The file with a header
    must report its own column names; the header-less file must report the
    names ``C1`` through ``C5``.
    """
    h2o.init(ip, port)

    with_header = h2o.import_frame(h2o.locate("smalldata/iris/iris_wheader.csv"))
    named = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    assert with_header.col_names() == named, \
        "Expected {0} for column names but got {1}".format(named, with_header.col_names())

    without_header = h2o.import_frame(h2o.locate("smalldata/iris/iris.csv"))
    generated = ["C1","C2","C3","C4","C5"]
    assert without_header.col_names() == generated, \
        "Expected {0} for column names but got {1}".format(generated, without_header.col_names())
def deeplearning_autoencoder(ip, port):
    """Feed autoencoder deep features into a random forest on MNIST.

    Connects to the H2O instance at ``ip``:``port``.  One half of the MNIST
    training frame trains an autoencoder with a 20-unit hidden layer, the
    other half is converted to deep features that train a random forest.  The
    forest's confusion matrix on the converted test frame is shown and its
    ``["Totals", "Error"]`` entry must be within 0.001 of 0.1038.
    """
    h2o.init(ip, port)

    resp = 784      # index of the label column
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # Seeded uniform column that splits the training data into two parts.
    sid = train_hex[1].runif(1234)

    # Unsupervised part, for the autoencoder.
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # Supervised part, for the random forest.
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # Train the autoencoder.  Per the original note the y argument is ignored
    # (any non-constant column will do) and reproducible=True is slow.
    ae_inputs = train_unsupervised.drop(resp)
    ae_response = train_unsupervised[resp]
    ae_model = h2o.deeplearning(x=ae_inputs,
                                y=ae_response,
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,
                                seed=1234)

    # Convert the supervised part to the lower-dimensional space.
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()
    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on the extracted feature space.
    drf_model = h2o.random_forest(x=train_supervised_features,
                                  y=train_supervised[resp].asfactor(),
                                  ntrees=10,
                                  seed=1234)

    # Process the test frame through the deep features as well.
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    # NOTE(review): the return value of cbind is not captured -- confirm that
    # cbind modifies test_features in place in this h2o version.
    test_features.cbind(test_hex[resp])

    # Confusion matrix and assertion (about 10% error, +/- 0.001).
    cm = drf_model.confusionMatrix(test_features)
    cm.show()
    assert abs(cm["Totals", "Error"] - 0.1038) < 0.001, "Error not as expected"
def pub_445_long_request_uri(ip,port):
    """Build a GBM on 784 MNIST predictor columns (test for issue PUB-445).

    Connects to the H2O instance at ``ip``:``port``, imports the MNIST
    training file as both the training and the validation frame, names
    column 784 ``label`` via ``setName`` and trains a GBM on columns 0..783.
    The test only needs the model build to complete.
    """
    h2o.init(ip, port)

    mnistTrain = h2o.import_frame(path=h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnistTest = h2o.import_frame(path=h2o.locate("bigdata/laptop/mnist/train.csv.gz"))

    # Give the last column a name on both frames.
    for frame in (mnistTrain, mnistTest):
        frame.setName(col=784, name="label")

    gbm_args = dict(ntrees=100, max_depth=10)
    mnistModel = h2o.gbm(x=mnistTrain[0:784],
                         y=mnistTrain["label"],
                         validation_x=mnistTest[0:784],
                         validation_y=mnistTest["label"],
                         **gbm_args)
def headers(ip, port):
    """Verify that column names supplied through ``col_names`` are used on import.

    Connects to the H2O instance at ``ip``:``port``, imports the header-only
    airlines file, imports allyears2k.zip with that frame passed as
    ``col_names``, prints both sets of names and asserts they match.
    """
    h2o.init(ip, port)

    names_only = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    with_data = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k.zip"),
                                 col_names=names_only)

    print(names_only.names())
    print(with_data.names())

    mismatch = "Expected the same column names but got {0} and {1}"
    assert names_only.names() == with_data.names(), \
        mismatch.format(names_only.names(), with_data.names())
def col_names_check(ip, port):
    """Assert the column names reported for iris with and without a header row.

    Connects to the H2O instance at ``ip``:``port``.  iris_wheader.csv must
    report the names from its header; iris.csv must report ``C1``..``C5``.
    """
    h2o.init(ip, port)

    # File that carries its own header line.
    iris_wheader = h2o.import_frame(h2o.locate("smalldata/iris/iris_wheader.csv"))
    want = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    assert iris_wheader.col_names() == want, \
        "Expected {0} for column names but got {1}".format(want, iris_wheader.col_names())

    # File without a header line.
    iris = h2o.import_frame(h2o.locate("smalldata/iris/iris.csv"))
    want = ["C1","C2","C3","C4","C5"]
    assert iris.col_names() == want, \
        "Expected {0} for column names but got {1}".format(want, iris.col_names())
def frame_as_list(ip,port):
    """Show the result of the modulo operator on frames and single columns.

    Connects to the H2O instance at ``ip``:``port``.  Each ``show()`` call is
    wrapped in ``print`` exactly as before, so whatever ``show()`` returns is
    printed as well.
    """
    h2o.init(ip, port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    whole_frame_mod = prostate % 10
    print(whole_frame_mod.show())
    single_col_mod = prostate[4] % 10
    print(single_col_mod.show())

    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    arr_time_mod = airlines["CRSArrTime"] % 100
    print(arr_time_mod.show())
def frame_show(ip, port):
    """Display the iris, prostate and airlines frames.

    Connects to the H2O instance at ``ip``:``port``, imports the three data
    sets and then calls ``show()`` on each, in import order.
    """
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.import_frame(path=iris_path)

    prostate_path = h2o.locate("smalldata/prostate/prostate.csv.zip")
    prostate = h2o.import_frame(path=prostate_path)

    airlines_path = h2o.locate("smalldata/airlines/allyears2k.zip")
    airlines = h2o.import_frame(path=airlines_path)

    for frame in (iris, prostate, airlines):
        frame.show()
def cupMediumGBM(ip,port):
    """Train a small bernoulli GBM on the cup98 learning set.

    Connects to the H2O instance at ``ip``:``port``, imports the cup98
    learning and validation files, makes ``TARGET_B`` a factor and fits a
    5-tree GBM on every column except ``''``, ``TARGET_D``, ``TARGET_B`` and
    ``CONTROLN``.
    """
    h2o.init(ip, port)

    train = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98LRN_z.csv"))
    test = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98VAL_z.csv"))

    train["TARGET_B"] = train["TARGET_B"].asfactor()

    # Drop the non-predictor columns; remove() raises ValueError when a name
    # is absent.
    predictor_cols = train.names()
    excluded = ('', "TARGET_D", "TARGET_B", "CONTROLN")
    for col_name in excluded:
        predictor_cols.remove(col_name)

    model = h2o.gbm(x=train[predictor_cols],
                    y=train["TARGET_B"],
                    distribution="bernoulli",
                    ntrees=5)
def frame_as_list(ip, port):
    """Show the result of the modulo operator on frames and single columns.

    Connects to the H2O instance at ``ip``:``port``, then shows
    ``prostate % 10``, ``prostate[4] % 10`` and
    ``airlines["CRSArrTime"] % 100``.
    """
    h2o.init(ip, port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    frame_mod = prostate % 10
    frame_mod.show()
    column_mod = prostate[4] % 10
    column_mod.show()

    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    arr_time_mod = airlines["CRSArrTime"] % 100
    arr_time_mod.show()
def cupMediumGBM(ip,port):
    """Fit a 5-tree bernoulli GBM on the cup98 learning set.

    Connects to the H2O instance at ``ip``:``port``, imports the cup98
    learning and validation files, converts ``TARGET_B`` to a factor and
    trains on every column except ``C1``, ``TARGET_D``, ``TARGET_B`` and
    ``CONTROLN``.
    """
    h2o.init(ip, port)

    learn_path = h2o.locate("bigdata/laptop/usecases/cup98LRN_z.csv")
    train = h2o.import_frame(path=learn_path)
    valid_path = h2o.locate("bigdata/laptop/usecases/cup98VAL_z.csv")
    test = h2o.import_frame(path=valid_path)

    train["TARGET_B"] = train["TARGET_B"].asfactor()

    # Start from all column names and strip the ones that must not be used as
    # predictors; remove() raises ValueError when a name is absent.
    train_cols = train.names()
    for unwanted in ['C1', "TARGET_D", "TARGET_B", "CONTROLN"]:
        train_cols.remove(unwanted)

    gbm_settings = dict(distribution="bernoulli", ntrees=5)
    model = h2o.gbm(x=train[train_cols], y=train["TARGET_B"], **gbm_settings)
def expr_slicing(ip,port):
    """Check slicing of frames produced by arithmetic expressions.

    Connects to the H2O instance at ``ip``:``port``, imports and shows iris,
    then slices ``2 - iris`` and ``iris * 2`` with the ``[int]``,
    ``[int, int]``, ``[int, slice]``, ``[slice, int]`` and ``[slice, slice]``
    forms, asserting known values for each.
    """
    h2o.init(ip, port)
    tol = 1e-10

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    # H2OFrame[int] (column slice)
    res = 2 - iris
    first_col = res[0]
    assert all(
        abs(first_col[row, :] - want) < tol
        for row, want in [(3, -2.6), (17, -3.1), (24, -2.8)]
    ), "incorrect values"

    # H2OFrame[int,int]
    assert abs(res[13, 3] - 1.9) < tol, "incorrect values"

    # H2OFrame[int, slice]
    one_row = res[12, 0:4]
    assert all(
        abs(one_row[0, col] - want) < tol
        for col, want in enumerate([-2.8, -1.0, 0.6, 1.9])
    ), "incorrect values"

    # H2OFrame[slice, int]
    one_col = res[5:9, 1]
    assert all(
        abs(one_col[row, :] - want) < tol
        for row, want in enumerate([-1.9, -1.4, -1.4, -0.9])
    ), "incorrect values"

    # H2OFrame[slice, slice] -- checked along the diagonal of the sub-frame
    res = iris * 2
    sub_frame = res[5:9, 0:4]
    assert all(
        abs(sub_frame[k, k] - want) < tol
        for k, want in enumerate([10.8, 6.8, 3.0, 0.4])
    ), "incorrect values"
def deep_learning_metrics_test(ip, port):
    """Train a deep-learning classifier on prostate data and show its test metrics.

    Connects to the existing H2O cluster at ``ip``:``port``, prepares the
    prostate frame (drops ``ID``, makes ``CAPSULE`` a factor, blanks zero
    ``VOL`` values), splits it roughly 80/20 on a uniform random column,
    trains a three-layer deep-learning model on the training part and shows
    the model performance on the test part.
    """
    h2o.init(ip, port)  # connect to existing cluster

    # Resolve the data file through h2o.locate, like the other tests that load
    # this file, instead of relying on the current working directory.
    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    del df['ID']                              # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = None                      # 0 VOL means 'missing'

    split = vol.runif()                       # random train/test split
    train = df[split < 0.8]
    test = df[split >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    test.describe()
    test.head()

    # Run DeepLearning
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:], y=train['CAPSULE'], epochs=100, hidden=[10, 10, 10])

    print("Binomial Model Metrics: ")
    print(dl.model_performance(test).show())
def iris_h2o_vs_sciKmeans(ip,port):
    """Compare H2O and scikit-learn k-means centers on iris from fixed start points.

    Connects to the pre-existing H2O cluster at ``ip``:``port``.  Both
    implementations cluster the first four iris columns into three clusters
    starting from the same three user-supplied points; the resulting centers
    are printed and must agree coordinate by coordinate.

    Raises:
        AssertionError: if any pair of corresponding center coordinates
            differs by 1e-10 or more in absolute value.
    """
    h2o.init(ip, port)

    iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # Identical starting centers for both implementations.
    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # Compare the magnitude of the difference: without abs() any
            # negative difference would pass no matter how large it is.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def getModelKmeans(ip, port):
    """Build k-means models for k = 2..6 and re-fetch each one by key.

    Connects to the pre-existing H2O cluster at ``ip``:``port``.  For every k
    an H2O k-means model is built on benign.csv and shown, the model is
    fetched again through ``h2o.getModel`` and shown, and a scikit-learn
    k-means is fitted on a mean-imputed numpy copy of the same file, whose
    centers are printed.
    """
    h2o.init(ip, port)

    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")

    # Impute missing values with the column mean for the scikit side.
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = mean_imputer.fit_transform(benign_sci)

    for n_clusters in range(2, 7):
        km_h2o = h2o.kmeans(x=benign_h2o, k=n_clusters)
        km_h2o.show()

        # TODO: implement h2o.getModel()
        fetched = h2o.getModel(km_h2o._key)
        fetched.show()

        km_sci = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def cv_nfoldsGBM(ip, port):
    """Build a 5-fold cross-validated GBM and check that nfolds excludes validation data.

    Connects to the H2O instance at ``ip``:``port``, trains and shows a
    bernoulli GBM with ``nfolds=5`` on the prostate data, then expects an
    ``EnvironmentError`` when ``nfolds`` and validation data are passed
    together.
    """
    h2o.init(ip, port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    cv_model = h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds=5, distribution="bernoulli")
    cv_model.show()

    # Can't specify both nfolds >= 2 and validation data at once
    try:
        h2o.gbm(y=prostate[1],
                x=prostate[2:9],
                nfolds=5,
                validation_y=prostate[1],
                validation_x=prostate[2:9],
                distribution="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"
def bigcatGBM(ip, port):
    """Train a single-stump GBM on the bigcat data and show its performance.

    Connects to the H2O instance at ``ip``:``port``, fits a GBM with one tree
    of depth one and 100 bins on column ``X`` against the factor response
    ``y``, shows the model, shows its performance on the training frame and
    requests the AUC.
    """
    h2o.init(ip, port)

    frame = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    frame["y"] = frame["y"].asfactor()

    stump = h2o.gbm(x=frame[["X"]],
                    y=frame["y"],
                    loss="bernoulli",
                    ntrees=1,
                    max_depth=1,
                    nbins=100)
    stump.show()

    metrics = stump.model_performance(frame)
    metrics.show()

    # Only the AUC is requested; its value is not checked.
    test_auc = metrics.auc()
def trim_check(ip, port):
    """Check that ``trim()`` strips whitespace from the ``name`` column of cars_trim.csv.

    The first three trimmed values are compared against known car names, once
    through the frame-slice path and once through the vec path.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    :raises AssertionError: if any trimmed value differs from the expected name.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    frame = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_trim.csv"))

    expected_names = [
        "AMC Ambassador Brougham",
        "AMC Ambassador DPL",
        "AMC Ambassador SST",
    ]

    # single column (frame)
    trimmed_frame = frame["name"].trim()
    for row, expected in enumerate(expected_names):
        actual = trimmed_frame[row, 0]
        assert actual == expected, \
            "Expected '{0}', but got {1}".format(expected, actual)

    # single column (vec)
    vec = frame["name"]
    trimmed_vec = vec.trim()
    for row, expected in enumerate(expected_names):
        # Report the value actually under test; the failure message used to
        # print trimmed_frame's value here.
        actual = trimmed_vec[row, 0]
        assert actual == expected, \
            "Expected '{0}', but got {1}".format(expected, actual)
def sdev(ip, port):
    """Compare H2O's per-column ``sd()`` on iris with numpy's sample standard deviation.

    Also checks that ``sd()`` fails on the categorical column and on a
    multi-column slice.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    iris_h2o = h2o.import_frame(path=iris_path)
    iris_np = np.genfromtxt(iris_path,
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    # ddof=1 gives the sample (n - 1) standard deviation.
    sd_np = np.std(iris_np, axis=0, ddof=1)
    for col, expected_sd in enumerate(sd_np):
        sd_h2o = iris_h2o[col].sd()
        assert abs(expected_sd - sd_h2o.eager()) < 1e-10, \
            "expected standard deviations to be the same"

    # Column 4 is categorical, so sd() must be rejected.
    try:
        iris_h2o[4].sd().eager()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    # sd() on more than one column must be rejected as well.
    try:
        iris_h2o[0:2].sd().eager()
    except AttributeError:
        pass
    else:
        assert False, "expected an error. more than one column."
def link_functions_gaussian(ip, port):
    """Compare H2O and statsmodels gaussian GLMs (identity link) on the prostate data.

    Both models predict GLEASON from the same eight columns.  The test passes
    when H2O's residual/null deviance ratio is no more than 0.01 above the
    statsmodels ratio.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    :raises AssertionError: if H2O's deviance ratio is worse by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    zip_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=zip_path)
    h2o_data.head()

    # Read the same CSV for statsmodels.  The context managers close the
    # archive and member handle; `.values` replaces DataFrame.as_matrix(),
    # which newer pandas releases no longer provide.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX],
                        y=h2o_data[myY],
                        family="gaussian",
                        link="identity",
                        alpha=[0.5],
                        Lambda=[0])
    sm_model = sm.GLM(
        endog=sm_data_response,
        exog=sm_data_features,
        family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better, but not noticeably worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def iris_h2o_vs_sciKmeans(ip, port):
    """Check that H2O and scikit-learn k-means agree on iris from the same start points.

    Both implementations are seeded with the same three initial centers and
    run without standardization; the resulting centers must match
    coordinate by coordinate.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    :raises AssertionError: if any center coordinate differs by 1e-10 or more.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris.csv")
    iris_h2o = h2o.import_frame(path=iris_path)
    iris_sci = np.genfromtxt(iris_path, delimiter=',')
    # Keep only the four numeric measurement columns.
    iris_sci = iris_sci[:, 0:4]

    # Shared initial centers for both implementations.
    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # Compare the magnitude of the difference; without abs() any
            # negative difference would pass regardless of its size.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def expr_show(ip,port):
    """Exercise ``show()`` on expressions built from the iris frame and from local lists.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print("iris:")
    iris.show()

    ###################################################################

    difference = 2 - iris

    # expr[int], expr._data is pending
    first_slice = difference[0]
    print("res2:")
    first_slice.show()

    # expr[int], expr._data is remote
    second_slice = difference[0]
    print("res3:")
    second_slice.show()

    # expr[int] and expr[tuple] cases where expr._data is local: first a flat
    # list, then a nested one.
    for local_data in ([1,2,3], [[1,2,3], [4,5,6]]):
        expr = Expr(local_data)
        print("expr:")
        expr.show()
def group_by(ip,port):
    """Smoke-test ``h2o.group_by`` on iris and compare a few aggregates with pandas.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    h2o_iris = h2o.import_frame(path=iris_path)
    pd_iris = pd.read_csv(iris_path)

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    # smoke test: every aggregate x NA mode x numeric column must simply run
    for agg in h2o_agg_funcs:
        for na_mode in na_handling:
            for col in col_names:
                h2o.group_by(h2o_iris, ["class"], {"foo":[agg,col,na_mode]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for agg, np_func in h2o_np_agg_dict.items():
        for col in col_names:
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[agg,col,"all"]})
            pd_res = pd_iris.groupby("class")[col].aggregate(np_func)
            for row in range(3):
                h2o_val = h2o_res[row,1]
                # Look the pandas value up by the class label H2O reports.
                pd_val = pd_res[h2o_res[row,0]]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} " \
                    "values between h2o and pandas on column {3}".format(h2o_val,pd_val,agg,col)
def group_by(ip,port):
    """Smoke-test ``h2o.group_by`` on iris and compare a few aggregates with pandas.

    Progress is printed for every combination that is run.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    h2o_iris = h2o.import_frame(path=iris_path)
    pd_iris = pd.read_csv(iris_path)

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    print("Running smoke test")

    # smoke test: every aggregate x NA mode x numeric column must simply run
    for agg in h2o_agg_funcs:
        for na_mode in na_handling:
            for col in col_names:
                print("group by : " + str(agg) + "; " + str(na_mode) + "; " + str(col))
                h2o.group_by(h2o_iris, ["class"], {"foo":[agg,col,na_mode]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for agg, np_func in h2o_np_agg_dict.items():
        for col in col_names:
            print("group by comparison: " + str(agg) + "; " + str(col))
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[agg,col,"all"]})
            pd_res = pd_iris.groupby("class")[col].aggregate(np_func)
            for row in range(3):
                h2o_val = h2o_res[row,1]
                # Look the pandas value up by the class label H2O reports.
                pd_val = pd_res[h2o_res[row,0]]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} " \
                    "values between h2o and pandas on column {3}".format(h2o_val,pd_val,agg,col)
def benignKmeans(ip, port):
    """Print H2O and scikit-learn k-means centers on benign.csv for k = 1..6.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    csv_path = h2o.locate("smalldata/logreg/benign.csv")
    benign_h2o = h2o.import_frame(path=csv_path)

    # Impute missing values with the column mean for the scikit-learn copy.
    raw = np.genfromtxt(csv_path, delimiter=",")
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imputer.fit_transform(raw)

    for n_clusters in range(1, 7):
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=n_clusters)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("sckit centers")
        print(benign_sci_km.cluster_centers_)
def covtype_get_model(ip,port):
    """Train three binomial GLMs on covtype and re-fetch each with ``h2o.get_model``.

    The response (column 54) is replaced by an indicator for one randomly
    chosen class.  Each model is shown after training and again after being
    fetched back by its id.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = list(range(0,20)) + list(range(29,54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1,4)
    covtype[54] = (covtype[54] == res_class)

    # (alpha, Lambda) for each model, in training order.
    penalties = [
        ([0], [0]),        # L2: alpha = 0, lambda = 0
        ([0.5], [1e-4]),   # Elastic: alpha = 0.5, lambda = 1e-4
        ([1], [1e-4]),     # L1: alpha = 1, lambda = 1e-4
    ]
    for alpha, lam in penalties:
        covtype_mod = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=alpha, Lambda=lam)
        covtype_mod.show()
        # Round-trip through the cluster by model id and show it again.
        covtype_mod = h2o.get_model(covtype_mod._id)
        covtype_mod.show()
def benign(ip, port):
    """Fit a binomial GLM on benign.csv and check the coefficient names match the predictors.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    training_data = h2o.import_frame(h2o.locate("smalldata/logreg/benign.csv"))

    # Column 3 is the response; all other columns 0..10 are predictors.
    Y = 3
    X = list(range(3)) + list(range(4, 11))

    # Build the model
    model = h2o.glm(y=training_data[Y].asfactor(),
                    x=training_data[X],
                    family="binomial",
                    alpha=[0],
                    Lambda=[1e-5])

    # Check that the columns used in the model are the ones we passed in.
    all_names = training_data.names()
    in_names = [all_names[idx] for idx in X]

    coef_rows = model._model_json['output']['coefficients_table'].cell_values
    out_names = [coef_rows[idx][0] for idx in range(len(X) + 1)]

    # The first coefficient row is skipped in the comparison.
    assert in_names == out_names[1:]
def link_functions_gamma(ip, port):
    """Compare H2O and statsmodels gamma GLMs (inverse and log links) on the prostate data.

    Both libraries predict DPROS from the same eight columns.  For each link
    the test passes when H2O's residual/null deviance ratio is no more than
    0.01 above the statsmodels ratio.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    :raises AssertionError: if H2O's deviance ratio is worse by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    zip_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=zip_path)
    h2o_data.head()

    # Read the same CSV for statsmodels.  The context managers close the
    # archive and member handle; `.values` replaces DataFrame.as_matrix(),
    # which newer pandas releases no longer provide.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    def deviance_ratio(model):
        # Residual deviance relative to null deviance of an H2O model.
        output = model._model_json['output']
        return output['residual_deviance'] / output['null_deviance']

    print("Create models with canonical link: INVERSE")
    h2o_model_in = h2o.glm(x=h2o_data[myX],
                           y=h2o_data[myY],
                           family="gamma",
                           link="inverse",
                           alpha=[0.5],
                           Lambda=[0],
                           n_folds=0)
    sm_model_in = sm.GLM(endog=sm_data_response,
                         exog=sm_data_features,
                         family=sm.families.Gamma(
                             sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = deviance_ratio(h2o_model_in)
    sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
    # One-sided on purpose: H2O may be better, but not noticeably worse.
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX],
                            y=h2o_data[myY],
                            family="gamma",
                            link="log",
                            alpha=[0.5],
                            Lambda=[0],
                            n_folds=0)
    sm_model_log = sm.GLM(endog=sm_data_response,
                          exog=sm_data_features,
                          family=sm.families.Gamma(
                              sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = deviance_ratio(h2o_model_log)
    sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
def slicing_shape(ip,port):
    """Check the dimensions reported for the different slicing forms of a frame.

    Covers ``frame[slice]``, ``frame[int, slice]``, ``frame[slice, int]`` and
    ``frame[slice, slice]`` on the prostate data.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    rows, cols = prostate.dim()

    # TODO: is an empty frame (prostate[0:0]) allowed?

    row_msg = "incorrect number of rows. correct: {0}, computed: {1}"
    col_msg = "incorrect number of cols. correct: {0}, computed: {1}"

    # prostate[slice]
    for width in range(1,cols+1):
        got_rows, got_cols = prostate[0:width].dim()
        assert got_rows == rows, row_msg.format(rows, got_rows)
        assert got_cols == width, col_msg.format(width, got_cols)

    # prostate[int,slice]
    for width in range(1,cols+1):
        got_rows, got_cols = prostate[random.randint(0,rows-1),0:width].dim()
        assert got_rows == 1, row_msg.format(1, got_rows)
        assert got_cols == width, col_msg.format(width, got_cols)

    # prostate[slice,int]
    # TODO: there's a bug here: HEXDEV-266
    for height in range(1,10):
        got_rows, got_cols = prostate[0:height,random.randint(0,cols-1)].dim()
        assert got_rows == height, row_msg.format(height, got_rows)
        assert got_cols == 1, col_msg.format(1, got_cols)

    # prostate[slice,slice]
    # TODO: there's a bug here: HEXDEV-266
    for height in range(1,10):
        for width in range(1,cols+1):
            got_rows, got_cols = prostate[0:height,0:width].dim()
            assert got_rows == height, row_msg.format(height, got_rows)
            assert got_cols == width, col_msg.format(width, got_cols)
def center_scale(ip,port):
    """Call ``scale()`` on the iris frame and on single columns with every center/scale combination.

    Only checks that the calls go through; no values are asserted.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))[0:4]

    # (center, scale) combinations that are passed explicitly, in call order.
    explicit_args = [(True, False), (False, True), (False, False)]

    # frame (default args)
    foo = iris.scale()
    # TODO: the below assertion fails. Should it?
    #assert abs(foo[0,0] - -0.8976739) < 1e-6 and abs(foo[0,1] - 1.01560199) < 1e-6 and abs(foo[0,2] - -1.335752) < 1e-6 \
    #       and abs(foo[0,3] - -1.311052) < 1e-6, "h2o differed from r. h2o got {0}, {1}, {2}, and {3}" \
    #                                             "".format(foo[0,0],foo[0,1],foo[0,2],foo[0,3])

    # frame (explicit args)
    for center, scale in explicit_args:
        foo = iris.scale(center=center, scale=scale)

    # vec (default args)
    foo = iris[0].scale()

    # vec (explicit args): columns 1, 2 and 3 each get one combination
    for col, (center, scale) in enumerate(explicit_args, 1):
        foo = iris[col].scale(center=center, scale=scale)
def deep_learning_metrics_test(ip, port):
    """Train a deep learning classifier on a random split of prostate.csv and show its metrics.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # connect to existing cluster
    h2o.init(ip, port)

    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # remove ID
    # NOTE(review): the return value is discarded; confirm drop() works in place.
    df.drop("ID")
    # make CAPSULE categorical
    df["CAPSULE"] = df["CAPSULE"].asfactor()

    # 0 VOL means 'missing'
    vol = df["VOL"]
    vol[vol == 0] = float("nan")

    # random train/test split
    r = vol.runif()
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    for split in (train, test):
        split.describe()
        split.head()
        split.tail()

    # Run DeepLearning
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:],
                          y=train["CAPSULE"],
                          epochs=100,
                          hidden=[10, 10, 10],
                          loss="CrossEntropy")

    print("Binomial Model Metrics: ")
    # NOTE(review): this prints whatever show() returns in addition to
    # show()'s own output; kept as in the original.
    print(dl.show())

    dl.model_performance(test).show()
def offset_tweedie(ip, port):
    """Check a tweedie GBM with an offset column against reference values from R's gbm.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    insurance = h2o.import_frame(
        h2o.locate("smalldata/glm_test/insurance.csv"))

    # The offset is log(Holders), matching the R reference run below.
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3],
                  y=insurance["Claims"],
                  distribution="tweedie",
                  ntrees=600,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=.1,
                  offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from harrysouthworth's gbm:
    #	fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #           data = Insurance, distribution ="tweedie", n.trees = 600)
    #	pr = predict(fit2, Insurance)
    #	pr = exp(pr+log(Insurance$Holders))
    init_f = gbm._model_json['output']['init_f']
    assert abs(-1.869702 - init_f) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(-1.869702, init_f)

    pred_mean = predictions.mean()
    assert abs(49.21591 - pred_mean) < 1e-4, \
        "expected prediction mean to be {0}, but got {1}".format(49.21591, pred_mean)

    pred_min = predictions.min()
    assert abs(1.0258 - pred_min) < 1e-4, \
        "expected prediction min to be {0}, but got {1}".format(1.0258, pred_min)

    # The maximum is only checked to two decimal places.
    pred_max = predictions.max()
    assert abs(392.4651 - pred_max) < 1e-2, \
        "expected prediction max to be {0}, but got {1}".format(392.4651, pred_max)
def smallcatGBM(ip,port):
    """Fit one-tree, depth-one GBMs with H2O and scikit-learn on alphabet_cattest.csv.

    Training set has 26 categories from A to Z.
    Categories A, C, E, G, ... are perfect predictors of y = 1.
    Categories B, D, F, H, ... are perfect predictors of y = 0.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    csv_path = h2o.locate("smalldata/gbm_test/alphabet_cattest.csv")
    alphabet = h2o.import_frame(path=csv_path)
    alphabet["y"] = alphabet["y"].asfactor()

    def quoted_letter_code(field):
        # Column 0 holds a quoted letter; map it to its character code.
        return ord(field.split("\"")[1])

    # Prepare data for scikit use
    trainData = np.loadtxt(csv_path, delimiter=',', skiprows=1,
                           converters={0: quoted_letter_code})
    trainDataFeatures = trainData[:,0]
    trainDataResponse = trainData[:,1]

    # Train H2O GBM Model (naive split): ntrees = 1, max_depth = 1, nbins = 100
    gbm_h2o = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], distribution="bernoulli", ntrees=1, max_depth=1, nbins=100)
    gbm_h2o.show()

    # Train scikit GBM Model with the same parameters.  scikit-learn wants a
    # 2-D feature matrix, hence the added axis.
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    gbm_sci.fit(trainDataFeatures[:,np.newaxis],trainDataResponse)
def expr_show(ip, port):
    """Exercise ``show()`` on expressions built from the iris frame and from local lists.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print("iris:")
    iris.show()

    ###################################################################

    difference = 2 - iris

    # expr[int], expr._data is pending
    pending_slice = difference[0]
    print("res2:")
    pending_slice.show()

    # expr[int], expr._data is remote
    remote_slice = difference[0]
    print("res3:")
    remote_slice.show()

    # expr[int] and expr[tuple] cases where expr._data is local: first a flat
    # list, then a nested one.
    local_inputs = ([1, 2, 3], [[1, 2, 3], [4, 5, 6]])
    for local_data in local_inputs:
        expr = Expr(local_data)
        print("expr:")
        expr.show()
def table_check(ip,port):
    """Check ``h2o.table`` counts on iris for one and two columns, passed as frames and as vecs.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    :raises AssertionError: if any of the first three rows has an unexpected count.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    def check_class_counts(table):
        # Column 0 is the level, column 1 its count; each must be 50.
        for row in range(3):
            assert table[row,1] == 50, "Expected 50 of {0}, but got {1}".format(table[row,0], table[row,1])

    # Expected counts in column 2 of the first three rows of a C1 x C5 table.
    expected_pair_counts = [4, 5, 3]

    def check_pair_counts(table):
        for row, expected in enumerate(expected_pair_counts):
            actual = table[row,2]
            # The message now names the expected count, which was missing.
            assert actual == expected, "Expected {0}, but got {1}".format(expected, actual)

    # single column (frame)
    check_class_counts(h2o.table(iris[["C5"]]))

    # single column (vec)
    check_class_counts(h2o.table(iris["C5"]))

    # two-column (one argument)
    check_pair_counts(h2o.table(iris[["C1", "C5"]]))

    # two columns (separate arguments (frames))
    check_pair_counts(h2o.table(iris[["C1"]],iris[["C5"]]))

    # two columns (separate arguments (vecs))
    check_pair_counts(h2o.table(iris["C1"],iris["C5"]))
def link_functions_binomial(ip,port):
    """Compare H2O and statsmodels binomial GLMs (logit link) on the prostate data.

    Both models predict CAPSULE from the same eight columns.  The test passes
    when H2O's residual/null deviance ratio is no more than 0.01 above the
    statsmodels ratio.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    :raises AssertionError: if H2O's deviance ratio is worse by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip,port)

    print("Read in prostate data.")
    zip_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=zip_path)
    h2o_data.head()

    # Read the same CSV for statsmodels.  The context managers close the
    # archive and member handle; `.values` replaces DataFrame.as_matrix(),
    # which newer pandas releases no longer provide.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:,2]
    sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_output = h2o_model._model_json['output']
    h2o_deviance = h2o_output['residual_deviance'] / h2o_output['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better, but not noticeably worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def sdev(ip,port):
    """Compare H2O's per-column ``sd()`` on iris with numpy's sample standard deviation.

    Also checks that ``sd()`` raises ``EnvironmentError`` on the categorical
    column and on a multi-column slice.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    iris_h2o = h2o.import_frame(path=iris_path)
    iris_np = np.genfromtxt(iris_path,
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    # ddof=1 gives the sample (n - 1) standard deviation.
    sd_np = np.std(iris_np, axis=0, ddof=1)
    for col, expected_sd in enumerate(sd_np):
        sd_h2o = iris_h2o[col].sd()
        assert abs(expected_sd - sd_h2o) < 1e-10, "expected standard deviations to be the same"

    # Column 4 is categorical, so sd() must be rejected.
    try:
        iris_h2o[4].sd()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    # sd() on more than one column must be rejected as well.
    try:
        iris_h2o[0:2].sd()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. more than one column."
def https_import(ip,port):
    """Import a zipped CSV over HTTPS and show the resulting frame.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    url = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip"
    remote_frame = h2o.import_frame(path=url)
    remote_frame.show()
def swpredsRF(ip,port):
    """Train random forests on swpreds_1000x3.csv with and without the noise column and print AUC.

    Training set has two predictor columns.
    X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise.
    Ratio of y = 1 per level: cat01 = 1.0 (strong predictor),
    cat02 to cat10 = 0.5 (weak predictors).

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    swpreds = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    # First run: predictor column only.  Second run: predictor plus noise column.
    for predictors in (["X1"], ["X1","X2"]):
        model = h2o.random_forest(x=swpreds[predictors], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
        model.show()
        perf = model.model_performance(swpreds)
        print(perf.auc())
def ascharacter(ip,port):
    """Check that calling ``ascharacter()`` leaves the source factor column unchanged.

    NOTE(review): unlike the other tests in this file, this one never calls
    ``h2o.init(ip, port)``, so ``ip`` and ``port`` are unused -- confirm the
    connection is set up by the caller.

    :param ip: unused.
    :param port: unused.
    """
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    cars.show()

    cars['cylinders'] = cars['cylinders'].asfactor()

    # The converted result is deliberately discarded; only the original
    # column is inspected afterwards.
    cars['cylinders'].ascharacter()

    assert cars["cylinders"].isfactor(), "expected the column be a factor"
    assert not cars["cylinders"].isstring(), "expected the column to not be a string"
def perfectSeparation_balanced(ip, port):
    """Fit a binomial GLM on the balanced perfect-separation data and bound its coefficients.

    :param ip: host passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    data_path = h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv")
    data = h2o.import_frame(path=data_path)

    print("Fit model on dataset")
    model = h2o.glm(x=data[["x1", "x2"]],
                    y=data["y"],
                    family="binomial",
                    lambda_search=True,
                    use_all_factor_levels=True,
                    alpha=[0.5],
                    Lambda=[0])

    print(
        "Extract models' coefficients and assert reasonable values (ie. no greater than 50)"
    )
    print("Balanced dataset")

    # Collect every coefficient except the intercept.
    coef_rows = model._model_json['output']['coefficients_table'].cell_values
    coef = []
    for row in coef_rows:
        if row[0] != "Intercept":
            coef.append(row[1])

    # NOTE(review): only the upper bound is checked; a large negative
    # coefficient would pass -- confirm whether abs() is intended.
    for value in coef:
        assert value < 50, "coefficient is too large"