def checkpoint_new_category_in_predictor():
    """Exercise checkpoint restarts when a categorical predictor gains levels.

    Two frames are loaded from the setosa/versicolor file and one from the
    virginica file.  Columns 0, 1, 2 and 4 are the predictors and column 3 is
    the response.  Restarting from the first model on the virginica frame is
    expected to raise ``EnvironmentError``; scoring the virginica frame with
    the successfully continued model must simply run.
    """
    sv_path = "smalldata/iris/setosa_versicolor.csv"
    first_sv = h2o.upload_file(tests.locate(sv_path))
    second_sv = h2o.upload_file(tests.locate(sv_path))
    virginica = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    base_model = h2o.deeplearning(x=first_sv[[0, 1, 2, 4]], y=first_sv[3], epochs=100)
    continued_model = h2o.deeplearning(
        x=second_sv[[0, 1, 2, 4]],
        y=second_sv[3],
        epochs=200,
        checkpoint=base_model.model_id,
    )

    # Continuing the build with an expanded categorical predictor domain
    # should be rejected.
    try:
        h2o.deeplearning(
            x=virginica[[0, 1, 2, 4]],
            y=virginica[3],
            epochs=200,
            checkpoint=base_model.model_id,
        )
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    # Predicting on observations with the expanded predictor domain must run.
    continued_model.predict(virginica)
def tweedie_weights(ip, port):
    """Train tweedie deep learning models with and without observation weights.

    ``ip`` and ``port`` are accepted for the test-runner signature but are not
    used inside this function.
    """
    data = h2o.import_file(h2o.locate("smalldata/glm_test/cancar_logIn.csv"))

    # Indicator columns for specific Class/Merit combinations.  The
    # element-wise AND of two frame comparisons must use `&`; Python's `and`
    # only tests the truthiness of the left operand and yields one operand
    # unchanged, so the indicators would not combine both conditions.
    for col_name, class_val, merit_val in (("C1M3", 1, 3), ("C3M3", 3, 3), ("C4M3", 4, 3), ("C1M2", 1, 2)):
        data[col_name] = ((data["Class"] == class_val) & (data["Merit"] == merit_val)).asfactor()

    data["Merit"] = data["Merit"].asfactor()
    data["Class"] = data["Class"].asfactor()
    loss = data["Cost"] / data["Insured"]
    loss.setName(0, "Loss")
    cancar = loss.cbind(data)

    myX = ["Merit", "Class", "C1M3", "C4M3"]

    def _fit(**extra):
        # Both models share every hyper-parameter; only the weights differ.
        return h2o.deeplearning(
            x=cancar[myX],
            y=cancar["Loss"],
            distribution="tweedie",
            hidden=[1],
            epochs=1000,
            train_samples_per_iteration=-1,
            reproducible=True,
            activation="Tanh",
            balance_classes=False,
            force_load_balance=False,
            seed=2353123,
            tweedie_power=1.5,
            score_training_samples=0,
            score_validation_samples=0,
            **extra
        )

    # Without weights: the deviance accessor is exercised, its value is not checked.
    dl = _fit()
    dl.mean_residual_deviance()

    # With weights
    dl = _fit(weights_column="Insured", training_frame=cancar)
def tweedie_weights():
    """Train tweedie deep learning models with and without observation weights."""
    data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/cancar_logIn.csv"))

    # Indicator columns for specific Class/Merit combinations.  The
    # element-wise AND of two frame comparisons must use `&`; Python's `and`
    # only tests the truthiness of the left operand and yields one operand
    # unchanged, so the indicators would not combine both conditions.
    for col_name, class_val, merit_val in (("C1M3", 1, 3), ("C3M3", 3, 3), ("C4M3", 4, 3), ("C1M2", 1, 2)):
        data[col_name] = ((data["Class"] == class_val) & (data["Merit"] == merit_val)).asfactor()

    data["Merit"] = data["Merit"].asfactor()
    data["Class"] = data["Class"].asfactor()
    loss = data["Cost"] / data["Insured"]
    loss.set_name(0, "Loss")
    cancar = loss.cbind(data)

    myX = ["Merit", "Class", "C1M3", "C4M3"]

    def _fit(**extra):
        # Both models share every hyper-parameter; only the weights differ.
        return h2o.deeplearning(
            x=cancar[myX],
            y=cancar["Loss"],
            distribution="tweedie",
            hidden=[1],
            epochs=1000,
            train_samples_per_iteration=-1,
            reproducible=True,
            activation="Tanh",
            balance_classes=False,
            force_load_balance=False,
            seed=2353123,
            tweedie_power=1.5,
            score_training_samples=0,
            score_validation_samples=0,
            **extra
        )

    # Without weights: the deviance accessor is exercised, its value is not checked.
    dl = _fit()
    dl.mean_residual_deviance()

    # With weights
    dl = _fit(weights_column="Insured", training_frame=cancar)
def offsets_and_distributions(ip, port):
    """Build deep learning models with an offset column for several distributions.

    ``ip`` and ``port`` are accepted for the test-runner signature but are not
    used inside this function.
    """
    # cars: drop rows with a missing response and attach a constant offset column
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    response_present = cars["economy_20mpg"].isna() == 0
    cars = cars[response_present]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[0.5] for _ in range(398)])
    offset.setNames(["x1"])
    cars = cars.cbind(offset)

    # insurance: the offset is the log of the Holders column
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    # dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    # predictions = dl.predict(cars)

    # gamma, gaussian and poisson take the predictors/response as frame slices
    for family in ("gamma", "gaussian", "poisson"):
        model = h2o.deeplearning(
            x=insurance[0:3],
            y=insurance["Claims"],
            distribution=family,
            offset_column="offset",
            training_frame=insurance,
        )
        model.predict(insurance)

    # tweedie is specified by column names instead of frame slices
    model = h2o.deeplearning(
        x=insurance.names[0:3],
        y="Claims",
        distribution="tweedie",
        offset_column="offset",
        training_frame=insurance,
    )
    model.predict(insurance)
def offsets_and_distributions():
    """Build deep learning models with an offset column for several distributions."""
    # cars: drop rows with a missing response and attach a constant offset column
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    response_present = cars["economy_20mpg"].isna() == 0
    cars = cars[response_present]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame([[0.5]] * 398)
    offset.set_name(0, "x1")
    cars = cars.cbind(offset)

    # insurance: the offset is the log of the Holders column
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    # dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    # predictions = dl.predict(cars)

    # gamma, gaussian and poisson take the predictors/response as frame slices
    for family in ("gamma", "gaussian", "poisson"):
        model = h2o.deeplearning(
            x=insurance[0:3],
            y=insurance["Claims"],
            distribution=family,
            offset_column="offset",
            training_frame=insurance,
        )
        model.predict(insurance)

    # tweedie is specified by column names instead of frame slices
    model = h2o.deeplearning(
        x=insurance.names[0:3],
        y="Claims",
        distribution="tweedie",
        offset_column="offset",
        training_frame=insurance,
    )
    model.predict(insurance)
def weights_and_distributions():
    """Build weighted deep learning models for several distributions and score them."""
    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    for categorical in ("premiekl", "moptva"):
        htable[categorical] = htable[categorical].asfactor()
    # NOTE(review): self-assignment kept from the original; it may have been
    # meant to convert "zon" as well - confirm with the test owner.
    htable["zon"] = htable["zon"]

    # Same predictors, response and weights column for every distribution.
    for family in ("gamma", "gaussian", "poisson", "tweedie"):
        model = h2o.deeplearning(
            x=htable[0:3],
            y=htable["medskad"],
            training_frame=htable,
            distribution=family,
            weights_column="antskad",
        )
        model.predict(htable)
def imbalance(ip, port):
    """Check that ``balance_classes`` does not worsen the class-6 error on covtype.

    ``ip`` and ``port`` are accepted for the test-runner signature but are not
    used inside this function.

    Raises:
        AssertionError: if the balanced model's class-6 error exceeds the
            imbalanced model's.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Test checks if Deep Learning works fine with an imbalanced dataset")

    covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    def _fit(balance_classes):
        # The two models differ only in the balance_classes flag.
        return h2o.deeplearning(
            x=covtype[0:54],
            y=covtype[54],
            l1=1e-5,
            activation="Rectifier",
            loss="CrossEntropy",
            hidden=[200, 200],
            epochs=1,
            training_frame=covtype,
            balance_classes=balance_classes,
            reproducible=True,
            seed=1234,
        )

    hh_imbalanced = _fit(False)
    print(hh_imbalanced)
    hh_balanced = _fit(True)
    print(hh_balanced)

    # compare error for class 6 (difficult minority)
    class_6_err_imbalanced = hh_imbalanced.confusion_matrix(covtype).cell_values[5][7]
    class_6_err_balanced = hh_balanced.confusion_matrix(covtype).cell_values[5][7]

    if class_6_err_imbalanced < class_6_err_balanced:
        for line in (
            "--------------------",
            "",
            "FAIL, balanced error greater than imbalanced error",
            "",
            "",
            "class_6_err_imbalanced",
            class_6_err_imbalanced,
            "",
            "class_6_err_balanced",
            class_6_err_balanced,
            "",
            "--------------------",
        ):
            print(line)

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
def imbalance(ip, port):
    """Check that ``balance_classes`` does not worsen the class-6 error on covtype.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.

    Raises:
        AssertionError: if the balanced model's class-6 error exceeds the
            imbalanced model's.
    """
    h2o.init(ip, port)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Test checks if Deep Learning works fine with an imbalanced dataset")

    covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    def _fit(balance_classes):
        # The two models differ only in the balance_classes flag.
        return h2o.deeplearning(
            x=covtype[0:54],
            y=covtype[54],
            l1=1e-5,
            activation="Rectifier",
            loss="CrossEntropy",
            hidden=[200, 200],
            epochs=1,
            training_frame=covtype,
            balance_classes=balance_classes,
            reproducible=True,
            seed=1234,
        )

    hh_imbalanced = _fit(False)
    print(hh_imbalanced)
    hh_balanced = _fit(True)
    print(hh_balanced)

    # compare error for class 6 (difficult minority)
    class_6_err_imbalanced = hh_imbalanced.confusion_matrix(covtype).cell_values[5][7]
    class_6_err_balanced = hh_balanced.confusion_matrix(covtype).cell_values[5][7]

    if class_6_err_imbalanced < class_6_err_balanced:
        for line in (
            "--------------------",
            "",
            "FAIL, balanced error greater than imbalanced error",
            "",
            "",
            "class_6_err_imbalanced",
            class_6_err_imbalanced,
            "",
            "class_6_err_balanced",
            class_6_err_balanced,
            "",
            "--------------------",
        ):
            print(line)

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
def offsets_and_distributions(ip, port):
    """Build deep learning models with an offset column for five distributions.

    ``ip`` and ``port`` are accepted for the test-runner signature but are not
    used inside this function.
    """
    # cars: drop rows with a missing response and attach a constant offset column
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    response_present = cars["economy_20mpg"].isna() == 0
    cars = cars[response_present]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[0.5] for _ in range(398)])
    offset.setNames(["x1"])
    cars = cars.cbind(offset)

    # insurance: the offset is the log of the Holders column
    insurance = h2o.import_frame(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli, on the cars frame
    model = h2o.deeplearning(
        x=cars[2:8],
        y=cars["economy_20mpg"],
        distribution="bernoulli",
        offset_column="x1",
        training_frame=cars,
    )
    model.predict(cars)

    # the remaining distributions all run on the insurance frame
    for family in ("gamma", "gaussian", "poisson", "tweedie"):
        model = h2o.deeplearning(
            x=insurance[0:3],
            y=insurance["Claims"],
            distribution=family,
            offset_column="offset",
            training_frame=insurance,
        )
        model.predict(insurance)
def pubdev_2041():
    """Continue a deep learning model from a checkpoint using a second data split."""
    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))

    # Seeded uniform draw splits the rows into two halves.
    draw = iris.runif(seed=12345)
    first_half = iris[draw >= 0.5]
    second_half = iris[draw < 0.5]

    initial_model = h2o.deeplearning(x=first_half[0:4], y=first_half[4], epochs=100)

    # update the initial model with new training data
    h2o.deeplearning(
        x=second_half[0:4],
        y=second_half[4],
        epochs=200,
        checkpoint=initial_model.model_id,
    )
def cv_carsDL(ip, port):
    """Exercise deep learning cross-validation on the cars dataset.

    A regression, binomial or multinomial problem is picked at random, then
    the test covers a valid ``nfolds``, the ``nfolds=0`` boundary, invalid
    ``nfolds`` values, and cross-validation combined with a validation set.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.

    Raises:
        AssertionError: if a build that is expected to fail succeeds.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise. 0:regression, 1:binomial, 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print("Response column: {0}".format(response_col))

    ## cross-validation
    ## basic
    nfolds = random.randint(3, 10)
    h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds)

    ## boundary case
    # nfolds = 0
    h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    ## error cases
    # 1. nfolds == 1 or < 0
    # TODO: PUBDEV-1696
    # Each invalid value gets its own try block: with both builds in one
    # block, the nfolds=1 build was never reached once the negative one raised.
    for bad_nfolds in (random.randint(-10000, -1), 1):
        try:
            h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=bad_nfolds)
        except EnvironmentError:
            pass
        else:
            assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. cross-validation and regular validation attempted
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    try:
        # The validation response is the same column as the training response,
        # so a failure here is attributable to combining CV with validation.
        h2o.deeplearning(
            y=train[response_col],
            x=train[predictors],
            nfolds=random.randint(3, 10),
            validation_y=valid[response_col],
            validation_x=valid[predictors],
        )
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when both cross-validation and regular validation is attempted"
def imbalance():
    """Check that ``balance_classes`` does not worsen overall logloss on covtype.

    Raises:
        AssertionError: if the balanced model's logloss exceeds the
            imbalanced model's.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Test checks if Deep Learning works fine with an imbalanced dataset")

    covtype = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    def _fit(balance_classes):
        # The two models differ only in the balance_classes flag.
        return h2o.deeplearning(
            x=covtype[0:54],
            y=covtype[54],
            l1=1e-5,
            activation="Rectifier",
            loss="CrossEntropy",
            hidden=[200, 200],
            epochs=1,
            training_frame=covtype,
            balance_classes=balance_classes,
            reproducible=True,
            seed=1234,
        )

    hh_imbalanced = _fit(False)
    print(hh_imbalanced)
    hh_balanced = _fit(True)
    print(hh_balanced)

    # compare overall logloss; the printed labels keep their historical
    # "class_6_err" wording even though the values are logloss
    logloss_imbalanced = hh_imbalanced.logloss()
    logloss_balanced = hh_balanced.logloss()

    if logloss_imbalanced < logloss_balanced:
        for line in (
            "--------------------",
            "",
            "FAIL, balanced error greater than imbalanced error",
            "",
            "",
            "class_6_err_imbalanced",
            logloss_imbalanced,
            "",
            "class_6_err_balanced",
            logloss_balanced,
            "",
            "--------------------",
        ):
            print(line)

    assert logloss_imbalanced >= logloss_balanced, "balance_classes makes it worse!"
def checkpoint_new_category_in_response():
    """Check that a checkpoint restart fails when the response gains categories.

    The initial model is trained on the setosa/versicolor file; the restart
    uses the full iris file with the same column layout and is expected to
    raise ``EnvironmentError``.
    """
    two_species = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    all_species = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    base_model = h2o.deeplearning(x=two_species[[0, 1, 2, 3]], y=two_species[4], epochs=100)

    # Continuing the build with an expanded categorical response domain
    # should be rejected.
    try:
        h2o.deeplearning(
            x=all_species[[0, 1, 2, 3]],
            y=all_species[4],
            epochs=200,
            checkpoint=base_model.model_id,
        )
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in response"
def deep_learning_metrics_test():
    """Train a binomial deep learning model on prostate data and show its metrics."""
    df = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    # drop() returns a new frame, so the result has to be bound.  Discarding
    # it left ID in place, and train[1:] then started at CAPSULE, putting the
    # response among the predictors.
    df = df.drop("ID")  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    # NOTE(review): this edits the extracted VOL column; confirm whether the
    # change is meant to propagate back into df.
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    for frame in (train, test):
        frame.describe()
        frame.head()
        frame.tail()

    # Run DeepLearning
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:], y=train['CAPSULE'], epochs=100, hidden=[10, 10, 10], loss='CrossEntropy')
    print("Binomial Model Metrics: ")
    print("")
    # show() emits the report itself; printing its return value only added noise.
    dl.show()

    dl.model_performance(test).show()
def deep_learning_metrics_test(ip, port):
    """Train a binomial deep learning model on prostate data and show its metrics.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.
    """
    h2o.init(ip, port)  # connect to existing cluster
    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    # drop() returns a new frame, so the result has to be bound.  Discarding
    # it left ID in place, and train[1:] then started at CAPSULE, putting the
    # response among the predictors.
    df = df.drop("ID")  # remove ID
    df["CAPSULE"] = df["CAPSULE"].asfactor()  # make CAPSULE categorical
    vol = df["VOL"]
    # NOTE(review): this edits the extracted VOL column; confirm whether the
    # change is meant to propagate back into df.
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    for frame in (train, test):
        frame.describe()
        frame.head()
        frame.tail()

    # Run DeepLearning
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:], y=train["CAPSULE"], epochs=100, hidden=[10, 10, 10], loss="CrossEntropy")
    print("Binomial Model Metrics: ")
    print("")
    # show() emits the report itself; printing its return value only added noise.
    dl.show()

    dl.model_performance(test).show()
def pubdev_2223():
    """Print the normalization and one-hot-encoding details of a deep learning model."""
    covtype = h2o.import_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    dlmodel = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        hidden=[17, 191],
        epochs=1,
        training_frame=covtype,
        balance_classes=False,
        reproducible=True,
        seed=1234,
        export_weights_and_biases=True,
    )

    # Each accessor is called right before its line is printed, in this order.
    report = (
        ("Normalization/Standardization multipliers for numeric predictors: {0}\n", dlmodel.normmul),
        ("Normalization/Standardization offsets for numeric predictors: {0}\n", dlmodel.normsub),
        ("Normalization/Standardization multipliers for numeric response: {0}\n", dlmodel.respmul),
        ("Normalization/Standardization offsets for numeric response: {0}\n", dlmodel.respsub),
        ("Categorical offsets for one-hot encoding: {0}\n", dlmodel.catoffsets),
    )
    for template, accessor in report:
        print(template.format(accessor()))
def deeplearning_autoencoder():
    """Train an autoencoder on MNIST and a random forest on its deep features.

    Half of the training rows train the autoencoder; the other half are
    projected through it and used to train a DRF model, which is then
    evaluated on the projected test set.

    Raises:
        AssertionError: if the deep-feature width differs from ``nfeatures``
            or the confusion-matrix cell is not within 0.001 of 0.081.
    """
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()
    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    # NOTE(review): the result of drop() is discarded; the response is kept
    # out of the predictors by the [0:resp] slice below instead.
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(
        x=train_unsupervised[0:resp],
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp]._frame(), 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space; the slice width follows nfeatures
    # instead of repeating the literal 20.
    drf_model = h2o.random_forest(
        x=train_supervised_features[0:nfeatures],
        y=train_supervised[resp],
        ntrees=10,
        min_rows=10,
        seed=1234,
    )

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.081) < 0.001, "Error. Expected 0.081, but got {0}".format(
        cm.cell_values[10][10])
def anomaly():
    """Train an MNIST autoencoder and run anomaly scoring and reconstruction."""
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))

    # An explicit list keeps the column selector identical on Python 2 and 3,
    # where range() is no longer a list.
    predictors = list(range(0, 784))

    # unsupervised -> drop the response column (digit: 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on the training frame
    ae_model = h2o.deeplearning(
        x=train[predictors],
        training_frame=train,
        activation="Tanh",
        autoencoder=True,
        hidden=[50],
        l1=1e-5,
        ignore_const_cols=False,
        epochs=1,
    )

    # 2) DETECT OUTLIERS
    # anomaly() computes the per-row reconstruction error for the test data
    # set; the result is not inspected, the call only has to succeed.
    ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Convert the test data into its autoencoded representation (pass through
    # the narrow neural net); again only the call itself is exercised.
    ae_model.predict(test)
def deep_learning_metrics_test(ip, port):
    """Train a binomial deep learning model on prostate data and show test metrics.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.
    """
    h2o.init(ip, port)  # connect to existing cluster
    # NOTE(review): the path is not resolved through a locate() helper here,
    # unlike the other tests in this file - confirm that is intended.
    df = h2o.import_frame(path="smalldata/logreg/prostate.csv")

    del df['ID']  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = None  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    for frame in (train, test):
        frame.describe()
        frame.head()

    # Run DeepLearning
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:], y=train['CAPSULE'], epochs=100, hidden=[10, 10, 10])
    print("Binomial Model Metrics: ")
    print("")
    # show() emits the report itself; printing its return value only added noise.
    dl.model_performance(test).show()
def get_model_test(ip, port):
    """Check that ``h2o.get_model`` returns models equivalent to the originals.

    Regression, binomial and multinomial models are re-fetched by key and
    must predict the same value as the original for every test row; a k-means
    model only has its category checked.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.

    Raises:
        AssertionError: on a model-category mismatch or differing predictions.
    """
    # Connect to h2o
    h2o.init(ip, port)

    def _assert_same_predictions(first, second, template):
        # Compare the first prediction column row by row.
        for row in range(first.nrow()):
            p1 = first[row, 0]
            p2 = second[row, 0]
            assert p1 == p2, template.format(row, p1, p2)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    split = prostate[0].runif()
    train = prostate[split < 0.70]
    # The test set is the complement of the training set; the previous 0.30
    # threshold made the two sets overlap.
    test = prostate[split >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    _assert_same_predictions(
        predictions1, predictions2,
        "expected regression predictions to be the same for row {0}, but got {1} and {2}")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    _assert_same_predictions(
        predictions1, predictions2,
        "expected binomial predictions to be the same for row {0}, but got {1} and {2}")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    _assert_same_predictions(
        predictions1, predictions2,
        "expected multinomial predictions to be the same for row {0}, but got {1} and {2}")
def deeplearning_basic(ip, port):
    """Train a cross-entropy deep learning model on iris and display it.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris.csv")
    iris_hex = h2o.import_frame(path=iris_path)

    # The first three columns are the predictors, column 4 is the response.
    features = iris_hex[:3]
    target = iris_hex[4]
    model = h2o.deeplearning(x=features, y=target, loss='CrossEntropy')
    model.show()
def checkpoint_new_category_in_response():
    """Check that a checkpoint restart fails when the response gains categories.

    The initial model is trained on the setosa/versicolor file; the restart
    uses the full iris file with the same column layout and is expected to
    raise ``EnvironmentError``.
    """
    two_species = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    all_species = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))

    base_model = h2o.deeplearning(x=two_species[[0, 1, 2, 3]], y=two_species[4], epochs=100)

    # Continuing the build with an expanded categorical response domain
    # should be rejected.
    try:
        h2o.deeplearning(
            x=all_species[[0, 1, 2, 3]],
            y=all_species[4],
            epochs=200,
            checkpoint=base_model.id,
        )
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in response"
def deeplearning_basic():
    """Train a cross-entropy deep learning model on iris and display it."""
    iris_path = pyunit_utils.locate("smalldata/iris/iris.csv")
    iris_hex = h2o.import_file(path=iris_path)

    # The first three columns are the predictors, column 4 is the response.
    features = iris_hex[:3]
    target = iris_hex[4]
    model = h2o.deeplearning(x=features, y=target, loss='CrossEntropy')
    model.show()
def train(self, x, y):
    """Fit a deep learning model on the stored training and validation frames.

    The ``x`` and ``y`` arguments are not used; predictors and response come
    from ``self.trainData`` / ``self.valData`` with 'score diff' as the
    response, and ``hidden``, ``epochs`` and ``nfolds`` are read from
    ``self.params[2]``, ``[3]`` and ``[4]``.  The fitted model is stored on
    ``self.model``.
    """
    train_predictors = self.trainData.drop('score diff')
    train_response = self.trainData['score diff']
    valid_predictors = self.valData.drop('score diff')
    valid_response = self.valData['score diff']

    hidden_layers = self.params[2]
    n_epochs = self.params[3]
    n_folds = self.params[4]

    self.model = h2o.deeplearning(
        x=train_predictors,
        y=train_response,
        validation_x=valid_predictors,
        validation_y=valid_response,
        hidden=hidden_layers,
        epochs=n_epochs,
        nfolds=n_folds,
    )
def ntrain():
    """Train GBM, deep learning and DRF models and write a blended submission.

    Predictions of the three models are combined with weights 0.35 (deep
    learning), 0.3 (random forest) and 0.35 (GBM), and the "True" column of
    the blend is written to ``wnvh.csv`` next to a 1-based ``Id`` column.
    """
    h2o.init(ip="zurich.h2o.ai", strict_version_check=False)

    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)

    train_rows = [row.tolist() for row in X]
    y = np.asarray(y, dtype='bool_')
    xtr = H2OFrame(python_obj=train_rows)
    ytr = H2OFrame(python_obj=y.tolist())
    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39], y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)

    dl = h2o.deeplearning(x=xtr[1:39], y=ytr['C40'],
                          variable_importances=True, balance_classes=True,
                          input_dropout_ratio=0.2, rho=0.899,
                          hidden_dropout_ratios=[0.4, 0.4, 0.4, 0.4],
                          activation="Tanh", hidden=[39, 325, 325, 1], epochs=100)

    rf = h2o.random_forest(x=xtr[1:39], y=ytr['C40'],
                           seed=1234, ntrees=600,
                           max_depth=20, balance_classes=False)

    testing = load_testing()
    X_test = assemble_X(testing, weather)
    # Called for its effect on X_test with the training mean/std; the return
    # value is not used.
    normalize(X_test, mean, std)

    xts = H2OFrame(python_obj=[row.tolist() for row in X_test])

    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)
    gp = dp * 0.35 + rp * 0.3 + gbp * 0.35

    gph = h2o.as_list(gp)
    n_rows = gp.nrow()
    Id = np.arange(1, n_rows + 1).reshape(n_rows, 1)  # 1-based row ids
    df = pd.DataFrame(Id)
    # Item access instead of `gph.True`, which is a SyntaxError on Python 3
    # because True is a keyword there.
    df_concat = pd.concat([df, gph["True"]], axis=1)
    df_concat.columns = ['Id', 'WnvPresent']
    df_concat.to_csv("wnvh.csv", index=False)
def get_model_test():
    """Check that ``h2o.get_model`` returns models equivalent to the originals.

    Regression, binomial and multinomial models are re-fetched by id and must
    predict the same value as the original for every test row; a k-means
    model only has its category checked.
    """
    def _compare_rows(first, second, template):
        # Walk the first prediction column row by row.
        for row in range(first.nrow):
            left = first[row, 0]
            right = second[row, 0]
            assert left == right, template.format(row, left, right)

    def _category(model):
        return model._model_json['output']['model_category']

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # Regression
    gbm_regression = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    original_preds = gbm_regression.predict(test)

    gbm_regression_copy = h2o.get_model(gbm_regression._id)
    assert _category(gbm_regression_copy) == "Regression"
    fetched_preds = gbm_regression_copy.predict(test)

    _compare_rows(original_preds, fetched_preds,
                  "expected regression predictions to be the same for row {}, but got {} and {}")

    # Binomial
    train[1] = train[1].asfactor()
    gbm_binomial = h2o.gbm(y=train[1], x=train[2:], distribution="bernoulli")
    original_preds = gbm_binomial.predict(test)

    gbm_binomial_copy = h2o.get_model(gbm_binomial._id)
    assert _category(gbm_binomial_copy) == "Binomial"
    fetched_preds = gbm_binomial_copy.predict(test)

    _compare_rows(original_preds, fetched_preds,
                  "expected binomial predictions to be the same for row {}, but got {} and {}")

    # Clustering
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    kmeans_model = h2o.kmeans(x=benign_h2o, k=3)
    kmeans_copy = h2o.get_model(kmeans_model._id)
    assert _category(kmeans_copy) == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    dl_multinomial = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    original_preds = dl_multinomial.predict(test)

    dl_multinomial_copy = h2o.get_model(dl_multinomial._id)
    assert _category(dl_multinomial_copy) == "Multinomial"
    fetched_preds = dl_multinomial_copy.predict(test)

    _compare_rows(original_preds, fetched_preds,
                  "expected multinomial predictions to be the same for row {0}, but got {1} and {2}")
def weights_and_biases():
    """Check the dimensions of the exported deep learning weights and biases.

    A model with hidden layers [17, 191] is trained on covtype with
    ``export_weights_and_biases=True``; each of the three weight and bias
    frames must have the expected number of columns and rows.

    Raises:
        AssertionError: if any frame has an unexpected dimension.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Test checks if Deep Learning weights and biases are accessible from R")

    covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    dlmodel = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        hidden=[17, 191],
        epochs=1,
        training_frame=covtype,
        balance_classes=False,
        reproducible=True,
        seed=1234,
        export_weights_and_biases=True,
    )
    print(dlmodel)

    # Fetch everything first (weights, then biases), then validate.
    weights = [dlmodel.weights(layer) for layer in range(3)]
    biases = [dlmodel.biases(layer) for layer in range(3)]

    # Expected (ncol, nrow) per frame, in the order the frames are checked.
    expected_dims = [
        (52, 17), (17, 191), (191, 7),  # weights per layer
        (1, 17), (1, 191), (1, 7),      # biases per layer
    ]

    message = "wrong dimensionality! expected {0}, but got {1}."
    for frame, (want_cols, want_rows) in zip(weights + biases, expected_dims):
        got_cols = frame.ncol
        got_rows = frame.nrow
        assert got_cols == want_cols, message.format(want_cols, got_cols)
        assert got_rows == want_rows, message.format(want_rows, got_rows)
def deeplearning_autoencoder():
    """Train an autoencoder on MNIST and a random forest on its deep features.

    Half of the training rows train the autoencoder; the other half are
    projected through it and used to train a DRF model, which is then
    evaluated on the projected test set.

    Raises:
        AssertionError: if the deep-feature width differs from ``nfeatures``
            or the confusion-matrix cell is not within 0.001 of 0.086.
    """
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()
    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    # NOTE(review): the result of drop() is discarded; the response is kept
    # out of the predictors by the [0:resp] slice below instead.
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(
        x=train_unsupervised[0:resp],
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space; the slice width follows nfeatures
    # instead of repeating the literal 20.
    drf_model = h2o.random_forest(
        x=train_supervised_features[0:nfeatures],
        y=train_supervised[resp],
        ntrees=10,
        min_rows=10,
        seed=1234,
    )

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.086) < 0.001, "Error. Expected 0.086, but got {0}".format(
        cm.cell_values[10][10]
    )
def deeplearning_autoencoder(ip, port):
    """Train an autoencoder on MNIST and a random forest on its deep features.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.

    Raises:
        AssertionError: if the deep-feature width differs from ``nfeatures``
            or the total error is not within 0.001 of 0.1038.
    """
    h2o.init(ip, port)

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # split data into two parts
    sid = train_hex[1].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(
        x=train_unsupervised.drop(resp),
        y=train_unsupervised[resp],  # ignored (pick any non-constant)
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()

    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(
        x=train_supervised_features,
        y=train_supervised[resp].asfactor(),
        ntrees=10,
        seed=1234,
    )

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    # Bind the cbind() result, as every other cbind call in this file does,
    # so the response column is attached whether or not cbind works in place.
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm["Totals", "Error"] - 0.1038) < 0.001, "Error not as expected"
def checkpoint_new_category_in_predictor():
    """Check that checkpoint restarts reject new categories in a predictor.

    A first model is trained on the setosa/versicolor data and continued once
    on an identical upload. Continuing it on the virginica data is expected to
    raise ``EnvironmentError``. Finally the continued model is asked to
    predict on the virginica data.

    Raises:
        AssertionError: if continuing the checkpoint on the virginica frame
            does not raise.
    """
    two_class_path = "smalldata/iris/setosa_versicolor.csv"
    sv1 = h2o.upload_file(tests.locate(two_class_path))
    sv2 = h2o.upload_file(tests.locate(two_class_path))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    predictor_cols = [0, 1, 2, 4]

    m1 = h2o.deeplearning(x=sv1[predictor_cols], y=sv1[3], epochs=100)
    m2 = h2o.deeplearning(x=sv2[predictor_cols], y=sv2[3], epochs=200, checkpoint=m1.id)

    # Continuing with an expanded categorical predictor domain must fail.
    try:
        m3 = h2o.deeplearning(x=vir[predictor_cols], y=vir[3], epochs=200, checkpoint=m1.id)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    # Predicting on observations with the expanded categorical domain.
    predictions = m2.predict(vir)
def tweedie_offset(ip,port):
    """Compare tweedie deep learning fits with and without an offset column.

    Two models are trained on the insurance data with identical settings; the
    second one additionally uses ``log(Holders)`` as ``offset_column``. For
    each model the mean residual deviance and the mean/min/max of the first
    prediction column are compared with recorded values (tolerance 1e-6).

    Args:
        ip: not used in the body.
        port: not used in the body.

    Raises:
        AssertionError: if any metric differs from its recorded value.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    for factor_col in ("Group", "Age", "District"):
        insurance[factor_col] = insurance[factor_col].asfactor()

    # Settings common to both fits.
    dl_settings = dict(
        distribution="tweedie",
        hidden=[1],
        epochs=1000,
        train_samples_per_iteration=-1,
        reproducible=True,
        activation="Tanh",
        single_node_mode=False,
        balance_classes=False,
        force_load_balance=False,
        seed=23123,
        tweedie_power=1.5,
        score_training_samples=0,
        score_validation_samples=0,
    )

    # without offset
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], **dl_settings)

    deviance = dl.mean_residual_deviance()
    assert abs(0.561641366536-deviance) < 1e-6, \
        "Expected mean residual deviance to be 0.561641366536, but got {0}".format(deviance)

    predictions = dl.predict(insurance)
    pred_mean = predictions[0].mean()
    assert abs(47.6819999424-pred_mean) < 1e-6, \
        "Expected mean of predictions to be 47.6819999424, but got {0}".format(pred_mean)
    pred_min = predictions[0].min()
    assert abs(1.90409304033-pred_min) < 1e-6, \
        "Expected min of predictions to be 1.90409304033, but got {0}".format(pred_min)
    pred_max = predictions[0].max()
    assert abs(280.735054543-pred_max) < 1e-6, \
        "Expected max of predictions to be 280.735054543, but got {0}".format(pred_max)

    # with offset
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], offset_column="offset",
                          training_frame=insurance, **dl_settings)

    deviance = dl.mean_residual_deviance()
    assert abs(0.261065520191-deviance) < 1e-6, \
        "Expected mean residual deviance to be 0.261065520191, but got {0}".format(deviance)

    predictions = dl.predict(insurance)
    pred_mean = predictions[0].mean()
    assert abs(49.2939039783-pred_mean) < 1e-6, \
        "Expected mean of predictions to be 49.2939039783, but got {0}".format(pred_mean)
    pred_min = predictions[0].min()
    assert abs(1.07391126487-pred_min) < 1e-6, \
        "Expected min of predictions to be 1.07391126487, but got {0}".format(pred_min)
    pred_max = predictions[0].max()
    assert abs(397.328758591-pred_max) < 1e-6, \
        "Expected max of predictions to be 397.328758591, but got {0}".format(pred_max)
def tweedie_offset():
    """Compare tweedie deep learning fits with and without an offset column.

    Trains the same network twice on the insurance data, the second time with
    ``log(Holders)`` as offset, and checks the mean residual deviance and the
    mean/min/max of the first prediction column against recorded values. Each
    check carries its own tolerance (from 1e-3 up to 40).

    Raises:
        AssertionError: if a metric falls outside its tolerance.
    """
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    for factor_col in ("Group", "Age", "District"):
        insurance[factor_col] = insurance[factor_col].asfactor()

    # Hyper-parameters shared by the two fits.
    shared = dict(
        distribution="tweedie",
        hidden=[1],
        epochs=1000,
        train_samples_per_iteration=-1,
        reproducible=True,
        activation="Tanh",
        single_node_mode=False,
        balance_classes=False,
        force_load_balance=False,
        seed=23123,
        tweedie_power=1.5,
        score_training_samples=0,
        score_validation_samples=0,
    )

    # without offset
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], **shared)

    mrd = dl.mean_residual_deviance()
    assert abs(0.556 - mrd) < 1e-3, "Expected mean residual deviance to be 0.556, but got {0}".format(mrd)

    predictions = dl.predict(insurance)
    mean_pred = predictions[0].mean()
    assert abs(47.61-mean_pred) < 1e-2, "Expected mean of predictions to be 47.61, but got {0}".format(mean_pred)
    min_pred = predictions[0].min()
    assert abs(1.94-min_pred) < 1e-1, "Expected min of predictions to be 1.94, but got {0}".format(min_pred)
    max_pred = predictions[0].max()
    assert abs(284.6-max_pred) < 28, "Expected max of predictions to be 284.6, but got {0}".format(max_pred)

    # with offset
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], offset_column="offset",
                          training_frame=insurance, **shared)

    mrd = dl.mean_residual_deviance()
    assert abs(0.261-mrd) < 1e-2, "Expected mean residual deviance to be 0.261, but got {0}".format(mrd)

    predictions = dl.predict(insurance)
    mean_pred = predictions[0].mean()
    assert abs(49.53-mean_pred) < 1e-1, "Expected mean of predictions to be 49.53, but got {0}".format(mean_pred)
    min_pred = predictions[0].min()
    assert abs(1.074-min_pred) < 1e-1, "Expected min of predictions to be 1.074, but got {0}".format(min_pred)
    max_pred = predictions[0].max()
    assert abs(397.3-max_pred) < 40, "Expected max of predictions to be 397.3, but got {0}".format(max_pred)
def deeplearning_autoencoder(ip, port):
    """Run the MNIST autoencoder -> deep features -> DRF check.

    Connects with ``h2o.init(ip, port)``, fits an autoencoder on one random
    half of the training data, fits a random forest on the hidden-layer
    features of the other half, and asserts on the forest's confusion-matrix
    error for the test data.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    Raises:
        AssertionError: if the deep-feature frame has the wrong column count
            or the "Totals"/"Error" cell is not within 0.001 of 0.1038.
    """
    h2o.init(ip, port)

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # split data into two parts
    sid = train_hex[1].runif(1234)
    in_unsupervised_half = sid >= 0.5

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[in_unsupervised_half]
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    autoencoder_args = dict(
        x=train_unsupervised.drop(resp),
        y=train_unsupervised[resp],  # ignored (pick any non-constant)
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )
    ae_model = h2o.deeplearning(**autoencoder_args)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()
    feature_count = train_supervised_features.ncol()
    assert feature_count == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(
        x=train_supervised_features,
        y=train_supervised[resp].asfactor(),
        ntrees=10,
        seed=1234,
    )

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    # NOTE(review): cbind() result is discarded — verify that cbind works in
    # place in this h2o version, otherwise the response is never attached.
    test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusionMatrix(test_features)
    cm.show()

    # expected error 0.1038 +/- 0.001
    observed_error = cm["Totals", "Error"]
    assert abs(observed_error - 0.1038) < 0.001, "Error not as expected"
def missing():
    """Train deep learning models on weather data with injected missing values.

    For each ratio in ``missing_ratios`` the weather data is uploaded afresh,
    several columns are converted to factors, missing values are inserted into
    every column except column 23 (the response), and a model is trained on a
    random train/test split. The value ``hh.error()[0][1]`` is recorded per
    ratio and the sum over all ratios must stay below 2.2.

    Raises:
        AssertionError: if the summed errors reach 2.2 or more.
    """
    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = [0] * len(missing_ratios)

    # ChangeTempDir, ChangeTempMag, ChangeWindDirect, MaxWindPeriod,
    # RainToday, PressureChange, RainTomorrow
    factor_columns = (15, 16, 17, 18, 19, 21, 23)

    for idx, fraction in enumerate(missing_ratios):
        data = h2o.upload_file(pyunit_utils.locate("smalldata/junit/weather.csv"))
        for col in factor_columns:
            data[col] = data[col].asfactor()

        print("For missing {0}%".format(fraction * 100))

        # add missing values to the data section of the file (leave the response alone)
        if fraction > 0:
            resp = data[23]
            predictor_idx = list(range(23)) + list(range(24, data.ncol))
            pred = data[:, predictor_idx]
            data_missing = pred.insert_missing_values(fraction=fraction)
            data_fin = data_missing.cbind(resp)
        else:
            data_fin = data

        # split into train + test datasets
        split = data_fin[0].runif()
        train = data_fin[split <= .75]
        test = data_fin[split > .75]

        hh = h2o.deeplearning(
            x=train[2:22],
            y=train[23],
            validation_x=test[2:22],
            validation_y=test[23],
            epochs=5,
            reproducible=True,
            seed=12345,
            activation='RectifierWithDropout',
            l1=1e-5,
            input_dropout_ratio=0.2,
        )
        errors[idx] = hh.error()[0][1]

    for fraction, err in zip(missing_ratios, errors):
        print("missing ratio: {0}% --> classification error: {1}".format(fraction * 100, err))

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
def weights_and_distributions():
    """Train and predict with observation weights under four distributions.

    Fits a deep learning model on the moppe data with ``antskad`` as
    ``weights_column`` for the gamma, gaussian, poisson and tweedie
    distributions in turn, and calls ``predict`` on the training frame each
    time. Nothing is asserted; the run passes if no call raises.
    """
    htable = h2o.upload_file(pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    # NOTE(review): self-assignment with no conversion, unlike the two columns
    # above — confirm whether "zon" was meant to stay as-is.
    htable["zon"] = htable["zon"]

    for distribution in ("gamma", "gaussian", "poisson", "tweedie"):
        dl = h2o.deeplearning(
            x=htable[0:3],
            y=htable["medskad"],
            training_frame=htable,
            distribution=distribution,
            weights_column="antskad",
        )
        predictions = dl.predict(htable)
def missing():
    """Measure deep learning error as more predictor values go missing.

    The weather data is re-uploaded for every ratio in ``missing_ratios``;
    for non-zero ratios, missing values are inserted into all columns except
    the response (column 23). A model is trained on a random 75/25-style split
    and ``hh.error()[0][1]`` is stored. The stored values are printed and
    their sum must be below 2.2.

    Raises:
        AssertionError: if the summed errors are 2.2 or larger.
    """
    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = []

    # column index -> column name as noted by the original author
    columns_to_factor = [
        (15, "ChangeTempDir"),
        (16, "ChangeTempMag"),
        (17, "ChangeWindDirect"),
        (18, "MaxWindPeriod"),
        (19, "RainToday"),
        (21, "PressureChange"),
        (23, "RainTomorrow"),
    ]

    for missing_ratio in missing_ratios:
        data = h2o.upload_file(h2o.locate("smalldata/junit/weather.csv"))
        for col_idx, _name in columns_to_factor:
            data[col_idx] = data[col_idx].asfactor()

        print("For missing {0}%".format(missing_ratio * 100))

        # add missing values to the data section of the file (leave the response alone)
        if missing_ratio > 0:
            resp = data[23]
            pred = data[:, list(range(23)) + list(range(24, data.ncol))]
            data_missing = pred.insert_missing_values(fraction=missing_ratio)
            data_fin = data_missing.cbind(resp)
        else:
            data_fin = data

        # split into train + test datasets
        uniform = data_fin[0].runif()
        train = data_fin[uniform <= 0.75]
        test = data_fin[uniform > 0.75]

        hh = h2o.deeplearning(
            x=train[2:22],
            y=train[23],
            validation_x=test[2:22],
            validation_y=test[23],
            epochs=5,
            reproducible=True,
            seed=12345,
            activation="RectifierWithDropout",
            l1=1e-5,
            input_dropout_ratio=0.2,
        )
        errors.append(hh.error()[0][1])

    for missing_ratio, error in zip(missing_ratios, errors):
        print("missing ratio: {0}% --> classification error: {1}".format(missing_ratio * 100, error))

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
def pubdev_2223():
    """Print the normalization and one-hot accessors of a deep learning model.

    Trains a small network on the covtype data and prints the results of
    ``normmul``, ``normsub``, ``respmul``, ``respsub`` and ``catoffsets``.
    Nothing is asserted; the run passes if every accessor call succeeds.
    """
    covtype = h2o.import_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    dlmodel = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        hidden=[17,191],
        epochs=1,
        training_frame=covtype,
        balance_classes=False,
        reproducible=True,
        seed=1234,
        export_weights_and_biases=True,
    )

    # (message template, accessor) pairs; each accessor is only called when
    # its line is printed, in this order.
    reports = [
        ("Normalization/Standardization multipliers for numeric predictors: {0}\n", dlmodel.normmul),
        ("Normalization/Standardization offsets for numeric predictors: {0}\n", dlmodel.normsub),
        ("Normalization/Standardization multipliers for numeric response: {0}\n", dlmodel.respmul),
        ("Normalization/Standardization offsets for numeric response: {0}\n", dlmodel.respsub),
        ("Categorical offsets for one-hot encoding: {0}\n", dlmodel.catoffsets),
    ]
    for template, accessor in reports:
        print(template.format(accessor()))
def deeplearning_multi(ip, port):
    """Train deep learning on prostate data with a factor response.

    Column 4 is converted to a factor and used as the response; columns 0-1
    are the predictors. The same columns are also passed as validation data.

    Args:
        ip: not used in the body.
        port: not used in the body.
    """
    print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

    prostate = h2o.import_file(h2o.locate("smalldata/logreg/prostate.csv"))
    prostate[4] = prostate[4].asfactor()

    model_args = dict(
        x=prostate[0:2],
        y=prostate[4],
        validation_x=prostate[0:2],
        validation_y=prostate[4],
        loss='CrossEntropy',
    )
    hh = h2o.deeplearning(**model_args)
    hh.show()
def deeplearning_multi():
    """Train deep learning on prostate data with a factor response.

    Uses columns 0-1 as predictors and column 4 (converted to a factor) as
    response, for both the training and the validation arguments, then shows
    the model.
    """
    print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

    data_path = tests.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(data_path)
    prostate[4] = prostate[4].asfactor()

    hh = h2o.deeplearning(
        x=prostate[0:2],
        y=prostate[4],
        validation_x=prostate[0:2],
        validation_y=prostate[4],
        loss='CrossEntropy',
    )
    hh.show()
def imbalance():
    """Check that ``balance_classes=True`` does not increase logloss.

    Two otherwise identical networks are trained on the covtype data, one with
    ``balance_classes=False`` and one with ``balance_classes=True``. Their
    ``logloss()`` values are compared; a diagnostic banner is printed when the
    balanced model is worse.

    Raises:
        AssertionError: if the imbalanced logloss is smaller than the balanced one.
    """
    print("Test checks if Deep Learning works fine with an imbalanced dataset")

    covtype = h2o.upload_file(tests.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    def _train(balance_classes):
        # Same settings for both runs; only balance_classes differs.
        return h2o.deeplearning(
            x=covtype[0:54],
            y=covtype[54],
            l1=1e-5,
            activation="Rectifier",
            loss="CrossEntropy",
            hidden=[200,200],
            epochs=1,
            training_frame=covtype,
            balance_classes=balance_classes,
            reproducible=True,
            seed=1234,
        )

    hh_imbalanced = _train(False)
    print(hh_imbalanced)

    hh_balanced = _train(True)
    print(hh_balanced)

    # compare overall logloss (the variable names say "class_6_err", but the
    # values are the models' logloss)
    class_6_err_imbalanced = hh_imbalanced.logloss()
    class_6_err_balanced = hh_balanced.logloss()

    if class_6_err_imbalanced < class_6_err_balanced:
        banner = [
            "--------------------",
            "",
            "FAIL, balanced error greater than imbalanced error",
            "",
            "",
            "class_6_err_imbalanced",
            class_6_err_imbalanced,
            "",
            "class_6_err_balanced",
            class_6_err_balanced,
            "",
            "--------------------",
        ]
        for item in banner:
            print(item)

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
def weights_and_biases():
    """Check the shapes of exported deep learning weights and biases.

    Trains a network with hidden layers [17, 191] on the covtype data with
    ``export_weights_and_biases=True`` and verifies the column and row counts
    of the three weight frames and the three bias frames.

    Raises:
        AssertionError: if any frame has an unexpected ``ncol`` or ``nrow``.
    """
    # The banner previously said "from R"; this test exercises the Python API.
    print("Test checks if Deep Learning weights and biases are accessible from Python")

    covtype = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    dlmodel = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        hidden=[17,191],
        epochs=1,
        training_frame=covtype,
        balance_classes=False,
        reproducible=True,
        seed=1234,
        export_weights_and_biases=True,
    )
    print(dlmodel)

    layers = range(3)
    weights = [dlmodel.weights(layer) for layer in layers]
    biases = [dlmodel.biases(layer) for layer in layers]

    # Expected (ncol, nrow) per layer, checked in the same order as before:
    # all weight frames first, then all bias frames; columns before rows.
    expected_weight_dims = [(52, 17), (17, 191), (191, 7)]
    expected_bias_dims = [(1, 17), (1, 191), (1, 7)]

    checks = list(zip(weights, expected_weight_dims)) + list(zip(biases, expected_bias_dims))
    for frame, (expected_ncol, expected_nrow) in checks:
        actual_ncol = frame.ncol
        actual_nrow = frame.nrow
        assert actual_ncol == expected_ncol, \
            "wrong dimensionality! expected {0}, but got {1}.".format(expected_ncol, actual_ncol)
        assert actual_nrow == expected_nrow, \
            "wrong dimensionality! expected {0}, but got {1}.".format(expected_nrow, actual_nrow)
def split_fit_predict_dl(h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon):
    """Fit a three-hidden-layer model for one hyper-parameter setting.

    Reads ``train``, ``test`` and ``predictors`` from the enclosing scope.

    Args:
        h1, h2, h3: hidden layer sizes; each is passed through ``int()``.
        hdr1, hdr2, hdr3: hidden dropout ratios for the three layers.
        rho: forwarded to ``h2o.deeplearning`` as ``rho``.
        epsilon: forwarded to ``h2o.deeplearning`` as ``epsilon``.

    Returns:
        ``[mse, r2]`` computed with ``valid=True``.
    """
    setting = (h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon)
    header = "Trying h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon values of:"
    # One space-separated line, as a multi-argument print statement would emit.
    print(" ".join([header] + [str(value) for value in setting]))

    layer_sizes = [int(h1), int(h2), int(h3)]
    dropout_ratios = [hdr1, hdr2, hdr3]

    dl = h2o.deeplearning(
        x=train[predictors],
        y=train['EVI'],
        validation_x=test[predictors],
        validation_y=test['EVI'],
        training_frame=train,
        validation_frame=test,
        weights_column='PixelReliability',
        hidden=layer_sizes,
        activation="RectifierWithDropout",
        hidden_dropout_ratios=dropout_ratios,
        fast_mode=True,
        rho=rho,
        epsilon=epsilon,
    )

    mse = dl.mse(valid=True)
    r2 = dl.r2(valid=True)
    print(" ".join(["Deep learning MSE:", str(mse)]))
    return [mse, r2]
def split_fit_predict_dl(h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon):
    """Train one deep learning model and report its validation MSE and R2.

    ``train``, ``test`` and ``predictors`` are taken from the enclosing scope.

    Args:
        h1, h2, h3: sizes of the three hidden layers (converted with ``int``).
        hdr1, hdr2, hdr3: dropout ratio for each hidden layer.
        rho: passed to ``h2o.deeplearning`` unchanged.
        epsilon: passed to ``h2o.deeplearning`` unchanged.

    Returns:
        A two-element list ``[mse, r2]`` from the validation metrics.
    """
    pieces = ["Trying h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon values of:"]
    for value in (h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon):
        pieces.append(str(value))
    # Single line, items separated by one space.
    print(" ".join(pieces))

    fit_args = dict(
        x=train[predictors],
        y=train['EVI'],
        validation_x=test[predictors],
        validation_y=test['EVI'],
        training_frame=train,
        validation_frame=test,
        weights_column='PixelReliability',
        hidden=[int(h1), int(h2), int(h3)],
        activation="RectifierWithDropout",
        hidden_dropout_ratios=[hdr1, hdr2, hdr3],
        fast_mode=True,
        rho=rho,
        epsilon=epsilon,
    )
    dl = h2o.deeplearning(**fit_args)

    mse = dl.mse(valid=True)
    r2 = dl.r2(valid=True)
    print("Deep learning MSE: %s" % (mse,))
    result = [mse, r2]
    return result
def deeplearning_multi():
    """Train deep learning on prostate data with categorical predictors.

    Columns 1-5 are converted to factors, the ``ID`` column is dropped, and a
    [10, 10] network is trained to predict ``CAPSULE`` from all remaining
    columns with ``use_all_factor_levels=False``.
    """
    print("Test checks if Deep Learning works fine with a categorical dataset")

    # print(locate("smalldata/logreg/protstate.csv"))
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Per the original notes: 1=CAPSULE, 2=AGE, 3=RACE, 4=DPROS, 5=DCAPS.
    for col in range(1, 6):
        prostate[col] = prostate[col].asfactor()

    prostate = prostate.drop('ID')  # remove ID
    prostate.describe()

    predictors = prostate.drop('CAPSULE')
    response = prostate['CAPSULE']
    hh = h2o.deeplearning(
        x=predictors,
        y=response,
        loss='CrossEntropy',
        hidden=[10, 10],
        use_all_factor_levels=False,
    )
    hh.show()
def deeplearning_multi():
    """Fit a deep learning classifier on factor-encoded prostate data.

    Converts columns 1 through 5 to factors, removes ``ID``, then predicts
    ``CAPSULE`` from the remaining columns using two hidden layers of 10 units
    and ``use_all_factor_levels=False``.
    """
    print("Test checks if Deep Learning works fine with a categorical dataset")

    # print(locate("smalldata/logreg/protstate.csv"))
    csv_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=csv_path)

    # (column index, column name noted by the original author)
    factor_columns = [(1, "CAPSULE"), (2, "AGE"), (3, "RACE"), (4, "DPROS"), (5, "DCAPS")]
    for index, _label in factor_columns:
        prostate[index] = prostate[index].asfactor()

    prostate = prostate.drop('ID')  # remove ID
    prostate.describe()

    training_args = dict(
        x=prostate.drop('CAPSULE'),
        y=prostate['CAPSULE'],
        loss='CrossEntropy',
        hidden=[10, 10],
        use_all_factor_levels=False,
    )
    hh = h2o.deeplearning(**training_args)
    hh.show()
def deeplearning_mean_residual_deviance():
    """Check that mean residual deviance is returned as floats per data split.

    Trains a model with ``nfolds=3`` on a random train/validation split of the
    cars data and requests the mean residual deviance for train, valid and
    xval at once.

    Raises:
        AssertionError: if any of the three returned values is not a float.
    """
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]

    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy"

    dl = h2o.deeplearning(
        x=train[predictors],
        y=train[response_col],
        validation_x=valid[predictors],
        validation_y=valid[response_col],
        nfolds=3,
    )
    dl_mrd = dl.mean_residual_deviance(train=True,valid=True,xval=True)

    # result key -> wording used in the failure message
    for key, wording in (("train", "training"), ("valid", "validation"), ("xval", "cross-validation")):
        value = dl_mrd[key]
        assert isinstance(value, float), \
            "Expected {0} mean residual deviance to be a float, but got {1}".format(wording, type(value))
def deeplearning_mean_residual_deviance(ip, port):
    """Verify the types returned by ``mean_residual_deviance``.

    A model is trained on the cars data with a validation set and 3-fold
    cross-validation; the train, valid and xval deviances must each be a
    float.

    Args:
        ip: not used in the body.
        port: not used in the body.

    Raises:
        AssertionError: if a returned deviance is not a float.
    """
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    uniform = cars[0].runif()
    train = cars[uniform > 0.2]
    valid = cars[uniform <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    fit_args = dict(
        x=train[predictors],
        y=train[response_col],
        validation_x=valid[predictors],
        validation_y=valid[response_col],
        nfolds=3,
    )
    dl = h2o.deeplearning(**fit_args)
    dl_mrd = dl.mean_residual_deviance(train=True, valid=True, xval=True)

    train_mrd = dl_mrd['train']
    assert isinstance(train_mrd, float), \
        "Expected training mean residual deviance to be a float, but got {0}".format(type(train_mrd))

    valid_mrd = dl_mrd['valid']
    assert isinstance(valid_mrd, float), \
        "Expected validation mean residual deviance to be a float, but got {0}".format(type(valid_mrd))

    xval_mrd = dl_mrd['xval']
    assert isinstance(xval_mrd, float), \
        "Expected cross-validation mean residual deviance to be a float, but got {0}".format(type(xval_mrd))
def deepLearningDemo(ip, port):
    """Train a GBM and a deep learning model on the ecology data.

    Loads the model and eval CSVs, converts ``Angaus`` to a factor in both,
    trains a bernoulli GBM and a [20, 20, 20] deep learning model with the
    eval data as validation, and shows each model.

    Args:
        ip: not used in the body.
        port: not used in the body.
    """
    response = "Angaus"

    # Training data
    train_data = h2o.import_file(path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop("Site")
    train_data[response] = train_data[response].asfactor()
    print(train_data.describe())
    train_data.head()

    # Testing data
    test_data = h2o.import_file(path=h2o.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data[response] = test_data[response].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run GBM
    gbm_args = dict(
        x=train_data[1:],
        y=train_data[response],
        validation_x=test_data[1:],
        validation_y=test_data[response],
        ntrees=100,
        distribution="bernoulli",
    )
    gbm = h2o.gbm(**gbm_args)
    gbm.show()

    # Run DeepLearning
    dl_args = dict(
        x=train_data[1:],
        y=train_data[response],
        validation_x=test_data[1:],
        validation_y=test_data[response],
        loss="CrossEntropy",
        epochs=1000,
        hidden=[20, 20, 20],
    )
    dl = h2o.deeplearning(**dl_args)
    dl.show()
def deepLearningDemo(ip, port):
    """Connect to H2O and train a GBM and a deep learning model on ecology data.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.
    """
    h2o.init(ip, port)

    # Training data: drop 'Site', make the response categorical.
    train_path = h2o.locate("smalldata/gbm_test/ecology_model.csv")
    train_data = h2o.import_frame(path=train_path)
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print(train_data.describe())
    train_data.head()

    # Testing data: make the response categorical.
    test_path = h2o.locate("smalldata/gbm_test/ecology_eval.csv")
    test_data = h2o.import_frame(path=test_path)
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run GBM
    gbm = h2o.gbm(
        x=train_data[1:],
        y=train_data['Angaus'],
        validation_x=test_data[1:],
        validation_y=test_data['Angaus'],
        ntrees=100,
        distribution="bernoulli",
    )
    gbm.show()

    # Run DeepLearning
    dl = h2o.deeplearning(
        x=train_data[1:],
        y=train_data['Angaus'],
        validation_x=test_data[1:],
        validation_y=test_data['Angaus'],
        loss='CrossEntropy',
        epochs=1000,
        hidden=[20, 20, 20],
    )
    dl.show()
def deepLearningDemo():
    """Train a GBM and a deep learning model on the ecology data and show both.

    The model CSV (minus its ``Site`` column) is the training set and the eval
    CSV is the validation set; ``Angaus`` is converted to a factor in each and
    used as the response.
    """

    def _load(relative_path):
        # Import one ecology CSV located through pyunit_utils.
        return h2o.import_file(path=pyunit_utils.locate(relative_path))

    # Training data
    train_data = _load("smalldata/gbm_test/ecology_model.csv")
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print(train_data.describe())
    train_data.head()

    # Testing data
    test_data = _load("smalldata/gbm_test/ecology_eval.csv")
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run GBM
    gbm = h2o.gbm(
        x=train_data[1:],
        y=train_data['Angaus'],
        validation_x=test_data[1:],
        validation_y=test_data['Angaus'],
        ntrees=100,
        distribution="bernoulli",
    )
    gbm.show()

    # Run DeepLearning
    dl = h2o.deeplearning(
        x=train_data[1:],
        y=train_data['Angaus'],
        validation_x=test_data[1:],
        validation_y=test_data['Angaus'],
        loss='CrossEntropy',
        epochs=1000,
        hidden=[20, 20, 20],
    )
    dl.show()
def deeplearning_basic():
    """Train a cross-entropy deep learning model on iris and show it.

    Columns 0-2 are the predictors and column 4 is the response.
    """
    iris_path = tests.locate("smalldata/iris/iris.csv")
    iris_hex = h2o.import_file(path=iris_path)

    predictors = iris_hex[:3]
    response = iris_hex[4]
    hh = h2o.deeplearning(x=predictors, y=response, loss='CrossEntropy')
    hh.show()
def tweedie_offset(ip, port):
    """Check tweedie deep learning results with and without an offset column.

    Fits the same single-hidden-unit network twice on the insurance data. The
    second fit adds ``offset_column="offset"`` where ``offset`` is
    ``log(Holders)``. Mean residual deviance and the mean/min/max of the first
    prediction column are compared with recorded values to within 1e-6.

    Args:
        ip: not used in the body.
        port: not used in the body.

    Raises:
        AssertionError: when a metric deviates from its recorded value.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    def _fit(**extra):
        # Both fits share every setting below; `extra` carries the offset options.
        return h2o.deeplearning(
            x=insurance[0:3],
            y=insurance["Claims"],
            distribution="tweedie",
            hidden=[1],
            epochs=1000,
            train_samples_per_iteration=-1,
            reproducible=True,
            activation="Tanh",
            single_node_mode=False,
            balance_classes=False,
            force_load_balance=False,
            seed=23123,
            tweedie_power=1.5,
            score_training_samples=0,
            score_validation_samples=0,
            **extra
        )

    # without offset
    dl = _fit()
    observed = dl.mean_residual_deviance()
    assert abs(0.561641366536-observed) < 1e-6, \
        "Expected mean residual deviance to be 0.561641366536, but got {0}".format(observed)

    predictions = dl.predict(insurance)
    observed = predictions[0].mean()
    assert abs(47.6819999424-observed) < 1e-6, \
        "Expected mean of predictions to be 47.6819999424, but got {0}".format(observed)
    observed = predictions[0].min()
    assert abs(1.90409304033-observed) < 1e-6, \
        "Expected min of predictions to be 1.90409304033, but got {0}".format(observed)
    observed = predictions[0].max()
    assert abs(280.735054543-observed) < 1e-6, \
        "Expected max of predictions to be 280.735054543, but got {0}".format(observed)

    # with offset
    dl = _fit(offset_column="offset", training_frame=insurance)
    observed = dl.mean_residual_deviance()
    assert abs(0.261065520191-observed) < 1e-6, \
        "Expected mean residual deviance to be 0.261065520191, but got {0}".format(observed)

    predictions = dl.predict(insurance)
    observed = predictions[0].mean()
    assert abs(49.2939039783-observed) < 1e-6, \
        "Expected mean of predictions to be 49.2939039783, but got {0}".format(observed)
    observed = predictions[0].min()
    assert abs(1.07391126487-observed) < 1e-6, \
        "Expected min of predictions to be 1.07391126487, but got {0}".format(observed)
    observed = predictions[0].max()
    assert abs(397.328758591-observed) < 1e-6, \
        "Expected max of predictions to be 397.328758591, but got {0}".format(observed)
def tweedie_offset():
    """Check tweedie deep learning results with and without an offset column.

    Two fits on the insurance data share all settings; the second adds
    ``log(Holders)`` as offset. Mean residual deviance and the mean/min/max of
    the first prediction column are checked against recorded values, each with
    its own tolerance. The mean is read as ``mean()[0]``.

    Raises:
        AssertionError: when a metric is outside its tolerance.
    """
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    def _fit(**extra):
        # Shared hyper-parameters; `extra` adds the offset options for the second fit.
        return h2o.deeplearning(
            x=insurance[0:3],
            y=insurance["Claims"],
            distribution="tweedie",
            hidden=[1],
            epochs=1000,
            train_samples_per_iteration=-1,
            reproducible=True,
            activation="Tanh",
            single_node_mode=False,
            balance_classes=False,
            force_load_balance=False,
            seed=23123,
            tweedie_power=1.5,
            score_training_samples=0,
            score_validation_samples=0,
            **extra
        )

    # without offset
    dl = _fit()
    actual = dl.mean_residual_deviance()
    assert abs(0.556 - actual) < 1e-3, "Expected mean residual deviance to be 0.556, but got {0}".format(actual)

    predictions = dl.predict(insurance)
    actual = predictions[0].mean()[0]
    assert abs(47.61-actual) < 1e-2, "Expected mean of predictions to be 47.61, but got {0}".format(actual)
    actual = predictions[0].min()
    assert abs(1.94-actual) < 1e-1, "Expected min of predictions to be 1.94, but got {0}".format(actual)
    actual = predictions[0].max()
    assert abs(284.6-actual) < 28, "Expected max of predictions to be 284.6, but got {0}".format(actual)

    # with offset
    dl = _fit(offset_column="offset", training_frame=insurance)
    actual = dl.mean_residual_deviance()
    assert abs(0.261-actual) < 1e-2, "Expected mean residual deviance to be 0.261, but got {0}".format(actual)

    predictions = dl.predict(insurance)
    actual = predictions[0].mean()[0]
    assert abs(49.53-actual) < 1e-1, "Expected mean of predictions to be 49.53, but got {0}".format(actual)
    actual = predictions[0].min()
    assert abs(1.074-actual) < 1e-1, "Expected min of predictions to be 1.074, but got {0}".format(actual)
    actual = predictions[0].max()
    assert abs(397.3-actual) < 40, "Expected max of predictions to be 397.3, but got {0}".format(actual)
def deeplearning_demo(interactive, echo, test):
    """Walk through a deep learning demo on the bundled prostate dataset.

    Prints a description, then alternates between echoing the next group of
    demo commands (via ``echo_and_interact``) and executing the equivalent
    code: upload, summary, train/test split, factor conversion, model
    training, prediction and performance.

    Changes from the earlier version:
      * the split is now disjoint (``r < 0.70`` / ``r >= 0.70``); before, the
        test set used ``r >= 0.30`` and overlapped the training set;
      * the echoed split command defines ``test`` (not ``valid``), matching
        the commands echoed after it;
      * the echoed ``upload_file`` command has balanced parentheses and a
        quoted path;
      * the held-out frame no longer rebinds the ``test`` parameter.

    Args:
        interactive: forwarded to ``echo_and_interact``.
        echo: forwarded to ``echo_and_interact``.
        test: when truthy, ``h2o.init()`` is skipped.
    """
    h2o_data_path = system_file("prostate.csv")

    demo_description = [
        '\n-----------------------------------------------------------------',
        'This is a demo of H2O\'s Deeplearning function.',
        'It uploads a dataset to h2o, parses it, and shows a description.',
        'Then, it divides the dataset into training and test sets, ',
        'builds a model from the training set, and predicts on the test set.',
        'Finally, default performance metrics are displayed.',
        '-----------------------------------------------------------------'
    ]

    # Each echo_and_interact call below consumes the next group of entries
    # (npop entries when given), so the order here must follow the code.
    demo_commands = [
        '# Connect to h2o',
        '>>> h2o.init()\n',
        '\n# Upload the prostate dataset that comes included in the h2o python package',
        '>>> prostate = h2o.upload_file(path = "' + h2o_data_path + '")\n',
        '\n# Print a description of the prostate data',
        '>>> prostate.summary()\n',
        '\n# Randomly split the dataset into ~70/30, training/test sets',
        '>>> r = prostate[0].runif()',
        '>>> train = prostate[r < 0.70]',
        '>>> test = prostate[r >= 0.70]\n',
        '\n# Convert the response columns to factors (for binary classification problems)',
        '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()',
        '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n',
        '\n# Build a (classification) Deeplearning model',
        '>>> prostate_dl = h2o.deeplearning(x=train[list(set(prostate.col_names)-set(["ID","CAPSULE"]))]'
        ', y=train["CAPSULE"], activation="Tanh", hidden=[10, 10, 10], epochs=10000)\n',
        '\n# Show the model',
        '>>> prostate_dl.show()\n',
        '\n# Predict on the test set and show the first ten predictions',
        '>>> predictions = prostate_dl.predict(test)',
        '>>> predictions.show()\n',
        '\n# Show default performance metrics',
        '>>> performance = prostate_dl.model_performance(test)',
        '>>> performance.show()\n'
    ]

    for line in demo_description:
        print(line)
    print("")

    echo_and_interact(demo_commands, interactive, echo)
    if not test:
        h2o.init()

    echo_and_interact(demo_commands, interactive, echo)
    prostate = h2o.upload_file(path=h2o_data_path)

    echo_and_interact(demo_commands, interactive, echo)
    prostate.summary()

    echo_and_interact(demo_commands, interactive, echo, npop=4)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # Complement of the training mask, so no row lands in both sets.
    test_set = prostate[r >= 0.70]

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test_set["CAPSULE"] = test_set["CAPSULE"].asfactor()

    echo_and_interact(demo_commands, interactive, echo)
    predictor_names = list(set(prostate.col_names) - set(["ID", "CAPSULE"]))
    prostate_dl = h2o.deeplearning(
        x=train[predictor_names],
        y=train["CAPSULE"],
        activation="Tanh",
        hidden=[10, 10, 10],
        epochs=10000,
    )

    echo_and_interact(demo_commands, interactive, echo)
    prostate_dl.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    predictions = prostate_dl.predict(test_set)
    predictions.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    performance = prostate_dl.model_performance(test_set)
    performance.show()
def _assert_domain_within(computed_domain, actual_domain):
    """Assert that `computed_domain` holds no level missing from `actual_domain`.

    The check is one-directional: levels present in `actual_domain` but
    absent from `computed_domain` are not reported.
    """
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, (
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the "
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff))


def domain_check():
    """Verify the response domain reported by DRF, GBM, Deeplearning and GLM.

    Each model is trained on the airlines training set; the ``domain``
    entry of its training metrics and of its performance on the airlines
    test set is then checked against the expected ``YES``/``NO`` levels.

    :raises AssertionError: if any reported domain contains an unexpected level.
    """
    air_train = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_train.show()
    air_test = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test.show()

    actual_domain = [u'YES', u'NO']
    print("actual domain of the response: {0}".format(actual_domain))

    predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth",
        "fDayofMonth", "fDayOfWeek"
    ]

    ### DRF ###
    print("")
    print("-------------- DRF:")
    print("")
    rf = h2o.random_forest(x=air_train[predictors],
                           y=air_train["IsDepDelayed"].asfactor(),
                           training_frame=air_train)
    _assert_domain_within(
        rf._model_json['output']['training_metrics']._metric_json['domain'],
        actual_domain)
    perf = rf.model_performance(test_data=air_test)
    _assert_domain_within(perf._metric_json['domain'], actual_domain)

    ### GBM ###
    print("")
    print("-------------- GBM:")
    print("")
    gbm = h2o.gbm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"].asfactor(),
                  training_frame=air_train,
                  distribution="bernoulli")
    _assert_domain_within(
        gbm._model_json['output']['training_metrics']._metric_json['domain'],
        actual_domain)
    # Score the GBM itself (not the random forest) on the test set.
    perf = gbm.model_performance(test_data=air_test)
    _assert_domain_within(perf._metric_json['domain'], actual_domain)

    ### Deeplearning ###
    print("")
    print("-------------- Deeplearning:")
    print("")
    dl = h2o.deeplearning(x=air_train[predictors],
                          y=air_train["IsDepDelayed"].asfactor(),
                          training_frame=air_train,
                          activation="Tanh",
                          hidden=[2, 2, 2],
                          epochs=10)
    _assert_domain_within(
        dl._model_json['output']['training_metrics']._metric_json['domain'],
        actual_domain)
    # Score the deep learning model itself (not the random forest).
    perf = dl.model_performance(test_data=air_test)
    _assert_domain_within(perf._metric_json['domain'], actual_domain)

    ### GLM ###
    print("")
    print("-------------- GLM:")
    print("")
    # Unlike the other models, the response is passed without asfactor().
    glm = h2o.glm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"],
                  training_frame=air_train,
                  family="binomial")
    _assert_domain_within(
        glm._model_json['output']['training_metrics']._metric_json['domain'],
        actual_domain)
    perf = glm.model_performance(test_data=air_test)
    _assert_domain_within(perf._metric_json['domain'], actual_domain)
# Example script: train an H2O Deeplearning classifier on the prostate
# dataset, predict on the same frame and collect summary statistics.
import sys

# The prefix is overridden before h2o is imported; keep this ordering.
# NOTE(review): presumably h2o resolves bundled resources relative to
# sys.prefix -- confirm before moving the import.
sys.prefix = "/usr/local"

# Start up H2O
import h2o

h2o.init(start_h2o=True)

# Load the dataset
prostate_csv = h2o.locate("datasets/prostate.csv")
prostate = h2o.upload_file(path=prostate_csv)
prostate.describe()

# Set the CAPSULE column to be a factor column then build the model
prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
excluded_columns = set(["ID", "CAPSULE"])
predictor_columns = list(set(prostate.col_names) - excluded_columns)
model = h2o.deeplearning(
    x=prostate[predictor_columns],
    y=prostate["CAPSULE"],
    training_frame=prostate,
    activation="Tanh",
    hidden=[10, 10, 10],
    epochs=10000)
model.show()

# Make predictions with the trained model
predictions = model.predict(prostate)
predictions.show()

# Check performance of the classification model
performance = model.model_performance(prostate)
performance.show()

# Domino Diagnostic Statistics
r2 = performance.r2()
mse = performance.mse()
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Check that a model's downloaded Java POJO predicts the same as H2O.

    Builds a model in H2O, downloads its POJO and ``h2o-genmodel.jar`` into
    ``../results/<model id>`` (relative to this file), compiles the POJO,
    runs ``hex.genmodel.tools.PredictCsv`` on the test predictors and
    compares the first prediction column row by row with H2O's own
    predictions.

    :param algo: one of "gbm", "random_forest", "deeplearning", "glm".
    :param equality: "numeric" to compare within 1e-4, or "class" to
        compare for exact equality.
    :param train: training frame; indexed by ``x`` and ``y``.
    :param test: frame to predict on; its ``x`` columns are fed to the POJO.
    :param x: predictor column selector.
    :param y: response column selector.
    :param kwargs: extra arguments passed to the H2O model builder.
    :raises ValueError: if ``algo`` or ``equality`` is not supported.
    :raises AssertionError: if an expected file is missing or predictions differ.
    """
    # Validate up front so a bad argument fails before any expensive work.
    if algo not in ("gbm", "random_forest", "deeplearning", "glm"):
        raise ValueError("algo {0} is not supported".format(algo))
    if equality not in ("numeric", "class"):
        raise ValueError("equality type {0} is not supported".format(equality))

    print("Creating model in H2O")
    # Every supported algo name is also the name of its h2o builder function.
    model = getattr(h2o, algo)(x=train[x], y=train[y], **kwargs)
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", model._id))
    # makedirs also creates a missing "results" parent; it still raises if
    # the model directory itself already exists.
    os.makedirs(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), \
        "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(java_file), \
        "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), \
        "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    with open(in_csv, 'r+') as csv_file:
        unquoted = csv_file.read().replace('"', '')
        csv_file.seek(0)
        csv_file.write(unquoted)
        # The stripped text is shorter; drop the leftover tail.
        csv_file.truncate()
    assert os.path.exists(in_csv), \
        "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g",
        "-J-XX:MaxPermSize=256m", java_file
    ]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    # stderr is merged into stdout, so only the first stream carries output.
    proc = subprocess.Popen(java_cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    java_output, _ = proc.communicate()
    print("Java output: {0}".format(java_output))
    assert os.path.exists(out_pojo_csv), \
        "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, \
        "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, \
        "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            # The POJO value is parsed as a hexadecimal float string.
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                    r, hp, pp)
        else:  # equality == "class", validated above
            pp = predictions2[r, 0]
            assert hp == pp, \
                "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                    r, hp, pp)