# Shared imports for these H2O pyunit tests (consolidated here; the originals
# spread them across separate test files).
import zipfile
import random
import urllib
import numpy as np
import pandas as pd
import statsmodels.api as sm
import h2o
import tests
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from sklearn import ensemble
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer


def smallcatGBM():
    # Training set has 26 categories from A to Z
    # Categories A, C, E, G, ... are perfect predictors of y = 1
    # Categories B, D, F, H, ... are perfect predictors of y = 0

    #Log.info("Importing alphabet_cattest.csv data...\n")
    alphabet = h2o.import_file(path=tests.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()
    #Log.info("Summary of alphabet_cattest.csv from H2O:\n")
    #alphabet.summary()

    # Prepare data for scikit use
    trainData = np.loadtxt(tests.locate("smalldata/gbm_test/alphabet_cattest.csv"), delimiter=',', skiprows=1,
                           converters={0: lambda s: ord(s.split("\"")[1])})
    trainDataResponse = trainData[:, 1]
    trainDataFeatures = trainData[:, 0]

    # Train H2O GBM Model:
    #Log.info("H2O GBM (Naive Split) with parameters:\nntrees = 1, max_depth = 1, nbins = 100\n")
    gbm_h2o = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], distribution="bernoulli",
                      ntrees=1, max_depth=1, nbins=100)
    gbm_h2o.show()

    # Train scikit GBM Model:
    #Log.info("scikit GBM with same parameters:")
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    gbm_sci.fit(trainDataFeatures[:, np.newaxis], trainDataResponse)

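# Hedged sketch (not in the original test): an AUC comparison of the two single-split
# models trained above, in the style of the check at the end of bernoulliGBM() below.
# Reusing gbm_h2o, gbm_sci, trainDataFeatures and trainDataResponse this way is an
# assumption for illustration; with one depth-1 tree both AUCs are expected to be modest.
def smallcatGBM_auc_sketch(gbm_h2o, gbm_sci, trainDataFeatures, trainDataResponse):
    auc_sci = roc_auc_score(trainDataResponse, gbm_sci.predict_proba(trainDataFeatures[:, np.newaxis])[:, 1])
    auc_h2o = gbm_h2o.auc()  # training AUC of the H2O binomial model
    print "scikit AUC: {0}\tH2O AUC: {1}".format(auc_sci, auc_h2o)
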
def dim_checks():
    # Log.info("Uploading logreg/prostate.csv")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    np_data = np.loadtxt(tests.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

    h2o_rows, h2o_cols = h2o_data.dim
    np_rows, np_cols = list(np_data.shape)

    print 'The dimensions of the h2o frame are: {0} x {1}'.format(h2o_rows, h2o_cols)
    print 'The dimensions of the numpy array are: {0} x {1}'.format(np_rows, np_cols)

    assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal numbers of rows and columns"

    # Log.info("Slice out a column and data frame it, try dim on it...")
    h2o_slice = h2o_data[4]
    np_slice = np_data[:, 4]

    h2o_rows, h2o_cols = h2o_slice.dim
    np_rows = np_slice.shape[0]

    print 'The dimensions of the h2o column slice are: {0} x {1}'.format(h2o_rows, h2o_cols)
    print 'The dimensions of the numpy array column slice are: {0} x 1'.format(np_rows)

    assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal numbers of rows and columns"

    # Log.info("OK, now try an operator, e.g. '&', and then check dimensions again...")
    h2oColAmpFive = h2o_slice & 5
    assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"

def link_functions_binomial():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(tests.locate("smalldata/prostate/prostate_complete.csv.zip")).
                          open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"

def deeplearning_demo():
    # Training data
    train_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print train_data.describe()
    train_data.head()

    # Testing data
    test_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print test_data.describe()
    test_data.head()

    # Run DeepLearning
    dl = H2ODeepLearningEstimator(loss="CrossEntropy", epochs=1000, hidden=[20, 20, 20])
    dl.train(x=range(1, train_data.ncol), y="Angaus", training_frame=train_data,
             validation_frame=test_data)
    dl.show()

def benignKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=tests.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(tests.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(1, 7):
        # Log.info("H2O K-Means with " + str(i) + " clusters:\n")
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=i)
        print "H2O centers"
        print benign_h2o_km.centers()

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print "scikit centers"
        print benign_sci_km.cluster_centers_

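# Hedged sketch (not in the original test): one way to compare the two k-means fits
# beyond eyeballing centers is total within-cluster SSE, mirroring the check in
# kmeans_mllib() below. tot_withinss() on the H2O model and inertia_ on the scikit
# model are the respective totals; the 50% relative tolerance is an illustrative
# assumption, since init and missing-value handling differ between the two fits.
def benignKmeans_withinss_sketch(benign_h2o_km, benign_sci_km):
    wcsse_h2o = benign_h2o_km.tot_withinss()
    wcsse_sci = benign_sci_km.inertia_
    print "H2O within-cluster SSE: {0}, scikit within-cluster SSE: {1}".format(wcsse_h2o, wcsse_sci)
    # loose sanity bound only; exact agreement is not expected
    assert abs(wcsse_h2o - wcsse_sci) / max(wcsse_h2o, wcsse_sci) < 0.5
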
def link_functions_gaussian():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(tests.locate("smalldata/prostate/prostate_complete.csv.zip")).
                          open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"

def group_by():
    # Connect to a pre-existing cluster

    h2o_iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(tests.locate("smalldata/iris/iris_wheader.csv"))

    na_handling = ["ignore", "rm", "all"]
    col_names = h2o_iris.col_names[0:4]

    print "Running smoke test"
    # smoke test
    for na in na_handling:
        grouped = h2o_iris.group_by("class")
        grouped \
            .count(na=na) \
            .min(na=na) \
            .max(na=na) \
            .mean(na=na) \
            .var(na=na) \
            .sd(na=na) \
            .ss(na=na) \
            .sum(na=na)
        print grouped.get_frame()

def additional_parameters():
    # col_types as list
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "string"]

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]

    # col_types as dictionary
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = {"c": "string", "a": "enum", "b": "enum"}

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[c_names[i]]

def plot_test():
    kwargs = {}
    kwargs['server'] = True

    air = h2o.import_file(tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Construct train and validation sets by sampling (80/20)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX],
                      validation_y=air_valid[myY], distribution="bernoulli",
                      ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_file(tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    # Plot ROC for test set
    perf.plot(type="roc", **kwargs)

def fiftycatGBM():
    # Training set has only 45 categories, cat1 through cat45
    #Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()
    #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    #train.summary()

    # Train H2O GBM Model:
    #Log.info("H2O GBM with parameters:\nntrees = 10, max_depth = 5, nbins = 20\n")
    model = h2o.gbm(x=train[["x1", "x2"]], y=train["y"], distribution="bernoulli",
                    ntrees=10, max_depth=5, nbins=20)
    model.show()

    # Test dataset has all 50 categories, cat1 through cat50
    #Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_test.csv"))
    #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    #test.summary()

    # Predict on test dataset with GBM model:
    #Log.info("Performing predictions on test dataset...\n")
    predictions = model.predict(test)
    predictions.show()

    # Get the confusion matrix and AUC
    #Log.info("Confusion matrix of predictions (max accuracy):\n")
    performance = model.model_performance(test)
    test_cm = performance.confusion_matrix()
    test_auc = performance.auc()

def pubdev_1953():
    # The commented-out block below documents how the small train/test CSVs used at the
    # end were derived from the full citibike and weather data:
    # small_test = [tests.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay = 1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"] = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
    #                               tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction",
    #                "Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[wthr2["Hour Local"] == 12]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1,
    #                                     day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay = 1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[rain.isna()] = 0
    # bpd_with_weather = bpd.merge(wthr4, allLeft=True, allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[r < 0.6]
    # test = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    predictors = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days',
                  'Month', 'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    train = h2o.import_file(tests.locate("smalldata/glm_test/citibike_small_train.csv"))
    test = h2o.import_file(tests.locate("smalldata/glm_test/citibike_small_test.csv"))
    glm0 = h2o.glm(x=train[predictors], y=train["bikes"], validation_x=test[predictors],
                   validation_y=test["bikes"], family="poisson")

def frame_as_list():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))

    res1 = h2o.as_list(iris, use_pandas=False)
    assert abs(float(res1[9][0]) - 4.4) < 1e-10 and abs(float(res1[9][1]) - 2.9) < 1e-10 and \
           abs(float(res1[9][2]) - 1.4) < 1e-10, "incorrect values"

    res2 = h2o.as_list(prostate, use_pandas=False)
    assert abs(float(res2[7][0]) - 7) < 1e-10 and abs(float(res2[7][1]) - 0) < 1e-10 and \
           abs(float(res2[7][2]) - 68) < 1e-10, "incorrect values"

    res3 = h2o.as_list(airlines, use_pandas=False)
    assert abs(float(res3[4][0]) - 1987) < 1e-10 and abs(float(res3[4][1]) - 10) < 1e-10 and \
           abs(float(res3[4][2]) - 18) < 1e-10, "incorrect values"

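# Hedged sketch (not in the original test): the use_pandas=True path of h2o.as_list
# returns a pandas DataFrame instead of a list of lists. The expected (150, 5) shape
# for iris_wheader is an assumption for illustration.
def frame_as_pandas_sketch(iris):
    res_pd = h2o.as_list(iris, use_pandas=True)
    print res_pd.head()
    assert res_pd.shape == (150, 5), "expected 150 rows and 5 columns"
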
def checkpoint_new_category_in_predictor():
    sv1 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    m1 = h2o.deeplearning(x=sv1[[0, 1, 2, 4]], y=sv1[3], epochs=100)
    m2 = h2o.deeplearning(x=sv2[[0, 1, 2, 4]], y=sv2[3], epochs=200, checkpoint=m1.model_id)

    # attempt to continue building the model, but with an expanded categorical predictor domain.
    # this should fail
    try:
        m3 = h2o.deeplearning(x=vir[[0, 1, 2, 4]], y=vir[3], epochs=200, checkpoint=m1.model_id)
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
    except EnvironmentError:
        pass

    # attempt to predict with the new model, on observations that have an expanded categorical predictor domain.
    predictions = m2.predict(vir)

def wide_dataset_large():
    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(tests.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(tests.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial",
                    lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(tests.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(tests.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC is better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"

def anomaly():
    print "Deep Learning Anomaly Detection MNIST"

    train = h2o.import_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = range(0, 784)
    resp = 784

    # unsupervised -> drop the response column (digit: 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on the training data
    ae_model = h2o.deeplearning(x=train[predictors], training_frame=train, activation="Tanh",
                                autoencoder=True, hidden=[50], l1=1e-5,
                                ignore_const_cols=False, epochs=1)

    # 2) DETECT OUTLIERS
    # anomaly app computes the per-row reconstruction error for the test data set
    # (passing it through the autoencoder model and computing mean squared error (MSE) for each row)
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Let's look at the test set points with low/median/high reconstruction errors.
    # We will now visualize the original test set points and their reconstructions obtained
    # by propagating them through the narrow neural net.

    # Convert the test data into its autoencoded representation (pass through narrow neural net)
    test_recon = ae_model.predict(test)

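# Hedged sketch (not in the original test): a quick numeric look at the per-row
# reconstruction-error frame returned by anomaly(), using the same describe()/mean()
# calls seen elsewhere in this suite. Rows with the largest error are the test digits
# the autoencoder reconstructs worst, i.e. the most anomalous ones.
def anomaly_error_summary_sketch(test_rec_error):
    test_rec_error.describe()
    print "mean reconstruction error: {0}".format(test_rec_error.mean())
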
def iris_h2o_vs_sciKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    iris_h2o = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(tests.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print "Cluster centers from H2O:"
    h2o_centers = h2o_km.centers()
    print h2o_centers

    # Log.info("Cluster centers from scikit:")
    print "Cluster centers from scikit:"
    sci_centers = sci_km.cluster_centers_.tolist()
    print sci_centers

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"

def javapredict_cars():
    # optional parameters
    params = {'ntrees': 5000, 'max_depth': 10, 'min_rows': 1, 'learn_rate': 0.1,
              'balance_classes': random.sample([True, False], 1)[0]}
    print "Parameter list:"
    for k, v in params.items():
        print "{0}, {1}".format(k, v)

    train = h2o.import_file(tests.locate("smalldata/junit/cars_nice_header.csv"))
    test = h2o.import_file(tests.locate("smalldata/junit/cars_nice_header.csv"))
    x = ["name", "economy", "displacement", "power", "weight", "acceleration", "year"]
    y = "cylinders"

    tests.javapredict("gbm", "numeric", train, test, x, y, **params)

def frame_slicing():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    ###################################################################

    # H2OFrame[int] (column slice)
    res1 = iris[0]
    assert abs(res1[8, :] - 4.4) < 1e-10, "incorrect values"

    # H2OFrame[int, int]
    res2 = prostate[13, 3]
    assert abs(res2 - 1) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res3 = airlines[12, 0:3]
    assert abs(res3[0, 0] - 1987) < 1e-10 and abs(res3[0, 1] - 10) < 1e-10 and \
           abs(res3[0, 2] - 29) < 1e-10, "incorrect values"

    # H2OFrame[slice, int]
    res4 = iris[5:8, 1]
    assert abs(res4[0, :] - 3.9) < 1e-10 and abs(res4[1, :] - 3.4) < 1e-10 and \
           abs(res4[2, :] - 3.4) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res5 = prostate[5:8, 0:3]
    assert abs(res5[0, 0] - 6) < 1e-10 and abs(res5[1, 1] - 0) < 1e-10 and \
           abs(res5[2, 2] - 61) < 1e-10, "incorrect values"

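# Hedged sketch (not in the original test): boolean-mask row slicing, the same
# H2OFrame[H2OFrame] pattern used by the train/test splits elsewhere in this suite
# (e.g. air[s <= 0.8] in plot_test). The 5.0 threshold is an arbitrary illustration.
def frame_boolean_slicing_sketch(iris):
    longer = iris[iris[0] > 5.0]   # rows whose first column exceeds 5.0
    assert longer.nrow <= iris.nrow, "a filtered frame cannot gain rows"
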
def fiftycatRF():
    # Training set has only 45 categories, cat1 through cat45
    #Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()
    #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    #train.summary()

    # Train H2O DRF Model:
    #Log.info("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n")
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories, cat1 through cat50
    #Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_test.csv"))
    #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    #test.summary()

    # Predict on test dataset with DRF model:
    #Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    #Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)

def offsets_and_distributions():
    # cars
    cars = h2o.upload_file(tests.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)])
    offset.set_name(0, "x1")
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(tests.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                      training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma",
                          offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian",
                          offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson",
                          offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie",
                          offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

def col_names_check():
    iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], \
        "Expected {0} for column names but got {1}".format(
            ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], iris_wheader.col_names)

    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == ["C1", "C2", "C3", "C4", "C5"], \
        "Expected {0} for column names but got {1}".format(["C1", "C2", "C3", "C4", "C5"], iris.col_names)

    df = h2o.H2OFrame(np.random.randn(100, 4).tolist(), column_names=list("ABCD"), column_types=["Enum"] * 4)
    df.head()
    assert df.col_names == list("ABCD"), \
        "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
    assert df.types == {"A": "Enum", "B": "Enum", "C": "Enum", "D": "Enum"}, \
        "Expected {} for column types but got {}".format(
            {"A": "Enum", "B": "Enum", "C": "Enum", "D": "Enum"}, df.types)

    df = h2o.H2OFrame(np.random.randn(100, 4).tolist())
    df.head()
    assert df.col_names == ["C1", "C2", "C3", "C4"], \
        "Expected {} for column names but got {}".format(["C1", "C2", "C3", "C4"], df.col_names)
    assert df.types == {"C1": "Numeric", "C2": "Numeric", "C3": "Numeric", "C4": "Numeric"}, \
        "Expected {} for column types but got {}".format(
            {"C1": "Numeric", "C2": "Numeric", "C3": "Numeric", "C4": "Numeric"}, df.types)

def hit_ratio_test():
    air_train = h2o.import_file(path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_valid = h2o.import_file(path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test = h2o.import_file(path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    predictors = ["Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed", "fDayofMonth", "fMonth"]
    gbm_mult = h2o.gbm(x=air_train[predictors],
                       y=air_train["fDayOfWeek"].asfactor(),
                       validation_x=air_valid[predictors],
                       validation_y=air_valid["fDayOfWeek"].asfactor(),
                       distribution="multinomial")

    training_hit_ratio_table = gbm_mult.hit_ratio_table(train=True)
    training_hit_ratio_table.show()

    validation_hit_ratio_table = gbm_mult.hit_ratio_table(valid=True)
    validation_hit_ratio_table.show()

    perf = gbm_mult.model_performance(air_test)
    test_hit_ratio_table = perf.hit_ratio_table()
    test_hit_ratio_table.show()

def separator():
    path = "smalldata/jira/hexdev_29.csv"

    fhex = h2o.import_file(tests.locate(path), sep=",")
    fhex.summary()
    fhex_col_summary = h2o.H2OConnection.get_json(
        "Frames/" + urllib.quote(fhex._id) + "/summary")["frames"][0]["columns"]
    fhex_missing_count = sum([e["missing_count"] for e in fhex_col_summary])
    assert fhex_missing_count == 0

    fhex_wrong_separator = h2o.import_file(tests.locate(path), sep=";")
    fhex_wrong_separator.summary()
    fhex_wrong_separator_col_summary = h2o.H2OConnection.get_json(
        "Frames/" + urllib.quote(fhex_wrong_separator._id) + "/summary")["frames"][0]["columns"]
    fhex_wrong_separator_missing_count = sum([e["missing_count"] for e in fhex_wrong_separator_col_summary])
    assert fhex_wrong_separator_missing_count == fhex_wrong_separator._nrows * fhex_wrong_separator._ncols

    try:
        h2o.import_file(tests.locate(path), sep="--")
    except ValueError:
        pass
    else:
        assert False

def test_locate():
    iris_path = tests.locate("smalldata/iris/iris.csv")

    try:
        tests.locate("smalldata/iris/afilethatdoesnotexist.csv")
        assert False, "Expected tests.locate to raise a ValueError"
    except ValueError:
        pass

def get_model_test():
    prostate = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._id)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected regression predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._id)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected binomial predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_file(path=tests.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._id)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._id)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}".format(r, p1, p2)

def bernoulliGBM():
    #Log.info("Importing prostate_train.csv data...\n")
    prostate_train = h2o.import_file(path=tests.locate("smalldata/logreg/prostate_train.csv"))

    #Log.info("Converting CAPSULE column to a factor...\n")
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    #Log.info("H2O Summary of prostate frame:\n")
    #prostate_train.summary()

    # Import prostate_train.csv as numpy array for scikit comparison
    trainData = np.loadtxt(tests.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
    trainDataResponse = trainData[:, 0]
    trainDataFeatures = trainData[:, 1:]

    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # Build H2O GBM classification model:
    #Log.info("H2O GBM with parameters:\ndistribution = 'bernoulli', ntrees = 100, max_depth = 5, "
    #         "min_rows = 10, learn_rate = 0.1\n")
    gbm_h2o = h2o.gbm(x=prostate_train[1:], y=prostate_train["CAPSULE"], ntrees=ntrees,
                      learn_rate=learning_rate, max_depth=depth, min_rows=min_rows,
                      distribution="bernoulli")

    # Build scikit GBM classification model
    #Log.info("scikit GBM with same parameters\n")
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=ntrees,
                                                  max_depth=depth, min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(trainDataFeatures, trainDataResponse)

    #Log.info("Importing prostate_test.csv data...\n")
    prostate_test = h2o.import_file(path=tests.locate("smalldata/logreg/prostate_test.csv"))

    #Log.info("Converting CAPSULE column to a factor...\n")
    prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()

    # Import prostate_test.csv as numpy array for scikit comparison
    testData = np.loadtxt(tests.locate("smalldata/logreg/prostate_test.csv"), delimiter=',', skiprows=1)
    testDataResponse = testData[:, 0]
    testDataFeatures = testData[:, 1:]

    # Score on the test data and compare results

    # scikit
    auc_sci = roc_auc_score(testDataResponse, gbm_sci.predict_proba(testDataFeatures)[:, 1])

    # h2o
    gbm_perf = gbm_h2o.model_performance(prostate_test)
    auc_h2o = gbm_perf.auc()

    #Log.info("scikit AUC: " + str(auc_sci) + "\tH2O AUC: " + str(auc_h2o))
    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation with respect to scikit"

def pubdev_1839():
    train = h2o.import_file(tests.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test = h2o.import_file(tests.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    glm0 = h2o.glm(x=train.drop("bikes"),
                   y=train["bikes"],
                   validation_x=test.drop("bikes"),
                   validation_y=test["bikes"],
                   family="poisson")

def deeplearning_autoencoder():
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp],
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,  # slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with the autoencoder to a lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp]._frame(), 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10, min_rows=10, seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # ~8.1% total error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.081) < 0.001, \
        "Error. Expected 0.081, but got {0}".format(cm.cell_values[10][10])

def shuffling_large():
    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=tests.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=tests.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = h2o.glm(x=train_data[0:1000], y=train_data[1000],
                        family="binomial", lambda_search=True, alpha=[0.5])

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = h2o.glm(x=train_data[0:1000], y=train_data[1000],
                          family="binomial", lambda_search=True, alpha=[0.5])

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = h2o.glm(x=train_data_shuffled[0:1000], y=train_data_shuffled[1000],
                          family="binomial", lambda_search=True, alpha=[0.5])

    print("Assert that the number of predictors remaining and their respective coefficients are equal.")

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_2._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), \
            "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_s._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), \
            "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

def headers():
    headers = h2o.import_file(tests.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    headers_and = h2o.import_file(tests.locate("smalldata/airlines/allyears2k.zip"))
    headers_and.set_names(headers.names)

    print headers.names
    print headers_and.names

    assert headers.names == headers_and.names, \
        "Expected the same column names but got {0} and {1}".format(headers.names, headers_and.names)

def frame_show():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))

    iris.show()
    prostate.show()
    airlines.show()

def runif_check():
    # Connect to a pre-existing cluster

    uploaded_frame = h2o.upload_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    r_u = uploaded_frame[0].runif(1234)

    imported_frame = h2o.import_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    r_i = imported_frame[0].runif(1234)

    print "This demonstrates that seeding runif on identical frames with different chunk distributions " \
          "provides different results. upload_file: {0}, import_file: {1}.".format(r_u.mean(), r_i.mean())

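# Hedged sketch (not in the original test): the flip side of the check above. On one
# and the same frame, runif with a fixed seed should be deterministic, so two draws
# agree; the only assumption is that the frame's chunk layout does not change between calls.
def runif_reproducibility_sketch(frame):
    a = frame[0].runif(1234)
    b = frame[0].runif(1234)
    assert a.mean() == b.mean(), "expected identical seeded runif draws on the same frame"
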
def frame_as_list():
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    (prostate % 10).show()
    (prostate[4] % 10).show()

    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k_headers.zip"))
    (airlines["CRSArrTime"] % 100).show()

def kmeans_mllib():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        err_mllib = np.genfromtxt(tests.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
                                  delimiter=",", skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print "Run k-means++ with k = {0} and max_iterations = 10".format(k)
            cross_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o, k=k,
                                  init="PlusPlus", max_iterations=10, standardize=False)

            clust_mllib = np.genfromtxt(tests.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"),
                                        delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print "\nMLlib Cluster Centers:\n"
            print clust_mllib
            print "\nH2O Cluster Centers:\n"
            print clust_h2o

            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            print "\nMLlib Average Within-Cluster SSE: {0}".format(wcsse_mllib)
            print "H2O Average Within-Cluster SSE: {0}".format(wcsse_h2o)
            assert wcsse_h2o == wcsse_mllib, \
                "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O got {1}".format(
                    wcsse_mllib, wcsse_h2o)

def pubdev_1829():
    train = h2o.import_file(path=tests.locate("smalldata/jira/gbm_checkpoint_train.csv"))
    valid = h2o.import_file(path=tests.locate("smalldata/jira/gbm_checkpoint_valid.csv"))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    train[response_col] = train[response_col].asfactor()
    valid[response_col] = valid[response_col].asfactor()

    ntrees1 = 5
    max_depth1 = 5
    min_rows1 = 10
    model1 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     score_each_iteration=True,
                     distribution=distribution,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])

    ntrees2 = 10
    max_depth2 = 5
    min_rows2 = 10
    model2 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=model1._id)

    model4 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])

    assert model2.auc(valid=True) == model4.auc(valid=True), \
        "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(
            model2.auc(valid=True), model4.auc(valid=True))
    assert model2.giniCoef(valid=True) == model4.giniCoef(valid=True), \
        "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(
            model2.giniCoef(valid=True), model4.giniCoef(valid=True))
    assert model2.logloss(valid=True) == model4.logloss(valid=True), \
        "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(
            model2.logloss(valid=True), model4.logloss(valid=True))

def javapredict_iris_drf():
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print "Parameter list:"
    for k, v in params.items():
        print "{0}, {1}".format(k, v)

    train = h2o.import_file(tests.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(tests.locate("smalldata/iris/iris_train.csv"))
    x = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]
    y = "species"

    tests.javapredict("random_forest", "class", train, test, x, y, **params)

def javapredict_smallcat():
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print "Parameter list:"
    for k, v in params.items():
        print "{0}, {1}".format(k, v)

    train = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))
    x = [0, 1, 2, 4]
    y = 3

    tests.javapredict("random_forest", "numeric", train, test, x, y, **params)

def javapredict_smallcat():
    # optional parameters
    params = {'epochs': 100}
    print "Parameter list:"
    for k, v in params.items():
        print "{0}, {1}".format(k, v)

    train = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))
    x = [0, 1, 2, 4]
    y = 3

    tests.javapredict("deeplearning", "numeric", train, test, x, y, **params)

def colname_set_basic():
    print "Uploading iris data..."

    no_headers = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))
    headers_and = h2o.upload_file(tests.locate("smalldata/iris/iris_header.csv"))

    print no_headers.names
    print headers_and.names

    no_headers.set_names(headers_and.names)
    assert no_headers.names == headers_and.names, \
        "Expected the same column names but got {0} and {1}".format(no_headers.names, headers_and.names)

def prostate():
    h2o_data = h2o.upload_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    sm_data = pd.read_csv(tests.locate("smalldata/logreg/prostate.csv")).as_matrix()
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", nfolds=10, alpha=[0.5])
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    print "statsmodels null deviance {0}".format(sm_glm.null_deviance)
    print "h2o null deviance {0}".format(h2o_glm.null_deviance())
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, "Expected null deviances to be the same"

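# Hedged sketch (not in the original test): printing the fitted coefficients side by
# side, complementing the null-deviance comparison above. h2o_glm.coef() returns a
# name-to-value dict while sm_glm.params is positional, so ordering and intercept
# handling differ; this only eyeballs them rather than asserting equality.
def prostate_coef_sketch(h2o_glm, sm_glm):
    print "h2o coefficients: {0}".format(h2o_glm.coef())
    print "statsmodels coefficients: {0}".format(sm_glm.params)
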