def tree_test():
    # GBM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)
    tree = H2OTree(gbm, 0, "NO")  # Indexing from 0 in Python. There is exactly one tree built
    check_tree(tree, 0, "NO")
    assert tree.root_node.left_levels is not None   # Only categoricals in the model, guaranteed to have categorical split
    assert tree.root_node.right_levels is not None  # Only categoricals in the model, guaranteed to have categorical split

    # DRF
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_nice_header.csv"))
    drf = H2ORandomForestEstimator(ntrees=2)
    drf.train(x=["power", "acceleration"], y="cylinders", training_frame=cars)
    drf_tree = H2OTree(drf, 1, None)
    check_tree(drf_tree, 1)

    # ISOFOR
    ecg_discord = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))
    isofor = H2OIsolationForestEstimator(ntrees=3, seed=12, sample_size=5)
    isofor.train(training_frame=ecg_discord)
    if_tree = H2OTree(isofor, 2)
    check_tree(if_tree, 2)
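# The check_tree helper used above is not shown in this excerpt. A minimal
# sketch of what it plausibly asserts, assuming the documented H2OTree
# properties (root_node, left_children, right_children, tree_number, tree_class):
def check_tree(tree, tree_number, tree_class=None):
    assert tree is not None
    assert tree.root_node is not None
    assert tree.left_children is not None
    assert tree.right_children is not None
    assert tree.tree_number == tree_number    # tree was extracted at the requested index
    if tree_class is not None:
        assert tree.tree_class == tree_class  # class the tree was requested for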
def __init__(self, ID, params):
    Model.__init__(self, ID, params)
    h2o.init()
    datadir = os.path.expanduser('~') + '/FSA/data/'
    trainingFile = datadir + params[1][0]
    valFile = datadir + params[1][1]
    testingFile = datadir + params[1][2]
    self.trainData = h2o.import_file(path=trainingFile)
    self.valData = h2o.import_file(path=valFile)
    # self.valData = self.trainData
    self.testData = h2o.import_file(path=testingFile)
    # print self.trainData.col_names()

    # drop the invalid columns
    self.trainData = self.trainData.drop("away_score").drop("home_score")
    self.valData = self.valData.drop("away_score").drop("home_score")
    self.testData = self.testData.drop("away_score").drop("home_score")
    self.params = params
    if self.params[0] == False:
        self.trainData = self.trainData.drop('spread')
        # self.valData = self.valData.drop('spread')
        self.testData = self.testData.drop('spread')
    # for h2o, creating the model is the same as training the model, so
    # we need to hold off here
    self.model = None
def pubdev_6339():
    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs
    cores = sum(node["num_cpus"] for node in cluster.nodes)

    # paths to the test files
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate("smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv")
    ]

    for file_path in file_paths:
        # read data and parse setup to get number of columns
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)
        # get number of columns from setup
        num_cols = setup['number_columns']
        # get the chunk size
        chunk_size = calculate_chunk_size(file_path, num_cols, cores, cloud_size)
        # get the chunk size H2O actually used, to check the calculation
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)
def directory_import():
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)
        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim
        assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

    small1 = pyunit_utils.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print "Importing smalldata file {0} and directory {1}".format(small1, small2)
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)
    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim
    assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)
def setup_data(self):
    """
    This function performs all necessary initializations: it loads the data sets
    and sets the training set indices and the response column index.
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly choose which family of GBM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # preload datasets, set x_indices, y_index and change response to factor for classification
    if 'multinomial' in self.family:
        self.training_metric = 'logloss'
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        self.scale_model = 1
    else:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))
        self.scale_model = 0.75

    # save the training data files just in case the code crashes
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def ecologyGBM():
    ecology_train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    ntrees = 100
    max_depth = 5
    min_rows = 10
    learn_rate = 0.1

    # Prepare data for scikit use
    trainData = pandas.read_csv(pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    trainData.dropna(inplace=True)
    le = preprocessing.LabelEncoder()
    le.fit(trainData['Method'])
    trainData['Method'] = le.transform(trainData['Method'])
    trainDataResponse = trainData["Angaus"]
    trainDataFeatures = trainData[["SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
                                   "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"]]
    ecology_train["Angaus"] = ecology_train["Angaus"].asfactor()

    # Train H2O GBM Model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees,
                                           learn_rate=learn_rate,
                                           distribution="bernoulli",
                                           min_rows=min_rows,
                                           max_depth=max_depth,
                                           categorical_encoding='label_encoder')
    gbm_h2o.train(x=list(range(2, ecology_train.ncol)), y="Angaus", training_frame=ecology_train)

    # Train scikit GBM Model:
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees,
                                                  max_depth=max_depth, min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(trainDataFeatures, trainDataResponse)

    # Evaluate the trained models on test data
    # Load the test data (h2o)
    ecology_test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_eval.csv"))
    # Load the test data (scikit)
    testData = pandas.read_csv(pyunit_utils.locate("smalldata/gbm_test/ecology_eval.csv"))
    testData.dropna(inplace=True)
    testData['Method'] = le.transform(testData['Method'])
    testDataResponse = testData["Angaus"]
    testDataFeatures = testData[["SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
                                 "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"]]

    # Score on the test data and compare results
    # scikit
    auc_sci = roc_auc_score(testDataResponse, gbm_sci.predict_proba(testDataFeatures)[:, 1])
    # h2o
    gbm_perf = gbm_h2o.model_performance(ecology_test)
    auc_h2o = gbm_perf.auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def pubdev_1953():
    # small_test = [h2o.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay = 1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days", "start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"] = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
    #                               h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local", "Month Local", "Day Local", "Hour Local", "Dew Point (C)", "Humidity Fraction",
    #                "Precipitation One Hour (mm)", "Temperature (C)", "Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[wthr2["Hour Local"] == 12]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1,
    #                                     day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay = 1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[rain.isna()] = 0
    # bpd_with_weather = bpd.merge(wthr4, allLeft=True, allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[r < 0.6]
    # test = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    predictors = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days', 'Month',
                  'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    train = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_train.csv"))
    test = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_test.csv"))

    glm0 = h2o.glm(x=train[predictors], y=train["bikes"], validation_x=test[predictors],
                   validation_y=test["bikes"], family="poisson")
def plot_test():
    kwargs = {}
    kwargs['server'] = True

    air = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX],
                      validation_y=air_valid[myY], distribution="bernoulli", ntrees=100,
                      max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    # Plot ROC for test set
    perf.plot(type="roc", **kwargs)
def fiftycatRF(ip, port):
    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()
    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O DRF Model:
    # Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with DRF model:
    # Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
def frame_slicing(ip, port):
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    ###################################################################
    # H2OFrame[int] (column slice)
    res1 = iris[0]
    assert abs(res1[8, :] - 4.4) < 1e-10, "incorrect values"

    # H2OFrame[int,int]
    res2 = prostate[13, 3]
    assert abs(res2 - 1) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res3 = airlines[12, 0:3]
    assert abs(res3[0, 0] - 1987) < 1e-10 and abs(res3[0, 1] - 10) < 1e-10 and abs(res3[0, 2] - 29) < 1e-10, \
        "incorrect values"

    # H2OFrame[slice, int]
    res4 = iris[5:8, 1]
    assert abs(res4[0, :] - 3.9) < 1e-10 and abs(res4[1, :] - 3.4) < 1e-10 and abs(res4[2, :] - 3.4) < 1e-10, \
        "incorrect values"

    # H2OFrame[slice, slice]
    res5 = prostate[5:8, 0:3]
    assert abs(res5[0, 0] - 6) < 1e-10 and abs(res5[1, 1] - 0) < 1e-10 and abs(res5[2, 2] - 61) < 1e-10, \
        "incorrect values"
def fiftycatGBM(ip, port):
    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()
    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O GBM Model:
    # Log.info(paste("H2O GBM with parameters:\nntrees = 10, max_depth = 20, nbins = 20\n", sep = ""))
    model = h2o.gbm(x=train[["x1", "x2"]], y=train["y"], distribution="bernoulli", ntrees=10,
                    max_depth=5, nbins=20)
    model.show()

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with GBM model:
    # Log.info("Performing predictions on test dataset...\n")
    predictions = model.predict(test)
    predictions.show()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    performance = model.model_performance(test)
    test_cm = performance.confusion_matrix()
    test_auc = performance.auc()
def table_check():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    print(df[['AGE', 'RACE']].table(dense=True).head().as_data_frame(True))
    print(df[['AGE', 'RACE']].table(dense=False).head().as_data_frame(True))
    print(df[['RACE', 'AGE']].table(dense=True).head().as_data_frame(True))
    print(df[['RACE', 'AGE']].table(dense=False).head().as_data_frame(True))

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    # single column (frame)
    table1 = iris["C5"].table()
    assert table1[0, 1] == 50, "Expected 50 of {0}, but got {1}".format(table1[0, 0], table1[0, 1])
    assert table1[1, 1] == 50, "Expected 50 of {0}, but got {1}".format(table1[1, 0], table1[1, 1])
    assert table1[2, 1] == 50, "Expected 50 of {0}, but got {1}".format(table1[2, 0], table1[2, 1])

    # two-column (one argument)
    # dense
    table2 = iris["C1"].table(iris["C5"])
    # not dense
    table3 = iris["C1"].table(iris["C5"], dense=False)
    # check same value
    assert (table3[table3['C1'] == 5, 'Iris-setosa'] ==
            table2[(table2['C1'] == 5) & (table2['C5'] == 'Iris-setosa'), 'Counts']).all()
    assert (table2 == iris[["C1", "C5"]].table()).all()
    assert (table3 == iris[["C1", "C5"]].table(dense=False)).all()

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    table = cars[2].table().as_data_frame()
    table = dict(table[1:])
    table = {k: int(v) for k, v in list(table.items())}
    expected = Counter(itertools.chain(*cars[2].as_data_frame()[1:]))
    assert table == expected, "Expected {} for table counts but got {}".format(expected, table)
def user():
    a = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    a.head()

    print(a[0].names)         # Column header
    print(a[2, 0])            # column 0, row 2 value
    print(a[2, "sepal_len"])  # Column 0, row 2 value
    (a[0] + 2).show()     # Add 2 to every element; broadcast a constant
    (a[0] + a[1]).show()  # Add 2 columns; broadcast parallel add
    sum(a).show()
    print(a["sepal_len"].mean())

    print()
    print("Rows 50 through 77 in the `sepal_len` column")
    a[50:78, "sepal_len"].show()  # print out rows 50 thru 77 inclusive
    print()

    a["sepal_len"].show()
    print(a[50:78, ["sepal_len", "sepal_wid"]].show())
    a.show()

    print("The column means: ")
    print(a.mean())
    print()

    try:
        print(a["Sepal_len"].dim)  # Error, misspelt column name
    except Exception:
        pass  # Expected error

    b = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    c = a + b
    d = c + c + sum(a)
    e = c + a + 1
    e.show()
    # Note that "d=c+..." keeps the internal C expressions alive, until "d" goes
    # out of scope even as we nuke "c"
    c.show()
    c = None
    # Internal "ExprNode(c=a+b)" not dead!
    print(1 + (a[0] + b[1]).mean())

    import collections
    c = h2o.H2OFrame(collections.OrderedDict({"A": [1, 2, 3], "B": [4, 5, 6]}))
    c.show()
    c.describe()
    c.head()

    c[0].show()
    print(c[1, 0])
    c[0:2, 0].show()

    sliced = a[0:51, 0]
    sliced.show()
def stackedensemble_metalearner_seed_test():
    # Import training set
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Metalearner params for gbm, drf, glm, and deep learning
    gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, nfolds=nfolds,
                                          keep_cross_validation_predictions=True, seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10, nfolds=nfolds, keep_cross_validation_predictions=True, seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train two SE models with the same metalearner seed
    stack_gbm1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=55555)
    stack_gbm2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = h2o.get_model(stack_gbm1.metalearner()['name'])
    meta_gbm2 = h2o.get_model(stack_gbm2.metalearner()['name'])

    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(train=True), "RMSE should match if same seed"

    # Train two SE models with different metalearner seeds
    stack_gbm3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=55555)
    stack_gbm4 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = h2o.get_model(stack_gbm3.metalearner()['name'])
    meta_gbm4 = h2o.get_model(stack_gbm4.metalearner()['name'])

    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(train=True), "RMSE should NOT match if diff seed"
def col_names_check():
    iris_wheader = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], \
        "Expected {0} for column names but got {1}".format(
            ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], iris_wheader.col_names)

    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == ["C1", "C2", "C3", "C4", "C5"], \
        "Expected {0} for column names but got {1}".format(["C1", "C2", "C3", "C4", "C5"], iris.col_names)

    df = h2o.H2OFrame.from_python(list(zip(*np.random.randn(100, 4).tolist())),
                                  column_names=list("ABCD"), column_types=["enum"] * 4)
    df.head()
    assert df.col_names == list("ABCD"), \
        "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
    assert list(df.types.values()) == ["enum"] * 4, \
        "Expected {} for column types but got {}".format(["enum"] * 4, df.types)

    df = h2o.H2OFrame(list(zip(*np.random.randn(100, 4).tolist())))
    df.head()
    assert df.col_names == ["C1", "C2", "C3", "C4"], \
        "Expected {} for column names but got {}".format(["C1", "C2", "C3", "C4"], df.col_names)
    assert list(df.types.values()) == ["real"] * 4, \
        "Expected {} for column types but got {}".format(["real"] * 4, df.types)

    df = h2o.H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
    df.head()
    assert df.col_names == ["B"], "Expected {} for column names but got {}".format(["B"], df.col_names)

    df = h2o.H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']}, column_names=["X"])
    df.head()
    assert df.col_names == ["X"], "Expected {} for column names but got {}".format(["X"], df.col_names)
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Splitting data")
    for c in ["Month", "DayofMonth", "IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, ["Year", "DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]).sum() == 0

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def anomaly():
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = list(range(0, 784))
    resp = 784

    # unsupervised -> drop the response column (digit: 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on train_hex
    ae_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[2], l1=1e-5,
                                       ignore_const_cols=False, epochs=1)
    ae_model.train(x=predictors, training_frame=train)

    # 2) DETECT OUTLIERS
    # anomaly app computes the per-row reconstruction error for the test data set
    # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Let's look at the test set points with low/median/high reconstruction errors.
    # We will now visualize the original test set points and their reconstructions obtained
    # by propagating them through the narrow neural net.

    # Convert the test data into its autoencoded representation (pass through narrow neural net)
    test_recon = ae_model.predict(test)
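# A minimal sketch (not part of the original test) of how the low/median/high
# reconstruction-error rows mentioned in step 3 could be picked out; it assumes
# the anomaly frame exposes a single "Reconstruction.MSE" column:
err = test_rec_error.as_data_frame()["Reconstruction.MSE"]
order = err.sort_values().index
low, median, high = order[0], order[len(order) // 2], order[-1]
print("row with lowest MSE: {0}, median: {1}, highest: {2}".format(low, median, high))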
def test_relevel():
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                        na_strings=["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0, missing_values_handling="Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    assert "DPROS.None" in ns, "None level is NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns, "Both level IS expected to be skipped by default"

    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0, missing_values_handling="Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    # after relevel, "None" is the base level and is skipped instead of "Both"
    assert "DPROS.None" not in ns2, "None level IS expected to be skipped after relevel"
    assert "DPROS.Both" in ns2, "Both level is NOT expected to be skipped after relevel"

    # Second, compare against R input (taken from runit_relevel.R)
    dr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")
    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {"Intercept": -7.63245, "DPROS.Both": 1.39185, "DPROS.Left": 0.73482,
                 "DPROS.Right": 1.51437, "RACE.White": 0.65160, "DCAPS.Yes": 0.49233,
                 "AGE": -0.01189, "PSA": 0.02990, "VOL": -0.01141, "GLEASON": 0.96466927}
    coeff_diff = {key: abs(exp_coefs[key] - mh2o2.coef().get(key, 0)) for key in exp_coefs.keys()}
    assert max(coeff_diff.values()) < 1e-4
def bigcat_gbm():
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtypeTest[54].asfactor()

    regular = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    regular.train(x=list(range(54)), y=54, training_frame=covtype)

    # do prediction on original dataset, no warnings
    check_warnings(regular, 0, covtypeTest)

    # drop response, no warnings
    covtypeTest = covtypeTest.drop(54)
    check_warnings(regular, 0, covtypeTest)

    # drop two predictor columns, expect two warnings
    covtypeTest = covtypeTest.drop(1)
    covtypeTest = covtypeTest.drop(1)
    check_warnings(regular, 2, covtypeTest)

    # drop three predictor columns from a fresh copy, expect three warnings
    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtypeTest[54].asfactor()
    covtypeTest = covtypeTest.drop(3)
    covtypeTest = covtypeTest.drop(5)
    covtypeTest = covtypeTest.drop(7)
    check_warnings(regular, 3, covtypeTest)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Splitting data")
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, 0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, 1] - new_test[:, 1]).sum() == 0

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def glrm_catagorical_bug_fix():
    trainData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    testData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    glrmModel = H2OGeneralizedLowRankEstimator(k=4)
    glrmModel.train(x=trainData.names, training_frame=trainData)
    predV = glrmModel.predict(testData)
    print(predV)
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:
            tol_time = 200            # comparing in ms or ns
            tol_numeric = 1e-5        # tolerance for comparing other numeric fields
            numElements2Compare = 10  # choose number of elements per column to compare. Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:
            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5

            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types=data_types)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare,
                                               tol_time, tol_numeric, True), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def col_names_check():
    iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], \
        "Expected {0} for column names but got {1}".format(
            ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], iris_wheader.col_names)

    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == ["C1", "C2", "C3", "C4", "C5"], \
        "Expected {0} for column names but got {1}".format(["C1", "C2", "C3", "C4", "C5"], iris.col_names)

    df = h2o.H2OFrame(np.random.randn(100, 4).tolist(), column_names=list("ABCD"), column_types=["Enum"] * 4)
    df.head()
    assert df.col_names == list("ABCD"), \
        "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
    assert df.types == {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, \
        "Expected {} for column types but got {}".format(
            {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, df.types)

    df = h2o.H2OFrame(np.random.randn(100, 4).tolist())
    df.head()
    assert df.col_names == ["C1", "C2", "C3", "C4"], \
        "Expected {} for column names but got {}".format(["C1", "C2", "C3", "C4"], df.col_names)
    assert df.types == {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, \
        "Expected {} for column types but got {}".format(
            {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, df.types)
def fiftycat_gbm():
    # Training set has only 45 categories cat1 through cat45
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    # Train H2O GBM Model:
    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=5, nbins=20)
    model.train(x=["x1", "x2"], y="y", training_frame=train)
    model.show()

    # Test dataset has all 50 categories cat1 through cat50
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Predict on test dataset with GBM model:
    predictions = model.predict(test)
    predictions.show()

    # Get the confusion matrix and AUC
    performance = model.model_performance(test)
    test_cm = performance.confusion_matrix()
    test_auc = performance.auc()
def directory_import():
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)
        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim
        assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)

    small1 = h2o.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print "Importing smalldata file {0} and directory {1}".format(small1, small2)
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)
    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim
    assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)
def additional_parameters():
    # col_types as list
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "string"]

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]

    # col_types as dictionary
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = {"c": "string", "a": "enum", "b": "enum"}

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[c_names[i]]
def pubdev_1431(ip, port):
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)
        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30], y=airlines_billion_1[30], ntrees=1,
                      distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion_1)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        airlines_billion_2 = h2o.import_file(csv)
        os.remove(csv)
        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        assert r1 == r2 and c1 == c2, "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal " \
                                      "c1: {2} c2: {3}".format(r1, r2, c1, c2)
    else:
        print "Not running on H2O internal network. No access to HDFS."
def test_arrange_OOM():
    '''
    PUBDEV-5990: a customer reported that h2o.arrange (sorting) takes far more memory
    than expected for sparse datasets of about 1G.  Thanks to Lauren DiPerna for
    finding the dataset to reproduce the problem.
    '''
    df = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM.csv"))
    t1 = time.time()
    newFrame = df.sort("sort_col")
    print(newFrame[0, 0])
    elapsed_time = time.time() - t1
    print("time taken to perform sort is {0}".format(elapsed_time))

    # check and make sure the sort column contains the right values after sorting!
    answerFrame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM_answer.csv"))

    # compare sort_col from my sort with the answer frame
    pyunit_utils.compare_frames_local(answerFrame["sort_col"], newFrame["sort_col"])

    # compare a few more columns with the answer frame; comparing all columns would take too long
    allColumns = list(range(0, df.ncols))
    random.shuffle(allColumns)
    pyunit_utils.compare_frames_local(answerFrame[allColumns[0:5]], newFrame[allColumns[0:5]])
'''
This event flow generates a gradient boosting model using a classification approach.
'''
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

print 'A2 Benchmark'
print '------------'

# Initialize H2O server
h2o.init(max_mem_size_GB=5)

# Load train and test data as H2O frames
train = h2o.import_file('processed-data/A2Benchmark_train.csv')
test = h2o.import_file('processed-data/A2Benchmark_test.csv')

# Define input and response columns
response_column = 'is_anomaly'
input_columns = train.col_names
input_columns.remove(response_column)
input_columns.remove('timestamp')

print 'Input columns :', input_columns
print 'Response column :', response_column

# Explicitly mark that the response column contains label data
train[response_column] = train[response_column].asfactor()
test[response_column] = test[response_column].asfactor()
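# The excerpt stops before the model is actually built; a minimal sketch of the
# training step the header describes (ntrees/max_depth are illustrative
# assumptions, not values from the original):
model = H2OGradientBoostingEstimator(ntrees=100, max_depth=5)
model.train(x=input_columns, y=response_column, training_frame=train)
performance = model.model_performance(test_data=test)
print(performance)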
def weights_check(ip, port):
    def check_same(data1, data2, min_rows_scale):
        gbm1_regression = h2o.gbm(x=data1[["displacement", "power", "weight", "acceleration", "year"]],
                                  y="economy",
                                  training_frame=data1,
                                  min_rows=5,
                                  ntrees=5,
                                  max_depth=5)
        gbm2_regression = h2o.gbm(x=data2[["displacement", "power", "weight", "acceleration", "year", "weights"]],
                                  y=data2["economy"],
                                  min_rows=5 * min_rows_scale,
                                  weights_column=data2["weights"],
                                  ntrees=5,
                                  max_depth=5)
        gbm1_binomial = h2o.gbm(x=data1[["displacement", "power", "weight", "acceleration", "year"]],
                                y=data1["economy_20mpg"],
                                min_rows=5,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm2_binomial = h2o.gbm(x=data2[["displacement", "power", "weight", "acceleration", "year", "weights"]],
                                y=data2["economy_20mpg"],
                                weights_column="weights",
                                training_frame=data2,
                                min_rows=5 * min_rows_scale,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm1_multinomial = h2o.gbm(x=data1[["displacement", "power", "weight", "acceleration", "year"]],
                                   y=data1["cylinders"],
                                   min_rows=5,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)
        gbm2_multinomial = h2o.gbm(x=data2[["displacement", "power", "weight", "acceleration", "year", "weights"]],
                                   y=data2["cylinders"],
                                   weights_column="weights",
                                   training_frame=data2,
                                   min_rows=5 * min_rows_scale,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)

        reg1_mse = gbm1_regression.mse()
        reg2_mse = gbm2_regression.mse()
        bin1_auc = gbm1_binomial.auc()
        bin2_auc = gbm2_binomial.auc()
        mul1_mse = gbm1_multinomial.mse()
        mul2_mse = gbm2_multinomial.mse()

        print "MSE (regression) no weights vs. weights: {0}, {1}".format(reg1_mse, reg2_mse)
        print "AUC (binomial) no weights vs. weights: {0}, {1}".format(bin1_auc, bin2_auc)
        print "MSE (multinomial) no weights vs. weights: {0}, {1}".format(mul1_mse, mul2_mse)

        assert abs(reg1_mse - reg2_mse) < 1e-6 * reg1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(reg1_mse, reg2_mse)
        assert abs(bin1_auc - bin2_auc) < 1e-6 * bin1_auc, \
            "Expected auc's to be the same, but got {0}, and {1}".format(bin1_auc, bin2_auc)
        assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(mul1_mse, mul2_mse)

    h2o_cars_data = h2o.import_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    h2o_cars_data["economy_20mpg"] = h2o_cars_data["economy_20mpg"].asfactor()
    h2o_cars_data["cylinders"] = h2o_cars_data["cylinders"].asfactor()

    # uniform weights same as no weights
    random.seed(2222)
    weight = random.randint(1, 10)
    uniform_weights = [[weight] for r in range(406)]
    h2o_uniform_weights = h2o.H2OFrame(python_obj=uniform_weights)
    h2o_uniform_weights.setNames(["weights"])
    h2o_data_uniform_weights = h2o_cars_data.cbind(h2o_uniform_weights)

    print "Checking that using uniform weights is equivalent to no weights:"
    print check_same(h2o_cars_data, h2o_data_uniform_weights, weight)

    # zero weights same as removed observations
    zero_weights = [[0] if random.randint(0, 1) else [1] for r in range(406)]
    h2o_zero_weights = h2o.H2OFrame(python_obj=zero_weights)
    h2o_zero_weights.setNames(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print "Checking that using some zero weights is equivalent to removing those observations:"
    print check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0, 1) else [2] for r in range(406)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.setNames(["weights"])
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights):
        if w[0] == 2:
            doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.setNames(colnames)
    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()

    print "Checking that doubling some weights is equivalent to doubling those observations:"
    print check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)
def grid_lambda_search():
    # Log.info("Importing prostate.csv data...\n")
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    # prostate.summary()

    # Log.info("H2O GLM (binomial) with parameters: alpha = c(0.25, 0.5), nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    model = H2OGeneralizedLinearEstimator(family="binomial", nlambdas=5, lambda_search=True, n_folds=2)
    model.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    # model = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", nlambdas=5, lambda_search=True, n_folds=2)

    if random.random() < 0.5:
        model_idx = 0
    else:
        model_idx = 1
    model_bestlambda = model.models(model_idx)
    params_bestlambda = model.params()

    # Log.info(cat("All lambda values returned:\n", params_bestlambda.lambdas()))
    assert len(params_bestlambda.lambdas()) <= 5, "expected 5 or fewer lambdas"

    random_lambda = random.choice(params_bestlambda.lambdas())
    print("RANDOM LAMBDA")
    print(random_lambda)

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and randomly chosen lambda", random_lambda, "\n"))
    random_model = model.getGLMLambdaModel(model_bestlambda, random_lambda)

    # Log.info("EXPECTING THESE TO BE EQUAL")
    print(random_model.Lambda())
    print(random_lambda)
    assert random_model.Lambda() == random_lambda, "expected lambdas to be equal"

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and best lambda", params_bestlambda.lambdaBest(), "\n"))
    best_model = h2o.getGLMLambdaModel(model_bestlambda, params_bestlambda.lambda_best())
    assert best_model.model() == model_bestlambda.model(), "expected models to be equal"

    # Log.info("H2O GLM (binomial) with parameters: alpha = [0.25, 0.5], nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    prostate_search = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.25, 0.5], nlambdas=5,
                                                    lambda_search=True, n_folds=2)
    prostate_search.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    # prostate_search = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", alpha=[0.25, 0.5], nlambdas=5, lambda_search=True, n_folds=2)

    model_search = prostate_search.models(model_idx)
    models_best = model_search.models(model_search.best_model())
    params_best = models_best.params()

    assert params_bestlambda.lambda_best() == params_best.lambda_best(), "expected lambdas to be equal"
    assert len(params_best.lambda_all()) <= 20, "expected 20 or fewer lambdas"
def algo_pr_auc_test():
    '''
    This pyunit test is written to expose the pr_auc for all binomial runs of all algos per PUBDEV-5665.
    '''
    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1, max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing GBM model")
    print(gbm_h2o)
    assert_found_pr_auc(gbm_h2o, 'training_pr_auc')  # check and make sure pr_auc is found in scoring history

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc

    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing random forest model")
    print(rf_h2o)
    assert_found_pr_auc(rf_h2o, 'training_pr_auc')  # check and make sure pr_auc is found in scoring history

    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2, 2])
    dl_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing deeplearning model")
    print(dl_h2o)
    assert_found_pr_auc(dl_h2o, 'training_pr_auc')  # check and make sure pr_auc is found in scoring history

    print("precision/recall AUC for gbm is {0}, for glm is {1},\n for rf is {2}, for deeplearning is {3}".format(
        gbm_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"],
        glm_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"],
        rf_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"],
        dl_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"]))

    assert abs(gbm_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"] -
               glm_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"]) < 0.9, \
        "problem with pr_auc values"
    assert abs(rf_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"] -
               dl_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"]) < 0.9, \
        "problem with pr_auc values"
#==============================================================================
# POJO is a standalone Java class with no dependencies on the full H2O stack
# (only the h2o-genmodel.jar file, which defines the POJO interface).
#==============================================================================
import h2o
from h2o.estimators.kmeans import H2OKMeansEstimator

h2o.init()

h2o_df = h2o.import_file(path="C:/xxx.csv", parse=True, header=0, sep=",")
type(h2o_df)
h2o_df.describe()
h2o_df.columns
h2o_df.types
h2o_df.as_data_frame()
type(_)

h2o.import_file?
h2o.parse_setup?
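# The header above describes POJOs, but the transcript never exports one. A
# minimal sketch of producing a POJO from a K-means model, assuming the frame's
# columns are usable as clustering features; h2o.download_pojo is the standard API:
kmeans_model = H2OKMeansEstimator(k=3)
kmeans_model.train(x=h2o_df.col_names, training_frame=h2o_df)
# writes <model_id>.java (and optionally h2o-genmodel.jar) to the given path
h2o.download_pojo(kmeans_model, path=".", get_jar=True)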
'''
This event flow generates a deep learning model using a regression approach.
'''
import h2o
from h2o.estimators import H2ODeepLearningEstimator

# Initialize H2O server
h2o.init(max_mem_size_GB=5)

# Load train and test data as H2O frames
train = h2o.import_file('processed-data/train.csv')
test = h2o.import_file('processed-data/test.csv')

# Define input and response columns
response_column = 'RUL'
input_columns = train.col_names
input_columns.remove('UnitNumber')
input_columns.remove('Time')
input_columns.remove('Setting1')
input_columns.remove('Setting2')
input_columns.remove('Setting3')
input_columns.remove('RUL')

# Define model and train model
model = H2ODeepLearningEstimator(hidden=[500, 500], nfolds=10, epochs=100)
model.train(x=input_columns, y=response_column, training_frame=train)

# Test model
performance = model.model_performance(test_data=test)
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that the various models will not crash if the
    max_runtime_secs is set to be too short.  See PUBDEV-4802.
    '''
    global model_within_max_runtime
    seed = 12345

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA, pca_method=Power
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=Randomized
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Randomized", compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=GLRM
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="GLRM", compute_metrics=True,
                   use_all_factor_levels=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print("******************** Skip testing stack ensemble. Not an iterative algo.")

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print("******************** Skip testing Naive Bayes. Not an iterative algo.")

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50, hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(model, training1_data, x_indices, y_index)
        cleanUp([training1_data, model])

    # GLRM, does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3,
                                           transform="STANDARDIZE", recover_svd=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
# Import Modules
import h2o
import pandas
import random

project_path = "/gtc-2017"

# Connect or Start H2O
h2o.init()

# Import Data
mnist_training = h2o.import_file(project_path + "/data/mnist-training.csv")
mnist_testing = h2o.import_file(project_path + "/data/mnist-testing.csv")

mnist_training["label"] = mnist_training["label"].asfactor()
mnist_testing["label"] = mnist_testing["label"].asfactor()

# Explore Data
print(mnist_training.head())

# Build Deep Water Model
from h2o.estimators.deepwater import H2ODeepWaterEstimator

model_mnist_lenet_mx = H2ODeepWaterEstimator(epochs=80, network="lenet")
model_mnist_lenet_mx.train(x=["uri"], y="label", training_frame=mnist_training,
                           validation_frame=mnist_testing, model_id="model_mnist_lenet_mx")
model_mnist_lenet_mx.show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

data = pd.read_csv("random_data.csv")
data.head()

mod = sm.OLS(data["y"], data["x"])
res = mod.fit()
print(res.summary())

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

h2o.init()
h2o_df = h2o.import_file("random_data.csv")
h2o_df.summary()

m = H2OGeneralizedLinearEstimator(model_id="GLM_1", nfolds=0)
x = h2o_df.col_names[0]
y = h2o_df.col_names[1]
m.train(x, y, h2o_df)
m
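# A small follow-up (not in the original) putting the two fits side by side;
# both calls are standard APIs, though the printed values depend on random_data.csv:
print(res.params)  # statsmodels OLS coefficients
print(m.coef())    # H2O GLM coefficients (H2O fits an intercept by default)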
import h2o
import os
import tabulate
import operator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()

# Loading Data
productionprocess = h2o.import_file(
    path=os.path.realpath("/home/iconnect4/bespoke_manufacturing/data/production-process-data.csv"),
    destination_frame="bespokemanufacturing",
    header=1,
    col_types=["string", "string", "string", "string", "string", "string", "string"])
productionprocess.describe()

data_cols = ["productshortname", "prodordertype", "prodordercategory", "orderitempriority",
             "ordersource", "address_dl_country", "prod_allocated_process"]
for col in data_cols:
    productionprocess[col] = productionprocess[col].asfactor()

# Split into train and test frames
train, test = productionprocess.split_frame(ratios=[0.7])
print(train.nrows)
print(test.nrows)
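# The GBM estimator is imported above but never used in this excerpt. A minimal
# sketch of the training step, assuming "prod_allocated_process" is the response
# and the remaining factor columns are the predictors (ntrees/max_depth are
# illustrative assumptions):
predictors = [c for c in data_cols if c != "prod_allocated_process"]
gbm = H2OGradientBoostingEstimator(ntrees=50, max_depth=5)
gbm.train(x=predictors, y="prod_allocated_process", training_frame=train, validation_frame=test)
print(gbm.model_performance(test))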
def download_mojo_filename():
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default location is current working directory and filename is model_id
    mojo_path = model.download_mojo()
    assert_equals(os.path.join(os.getcwd(), model.model_id + ".zip"), mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is parent of current working directory and filename is model_id
    mojo_path = model.download_mojo("..")
    assert_equals(os.path.abspath(os.path.join(os.pardir, model.model_id + ".zip")), mojo_path,
                  "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is home directory and filename is model_id
    mojo_path = model.download_mojo("~")
    assert_equals(os.path.abspath(os.path.expanduser(os.path.join("~", model.model_id + ".zip"))),
                  mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Default location is current working directory with custom filename
    mojo_path = model.download_mojo("gbm_prostate.zip")
    assert_equals(os.path.join(os.getcwd(), "gbm_prostate.zip"), mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is current working directory with custom filename
    mojo_path = model.download_mojo("./gbm_prostate.zip")
    assert_equals(os.path.join(os.getcwd(), "gbm_prostate.zip"), mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is parent of current working directory with custom filename
    mojo_path = model.download_mojo("../gbm_prostate.zip")
    assert_equals(os.path.abspath(os.path.join(os.pardir, "gbm_prostate.zip")), mojo_path,
                  "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is home directory with custom filename
    mojo_path = model.download_mojo("~/gbm_prostate.zip")
    assert_equals(os.path.abspath(os.path.expanduser(os.path.join("~", "gbm_prostate.zip"))),
                  mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Custom filename with custom path
    tmpdir = tempfile.mkdtemp()
    mojo_path = model.download_mojo(os.path.join(tmpdir, "gbm_prostate.zip"))
    assert_equals(os.path.join(tmpdir, "gbm_prostate.zip"), mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)
def binop_eq(ip, port):
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    rows, cols = iris.dim
    iris.show()

    # frame/scalar
    res = iris == 4.7
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    new_rows = iris[res[0]].nrow
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    new_rows = iris[res[1]].nrow
    assert new_rows == 6, "wrong number of rows returned"

    # frame/vec
    #try:
    #    res = iris == iris[0]
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass
    #try:
    #    res = iris[2] == iris
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    # vec/vec
    res = iris[0] == iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 0, "wrong number of rows returned"

    res = iris[2] == iris[2]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 150, "wrong number of rows returned"

    # vec/scalar
    res = iris[0] == 4.7
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 6, "wrong number of rows returned"

    # frame/frame
    res = iris == iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] == iris[1:3]
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == 2, "dimension mismatch"
def testGLMGaussianScoringHistory():
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_rmse",
                        "validation_rmse", "training_mae", "validation_mae", "training_deviance",
                        "validation_deviance", "deviance_train", "deviance_test"]
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    myX = [name for name in h2o_data.names if name != myY]  # list.remove() returns None, so build the list explicitly
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build gaussian model with score_each_iteration set to true
    model = glm(family="gaussian", score_each_iteration=True, generate_scoring_history=True)
    model.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1
    model_score_each = glm(family="gaussian", score_iteration_interval=1, generate_scoring_history=True)
    model_score_each.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each, col_list_compare)

    # build gaussian model with score_each_iteration set to true, with CV
    model_cv = glm(family="gaussian", score_each_iteration=True, nfolds=3, fold_assignment='modulo',
                   seed=1234, generate_scoring_history=True)
    model_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1, with CV
    model_score_each_cv = glm(family="gaussian", score_iteration_interval=1, nfolds=3,
                              fold_assignment='modulo', seed=1234, generate_scoring_history=True)
    model_score_each_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv, col_list_compare)

    # with score_iteration_interval=4, the scoring history should match every fourth scored iteration
    model_cv_4th = glm(family="gaussian", score_iteration_interval=4, nfolds=3, fold_assignment='modulo',
                       seed=1234, generate_scoring_history=True)
    model_cv_4th.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertEqualScoringHistoryIteration(model_cv_4th, model_cv, col_list_compare)
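# A hedged inspection sketch: scoring_history() returns a pandas DataFrame
# when pandas is installed, so the compared columns can be examined directly;
# the column names below are taken from col_list_compare in the test above.
sh = model.scoring_history()
print(sh[["iterations", "training_rmse", "validation_rmse"]].head())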
import h2o h2o.init() path = h2o.system_file("prostate.csv") h2o_df = h2o.import_file(path) h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor() h2o_df.summary()
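# A small follow-up sketch: after asfactor(), the response column exposes its
# factor levels and per-level counts.
print(h2o_df["CAPSULE"].levels())  # e.g. [['0', '1']]
print(h2o_df["CAPSULE"].table())   # row counts per level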
# Load the H2O library and start up the H2O cluster locally on your machine
import h2o
# Import H2O GLM:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

if __name__ == "__main__":
    # nthreads = -1 means use all cores on your machine
    # max_mem_size is the maximum memory (in GB) to allocate to H2O
    h2o.init(nthreads=-1, max_mem_size=8)

    #loan_csv = "/Volumes/H2OTOUR/loan.csv"  # modify this for your machine
    # Alternatively, you can import the data directly from a URL
    loan_csv = "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"
    data = h2o.import_file(loan_csv)  # 163,987 rows x 15 columns

    data['bad_loan'] = data['bad_loan'].asfactor()  # encode the binary response as a factor
    #data['bad_loan'].levels()  # optional: after encoding, this shows the two factor levels, '0' and '1'

    y = 'bad_loan'
    x = list(data.columns)
    x.remove(y)  # remove the response
    x.remove('int_rate')  # remove the interest rate column because it's correlated with the outcome

    # Initialize the GLM estimator:
    # Similar to R's glm() and H2O's R GLM, H2O's GLM has the "family" argument
    glm_fit1 = H2OGeneralizedLinearEstimator(family='binomial', model_id='glm_fit1')
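# A hedged continuation: the estimator above is initialized but never trained
# in the snippet, so a natural next step is to fit it and check training AUC
# (a held-out split would normally be used instead).
glm_fit1.train(x=x, y=y, training_frame=data)
print(glm_fit1.auc(train=True))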
def deeplearning_grid_cars():
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1, 3)  # 1:none, 2:cross-validation, 3:validation set
    print("Validation scheme: {0}".format(validation_scheme))
    if validation_scheme == 2:
        nfolds = 2
        print("Nfolds: 2")
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="dl")
    print("Grid space: {0}".format(grid_space))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if grid_space['distribution'][0] == 'bernoulli':
        response_col = "economy_20mpg"
    elif grid_space['distribution'][0] == 'gaussian':
        response_col = "economy"
    else:
        response_col = "cylinders"
    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    if grid_space['distribution'][0] in ['bernoulli', 'multinomial']:
        print("Converting the response column to a factor...")
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print("Constructing the grid of deep learning models...")
    cars_dl_grid = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_dl_grid.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_dl_grid.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_dl_grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    print("Performing various checks of the constructed grid...")
    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in grid_space.values():
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_dl_grid)
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}" \
                                              "".format(size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        if not name == "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of deep learning models...")
    cars_dl_grid2 = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_dl_grid2.train(x=predictors, y=response_col, training_frame=train)
    elif validation_scheme == 2:
        cars_dl_grid2.train(x=predictors, y=response_col, training_frame=train, nfolds=nfolds)
    else:
        cars_dl_grid2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    actual_size2 = len(cars_dl_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid were used to construct the models...")
    for name in grid_space.keys():
        pyunit_utils.expect_model_param(cars_dl_grid, name, grid_space[name])
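# A minimal sketch: once a grid is trained, it can be re-sorted by a metric
# and the best model pulled out for inspection ("mse" is used here because it
# applies to every response type the test may draw).
sorted_grid = cars_dl_grid.get_grid(sort_by="mse", decreasing=False)
best_model = sorted_grid.models[0]
print(best_model.model_id)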
def test_gam_cv_fold_columns():
    # create knot frames
    knots1 = [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290]
    frameKnots1 = h2o.H2OFrame(python_obj=knots1)
    knots2 = [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589]
    frameKnots2 = h2o.H2OFrame(python_obj=knots2)
    knots3 = [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676]
    frameKnots3 = h2o.H2OFrame(python_obj=knots3)

    # import the dataset
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))

    # convert the C1, C2, and C11 columns to factors
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    h2o_data["C11"] = h2o_data["C11"].asfactor()

    # split into train and validation sets
    train, test = h2o_data.split_frame(ratios=[.8])

    # set the predictor and response columns
    y = "C11"
    x = ["C1", "C2"]

    # specify the number of knots per GAM column
    numKnots = [5, 5, 5]

    # build a GAM model with nfolds and modulo fold assignment
    # (both of the models below previously triggered an NPE; that should be fixed now)
    h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                gam_columns=["C6", "C7", "C8"],
                                                scale=[0, 1, 2],
                                                num_knots=numKnots,
                                                knot_ids=[frameKnots1.key, frameKnots2.key, frameKnots3.key],
                                                nfolds=5,
                                                seed=1234,
                                                fold_assignment='modulo')
    h2o_model.train(x=x, y=y, training_frame=train)

    # create a fold column for train
    fold_numbers = train.kfold_column(n_folds=5, seed=1234)
    # rename the column "fold_numbers"
    fold_numbers.set_names(["fold_numbers"])
    train = train.cbind(fold_numbers)

    # build the same GAM model, this time with an explicit fold column
    h2o_model_fold_column = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                            gam_columns=["C6", "C7", "C8"],
                                                            scale=[0, 1, 2],
                                                            num_knots=numKnots,
                                                            knot_ids=[frameKnots1.key, frameKnots2.key,
                                                                      frameKnots3.key])
    h2o_model_fold_column.train(x=x, y=y, training_frame=train, fold_column="fold_numbers")

    # both models should return the same coefficients since they use the same fold assignment
    coeff = h2o_model.coef()
    coeff_fold_column = h2o_model_fold_column.coef()
    pyunit_utils.assertCoefDictEqual(coeff['coefficients'], coeff_fold_column['coefficients'])
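# A hedged sanity-check sketch: tabulating the fold column built above shows
# how the 5 folds were distributed over the rows of `train`.
print(train["fold_numbers"].table())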
def glm_alpha_lambda_arrays():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', Lambda=[0.9, 0.5, 0.1], alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT', cold_start=False)
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6

    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startValInit = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startVal = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    orderedCoeffNames = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"]
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT', startval=startVal)
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-3)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), 1e-3)
        if (l + 1) < len(r['lambdas']) and r['alphas'][l] != r['alphas'][l + 1]:
            startVal = startValInit
        else:
            # prepare startval for next round
            startVal = pyunit_utils.extractNextCoeff(cs_norm, orderedCoeffNames, startVal)
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-4)
        if l == best_submodel_index:
            # training metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"],
                                                 tol=1e-4)
        else:
            # any other submodel should have a worse residual_deviance() than the best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have the lowest " \
                                                                    "residual_deviance()!"
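# A hedged plotting sketch (matplotlib assumed available): visualize how the
# standardized coefficients returned by getGLMRegularizationPath shrink along
# the lambda path; `r` and orderedCoeffNames come from the test above.
import matplotlib.pyplot as plt

for name in orderedCoeffNames[:-1]:  # skip the intercept
    plt.plot(r['lambdas'], [c[name] for c in r['coefficients_std']], label=name)
plt.xscale('log')
plt.xlabel('lambda')
plt.ylabel('standardized coefficient')
plt.legend()
plt.show()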
def testGLMBinomialScoringHistory():
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_logloss",
                        "validation_logloss", "training_classification_error",
                        "validation_classification_error", "training_rmse", "validation_rmse",
                        "training_auc", "validation_auc", "training_pr_auc", "validation_pr_auc",
                        "training_lift", "validation_lift", "deviance_train", "deviance_test"]
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        h2o_data[ind] = h2o_data[ind].asfactor()
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    train = splits_frames[0]
    valid = splits_frames[1]
    Y = "C21"
    X = list(range(0, 20))

    print("Building model with score_iteration_interval=1. Should generate the same model as "
          "score_each_iteration turned on.")
    h2o_model = glm(family="binomial", score_iteration_interval=1, generate_scoring_history=True)
    h2o_model.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    print("Building model with score_each_iteration turned on.")
    h2o_model_score_each = glm(family="binomial", score_each_iteration=True, generate_scoring_history=True)
    h2o_model_score_each.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each and h2o_model should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each, h2o_model, col_list_compare)

    print("Building model with score_each_iteration turned on, with CV.")
    h2o_model_score_each_cv = glm(family="binomial", score_each_iteration=True, nfolds=3,
                                  fold_assignment='modulo', seed=1234, generate_scoring_history=True)
    h2o_model_score_each_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    print("Building model with score_iteration_interval=1, with CV. Should generate the same model as "
          "score_each_iteration turned on, with CV.")
    h2o_model_cv = glm(family="binomial", score_iteration_interval=1, nfolds=3,
                       fold_assignment='modulo', seed=1234, generate_scoring_history=True)
    h2o_model_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each_cv and h2o_model_cv should be the same
    col_list_compare.append("deviance_xval")
    col_list_compare.append("deviance_se")
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each_cv, h2o_model_cv, col_list_compare)

    # with score_iteration_interval=3, the scoring history should match h2o_model_cv at every scored iteration
    h2o_model_cv_3rd = glm(family="binomial", score_iteration_interval=3, nfolds=3,
                           fold_assignment='modulo', seed=1234, generate_scoring_history=True)
    h2o_model_cv_3rd.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(h2o_model_cv, h2o_model_cv_3rd, col_list_compare)
# We visualize the nature of H2O Deep Learning (DL), H2O's tree methods (GBM/DRF) and H2O's generalized linear modeling (GLM) by plotting the decision boundary between the red and black spirals:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import h2o
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

# First, we need to upload our datasets to the H2O cluster. The data is imported into H2OFrames,
# which work much like pandas DataFrames.
spiral = h2o.import_file(path=os.path.realpath("input/spiral.csv"))
grid = h2o.import_file(path=os.path.realpath("input/grid.csv"))

# Spiral is a simple dataset consisting of two spirals of black and red dots.
# Grid is a 201 by 201 matrix with dimensions [-1.5, 1.5] by [-1.5, 1.5].
#
# To visualize these datasets, we can pull them from H2OFrames into pandas DataFrames for easier plotting.
spiral_df = spiral.as_data_frame(use_pandas=True)
grid_df = grid.as_data_frame(use_pandas=True)
# pandas Series no longer expose reshape(), so go through the underlying numpy array
grid_x, grid_y = grid_df.x.values.reshape(201, 201), grid_df.y.values.reshape(201, 201)
spiral_r = spiral_df[spiral_df.color == "Red"]
spiral_k = spiral_df[spiral_df.color == "Black"]
spiral_xr, spiral_yr = spiral_r[spiral_r.columns[0]], spiral_r[spiral_r.columns[1]]
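# A hedged continuation of the visualization: extract the black spiral the
# same way and scatter-plot both colors to see the raw data.
spiral_xk, spiral_yk = spiral_k[spiral_k.columns[0]], spiral_k[spiral_k.columns[1]]
plt.scatter(spiral_xr, spiral_yr, c="red", s=10)
plt.scatter(spiral_xk, spiral_yk, c="black", s=10)
plt.title("Spiral dataset")
plt.show()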
def test_random_forrest_effective_parameters(): frame = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv")) frame["Angaus"] = frame["Angaus"].asfactor() frame["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(frame.nrow, 1)).tolist())[0] train, calib = frame.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42) rf1 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights", stopping_rounds = 3, calibrate_model=True, calibration_frame=calib, seed = 1234) rf1.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) rf2 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights", stopping_rounds = 3, stopping_metric='logloss', calibrate_model=True, calibration_frame=calib, seed = 1234, categorical_encoding = 'Enum') rf2.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) assert rf1.parms['stopping_metric']['input_value'] == 'AUTO' assert rf1.parms['stopping_metric']['actual_value'] == rf2.parms['stopping_metric']['actual_value'] np.testing.assert_almost_equal(rf1.logloss(), rf2.logloss()) assert rf1.parms['distribution']['input_value'] == 'bernoulli' assert rf1.parms['distribution']['actual_value'] == rf2.parms['distribution']['actual_value'] assert rf1.parms['categorical_encoding']['input_value'] == 'AUTO' assert rf1.parms['categorical_encoding']['actual_value'] == rf2.parms['categorical_encoding']['actual_value'] assert rf1.parms['fold_assignment']['input_value'] == 'AUTO' assert rf1.parms['fold_assignment']['actual_value'] == None rf1 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights", nfolds = 5, calibrate_model=True, calibration_frame=calib, seed = 1234) rf1.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) rf2 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights", nfolds=5, fold_assignment='Random', calibrate_model=True, calibration_frame=calib, seed = 1234, categorical_encoding = 'Enum') rf2.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) assert rf1.parms['stopping_metric']['input_value'] == 'AUTO' assert rf1.parms['stopping_metric']['actual_value'] is None np.testing.assert_almost_equal(rf1.logloss(), rf2.logloss()) assert rf1.parms['distribution']['input_value'] == 'bernoulli' assert rf1.parms['distribution']['actual_value'] == rf2.parms['distribution']['actual_value'] assert rf1.parms['fold_assignment']['input_value'] == 'AUTO' assert rf1.parms['fold_assignment']['actual_value'] == rf2.parms['fold_assignment']['actual_value'] assert rf1.parms['categorical_encoding']['input_value'] == 'AUTO' assert rf1.parms['categorical_encoding']['actual_value'] == rf2.parms['categorical_encoding']['actual_value'] try: h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.algos.evaluate_auto_model_parameters", "false")) rf1 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights", nfolds = 5, calibrate_model=True, calibration_frame=calib, seed = 1234) rf1.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) rf2 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights", nfolds=5, fold_assignment='Random', calibrate_model=True, calibration_frame=calib, seed = 1234, categorical_encoding = 
'Enum') rf2.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) assert rf1.parms['stopping_metric']['input_value'] == 'AUTO' assert rf1.parms['stopping_metric']['actual_value'] == 'AUTO' np.testing.assert_almost_equal(rf1.logloss(), rf2.logloss()) assert rf1.parms['distribution']['input_value'] == 'bernoulli' assert rf1.parms['distribution']['actual_value'] == rf2.parms['distribution']['actual_value'] assert rf1.parms['fold_assignment']['input_value'] == 'AUTO' assert rf1.parms['fold_assignment']['actual_value'] == 'AUTO' assert rf1.parms['categorical_encoding']['input_value'] == 'AUTO' assert rf1.parms['categorical_encoding']['actual_value'] == 'AUTO' finally: h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.algos.evaluate_auto_model_parameters", "true"))
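# A minimal sketch: each entry of model.parms records both the user's input
# and the value H2O resolved it to, which is what the assertions above rely on.
for name in ("stopping_metric", "categorical_encoding", "fold_assignment"):
    p = rf1.parms[name]
    print(name, "input:", p["input_value"], "actual:", p["actual_value"])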
# Uses Swedish insurance data from smalldata instead of MASS/insurance because of the MASS R package's license.
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

h2o.init()
h2o_df = h2o.import_file(
    "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/Motor_insurance_sweden.txt",
    sep='\t')
poisson_fit = H2OGeneralizedLinearEstimator(family="poisson")
poisson_fit.train(y="Claims",
                  x=["Payment", "Insured", "Kilometres", "Zone", "Bonus", "Make"],
                  training_frame=h2o_df)
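# A short hedged follow-up: inspect the fitted Poisson coefficients and score
# the training frame.
print(poisson_fit.coef())
predictions = poisson_fit.predict(h2o_df)
print(predictions.head())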
def offset_1388():
    print("Loading datasets...")
    pros_hex = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    cars_hex = h2o.import_file(tests.locate("smalldata/junit/cars.csv"))
    cars_hex[0] = cars_hex[0].asfactor()
    cars_hex[2] = cars_hex[2].asfactor()

    print("Running Binomial Comparison...")
    glm_bin_h2o = h2o.glm(x=pros_hex[2:9], y=pros_hex[1], training_frame=pros_hex, family="binomial",
                          standardize=False, offset_column="AGE", Lambda=[0], max_iterations=100)
    print("binomial")
    print("R:")
    print("deviance: {0}".format(1464.9565781185))
    print("null deviance: {0}".format(2014.93087862689))
    print("aic: {0}".format(1494.9565781185))
    print("H2O:")
    print("deviance {0}".format(glm_bin_h2o.residual_deviance()))
    print("null deviance {0}".format(glm_bin_h2o.null_deviance()))
    print("aic {0}".format(glm_bin_h2o.aic()))
    assert abs(1464.9565781185 - glm_bin_h2o.residual_deviance()) < 0.1
    assert abs(2014.93087862689 - glm_bin_h2o.null_deviance()) < 0.1
    assert abs(1494.9565781185 - glm_bin_h2o.aic()) < 0.1

    print("Running Regression Comparisons...")
    glm_h2o = h2o.glm(x=cars_hex[2:8], y=cars_hex[1], training_frame=cars_hex, family="gaussian",
                      standardize=False, offset_column="year", Lambda=[0], max_iterations=100)
    print("gaussian")
    print("R:")
    print("deviance: {0}".format(4204.68399275449))
    print("null deviance: {0}".format(16072.0955102041))
    print("aic: {0}".format(2062.54330117177))
    print("H2O:")
    print("deviance {0}".format(glm_h2o.residual_deviance()))
    print("null deviance {0}".format(glm_h2o.null_deviance()))
    print("aic {0}".format(glm_h2o.aic()))
    assert abs(4204.68399275449 - glm_h2o.residual_deviance()) < 0.1
    assert abs(16072.0955102041 - glm_h2o.null_deviance()) < 0.1
    assert abs(2062.54330117177 - glm_h2o.aic()) < 0.1

    glm_h2o = h2o.glm(x=cars_hex[2:8], y=cars_hex[1], training_frame=cars_hex, family="poisson",
                      standardize=False, offset_column="year", Lambda=[0], max_iterations=100)
    print("poisson")
    print("R:")
    print("deviance: {0}".format(54039.1725227918))
    print("null deviance: {0}".format(59381.5624028358))
    print("aic: {0}".format("Inf"))
    print("H2O:")
    print("deviance {0}".format(glm_h2o.residual_deviance()))
    print("null deviance {0}".format(glm_h2o.null_deviance()))
    print("aic {0}".format(glm_h2o.aic()))
    assert abs(54039.1725227918 - glm_h2o.residual_deviance()) < 0.1
    assert abs(59381.5624028358 - glm_h2o.null_deviance()) < 0.1
    # inf - inf is nan, so a tolerance check can never pass; compare directly
    assert glm_h2o.aic() == float('inf')
def setup_data(self): """ This function performs all initializations necessary: 1. generates all the random parameter values for our dynamic tests like the Gaussian noise std, column count and row count for training/test data sets. 2. randomly choose the distribution family (gaussian, binomial, multinomial) to test. 3. with the chosen distribution family, generate the appropriate data sets 4. load the data sets and set the training set indices and response column index """ # create and clean out the sandbox directory first self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True) # randomly set Gaussian noise standard deviation as a fraction of actual predictor standard deviation self.noise_std = random.uniform(0, math.sqrt(pow((self.max_p_value - self.min_p_value), 2) / 12)) self.noise_var = self.noise_std*self.noise_std # randomly determine data set size in terms of column and row counts self.train_col_count = random.randint(1, self.max_col_count) self.train_row_count = round(self.train_col_count * random.uniform(self.min_col_count_ratio, self.max_col_count_ratio)) # DEBUGGING setup_data, remember to comment them out once done. # self.train_col_count = 3 # self.train_row_count = 200 # self.max_real_number = 3 # self.max_int_number = 3 # end DEBUGGING # randomly choose which family of GLM algo to use self.family = self.families[random.randint(0, len(self.families)-1)] # set class number for classification if 'binomial' in self.family: self.class_number = 2 elif 'multinomial' in self.family: self.class_number = random.randint(3, self.max_class_number) # randomly set number of classes K # generate real value weight vector and training/validation/test data sets for GLM pyunit_utils.write_syn_floating_point_dataset_glm(self.training1_data_file, "", self.training2_data_file, self.weight_data_file, self.train_row_count, self.train_col_count, 2, self.max_p_value, self.min_p_value, self.max_w_value, self.min_w_value, self.noise_std, self.family, self.train_row_count, self.train_row_count, class_number=self.class_number, class_method=['probability', 'probability', 'probability']) # preload data sets self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file)) self.training2_data = h2o.import_file(pyunit_utils.locate(self.training2_data_file)) # set data set indices for predictors and response self.y_index = self.training1_data.ncol-1 self.x_indices = list(range(self.y_index)) # set response to be categorical for classification tasks if ('binomial' in self.family) or ('multinomial' in self.family): self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor() # check to make sure all response classes are represented, otherwise, quit if self.training1_data[self.y_index].nlevels()[0] < self.class_number: print("Response classes are not represented in training dataset.") sys.exit(0) self.training2_data[self.y_index] = self.training2_data[self.y_index].round().asfactor() # save the training data files just in case the code crashed. pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def pyunit_mean_per_class_error():
    gbm = H2OGradientBoostingEstimator(nfolds=3, fold_assignment="Random", seed=1234)

    ## Binomial
    # use pyunit_utils.locate instead of a hard-coded user path
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "bernoulli"
    gbm.train(y=response_col, x=predictors, validation_frame=valid, training_frame=train)
    print(gbm)
    mpce = gbm.mean_per_class_error([0.5, 0.8])  ## different thresholds
    assert (abs(mpce[0][1] - 0.004132231404958664) < 1e-5)
    assert (abs(mpce[1][1] - 0.021390374331550777) < 1e-5)

    ## score on train first
    print(gbm.model_performance(train).mean_per_class_error(thresholds=[0.3, 0.5]))

    ## Multinomial
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "multinomial"
    gbm.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    print(gbm)
    mpce = gbm.mean_per_class_error(train=True)
    assert (mpce == 0)
    mpce = gbm.mean_per_class_error(valid=True)
    assert (abs(mpce - 0.207142857143) < 1e-5)
    mpce = gbm.mean_per_class_error(xval=True)
    assert (abs(mpce - 0.350071715433) < 1e-5)

    ## Early stopping
    gbm.stopping_rounds = 2
    gbm.stopping_metric = "mean_per_class_error"
    gbm.ntrees = 10000
    gbm.max_depth = 3
    gbm.min_rows = 1
    gbm.learn_rate = 0.01
    gbm.score_tree_interval = 1
    gbm.nfolds = None
    gbm.fold_assignment = None
    gbm.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
    print(gbm)
    print(gbm.scoring_history())

    ## Grid search
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows': [2**x for x in range(0, int(math.log(train.nrow, 2) - 2) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }
    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 10,
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "mean_per_class_error",
        'stopping_tolerance': 1e-3
    }
    grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_params_tune,
                         search_criteria=search_criteria_tune)
    grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid,
               distribution="multinomial", seed=1234,
               stopping_rounds=10, stopping_metric="mean_per_class_error", stopping_tolerance=1e-3)
    print(grid)  ## sorted by logloss
    print(grid.get_grid("mean_per_class_error"))
def metric_accessors(): cars = h2o.import_file( path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] # regression response_col = "economy" distribution = "gaussian" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in list(mse.keys()) and "valid" in list( mse.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in list(mse.keys()) and "xval" in list( mse.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["train"], float) and isinstance( mse["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in list(mse.keys()) and "valid" in list(mse.keys( )) and "xval" in list( mse.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ) and isinstance( mse["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in list(mse.keys()) and "xval" in list( mse.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["valid"], float) and isinstance( mse["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["valid"]), type(mse["xval"])) # r2 r21 = gbm.r2(train=True, valid=False, xval=False) assert isinstance(r21, float) r22 = gbm.r2(train=False, valid=True, xval=False) assert isinstance(r22, float) r23 = gbm.r2(train=False, valid=False, xval=True) assert isinstance(r23, float) r2 = 
gbm.r2(train=True, valid=True, xval=False) assert "train" in list(r2.keys()) and "valid" in list( r2.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(r2.keys())) assert len( r2 ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(r2.keys())) assert isinstance(r2["train"], float) and isinstance( r2["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(r2["train"]), type(r2["valid"])) assert r2["valid"] == r22 r2 = gbm.r2(train=True, valid=False, xval=True) assert "train" in list(r2.keys()) and "xval" in list( r2.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(r2.keys())) assert len( r2 ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(r2.keys())) assert isinstance(r2["train"], float) and isinstance( r2["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(r2["train"]), type(r2["xval"])) assert r2["xval"] == r23 r2 = gbm.r2(train=True, valid=True, xval=True) assert "train" in list(r2.keys()) and "valid" in list(r2.keys( )) and "xval" in list( r2.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(r2.keys())) assert len( r2 ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(r2.keys())) assert isinstance(r2["train"], float) and isinstance( r2["valid"], float ) and isinstance( r2["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(r2["train"]), type(r2["valid"]), type(r2["xval"])) r2 = gbm.r2(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(r2, float) assert r2 == r21 r2 = gbm.r2(train=False, valid=True, xval=True) assert "valid" in list(r2.keys()) and "xval" in list( r2.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(r2.keys())) assert len( r2 ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(r2.keys())) assert isinstance(r2["valid"], float) and isinstance( r2["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(r2["valid"]), type(r2["xval"])) # mean_residual_deviance mean_residual_deviance1 = gbm.mean_residual_deviance(train=True, valid=False, xval=False) assert isinstance(mean_residual_deviance1, float) mean_residual_deviance2 = gbm.mean_residual_deviance(train=False, valid=True, xval=False) assert isinstance(mean_residual_deviance2, float) mean_residual_deviance3 = gbm.mean_residual_deviance(train=False, valid=False, xval=True) assert isinstance(mean_residual_deviance3, float) mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=False) assert "train" in list(mean_residual_deviance.keys()) and "valid" in list( mean_residual_deviance.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert len( mean_residual_deviance ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["valid"], float ), "expected training and validation 
metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"])) assert mean_residual_deviance["valid"] == mean_residual_deviance2 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=False, xval=True) assert "train" in list(mean_residual_deviance.keys()) and "xval" in list( mean_residual_deviance.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert len( mean_residual_deviance ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["xval"])) assert mean_residual_deviance["xval"] == mean_residual_deviance3 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=True) assert "train" in list(mean_residual_deviance.keys( )) and "valid" in list(mean_residual_deviance.keys()) and "xval" in list( mean_residual_deviance.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert len( mean_residual_deviance ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["valid"], float ) and isinstance( mean_residual_deviance["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) mean_residual_deviance = gbm.mean_residual_deviance( train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mean_residual_deviance, float) assert mean_residual_deviance == mean_residual_deviance1 mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=True, xval=True) assert "valid" in list(mean_residual_deviance.keys()) and "xval" in list( mean_residual_deviance.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert len( mean_residual_deviance ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(mean_residual_deviance.keys())) assert isinstance(mean_residual_deviance["valid"], float) and isinstance( mean_residual_deviance["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) # binomial cars = h2o.import_file( path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] response_col = "economy_20mpg" distribution = "bernoulli" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # auc auc1 = gbm.auc(train=True, 
valid=False, xval=False) assert isinstance(auc1, float) auc2 = gbm.auc(train=False, valid=True, xval=False) assert isinstance(auc2, float) auc3 = gbm.auc(train=False, valid=False, xval=True) assert isinstance(auc3, float) auc = gbm.auc(train=True, valid=True, xval=False) assert "train" in list(auc.keys()) and "valid" in list( auc.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(auc.keys())) assert len( auc ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(auc.keys())) assert isinstance(auc["train"], float) and isinstance( auc["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(auc["train"]), type(auc["valid"])) assert auc["valid"] == auc2 auc = gbm.auc(train=True, valid=False, xval=True) assert "train" in list(auc.keys()) and "xval" in list( auc.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(auc.keys())) assert len( auc ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(auc.keys())) assert isinstance(auc["train"], float) and isinstance( auc["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(auc["train"]), type(auc["xval"])) assert auc["xval"] == auc3 auc = gbm.auc(train=True, valid=True, xval=True) assert "train" in list(auc.keys()) and "valid" in list(auc.keys( )) and "xval" in list( auc.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(auc.keys())) assert len( auc ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(auc.keys())) assert isinstance(auc["train"], float) and isinstance( auc["valid"], float ) and isinstance( auc["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(auc["train"]), type(auc["valid"]), type(auc["xval"])) auc = gbm.auc(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(auc, float) assert auc == auc1 auc = gbm.auc(train=False, valid=True, xval=True) assert "valid" in list(auc.keys()) and "xval" in list( auc.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(auc.keys())) assert len( auc ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(auc.keys())) assert isinstance(auc["valid"], float) and isinstance( auc["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(auc["valid"]), type(auc["xval"])) # roc (fprs1, tprs1) = gbm.roc(train=True, valid=False, xval=False) assert isinstance(fprs1, list) assert isinstance(tprs1, list) (fprs2, tprs2) = gbm.roc(train=False, valid=True, xval=False) assert isinstance(fprs2, list) assert isinstance(tprs2, list) (fprs3, tprs3) = gbm.roc(train=False, valid=False, xval=True) assert isinstance(fprs3, list) assert isinstance(tprs3, list) roc = gbm.roc(train=True, valid=True, xval=False) assert "train" in list(roc.keys()) and "valid" in list( roc.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(roc.keys())) assert len( roc ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(roc.keys())) assert isinstance(roc["train"], tuple) and isinstance( roc["valid"], 
tuple ), "expected training and validation metrics to be tuples, but got {0} and {1}".format( type(roc["train"]), type(roc["valid"])) assert roc["valid"][0] == fprs2 assert roc["valid"][1] == tprs2 roc = gbm.roc(train=True, valid=False, xval=True) assert "train" in list(roc.keys()) and "xval" in list( roc.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(roc.keys())) assert len( roc ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(roc.keys())) assert isinstance(roc["train"], tuple) and isinstance( roc["xval"], tuple ), "expected training and cross validation metrics to be tuples, but got {0} and {1}".format( type(roc["train"]), type(roc["xval"])) assert roc["xval"][0] == fprs3 assert roc["xval"][1] == tprs3 roc = gbm.roc(train=True, valid=True, xval=True) assert "train" in list(roc.keys()) and "valid" in list(roc.keys( )) and "xval" in list( roc.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(roc.keys())) assert len( roc ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(roc.keys())) assert isinstance(roc["train"], tuple) and isinstance( roc["valid"], tuple ) and isinstance( roc["xval"], tuple ), "expected training, validation, and cross validation metrics to be tuples, but got {0}, {1}, and {2}".format( type(roc["train"]), type(roc["valid"]), type(roc["xval"])) (fprs, tprs) = gbm.roc(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(fprs, list) assert isinstance(tprs, list) assert fprs == fprs1 assert tprs == tprs1 roc = gbm.roc(train=False, valid=True, xval=True) assert "valid" in list(roc.keys()) and "xval" in list( roc.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(roc.keys())) assert len( roc ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(roc.keys())) assert isinstance(roc["valid"], tuple) and isinstance( roc["xval"], tuple ), "validation and cross validation metrics to be tuples, but got {0} and {1}".format( type(roc["valid"]), type(roc["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in list(logloss.keys()) and "valid" in list( logloss.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in list(logloss.keys()) and "xval" in list( logloss.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( 
list(logloss.keys())) assert isinstance(logloss["train"], float) and isinstance( logloss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in list(logloss.keys()) and "valid" in list(logloss.keys( )) and "xval" in list( logloss.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ) and isinstance( logloss["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in list(logloss.keys()) and "xval" in list( logloss.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["valid"], float) and isinstance( logloss["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["valid"]), type(logloss["xval"])) # giniCoef giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False) assert isinstance(giniCoef1, float) giniCoef2 = gbm.giniCoef(train=False, valid=True, xval=False) assert isinstance(giniCoef2, float) giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True) assert isinstance(giniCoef3, float) giniCoef = gbm.giniCoef(train=True, valid=True, xval=False) assert "train" in list(giniCoef.keys()) and "valid" in list( giniCoef.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert len( giniCoef ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["train"]), type(giniCoef["valid"])) assert giniCoef["valid"] == giniCoef2 giniCoef = gbm.giniCoef(train=True, valid=False, xval=True) assert "train" in list(giniCoef.keys()) and "xval" in list( giniCoef.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert len( giniCoef ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["train"]), type(giniCoef["xval"])) assert giniCoef["xval"] == giniCoef3 giniCoef = gbm.giniCoef(train=True, valid=True, xval=True) assert "train" in list(giniCoef.keys()) and "valid" in list(giniCoef.keys( )) and "xval" in list( giniCoef.keys() ), 
"expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert len( giniCoef ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["valid"], float ) and isinstance( giniCoef["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(giniCoef["train"]), type(giniCoef["valid"]), type(giniCoef["xval"])) giniCoef = gbm.giniCoef(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(giniCoef, float) assert giniCoef == giniCoef1 giniCoef = gbm.giniCoef(train=False, valid=True, xval=True) assert "valid" in list(giniCoef.keys()) and "xval" in list( giniCoef.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert len( giniCoef ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(giniCoef.keys())) assert isinstance(giniCoef["valid"], float) and isinstance( giniCoef["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["valid"]), type(giniCoef["xval"])) # F1 F11 = gbm.F1(train=True, valid=False, xval=False) F12 = gbm.F1(train=False, valid=True, xval=False) F13 = gbm.F1(train=False, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=False) F1 = gbm.F1(train=True, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=True) F1 = gbm.F1(train=False, valid=False, xval=False) # default: return training metrics F1 = gbm.F1(train=False, valid=True, xval=True) # F0point5 F0point51 = gbm.F0point5(train=True, valid=False, xval=False) F0point52 = gbm.F0point5(train=False, valid=True, xval=False) F0point53 = gbm.F0point5(train=False, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=False) F0point5 = gbm.F0point5(train=True, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=True) F0point5 = gbm.F0point5(train=False, valid=False, xval=False) # default: return training metrics F0point5 = gbm.F0point5(train=False, valid=True, xval=True) # F2 F21 = gbm.F2(train=True, valid=False, xval=False) F22 = gbm.F2(train=False, valid=True, xval=False) F23 = gbm.F2(train=False, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=False) F2 = gbm.F2(train=True, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=True) F2 = gbm.F2(train=False, valid=False, xval=False) # default: return training metrics F2 = gbm.F2(train=False, valid=True, xval=True) # accuracy accuracy1 = gbm.accuracy(train=True, valid=False, xval=False) accuracy2 = gbm.accuracy(train=False, valid=True, xval=False) accuracy3 = gbm.accuracy(train=False, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=False) accuracy = gbm.accuracy(train=True, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=True) accuracy = gbm.accuracy(train=False, valid=False, xval=False) # default: return training metrics accuracy = gbm.accuracy(train=False, valid=True, xval=True) # error error1 = gbm.error(train=True, valid=False, xval=False) error2 = gbm.error(train=False, valid=True, xval=False) error3 = gbm.error(train=False, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=False) error = gbm.error(train=True, valid=False, 
xval=True) error = gbm.error(train=True, valid=True, xval=True) error = gbm.error(train=False, valid=False, xval=False) # default: return training metrics error = gbm.error(train=False, valid=True, xval=True) # precision precision1 = gbm.precision(train=True, valid=False, xval=False) precision2 = gbm.precision(train=False, valid=True, xval=False) precision3 = gbm.precision(train=False, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=False) precision = gbm.precision(train=True, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=True) precision = gbm.precision(train=False, valid=False, xval=False) # default: return training metrics precision = gbm.precision(train=False, valid=True, xval=True) # mcc mcc1 = gbm.mcc(train=True, valid=False, xval=False) mcc2 = gbm.mcc(train=False, valid=True, xval=False) mcc3 = gbm.mcc(train=False, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=False) mcc = gbm.mcc(train=True, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=True) mcc = gbm.mcc(train=False, valid=False, xval=False) # default: return training metrics mcc = gbm.mcc(train=False, valid=True, xval=True) # max_per_class_error max_per_class_error1 = gbm.max_per_class_error(train=True, valid=False, xval=False) max_per_class_error2 = gbm.max_per_class_error(train=False, valid=True, xval=False) max_per_class_error3 = gbm.max_per_class_error(train=False, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=False) max_per_class_error = gbm.max_per_class_error(train=True, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=True) max_per_class_error = gbm.max_per_class_error( train=False, valid=False, xval=False) # default: return training metrics max_per_class_error = gbm.max_per_class_error(train=False, valid=True, xval=True) # confusion_matrix confusion_matrix1 = gbm.confusion_matrix(train=True, valid=False, xval=False) confusion_matrix2 = gbm.confusion_matrix(train=False, valid=True, xval=False) confusion_matrix3 = gbm.confusion_matrix(train=False, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False) confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True) confusion_matrix = gbm.confusion_matrix( train=False, valid=False, xval=False) # default: return training metrics confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True) # # plot # plot1 = gbm.plot(train=True, valid=False, xval=False) # plot2 = gbm.plot(train=False, valid=True, xval=False) # plot3 = gbm.plot(train=False, valid=False, xval=True) # plot = gbm.plot(train=True, valid=True, xval=False) # plot = gbm.plot(train=True, valid=False, xval=True) # plot = gbm.plot(train=True, valid=True, xval=True) # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics # plot = gbm.plot(train=False, valid=True, xval=True) # # tpr # tpr1 = gbm.tpr(train=True, valid=False, xval=False) # tpr2 = gbm.tpr(train=False, valid=True, xval=False) # tpr3 = gbm.tpr(train=False, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=False) # tpr = gbm.tpr(train=True, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=True) # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics # tpr = gbm.tpr(train=False, valid=True, xval=True) # # # 
tnr # tnr1 = gbm.tnr(train=True, valid=False, xval=False) # tnr2 = gbm.tnr(train=False, valid=True, xval=False) # tnr3 = gbm.tnr(train=False, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=False) # tnr = gbm.tnr(train=True, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=True) # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics # tnr = gbm.tnr(train=False, valid=True, xval=True) # # # fnr # fnr1 = gbm.fnr(train=True, valid=False, xval=False) # fnr2 = gbm.fnr(train=False, valid=True, xval=False) # fnr3 = gbm.fnr(train=False, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=False) # fnr = gbm.fnr(train=True, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=True) # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics # fnr = gbm.fnr(train=False, valid=True, xval=True) # # # fpr # fpr1 = gbm.fpr(train=True, valid=False, xval=False) # fpr2 = gbm.fpr(train=False, valid=True, xval=False) # fpr3 = gbm.fpr(train=False, valid=False, xval=True) # fpr = gbm.fpr(train=True, valid=True, xval=False) # fpr = gbm.fpr(train=True, valid=False, xval=True) # fpr = gbm.fpr(train=True, valid=True, xval=True) # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics # fpr = gbm.fpr(train=False, valid=True, xval=True) # multinomial cars = h2o.import_file( path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) cars["cylinders"] = cars["cylinders"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] response_col = "cylinders" distribution = "multinomial" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in list(mse.keys()) and "valid" in list( mse.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in list(mse.keys()) and "xval" in list( mse.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["train"], float) and isinstance( mse["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in list(mse.keys()) and "valid" in list(mse.keys( )) and "xval" in list( mse.keys() ), "expected training, validation, and cross validation metrics 
to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ) and isinstance( mse["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in list(mse.keys()) and "xval" in list( mse.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert len( mse ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(mse.keys())) assert isinstance(mse["valid"], float) and isinstance( mse["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["valid"]), type(mse["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in list(logloss.keys()) and "valid" in list( logloss.keys() ), "expected training and validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in list(logloss.keys()) and "xval" in list( logloss.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["train"], float) and isinstance( logloss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in list(logloss.keys()) and "valid" in list(logloss.keys( )) and "xval" in list( logloss.keys() ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ) and isinstance( logloss["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return 
training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in list(logloss.keys()) and "xval" in list( logloss.keys() ), "expected validation and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert len( logloss ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( list(logloss.keys())) assert isinstance(logloss["valid"], float) and isinstance( logloss["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["valid"]), type(logloss["xval"])) # hit_ratio_table hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False) hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False) hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True) hit_ratio_table = gbm.hit_ratio_table( train=False, valid=False, xval=False) # default: return training metrics hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True) # clustering iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv")) km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3) # betweenss betweenss1 = km.betweenss(train=True, valid=False, xval=False) assert isinstance(betweenss1, float) betweenss3 = km.betweenss(train=False, valid=False, xval=True) assert isinstance(betweenss3, float) betweenss = km.betweenss(train=True, valid=False, xval=True) assert "train" in list(betweenss.keys()) and "xval" in list( betweenss.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(betweenss.keys())) assert len( betweenss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(betweenss.keys())) assert isinstance(betweenss["train"], float) and isinstance( betweenss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(betweenss["train"]), type(betweenss["xval"])) assert betweenss["xval"] == betweenss3 betweenss = km.betweenss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(betweenss, float) assert betweenss == betweenss1 # totss totss1 = km.totss(train=True, valid=False, xval=False) assert isinstance(totss1, float) totss3 = km.totss(train=False, valid=False, xval=True) assert isinstance(totss3, float) totss = km.totss(train=True, valid=False, xval=True) assert "train" in list(totss.keys()) and "xval" in list( totss.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(totss.keys())) assert len( totss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(totss.keys())) assert isinstance(totss["train"], float) and isinstance( totss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(totss["train"]), type(totss["xval"])) assert totss["xval"] == totss3 totss = km.totss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(totss, float) assert totss == totss1 # tot_withinss tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False) assert 
isinstance(tot_withinss1, float) tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True) assert isinstance(tot_withinss3, float) tot_withinss = km.tot_withinss(train=True, valid=False, xval=True) assert "train" in list(tot_withinss.keys()) and "xval" in list( tot_withinss.keys() ), "expected training and cross validation metrics to be returned, but got {0}".format( list(tot_withinss.keys())) assert len( tot_withinss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( list(tot_withinss.keys())) assert isinstance(tot_withinss["train"], float) and isinstance( tot_withinss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(tot_withinss["train"]), type(tot_withinss["xval"])) assert tot_withinss["xval"] == tot_withinss3 tot_withinss = km.tot_withinss( train=False, valid=False, xval=False) # default: return training metrics assert isinstance(tot_withinss, float) assert tot_withinss == tot_withinss1 # withinss withinss1 = km.withinss(train=True, valid=False, xval=False) withinss3 = km.withinss(train=False, valid=False, xval=True) withinss = km.withinss(train=True, valid=False, xval=True) withinss = km.withinss(train=False, valid=False, xval=False) # default: return training metrics # centroid_stats centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False) centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True) centroid_stats = km.centroid_stats(train=True, valid=False, xval=True) centroid_stats = km.centroid_stats( train=False, valid=False, xval=False) # default: return training metrics # size size1 = km.size(train=True, valid=False, xval=False) size3 = km.size(train=False, valid=False, xval=True) size = km.size(train=True, valid=False, xval=True) size = km.size(train=False, valid=False, xval=False) # default: return training metrics
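# A minimal sketch (not part of the original test) of how the repeated
# train/valid/xval combinations above could be folded into one loop. It assumes
# a model trained with a validation frame and nfolds > 0, and a scalar metric
# accessor such as mse or logloss, whose behavior the test above asserts:
# with zero or one flag set the accessor returns a bare float (all-False
# defaults to the training metric); with two or more it returns a dict keyed
# by the requested flags.
from itertools import product

def check_scalar_metric_accessor(model, metric_name):
    accessor = getattr(model, metric_name)  # e.g. model.mse, model.logloss
    for train, valid, xval in product([True, False], repeat=3):
        result = accessor(train=train, valid=valid, xval=xval)
        n_requested = sum([train, valid, xval])
        if n_requested <= 1:  # single metric, or the all-False training default
            assert isinstance(result, float)
        else:
            keys = [k for k, flag in zip(["train", "valid", "xval"], [train, valid, xval]) if flag]
            assert isinstance(result, dict) and sorted(result.keys()) == sorted(keys)

# e.g. check_scalar_metric_accessor(gbm, "mse"); check_scalar_metric_accessor(gbm, "logloss")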
def test_modelselection_gaussian():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model_maxrsweep = modelSelection(seed=12345, max_predictor_number=3, mode="maxrsweep")
    model_maxrsweep.train(training_frame=d, x=my_x, y=my_y)
    model_maxr = modelSelection(seed=12345, max_predictor_number=3, mode="maxr")
    model_maxr.train(training_frame=d, x=my_x, y=my_y)
    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(model_maxr.result()[2:4], model_maxrsweep.result()[2:4], prob=1.0, tol=1e-6)
    model_allsubsets = modelSelection(seed=12345, max_predictor_number=3, mode="allsubsets")
    model_allsubsets.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_allsubsets = model_allsubsets.get_best_R2_values()
    best_predictor_names_allsubsets = model_allsubsets.get_best_model_predictors()
    best_r2_value_maxr = model_maxr.get_best_R2_values()
    # assert that the one-predictor model found by modelselection is the best,
    # by comparing it against GLMs trained manually on each single predictor
    one_pred_r2 = []
    for pred in my_x:
        x = [pred]
        m = glm(seed=12345)
        m.train(training_frame=d, x=x, y=my_y)
        one_pred_r2.append(m.r2())
    best_r2 = max(one_pred_r2)
    assert abs(best_r2 - best_r2_value_allsubsets[0]) < 1e-6, \
        "expected best r2: {0}, allsubsets actual best r2: {1}. They are different.".format(best_r2, best_r2_value_allsubsets[0])
    assert abs(best_r2 - best_r2_value_maxr[0]) < 1e-6, \
        "expected best r2: {0}, maxr actual best r2: {1}. They are different.".format(best_r2, best_r2_value_maxr[0])
    assert abs(best_r2_value_allsubsets[0] - best_r2_value_maxr[0]) < 1e-6, \
        "allsubsets best r2: {0}, maxr best r2: {1}. They are different.".format(best_r2_value_allsubsets[0], best_r2_value_maxr[0])
    print("Best one predictor model uses predictor: {0}".format(best_predictor_names_allsubsets[0]))
    # three-predictor subsets; index 2 of the modelselection results is the 3-predictor model
    my_x3 = [["AGE", "RACE", "CAPSULE"], ["AGE", "RACE", "DCAPS"], ["AGE", "RACE", "PSA"], ["AGE", "RACE", "VOL"],
             ["AGE", "RACE", "DPROS"], ["AGE", "CAPSULE", "DCAPS"], ["AGE", "CAPSULE", "PSA"], ["AGE", "CAPSULE", "VOL"],
             ["AGE", "CAPSULE", "DPROS"], ["AGE", "DCAPS", "PSA"], ["AGE", "DCAPS", "PSA"], ["AGE", "DCAPS", "VOL"],
             ["AGE", "DCAPS", "DPROS"], ["AGE", "PSA", "VOL"], ["AGE", "PSA", "VOL"], ["AGE", "PSA", "DPROS"],
             ["AGE", "VOL", "DPROS"], ["RACE", "CAPSULE", "DCAPS"], ["RACE", "CAPSULE", "PSA"], ["RACE", "CAPSULE", "VOL"],
             ["RACE", "CAPSULE", "DPROS"], ["RACE", "DCAPS", "PSA"], ["RACE", "DCAPS", "VOL"], ["RACE", "DCAPS", "DPROS"],
             ["RACE", "PSA", "VOL"], ["RACE", "PSA", "DPROS"], ["RACE", "VOL", "DPROS"], ["CAPSULE", "DCAPS", "PSA"],
             ["CAPSULE", "DCAPS", "VOL"], ["CAPSULE", "DCAPS", "DPROS"], ["DCAPS", "PSA", "VOL"], ["DCAPS", "PSA", "DPROS"],
             ["DCAPS", "VOL", "DPROS"], ["PSA", "VOL", "DPROS"]]
    three_pred_r2 = []
    for pred3 in my_x3:
        x = pred3
        m = glm(seed=12345)
        m.train(training_frame=d, x=x, y=my_y)
        three_pred_r2.append(m.r2())
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred - best_r2_value_allsubsets[2]) < 1e-6, \
        "expected best r2: {0}, allsubsets actual best r2: {1}. They are different.".format(best_r2_three_pred, best_r2_value_allsubsets[2])
    assert abs(best_r2_three_pred - best_r2_value_maxr[2]) < 1e-6, \
        "expected best r2: {0}, maxr actual best r2: {1}. They are different.".format(best_r2_three_pred, best_r2_value_maxr[2])
    assert abs(best_r2_value_allsubsets[2] - best_r2_value_maxr[2]) < 1e-6, \
        "allsubsets best r2: {0}, maxr best r2: {1}. They are different.".format(best_r2_value_allsubsets[2], best_r2_value_maxr[2])
    print("Best three predictors model uses predictors: {0}".format(best_predictor_names_allsubsets[2]))
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error', auc='AUC', logloss='logloss', mae='mae',
                           mse='mse', rmse='rmse', rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        log.info("Starting H2O cluster with %s cores, %smb memory.", config.cores, config.max_mem_size_mb)
        h2o.init(nthreads=config.cores, max_mem_size=str(config.max_mem_size_mb) + "M")

        # Load both train and test as H2O Frames (test is converted to pandas after predicting)
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds, sort_metric=sort_metric,
                        seed=config.seed, **config.framework_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        log.debug("Leaderboard:\n%s", str(aml.leaderboard.as_data_frame()))

        preds = aml.predict(test).as_data_frame()
        # predictions = h2o.get_model(aml.leaderboard[0][1, 0]).predict(test).as_data_frame()
        y_pred = preds.iloc[:, 0]
        y_truth = test[:, dataset.target.index].as_data_frame(header=False)
        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=y_truth.values)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)
    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
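# A minimal, self-contained sketch of the H2OAutoML flow that run() wraps above,
# with the benchmark plumbing stripped out; the file paths and the response
# column name "target" are placeholders, not part of the original code.
import h2o
from h2o.automl import H2OAutoML

h2o.init()
train = h2o.import_file("train.csv")  # placeholder path
test = h2o.import_file("test.csv")    # placeholder path
aml = H2OAutoML(max_runtime_secs=60, sort_metric="AUC", seed=1)
aml.train(y="target", training_frame=train)
print(aml.leaderboard.head())
preds = aml.predict(test).as_data_frame()  # first column: predicted label; rest: class probabilities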
def metric_json_check():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = h2o.gbm(y=df["CAPSULE"], x=df[3:], training_frame=df, distribution="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = list(reg_met._metric_json.keys())
    reg_metric_json_keys_desired = [
        u'model_category', u'description', u'r2', u'frame', u'model_checksum', u'MSE', u'__meta',
        u'scoring_time', u'predictions', u'model', u'duration_in_ms', u'frame_checksum',
        u'mean_residual_deviance'
    ]
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, \
        "There's a difference between the current ({0}) and the desired ({1}) regression metric json. " \
        "The difference is {2}".format(reg_metric_json_keys_have, reg_metric_json_keys_desired, reg_metric_diff)

    # Regression metric json (GLM)
    reg_mod = h2o.glm(y=df["CAPSULE"], x=df[3:], training_frame=df, family="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = list(reg_met._metric_json.keys())
    reg_metric_json_keys_desired = [
        u'model_category', u'description', u'r2', u'residual_degrees_of_freedom', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'null_deviance', u'scoring_time',
        u'null_degrees_of_freedom', u'predictions', u'AIC', u'model', u'duration_in_ms',
        u'frame_checksum', u'residual_deviance', u'mean_residual_deviance'
    ]
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, \
        "There's a difference between the current ({0}) and the desired ({1}) glm-regression metric json. " \
        "The difference is {2}".format(reg_metric_json_keys_have, reg_metric_json_keys_desired, reg_metric_diff)

    # Binomial metric json
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, distribution="bernoulli")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = list(bin_met._metric_json.keys())
    bin_metric_json_keys_desired = [
        u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame', u'model_checksum',
        u'MSE', u'__meta', u'gains_lift_table', u'logloss', u'scoring_time',
        u'thresholds_and_metric_scores', u'predictions', u'max_criteria_and_metric_scores',
        u'model', u'duration_in_ms', u'frame_checksum', u'domain'
    ]
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, \
        "There's a difference between the current ({0}) and the desired ({1}) binomial metric json. " \
        "The difference is {2}".format(bin_metric_json_keys_have, bin_metric_json_keys_desired, bin_metric_diff)

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, family="binomial")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = list(bin_met._metric_json.keys())
    bin_metric_json_keys_desired = [
        u'frame', u'residual_deviance', u'max_criteria_and_metric_scores', u'MSE',
        u'frame_checksum', u'AIC', u'logloss', u'Gini', u'predictions', u'AUC', u'description',
        u'model_checksum', u'duration_in_ms', u'model_category', u'gains_lift_table', u'r2',
        u'residual_degrees_of_freedom', u'__meta', u'null_deviance', u'scoring_time',
        u'null_degrees_of_freedom', u'model', u'thresholds_and_metric_scores', u'domain'
    ]
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, \
        "There's a difference between the current ({0}) and the desired ({1}) glm-binomial metric json. " \
        "The difference is {2}".format(bin_metric_json_keys_have, bin_metric_json_keys_desired, bin_metric_diff)

    # Multinomial metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX], y=df[myY], training_frame=df, distribution="multinomial")
    mul_met = mul_mod.model_performance()
    mul_metric_json_keys_have = list(mul_met._metric_json.keys())
    mul_metric_json_keys_desired = [
        u'cm', u'model_category', u'description', u'r2', u'frame', u'model_checksum', u'MSE',
        u'__meta', u'logloss', u'scoring_time', u'predictions', u'hit_ratio_table', u'model',
        u'duration_in_ms', u'frame_checksum'
    ]
    mul_metric_diff = list(set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, \
        "There's a difference between the current ({0}) and the desired ({1}) multinomial metric json. " \
        "The difference is {2}".format(mul_metric_json_keys_have, mul_metric_json_keys_desired, mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = list(clus_met._metric_json.keys())
    clus_metric_json_keys_desired = [
        u'tot_withinss', u'model_category', u'description', u'frame', u'model_checksum', u'MSE',
        u'__meta', u'scoring_time', u'betweenss', u'predictions', u'totss', u'model',
        u'duration_in_ms', u'frame_checksum', u'centroid_stats'
    ]
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, \
        "There's a difference between the current ({0}) and the desired ({1}) clustering metric json. " \
        "The difference is {2}".format(clus_metric_json_keys_have, clus_metric_json_keys_desired, clus_metric_diff)
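# A small sketch, not part of the original test: the key checks above use a
# one-sided set difference, so a key that is desired but absent would go
# unnoticed. A symmetric comparison catches both directions:
def diff_metric_json_keys(metric_json, desired_keys):
    have, desired = set(metric_json.keys()), set(desired_keys)
    return {"unexpected": sorted(have - desired), "missing": sorted(desired - have)}

# e.g. diff = diff_metric_json_keys(reg_met._metric_json, reg_metric_json_keys_desired)
#      assert not diff["unexpected"] and not diff["missing"], diff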
import numpy as np
import pandas as pd
import h2o

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

print('Loading data')
h2o.init()

feats = ["id", 'era', 'data_type']
pred_columns = []
for i in range(50):
    pred_columns.append("feature" + str(i + 1))
    feats.append("feature" + str(i + 1))
feats.append("target")

df = h2o.import_file("../input/numerai_training_data.csv")
test = h2o.import_file('../input/numerai_tournament_data.csv')
# valid = test[test['data_type'] == 'validation']

# GBM = H2OGradientBoostingEstimator(
#     ntrees=10,
#     learn_rate=0.2,
#     learn_rate_annealing=0.99,
#     sample_rate=0.8,
#     col_sample_rate=0.8,
#     seed=1234,
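# A hedged continuation sketch: the original script breaks off above in the
# middle of a commented-out GBM constructor. Training one of the imported
# estimators on the 50 features and scoring the tournament file might look like
# this; the hyperparameters are illustrative, not from the original.
df["target"] = df["target"].asfactor()  # treat the numerai target as a class label
rf = H2ORandomForestEstimator(ntrees=100, max_depth=6, seed=1234)
rf.train(x=pred_columns, y="target", training_frame=df)
preds = rf.predict(test).as_data_frame()  # predicted label plus per-class probabilities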