Example #1
def tree_test():

    # GBM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees = 1)
    gbm.train(x = ["Origin", "Dest"], y = "IsDepDelayed", training_frame=airlines)

    tree = H2OTree(gbm, 0, "NO") # Indexing from 0 in Python. There is exactly one tree built
    check_tree(tree, 0, "NO")
    assert tree.root_node.left_levels is not None  # Only categoricals in the model, guaranteed to have categorical split
    assert tree.root_node.right_levels is not None  # Only categoricals in the model, guaranteed to have categorical split

    # DRF
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_nice_header.csv"))
    drf = H2ORandomForestEstimator(ntrees=2)
    drf.train(x = ["power", "acceleration"], y="cylinders", training_frame=cars)

    drf_tree = H2OTree(drf, 1, None)
    check_tree(drf_tree, 1)

    # ISOFOR
    ecg_discord = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))
    isofor = H2OIsolationForestEstimator(ntrees=3, seed=12, sample_size=5)
    isofor.train(training_frame=ecg_discord)

    if_tree = H2OTree(isofor, 2)
    check_tree(if_tree, 2)
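
# A minimal, hypothetical sketch of the check_tree helper these tests assume;
# the real helper lives in the H2O test utilities, so the invariants below are
# illustrative rather than authoritative.
def check_tree(tree, tree_number, tree_class=None):
    assert tree is not None
    assert tree.root_node is not None
    assert tree.left_children is not None
    assert tree.right_children is not None
    assert tree.tree_number == tree_number, "tree numbers do not match"
    if tree_class is not None:
        assert tree.tree_class == tree_class, "tree classes do not match"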
Example #2
	def __init__(self, ID, params):
		Model.__init__(self, ID, params)
		h2o.init()

		datadir = os.path.expanduser('~') +'/FSA/data/'
		trainingFile = datadir + params[1][0]
		valFile = datadir + params[1][1]
		testingFile = datadir + params[1][2]


		self.trainData = h2o.import_file(path=trainingFile)
		self.valData = h2o.import_file(path=valFile)
		#self.valData = self.trainData
		self.testData = h2o.import_file(path=testingFile)

		# print self.trainData.col_names()
		# drop the invalid columns
		self.trainData = self.trainData.drop("away_score").drop("home_score")
		self.valData = self.valData.drop("away_score").drop("home_score")
		self.testData = self.testData.drop("away_score").drop("home_score")

		self.params = params

		if not self.params[0]:
			self.trainData = self.trainData.drop('spread')
			# self.valData   = self.valData.drop('spread')
			self.testData  = self.testData.drop('spread')

		# for h2o, creating the model is the same as training the model, so
		# we need to hold off here
		self.model = None
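
	# Hypothetical usage of this wrapper (the subclass name 'SpreadModel' is
	# illustrative, not from the original code): params[0] toggles whether the
	# 'spread' column is kept, and params[1] lists the train/validation/test
	# file names under ~/FSA/data/.
	#
	#   model = SpreadModel("model-1", [True, ["train.csv", "val.csv", "test.csv"]])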
Example #3
def pubdev_6339():
    
    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs
    cores = sum(node["num_cpus"] for node in cluster.nodes)


    # path to file
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate("smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv")
    ]

    for file_path in file_paths:
        # read data and parse setup to get number of columns 
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)

        # get number of columns from setup
        num_cols = setup['number_columns']
        # get the chunk size
        chunk_size = calculate_chunk_size(file_path, num_cols, cores, cloud_size)
    
        # get chunk size to compare if calculation is correct
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)
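
# The calculate_chunk_size helper above comes from the test utilities and
# presumably mirrors H2O's internal heuristic (FileVec.calcOptimalChunkSize).
# The sketch below is an illustrative simplification only, not H2O's exact
# formula: it balances file size against the parallelism of the cloud.
import os

DEFAULT_CHUNK_SIZE = 1 << 22  # 4 MB, H2O's default chunk size

def chunk_size_sketch(file_path, num_cols, cores, cloud_size):
    # num_cols matters in the real heuristic; it is ignored in this sketch
    total_size = os.path.getsize(file_path)
    # aim for roughly four chunks per core across the whole cloud
    per_chunk = max(1, total_size // (4 * cores * cloud_size))
    return min(DEFAULT_CHUNK_SIZE, per_chunk)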
def directory_import():

    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)

        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim

        assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")

    small1 = pyunit_utils.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print "Importing smalldata file {0} and directory {1}".format(small1, small2)
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)

    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim

    assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
    def setup_data(self):
        """
        This function performs all initializations necessary:
        load the data sets and set the training set indices and response column index
        """

        # create and clean out the sandbox directory first
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)
        
        # randomly choose which family of GBM algo to use
        self.family = self.families[random.randint(0, len(self.families)-1)]

        # preload datasets, set x_indices, y_index and change response to factor for classification
        if 'multinomial' in self.family:
            self.training_metric = 'logloss'
            self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
            self.y_index = self.training1_data.ncol-1
            self.x_indices = list(range(self.y_index))
            self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
            self.scale_model = 1

        else:
            self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
            self.y_index = self.training1_data.ncol-1
            self.x_indices = list(range(self.y_index))
            self.scale_model = 0.75

        # save the training data files in case the code crashes.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
Example #6
def ecologyGBM():

  ecology_train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
  ntrees = 100
  max_depth = 5
  min_rows = 10
  learn_rate = 0.1

  # Prepare data for scikit use
  trainData = pandas.read_csv(pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
  trainData.dropna(inplace=True)

  le = preprocessing.LabelEncoder()
  le.fit(trainData['Method'])
  trainData['Method'] = le.transform(trainData['Method'])

  trainDataResponse = trainData["Angaus"]
  trainDataFeatures = trainData[["SegSumT","SegTSeas","SegLowFlow","DSDist","DSMaxSlope","USAvgT",
                                 "USRainDays","USSlope","USNative","DSDam","Method","LocSed"]]


  ecology_train["Angaus"] = ecology_train["Angaus"].asfactor()
  # Train H2O GBM Model:

  gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees,
                                         learn_rate=learn_rate,
                                         distribution="bernoulli",
                                         min_rows=min_rows,
                                         max_depth=max_depth,
                                         categorical_encoding='label_encoder')
  gbm_h2o.train(x=list(range(2,ecology_train.ncol)), y="Angaus", training_frame=ecology_train)

  # Train scikit GBM Model:
  gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees, max_depth=max_depth,
                                                min_samples_leaf=min_rows, max_features=None)
  gbm_sci.fit(trainDataFeatures,trainDataResponse)

  # Evaluate the trained models on test data
  # Load the test data (h2o)
  ecology_test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_eval.csv"))

  # Load the test data (scikit)
  testData = pandas.read_csv(pyunit_utils.locate("smalldata/gbm_test/ecology_eval.csv"))
  testData.dropna(inplace=True)
  testData['Method'] = le.transform(testData['Method'])

  testDataResponse = testData["Angaus"]
  testDataFeatures = testData[["SegSumT","SegTSeas","SegLowFlow","DSDist","DSMaxSlope","USAvgT",
                               "USRainDays","USSlope","USNative","DSDam","Method","LocSed"]]

  # Score on the test data and compare results

  # scikit
  auc_sci = roc_auc_score(testDataResponse, gbm_sci.predict_proba(testDataFeatures)[:,1])

  # h2o
  gbm_perf = gbm_h2o.model_performance(ecology_test)
  auc_h2o = gbm_perf.auc()

  assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Example #7
def pubdev_1953():

    # small_test = [h2o.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"]     = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[  r  < 0.6]
    # test  = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    predictors = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days', 'Month', 'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    train = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_train.csv"))
    test = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_test.csv"))

    glm0 = h2o.glm(x=train[predictors], y=train["bikes"], validation_x=test[predictors], validation_y=test["bikes"], family="poisson")
Example #8
def plot_test():
    
    
    kwargs = {}
    kwargs['server'] = True

    air = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX], validation_y=air_valid[myY],
                      distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    #Plot ROC for test set
    perf.plot(type="roc", **kwargs)
Example #9
def fiftycatRF(ip, port):

    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O DRF Model:
    # Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with DRF model:
    # Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
Example #10
def frame_slicing(ip,port):
    
    

    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    ###################################################################

    # H2OFrame[int] (column slice)
    res1 = iris[0]
    assert abs(res1[8,:] - 4.4) < 1e-10, "incorrect values"

    # H2OFrame[int,int]
    res2 = prostate[13, 3]
    assert abs(res2 - 1) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res3 = airlines[12, 0:3]
    assert abs(res3[0,0] - 1987) < 1e-10 and abs(res3[0,1] - 10) < 1e-10 and abs(res3[0,2] - 29) < 1e-10, \
        "incorrect values"

    # H2OFrame[slice, int]
    res4 = iris[5:8, 1]
    assert abs(res4[0,:] - 3.9) < 1e-10 and abs(res4[1,:] - 3.4) < 1e-10 and abs(res4[2,:] - 3.4) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res5 = prostate[5:8, 0:3]
    assert abs(res5[0,0] - 6) < 1e-10 and abs(res5[1,1] - 0) < 1e-10 and abs(res5[2,2] - 61) < 1e-10, "incorrect values"
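
    # Hedged addition, not part of the original test: boolean-mask row
    # selection. iris_wheader.csv has 50 rows per class.
    setosa = iris[iris["class"] == "Iris-setosa", :]
    assert setosa.nrow == 50, "expected 50 Iris-setosa rows"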
Example #11
def fiftycatGBM(ip,port):
  
  

  # Training set has only 45 categories cat1 through cat45
  #Log.info("Importing 50_cattest_train.csv data...\n")
  train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
  train["y"] = train["y"].asfactor()

  #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
  #train.summary()
  
  # Train H2O GBM Model:
  #Log.info(paste("H2O GBM with parameters:\nntrees = 10, max_depth = 20, nbins = 20\n", sep = ""))
  model = h2o.gbm(x=train[["x1","x2"]], y=train["y"], distribution="bernoulli", ntrees=10, max_depth=5, nbins=20)
  model.show()
 
  # Test dataset has all 50 categories cat1 through cat50
  #Log.info("Importing 50_cattest_test.csv data...\n")
  test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
  #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
  #test.summary()
  
  # Predict on test dataset with GBM model:
  #Log.info("Performing predictions on test dataset...\n")
  predictions = model.predict(test)
  predictions.show()
  
  # Get the confusion matrix and AUC
  #Log.info("Confusion matrix of predictions (max accuracy):\n")
  performance = model.model_performance(test)
  test_cm = performance.confusion_matrix()
  test_auc = performance.auc()
Example #12
def table_check():
  df = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  print(df[['AGE','RACE']].table(dense=True).head().as_data_frame(True))
  print(df[['AGE','RACE']].table(dense=False).head().as_data_frame(True))
  print(df[['RACE','AGE']].table(dense=True).head().as_data_frame(True))
  print(df[['RACE','AGE']].table(dense=False).head().as_data_frame(True))
  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

  # single column (frame)
  table1 = iris["C5"].table()
  assert table1[0,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[0,0], table1[0,1])
  assert table1[1,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[1,0], table1[1,1])
  assert table1[2,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[2,0], table1[2,1])

  # two-column (one argument)
  
  #dense
  table2 = iris["C1"].table(iris["C5"])
  
  #not dense
  table3 = iris["C1"].table(iris["C5"],dense=False)
  
  #check same value
  assert (table3[table3['C1'] == 5,'Iris-setosa'] == table2[(table2['C1'] == 5) & (table2['C5'] == 'Iris-setosa'),'Counts']).all()
  
  assert (table2 == iris[["C1","C5"]].table()).all()
  assert (table3 == iris[["C1","C5"]].table(dense=False)).all()

  cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
  table = cars[2].table().as_data_frame()
  table = dict(table[1:])
  table = {k:int(v) for k,v in list(table.items())}
  expected = Counter(itertools.chain(*cars[2].as_data_frame()[1:]))
  assert table == expected, "Expected {} for table counts but got {}".format(expected, table)
Example #13
def user():

    a = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    a.head()

    print(a[0].names)  # Column header
    print(a[2,0])           # column 0, row 2 value
    print(a[2,"sepal_len"]) # Column 0, row 2 value
    (a[0] + 2).show()  # Add 2 to every element; broadcast a constant
    (a[0] + a[1]).show()  # Add 2 columns; broadcast parallel add
    sum(a).show()
    print(a["sepal_len"].mean())

    print()
    print("Rows 50 through 77 in the `sepal_len` column")
    a[50:78, "sepal_len"].show()  # print out rows 50 thru 77 inclusive
    print()

    a["sepal_len"].show()

    print(a[50:78, ["sepal_len", "sepal_wid"]].show())

    a.show()

    print("The column means: ")
    print(a.mean())
    print()

    try:
        print(a["Sepal_len"].dim)  # Error, misspelt column name
    except Exception:
        pass  # Expected error

    b = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    c = a + b
    d = c + c + sum(a)
    e = c + a + 1
    e.show()
    # Note that "d=c+..." keeps the internal C expressions alive, until "d" goes
    # out of scope even as we nuke "c"
    c.show()
    c = None
    # Internal "ExprNode(c=a+b)" not dead!

    print(1 + (a[0] + b[1]).mean())

    import collections

    c = h2o.H2OFrame(collections.OrderedDict({"A": [1, 2, 3], "B": [4, 5, 6]}))
    c.show()

    c.describe()
    c.head()

    c[0].show()
    print(c[1,0])
    c[0:2,0].show()

    sliced = a[0:51,0]
    sliced.show()
Example #14
def stackedensemble_metalearner_seed_test():

    # Import training set
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Metalearner params for GBM, DRF, GLM, and deep learning
    gbm_params = {"sample_rate" : 0.3, "col_sample_rate" : 0.3}

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train two SE models with the same metalearner seed
    stack_gbm1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                            metalearner_params = gbm_params, seed = 55555)
    stack_gbm2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                            metalearner_params = gbm_params, seed = 55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = h2o.get_model(stack_gbm1.metalearner()['name'])
    meta_gbm2 = h2o.get_model(stack_gbm2.metalearner()['name'])

    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(train=True), "RMSE should match if same seed"

    # Train two SE models with different metalearner seeds
    stack_gbm3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params = gbm_params, seed = 55555)
    stack_gbm4 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params = gbm_params, seed = 98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = h2o.get_model(stack_gbm3.metalearner()['name'])
    meta_gbm4 = h2o.get_model(stack_gbm4.metalearner()['name'])
    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(train=True), "RMSE should NOT match if diff seed"
Example #15
def col_names_check():

  iris_wheader = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
      "Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
                                                         iris_wheader.col_names)

  iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
  assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
                                                         "{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)

  df = h2o.H2OFrame.from_python(list(zip(*np.random.randn(100,4).tolist())), column_names=list("ABCD"), column_types=["enum"]*4)
  df.head()
  assert df.col_names == list("ABCD"), "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
  assert list(df.types.values()) == ["enum"]*4, "Expected {} for column types but got {}".format(["enum"]*4, df.types)

  df = h2o.H2OFrame(list(zip(*np.random.randn(100,4).tolist())))
  df.head()
  assert df.col_names == ["C1","C2","C3","C4"], "Expected {} for column names but got {}".format(["C1","C2","C3","C4"]
                                                                                                 , df.col_names)
  assert list(df.types.values()) == ["real"]*4, "Expected {} for column types but got {}".format(["real"]*4, df.types)

  df = h2o.H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
  df.head()
  assert df.col_names == ["B"], "Expected {} for column names but got {}".format(["B"], df.col_names)

  df = h2o.H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']}, column_names=["X"])
  df.head()
  assert df.col_names == ["X"], "Expected {} for column names but got {}".format(["X"], df.col_names)
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for c in ["Month","DayofMonth","IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month","DayofMonth","Distance"]
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,["Year","DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,"DayOfWeek"] - new_test[:,"DayOfWeek"]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def anomaly():
  print("Deep Learning Anomaly Detection MNIST")

  train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
  test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

  predictors = list(range(0,784))
  resp = 784

  # unsupervised -> drop the response column (digit: 0-9)
  train = train[predictors]
  test = test[predictors]

  # 1) LEARN WHAT'S NORMAL
  # train unsupervised Deep Learning autoencoder model on train_hex

  ae_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[2], l1=1e-5, ignore_const_cols=False, epochs=1)
  ae_model.train(x=predictors,training_frame=train)

  # 2) DETECT OUTLIERS
  # anomaly app computes the per-row reconstruction error for the test data set
  # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
  test_rec_error = ae_model.anomaly(test)

  # 3) VISUALIZE OUTLIERS
  # Let's look at the test set points with low/median/high reconstruction errors.
  # We will now visualize the original test set points and their reconstructions obtained
  # by propagating them through the narrow neural net.

  # Convert the test data into its autoencoded representation (pass through narrow neural net)
  test_recon = ae_model.predict(test)
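
  # Hedged sketch of the visualization step the comments above describe; the
  # matplotlib usage, the "Reconstruction.MSE" column name, and the 28x28
  # reshape are assumptions about the MNIST layout, not part of the original.
  import matplotlib.pyplot as plt
  err = test_rec_error.as_data_frame()["Reconstruction.MSE"]
  order = err.sort_values().index
  for label, idx in [("low", order[0]), ("median", order[len(order) // 2]), ("high", order[-1])]:
    digit = test[int(idx), :].as_data_frame().values.reshape(28, 28)
    plt.figure()
    plt.title("{0} reconstruction error".format(label))
    plt.imshow(digit, cmap="gray")
  plt.show()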
Example #18
def test_relevel():
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA","NA","NA","NA","NA","NA","NA","NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family = "binomial", Lambda=0, missing_values_handling = "Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    assert("DPROS.None" in ns, "None level IS NOT expected to be skipped by default")
    assert(("DPROS.Both" not in ns), "Both level IS expected to be skipped by default")
    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family = "binomial", Lambda=0, missing_values_handling = "Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    assert("DPROS.None" in ns2, "None level IS NOT expected to be skipped by default")
    assert(("DPROS.Both" not in ns2), "Both level IS expected to be skipped by default")

    #Second, compare against R input (taken from runit_relevel.R)
    dr = h2o.import_file(path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")
    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {"Intercept": -7.63245 , "DPROS.Both": 1.39185, "DPROS.Left": 0.73482, "DPROS.Right": 1.51437, "RACE.White": 0.65160, "DCAPS.Yes": 0.49233,
                 "AGE":-0.01189 , "PSA": 0.02990, "VOL": -0.01141, "GLEASON": 0.96466927}
    coeff_diff = {key: abs(exp_coefs[key] - mh2o2.coef().get(key, 0)) for key in exp_coefs.keys()}
    assert (max(coeff_diff.values()) < 1e-4)
Example #19
def bigcat_gbm():
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtypeTest[54].asfactor()

    regular = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    regular.train(x=list(range(54)), y=54, training_frame=covtype)

    # do prediction on original dataset, no warnings
    check_warnings(regular, 0, covtypeTest)
    # drop response, no warnings
    covtypeTest = covtypeTest.drop(54)
    check_warnings(regular, 0, covtypeTest)

    covtypeTest = covtypeTest.drop(1)
    covtypeTest=covtypeTest.drop(1)
    check_warnings(regular, 2, covtypeTest)

    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtypeTest[54].asfactor()
    covtypeTest=covtypeTest.drop(3)
    covtypeTest=covtypeTest.drop(5)
    covtypeTest=covtypeTest.drop(7)
    check_warnings(regular, 3, covtypeTest)
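
# Hypothetical sketch of the check_warnings helper used above: run predict
# while recording Python warnings and count the UserWarnings H2O raises when
# the test frame does not match the training frame.
import warnings

def check_warnings(model, expected_count, frame):
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        model.predict(frame)
    relevant = [w for w in caught if issubclass(w.category, UserWarning)]
    assert len(relevant) == expected_count, \
        "expected {0} warnings, got {1}".format(expected_count, len(relevant))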
Example #20
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Spliting data")
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,1] - new_test[:,1]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def glrm_catagorical_bug_fix():
    trainData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    testData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    glrmModel = H2OGeneralizedLowRankEstimator(k=4)
    glrmModel.train(x=trainData.names, training_frame=trainData)
    predV = glrmModel.predict(testData)
    print(predV)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
          "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:

            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 10   # choose number of elements per column to compare.  Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:

            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5

            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types = data_types)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric,
                                               True), "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
Example #25
def col_names_check():

  iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
  assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
      "Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
                                                         iris_wheader.col_names)

  iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
  assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
                                                         "{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)

  df = h2o.H2OFrame(np.random.randn(100,4).tolist(), column_names=list("ABCD"), column_types=["Enum"]*4)
  df.head()
  assert df.col_names == list("ABCD"), "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
  assert df.types == {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, "Expected {} for column types " \
                              "but got {}".format({"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"},
                                                  df.types)

  df = h2o.H2OFrame(np.random.randn(100,4).tolist())
  df.head()
  assert df.col_names == ["C1","C2","C3","C4"], "Expected {} for column names but got {}".format(["C1","C2","C3","C4"]
                                                                                                 , df.col_names)
  assert df.types == {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, "Expected {}" \
                      " for column types but got {}".format({"C3": "Numeric", "C2": "Numeric", "C1": "Numeric",
                                                             "C4": "Numeric"}, df.types)
Example #26
def fiftycat_gbm():
  # Training set has only 45 categories cat1 through cat45
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv"))
  train["y"] = train["y"].asfactor()

  # Train H2O GBM Model:
  from h2o.estimators.gbm import H2OGradientBoostingEstimator
  model = H2OGradientBoostingEstimator(distribution="bernoulli",
                                       ntrees=10,
                                       max_depth=5,
                                       nbins=20)
  model.train(x=["x1","x2"],y="y", training_frame=train)
  model.show()

  # Test dataset has all 50 categories cat1 through cat50
  test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv"))


  # Predict on test dataset with GBM model:
  predictions = model.predict(test)
  predictions.show()

  # Get the confusion matrix and AUC
  performance = model.model_performance(test)
  test_cm = performance.confusion_matrix()
  test_auc = performance.auc()
Example #27
def directory_import():

    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)

        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim

        assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)

    small1 = h2o.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print "Importing smalldata file {0} and directory {1}".format(small1, small2)
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)

    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim

    assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
def additional_parameters():

    #col_types as list
    dest_frame="dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "string"]

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%",".").replace("&",".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]

    #col_types as dictionary
    dest_frame="dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = {"c":"string", "a":"enum", "b": "enum"}

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%",".").replace("&",".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
      assert col_summary[i]["type"] == c_types[c_names[i]]
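
    # Illustration of the id sanitization asserted above: H2O replaces '&'
    # and '%' in the requested destination_frame, so "dev29&hex%" becomes
    # the frame id "dev29.hex.".
    assert dest_frame.replace("%", ".").replace("&", ".") == "dev29.hex."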
Example #29
def pubdev_1431(ip, port):

    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)

        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30], y=airlines_billion_1[30], ntrees=1, distribution="bernoulli", max_depth=1)

        predictions = gbm.predict(airlines_billion_1)

        csv = os.path.join(os.getcwd(),"delete.csv")
        h2o.download_csv(predictions,csv)

        airlines_billion_2 = h2o.import_file(csv)
        os.remove(csv)

        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        assert r1 == r2 and c1 == c2, "Expected rows to be equal: r1: {0} and r2: {1}. Expected cols to be equal: " \
                                      "c1: {2} c2: {3}".format(r1,r2,c1,c2)
    else:
        print "Not running on H2O internal network.  No access to HDFS."
def test_arrange_OOM():
    '''
    PUBDEV-5990 customer reported that h2o.arrange (sorting) takes way more memory than normal for sparse
    datasets of 1G.

    Thanks to Lauren DiPerna for finding the dataset to reproduce the problem.
    '''

    df = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM.csv"))
    t1 = time.time()
    newFrame = df.sort("sort_col")
    print(newFrame[0,0])
    elapsed_time = time.time()-t1
    print("time taken to perform sort is {0}".format(elapsed_time))

    # check and make sure the sort columns contain the right value after sorting!
    answerFrame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM_answer.csv"))

    # compare sort_col from my sort with answer Frame
    pyunit_utils.compare_frames_local(answerFrame["sort_col"], newFrame["sort_col"])

    # compare 5 more columns with the answer frame; comparing all columns would take too long
    allColumns = list(range(0, df.ncols))
    random.shuffle(allColumns)
    pyunit_utils.compare_frames_local(answerFrame[allColumns[0:5]], newFrame[allColumns[0:5]])
'''
This event flow is for generating a gradient boosting model for
the classification approach
'''

import h2o
from h2o.estimators import H2OGradientBoostingEstimator

print('A2 Benchmark')
print('------------')

# Initialize H2O server
h2o.init(max_mem_size_GB=5)

# Load train and test data as H2O frames
train = h2o.import_file('processed-data/A2Benchmark_train.csv')
test = h2o.import_file('processed-data/A2Benchmark_test.csv')

# Define input and response columns
response_column = 'is_anomaly'
input_columns = train.col_names
input_columns.remove(response_column)
input_columns.remove('timestamp')

print('Input columns   :', input_columns)
print('Response column :', response_column)

# Explicitly mark the response column as containing label data
train[response_column] = train[response_column].asfactor()
test[response_column] = test[response_column].asfactor()
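
# The training step this event flow leads up to; the hyperparameter values
# below are illustrative assumptions, not taken from the original script.
model = H2OGradientBoostingEstimator(ntrees=100, max_depth=5)
model.train(x=input_columns, y=response_column, training_frame=train)

# Evaluate on the held-out test set
performance = model.model_performance(test_data=test)
print(performance)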
Example #32
def weights_check(ip, port):
    def check_same(data1, data2, min_rows_scale):
        gbm1_regression = h2o.gbm(x=data1[[
            "displacement", "power", "weight", "acceleration", "year"
        ]],
                                  y="economy",
                                  training_frame=data1,
                                  min_rows=5,
                                  ntrees=5,
                                  max_depth=5)
        gbm2_regression = h2o.gbm(x=data2[[
            "displacement", "power", "weight", "acceleration", "year",
            "weights"
        ]],
                                  y=data2["economy"],
                                  min_rows=5 * min_rows_scale,
                                  weights_column=data2["weights"],
                                  ntrees=5,
                                  max_depth=5)
        gbm1_binomial = h2o.gbm(x=data1[[
            "displacement", "power", "weight", "acceleration", "year"
        ]],
                                y=data1["economy_20mpg"],
                                min_rows=5,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm2_binomial = h2o.gbm(x=data2[[
            "displacement", "power", "weight", "acceleration", "year",
            "weights"
        ]],
                                y=data2["economy_20mpg"],
                                weights_column="weights",
                                training_frame=data2,
                                min_rows=5 * min_rows_scale,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm1_multinomial = h2o.gbm(x=data1[[
            "displacement", "power", "weight", "acceleration", "year"
        ]],
                                   y=data1["cylinders"],
                                   min_rows=5,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)
        gbm2_multinomial = h2o.gbm(x=data2[[
            "displacement", "power", "weight", "acceleration", "year",
            "weights"
        ]],
                                   y=data2["cylinders"],
                                   weights_column="weights",
                                   training_frame=data2,
                                   min_rows=5 * min_rows_scale,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)

        reg1_mse = gbm1_regression.mse()
        reg2_mse = gbm2_regression.mse()
        bin1_auc = gbm1_binomial.auc()
        bin2_auc = gbm2_binomial.auc()
        mul1_mse = gbm1_multinomial.mse()
        mul2_mse = gbm2_multinomial.mse()

        print "MSE (regresson)   no weights vs. weights: {0}, {1}".format(
            reg1_mse, reg2_mse)
        print "AUC (binomial)    no weights vs. weights: {0}, {1}".format(
            bin1_auc, bin2_auc)
        print "MSE (multinomial) no weights vs. weights: {0}, {1}".format(
            mul1_mse, mul2_mse)

        assert abs(
            reg1_mse - reg2_mse
        ) < 1e-6 * reg1_mse, "Expected mse's to be the same, but got {0}, and {1}".format(
            reg1_mse, reg2_mse)
        assert abs(
            bin1_auc - bin2_auc
        ) < 1e-6 * bin1_auc, "Expected auc's to be the same, but got {0}, and {1}".format(
            bin1_auc, bin2_auc)
        assert abs(
            mul1_mse - mul2_mse
        ) < 1e-6 * mul1_mse, "Expected mse's to be the same, but got {0}, and {1}".format(
            mul1_mse, mul2_mse)

    h2o_cars_data = h2o.import_file(
        h2o.locate("smalldata/junit/cars_20mpg.csv"))
    h2o_cars_data["economy_20mpg"] = h2o_cars_data["economy_20mpg"].asfactor()
    h2o_cars_data["cylinders"] = h2o_cars_data["cylinders"].asfactor()

    # uniform weights same as no weights
    random.seed(2222)
    weight = random.randint(1, 10)
    uniform_weights = [[weight] for r in range(406)]
    h2o_uniform_weights = h2o.H2OFrame(python_obj=uniform_weights)
    h2o_uniform_weights.setNames(["weights"])
    h2o_data_uniform_weights = h2o_cars_data.cbind(h2o_uniform_weights)

    print "Checking that using uniform weights is equivalent to no weights:"
    print
    check_same(h2o_cars_data, h2o_data_uniform_weights, weight)

    # zero weights same as removed observations
    zero_weights = [[0] if random.randint(0, 1) else [1] for r in range(406)]
    h2o_zero_weights = h2o.H2OFrame(python_obj=zero_weights)
    h2o_zero_weights.setNames(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print "Checking that using some zero weights is equivalent to removing those observations:"
    print
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0, 1) else [2]
                       for r in range(406)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.setNames(["weights"])
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights):
        if w[0] == 2: doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.setNames(colnames)

    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled[
        "economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights[
        "economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights[
        "cylinders"].asfactor()

    print "Checking that doubling some weights is equivalent to doubling those observations:"
    print
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)
def grid_lambda_search():

    # Log.info("Importing prostate.csv data...\n")
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    #prostate.summary()

    # Log.info("H2O GLM (binomial) with parameters: alpha = c(0.25, 0.5), nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    model = H2OGeneralizedLinearEstimator(family="binomial",
                                          nlambdas=5,
                                          lambda_search=True,
                                          n_folds=2)
    model.train(x=list(range(2, 9)), y=1, training_frame=prostate)

    # model = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", nlambdas=5, lambda_search=True, n_folds=2)
    if random.random() < 0.5:
        model_idx = 0
    else:
        model_idx = 1

    model_bestlambda = model.models(model_idx)
    params_bestlambda = model.params()

    # Log.info(cat("All lambda values returned:\n", params_bestlambda.lambdas()))
    assert len(params_bestlambda.lambdas()) <= 5, "expected 5 or fewer lambdas"

    random_lambda = random.choice(params_bestlambda.lambdas())
    print("RANDOM LAMBDA")
    print(random_lambda)

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and randomly chosen lambda", random_lambda, "\n"))
    random_model = model.getGLMLambdaModel(model_bestlambda, random_lambda)

    # Log.info("EXPECTING THESE TO BE EQUAL")
    print(random_model.Lambda())
    print(random_lambda)

    assert random_model.Lambda() == random_lambda, "expected lambdas to be equal"

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and best lambda", params_bestlambda.lambdaBest(), "\n"))
    best_model = h2o.getGLMLambdaModel(model_bestlambda,
                                       params_bestlambda.lambda_best())
    assert best_model.model() == model_bestlambda.model(), "expected models to be equal"

    # Log.info("H2O GLM (binomial) with parameters: alpha = [0.25, 0.5], nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    prostate_search = H2OGeneralizedLinearEstimator(family="binomial",
                                                    alpha=[0.25, 0.5],
                                                    nlambdas=5,
                                                    lambda_search=True,
                                                    n_folds=2)
    prostate_search.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    # prostate_search = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", alpha=[0.25, 0.5], nlambdas=5, lambda_search=True, n_folds=2)
    model_search = prostate_search.models(model_idx)
    models_best = model_search.models(model_search.best_model())
    params_best = models_best.params()

    assert params_bestlambda.lambda_best() == params_best.lambda_best(), "expected lambdas to be equal"
    assert len(params_best.lambda_all()) <= 20, "expected 20 or fewer lambdas"
Example #34
def algo_pr_auc_test():
    '''
    This pyunit test is written to expose the pr_auc for all binomial runs of all algos per PUBDEV-5665.
    '''

    seed = 123456789
    prostate_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10,
                                           learn_rate=0.1,
                                           max_depth=4,
                                           min_rows=10,
                                           distribution="bernoulli",
                                           seed=seed)
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)),
                  y="CAPSULE",
                  training_frame=prostate_train)
    print("***************************   Printing GBM model")
    print(gbm_h2o)
    # check and make sure pr_auc is found in scoring history
    assert_found_pr_auc(gbm_h2o, 'training_pr_auc')

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1, prostate_train.ncol)),
                  y="CAPSULE",
                  training_frame=prostate_train)
    print("***************************   Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc

    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1, prostate_train.ncol)),
                 y="CAPSULE",
                 training_frame=prostate_train)
    print("***************************   Printing random forest model")
    print(rf_h2o)
    # check and make sure pr_auc is found in scoring history
    assert_found_pr_auc(rf_h2o, 'training_pr_auc')

    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli',
                                      seed=seed,
                                      hidden=[2, 2])
    dl_h2o.train(x=list(range(1, prostate_train.ncol)),
                 y="CAPSULE",
                 training_frame=prostate_train)
    print("***************************   Printing deeplearning model")
    print(dl_h2o)
    # check and make sure pr_auc is found in scoring history
    assert_found_pr_auc(dl_h2o, 'training_pr_auc')

    print(
        "precision/recall AUC for gbm is {0}, for glm is {1},\n for rf is {2}, for deeplearning is"
        " {3}".format(
            gbm_h2o._model_json["output"]
            ["training_metrics"]._metric_json["pr_auc"],
            glm_h2o._model_json["output"]
            ["training_metrics"]._metric_json["pr_auc"],
            rf_h2o._model_json["output"]
            ["training_metrics"]._metric_json["pr_auc"],
            dl_h2o._model_json["output"]
            ["training_metrics"]._metric_json["pr_auc"]))

    assert abs(gbm_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"] -
               glm_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"]) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"] -
               dl_h2o._model_json["output"]["training_metrics"]._metric_json["pr_auc"]) < 0.9, \
        "problem with pr_auc values"
Example #35




#==============================================================================
#  POJO is a standalone Java class with no dependencies on the full H2O stack 
# (only the h2o-genmodel.jar file, which defines the POJO interface).
#==============================================================================

import h2o
from h2o.estimators.kmeans import H2OKMeansEstimator

h2o.init()

h2o_df = h2o.import_file(path="C:/xxx.csv",
                         parse=True, header=0, sep=",")

type(h2o_df)

h2o_df.describe()
h2o_df.columns
h2o_df.types
h2o_df.as_data_frame()
h2o_df.as_data_frame()
type(_)



h2o.import_file?
h2o.parse_setup?
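
# The header comment above promises a POJO, but this excerpt stops at data
# inspection.  A minimal sketch of the missing steps, assuming h2o_df holds
# only numeric feature columns (adjust x for your data):
kmeans_model = H2OKMeansEstimator(k=3, seed=42)
kmeans_model.train(x=h2o_df.columns, training_frame=h2o_df)
# download_pojo writes <model_id>.java (the POJO) and, optionally, h2o-genmodel.jar
h2o.download_pojo(kmeans_model, path=".", get_jar=True)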
Example No. 36
0
'''
This event flow generates a deep learning model for a
regression approach.
'''

import h2o
from h2o.estimators import H2ODeepLearningEstimator

# Initialize H2O server
h2o.init(max_mem_size="5G")

# Load train and test data as H2O frames
train = h2o.import_file('processed-data/train.csv')
test = h2o.import_file('processed-data/test.csv')

# Define input and response columns
response_column = 'RUL'
input_columns = train.col_names
input_columns.remove('UnitNumber')
input_columns.remove('Time')
input_columns.remove('Setting1')
input_columns.remove('Setting2')
input_columns.remove('Setting3')
input_columns.remove('RUL')

# Define model and train model
model = H2ODeepLearningEstimator(hidden=[500, 500], nfolds=10, epochs=100)
model.train(x=input_columns, y=response_column, training_frame=train)

# Test model
performance = model.model_performance(test_data=test)
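
# A quick look at the held-out error; RUL is continuous, so regression
# metrics such as RMSE and MAE apply (not part of the original snippet):
print(performance.rmse())
print(performance.mae())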
Example No. 37
0
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that the various model will not crash if the max_runtime_secs
    is set to be too short.  See PUBDEV-4802.
    '''
    global model_within_max_runtime
    seed = 12345

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA, pca_method=Power
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=Randomized
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Randomized",
                   compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=GLRM
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="GLRM",
                   compute_metrics=True,
                   use_all_factor_levels=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naives Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(
            path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50,
                                      hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(model, training1_data, x_indices, y_index)
        cleanUp([training1_data, model])

    # GLRM, it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE",
                                           recover_svd=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
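
# grabRuntimeInfo and cleanUp are helpers defined elsewhere in this test; a
# minimal sketch of the pattern grabRuntimeInfo implements (hypothetical):
def grab_runtime_info_sketch(model, frame, x_indices, y_index=None):
    model.max_runtime_secs = 0.001  # deliberately far too short
    if y_index is None:
        model.train(x=x_indices, training_frame=frame)
    else:
        model.train(x=x_indices, y=y_index, training_frame=frame)
    # the real helper records whether training survived the runtime limit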
Example No. 38
0
# Import Modules
import h2o
import pandas
import random

project_path = "/gtc-2017"

# Connect or Start H2O
h2o.init()

# Import Data
mnist_training = h2o.import_file(project_path + "/data/mnist-training.csv")
mnist_testing = h2o.import_file(project_path + "/data/mnist-testing.csv")

mnist_training["label"] = mnist_training["label"].asfactor()
mnist_testing["label"] = mnist_testing["label"].asfactor()

# Explore Data
print(mnist_training.head())

# Build Deep Water Model
from h2o.estimators.deepwater import H2ODeepWaterEstimator
model_mnist_lenet_mx = H2ODeepWaterEstimator(epochs=80, network="lenet")
model_mnist_lenet_mx.train(x=["uri"],
                           y="label",
                           training_frame=mnist_training,
                           validation_frame=mnist_testing,
                           model_id="model_mnist_lenet_mx")

model_mnist_lenet_mx.show()
Example No. 39
0
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import pandas as pd


data=pd.read_csv("random_data.csv")
data.head()
mod = smf.OLS(data["y"], data["x"])
res = mod.fit()
print res.summary()


import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
h2o.init()
h2o_df = h2o.import_file("random_data.csv")
h2o_df.summary()
m = H2OGeneralizedLinearEstimator(model_id="GLM_1", nfolds=0)
x = h2o_df.col_names[0]
y = h2o_df.col_names[1]
m.train(x=x, y=y, training_frame=h2o_df)
m

Example No. 40
0
import h2o
import os
import tabulate
import operator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()

#Loading Data
productionprocess = h2o.import_file(path=os.path.realpath(
    "/home/iconnect4/bespoke_manufacturing/data/production-process-data.csv"),
                                    destination_frame="bespokemanufacturing",
                                    header=1,
                                    col_types=[
                                        "string", "string", "string", "string",
                                        "string", "string", "string"
                                    ])
productionprocess.describe()

data_cols = [
    "productshortname", "prodordertype", "prodordercategory",
    "orderitempriority", "ordersource", "address_dl_country",
    "prod_allocated_process"
]
for col in data_cols:
    productionprocess[col] = productionprocess[col].asfactor()

#Split into train and test frames
train, test = productionprocess.split_frame(ratios=[0.7])
print(train.nrows)
print(test.nrows)
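
# H2OGradientBoostingEstimator is imported above but never used in this
# excerpt; a minimal sketch of the training step it presumably leads to
# (choosing prod_allocated_process as the response is an assumption):
gbm_model = H2OGradientBoostingEstimator(ntrees=50, seed=1234)
gbm_model.train(x=data_cols[:-1], y="prod_allocated_process", training_frame=train)
print(gbm_model.model_performance(test))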
Example No. 41
0
def download_mojo_filename():
    fr = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default location is current working directory and filename is model_id
    mojo_path = model.download_mojo()
    assert_equals(os.path.join(os.getcwd(), model.model_id + ".zip"),
                  mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is parent of current working directory and filename is model_id
    mojo_path = model.download_mojo("..")
    assert_equals(
        os.path.abspath(os.path.join(os.pardir, model.model_id + ".zip")),
        mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is home directory and filename is model_id
    mojo_path = model.download_mojo("~")
    assert_equals(
        os.path.abspath(
            os.path.expanduser(os.path.join("~", model.model_id + ".zip"))),
        mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Default locations is current working directory with custom filename
    mojo_path = model.download_mojo("gbm_prostate.zip")
    assert_equals(os.path.join(os.getcwd(), "gbm_prostate.zip"), mojo_path,
                  "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is current working directory with custom filename
    mojo_path = model.download_mojo("./gbm_prostate.zip")
    assert_equals(os.path.join(os.getcwd(), "gbm_prostate.zip"), mojo_path,
                  "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is parent of current working directory with custom filename
    mojo_path = model.download_mojo("../gbm_prostate.zip")
    assert_equals(os.path.abspath(os.path.join(os.pardir, "gbm_prostate.zip")),
                  mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Location is home directory with custom filename
    mojo_path = model.download_mojo("~/gbm_prostate.zip")
    assert_equals(
        os.path.abspath(
            os.path.expanduser(os.path.join("~", "gbm_prostate.zip"))),
        mojo_path, "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Custom filename with custom path
    tmpdir = tempfile.mkdtemp()
    mojo_path = model.download_mojo(os.path.join(tmpdir, "gbm_prostate.zip"))
    assert_equals(os.path.join(tmpdir, "gbm_prostate.zip"), mojo_path,
                  "Not expected path")
    mojo_model = h2o.import_mojo(mojo_path)
    assert isinstance(mojo_model, H2OGenericEstimator)
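
    # Not part of the original test: the re-imported MOJO scores like any
    # other model, which is a quick way to sanity-check the round trip.
    preds = mojo_model.predict(fr)
    assert preds.nrow == fr.nrow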
Example No. 42
0
def binop_eq(ip, port):

    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    rows, cols = iris.dim
    iris.show()

    #frame/scalar
    res = iris == 4.7
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    new_rows = iris[res[0]].nrow
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    new_rows = iris[res[1]].nrow
    assert new_rows == 6, "wrong number of rows returned"

    #frame/vec
    #try:
    #    res = iris == iris[0]
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    #try:
    #    res = iris[2] == iris
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    #vec/vec
    res = iris[0] == iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 0, "wrong number of rows returned"

    res = iris[2] == iris[2]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 150, "wrong number of rows returned"

    #vec/scalar
    res = iris[0] == 4.7
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 6, "wrong number of rows returned"

    # frame/frame
    res = iris == iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] == iris[1:3]
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == 2, "dimension mismatch"
Example No. 43
0
def testGLMGaussianScoringHistory():
    col_list_compare = [
        "iterations", "objective", "negative_log_likelihood", "training_rmse",
        "validation_rmse", "training_mae", "validation_mae",
        "training_deviance", "validation_deviance", "deviance_train",
        "deviance_test"
    ]

    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"
    ]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build gaussian model with score_each_iteration set to True
    model = glm(family="gaussian",
                score_each_iteration=True,
                generate_scoring_history=True)
    model.train(x=myX,
                y=myY,
                training_frame=training_data,
                validation_frame=test_data)
    # build gaussian model with score_iteration_interval to 1
    model_score_each = glm(family="gaussian",
                           score_iteration_interval=1,
                           generate_scoring_history=True)
    model_score_each.train(x=myX,
                           y=myY,
                           training_frame=training_data,
                           validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each,
                                              col_list_compare)

    # build gaussian model with score_each_iteration set to True, with CV
    model_cv = glm(family="gaussian",
                   score_each_iteration=True,
                   nfolds=3,
                   fold_assignment='modulo',
                   seed=1234,
                   generate_scoring_history=True)
    model_cv.train(x=myX,
                   y=myY,
                   training_frame=training_data,
                   validation_frame=test_data)
    # build gaussian model with score_iteration_interval to 1, with CV
    model_score_each_cv = glm(family="gaussian",
                              score_iteration_interval=1,
                              nfolds=3,
                              fold_assignment='modulo',
                              seed=1234,
                              generate_scoring_history=True)
    model_score_each_cv.train(x=myX,
                              y=myY,
                              training_frame=training_data,
                              validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv,
                                              col_list_compare)
    model_cv_4th = glm(family="gaussian",
                       score_iteration_interval=4,
                       nfolds=3,
                       fold_assignment='modulo',
                       seed=1234,
                       generate_scoring_history=True)
    model_cv_4th.train(x=myX,
                       y=myY,
                       training_frame=training_data,
                       validation_frame=test_data)
    pyunit_utils.assertEqualScoringHistoryIteration(model_cv_4th, model_cv,
                                                    col_list_compare)
Example No. 44
0
import h2o
h2o.init()
path = h2o.system_file("prostate.csv")
h2o_df = h2o.import_file(path)
h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor()
h2o_df.summary()
Example No. 45
0
# Load the H2O library and start up the H2O cluster locally on your machine
import h2o
# Import H2O GLM:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

if __name__ == "__main__":

    # Number of threads, nthreads = -1, means use all cores on your machine
    # max_mem_size is the maximum memory (in GB) to allocate to H2O
    h2o.init(nthreads=-1, max_mem_size=8)

    #loan_csv = "/Volumes/H2OTOUR/loan.csv"  # modify this for your machine
    # Alternatively, you can import the data directly from a URL
    loan_csv = "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"
    data = h2o.import_file(loan_csv)  # 163,987 rows x 15 columns
    data['bad_loan'] = data['bad_loan'].asfactor(
    )  #encode the binary response as a factor
    #data['bad_loan'].levels()  #optional: after encoding, this shows the two factor levels, '0' and '1'

    y = 'bad_loan'
    x = list(data.columns)
    x.remove(y)  #remove the response
    x.remove(
        'int_rate'
    )  #remove the interest rate column because it's correlated with the outcome

    # Initialize the GLM estimator:
    # Similar to R's glm() and H2O's R GLM, H2O's GLM has the "family" argument
    glm_fit1 = H2OGeneralizedLinearEstimator(family='binomial',
                                             model_id='glm_fit1')
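
    # The excerpt ends before training; the natural next step (using the
    # frames defined above) would be:
    glm_fit1.train(x=x, y=y, training_frame=data)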
Example No. 46
0
def deeplearning_grid_cars():



    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif(seed=42)
    train = cars[r > .2]

    validation_scheme = random.randint(1,3) # 1:none, 2:cross-validation, 3:validation set
    print "Validation scheme: {0}".format(validation_scheme)
    if validation_scheme == 2:
        nfolds = 2
        print "Nfolds: 2"
    if validation_scheme == 3:
        valid = cars[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="dl")
    print "Grid space: {0}".format(grid_space)

    predictors = ["displacement","power","weight","acceleration","year"]
    if grid_space['distribution'][0] == 'bernoulli':
        response_col = "economy_20mpg"
    elif grid_space['distribution'][0] == 'gaussian':
        response_col = "economy"
    else:
        response_col = "cylinders"

    print "Predictors: {0}".format(predictors)
    print "Response: {0}".format(response_col)

    if grid_space['distribution'][0] in ['bernoulli', 'multinomial']:
        print "Converting the response column to a factor..."
        train[response_col] = train[response_col].asfactor()
        if validation_scheme == 3:
            valid[response_col] = valid[response_col].asfactor()

    print "Constructing the grid of gbm models..."
    cars_dl_grid = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=grid_space)
    if validation_scheme == 1:
        cars_dl_grid.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2:
        cars_dl_grid.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else:
        cars_dl_grid.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    print "Performing various checks of the constructed grid..."

    print "Check cardinality of grid, that is, the correct number of models have been created..."
    size_of_grid_space = 1
    for v in grid_space.values():
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(cars_dl_grid)
    assert size_of_grid_space ==  actual_size, "Expected size of grid to be {0}, but got {1}" \
                                               "".format(size_of_grid_space,actual_size)
    print "Duplicate-entries-in-grid-space check"
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        if not name == "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print "The new search space: {0}".format(new_grid_space)
    print "Constructing the new grid of gbm models..."
    cars_dl_grid2 = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=new_grid_space)
    if validation_scheme == 1:
        cars_dl_grid2.train(x=predictors,y=response_col,training_frame=train)
    elif validation_scheme == 2:
        cars_dl_grid2.train(x=predictors,y=response_col,training_frame=train,nfolds=nfolds)
    else:
        cars_dl_grid2.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)
    actual_size2 = len(cars_dl_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                    "size: {1}".format(actual_size, actual_size2)

    print "Check that the hyper_params that were passed to grid, were used to construct the models..."
    for name in grid_space.keys():
        pyunit_utils.expect_model_param(cars_dl_grid, name, grid_space[name])
Example No. 47
0
def test_gam_cv_fold_columns():
    # create frame knots
    knots1 = [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290]
    frameKnots1 = h2o.H2OFrame(python_obj=knots1)
    knots2 = [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589]
    frameKnots2 = h2o.H2OFrame(python_obj=knots2)
    knots3 = [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676]
    frameKnots3 = h2o.H2OFrame(python_obj=knots3)

    # import the dataset
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    # convert the C1, C2, and C11 columns to factors
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    h2o_data["C11"] = h2o_data["C11"].asfactor()

    # split into train and validation sets
    train, test = h2o_data.split_frame(ratios=[.8])

    # set the predictor and response columns
    y = "C11"
    x = ["C1", "C2"]

    # specify the knots array
    numKnots = [5, 5, 5]

    # Both of these previously raised an NPE; should be fixed now.

    # build the GAM model gam_columns=["C6","C7","C8"]
    h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                gam_columns=["C6", "C7", "C8"],
                                                scale=[0, 1, 2],
                                                num_knots=numKnots,
                                                knot_ids=[frameKnots1.key, frameKnots2.key, frameKnots3.key],
                                                nfolds=5,
                                                seed=1234,
                                                fold_assignment='modulo')

    h2o_model.train(x=x, y=y, training_frame=train)

    # create a fold column for train
    fold_numbers = train.kfold_column(n_folds=5, seed=1234)
    # rename the column "fold_numbers"
    fold_numbers.set_names(["fold_numbers"])
    train = train.cbind(fold_numbers)

    # build the GAM model
    h2o_model_fold_column = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                            gam_columns=["C6", "C7", "C8"],
                                                            scale=[0, 1, 2],
                                                            num_knots=numKnots,
                                                            knot_ids=[frameKnots1.key, frameKnots2.key,
                                                                      frameKnots3.key])

    h2o_model_fold_column.train(x=x, y=y, training_frame=train, fold_column="fold_numbers")

    # both model should return the same coefficients since they use the same fold assignment
    coeff = h2o_model.coef()
    coeff_fold_column = h2o_model_fold_column.coef()
    pyunit_utils.assertCoefDictEqual(coeff['coefficients'], coeff_fold_column['coefficients'])
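
# assertCoefDictEqual lives in pyunit_utils; a plausible stand-in that
# compares two coefficient dicts within a tolerance (hypothetical):
def assert_coef_dict_equal_sketch(coefs1, coefs2, tol=1e-6):
    assert set(coefs1) == set(coefs2), "coefficient names differ"
    for name in coefs1:
        assert abs(coefs1[name] - coefs2[name]) < tol, \
            "coefficient {0} differs".format(name)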
Example No. 48
0
def glm_alpha_lambda_arrays():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial',
             Lambda=[0.9, 0.5, 0.1],
             alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT',
             cold_start=False)
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startValInit = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startVal = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    orderedCoeffNames = [
        "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"
    ]
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial',
                alpha=[r['alphas'][l]],
                Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT',
                startval=startVal)
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)

        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-3)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), 1e-3)
        if (l + 1) < len(
                r['lambdas']) and r['alphas'][l] != r['alphas'][l + 1]:
            startVal = startValInit
        else:
            startVal = pyunit_utils.extractNextCoeff(
                cs_norm, orderedCoeffNames,
                startVal)  # prepare startval for next round

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-4)
        if (l == best_submodel_index
            ):  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-4)
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have the lowest " \
                                                                    "residual_deviance()!"
Example No. 49
0
def testGLMBinomialScoringHistory():
    col_list_compare = [
        "iterations", "objective", "negative_log_likelihood",
        "training_logloss", "validation_logloss",
        "training_classification_error", "validation_classification_error",
        "training_rmse", "validation_rmse", "training_auc", "validation_auc",
        "training_pr_auc", "validation_pr_auc", "training_lift",
        "validation_lift", "deviance_train", "deviance_test"
    ]
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        h2o_data[ind] = h2o_data[ind].asfactor()
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    train = splits_frames[0]
    valid = splits_frames[1]
    Y = "C21"
    X = list(range(0, 20))

    print(
        "Building model with score_iteration_interval=1.  Should generate same model as "
        "score_each_iteration turned on.")
    h2o_model = glm(family="binomial",
                    score_iteration_interval=1,
                    generate_scoring_history=True)
    h2o_model.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    print("Building model with score_each_iteration turned on.")
    h2o_model_score_each = glm(family="binomial",
                               score_each_iteration=True,
                               generate_scoring_history=True)
    h2o_model_score_each.train(x=X,
                               y=Y,
                               training_frame=train,
                               validation_frame=valid)
    # scoring history from h2o_model_score_each and h2o_model should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each, h2o_model,
                                              col_list_compare)

    print("Building model with score_each_iteration turned on, with  CV.")
    h2o_model_score_each_cv = glm(family="binomial",
                                  score_each_iteration=True,
                                  nfolds=3,
                                  fold_assignment='modulo',
                                  seed=1234,
                                  generate_scoring_history=True)
    h2o_model_score_each_cv.train(x=X,
                                  y=Y,
                                  training_frame=train,
                                  validation_frame=valid)
    print(
        "Building model with score_iteration_interval=1, and CV.  Should generate same model as "
        "score_each_iteration turned on, with CV.")
    h2o_model_cv = glm(family="binomial",
                       score_iteration_interval=1,
                       nfolds=3,
                       fold_assignment='modulo',
                       seed=1234,
                       generate_scoring_history=True)
    h2o_model_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each_cv and h2o_model_cv should be the same
    col_list_compare.append("deviance_xval")
    col_list_compare.append("deviance_se")
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each_cv,
                                              h2o_model_cv, col_list_compare)

    # check that when score_iteration_interval is set to 4, the output is the same for every fourth iteration
    h2o_model_cv_4th = glm(family="binomial",
                           score_iteration_interval=4,
                           nfolds=3,
                           fold_assignment='modulo',
                           seed=1234,
                           generate_scoring_history=True)
    h2o_model_cv_4th.train(x=X,
                           y=Y,
                           training_frame=train,
                           validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(h2o_model_cv,
                                                    h2o_model_cv_4th,
                                                    col_list_compare)
Example No. 50
0
# We visualize the nature of H2O Deep Learning (DL), H2O's tree methods (GBM/DRF) and H2O's generalized linear modeling (GLM) by plotting the decision boundary between the red and black spirals:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator


# First, we need to upload our datasets to the H2O cluster. The data is imported into H2OFrames, which operate similarly in function to pandas DataFrames.

import os
import h2o
spiral = h2o.import_file(path=os.path.realpath("input/spiral.csv"))
grid = h2o.import_file(path=os.path.realpath("input/grid.csv"))


# Spiral is a simple data set consisting of two spirals of black and red dots.
# Grid is a 201 by 201 matrix with dimensions [-1.5, 1.5] by [-1.5, 1.5].
#
# To visualize these datasets, we can pull them from H2OFrames into pandas DataFrames for easier plotting.

spiral_df = spiral.as_data_frame(use_pandas=True)
grid_df = grid.as_data_frame(use_pandas=True)
grid_x, grid_y = grid_df.x.values.reshape(201, 201), grid_df.y.values.reshape(201, 201)
spiral_r = spiral_df[spiral_df.color == "Red"]
spiral_k = spiral_df[spiral_df.color == "Black"]

spiral_xr, spiral_yr = spiral_r[spiral_r.columns[0]], spiral_r[spiral_r.columns[1]]
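
# A minimal sketch of the decision-boundary plot the text describes, assuming
# a trained binary classifier `model` and that its prediction frame has a
# probability column named "Red" (both are assumptions, not in this excerpt):
pred_df = model.predict(grid).as_data_frame(use_pandas=True)
z = pred_df["Red"].values.reshape(201, 201)
plt.contourf(grid_x, grid_y, z, cmap="RdGy")
plt.scatter(spiral_xr, spiral_yr, color="red", s=8)
plt.scatter(spiral_k[spiral_k.columns[0]], spiral_k[spiral_k.columns[1]], color="black", s=8)
plt.show()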
Example No. 51
0
def test_random_forrest_effective_parameters():
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    frame["Angaus"] = frame["Angaus"].asfactor()
    frame["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(frame.nrow, 1)).tolist())[0]
    train, calib = frame.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)

    rf1 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights",
                                   stopping_rounds = 3, calibrate_model=True, calibration_frame=calib, seed = 1234)
    rf1.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

    rf2 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights",
                                   stopping_rounds = 3, stopping_metric='logloss', calibrate_model=True, calibration_frame=calib,
                                   seed = 1234, categorical_encoding = 'Enum')
    rf2.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

    assert rf1.parms['stopping_metric']['input_value'] == 'AUTO'
    assert rf1.parms['stopping_metric']['actual_value'] ==  rf2.parms['stopping_metric']['actual_value']
    np.testing.assert_almost_equal(rf1.logloss(), rf2.logloss())
    assert rf1.parms['distribution']['input_value'] == 'bernoulli'
    assert rf1.parms['distribution']['actual_value'] == rf2.parms['distribution']['actual_value']
    assert rf1.parms['categorical_encoding']['input_value'] == 'AUTO'
    assert rf1.parms['categorical_encoding']['actual_value'] == rf2.parms['categorical_encoding']['actual_value']
    assert rf1.parms['fold_assignment']['input_value'] == 'AUTO'
    assert rf1.parms['fold_assignment']['actual_value'] is None

    rf1 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights",
                                   nfolds = 5, calibrate_model=True, calibration_frame=calib, seed = 1234)
    rf1.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

    rf2 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights",
                                   nfolds=5, fold_assignment='Random', calibrate_model=True, calibration_frame=calib, seed = 1234,
                                   categorical_encoding = 'Enum')
    rf2.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

    assert rf1.parms['stopping_metric']['input_value'] == 'AUTO'
    assert rf1.parms['stopping_metric']['actual_value'] is None
    np.testing.assert_almost_equal(rf1.logloss(), rf2.logloss())
    assert rf1.parms['distribution']['input_value'] == 'bernoulli'
    assert rf1.parms['distribution']['actual_value'] == rf2.parms['distribution']['actual_value']
    assert rf1.parms['fold_assignment']['input_value'] == 'AUTO'
    assert rf1.parms['fold_assignment']['actual_value'] == rf2.parms['fold_assignment']['actual_value']
    assert rf1.parms['categorical_encoding']['input_value'] == 'AUTO'
    assert rf1.parms['categorical_encoding']['actual_value'] == rf2.parms['categorical_encoding']['actual_value']

    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.algos.evaluate_auto_model_parameters", "false"))
        rf1 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights",
                                   nfolds = 5, calibrate_model=True, calibration_frame=calib, seed = 1234)
        rf1.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

        rf2 = H2ORandomForestEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, weights_column="Weights",
                                   nfolds=5, fold_assignment='Random', calibrate_model=True, calibration_frame=calib, seed = 1234,
                                   categorical_encoding = 'Enum')
        rf2.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

        assert rf1.parms['stopping_metric']['input_value'] == 'AUTO'
        assert rf1.parms['stopping_metric']['actual_value'] == 'AUTO'
        np.testing.assert_almost_equal(rf1.logloss(), rf2.logloss())
        assert rf1.parms['distribution']['input_value'] == 'bernoulli'
        assert rf1.parms['distribution']['actual_value'] == rf2.parms['distribution']['actual_value']
        assert rf1.parms['fold_assignment']['input_value'] == 'AUTO'
        assert rf1.parms['fold_assignment']['actual_value'] == 'AUTO'
        assert rf1.parms['categorical_encoding']['input_value'] == 'AUTO'
        assert rf1.parms['categorical_encoding']['actual_value'] == 'AUTO'
    finally:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.algos.evaluate_auto_model_parameters", "true"))
Example No. 52
0
# Used swedish insurance data from smalldata instead of MASS/insurance due to the license of the MASS R package.
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
h2o.init()

h2o_df = h2o.import_file(
    "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/Motor_insurance_sweden.txt",
    sep='\t')
poisson_fit = H2OGeneralizedLinearEstimator(family="poisson")
poisson_fit.train(
    y="Claims",
    x=["Payment", "Insured", "Kilometres", "Zone", "Bonus", "Make"],
    training_frame=h2o_df)
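
# Not in the original snippet: once training finishes, the fitted Poisson
# coefficients can be inspected directly.
print(poisson_fit.coef())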
Example No. 53
0
def offset_1388():
    

    print "Loading datasets..."
    pros_hex = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    cars_hex = h2o.import_file(tests.locate("smalldata/junit/cars.csv"))
    cars_hex[0] = cars_hex[0].asfactor()
    cars_hex[2] = cars_hex[2].asfactor()

    print "Running Binomial Comparison..."
    glm_bin_h2o = h2o.glm(x=pros_hex[2:9], y=pros_hex[1], training_frame=pros_hex, family="binomial", standardize=False,
                          offset_column="AGE", Lambda=[0], max_iterations=100)
    print "binomial"
    print "R:"
    print "deviance: {0}".format(1464.9565781185)
    print "null deviance: {0}".format(2014.93087862689)
    print "aic: {0}".format(1494.9565781185)

    print "H2O:"
    print "deviance {0}".format(glm_bin_h2o.residual_deviance())
    print "null deviance {0}".format(glm_bin_h2o.null_deviance())
    print "aic {0}".format(glm_bin_h2o.aic())

    assert abs(1464.9565781185 - glm_bin_h2o.residual_deviance()) < 0.1
    assert abs(2014.93087862689 - glm_bin_h2o.null_deviance()) < 0.1
    assert abs(1494.9565781185 - glm_bin_h2o.aic()) < 0.1

    print "Running Regression Comparisons..."

    glm_h2o = h2o.glm(x=cars_hex[2:8], y=cars_hex[1], training_frame=cars_hex, family="gaussian", standardize=False,
                      offset_column="year", Lambda = [0], max_iterations = 100)
    print "gaussian"
    print "R:"
    print "deviance: {0}".format(4204.68399275449)
    print "null deviance: {0}".format(16072.0955102041)
    print "aic: {0}".format(2062.54330117177)

    print "H2O:"
    print "deviance {0}".format(glm_h2o.residual_deviance())
    print "null deviance {0}".format(glm_h2o.null_deviance())
    print "aic {0}".format(glm_h2o.aic())

    assert abs(4204.68399275449 - glm_h2o.residual_deviance()) < 0.1
    assert abs(16072.0955102041 - glm_h2o.null_deviance()) < 0.1
    assert abs(2062.54330117177 - glm_h2o.aic()) < 0.1

    glm_h2o = H2OGeneralizedLinearEstimator(family="poisson", standardize=False,
                                            offset_column="year", Lambda=[0], max_iterations=100)
    glm_h2o.train(x=list(range(2, 8)), y=1, training_frame=cars_hex)
    print("poisson")
    print("R:")
    print("deviance: {0}".format(54039.1725227918))
    print("null deviance: {0}".format(59381.5624028358))
    print("aic: {0}".format("Inf"))

    print("H2O:")
    print("deviance {0}".format(glm_h2o.residual_deviance()))
    print("null deviance {0}".format(glm_h2o.null_deviance()))
    print("aic {0}".format(glm_h2o.aic()))

    assert abs(54039.1725227918 - glm_h2o.residual_deviance()) < 0.1
    assert abs(59381.5624028358 - glm_h2o.null_deviance()) < 0.1
    assert glm_h2o.aic() == float('inf')  # R reports Inf; inf - inf is NaN, so an abs() comparison would always fail
Example No. 54
0
    def setup_data(self):
        """
        This function performs all initializations necessary:
        1. generates all the random parameter values for our dynamic tests like the Gaussian
        noise std, column count and row count for training/test data sets.
        2. randomly choose the distribution family (gaussian, binomial, multinomial)
        to test.
        3. with the chosen distribution family, generate the appropriate data sets
        4. load the data sets and set the training set indices and response column index
        """

        # create and clean out the sandbox directory first
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

        # randomly set Gaussian noise standard deviation as a fraction of actual predictor standard deviation
        self.noise_std = random.uniform(0, math.sqrt(pow((self.max_p_value - self.min_p_value), 2) / 12))
        self.noise_var = self.noise_std*self.noise_std

        # randomly determine data set size in terms of column and row counts
        self.train_col_count = random.randint(1, self.max_col_count)
        self.train_row_count = round(self.train_col_count * random.uniform(self.min_col_count_ratio,
                                                                           self.max_col_count_ratio))

        #  DEBUGGING setup_data, remember to comment them out once done.
        # self.train_col_count = 3
        # self.train_row_count = 200
        # self.max_real_number = 3
        # self.max_int_number = 3
        # end DEBUGGING

        # randomly choose which family of GLM algo to use
        self.family = self.families[random.randint(0, len(self.families)-1)]

        # set class number for classification
        if 'binomial' in self.family:
            self.class_number = 2
        elif 'multinomial' in self.family:
            self.class_number = random.randint(3, self.max_class_number)    # randomly set number of classes K

        # generate real value weight vector and training/validation/test data sets for GLM
        pyunit_utils.write_syn_floating_point_dataset_glm(self.training1_data_file, "",
                                                          self.training2_data_file, self.weight_data_file,
                                                          self.train_row_count, self.train_col_count, 2,
                                                          self.max_p_value, self.min_p_value, self.max_w_value,
                                                          self.min_w_value, self.noise_std, self.family,
                                                          self.train_row_count, self.train_row_count,
                                                          class_number=self.class_number,
                                                          class_method=['probability', 'probability',
                                                                        'probability'])

        # preload data sets
        self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file))
        self.training2_data = h2o.import_file(pyunit_utils.locate(self.training2_data_file))

        # set data set indices for predictors and response
        self.y_index = self.training1_data.ncol-1
        self.x_indices = list(range(self.y_index))

        # set response to be categorical for classification tasks
        if ('binomial' in self.family) or ('multinomial' in self.family):
            self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

            # check to make sure all response classes are represented, otherwise, quit
            if self.training1_data[self.y_index].nlevels()[0] < self.class_number:
                print("Response classes are not represented in training dataset.")
                sys.exit(0)

            self.training2_data[self.y_index] = self.training2_data[self.y_index].round().asfactor()

        # save the training data files just in case the code crashed.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
Example No. 55
0
def pyunit_mean_per_class_error():
    gbm = H2OGradientBoostingEstimator(nfolds=3,
                                       fold_assignment="Random",
                                       seed=1234)

    ## Binomial
    cars = h2o.import_file("/users/arno/h2o-3/smalldata/junit/cars_20mpg.csv")
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "bernoulli"
    gbm.train(y=response_col,
              x=predictors,
              validation_frame=valid,
              training_frame=train)
    print(gbm)
    mpce = gbm.mean_per_class_error([0.5, 0.8])  ## different thresholds
    assert (abs(mpce[0][1] - 0.004132231404958664) < 1e-5)
    assert (abs(mpce[1][1] - 0.021390374331550777) < 1e-5)

    ## score on train first
    print(
        gbm.model_performance(train).mean_per_class_error(
            thresholds=[0.3, 0.5]))

    ## Multinomial
    cars = h2o.import_file("/users/arno/h2o-3/smalldata/junit/cars_20mpg.csv")
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "multinomial"
    gbm.train(x=predictors,
              y=response_col,
              training_frame=train,
              validation_frame=valid)
    print(gbm)
    mpce = gbm.mean_per_class_error(train=True)
    assert (mpce == 0)
    mpce = gbm.mean_per_class_error(valid=True)
    assert (abs(mpce - 0.207142857143) < 1e-5)
    mpce = gbm.mean_per_class_error(xval=True)
    assert (abs(mpce - 0.350071715433) < 1e-5)

    ## Early stopping
    gbm.stopping_rounds = 2
    gbm.stopping_metric = "mean_per_class_error"
    gbm.ntrees = 10000
    gbm.max_depth = 3
    gbm.min_rows = 1
    gbm.learn_rate = 0.01
    gbm.score_tree_interval = 1
    gbm.nfolds = None
    gbm.fold_assignment = None
    gbm.train(x=predictors,
              y=response_col,
              training_frame=train,
              validation_frame=valid)
    print(gbm)
    print(gbm.scoring_history())

    ## Grid search
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows':
        [2**x for x in range(0,
                             int(math.log(train.nrow, 2) - 2) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }

    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 10,
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "mean_per_class_error",
        'stopping_tolerance': 1e-3
    }

    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_params_tune,
                         search_criteria=search_criteria_tune)
    grid.train(x=predictors,
               y=response_col,
               training_frame=train,
               validation_frame=valid,
               distribution="multinomial",
               seed=1234,
               stopping_rounds=10,
               stopping_metric="mean_per_class_error",
               stopping_tolerance=1e-3)

    print(grid)  ## sorted by logloss
    print(grid.get_grid("mean_per_class_error"))
Example No. 56
0
def metric_accessors():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]

    # regression
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True, valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True, xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True, valid=True, xval=False)
    assert "train" in list(mse.keys()) and "valid" in list(
        mse.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True, valid=False, xval=True)
    assert "train" in list(mse.keys()) and "xval" in list(
        mse.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["train"], float) and isinstance(
        mse["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True, valid=True, xval=True)
    assert "train" in list(mse.keys()) and "valid" in list(mse.keys(
    )) and "xval" in list(
        mse.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ) and isinstance(
        mse["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True, xval=True)
    assert "valid" in list(mse.keys()) and "xval" in list(
        mse.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["valid"], float) and isinstance(
        mse["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["valid"]), type(mse["xval"]))

    #   r2
    r21 = gbm.r2(train=True, valid=False, xval=False)
    assert isinstance(r21, float)

    r22 = gbm.r2(train=False, valid=True, xval=False)
    assert isinstance(r22, float)

    r23 = gbm.r2(train=False, valid=False, xval=True)
    assert isinstance(r23, float)

    r2 = gbm.r2(train=True, valid=True, xval=False)
    assert "train" in list(r2.keys()) and "valid" in list(
        r2.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert len(
        r2
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert isinstance(r2["train"], float) and isinstance(
        r2["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(r2["train"]), type(r2["valid"]))
    assert r2["valid"] == r22

    r2 = gbm.r2(train=True, valid=False, xval=True)
    assert "train" in list(r2.keys()) and "xval" in list(
        r2.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert len(
        r2
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert isinstance(r2["train"], float) and isinstance(
        r2["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(r2["train"]), type(r2["xval"]))
    assert r2["xval"] == r23

    r2 = gbm.r2(train=True, valid=True, xval=True)
    assert "train" in list(r2.keys()) and "valid" in list(r2.keys(
    )) and "xval" in list(
        r2.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert len(
        r2
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert isinstance(r2["train"], float) and isinstance(
        r2["valid"], float
    ) and isinstance(
        r2["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(r2["train"]), type(r2["valid"]), type(r2["xval"]))

    r2 = gbm.r2(train=False, valid=False,
                xval=False)  # default: return training metrics
    assert isinstance(r2, float)
    assert r2 == r21

    r2 = gbm.r2(train=False, valid=True, xval=True)
    assert "valid" in list(r2.keys()) and "xval" in list(
        r2.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert len(
        r2
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(r2.keys()))
    assert isinstance(r2["valid"], float) and isinstance(
        r2["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(r2["valid"]), type(r2["xval"]))

    #   mean_residual_deviance
    mean_residual_deviance1 = gbm.mean_residual_deviance(train=True,
                                                         valid=False,
                                                         xval=False)
    assert isinstance(mean_residual_deviance1, float)

    mean_residual_deviance2 = gbm.mean_residual_deviance(train=False,
                                                         valid=True,
                                                         xval=False)
    assert isinstance(mean_residual_deviance2, float)

    mean_residual_deviance3 = gbm.mean_residual_deviance(train=False,
                                                         valid=False,
                                                         xval=True)
    assert isinstance(mean_residual_deviance3, float)

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=True,
                                                        xval=False)
    assert "train" in list(mean_residual_deviance.keys()) and "valid" in list(
        mean_residual_deviance.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert len(
        mean_residual_deviance
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["valid"]))
    assert mean_residual_deviance["valid"] == mean_residual_deviance2

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=False,
                                                        xval=True)
    assert "train" in list(mean_residual_deviance.keys()) and "xval" in list(
        mean_residual_deviance.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert len(
        mean_residual_deviance
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["xval"]))
    assert mean_residual_deviance["xval"] == mean_residual_deviance3

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=True,
                                                        xval=True)
    assert "train" in list(mean_residual_deviance.keys(
    )) and "valid" in list(mean_residual_deviance.keys()) and "xval" in list(
        mean_residual_deviance.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert len(
        mean_residual_deviance
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["valid"], float
    ) and isinstance(
        mean_residual_deviance["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["valid"]),
        type(mean_residual_deviance["xval"]))

    mean_residual_deviance = gbm.mean_residual_deviance(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    assert isinstance(mean_residual_deviance, float)
    assert mean_residual_deviance == mean_residual_deviance1

    mean_residual_deviance = gbm.mean_residual_deviance(train=False,
                                                        valid=True,
                                                        xval=True)
    assert "valid" in list(mean_residual_deviance.keys()) and "xval" in list(
        mean_residual_deviance.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert len(
        mean_residual_deviance
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(mean_residual_deviance.keys()))
    assert isinstance(mean_residual_deviance["valid"], float) and isinstance(
        mean_residual_deviance["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["valid"]),
        type(mean_residual_deviance["xval"]))

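    # Note: every accessor above follows the same return-shape contract,
    # exercised by the asserts: exactly one of train/valid/xval True returns a
    # bare float, two or more True return a dict keyed by "train"/"valid"/"xval",
    # and all False falls back to the training metric.
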
    # binomial
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   auc
    auc1 = gbm.auc(train=True, valid=False, xval=False)
    assert isinstance(auc1, float)

    auc2 = gbm.auc(train=False, valid=True, xval=False)
    assert isinstance(auc2, float)

    auc3 = gbm.auc(train=False, valid=False, xval=True)
    assert isinstance(auc3, float)

    auc = gbm.auc(train=True, valid=True, xval=False)
    assert "train" in list(auc.keys()) and "valid" in list(
        auc.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert len(
        auc
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert isinstance(auc["train"], float) and isinstance(
        auc["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(auc["train"]), type(auc["valid"]))
    assert auc["valid"] == auc2

    auc = gbm.auc(train=True, valid=False, xval=True)
    assert "train" in list(auc.keys()) and "xval" in list(
        auc.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert len(
        auc
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert isinstance(auc["train"], float) and isinstance(
        auc["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(auc["train"]), type(auc["xval"]))
    assert auc["xval"] == auc3

    auc = gbm.auc(train=True, valid=True, xval=True)
    assert "train" in list(auc.keys()) and "valid" in list(auc.keys(
    )) and "xval" in list(
        auc.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert len(
        auc
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert isinstance(auc["train"], float) and isinstance(
        auc["valid"], float
    ) and isinstance(
        auc["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(auc["train"]), type(auc["valid"]), type(auc["xval"]))

    auc = gbm.auc(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(auc, float)
    assert auc == auc1

    auc = gbm.auc(train=False, valid=True, xval=True)
    assert "valid" in list(auc.keys()) and "xval" in list(
        auc.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert len(
        auc
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(auc.keys()))
    assert isinstance(auc["valid"], float) and isinstance(
        auc["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(auc["valid"]), type(auc["xval"]))

    #   roc
    (fprs1, tprs1) = gbm.roc(train=True, valid=False, xval=False)
    assert isinstance(fprs1, list)
    assert isinstance(tprs1, list)

    (fprs2, tprs2) = gbm.roc(train=False, valid=True, xval=False)
    assert isinstance(fprs2, list)
    assert isinstance(tprs2, list)

    (fprs3, tprs3) = gbm.roc(train=False, valid=False, xval=True)
    assert isinstance(fprs3, list)
    assert isinstance(tprs3, list)

    roc = gbm.roc(train=True, valid=True, xval=False)
    assert "train" in list(roc.keys()) and "valid" in list(
        roc.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert len(
        roc
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert isinstance(roc["train"], tuple) and isinstance(
        roc["valid"], tuple
    ), "expected training and validation metrics to be tuples, but got {0} and {1}".format(
        type(roc["train"]), type(roc["valid"]))
    assert roc["valid"][0] == fprs2
    assert roc["valid"][1] == tprs2

    roc = gbm.roc(train=True, valid=False, xval=True)
    assert "train" in list(roc.keys()) and "xval" in list(
        roc.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert len(
        roc
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert isinstance(roc["train"], tuple) and isinstance(
        roc["xval"], tuple
    ), "expected training and cross validation metrics to be tuples, but got {0} and {1}".format(
        type(roc["train"]), type(roc["xval"]))
    assert roc["xval"][0] == fprs3
    assert roc["xval"][1] == tprs3

    roc = gbm.roc(train=True, valid=True, xval=True)
    assert "train" in list(roc.keys()) and "valid" in list(roc.keys(
    )) and "xval" in list(
        roc.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert len(
        roc
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert isinstance(roc["train"], tuple) and isinstance(
        roc["valid"], tuple
    ) and isinstance(
        roc["xval"], tuple
    ), "expected training, validation, and cross validation metrics to be tuples, but got {0}, {1}, and {2}".format(
        type(roc["train"]), type(roc["valid"]), type(roc["xval"]))

    (fprs, tprs) = gbm.roc(train=False, valid=False,
                           xval=False)  # default: return training metrics
    assert isinstance(fprs, list)
    assert isinstance(tprs, list)
    assert fprs == fprs1
    assert tprs == tprs1

    roc = gbm.roc(train=False, valid=True, xval=True)
    assert "valid" in list(roc.keys()) and "xval" in list(
        roc.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert len(
        roc
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(roc.keys()))
    assert isinstance(roc["valid"], tuple) and isinstance(
        roc["xval"], tuple
    ), "validation and cross validation metrics to be tuples, but got {0} and {1}".format(
        type(roc["valid"]), type(roc["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True, valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True, xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True, valid=True, xval=False)
    assert "train" in list(logloss.keys()) and "valid" in list(
        logloss.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True, valid=False, xval=True)
    assert "train" in list(logloss.keys()) and "xval" in list(
        logloss.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True, valid=True, xval=True)
    assert "train" in list(logloss.keys()) and "valid" in list(logloss.keys(
    )) and "xval" in list(
        logloss.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ) and isinstance(
        logloss["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False,
                          xval=False)  # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True, xval=True)
    assert "valid" in list(logloss.keys()) and "xval" in list(
        logloss.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["valid"], float) and isinstance(
        logloss["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["valid"]), type(logloss["xval"]))

    #   giniCoef
    giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False)
    assert isinstance(giniCoef1, float)

    giniCoef2 = gbm.giniCoef(train=False, valid=True, xval=False)
    assert isinstance(giniCoef2, float)

    giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True)
    assert isinstance(giniCoef3, float)

    giniCoef = gbm.giniCoef(train=True, valid=True, xval=False)
    assert "train" in list(giniCoef.keys()) and "valid" in list(
        giniCoef.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert len(
        giniCoef
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["train"]), type(giniCoef["valid"]))
    assert giniCoef["valid"] == giniCoef2

    giniCoef = gbm.giniCoef(train=True, valid=False, xval=True)
    assert "train" in list(giniCoef.keys()) and "xval" in list(
        giniCoef.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert len(
        giniCoef
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["train"]), type(giniCoef["xval"]))
    assert giniCoef["xval"] == giniCoef3

    giniCoef = gbm.giniCoef(train=True, valid=True, xval=True)
    assert "train" in list(giniCoef.keys()) and "valid" in list(giniCoef.keys(
    )) and "xval" in list(
        giniCoef.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert len(
        giniCoef
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["valid"], float
    ) and isinstance(
        giniCoef["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(giniCoef["train"]), type(giniCoef["valid"]),
        type(giniCoef["xval"]))

    giniCoef = gbm.giniCoef(train=False, valid=False,
                            xval=False)  # default: return training metrics
    assert isinstance(giniCoef, float)
    assert giniCoef == giniCoef1

    giniCoef = gbm.giniCoef(train=False, valid=True, xval=True)
    assert "valid" in list(giniCoef.keys()) and "xval" in list(
        giniCoef.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert len(
        giniCoef
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(giniCoef.keys()))
    assert isinstance(giniCoef["valid"], float) and isinstance(
        giniCoef["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["valid"]), type(giniCoef["xval"]))

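    # The thresholded metrics below (F1 through max_per_class_error) return
    # threshold-dependent values rather than bare floats, and confusion_matrix
    # returns ConfusionMatrix objects, so these calls exercise only the
    # train/valid/xval dispatch, not the return types.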
    #   F1
    F11 = gbm.F1(train=True, valid=False, xval=False)
    F12 = gbm.F1(train=False, valid=True, xval=False)
    F13 = gbm.F1(train=False, valid=False, xval=True)
    F1 = gbm.F1(train=True, valid=True, xval=False)
    F1 = gbm.F1(train=True, valid=False, xval=True)
    F1 = gbm.F1(train=True, valid=True, xval=True)
    F1 = gbm.F1(train=False, valid=False,
                xval=False)  # default: return training metrics
    F1 = gbm.F1(train=False, valid=True, xval=True)

    #   F0point5
    F0point51 = gbm.F0point5(train=True, valid=False, xval=False)
    F0point52 = gbm.F0point5(train=False, valid=True, xval=False)
    F0point53 = gbm.F0point5(train=False, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True, valid=True, xval=False)
    F0point5 = gbm.F0point5(train=True, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True, valid=True, xval=True)
    F0point5 = gbm.F0point5(train=False, valid=False,
                            xval=False)  # default: return training metrics
    F0point5 = gbm.F0point5(train=False, valid=True, xval=True)

    #   F2
    F21 = gbm.F2(train=True, valid=False, xval=False)
    F22 = gbm.F2(train=False, valid=True, xval=False)
    F23 = gbm.F2(train=False, valid=False, xval=True)
    F2 = gbm.F2(train=True, valid=True, xval=False)
    F2 = gbm.F2(train=True, valid=False, xval=True)
    F2 = gbm.F2(train=True, valid=True, xval=True)
    F2 = gbm.F2(train=False, valid=False,
                xval=False)  # default: return training metrics
    F2 = gbm.F2(train=False, valid=True, xval=True)

    #   accuracy
    accuracy1 = gbm.accuracy(train=True, valid=False, xval=False)
    accuracy2 = gbm.accuracy(train=False, valid=True, xval=False)
    accuracy3 = gbm.accuracy(train=False, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True, valid=True, xval=False)
    accuracy = gbm.accuracy(train=True, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True, valid=True, xval=True)
    accuracy = gbm.accuracy(train=False, valid=False,
                            xval=False)  # default: return training metrics
    accuracy = gbm.accuracy(train=False, valid=True, xval=True)

    #   error
    error1 = gbm.error(train=True, valid=False, xval=False)
    error2 = gbm.error(train=False, valid=True, xval=False)
    error3 = gbm.error(train=False, valid=False, xval=True)
    error = gbm.error(train=True, valid=True, xval=False)
    error = gbm.error(train=True, valid=False, xval=True)
    error = gbm.error(train=True, valid=True, xval=True)
    error = gbm.error(train=False, valid=False,
                      xval=False)  # default: return training metrics
    error = gbm.error(train=False, valid=True, xval=True)

    #   precision
    precision1 = gbm.precision(train=True, valid=False, xval=False)
    precision2 = gbm.precision(train=False, valid=True, xval=False)
    precision3 = gbm.precision(train=False, valid=False, xval=True)
    precision = gbm.precision(train=True, valid=True, xval=False)
    precision = gbm.precision(train=True, valid=False, xval=True)
    precision = gbm.precision(train=True, valid=True, xval=True)
    precision = gbm.precision(train=False, valid=False,
                              xval=False)  # default: return training metrics
    precision = gbm.precision(train=False, valid=True, xval=True)

    #   mcc
    mcc1 = gbm.mcc(train=True, valid=False, xval=False)
    mcc2 = gbm.mcc(train=False, valid=True, xval=False)
    mcc3 = gbm.mcc(train=False, valid=False, xval=True)
    mcc = gbm.mcc(train=True, valid=True, xval=False)
    mcc = gbm.mcc(train=True, valid=False, xval=True)
    mcc = gbm.mcc(train=True, valid=True, xval=True)
    mcc = gbm.mcc(train=False, valid=False,
                  xval=False)  # default: return training metrics
    mcc = gbm.mcc(train=False, valid=True, xval=True)

    #   max_per_class_error
    max_per_class_error1 = gbm.max_per_class_error(train=True,
                                                   valid=False,
                                                   xval=False)
    max_per_class_error2 = gbm.max_per_class_error(train=False,
                                                   valid=True,
                                                   xval=False)
    max_per_class_error3 = gbm.max_per_class_error(train=False,
                                                   valid=False,
                                                   xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=True,
                                                  xval=False)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=False,
                                                  xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=True,
                                                  xval=True)
    max_per_class_error = gbm.max_per_class_error(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    max_per_class_error = gbm.max_per_class_error(train=False,
                                                  valid=True,
                                                  xval=True)

    #   confusion_matrix
    confusion_matrix1 = gbm.confusion_matrix(train=True,
                                             valid=False,
                                             xval=False)
    confusion_matrix2 = gbm.confusion_matrix(train=False,
                                             valid=True,
                                             xval=False)
    confusion_matrix3 = gbm.confusion_matrix(train=False,
                                             valid=False,
                                             xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True)
    confusion_matrix = gbm.confusion_matrix(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True)

    # #   plot
    # plot1 = gbm.plot(train=True,  valid=False, xval=False)
    # plot2 = gbm.plot(train=False, valid=True,  xval=False)
    # plot3 = gbm.plot(train=False, valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=False)
    # plot = gbm.plot(train=True,  valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=True)
    # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics
    # plot = gbm.plot(train=False, valid=True,  xval=True)

    # #   tpr
    # tpr1 = gbm.tpr(train=True,  valid=False, xval=False)
    # tpr2 = gbm.tpr(train=False, valid=True,  xval=False)
    # tpr3 = gbm.tpr(train=False, valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=False)
    # tpr = gbm.tpr(train=True,  valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=True)
    # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics
    # tpr = gbm.tpr(train=False, valid=True,  xval=True)
    #
    # #   tnr
    # tnr1 = gbm.tnr(train=True,  valid=False, xval=False)
    # tnr2 = gbm.tnr(train=False, valid=True,  xval=False)
    # tnr3 = gbm.tnr(train=False, valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=False)
    # tnr = gbm.tnr(train=True,  valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=True)
    # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics
    # tnr = gbm.tnr(train=False, valid=True,  xval=True)
    #
    # #   fnr
    # fnr1 = gbm.fnr(train=True,  valid=False, xval=False)
    # fnr2 = gbm.fnr(train=False, valid=True,  xval=False)
    # fnr3 = gbm.fnr(train=False, valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=False)
    # fnr = gbm.fnr(train=True,  valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=True)
    # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics
    # fnr = gbm.fnr(train=False, valid=True,  xval=True)
    #
    # #   fpr
    # fpr1 = gbm.fpr(train=True,  valid=False, xval=False)
    # fpr2 = gbm.fpr(train=False, valid=True,  xval=False)
    # fpr3 = gbm.fpr(train=False, valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=False)
    # fpr = gbm.fpr(train=True,  valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=True)
    # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics
    # fpr = gbm.fpr(train=False, valid=True,  xval=True)

    # multinomial
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    distribution = "multinomial"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True, valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True, xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True, valid=True, xval=False)
    assert "train" in list(mse.keys()) and "valid" in list(
        mse.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True, valid=False, xval=True)
    assert "train" in list(mse.keys()) and "xval" in list(
        mse.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["train"], float) and isinstance(
        mse["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True, valid=True, xval=True)
    assert "train" in list(mse.keys()) and "valid" in list(mse.keys(
    )) and "xval" in list(
        mse.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ) and isinstance(
        mse["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True, xval=True)
    assert "valid" in list(mse.keys()) and "xval" in list(
        mse.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert len(
        mse
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(mse.keys()))
    assert isinstance(mse["valid"], float) and isinstance(
        mse["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["valid"]), type(mse["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True, valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True, xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True, valid=True, xval=False)
    assert "train" in list(logloss.keys()) and "valid" in list(
        logloss.keys()
    ), "expected training and validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True, valid=False, xval=True)
    assert "train" in list(logloss.keys()) and "xval" in list(
        logloss.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True, valid=True, xval=True)
    assert "train" in list(logloss.keys()) and "valid" in list(logloss.keys(
    )) and "xval" in list(
        logloss.keys()
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ) and isinstance(
        logloss["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False,
                          xval=False)  # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True, xval=True)
    assert "valid" in list(logloss.keys()) and "xval" in list(
        logloss.keys()
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert len(
        logloss
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        list(logloss.keys()))
    assert isinstance(logloss["valid"], float) and isinstance(
        logloss["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["valid"]), type(logloss["xval"]))

    #   hit_ratio_table
    hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False)
    hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False)
    hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True)

    # clustering
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3)
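    # No validation frame is supplied to k-means here, so only the train and
    # xval variants of the clustering metrics are exercised below.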

    #   betweenss
    betweenss1 = km.betweenss(train=True, valid=False, xval=False)
    assert isinstance(betweenss1, float)

    betweenss3 = km.betweenss(train=False, valid=False, xval=True)
    assert isinstance(betweenss3, float)

    betweenss = km.betweenss(train=True, valid=False, xval=True)
    assert "train" in list(betweenss.keys()) and "xval" in list(
        betweenss.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(betweenss.keys()))
    assert len(
        betweenss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(betweenss.keys()))
    assert isinstance(betweenss["train"], float) and isinstance(
        betweenss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(betweenss["train"]), type(betweenss["xval"]))
    assert betweenss["xval"] == betweenss3

    betweenss = km.betweenss(train=False, valid=False,
                             xval=False)  # default: return training metrics
    assert isinstance(betweenss, float)
    assert betweenss == betweenss1

    #   totss
    totss1 = km.totss(train=True, valid=False, xval=False)
    assert isinstance(totss1, float)

    totss3 = km.totss(train=False, valid=False, xval=True)
    assert isinstance(totss3, float)

    totss = km.totss(train=True, valid=False, xval=True)
    assert "train" in list(totss.keys()) and "xval" in list(
        totss.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(totss.keys()))
    assert len(
        totss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(totss.keys()))
    assert isinstance(totss["train"], float) and isinstance(
        totss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(totss["train"]), type(totss["xval"]))
    assert totss["xval"] == totss3

    totss = km.totss(train=False, valid=False,
                     xval=False)  # default: return training metrics
    assert isinstance(totss, float)
    assert totss == totss1

    #   tot_withinss
    tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False)
    assert isinstance(tot_withinss1, float)

    tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True)
    assert isinstance(tot_withinss3, float)

    tot_withinss = km.tot_withinss(train=True, valid=False, xval=True)
    assert "train" in list(tot_withinss.keys()) and "xval" in list(
        tot_withinss.keys()
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        list(tot_withinss.keys()))
    assert len(
        tot_withinss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        list(tot_withinss.keys()))
    assert isinstance(tot_withinss["train"], float) and isinstance(
        tot_withinss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(tot_withinss["train"]), type(tot_withinss["xval"]))
    assert tot_withinss["xval"] == tot_withinss3

    tot_withinss = km.tot_withinss(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    assert isinstance(tot_withinss, float)
    assert tot_withinss == tot_withinss1

    #   withinss
    withinss1 = km.withinss(train=True, valid=False, xval=False)
    withinss3 = km.withinss(train=False, valid=False, xval=True)
    withinss = km.withinss(train=True, valid=False, xval=True)
    withinss = km.withinss(train=False, valid=False,
                           xval=False)  # default: return training metrics

    #   centroid_stats
    centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False)
    centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True)
    centroid_stats = km.centroid_stats(train=True, valid=False, xval=True)
    centroid_stats = km.centroid_stats(
        train=False, valid=False,
        xval=False)  # default: return training metrics

    #   size
    size1 = km.size(train=True, valid=False, xval=False)
    size3 = km.size(train=False, valid=False, xval=True)
    size = km.size(train=True, valid=False, xval=True)
    size = km.size(train=False, valid=False,
                   xval=False)  # default: return training metrics


def test_modelselection_gaussian():
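    # `modelSelection` and `glm` below are presumably import aliases for
    # H2OModelSelectionEstimator and H2OGeneralizedLinearEstimator.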
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model_maxrsweep = modelSelection(seed=12345,
                                     max_predictor_number=3,
                                     mode="maxrsweep")
    model_maxrsweep.train(training_frame=d, x=my_x, y=my_y)
    model_maxr = modelSelection(seed=12345,
                                max_predictor_number=3,
                                mode="maxr")
    model_maxr.train(training_frame=d, x=my_x, y=my_y)

    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(model_maxr.result()[2:4],
                                      model_maxrsweep.result()[2:4],
                                      prob=1.0,
                                      tol=1e-6)

    model_allsubsets = modelSelection(seed=12345,
                                      max_predictor_number=3,
                                      mode="allsubsets")
    model_allsubsets.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_allsubsets = model_allsubsets.get_best_R2_values()
    best_predictor_names_allsubsets = model_allsubsets.get_best_model_predictors()
    best_r2_value_maxr = model_maxr.get_best_R2_values()

    # assert that the one-predictor model found by modelselection is the best,
    # by comparing it against manually trained one-predictor GLMs
    one_pred_r2 = []
    for pred in my_x:
        x = [pred]
        m = glm(seed=12345)
        m.train(training_frame=d, x=x, y=my_y)
        one_pred_r2.append(m.r2())
    best_r2 = max(one_pred_r2)
    assert abs(best_r2 - best_r2_value_allsubsets[0]) < 1e-6, \
        "expected best r2: {0}, allsubsets actual best r2: {1}. They are different.".format(
            best_r2, best_r2_value_allsubsets[0])
    assert abs(best_r2 - best_r2_value_maxr[0]) < 1e-6, \
        "expected best r2: {0}, maxr actual best r2: {1}. They are different.".format(
            best_r2, best_r2_value_maxr[0])
    assert abs(best_r2_value_allsubsets[0] - best_r2_value_maxr[0]) < 1e-6, \
        "allsubsets best r2: {0}, maxr best r2: {1}. They are different.".format(
            best_r2_value_allsubsets[0], best_r2_value_maxr[0])

    print("Best one predictor model uses predictor: {0}".format(
        best_predictor_names_allsubsets[0]))

    my_x3 = [["AGE", "RACE", "CAPSULE"], ["AGE", "RACE", "DCAPS"],
             ["AGE", "RACE", "PSA"], ["AGE", "RACE", "VOL"],
             ["AGE", "RACE", "DPROS"], ["AGE", "CAPSULE", "DCAPS"],
             ["AGE", "CAPSULE", "PSA"], ["AGE", "CAPSULE", "VOL"],
             ["AGE", "CAPSULE", "DPROS"], ["AGE", "DCAPS", "PSA"],
             ["AGE", "DCAPS", "PSA"], ["AGE", "DCAPS", "VOL"],
             ["AGE", "DCAPS", "DPROS"], ["AGE", "PSA", "VOL"],
             ["AGE", "PSA", "VOL"], ["AGE", "PSA", "DPROS"],
             ["AGE", "VOL", "DPROS"], ["RACE", "CAPSULE", "DCAPS"],
             ["RACE", "CAPSULE", "PSA"], ["RACE", "CAPSULE", "VOL"],
             ["RACE", "CAPSULE", "DPROS"], ["RACE", "DCAPS", "PSA"],
             ["RACE", "DCAPS", "VOL"], ["RACE", "DCAPS", "DPROS"],
             ["RACE", "PSA", "VOL"], ["RACE", "PSA", "DPROS"],
             ["RACE", "VOL", "DPROS"], ["CAPSULE", "DCAPS", "PSA"],
             ["CAPSULE", "DCAPS", "VOL"], ["CAPSULE", "DCAPS", "DPROS"],
             ["DCAPS", "PSA", "VOL"], ["DCAPS", "PSA", "DPROS"],
             ["DCAPS", "VOL", "DPROS"], ["PSA", "VOL", "DPROS"]]
    three_pred_r2 = []
    for pred3 in my_x3:
        m = glm(seed=12345)
        m.train(training_frame=d, x=pred3, y=my_y)
        three_pred_r2.append(m.r2())
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred - best_r2_value_allsubsets[2]) < 1e-6, \
        "expected best r2: {0}, allsubsets actual best r2: {1}. They are different.".format(
            best_r2_three_pred, best_r2_value_allsubsets[2])
    assert abs(best_r2_three_pred - best_r2_value_maxr[2]) < 1e-6, \
        "expected best r2: {0}, maxr actual best r2: {1}. They are different.".format(
            best_r2_three_pred, best_r2_value_maxr[2])
    assert abs(best_r2_value_allsubsets[2] - best_r2_value_maxr[2]) < 1e-6, \
        "allsubsets best r2: {0}, maxr best r2: {1}. They are different.".format(
            best_r2_value_allsubsets[2], best_r2_value_maxr[2])
    print("Best three-predictor model uses predictors: {0}".format(
        best_predictor_names_allsubsets[2]))
Example No. 58
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        log.info("Starting H2O cluster with %s cores, %smb memory.",
                 config.cores, config.max_mem_size_mb)
        h2o.init(nthreads=config.cores,
                 max_mem_size=str(config.max_mem_size_mb) + "M")

        # Load both train and test as H2O Frames; predictions are converted
        # back to Pandas DataFrames after scoring
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **config.framework_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        log.debug("Leaderboard:\n%s", str(aml.leaderboard.as_data_frame()))

        preds = aml.predict(test).as_data_frame()
        # predictions = h2o.get_model(aml.leaderboard[0][1, 0]).predict(test).as_data_frame()

        y_pred = preds.iloc[:, 0]
        y_truth = test[:, dataset.target.index].as_data_frame(header=False)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=y_truth.values)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
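
# For reference, the AutoML flow above reduces to a few calls outside the
# benchmark harness. A minimal sketch, assuming hypothetical file paths and a
# hypothetical "target" response column:
#
#     import h2o
#     from h2o.automl import H2OAutoML
#
#     h2o.init()
#     train = h2o.import_file("train.csv")   # hypothetical path
#     test = h2o.import_file("test.csv")     # hypothetical path
#     aml = H2OAutoML(max_runtime_secs=60, sort_metric="AUC", seed=1)
#     aml.train(y="target", training_frame=train)
#     preds = aml.predict(test).as_data_frame()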
Example No. 59
def metric_json_check():

    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = h2o.gbm(y=df["CAPSULE"],
                      x=df[3:],
                      training_frame=df,
                      distribution="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = list(reg_met._metric_json.keys())
    reg_metric_json_keys_desired = [
        u'model_category', u'description', u'r2', u'frame', u'model_checksum',
        u'MSE', u'__meta', u'scoring_time', u'predictions', u'model',
        u'duration_in_ms', u'frame_checksum', u'mean_residual_deviance'
    ]
    reg_metric_diff = list(
        set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) regression " \
                                "metric json. The difference is {2}".format(reg_metric_json_keys_have,
                                                                            reg_metric_json_keys_desired,
                                                                            reg_metric_diff)
    # Regression metric json (GLM)
    reg_mod = h2o.glm(y=df["CAPSULE"],
                      x=df[3:],
                      training_frame=df,
                      family="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = list(reg_met._metric_json.keys())
    reg_metric_json_keys_desired = [
        u'model_category', u'description', u'r2',
        u'residual_degrees_of_freedom', u'frame', u'model_checksum', u'MSE',
        u'__meta', u'null_deviance', u'scoring_time',
        u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
        u'duration_in_ms', u'frame_checksum', u'residual_deviance',
        u'mean_residual_deviance'
    ]
    reg_metric_diff = list(
        set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-regression " \
                                "metric json. The difference is {2}".format(reg_metric_json_keys_have,
                                                                            reg_metric_json_keys_desired,
                                                                            reg_metric_diff)

    # Binomial metric json
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(),
                      x=df[3:],
                      training_frame=df,
                      distribution="bernoulli")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = list(bin_met._metric_json.keys())
    bin_metric_json_keys_desired = [
        u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'gains_lift_table', u'logloss',
        u'scoring_time', u'thresholds_and_metric_scores', u'predictions',
        u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
        u'frame_checksum', u'domain'
    ]
    bin_metric_diff = list(
        set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) binomial " \
                                "metric json. The difference is {2}".format(bin_metric_json_keys_have,
                                                                            bin_metric_json_keys_desired,
                                                                            bin_metric_diff)

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(),
                      x=df[3:],
                      training_frame=df,
                      family="binomial")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = list(bin_met._metric_json.keys())
    bin_metric_json_keys_desired = [
        u'frame', u'residual_deviance', u'max_criteria_and_metric_scores',
        u'MSE', u'frame_checksum', u'AIC', u'logloss', u'Gini', u'predictions',
        u'AUC', u'description', u'model_checksum', u'duration_in_ms',
        u'model_category', u'gains_lift_table', u'r2',
        u'residual_degrees_of_freedom', u'__meta', u'null_deviance',
        u'scoring_time', u'null_degrees_of_freedom', u'model',
        u'thresholds_and_metric_scores', u'domain'
    ]
    bin_metric_diff = list(
        set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-binomial " \
                                "metric json. The difference is {2}".format(bin_metric_json_keys_have,
                                                                            bin_metric_json_keys_desired,
                                                                            bin_metric_diff)

    # Multinomial metric json
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = [
        "Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance",
        "fDayofMonth", "fDayOfWeek"
    ]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX],
                      y=df[myY],
                      training_frame=df,
                      distribution="multinomial")
    mul_met = mul_mod.model_performance()
    mul_metric_json_keys_have = list(mul_met._metric_json.keys())
    mul_metric_json_keys_desired = [
        u'cm', u'model_category', u'description', u'r2', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
        u'predictions', u'hit_ratio_table', u'model', u'duration_in_ms',
        u'frame_checksum'
    ]
    mul_metric_diff = list(
        set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
                                "metric json. The difference is {2}".format(mul_metric_json_keys_have,
                                                                            mul_metric_json_keys_desired,
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = list(clus_met._metric_json.keys())
    clus_metric_json_keys_desired = [
        u'tot_withinss', u'model_category', u'description', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
        u'predictions', u'totss', u'model', u'duration_in_ms',
        u'frame_checksum', u'centroid_stats'
    ]
    clus_metric_diff = list(
        set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                            clus_metric_json_keys_desired,
                                                                            clus_metric_diff)
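
# A minimal sketch of the inspection pattern this test relies on, assuming any
# trained H2O model `mod` from above:
#
#     perf = mod.model_performance()
#     print(sorted(perf._metric_json.keys()))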
Example No. 60
import numpy as np
import pandas as pd
import h2o

print('Loading data')
h2o.init()
feats = ["id", 'era', 'data_type']
pred_columns = []
for i in range(50):
    name = "feature" + str(i + 1)
    pred_columns.append(name)
    feats.append(name)
feats.append("target")
df = h2o.import_file("../input/numerai_training_data.csv")

test = h2o.import_file('../input/numerai_tournament_data.csv')
#valid=test[test['data_type']=='validation']

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

#GBM=H2OGradientBoostingEstimator(
#        ntrees=10,
#        learn_rate=0.2,
#        learn_rate_annealing = 0.99,
#        sample_rate = 0.8,
#        col_sample_rate = 0.8,
#        seed = 1234,