Example #1
0
def main(do_tests=False, do_bench=False):
    """Run the test suite and/or the benchmarks, depending on the flags.

    When neither flag is set, a notice is printed, the help text of the
    module-level ``parser`` is shown, and nothing is run.
    """
    if not (do_tests or do_bench):
        # Nothing was requested: say so and show the usage text instead.
        print("test: neither tests nor benchmarks are enabled")
        parser.print_help()
        return

    if do_tests:
        print("[!] Running test suite!")
        tests.run_test()

    if do_bench:
        print("[!] Running benchmarks! (may take some time)")
        tests.run_bench()
Example #2
0
import sys
sys.path.insert(1, "../../")
import h2o, tests


def vec_as_list():
    """Check that ``h2o.as_list`` returns the expected cells for single columns.

    The first iris column is checked as imported, then the first two
    columns of ``2 - iris`` are checked.
    """
    tolerance = 1e-10

    def cells_match(table, expectations):
        # Each expectation is (row index, expected value) for column 0.
        # NOTE(review): row 0 presumably holds the column name -- confirm.
        return all(abs(float(table[row][0]) - value) < tolerance
                   for row, value in expectations)

    iris = h2o.import_file(
        path=tests.locate("smalldata/iris/iris_wheader.csv"))

    first_col = h2o.as_list(iris[0], use_pandas=False)
    assert cells_match(first_col, ((4, 4.6), (6, 5.4), (10, 4.9))), "incorrect values"

    flipped = 2 - iris
    flipped_first = h2o.as_list(flipped[0], use_pandas=False)
    assert cells_match(flipped_first, ((4, -2.6), (18, -3.1), (25, -2.8))), "incorrect values"

    flipped_second = h2o.as_list(flipped[1], use_pandas=False)
    assert cells_match(flipped_second, ((4, -1.1), (6, -1.9), (10, -1.1))), "incorrect values"


if __name__ == "__main__":
    tests.run_test(sys.argv, vec_as_list)
    assert abs(1515.91815848623 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking binomial model without offset..."
    prostate_glm_h2o = h2o.glm(
        x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
        y=prostate_hex["CAPSULE"],
        training_frame=prostate_hex,
        family="poisson",
        standardize=False,
    )
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(216.339989007507)
    assert abs(216.339989007507 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking binomial model with offset..."
    prostate_glm_h2o = h2o.glm(
        x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "AGE"]],
        y=prostate_hex["CAPSULE"],
        training_frame=prostate_hex,
        family="poisson",
        offset_column="AGE",
        standardize=False,
    )
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(2761.76218461138)
    assert abs(2761.76218461138 - prostate_glm_h2o.residual_deviance()) < 0.1


if __name__ == "__main__":
    tests.run_test(sys.argv, offset_1897)
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def weights_and_distributions(ip,port):
    """Train and score a weighted deep learning model for several distributions.

    For each of gamma, gaussian, poisson and tweedie a model is built on
    the moppe data with ``antskad`` as the weights column and then used
    to predict on the same frame. Nothing is asserted about the results;
    the test passes if no call raises. ``ip`` and ``port`` are not used
    in the body.
    """
    frame = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    for factor_col in ("premiekl", "moptva"):
        frame[factor_col] = frame[factor_col].asfactor()
    # NOTE(review): this assigns the column to itself unchanged -- confirm
    # whether a conversion (as for the two columns above) was intended.
    frame["zon"] = frame["zon"]

    for family in ("gamma", "gaussian", "poisson", "tweedie"):
        model = h2o.deeplearning(x=frame[0:3],
                                 y=frame["medskad"],
                                 training_frame=frame,
                                 distribution=family,
                                 weights_column="antskad")
        scored = model.predict(frame)

if __name__ == "__main__":
    tests.run_test(sys.argv, weights_and_distributions)
Example #5
0
import sys
sys.path.insert(1, "../../")
import h2o, tests

def rep_len_check():
    """Exercise ``rep_len`` on a single column and on a whole frame.

    For a column, the result must have the requested number of rows and
    repeat the source values cyclically (period 150). For a frame, the
    result must have 150 rows and the requested number of columns.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # Single column: the values wrap around every 150 rows.
    vec = iris[0].rep_len(length_out=301)
    assert vec.nrow == 301, "Expected an H2OVec with 301 rows, but got {0} rows".format(vec.nrow)
    for row in range(len(vec)):
        actual = vec[row,:]
        wrapped = vec[row % 150,:]
        assert actual == wrapped, "Expected {0}, but got {1}".format(wrapped, actual)

    # Whole frame: the length applies to the number of columns.
    repeated = iris.rep_len(length_out=7)
    shape_ok = repeated.nrow == 150 and repeated.ncol == 7
    assert shape_ok, "Expected an H2OFrame with 150 rows and 7 columns, but got {0} rows and {1} cols".format(repeated.nrow, repeated.ncol)

if __name__ == "__main__":
    tests.run_test(sys.argv, rep_len_check)
import sys
sys.path.insert(1, "../../")
import h2o, tests

def javapredict_iris_drf():
    """Run the shared ``tests.javapredict`` check for a random forest on iris.

    The model options are printed first, then the iris frames, predictor
    names and response name are handed to ``tests.javapredict``.
    """
    # optional parameters
    drf_options = {'ntrees':100, 'max_depth':5, 'min_rows':10}
    print("Parameter list:")
    for option, setting in drf_options.items():
        print("{0}, {1}".format(option, setting))

    # NOTE(review): both frames are loaded from iris_train.csv -- confirm
    # that scoring on the training file is intended.
    train = h2o.import_file(tests.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(tests.locate("smalldata/iris/iris_train.csv"))
    predictors = ["sepal_len","sepal_wid","petal_len","petal_wid"]
    response = "species"

    tests.javapredict("random_forest", "class", train, test, predictors, response, **drf_options)

if __name__ == "__main__":
    tests.run_test(sys.argv, javapredict_iris_drf)
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def get_modelGBM():
    """Build a bernoulli GBM on prostate data and fetch it back by id.

    The model is trained with column 1 (converted to a factor) as the
    response and columns 2..8 as predictors, used once for prediction,
    then retrieved through ``h2o.get_model`` and displayed.
    """
    frame = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    frame.describe()
    frame[1] = frame[1].asfactor()

    trained = h2o.gbm(y=frame[1], x=frame[2:9], distribution="bernoulli")
    trained.show()
    trained.predict(frame)

    # Round-trip: look the same model up by its id.
    fetched = h2o.get_model(trained._id)
    fetched.show()

if __name__ == "__main__":
  tests.run_test(sys.argv, get_modelGBM)
import sys
sys.path.insert(1,"../../../")
import h2o, tests

def gbm_mean_residual_deviance():
    """Check that GBM reports float mean residual deviance for every split.

    A GBM is trained on a random ~80/20 split of the cars data with
    3-fold cross-validation; the training, validation and
    cross-validation deviances must each be a ``float``.
    """
    cars =  h2o.import_file(path=tests.locate("smalldata/junit/cars_20mpg.csv"))
    split = cars[0].runif()
    train = cars[split > 0.2]
    valid = cars[split <= 0.2]
    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy"

    gbm_args = dict(x=train[predictors],
                    y=train[response_col],
                    validation_x=valid[predictors],
                    validation_y=valid[response_col],
                    nfolds=3)
    model = h2o.gbm(**gbm_args)
    deviances = model.mean_residual_deviance(train=True,valid=True,xval=True)

    # (result key, failure message) in the order the checks are made.
    checks = (
        ('train', "Expected training mean residual deviance to be a float, but got " \
                  "{0}"),
        ('valid', "Expected validation mean residual deviance to be a float, but got " \
                  "{0}"),
        ('xval', "Expected cross-validation mean residual deviance to be a float, but got " \
                 "{0}"),
    )
    for key, message in checks:
        assert isinstance(deviances[key],float), message.format(type(deviances[key]))

if __name__ == '__main__':
    tests.run_test(sys.argv, gbm_mean_residual_deviance)
Example #9
0
    # build  transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(n_components=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])

    params = {
        "standardize__center": [True, False],  # Parameters to test
        "standardize__scale": [True, False],
        "pca__n_components": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])

    print random_search.best_estimator_


if __name__ == "__main__":
    tests.run_test(sys.argv, scale_pca_rf_pipe)
Example #10
0
import sys
sys.path.insert(1, "../../")
import h2o, tests


def colname_set_basic():
    """Copy column names from one frame onto another and verify they match.

    Two iris files are uploaded (``iris.csv`` and ``iris_header.csv``);
    the names of the second are applied to the first via ``set_names``.
    """
    print("Uploading iris data...")

    no_headers = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))
    headers_and = h2o.upload_file(
        tests.locate("smalldata/iris/iris_header.csv"))

    # Show the names of both frames before renaming.
    for frame in (no_headers, headers_and):
        print(frame.names)

    no_headers.set_names(headers_and.names)
    names_agree = no_headers.names == headers_and.names
    assert names_agree, "Expected the same column names but got {0} and {1}".\
        format(no_headers.names, headers_and.names)


if __name__ == "__main__":
    tests.run_test(sys.argv, colname_set_basic)
Example #11
0
import sys
sys.path.insert(1, "../../")
import h2o, tests

def vec_show(ip,port):
    """Display a frame and several single columns through ``show()``.

    Shows the iris frame, the first two columns of ``2 - iris`` and the
    third iris column. Nothing is asserted; the test passes when no call
    raises. ``ip`` and ``port`` are not used in the body.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print("iris:")
    iris.show()

    ###################################################################

    flipped = 2 - iris
    for index, label in ((0, "res2:"), (1, "res3:")):
        column = flipped[index]
        print(label)
        column.show()

    iris[2].show()

if __name__ == "__main__":
    tests.run_test(sys.argv, vec_show)
Example #12
0
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def checkpoint_new_category_in_response():
    """Verify a GBM checkpoint restart fails when the response domain grows.

    A first model is trained on the setosa/versicolor subset; continuing
    from that checkpoint on the full iris data must raise
    ``EnvironmentError``.
    """
    sv = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))

    base_model = h2o.gbm(x=sv[[0, 1, 2, 3]], y=sv[4], ntrees=100)

    # Continuing with an expanded categorical response domain should fail.
    try:
        continued = h2o.gbm(x=iris[[0, 1, 2, 3]],
                            y=iris[4],
                            ntrees=200,
                            checkpoint=base_model.model_id)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in response"


if __name__ == '__main__':
    tests.run_test(sys.argv, checkpoint_new_category_in_response)
Example #13
0

def benign():
    """Check that a binomial GLM reports exactly the predictors passed in.

    A GLM is trained on the benign data with column 3 as the response
    and columns 0-2 and 4-10 as predictors. The names in the model's
    coefficients table, minus its first row, must equal the predictor
    names in order.
    """
    training_data = h2o.import_file(
        tests.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # Build explicit lists: on Python 3 ``range`` objects cannot be
    # concatenated with ``+``; on Python 2 this is the same list as before.
    X = list(range(3)) + list(range(4, 11))

    #Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(),
                    x=training_data[X],
                    family="binomial",
                    alpha=[0],
                    Lambda=[1e-5])

    #Log.info("Check that the columns used in the model are the ones we passed in.")
    #Log.info("===================Columns passed in: ================")
    in_names = [training_data.names[i] for i in X]
    #Log.info("===================Columns passed out: ================")
    # One row per predictor plus one extra leading row, which is skipped below.
    # NOTE(review): the leading row is presumably the intercept -- confirm.
    out_names = [
        model._model_json['output']['coefficients_table'].cell_values[c][0]
        for c in range(len(X) + 1)
    ]
    assert in_names == out_names[1:]


if __name__ == "__main__":
    tests.run_test(sys.argv, benign)
Example #14
0
    assert [h2o_rows,
            h2o_cols] == [np_rows,
                          np_cols], "expected equal number of columns and rows"

    # Log.info("Slice out a column and data frame it, try dim on it...")

    h2o_slice = h2o_data[4]
    np_slice = np_data[:, 4]

    h2o_rows, h2o_cols = h2o_slice.dim
    np_rows = np_slice.shape[0]

    print 'The dimensions of h2o column slice is: {0} x {1}'.format(
        h2o_rows, h2o_cols)
    print 'The dimensions of numpy array column slice is: {0} x 1'.format(
        np_rows)

    assert [h2o_rows,
            h2o_cols] == [np_rows,
                          1], "expected equal number of columns and rows"

    # Log.info("OK, now try an operator, e.g. '&', and then check dimensions again...")

    h2oColAmpFive = h2o_slice & 5

    assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"


if __name__ == "__main__":
    tests.run_test(sys.argv, dim_checks)
Example #15
0
import sys
sys.path.insert(1, "../../")
import h2o, tests

def http_import():
    """Import a zipped CSV over HTTP and display the resulting frame."""
    source_url = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip"
    frame = h2o.import_file(path=source_url)
    frame.show()

if __name__ == "__main__":
    tests.run_test(sys.argv, http_import)
import sys
sys.path.insert(1, "../../")
import h2o, tests
import random

def pyunit_remove_vecs():
    """Check that ``remove_vecs`` drops the requested number of columns.

    A random selection of 1-5 columns is removed from the prostate
    frame, first by column index and then by column name. Each time the
    result must keep all rows and lose exactly that many columns.
    """
    # TODO PUBDEV-1789
    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    rows, cols = pros.dim

    def check_removal(candidates):
        # Remove a random subset of ``candidates`` and verify the new shape.
        remove = random.randint(1,5)
        trimmed = pros.remove_vecs(cols=random.sample(candidates,remove))
        new_rows, new_cols = trimmed.dim
        expected_cols = cols - remove
        # The message reports the column count actually expected after
        # removal, not the original column count.
        assert new_rows == rows and new_cols == expected_cols, \
            "Expected {0} rows and {1} columns, but got {2} rows and {3} " \
            "columns.".format(rows,expected_cols,new_rows,new_cols)

    check_removal(range(cols))   # select columns by index
    check_removal(pros.names)    # select columns by name

if __name__ == "__main__":
    tests.run_test(sys.argv, pyunit_remove_vecs)
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)

    
    

    #Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    #Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    #swpreds.summary()

    # Train H2O DRF without Noise Column
    #Log.info("Distributed Random Forest with only Predictor Column")
    model1 = h2o.random_forest(x=swpreds[["X1"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    print(perf1.auc())

    # Train H2O DRF Model including Noise Column:
    #Log.info("Distributed Random Forest including Noise Column")
    model2 = h2o.random_forest(x=swpreds[["X1","X2"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    print(perf2.auc())
  
if __name__ == "__main__":
  tests.run_test(sys.argv, swpredsRF)
            ncols, c)

    # prostate[int,slice]
    for ncols in range(1, cols + 1):
        r, c = prostate[random.randint(0, rows - 1), 0:ncols].dim
        assert r == 1, "incorrect number of rows. correct: {0}, computed: {1}".format(
            1, r)
        assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(
            ncols, c)

    # prostate[slice,int]
    for nrows in range(1, 10):
        r, c = prostate[0:nrows, random.randint(0, cols - 1)].dim
        assert r == nrows, "incorrect number of rows. correct: {0}, computed: {1}".format(
            nrows, r)
        assert c == 1, "incorrect number of cols. correct: {0}, computed: {1}".format(
            1, c)

    # prostate[slice,slice]
    for nrows in range(1, 10):
        for ncols in range(1, cols + 1):
            r, c = prostate[0:nrows, 0:ncols].dim
            assert r == nrows, "incorrect number of rows. correct: {0}, computed: {1}".format(
                nrows, r)
            assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(
                ncols, c)


if __name__ == "__main__":
    tests.run_test(sys.argv, slicing_shape)
  df_hex.summary()

  assert (not df_hex['h1'].isfactor())
  assert (df_hex['h2'].isfactor())
  assert (not df_hex['h3'].isfactor())

  df_hex['h1'] = df_hex['h1'].asfactor()
  df_hex['h2'] = df_hex['h2'].asfactor()
  df_hex['h3'] = df_hex['h3'].asfactor()

  df_hex.show()
  df_hex.summary()

  assert (df_hex['h1'].isfactor())
  assert (df_hex['h2'].isfactor())
  assert (df_hex['h3'].isfactor())

  df_hex['h1'] = df_hex['h1'].asnumeric()
  df_hex['h2'] = df_hex['h2'].asnumeric()
  df_hex['h3'] = df_hex['h3'].asnumeric()

  df_hex.show()
  df_hex.summary()

  assert (not df_hex['h1'].isfactor())
  assert (not df_hex['h2'].isfactor())
  assert (not df_hex['h3'].isfactor())

if __name__ == "__main__":
    tests.run_test(sys.argv, continuous_or_categorical)
Example #20
0
    print py_dict_to_h2o_2.describe()

    # using collections.OrderedDict

    import collections
    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame(python_obj=collections.OrderedDict(d))

    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]

    py_ordered_dict_to_h2o_2 = H2OFrame(python_obj=collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()

    # numpy.array

    # import numpy as np
    #
    # py_numpy_ary_to_h2o = H2OFrame(python_obj=np.ones((50, 100), dtype=int))
    #
    # py_numpy_ary_to_h2o.describe()


if __name__ == "__main__":
    tests.run_test(sys.argv, upload_file)
Example #21
0
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    #vec/vec
    res = iris[0] > iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 150, "wrong number of rows returned"

    # frame/frame
    res = iris > iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] > iris[1:3]
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    #try:
    #    res = iris > iris[0:3]
    #    res.show()
    #    assert False, "expected error. frames are different dimensions."
    #except EnvironmentError:
    #    pass

if __name__ == "__main__":
  tests.run_test(sys.argv, binop_gt)
Example #22
0
import sys
sys.path.insert(1, "../../")
import h2o, tests


def hist_test(ip, port):
    """Call ``hist`` on two prostate columns with ``server=True``.

    Nothing is asserted; the test passes when the calls do not raise.
    ``ip`` and ``port`` are not used in the body.
    """
    hist_options = {'server': True}

    print("Import small prostate dataset")
    prostate = h2o.import_file(h2o.locate("smalldata/logreg/prostate.csv"))
    for column in ("AGE", "VOL"):
        prostate[column].hist(**hist_options)


if __name__ == "__main__":
    tests.run_test(sys.argv, hist_test)
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def vec_slicing():
    """Check individual cell values of single-column slices of iris.

    Covers a column taken from ``2 - iris`` and a row-sliced column
    (rows 12..24 of column 1) taken directly from the frame.
    """
    def cells_match(column, expectations):
        # Each expectation is (row index, expected value) within the column.
        return all(abs(column[row,0] - value) < 1e-10
                   for row, value in expectations)

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    ###################################################################

    # H2OVec[int]
    flipped = 2 - iris
    first_col = flipped[0]
    assert cells_match(first_col, ((3, -2.6), (17, -3.1), (24, -2.8))), "incorrect values"

    # H2OVec[slice]
    window = iris[12:25,1]
    assert cells_match(window, ((0, 3.0), (1, 3.0), (5, 3.5))), "incorrect values"

if __name__ == "__main__":
    tests.run_test(sys.argv, vec_slicing)
Example #24
0
import os, sys

sys.path.insert(1, "../../../")
import h2o, tests


def deeplearning_multi():
    """Train a deep learning classifier on a factor response and show it.

    Column 4 of the prostate data is converted to a factor and used as
    the response; the same columns serve as training and validation
    data. Nothing is asserted beyond the calls succeeding.
    """
    print(
        "Test checks if Deep Learning works fine with a multiclass training and test dataset"
    )

    frame = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))
    frame[4] = frame[4].asfactor()

    dl_args = dict(x=frame[0:2],
                   y=frame[4],
                   validation_x=frame[0:2],
                   validation_y=frame[4],
                   loss='CrossEntropy')
    model = h2o.deeplearning(**dl_args)
    model.show()


if __name__ == '__main__':
    tests.run_test(sys.argv, deeplearning_multi)
from sklearn.preprocessing import Imputer

def get_modelKmeans():
    """Build K-Means models for k = 2..6 and fetch each back by id.

    For every k an H2O model is trained on the benign data, shown,
    retrieved with ``h2o.get_model`` and shown again. A scikit-learn
    KMeans is fitted on the mean-imputed data and its centers are
    printed for manual comparison; nothing is asserted.
    """
    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    raw_matrix = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = mean_imputer.fit_transform(raw_matrix)

    for n_clusters in range(2,7):
        # Log.info("H2O K-Means")
        h2o_model = h2o.kmeans(x=benign_h2o, k=n_clusters)
        h2o_model.show()
        fetched = h2o.get_model(h2o_model._id)
        fetched.show()

        sk_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1)
        sk_model.fit(benign_sci)
        print("sckit centers")
        print(sk_model.cluster_centers_)

if __name__ == "__main__":
   tests.run_test(sys.argv, get_modelKmeans)

Example #26
0
# Build the service application and its file log observer from settings in
# `config`. `config`, `service`, `log`, `ILogObserver` and
# `CustomDailyLogFile` are defined outside this excerpt.
# NOTE(review): this looks like a Twisted application setup -- confirm.
app_name = config.get('application', 'name')
application = service.Application(app_name)
log_file = config.get('log', 'file')
log_path = config.get('log', 'directory')
log_level = config.get('log', 'level')

logfile = CustomDailyLogFile(log_file, log_path)

# Register the observer's `emit` method as the application's log observer.
application.setComponent(
    ILogObserver,
    log.FileLogObserver(logfile, log_level, exclude_systems=[]).emit)

# When run directly: start logging, run the tests, then stop logging.
if __name__ == '__main__':
    app_config = {
        'no_save': True,
        'nodaemon': False,
        'profile': False,
        'debug': False
    }

    # Remember the original streams before the logger is started.
    # Only stdout is restored below; `oldstderr` is not used again here.
    oldstdout = sys.stdout
    oldstderr = sys.stderr

    # `profiler` is created but not referenced again in this excerpt.
    profiler = app.AppProfiler(app_config)
    logger = app.AppLogger(app_config)

    logger.start(application)
    # Put the original stdout back before running the tests.
    # NOTE(review): presumably logger.start() redirects stdout -- confirm.
    sys.stdout = oldstdout
    run_test()
    logger.stop()
Example #27
0
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def anyfactor():
    """Check ``anyfactor`` on frames and single columns of iris.

    Each case has a positive example (includes the factor column at
    index 4) and a negative example (numeric columns only).
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # Frame, positive: the whole frame includes the factor column.
    whole_frame = iris.anyfactor()
    assert whole_frame, "Expected true, but got false. Column 5 is a factor."

    # Frame, negative: the first four columns only.
    numeric_part = iris[:,:4].anyfactor()
    assert not numeric_part, "Expected false, but got true. Columns 1-4 are numeric."

    # Single column, positive.
    factor_col = iris[4].anyfactor()
    assert factor_col, "Expected true, but got false. Column 5 is a factor."

    # Single column, negative.
    numeric_col = iris[0].anyfactor()
    assert not numeric_col, "Expected false, but got true. Columns 1 is numeric."

if __name__ == "__main__":
    tests.run_test(sys.argv, anyfactor)
Example #28
0
    assert pros[2, 0] == 60, "Incorrect slicing result"
    assert pros[3, 0] == 62, "Incorrect slicing result"
    assert pros[4, 0] == 71, "Incorrect slicing result"
    assert pros[5, 0] == 67, "Incorrect slicing result"

    # prostate [int,slice] case
    # 189,1,69,1,3,2,8,31.2,6
    pros = prostate[188, 0:3]
    assert pros[0, 0] == 189, "Incorrect slicing result"
    assert pros[0, 1] + 1 == 2, "Incorrect slicing result"
    assert pros[0, 2] == 69, "Incorrect slicing result"

    # prostate [slice,slice] case
    # 84,0,75,1,2,1,11,35,7
    # 85,0,75,1,1,1,9.9,15.4,7
    # 86,1,75,1,3,1,3.7,0,6
    pros = prostate[83:86, 1:4]
    assert pros[0, 0] == 0, "Incorrect slicing result"
    assert pros[0, 1] == 75, "Incorrect slicing result"
    assert pros[0, 2] - 1 == 0, "Incorrect slicing result"
    assert pros[1, 0] == 0, "Incorrect slicing result"
    assert pros[1, 1] + 75 == 150, "Incorrect slicing result"
    assert pros[1, 2] == 1, "Incorrect slicing result"
    assert pros[2, 0] + 1 == 2, "Incorrect slicing result"
    assert pros[2, 1] == 75, "Incorrect slicing result"
    assert pros[2, 2] == 1, "Incorrect slicing result"


if __name__ == "__main__":
    tests.run_test(sys.argv, multi_dim_slicing)
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = clus_met._metric_json.keys()
    clus_metric_json_keys_desired = [u'tot_withinss',
                                     u'model_category',
                                     u'description',
                                     u'frame',
                                     u'model_checksum',
                                     u'MSE',
                                     u'__meta',
                                     u'scoring_time',
                                     u'betweenss',
                                     u'predictions',
                                     u'totss',
                                     u'model',
                                     u'duration_in_ms',
                                     u'frame_checksum',
                                     u'centroid_stats']
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                            clus_metric_json_keys_desired,
                                                                            clus_metric_diff)

if __name__ == "__main__":
    tests.run_test(sys.argv, metric_json_check)
Example #30
0
    try:
        h2o.glrm(x=prostateH2O,
                 k=5,
                 loss_by_col=rd.sample(NUM_LOSS, 1),
                 loss_by_col_idx=rd.sample(CAT_COLS, 1))
        assert False, "Expected GLRM to throw error since numeric loss cannot apply to categorical column"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O,
                 k=5,
                 loss_by_col=rd.sample(CAT_LOSS, 1),
                 loss_by_col_idx=rd.sample(NUM_COLS, 1))
        assert False, "Expected GLRM to throw error since categorical loss cannot apply to numeric column"
    except:
        pass

    print "Run GLRM with loss_by_col = [" + ', '.join(
        loss_all) + "] and loss_by_col_idx = [" + ', '.join(
            [str(a) for a in loss_idx_all]) + "]"
    glrm_h2o = h2o.glrm(x=prostateH2O,
                        k=5,
                        loss_by_col=loss_all,
                        loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, glrm_set_loss_by_col_rand)
                                epochs=1,
                                reproducible=True, #slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp]._frame(), 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10,
                                  min_rows=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.082) < 0.001, "Error. Expected 0.082, but got {0}".format(cm.cell_values[10][10])

if __name__ == '__main__':
    tests.run_test(sys.argv, deeplearning_autoencoder)

Example #32
0
################################################################################
##
## Verifying that Python can support user-specified strings to be treated as
## missing.
##
################################################################################
import sys, urllib
sys.path.insert(1, "../../")
import h2o, tests

def na_strings():
    """Verify that user-supplied NA strings are counted as missing values.

    The same file is imported twice: once as-is (expecting 0 missing
    cells) and once with "fish" and "xyz" declared as NA strings for the
    second column (expecting 2 missing cells).
    """
    path = "smalldata/jira/hexdev_29.csv"

    def total_missing(frame):
        # Sum the per-column missing counts from the frame's summary endpoint.
        endpoint = "Frames/" + urllib.quote(frame._id) + "/summary"
        columns = h2o.H2OConnection.get_json(endpoint)["frames"][0]["columns"]
        return sum(column["missing_count"] for column in columns)

    fhex = h2o.import_file(tests.locate(path))
    fhex.summary()
    plain_missing = total_missing(fhex)

    fhex_na_strings = h2o.import_file(tests.locate(path),
                           na_strings=[[],["fish", "xyz"],[]])
    fhex_na_strings.summary()
    custom_missing = total_missing(fhex_na_strings)

    assert plain_missing == 0
    assert custom_missing == 2

if __name__ == "__main__":
    tests.run_test(sys.argv, na_strings)
Example #33
0
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer

def benignKmeans():
    """Print H2O and scikit-learn K-Means centers for k = 1..6.

    The benign data is clustered with both libraries (scikit-learn on a
    mean-imputed copy) and the centers are printed for manual
    comparison; nothing is asserted.
    """
    #  Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    raw_matrix = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = mean_imputer.fit_transform(raw_matrix)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    for n_clusters in range(1,7):
        h2o_model = h2o.kmeans(x=benign_h2o, k=n_clusters)
        print("H2O centers")
        print(h2o_model.centers())

        sk_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1)
        sk_model.fit(benign_sci)
        print("sckit centers")
        print(sk_model.cluster_centers_)

if __name__ == "__main__":
  tests.run_test(sys.argv, benignKmeans)
Example #34
0
################################################################################
##
## Verifying that Python can define features as categorical or continuous on import
##
################################################################################
import sys, os
sys.path.insert(1, "../../")
import h2o, tests


def continuous_or_categorical():
    """Override the first three column types to ENUM before parsing hexdev_29.csv.

    After parsing with the modified setup, columns h1, h2 and h3 are
    asserted to be factors.
    """
    raw = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
    setup = h2o.parse_setup(raw)
    for col_idx in (0, 1, 2):
        setup["column_types"][col_idx] = "ENUM"

    parsed = h2o.parse_raw(setup)

    parsed.summary()

    for col_name in ('h1', 'h2', 'h3'):
        assert (parsed[col_name].isfactor())


if __name__ == "__main__":
    tests.run_test(sys.argv, continuous_or_categorical)
    h2o_zero_weights.set_names(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print "\n\nChecking that using some zero weights is equivalent to removing those observations:"
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0,1) else [2] for r in range(406)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.set_names(["weights"])
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights):
        if w[0] == 2: doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.set_names(colnames)

    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()

    print "\n\nChecking that doubling some weights is equivalent to doubling those observations:"
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)

if __name__ == "__main__":
    tests.run_test(sys.argv, weights_var_imp)
Example #36
0
                      nfolds=cars.nrow + 1,
                      family=family,
                      fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        glm = h2o.glm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=3,
                      fold_column="fold_assignments",
                      family=family,
                      training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True

    # # 4. fold_column and fold_assignment both specified
    # try:
    #     glm = h2o.glm(y=cars[response_col], x=cars[predictors], fold_assignment="Random", fold_column="fold_assignments",
    #                   family=family, training_frame=cars)
    #     assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
    # except EnvironmentError:
    #     assert True


if __name__ == "__main__":
    tests.run_test(sys.argv, cv_carsGLM)
Example #37
0
    res = iris[0] == 4.7
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 6, "wrong number of rows returned"

    # frame/frame
    res = iris == iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] == iris[1:3]
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    #try:
    #    res = iris == iris[0:3]
    #    res.show()
    #    assert False, "expected error. frames are different dimensions."
    #except EnvironmentError:
    #    pass

if __name__ == "__main__":
    tests.run_test(sys.argv, binop_eq)
Example #38
0
import sys
sys.path.insert(1, "../../")
import h2o, tests

def sub_gsub_check():
    """Exercise sub and gsub on the string column C5 of iris.csv.

    The column is first rewritten in place on the frame, then the same two
    operations are applied to the column pulled out of the frame. After
    each step one cell is compared with the expected string.
    """
    frame = h2o.import_file(
        path=tests.locate("smalldata/iris/iris.csv"),
        col_types=["numeric", "numeric", "numeric", "numeric", "string"])

    # Column assigned back onto the frame: gsub first, then sub.
    frame["C5"] = frame["C5"].gsub("s", "z")
    cell = frame[0, 4]
    assert cell == "Iriz-zetoza", "Expected 'Iriz-zetoza', but got {0}".format(cell)

    frame["C5"] = frame["C5"].sub("z", "s")
    cell = frame[1, 4]
    assert cell == "Iris-zetoza", "Expected 'Iris-zetoza', but got {0}".format(cell)

    # Column taken out of the frame: sub first, then gsub.
    column = frame["C5"]
    column = column.sub("z", "s")
    cell = column[2, 0]
    assert cell == "Iris-setoza", "Expected 'Iris-setoza', but got {0}".format(cell)

    column = column.gsub("s", "z")
    cell = column[3, 0]
    assert cell == "Iriz-zetoza", "Expected 'Iriz-zetoza', but got {0}".format(cell)

if __name__ == "__main__":
    tests.run_test(sys.argv, sub_gsub_check)
import numpy as np

def wide_dataset_large():
    """Fit a binomial GLM on the Arcene training data and check its validation AUC.

    The model is trained on columns 1..3249 of the training frame with
    column 0 as the response, scored on the validation frame, and its AUC
    there is asserted to exceed 0.5.
    """

    def load_arcene(labels_file, features_file):
        # Label -1 becomes 0 and every other label becomes 1; the recoded
        # response is stacked in front of the features as column 0.
        labels = np.genfromtxt(tests.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(tests.locate(features_file), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = load_arcene("smalldata/arcene/arcene_train_labels.labels",
                              "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=train_frame[1:3250], y=train_frame[0].asfactor(), family="binomial",
                    lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    valid_frame = load_arcene("smalldata/arcene/arcene_valid_labels.labels",
                              "smalldata/arcene/arcene_valid.data")
    # The prediction frame is not used further.
    model.predict(valid_frame)

    print("Check performance of predictions.")
    valid_perf = model.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert valid_perf.auc() > 0.5, "predictions should be better then pure chance"

if __name__ == "__main__":
    tests.run_test(sys.argv, wide_dataset_large)
import h2o, tests


def expr_as_list(ip, port):
    """Check h2o.as_list (use_pandas=False) on expression results and local data.

    ip and port are accepted but not used in the body. Spot values are
    checked for ``2 - iris`` as a whole frame and as a single column, then
    for frames built from a flat list and from a nested list.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    tolerance = 1e-10

    # multiple rows and columns
    table = h2o.as_list(2 - iris, use_pandas=False)
    frame_cells = ((4, 0, -2.6), (5, 1, -1.6), (11, 2, 0.5))
    assert all(abs(float(table[row][col]) - want) < tolerance
               for row, col, want in frame_cells), "incorrect values"

    # single column
    column = h2o.as_list((2 - iris)[0], use_pandas=False)
    column_cells = ((4, -2.6), (18, -3.1), (25, -2.8))
    assert all(abs(float(column[row][0]) - want) < tolerance
               for row, want in column_cells), "incorrect values"

    # local data
    flat = h2o.as_list(h2o.H2OFrame(python_obj=[1, 2, 3]), use_pandas=False)
    assert float(flat[1][2]) == 3, "incorrect values"

    nested = h2o.as_list(h2o.H2OFrame(python_obj=[[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(nested[2][1]) == 5, "incorrect values"

if __name__ == "__main__":
    tests.run_test(sys.argv, expr_as_list)
Example #41
0
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def demo_glm():
    """Invoke h2o.demo for "glm" with interactive=False and test=True."""
    demo_options = dict(func="glm", interactive=False, test=True)
    h2o.demo(**demo_options)

if __name__ == "__main__":
    tests.run_test(sys.argv, demo_glm)
Example #42
0
sys.path.insert(1, "../../")
import h2o, tests


def upload_import_small():
    """Check that upload_file and import_file give frames of equal dimensions.

    Each of four small datasets is loaded both ways; row counts and column
    counts of the two resulting frames must agree.
    """
    dataset_paths = (
        "smalldata/iris/iris.csv",
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv",
        "smalldata/prostate/prostate_woheader.csv.gz",
    )

    for rel_path in dataset_paths:
        uploaded = h2o.upload_file(tests.locate(rel_path))
        imported = h2o.import_file(tests.locate(rel_path))

        upload_rows, upload_cols = uploaded.dim
        import_rows, import_cols = imported.dim

        assert upload_rows == import_rows, (
            "Expected same number of rows regardless of method. upload: {0}, import: "
            "{1}.".format(upload_rows, import_rows))

        assert upload_cols == import_cols, (
            "Expected same number of cols regardless of method. upload: {0}, import: "
            "{1}.".format(upload_cols, import_cols))


if __name__ == "__main__":
    tests.run_test(sys.argv, upload_import_small)
Example #43
0
import sys
sys.path.insert(1, "../../")
import h2o, tests

def hist_test(ip, port):
    """Call hist(server=True) on the AGE and VOL columns of prostate.csv.

    ip and port are accepted but not used in the body.
    """
    hist_options = {'server': True}

    print("Import small prostate dataset")
    prostate = h2o.import_file(h2o.locate("smalldata/logreg/prostate.csv"))
    for column_name in ("AGE", "VOL"):
        prostate[column_name].hist(**hist_options)

if __name__ == "__main__":
    tests.run_test(sys.argv, hist_test)
Example #44
0
    print
    print "======================================================================"
    print "============================== Binomial =============================="
    print "======================================================================"
    for i in range(10):
        attack(pros_train, pros_valid,
               random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1)

    print
    print "======================================================================"
    print "============================== Gaussian =============================="
    print "======================================================================"
    for i in range(10):
        attack(cars_train, cars_valid,
               random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1)

    print
    print "======================================================================"
    print "============================= Multinomial ============================"
    print "======================================================================"
    cars_train[2] = cars_train[2].asfactor()
    cars_valid[2] = cars_valid[2].asfactor()
    for i in range(10):
        attack(cars_train, cars_valid,
               random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2)


if __name__ == "__main__":
    tests.run_test(sys.argv, random_attack)
    # Connect to a pre-existing cluster
    

    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))

    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", ntrees=600, max_depth=1, min_rows=1,
                  learn_rate=.1, offset_column="offset", training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #	fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #   data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    assert abs(44.33016 - gbm._model_json['output']['init_f']) < 1e-5, "expected init_f to be {0}, but got {1}". \
        format(44.33016, gbm._model_json['output']['init_f'])
    assert abs(1491.135 - gbm.mse()) < 1e-2, "expected mse to be {0}, but got {1}".format(1491.135, gbm.mse())
    assert abs(49.23438 - predictions.mean()) < 1e-2, "expected prediction mean to be {0}, but got {1}". \
        format(49.23438, predictions.mean())
    assert abs(-45.5720659304 - predictions.min()) < 1e-2, "expected prediction min to be {0}, but got {1}". \
        format(-45.5720659304, predictions.min())
    assert abs(207.387 - predictions.max()) < 1e-2, "expected prediction max to be {0}, but got {1}". \
        format(207.387, predictions.max())

if __name__ == "__main__":
    tests.run_test(sys.argv, offset_gaussian)
        cross1_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               max_iterations=miters)
        print cross1_km

        print "Run k-means with init = final cluster centers and max_iterations = 1"
        init_centers = h2o.H2OFrame(cross1_km.centers())
        init_centers_key = init_centers.send_frame()
        cross2_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               user_points=init_centers_key,
                               max_iterations=1)
        print cross2_km

        print "Check k-means converged or maximum iterations reached"
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = ((c1 - c2)**2).sum() / ncent
        iters = cross1_km._model_json['output']['model_summary'].cell_values[
            0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        print "Not running on H2O internal network.  No access to HDFS."


if __name__ == "__main__":
    tests.run_test(sys.argv, hdfs_kmeans_converge)
Example #47
0
import pandas as pd
import numpy as np

def group_by():
    """Smoke-test the group_by aggregations on iris for each NA-handling mode.

    For "ignore", "rm" and "all", the frame is grouped by "class", eight
    aggregations are chained with that mode, and the resulting frame is
    printed. Nothing is asserted.
    """
    h2o_iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    # The pandas frame and the column-name slice are not used further below.
    pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    na_modes = ["ignore", "rm", "all"]
    h2o_iris.col_names[0:4]

    print("Running smoke test")

    aggregations = ("count", "min", "max", "mean", "var", "sd", "ss", "sum")
    for na_mode in na_modes:
        grouped = h2o_iris.group_by("class")
        # Each aggregation is called on the return value of the previous one.
        chained = grouped
        for aggregation in aggregations:
            chained = getattr(chained, aggregation)(na=na_mode)
        print(grouped.get_frame())
if __name__ == "__main__":
    tests.run_test(sys.argv, group_by)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def spaces_in_column_names():
    """Build a one-tree bernoulli GBM on columns whose names contain spaces.

    The frame is uploaded, shown and described, then a GBM is trained with
    predictors and response selected by their space-containing names.
    """
    frame = h2o.upload_file(path=tests.locate("smalldata/jira/spaces_in_column_names.csv"))
    frame.show()
    frame.describe()

    predictor_names = [
        "p r e d i c t o r 1",
        "predictor2",
        "p r e d i ctor3",
        "pre d ictor4",
        "predictor5",
    ]
    predictors = frame[predictor_names]
    response = frame["r e s p o n s e"].asfactor()
    model = h2o.gbm(x=predictors, y=response, ntrees=1, distribution="bernoulli", min_rows=1)
    model.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, spaces_in_column_names)
  # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)

  
  
  
  #Log.info("Importing swpreds_1000x3.csv data...\n")
  swpreds = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
  swpreds["y"] = swpreds["y"].asfactor()

  #Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
  #swpreds.summary()
  
  # Train H2O GBM without Noise Column
  #Log.info("H2O GBM with parameters:\nntrees = 50, max_depth = 20, nbins = 500\n")
  h2o_gbm_model1 = h2o.gbm(x=swpreds[["X1"]], y=swpreds["y"], distribution="bernoulli", ntrees=50, max_depth=20,
                           nbins=500)
  h2o_gbm_model1.show()
  h2o_gbm_perf1 = h2o_gbm_model1.model_performance(swpreds)
  h2o_auc1 = h2o_gbm_perf1.auc()

  # Train H2O GBM Model including Noise Column:
  #Log.info("H2O GBM with parameters:\nntrees = 50, max_depth = 20, nbins = 500\n")
  h2o_gbm_model2 = h2o.gbm(x=swpreds[["X1","X2"]], y=swpreds["y"], distribution="bernoulli", ntrees=50, max_depth=20,
                           nbins=500)
  h2o_gbm_model2.show()
  h2o_gbm_perf2 = h2o_gbm_model2.model_performance(swpreds)
  h2o_auc2 = h2o_gbm_perf2.auc()

if __name__ == "__main__":
  tests.run_test(sys.argv, swpredsGBM)
Example #50
0
################################################################################
##
## Verifying that Python can support importing without parsing.
##
################################################################################
import sys, os
sys.path.insert(1, "../../")
import h2o, tests


def parse_false():
    """Import hexdev_29.csv with parse=False, then parse it in a separate step.

    The unparsed import must return a list; parsing that result through
    parse_setup and parse_raw must yield an object whose class is named
    "H2OFrame".
    """
    raw_import = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(raw_import, list)

    setup = h2o.parse_setup(raw_import)
    parsed = h2o.parse_raw(setup)
    parsed.summary()
    assert parsed.__class__.__name__ == "H2OFrame"


if __name__ == "__main__":
    tests.run_test(sys.argv, parse_false)
import sys
sys.path.insert(1, "../../")
import h2o, tests

def score_history_test(ip, port):
    """Train a multinomial GBM on AirlinesTrain and print its score history.

    ip and port are accepted but not used in the body.
    """
    airlines = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    predictor_names = ["Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed", "fDayofMonth", "fMonth"]
    model = h2o.gbm(x=airlines[predictor_names],
                    y=airlines["fDayOfWeek"].asfactor(),
                    distribution="multinomial")
    history = model.score_history()
    print(history)

if __name__ == "__main__":
    tests.run_test(sys.argv, score_history_test)
Example #52
0
        u'frame_checksum'
    ]
    mul_metric_diff = list(
        set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
                                "metric json. The difference is {2}".format(mul_metric_json_keys_have,
                                                                            mul_metric_json_keys_desired,
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = clus_met._metric_json.keys()
    clus_metric_json_keys_desired = [
        u'tot_withinss', u'model_category', u'description', u'frame',
        u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
        u'predictions', u'totss', u'model', u'duration_in_ms',
        u'frame_checksum', u'centroid_stats'
    ]
    clus_metric_diff = list(
        set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                            clus_metric_json_keys_desired,
                                                                            clus_metric_diff)


if __name__ == "__main__":
    tests.run_test(sys.argv, metric_json_check)
Example #53
0
##
# Test out the sdev() functionality
# If NAs in the frame, they are skipped in calculation unless na.rm = F
# If any categorical columns, throw an error
##

import sys
sys.path.insert(1, "../../../")
import h2o, tests
import numpy as np

def sdev(ip, port):
    """Compare H2O's per-column sd() on iris with NumPy's sample standard deviation.

    ip and port are accepted but not used in the body. The NumPy reference
    uses ddof=1 over the first four columns; each H2O value must agree to
    within 1e-10. Finally sd() is called on a two-column slice without
    checking its result.
    """
    iris_h2o = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',', skip_header=1, usecols=(0, 1, 2, 3))

    reference_sd = np.std(iris_np, axis=0, ddof=1)
    for col_idx in range(4):
        h2o_sd = iris_h2o[col_idx].sd()
        assert abs(reference_sd[col_idx] - h2o_sd) < 1e-10, "expected standard deviations to be the same"

    iris_h2o[0:2].sd()
  
if __name__ == "__main__":
  tests.run_test(sys.argv, sdev)
Example #54
0
import sys
sys.path.insert(1, "../../")
import h2o, tests


def expr_show():
    """Show the iris frame, then show the first column of ``2 - iris`` twice."""
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    print("iris:")
    iris.show()

    difference = 2 - iris

    # First selection (original note: expr[int], expr._data is pending).
    first_pick = difference[0]
    print("res2:")
    first_pick.show()

    # Same selection again (original note: expr[int], expr._data is remote).
    second_pick = difference[0]
    print("res3:")
    second_pick.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, expr_show)
Example #55
0
import sys

sys.path.insert(1, "../../")
import h2o, tests


def varimp_test(ip, port):
    """Check both return modes of varimp() on a multinomial GBM trained on iris.

    ip and port are accepted but not used in the body. The default call
    must return None; with return_list=True the result must have 3 entries
    of 4 elements each.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM
    gbm_model = h2o.gbm(y=iris["class"], x=iris[1:4], ntrees=50, learn_rate=0.1, distribution="multinomial")

    default_result = gbm_model.varimp()
    assert default_result is None, "expected varimp to return None, but returned {0}".format(default_result)

    listed = gbm_model.varimp(return_list=True)
    entry_count = len(listed)
    assert entry_count == 3, "expected varimp list to contain 3 entries, but it has " "{0}".format(entry_count)

    entry_width = len(listed[0])
    assert entry_width == 4, (
        "expected varimp entry to contain 4 elements (variable, relative_importance, "
        "scaled_importance, percentage), but it has {0}".format(entry_width)
    )


if __name__ == "__main__":
    tests.run_test(sys.argv, varimp_test)
Example #56
0
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def grid_wineGBM(ip, port):
    """Call h2o.gbm on wine.data with lists of values for three parameters.

    ip and port are accepted but not used in the body. Column 1 is the
    response; columns 2..13 plus column 0 are the predictors. ntrees,
    max_depth and learn_rate are each given as a list (a grid, going by the
    function name) and the result is shown.
    """
    wine = h2o.import_file(path=h2o.locate("smalldata/gbm_test/wine.data"))

    predictor_idx = list(range(2, 14)) + [0]
    grid_values = {
        'ntrees': [5, 10, 15],
        'max_depth': [2, 3, 4],
        'learn_rate': [0.1, 0.2],
    }
    wine_grid = h2o.gbm(y=wine[1],
                        x=wine[predictor_idx],
                        distribution='gaussian',
                        **grid_values)
    wine_grid.show()

if __name__ == "__main__":
    tests.run_test(sys.argv, grid_wineGBM)
  print zz.show()


  zz = fr.apply(lambda row: h2o.ifelse(row[0] == 1, row[2], row[3]), axis=1)

  print zz.show()


  fr.apply(lambda col: col.abs()).show()
  fr.apply(lambda col: col.cos()).show()
  fr.apply(lambda col: col.sin()).show()
  fr.apply(lambda col: col.ceil()).show()
  fr.apply(lambda col: col.floor()).show()
  fr.apply(lambda col: col.cosh()).show()
  fr.apply(lambda col: col.exp()).show()
  fr.apply(lambda col: col.log()).show()
  fr.apply(lambda col: col.sqrt()).show()
  fr.apply(lambda col: col.tan()).show()
  fr.apply(lambda col: col.tanh()).show()

  fr.apply(lambda col: (col*col - col*5*col).abs() - 55/col ).show()


  fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2]-3).expm1(), (row[2] - 999).expm1()), axis=1)
  fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2]-3).expm1(), 55), axis=1)
  fr.apply(lambda row: h2o.ifelse(row[0] < 5, 3, (row[2] - 1).expm1()), axis=1)

if __name__ == "__main__":
  tests.run_test(sys.argv, pyunit_apply)
Example #58
0
import sys
sys.path.insert(1, "../../../")
import h2o, tests

def bigcatGBM():
    """Train a single depth-1 bernoulli GBM on bigcat_5000x2.csv and show its performance.

    The response "y" is converted to a factor, the model uses "X" as its
    only predictor, and performance is computed on the training frame. The
    AUC is fetched but not checked.
    """
    frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    frame["y"] = frame["y"].asfactor()

    # Train H2O GBM Model: ntrees = 1, max_depth = 1, nbins = 100
    gbm_model = h2o.gbm(x=frame[["X"]], y=frame["y"], distribution="bernoulli",
                        ntrees=1, max_depth=1, nbins=100)
    gbm_model.show()
    perf = gbm_model.model_performance(frame)
    perf.show()

    # AUC is retrieved but no threshold is asserted on it.
    perf.auc()

if __name__ == "__main__":
  tests.run_test(sys.argv, bigcatGBM)
Example #59
0
        h2o.locate("smalldata/iris/iris_wheader.csv"), col_types=["numeric", "numeric", "numeric", "numeric", "string"]
    )  # import data
    assembly = H2OAssembly(
        steps=[
            ("col_select", H2OColSelect(["sepal_len", "petal_len", "class"])),  # col selection
            ("cos_sep_len", H2OColOp(fun=H2OFrame.cos, col="sepal_len", inplace=True)),  # math operation
            ("str_cnt_species", H2OColOp(fun=H2OFrame.countmatches, col="class", inplace=False, pattern="s")),
        ]
    )  # string operation

    result = assembly.fit(fr)  # fit the assembly
    result.show()  # show the result of the fit

    assembly.to_pojo("MungingPojoDemo")  # , path="/Users/spencer/Desktop/munging_pojo")  # export POJO

    # java api usage:
    #
    #   String rawRow = framework.nextTuple();
    #   H2OMungingPOJO munger = new GeneratedH2OMungingPojo_001();
    #   EasyPredictModelWrapper model = new EasyPredictModelWrapper(new GeneratedH2OGbmPojo_001());
    #
    #   RowData row = new RowData();
    #   row.fill(rawRow);
    #   row = munger.fit(row);
    #   BinomialModelPrediction pred = model.predictBinomial(row);
    #   // Use prediction!


if __name__ == "__main__":
    tests.run_test(sys.argv, assembly_demo)
Example #60
0
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=milsong_train[1:],
                               y=milsong_train[0],
                               ntrees=ntrees2,
                               max_depth=max_depth2,
                               min_rows=min_rows2,
                               validation_x=milsong_valid[1:],
                               validation_y=milsong_valid[0],
                               checkpoint=restored_model._id,
                               seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:],
                               y=milsong_train[0],
                               ntrees=ntrees2,
                               max_depth=max_depth2,
                               min_rows=min_rows2,
                               validation_x=milsong_valid[1:],
                               validation_y=milsong_valid[0],
                               seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(
        valid=True
    ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
        model2.mse(valid=True), model3.mse(valid=True))


if __name__ == "__main__":
    tests.run_test(sys.argv, milsong_checkpoint)