Exemple #1
0
        h2o.kmeans(x=frame, k=5)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Log.info("Training data with a categorical column(s)")
    data = [[random.choice(string.ascii_uppercase) for c in range(cols)]
            for r in range(rows)]
    frame = h2o.H2OFrame(data)

    km_model = h2o.kmeans(x=frame, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        assert len(centers[c]) == 10, "expected center to be 10 " + str(
            len(centers[c]))

    # Log.info("Importing iris.csv data...\n")
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    km_model = h2o.kmeans(x=iris, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        assert len(
            centers[c]) == 5, "expected center to be 5 " + str(len(centers[c]))


if __name__ == "__main__":
    h2o.run_test(sys.argv, baddataKmeans)
    row_sum = 0
    for level in air.levels(16):
       if level == "ANC": continue
       r, c = air[str(level) == air["Origin"]].dim()
       row_sum = row_sum + r
    assert row_sum == rows - 1, "expected equal number of rows"

    # ==, !=
    jan = air[1 == air["Month"]]
    not_jan = air[1 != air["Month"]]
    no_rows, no_cols = not_jan.dim()
    yes_rows, yes_cols = jan.dim()
    assert (no_rows + yes_rows) == rows and no_cols == yes_cols == cols, "expected equal number of rows and cols"

    # >, <=
    g = air[1990 <= air["Year"]]
    L = air[1990 > air["Year"]]
    g_rows, g_cols = g.dim()
    L_rows, L_cols = L.dim()
    assert (L_rows + g_rows) == rows and L_cols == g_cols == cols, "expected equal number of rows and cols"

    # >=, <
    G = air[15 < air["DayofMonth"]]
    l = air[15 >= air["DayofMonth"]]
    G_rows, G_cols = G.dim()
    l_rows, l_cols = l.dim()
    assert (l_rows + G_rows) == rows and l_cols == G_cols == cols, "expected equal number of rows and cols"

if __name__ == "__main__":
    h2o.run_test(sys.argv, vec_scaler_comparisons)
    ###################################################################

    # H2OFrame[int] (column slice)
    res = 2 - iris
    res2 = res[0]
    assert abs(res2[3,:] - -2.6) < 1e-10 and abs(res2[17,:] - -3.1) < 1e-10 and abs(res2[24,:] - -2.8) < 1e-10, \
        "incorrect values"

    # H2OFrame[int,int]
    assert abs(res[13, 3] - 1.9) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res4 = res[12, 0:4]
    assert abs(res4[0,0] - -2.8) < 1e-10 and abs(res4[0,1] - -1.0) < 1e-10 and abs(res4[0,2] - 0.6) < 1e-10 and \
        abs(res4[0,3] - 1.9) < 1e-10, "incorrect values"

    # H2OFrame[slice, int]
    res5 = res[5:9, 1]
    assert abs(res5[0,:] - -1.9) < 1e-10 and abs(res5[1,:] - -1.4) < 1e-10 and abs(res5[2,:] - -1.4) < 1e-10 and \
           abs(res5[3,:] - -0.9) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res = iris * 2
    res6 = res[5:9, 0:4]
    assert abs(res6[0,0] - 10.8) < 1e-10 and abs(res6[1,1] - 6.8) < 1e-10 and abs(res6[2,2] - 3.0) < 1e-10 and \
           abs(res6[3,3] - 0.4) < 1e-10, "incorrect values"

if __name__ == "__main__":
    h2o.run_test(sys.argv, expr_slicing)
import sys
sys.path.insert(1, "../../")
import h2o

def frame_show(ip,port):
    """Import three sample datasets and display each of them with ``show()``.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    h2o.init(ip,port)

    dataset_paths = (
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv.zip",
        "smalldata/airlines/allyears2k.zip",
    )

    # All imports happen before the first show(), in the listed order.
    frames = [h2o.import_frame(path=h2o.locate(rel_path)) for rel_path in dataset_paths]

    for frame in frames:
        frame.show()

if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_show)
import sys
sys.path.insert(1, "../../")
import h2o
import random
import numpy as np

def quantile(ip,port):
    """Compare H2O's default quantiles of a random column against numpy.

    Builds a single-column frame of 1000 uniform random values, computes
    quantiles with ``H2OFrame.quantile()`` and with ``numpy.percentile`` and
    asserts that the nine values agree to within 1e-06.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    # NOTE(review): assumes quantile() with no arguments reports these nine
    # percentiles in this order -- confirm against the h2o defaults.
    percentiles = [1, 10, 25, 33.3, 50, 66.7, 75, 90, 99]

    samples = [[random.uniform(-10000,10000)] for _ in range(1000)]
    frame = h2o.H2OFrame(python_obj=samples)
    samples_np = np.array(samples)

    from_h2o = frame.quantile()
    from_numpy = np.percentile(samples_np, percentiles, axis=0)

    failure_template = (
        "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal quantile values between h2o "
        "and numpy"
    )
    for idx in range(9):
        # Column 1 of the h2o result is the value being compared; numpy
        # returns one single-element row per percentile.
        h2o_val = from_h2o[idx,1]
        np_val = from_numpy[idx][0]
        assert abs(h2o_val - np_val) < 1e-06, failure_template.format(h2o_val,np_val)

if __name__ == "__main__":
    h2o.run_test(sys.argv, quantile)
Exemple #6
0
    #
    # #   d. jagged
    # python_obj = np.array([[6,7,8,9,10], [1,2,3,4], [3,2,2]])
    # the_frame = h2o.H2OFrame(python_obj=python_obj)
    # # check_dims_values_jagged() TODO
    #
    # ## 6. pandas.DataFrame
    # #   a. single row
    # python_obj = pd.DataFrame({'foo' : pd.Series([1]), 'bar' : pd.Series([6]), 'baz' : pd.Series(["a"]) })
    # the_frame = h2o.H2OFrame(python_obj=python_obj)
    # h2o.check_dims_values(python_obj, the_frame, rows=1, cols=3)
    #
    # #   b. single column
    # python_obj = pd.DataFrame({'foo' : pd.Series([1, 2, 3, 7.8, 9])})
    # the_frame = h2o.H2OFrame(python_obj=python_obj)
    # h2o.check_dims_values(python_obj, the_frame, rows=5, cols=1)
    #
    # #   c. multiple rows, columns
    # python_obj = pd.DataFrame({'foo' : pd.Series([6,7,8,9,10]), 'bar' : pd.Series([1,2,3,4,5]),
    #                            'baz' : pd.Series([3,2,2,2,2])})
    # the_frame = h2o.H2OFrame(python_obj=python_obj)
    # h2o.check_dims_values(python_obj, the_frame, rows=5, cols=3)
    #
    # #   d. jagged
    # python_obj = pd.DataFrame({'foo' : pd.Series([6,7,8]), 'bar' : pd.Series([1,2,3,4,5]), 'baz' : pd.Series([3,2,2,2])})
    # the_frame = h2o.H2OFrame(python_obj=python_obj)
    # # check_dims_values_jagged() TODO

if __name__ == "__main__":
    h2o.run_test(sys.argv, to_H2OFrame)
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees, max_depth=max_depth, min_samples_leaf=min_rows, max_features=None)
    gbm_sci.fit(X_train,y_train)

    # Generate testing dataset
    test_rows = 2000
    test_cols = 10

    #  Generate variables V1, ... V10
    X_test = np.random.randn(test_rows, test_cols)

    #  y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_test = np.asarray([1 if rs > scipy.stats.chi2.ppf(0.5, 10) else -1 for rs in [sum(r) for r in np.multiply(X_test,X_test).tolist()]])

    # Score (AUC) the scikit gbm model on the test data
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:,1])

    # Compare this result to H2O
    train_h2o = H2OFrame(np.column_stack((y_train, X_train)).tolist())
    test_h2o = H2OFrame(np.column_stack((y_test, X_test)).tolist())

    gbm_h2o = h2o.gbm(x=train_h2o[1:], y=train_h2o["C1"], distribution=distribution, ntrees=ntrees, min_rows=min_rows, max_depth=max_depth, learn_rate=learn_rate, nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"

if __name__ == "__main__":
    h2o.run_test(sys.argv, bernoulli_synthetic_data_mediumGBM)
import h2o


def vi_toy_test(ip, port):
    """Train a random forest on the toy dataset and check its variable ranking.

    Column 6 is converted to a factor and used as the response; columns 0-5
    are the predictors.  The first column of the model's variable-importance
    table must list the variables in a fixed expected order.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    toy_data = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    predictors = toy_data[[0, 1, 2, 3, 4, 5]]
    response = toy_data[6]
    rf = h2o.random_forest(x=predictors, y=response,
                           ntrees=500, max_depth=20, nbins=100, seed=0)

    # One importance row per predictor (every column except the response).
    n_predictors = toy_data.ncol() - 1
    importance_rows = rf._model_json['output']['variable_importances'].cell_values
    ranking = [importance_rows[row][0] for row in range(n_predictors)]
    print(ranking)

    expected_ranking = ("V3", "V2", "V6", "V5", "V1", "V4")
    assert tuple(ranking) == expected_ranking, \
        "expected specific variable importance ranking"


if __name__ == "__main__":
    h2o.run_test(sys.argv, vi_toy_test)
Exemple #9
0
def parametersKmeans(ip, port):
    """Check that a k-means model rebuilt from its own parameters is identical.

    Trains k-means on the first four iris columns, reads the model's reported
    parameters back from its JSON, retrains with exactly those parameters and
    asserts that the within-cluster MSE values and the centers match.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    # Build the reference model and collect its actual parameter values.
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    param_dict = {
        param['label']: param['actual_value']
        for param in iris_km._model_json['parameters']
    }

    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    # sorted() returns a new list.  The previous ``.sort()`` call returned
    # None for both models, so the comparison was ``None == None`` and could
    # never fail.
    wmse = sorted(iris_km.within_mse())
    wmse_again = sorted(iris_km_again.within_mse())
    assert wmse == wmse_again, "expected wmse to be equal"

    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"


if __name__ == "__main__":
    h2o.run_test(sys.argv, parametersKmeans)
import sys, os
sys.path.insert(1, "../../")
import h2o
import random

def download_csv(ip,port):
    """Round-trip the iris dataset through ``h2o.download_csv`` and compare.

    The frame is downloaded to a temporary CSV in the working directory,
    re-imported, and one randomly chosen cell from the first four columns is
    compared between the original and the re-imported frame.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    csv_name = "iris_delete.csv"

    iris1 = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    try:
        h2o.download_csv(iris1, csv_name)
        iris2 = h2o.import_file(path=h2o.locate(csv_name))
    finally:
        # Remove the scratch file even when the download or re-import fails,
        # so a failed run does not leave it behind for the next one.
        if os.path.exists(csv_name):
            os.remove(csv_name)

    rand_row = random.randint(0,iris1.nrow()-1)
    rand_col = random.randint(0,3)
    original_value = iris1[rand_row, rand_col]
    downloaded_value = iris2[rand_row, rand_col]
    assert abs(original_value - downloaded_value) < 1e-10, \
        "Expected elements from the datasets to be the same, but got {0} and {1}".format(
            original_value, downloaded_value)
if __name__ == "__main__":
    h2o.run_test(sys.argv, download_csv)
Exemple #11
0
    myX = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX],
                      y=air_train[myY],
                      validation_x=air_valid[myX],
                      validation_y=air_valid[myY],
                      distribution="bernoulli",
                      ntrees=100,
                      max_depth=3,
                      learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_frame(
        h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    #Plot ROC for test set
    perf.plot(type="roc", **kwargs)


if __name__ == "__main__":
    h2o.run_test(sys.argv, plot_test)
Exemple #12
0
                             k=ncent,
                             user_points=centers_key,
                             max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())
        centers_key = centers.send_frame()

    # Log.info(paste("Run k-means with max_iter=miters"))
    all_fit = h2o.kmeans(x=ozone_h2o,
                         k=ncent,
                         user_points=start.eager(),
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(
    ), "expected the centers to be the same"

    # Log.info("Check cluster centers have converged")
    all_fit2 = h2o.kmeans(x=ozone_h2o,
                          k=ncent,
                          user_points=h2o.H2OFrame(
                              all_fit.centers()).send_frame(),
                          max_iterations=1)
    avg_change = sum([
        sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
        for c1, c2 in zip(all_fit.centers(), all_fit2.centers())
    ]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output'][
        'iterations'] < miters


if __name__ == "__main__":
    h2o.run_test(sys.argv, convergeKmeans)
Exemple #13
0
import sys
sys.path.insert(1, "../../../")
import h2o


def frame_as_list(ip, port):
    """Apply the modulus operator to frames and columns and display the results.

    Despite its name, this test exercises ``%`` on a whole frame, on a single
    column selected by index, and on a single column selected by name.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    prostate = h2o.import_frame(
        path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    # Call show() on the modulus result itself.  The previous
    # ``print(expr).show()`` form calls show() on print's return value (None)
    # under Python 3, and under Python 2 prints whatever show() returns.
    (prostate % 10).show()
    (prostate[4] % 10).show()

    airlines = h2o.import_frame(
        path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    (airlines["CRSArrTime"] % 100).show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_as_list)
Exemple #14
0
def frame_reducers(ip, port):
    """Compare H2OFrame min/max/sum reducers with their numpy equivalents.

    A 10x10 table of uniform random values is loaded into both an H2OFrame
    and a numpy array; each reducer must agree to within 1e-06.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    table = [[random.uniform(-10000, 10000) for _ in range(10)]
             for _ in range(10)]
    h2o_data = h2o.H2OFrame(python_obj=table)
    np_data = np.array(table)

    # (h2o reducer, numpy reducer, failure message template), in run order.
    reducer_checks = (
        (h2o_data.min, np.min,
         "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal min values between h2o and "
         "numpy"),
        (h2o_data.max, np.max,
         "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal max values between h2o and "
         "numpy"),
        (h2o_data.sum, np.sum,
         "check unsuccessful! h2o computed {0} and numpy computed {1}. expected equal sum values between h2o and "
         "numpy"),
    )
    for h2o_reduce, np_reduce, template in reducer_checks:
        h2o_val = h2o_reduce()
        num_val = np_reduce(np_data)
        assert abs(h2o_val - num_val) < 1e-06, template.format(h2o_val,num_val)
    #h2o.np_comparison_check(h2o.var(h2o_data), np.cov(np_data, rowvar=0, ddof=1), 10)
    #h2o.np_comparison_check(h2o.var(h2o_data), np.cov(np_data, rowvar=0, ddof=1), 10)


if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_reducers)
import sys
sys.path.insert(1, "../../../")
import h2o

def pca_scoring(ip, port):
    """Fit a 4-component PCA on USArrests and project the training data.

    Progress messages use the single-argument ``print(...)`` call form, which
    prints the same text on Python 2 and Python 3; the bare ``print "..."``
    statement is a SyntaxError on Python 3.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    print("Importing arrests.csv data...")
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))

    print("Run PCA with transform = 'DEMEAN'")
    fitH2O = h2o.prcomp(x=arrestsH2O[0:4], k = 4, transform = "DEMEAN")
    # TODO: fitH2O.show()

    print("Project training data into eigenvector subspace")
    predH2O = fitH2O.predict(arrestsH2O)
    print("H2O Projection:")
    print(predH2O.head())

if __name__ == "__main__":
    h2o.run_test(sys.argv, pca_scoring)
Exemple #16
0
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = h2o.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_frame(url)
        n = airlines_h2o.nrow()
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                                 x=airlines_h2o[myX],
                                 k=7,
                                 init="Furthest",
                                 max_iterations=10,
                                 standardize=True)
        print airlines_km
    else:
        print "Not running on H2O internal network.  No access to HDFS."


if __name__ == "__main__":
    h2o.run_test(sys.argv, hdfs_kmeans_airlines)
  iris_sci = iris_sci[:,0:4]

  s =[[4.9,3.0,1.4,0.2],
  [5.6,2.5,3.9,1.1],
  [6.5,3.0,5.2,2.0]]

  start = h2o.H2OFrame(s)
  start_key = start.send_frame()

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print "Cluster centers from H2O:"
  h2o_centers = h2o_km.centers()
  print h2o_centers

  # Log.info("Cluster centers from scikit:")
  print "Cluster centers from scikit:"
  sci_centers = sci_km.cluster_centers_.tolist()
  print sci_centers

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter,scenter):
      assert (hpoint- spoint) < 1e-10, "expected centers to be the same"

if __name__ == "__main__":
  h2o.run_test(sys.argv, iris_h2o_vs_sciKmeans)
Exemple #18
0
import sys, os
sys.path.insert(1, "../../")
import h2o
from h2o.model.binomial import H2OBinomialModel


def save_load_model(ip, port):
    """Save a binomial GLM to disk, load it back and check the loaded type.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    # The response is made categorical before fitting the binomial GLM.
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"],
                           x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial",
                           alpha=[0.5])

    # force=True is passed so an existing "delete_model" does not block the save.
    model_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    the_model = h2o.load_model(model_path)

    # Message typo fixed: "Expected and" -> "Expected an".
    assert isinstance(the_model, H2OBinomialModel), \
        "Expected an H2OBinomialModel, but got {0}".format(the_model)


if __name__ == "__main__":
    h2o.run_test(sys.argv, save_load_model)
import sys

sys.path.insert(1, "../../../")
import h2o


def iris_nfolds(ip, port):
    """Train random forests on iris with 5-fold cross-validation.

    The first model uses ``nfolds`` alone.  The second passes ``nfolds``
    together with an explicit validation set and must be accepted; an
    ``EnvironmentError`` from that call fails the test.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4], validation_y=iris[4], validation_x=iris[0:4], ntrees=50, nfolds=5)
    except EnvironmentError as err:
        # The old message ("expected an error") said the opposite of what
        # this branch checks; report the unexpected error instead.
        assert False, "expected nfolds together with a validation set to be accepted, " \
                      "but got an error: {0}".format(err)


if __name__ == "__main__":
    h2o.run_test(sys.argv, iris_nfolds)
Exemple #20
0
def _expect_training_failure(train, failure_message):
    """Call ``train()`` and raise AssertionError unless it raises an exception."""
    try:
        train()
    except Exception:
        # Training was rejected, which is the outcome the caller expects.
        return
    # Raised outside the try block so it cannot be swallowed by the handler.
    raise AssertionError(failure_message)


def nb_init_err(ip, port):
    """Check that naive Bayes rejects invalid training parameters.

    Three invalid configurations are tried: a negative Laplace smoothing
    parameter, a zero minimum standard deviation, and a non-categorical
    response column.  Each must raise.

    The previous version wrapped ``assert False`` in ``try`` / bare
    ``except: pass``, which swallowed the AssertionError as well, so the test
    passed even when training succeeded.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    print("Importing iris_wheader.csv data...\n")
    iris = h2o.upload_file(h2o.locate("smalldata/iris/iris_wheader.csv"))
    # Previously ``iris.describe`` without parentheses, which never ran.
    iris.describe()

    print("Laplace smoothing parameter is negative")
    _expect_training_failure(
        lambda: h2o.naive_bayes(x=iris[0:4], y=iris[4], laplace=-1),
        "Expected naive bayes algo to fail on negative laplace training parameter")

    print("Minimum standard deviation is zero")
    _expect_training_failure(
        lambda: h2o.naive_bayes(x=iris[0:4], y=iris[4], min_sdev=0),
        "Expected naive bayes algo to fail on min_sdev = 0")

    print("Response column is not categorical")
    # NOTE(review): min_sdev=0 is also passed here, so this call may fail for
    # that reason rather than because of the numeric response -- confirm
    # whether min_sdev should be dropped from this case.
    _expect_training_failure(
        lambda: h2o.naive_bayes(x=iris[0:3], y=iris[3], min_sdev=0),
        "Expected naive bayes algo to fail on response not categorical")


if __name__ == "__main__":
    h2o.run_test(sys.argv, nb_init_err)
Exemple #21
0
import sys

sys.path.insert(1, "../../../")
import h2o


def nb_prostate(ip, port):
    """Train naive Bayes on the prostate dataset and predict on the training data.

    CAPSULE, RACE, DCAPS and DPROS are converted to categorical columns
    before training.  Column 1 is the response and columns 2-8 are the
    predictors.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    print("Importing prostate.csv data...")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DCAPS, and DPROS to categorical")
    # Each column is replaced by its own factor version.  RACE was previously
    # overwritten with CAPSULE (copy-paste slip), which put a copy of the
    # response among the predictors.
    for column in ("CAPSULE", "RACE", "DCAPS", "DPROS"):
        prostate[column] = prostate[column].asfactor()

    # The message uses 1-based, inclusive indices; the slices below are 0-based.
    print("Compare with Naive Bayes when x = 3:9, y = 2")
    prostate_nb = h2o.naive_bayes(x=prostate[2:9], y=prostate[1], laplace=0)
    prostate_nb.show()

    print("Predict on training data")
    prostate_pred = prostate_nb.predict(prostate)
    prostate_pred.head()


if __name__ == "__main__":
    h2o.run_test(sys.argv, nb_prostate)
Exemple #22
0
import sys

sys.path.insert(1, "../../")
import h2o


def ls_test(ip, port):
    """Import the iris dataset, then call ``h2o.ls()``.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    iris_path = h2o.locate("smalldata/iris/iris.csv")
    # The imported frame stays bound to a local while ls() runs.
    # NOTE(review): presumably so the cluster holds at least one object to list.
    imported_frame = h2o.import_file(path=iris_path)

    h2o.ls()


if __name__ == "__main__":
    h2o.run_test(sys.argv, ls_test)
import sys
sys.path.insert(1, "../../")
import h2o
from h2o.frame import H2OVec

def vec_as_list(ip,port):
    """Check ``h2o.as_list`` on single columns, plain and derived from ``2 - iris``.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    def rows_match(rows, expected):
        # expected is a sequence of (row index, value); the value is read
        # from the first element of each listed row.
        return all(abs(rows[idx][0] - value) < 1e-10 for idx, value in expected)

    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    first_col = h2o.as_list(iris[0])
    assert rows_match(first_col, ((3, 4.6), (5, 5.4), (9, 4.9))), \
        "incorrect values"

    flipped = 2 - iris
    flipped_c0 = h2o.as_list(H2OVec(name="C0", expr=flipped[0]))
    assert rows_match(flipped_c0, ((3, -2.6), (17, -3.1), (24, -2.8))), \
        "incorrect values"

    flipped_c1 = h2o.as_list(H2OVec(name="C1", expr=flipped[1]))
    assert rows_match(flipped_c1, ((3, -1.1), (5, -1.9), (9, -1.1))), \
        "incorrect values"

if __name__ == "__main__":
    h2o.run_test(sys.argv, vec_as_list)
Exemple #24
0
import sys
sys.path.insert(1, "../../")
import h2o


def rep_len_check(ip, port):
    """Check ``rep_len`` on a single iris column extended to 301 rows.

    The result must have 301 rows, and every element must equal the element
    at its index modulo 150.

    :param ip: unused in this body.
    :param port: unused in this body.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # data is single column (vec)
    repeated = iris[0].rep_len(length_out=301)
    row_count = repeated.nrow()
    assert row_count == 301, \
        "Expected an H2OVec with 301 rows, but got {0} rows".format(row_count)

    for position in range(len(repeated)):
        actual = repeated[position]
        wrapped = repeated[position % 150]
        assert actual == wrapped, "Expected {0}, but got {1}".format(
            wrapped, actual)

    # data is frame
    #TODO: there's a NPE bug here
    #fr = h2o.rep_len(iris, length_out=7)
    #assert fr.nrow() == 150 and fr.ncol() == 7, "Expected an H2OFrame with 150 rows and 7 columns, but got {0} rows and {1} " \
    #                                            "cols".format(fr.nrow(), fr.ncol())


if __name__ == "__main__":
    h2o.run_test(sys.argv, rep_len_check)
Exemple #25
0
    #    res = iris ** iris[0:3]
    #    res.show()
    #    assert False, "expected error. frames are different dimensions."
    #except EnvironmentError:
    #    pass

    # LHS: H2OFrame, RHS: H2OVec
    #try:
    #    res = iris ** iris[0]
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    # LHS: H2OFrame, RHS: scaler
    res = 1.2 ** iris[2]
    res2 = iris ** res[63,:]
    res2.show()

    # LHS: H2OFrame, RHS: scaler
    res = iris ** 2
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    for x, y in zip([res[c].sum() for c in range(cols-1)], [1800.33, 709.32, 382.69, 30.74]):
        assert abs(x - y) < 1e-2,  "expected same values"

    ###################################################################

if __name__ == "__main__":
    h2o.run_test(sys.argv, binop_pow)
Exemple #26
0
import sys
sys.path.insert(1, "../../")
import h2o


def screeplot_test(ip, port):
    """Fit a 4-component standardized PCA and draw both scree plot types.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Forwarded unchanged to every screeplot() call.
    plot_options = {'server': True}

    coast_path = h2o.locate("smalldata/pca_test/AustraliaCoast.csv")
    australia = h2o.upload_file(coast_path)
    australia_pca = h2o.prcomp(x=australia[0:8], k=4, transform="STANDARDIZE")

    for plot_type in ("barplot", "lines"):
        australia_pca.screeplot(type=plot_type, **plot_options)


if __name__ == "__main__":
    h2o.run_test(sys.argv, screeplot_test)
    print "H2O Singular Values: {0}".format(h2o_d)
    for r, h in zip(r_d, h2o_d): assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print "Compare right singular vectors (V)"
    h2o_v = fitH2O._model_json['output']['v']
    r_v = [[-0.04239181, 0.01616262, -0.06588426, 0.99679535],
           [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
           [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
           [-0.10963744, -0.12725666, -0.98347101, -0.06760284]]
    print "R Right Singular Vectors: {0}".format(r_v)
    print "H2O Right Singular Vectors: {0}".format(h2o_v)
    for rl, hl in zip(r_v, h2o_v):
        for r, h in zip(rl, hl): assert abs(abs(r) - abs(h)) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print "Compare left singular vectors (U)"
    h2o_u = h2o.as_list(h2o.get_frame(fitH2O._model_json['output']['u_key']['name']), use_pandas=False)
    h2o_u.pop(0)
    r_u = [[-0.1716251, 0.096325710, 0.06515480, 0.15369551],
           [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
           [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
           [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
           [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
           [-0.1558794, -0.064555293, -0.28288280, -0.11797419]]
    print "R Left Singular Vectors: {0}".format(r_u)
    print "H2O Left Singular Vectors: {0}".format(h2o_u)
    for rl, hl in zip(r_u, h2o_u):
        for r, h in zip(rl, hl): assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

if __name__ == "__main__":
    h2o.run_test(sys.argv, svd_1_golden)
Exemple #28
0
    assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format(
        c4_imputed)

    # mode-categorical
    h2o_data = h2o.H2OFrame(python_obj=data)
    h2o_data.impute(column="C5", method="mode")
    c5_imputed = h2o_data[4, 4]
    assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format(
        c5_imputed)

    # mode-numeric
    h2o_data = h2o.H2OFrame(python_obj=data)
    h2o_data.impute(column="C6", method="mode")
    c6_imputed = h2o_data[5, 5]
    assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format(
        c6_imputed)

    # mean-group by C7
    h2o_data = h2o.H2OFrame(python_obj=data)
    h2o_data.impute(column="C3", method="mean", by="C7")
    imputed1 = h2o_data[2, 2]
    imputed2 = h2o_data[3, 2]
    assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format(
        imputed1)
    assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format(
        imputed2)


if __name__ == "__main__":
    h2o.run_test(sys.argv, impute2)
            if method == 3:
                s = []
                for p in range(kwargs['k']):
                    s.append([random.uniform(train[c].mean()-100,train[c].mean()+100) for c in x])
                start = h2o.H2OFrame(python_obj=s)
                kwargs['user_points'] = start
            else:
                kwargs['init'] = ["Furthest","Random", "PlusPlus"][method]
        if random.randint(0,1): kwargs['seed'] = random.randint(1,10000)

        # display the parameters and their corresponding values
        print "-----------------------"
        print "x: {0}".format(x)
        for k, v in zip(kwargs.keys(), kwargs.values()):
            if k == 'user_points':
                print k + ": "
                start.show()
            else:
                print k + ": {0}".format(v)
        h2o.kmeans(x=train[x],  **kwargs)
        print "-----------------------"

    print "Import and data munging..."
    ozone = h2o.import_frame(path=h2o.locate("smalldata/glm_test/ozone.csv"))

    for i in range(50):
        attack(ozone, random.sample([0,1,2,3],random.randint(1,4)))

if __name__ == "__main__":
    h2o.run_test(sys.argv, random_attack)
Exemple #30
0
import sys
sys.path.insert(1, "../../")
import h2o
import random


def create_frame_test(ip, port):
    """Create a frame of random size and check its reported dimensions.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # REALLY basic test TODO: add more checks
    want_rows = random.randint(1, 1000)
    want_cols = random.randint(1, 1000)

    frame = h2o.create_frame(rows=want_rows, cols=want_cols)

    rows_ok = frame.nrow() == want_rows
    assert rows_ok and frame.ncol() == want_cols, \
        "Expected {0} rows and {1} cols, but got {2} rows and {3} cols.".format(
            want_rows, want_cols, frame.nrow(), frame.ncol())


if __name__ == "__main__":
    h2o.run_test(sys.argv, create_frame_test)
import sys
sys.path.insert(1, "../../")
import h2o

def frame_as_list(ip,port):
    """Check ``h2o.as_list`` on three whole frames against known cell values.

    For each dataset, the first three cells of one chosen row are compared
    with expected values to within 1e-10.

    :param ip: cluster address handed to ``h2o.init``.
    :param port: cluster port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))

    # (frame, row to inspect, expected values of that row's first three cells)
    cases = (
        (iris, 8, (4.4, 2.9, 1.4)),
        (prostate, 6, (7, 0, 68)),
        (airlines, 3, (1987, 10, 18)),
    )
    for frame, row, expected in cases:
        listed = h2o.as_list(frame)
        assert all(abs(listed[row][col] - want) < 1e-10
                   for col, want in enumerate(expected)), \
            "incorrect values"

if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_as_list)
Exemple #32
0
#----------------------------------------------------------------------
# Try to slice by using != factor_level
#----------------------------------------------------------------------

import sys
sys.path.insert(1, "../../")
import h2o

def not_equal_factor(ip,port):
    """Slice the airlines frame with ``!=`` and ``==`` on a factor level.

    The rows whose ``Origin`` is not "SFO" and the rows whose ``Origin`` is
    "SFO" must together account for every row of the frame, and both slices
    must keep the full set of columns.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    air = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    total_rows, total_cols = air.dim()

    # Example 1: select all flights not departing from SFO, then its complement.
    away_from_sfo = air[air["Origin"] != "SFO"]
    from_sfo = air[air["Origin"] == "SFO"]

    away_rows, away_cols = away_from_sfo.dim()
    sfo_rows, sfo_cols = from_sfo.dim()

    rows_partition = (away_rows + sfo_rows) == total_rows
    cols_preserved = away_cols == sfo_cols == total_cols
    assert rows_partition and cols_preserved, "dimension mismatch"

if __name__ == "__main__":
    h2o.run_test(sys.argv, not_equal_factor)
    assert set(['a', 'b', 'c']) == set(levels), \
        "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels)
    assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(nlevels)
    assert iris[0,4] == 'a'

    levels = iris[4].levels()
    nlevels = iris[4].nlevels()
    assert set(['a', 'b', 'c']) == set(levels), \
        "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels)
    assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(nlevels)

    iris[4] = iris[4].setLevel(level='b')
    levels = iris.levels(col=4)
    nlevels = iris.nlevels(col=4)
    assert set(['a', 'b', 'c']) == set(levels), \
        "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels)
    assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(nlevels)
    assert iris[0,4] == 'b'

    levels = iris[1].levels()
    nlevels = iris[1].nlevels()
    assert levels == None, "Expected levels to be None, but got {0}".format(levels)
    assert nlevels == 0, "Expected nlevels to be 0, but got {0}".format(nlevels)

    one_column_frame = iris[4]
    one_column_frame = one_column_frame.setLevel(level='c')
    assert one_column_frame[0,0] == 'c'

if __name__ == "__main__":
    h2o.run_test(sys.argv, levels_nlevels_setlevel_setLevels_test)
Exemple #34
0
    imbalanced_perf.show()

    balanced = h2o.random_forest(x=covtype[0:54],
                                 y=covtype[54],
                                 ntrees=10,
                                 balance_classes=True,
                                 nfolds=3)
    balanced_perf = balanced.model_performance(covtype)
    balanced_perf.show()

    ##compare error for class 6 (difficult minority)
    class_6_err_imbalanced = imbalanced_perf.confusion_matrix(
    ).cell_values[5][7]
    class_6_err_balanced = balanced_perf.confusion_matrix().cell_values[5][7]

    print("--------------------")
    print("")
    print("class_6_err_imbalanced")
    print(class_6_err_imbalanced)
    print("")
    print("class_6_err_balanced")
    print(class_6_err_balanced)
    print("")
    print("--------------------")

    assert class_6_err_imbalanced >= 0.9 * class_6_err_balanced, "balance_classes makes it at least 10% worse!"


if __name__ == "__main__":
    h2o.run_test(sys.argv, imbalanced)
  df_hex.summary()

  assert (not df_hex['h1'].isfactor())
  assert (df_hex['h2'].isfactor())
  assert (not df_hex['h3'].isfactor())

  df_hex['h1'] = df_hex['h1'].asfactor()
  df_hex['h2'] = df_hex['h2'].asfactor()
  df_hex['h3'] = df_hex['h3'].asfactor()

  df_hex.show()
  df_hex.summary()

  assert (df_hex['h1'].isfactor())
  assert (df_hex['h2'].isfactor())
  assert (df_hex['h3'].isfactor())

  df_hex['h1'] = df_hex['h1'].asnumeric()
  df_hex['h2'] = df_hex['h2'].asnumeric()
  df_hex['h3'] = df_hex['h3'].asnumeric()

  df_hex.show()
  df_hex.summary()

  assert (not df_hex['h1'].isfactor())
  assert (not df_hex['h2'].isfactor())
  assert (not df_hex['h3'].isfactor())

if __name__ == "__main__":
    h2o.run_test(sys.argv, continuous_or_categorical)
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1,1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        rf = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments",
                              training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True

    # # 4. fold_column and fold_assignment both specified
    # try:
    #     rf = h2o.deeplearning(y=cars[response_col], x=cars[predictors], fold_assignment="Random",
    #                           fold_column="fold_assignments", training_frame=cars)
    #     assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
    # except EnvironmentError:
    #     assert True

if __name__ == "__main__":
    h2o.run_test(sys.argv, cv_carsDL)
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    hh_imbalanced = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=False, nfolds=3, distribution="multinomial")
    hh_imbalanced_perf = hh_imbalanced.model_performance(covtype)
    hh_imbalanced_perf.show()

    hh_balanced = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=True, seed=123, nfolds=3, distribution="multinomial")
    hh_balanced_perf = hh_balanced.model_performance(covtype)
    hh_balanced_perf.show()

    #compare error for class 6 (difficult minority)
    class_6_err_imbalanced = hh_imbalanced_perf.confusion_matrix().cell_values[5][7]
    class_6_err_balanced = hh_balanced_perf.confusion_matrix().cell_values[5][7]

    print("--------------------")
    print("")
    print("class_6_err_imbalanced")
    print(class_6_err_imbalanced)
    print("")
    print("class_6_err_balanced")
    print(class_6_err_balanced)
    print("")
    print("--------------------")

    assert class_6_err_imbalanced >= 0.90*class_6_err_balanced, "balance_classes makes it at least 10% worse!"

if __name__ == "__main__":
    h2o.run_test(sys.argv, imbalancedGBM)
                                  family="binomial",
                                  link="logit")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
        h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

    print("POISSON: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[2:9],
                                    y=h2o_data[1],
                                    family="poisson")
    h2o_model_specified = h2o.glm(x=h2o_data[2:9],
                                  y=h2o_data[1],
                                  family="poisson",
                                  link="log")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
        h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

    print("GAMMA: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[3:9],
                                    y=h2o_data[2],
                                    family="gamma")
    h2o_model_specified = h2o.glm(x=h2o_data[3:9],
                                  y=h2o_data[2],
                                  family="gamma",
                                  link="inverse")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
        h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"


if __name__ == "__main__":
    h2o.run_test(sys.argv, link_correct_default)
import numpy as np


def wide_dataset_large(ip, port):
    """Fit a binomial GLM on the wide Arcene data and sanity-check its AUC.

    The training and validation sets are each assembled from a labels file
    and a features file, a GLM is trained on the training frame, and the
    model's AUC on the validation frame must exceed 0.5.
    """

    def load_arcene(labels_path, features_path):
        # Labels equal to -1 are recoded to 0, every other label to 1; the
        # response is then placed in column 0, ahead of the features.
        response = np.genfromtxt(h2o.locate(labels_path), delimiter=" ")
        response = np.where(response == -1, 0, 1)
        features = np.genfromtxt(h2o.locate(features_path), delimiter=" ")
        return h2o.H2OFrame(np.column_stack((response, features)).tolist())

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = load_arcene("smalldata/arcene/arcene_train_labels.labels",
                              "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    glm_fit = h2o.glm(x=train_frame[1:3250],
                      y=train_frame[0].asfactor(),
                      family="binomial",
                      lambda_search=False,
                      alpha=[1])

    print("Test model on validation set.")
    valid_frame = load_arcene("smalldata/arcene/arcene_valid_labels.labels",
                              "smalldata/arcene/arcene_valid.data")
    # The predictions are not inspected; the call only has to succeed.
    scored = glm_fit.predict(valid_frame)

    print("Check performance of predictions.")
    valid_perf = glm_fit.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert valid_perf.auc() > 0.5, "predictions should be better then pure chance"


if __name__ == "__main__":
    h2o.run_test(sys.argv, wide_dataset_large)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, family=family,
                      fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments",
                      family=family, training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True

    # # 4. fold_column and fold_assignment both specified
    # try:
    #     glm = h2o.glm(y=cars[response_col], x=cars[predictors], fold_assignment="Random", fold_column="fold_assignments",
    #                   family=family, training_frame=cars)
    #     assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
    # except EnvironmentError:
    #     assert True

if __name__ == "__main__":
    h2o.run_test(sys.argv, cv_carsGLM)
def link_functions_binomial(ip,port):
    """Compare an H2O binomial GLM (logit link) against statsmodels.

    Both libraries fit the prostate data with CAPSULE as the response. The
    ratio residual_deviance / null_deviance is computed for each model, and
    the H2O ratio may exceed the statsmodels ratio by less than 0.01.

    Parameters:
        ip: address of the H2O instance, passed to ``h2o.init``.
        port: port of the H2O instance, passed to ``h2o.init``.

    Raises:
        AssertionError: if the H2O deviance ratio is worse than the
            statsmodels ratio by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip,port)

    print("Read in prostate data.")
    data_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=data_path)
    h2o_data.head()

    # Read the same CSV locally for statsmodels. The archive and its member
    # are closed as soon as pandas has consumed them, and `.values` is used
    # because DataFrame.as_matrix() no longer exists in current pandas.
    with zipfile.ZipFile(data_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    sm_data_response = sm_data[:,2]
    sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model._model_json['output']['residual_deviance'] / h2o_model._model_json['output']['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # Deliberately one-sided: a lower (better) H2O ratio always passes.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"

if __name__ == "__main__":
	h2o.run_test(sys.argv, link_functions_binomial)


    dataset2_python_weighted = copy.deepcopy(dataset2_python)
    [r.append(0.8) for r in dataset2_python_weighted]

    ##### combine dataset1 and dataset2
    combined_dataset_python = []
    [combined_dataset_python.append(r) for r in dataset1_python_weighted]
    [combined_dataset_python.append(r) for r in dataset2_python_weighted]
    combined_dataset_h2o = h2o.H2OFrame(python_obj=combined_dataset_python)
    combined_dataset_h2o.setNames(["response", "p1", "p2", "p3", "weights"])

    ##### recompute the variable importances. the relative order should be the same as above.
    model_combined_dataset = h2o.deeplearning(
        x=combined_dataset_h2o[["p1", "p2", "p3"]],
        y=combined_dataset_h2o["response"],
        training_frame=combined_dataset_h2o,
        variable_importances=True,
        weights_column="weights",
        hidden=[1],
        reproducible=True,
        seed=1234,
        activation="Tanh")

    varimp_combined = tuple(
        [p[0] for p in model_combined_dataset.varimp(return_list=True)])
    assert varimp_combined == ('p3', 'p1', 'p2'), "Expected the following relative variable importance on the combined " \
                                                  "dataset: ('p3', 'p1', 'p2'), but got: {0}".format(varimp_combined)


if __name__ == "__main__":
    h2o.run_test(sys.argv, weights_vi)
        assert False, "expected error. objects of different dimensions not supported."
    except EnvironmentError:
        pass

    #vec/vec
    res = iris[0] * iris[1]
    res.show()
    assert abs(sum([res[i].eager() for i in range(rows)]) - 2670.98) < 1e-2, "expected different column sum"

    res = iris[0] * iris[1] * iris[2] * iris[3]
    res.show()
    assert abs(sum([res[i].eager() for i in range(rows)]) - 16560.42) < 1e-2, "expected different sum"

    # frame/frame
    res = iris * iris
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] * iris[1:3]
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    try:
        res = iris * iris[0:3]
        assert False, "expected error. frames are different dimensions."
    except EnvironmentError:
        pass

if __name__ == "__main__":
    h2o.run_test(sys.argv, binop_star)
import sys
sys.path.insert(1, "../../../")
import h2o


def cupMediumGBM(ip, port):
    """Train a small bernoulli GBM on the cup98 learning set.

    The learning and validation CSVs are both imported, ``TARGET_B`` is
    converted to a factor, and a 5-tree GBM is built on every column except
    the unnamed column, ``TARGET_D``, ``TARGET_B`` and ``CONTROLN``. The test
    passes if the model builds without raising.
    """
    # Connect to h2o
    h2o.init(ip, port)

    train = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98LRN_z.csv"))
    # The validation file is imported but not used by the model build below.
    test = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98VAL_z.csv"))

    train["TARGET_B"] = train["TARGET_B"].asfactor()

    # Train H2O GBM Model:
    predictor_names = train.names()
    excluded = ('', "TARGET_D", "TARGET_B", "CONTROLN")
    for name in excluded:
        # list.remove raises ValueError if an expected column is absent.
        predictor_names.remove(name)

    model = h2o.gbm(x=train[predictor_names],
                    y=train["TARGET_B"],
                    distribution="bernoulli",
                    ntrees=5)


if __name__ == "__main__":
    h2o.run_test(sys.argv, cupMediumGBM)
import sys
sys.path.insert(1, "../../")
import h2o

def https_import(ip,port):
    """Import a zipped CSV over HTTPS and display the resulting frame."""
    # Connect to h2o
    h2o.init(ip,port)

    prostate_url = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip"
    remote_frame = h2o.import_frame(path=prostate_url)
    remote_frame.show()

if __name__ == "__main__":
    h2o.run_test(sys.argv, https_import)
Exemple #46
0
    s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]]

    start = h2o.H2OFrame(s)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4],
                        k=3,
                        user_points=start_key,
                        standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print "Cluster centers from H2O:"
    h2o_centers = h2o_km.centers()
    print h2o_centers

    # Log.info("Cluster centers from scikit:")
    print "Cluster centers from scikit:"
    sci_centers = sci_km.cluster_centers_.tolist()
    print sci_centers

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert (hpoint - spoint) < 1e-10, "expected centers to be the same"


if __name__ == "__main__":
    h2o.run_test(sys.argv, iris_h2o_vs_sciKmeans)
sys.path.insert(1, "../../")
import h2o

def hit_ratio_test(ip,port):
    """Build a multinomial GBM and display its hit-ratio tables.

    The model predicts ``fDayOfWeek`` on the airlines data. Hit-ratio tables
    are shown for the training metrics, the validation metrics, and a
    separate ``model_performance`` call on the test frame. Note that the
    validation and test frames are both imported from AirlinesTest.csv.zip.
    """
    air_train = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_valid = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    predictor_names = ("Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed", "fDayofMonth", "fMonth")
    response_name = "fDayOfWeek"

    # A fresh list is passed to each slice, as in the original call.
    gbm_mult = h2o.gbm(x=air_train[list(predictor_names)],
                       y=air_train[response_name].asfactor(),
                       validation_x=air_valid[list(predictor_names)],
                       validation_y=air_valid[response_name].asfactor(),
                       distribution="multinomial")

    # Training metrics, then validation metrics, straight from the model.
    gbm_mult.hit_ratio_table(train=True).show()
    gbm_mult.hit_ratio_table(valid=True).show()

    # Metrics computed on demand for the held-out frame.
    test_perf = gbm_mult.model_performance(air_test)
    test_perf.hit_ratio_table().show()

if __name__ == "__main__":
    h2o.run_test(sys.argv, hit_ratio_test)
    print
    print "======================================================================"
    print "============================== Binomial =============================="
    print "======================================================================"
    for i in range(10):
        attack(pros_train, pros_valid,
               random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1)

    print
    print "======================================================================"
    print "============================== Gaussian =============================="
    print "======================================================================"
    for i in range(10):
        attack(cars_train, cars_valid,
               random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1)

    print
    print "======================================================================"
    print "============================= Multinomial ============================"
    print "======================================================================"
    cars_train[2] = cars_train[2].asfactor()
    cars_valid[2] = cars_valid[2].asfactor()
    for i in range(10):
        attack(cars_train, cars_valid,
               random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2)


if __name__ == "__main__":
    h2o.run_test(sys.argv, random_attack)
  

  # Log.info("Importing covtype.20k.data...\n")
  covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
  #
  myY = 54
  myX = [x for x in range(0,54) if x not in [20,28]]

  # Set response to be indicator of a particular class
  res_class = random.randint(1,4)
  # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
  covtype[54] = (covtype[54] == res_class)

  #covtype.summary()

  # L2: alpha = 0, lambda = 0
  covtype_mod1 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[0], Lambda=[0])
  covtype_mod1.show()

  # Elastic: alpha = 0.5, lambda = 1e-4
  covtype_mod2 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[0.5], Lambda=[1e-4])
  covtype_mod2.show()

  # L1: alpha = 1, lambda = 1e-4
  covtype_mod3 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[1], Lambda=[1e-4])
  covtype_mod3.show()

if __name__ == "__main__":
  h2o.run_test(sys.argv, covtype)

Exemple #50
0
sys.path.insert(1, "../../../")
import h2o


def pca_prostate(ip, port):
    """Run PCA on the prostate data and print the head of the projection.

    CAPSULE, RACE, DPROS and DCAPS are converted to factors, a 3-component
    PCA is fitted on the column slice ``prostate[2:9]`` with no transform
    and the "Power" method, and the fitted model is used to project the
    frame.

    Parameters:
        ip: address of the H2O instance, passed to ``h2o.init``.
        port: port of the H2O instance, passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    # print() with a single argument behaves the same on Python 2 and 3.
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # The message states the parameters actually passed below; it used to
    # claim transform = 'STANDARDIZE' while the call used "NONE".
    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', pca_method = 'Power'")
    fitPCA = h2o.prcomp(x=prostate[2:9],
                        k=3,
                        transform="NONE",
                        pca_method="Power")
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    print(pred.head())


if __name__ == "__main__":
    h2o.run_test(sys.argv, pca_prostate)
    h2o_data_zero_weights = h2o.cbind(h2o_data, h2o_zero_weights)
    h2o_data_zeros_removed = h2o_data[h2o_zero_weights["weights"] == 1]

    print "Checking that using some zero weights is equivalent to removing those observations:"
    print
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights)

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0,1) else [2] for r in range(100)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.setNames(["weights"])
    h2o_data_doubled_weights = h2o.cbind(h2o_data, h2o_doubled_weights)

    doubled_data = copy.deepcopy(data)
    for d, w in zip(data,doubled_weights):
        if w[0] == 2: doubled_data.append(d)
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)

    print "Checking that doubling some weights is equivalent to doubling those observations:"
    print
    check_same(h2o_data_doubled, h2o_data_doubled_weights)

    # TODO: random weights

    # TODO: all zero weights???

    # TODO: negative weights???

if __name__ == "__main__":
    h2o.run_test(sys.argv, weights_check)
Exemple #52
0
import sys
sys.path.insert(1, "../../")
import h2o


def vec_show(ip, port):
    """Exercise ``show()`` on a frame and on columns of a derived frame.

    Displays the iris frame, then the first two columns of ``2 - iris``,
    then the third column of the original frame. The test passes if none of
    the ``show()`` calls raises.

    Parameters:
        ip: address of the H2O instance, passed to ``h2o.init``.
        port: port of the H2O instance, passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    # print() with a single argument behaves the same on Python 2 and 3.
    print("iris:")
    iris.show()

    ###################################################################

    # Columns taken from the result of a scalar-minus-frame expression.
    res = 2 - iris
    res2 = res[0]
    print("res2:")
    res2.show()

    res3 = res[1]
    print("res3:")
    res3.show()

    # A column taken directly from the imported frame.
    iris[2].show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, vec_show)
import sys
sys.path.insert(1, "../../")
import h2o

def show_jira(ip, port):
    """Build a tiny mixed numeric/string frame, rename its columns, show it."""
    h2o.init(ip, port)

    # Two rows, each a (number, string) pair.
    rows = [[1, 'a'],[0, 'b']]
    frame = h2o.H2OFrame(python_obj=rows)

    column_names = ['response', 'predictor']
    frame.setNames(column_names)
    frame.show()

if __name__ == "__main__":
    h2o.run_test(sys.argv, show_jira)
Exemple #54
0
import sys
sys.path.insert(1, "../../../")
import h2o

def demo_gbm(ip,port):
    """Run the packaged GBM demo non-interactively with ``test=True``."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    # Execute gbm demo
    demo_options = dict(func="gbm", interactive=False, test=True)
    h2o.demo(**demo_options)

if __name__ == "__main__":
    h2o.run_test(sys.argv, demo_gbm)
                       [8,19,61,20.1],
                       [16,256,69,9.7],
                       [11,290,66,9.2],
                       [14,274,68,10.9]]
    for i in random.sample(range(0,ncent-1), nempty):
        initial_centers[i] = [100*i for z in range(1,len(initial_centers[0])+1)]

    initial_centers_h2o = h2o.H2OFrame(initial_centers)
    initial_centers_h2o_key = initial_centers_h2o.send_frame()
    initial_centers_sci = np.asarray(initial_centers)

    #Log.info("Initial cluster centers:")
    print "H2O initial centers:"
    initial_centers_h2o.show()
    print "scikit initial centers:"
    print initial_centers_sci

    # H2O can handle empty clusters and so can scikit
    #Log.info("Check that H2O can handle badly initialized centers")
    km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1)
    km_sci.fit(preprocessing.scale(ozone_sci))
    print "scikit final centers"
    print km_sci.cluster_centers_

    km_h2o = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=initial_centers_h2o_key, standardize=True)
    print "H2O final centers"
    print km_h2o.centers()

if __name__ == "__main__":
   h2o.run_test(sys.argv, emptyclusKmeans)
import h2o
import random


def get_set_list_timezones(ip, port):
    """Round-trip the cluster timezone through get/list/set.

    Reads the current timezone, picks a random entry from
    ``h2o.list_timezones()``, sets it, and asserts that
    ``h2o.get_timezone()`` reports the new value. The original timezone is
    restored afterwards, including when the assertion fails.

    Parameters:
        ip: address of the H2O instance, passed to ``h2o.init``.
        port: port of the H2O instance, passed to ``h2o.init``.

    Raises:
        AssertionError: if the timezone read back differs from the one set.
    """
    # Connect to h2o
    h2o.init(ip, port)

    origTZ = h2o.get_timezone()
    print("Original timezone: {0}".format(origTZ))

    timezones = h2o.list_timezones()
    # don't use the first one..it's a header for the table
    print("timezones[0]: " + str(timezones[0]))
    row = random.randint(1, timezones.nrow() - 1)
    # The zone id is the second space-separated token of the cell, up to
    # the first comma.
    zone = timezones[row, 0].split(" ")[1].split(",")[0]
    print("Setting the timezone: {0}".format(zone))

    try:
        h2o.set_timezone(zone)

        newTZ = h2o.get_timezone()
        assert newTZ == zone, "Expected new timezone to be {0}, but got {1}".format(
            zone, newTZ)
    finally:
        # Restore even on failure so later tests see the original setting.
        print("Setting the timezone back to original: {0}".format(origTZ))
        h2o.set_timezone(origTZ)


if __name__ == "__main__":
    h2o.run_test(sys.argv, get_set_list_timezones)
    # # 85,0,75,1,1,1,9.9,15.4,7
    # # 86,1,75,1,3,1,3.7,0,6
    # pros = prostate[[1,2,3],83:86]
    # assert pros[0,0] == 0, "Incorrect slicing result"
    # assert pros[1,0] == 75, "Incorrect slicing result"
    # assert pros[2,0] == 1, "Incorrect slicing result"
    # assert pros[0,1] == 0, "Incorrect slicing result"
    # assert pros[1,1] == 75, "Incorrect slicing result"
    # assert pros[2,1] == 1, "Incorrect slicing result"
    # assert pros[0,2] == 1, "Incorrect slicing result"
    # assert pros[1,2] == 75, "Incorrect slicing result"
    # assert pros[2,2] == 1, "Incorrect slicing result"
    #
    # # prostate [list,list] case
    # # 27,0,67,1,2,1,2.8,25.6,7
    # # 9,0,69,1,1,1,3.9,24,7
    # # 201,0,57,1,1,1,10.2,0,6
    # pros = prostate[[5,6,7],[26,8,200]]
    # assert pros[0,0] == 1, "Incorrect slicing result"
    # assert (pros[1,0]-3.9) < 1e-10, "Incorrect slicing result"
    # assert pros[2,0] == 24, "Incorrect slicing result"
    # assert pros[0,1] == 1, "Incorrect slicing result"
    # assert (pros[1,1]-2.8) < 1e-10, "Incorrect slicing result"
    # assert (pros[2,1]-25.6) < 1e-10, "Incorrect slicing result"
    # assert pros[0,2] == 1, "Incorrect slicing result"
    # assert (pros[1,2]-10.2) < 1e-10, "Incorrect slicing result"
    # assert pros[2,2] == 0, "Incorrect slicing result"

if __name__ == "__main__":
    h2o.run_test(sys.argv, multi_dim_slicing)
    assert check_values(h2o.sign(h2o_data2), np.sign(np_data2)),         "expected equal sign values between h2o and numpy"
    assert check_values(h2o.sqrt(h2o_data3), np.sqrt(np_data3)),         "expected equal sqrt values between h2o and numpy"
    assert check_values(h2o.trunc(h2o_data3), np.trunc(np_data3)),       "expected equal trunc values between h2o and numpy"
    assert check_values(h2o.ceil(h2o_data3), np.ceil(np_data3)),         "expected equal ceil values between h2o and numpy"
    assert check_values(h2o.floor(h2o_data3), np.floor(np_data3)),       "expected equal floor values between h2o and numpy"
    assert check_values(h2o.log(h2o_data3), np.log(np_data3)),           "expected equal log values between h2o and numpy"
    assert check_values(h2o.log10(h2o_data3), np.log10(np_data3)),       "expected equal log10 values between h2o and numpy"
    assert check_values(h2o.log1p(h2o_data3), np.log1p(np_data3)),       "expected equal log1p values between h2o and numpy"
    assert check_values(h2o.log2(h2o_data3), np.log2(np_data3)),         "expected equal log2 values between h2o and numpy"
    assert check_values(h2o.exp(h2o_data3), np.exp(np_data3)),           "expected equal exp values between h2o and numpy"
    assert check_values(h2o.expm1(h2o_data3), np.expm1(np_data3)),       "expected equal expm1 values between h2o and numpy"
    h2o_val = h2o.as_list(h2o.gamma(h2o_data3))[5][5]
    num_val = math.gamma(h2o.as_list(h2o_data3)[5][5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal gamma values between h2o and math".format(h2o_val,num_val)
    h2o_val = h2o.as_list(h2o.lgamma(h2o_data3))[5][5]
    num_val = math.lgamma(h2o.as_list(h2o_data3)[5][5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal lgamma values between h2o and math".format(h2o_val,num_val)
    h2o_val = h2o.as_list(h2o.digamma(h2o_data3))[5][5]
    num_val = scipy.special.polygamma(0,h2o.as_list(h2o_data3)[5][5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal digamma values between h2o and math".format(h2o_val,num_val)
    h2o_val = h2o.as_list(h2o.trigamma(h2o_data3))[5][5]
    num_val = scipy.special.polygamma(1,h2o.as_list(h2o_data3)[5][5])
    assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal trigamma values between h2o and math".format(h2o_val,num_val)

if __name__ == "__main__":
    h2o.run_test(sys.argv, expr_math_ops)
import sys
sys.path.insert(1, "../../")
import h2o

def col_names_check(ip,port):
    """Verify ``col_names`` for a CSV with a header row and one without.

    With a header, the names must be those written in the file. Without
    one, the expected names are ``C1`` .. ``C5``.
    """
    header_names = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    iris_wheader = h2o.import_file(h2o.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == header_names, \
        "Expected {0} for column names but got {1}".format(header_names, iris_wheader.col_names)

    default_names = ["C1","C2","C3","C4","C5"]
    iris = h2o.import_file(h2o.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == default_names, ("Expected {0} for column names but got "
                                             "{1}".format(default_names, iris.col_names))

if __name__ == "__main__":
    h2o.run_test(sys.argv, col_names_check)
    res = iris[0] == 4.7
    res_rows = len(res)
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow()
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris[1]
    res_rows = len(res)
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow()
    assert new_rows == 6, "wrong number of rows returned"

    # frame/frame
    res = iris == iris
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] == iris[1:3]
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    try:
        res = iris == iris[0:3]
        assert False, "expected error. frames are different dimensions."
    except EnvironmentError:
        pass


if __name__ == "__main__":
    h2o.run_test(sys.argv, binop_eq)