def swpredsGBM():
    """Train bernoulli GBMs on swpreds_1000x3.csv with and without the noise column.

    Data layout (per the notes shipped with this test):
      * X1: 10 categorical levels, 100 observations per level
      * X2: Unif(0,1) noise
      * ratio of y = 1 per level: cat01 = 1.0 (strong predictor),
        cat02 to cat10 = 0.5 (weak predictors)

    Each model is shown and scored on the training frame.  The AUCs are
    computed but not asserted against anything.
    """
    frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    frame["y"] = frame["y"].asfactor()

    # Identical hyper-parameters for both runs: ntrees = 50, max_depth = 20, nbins = 500.
    gbm_settings = dict(distribution="bernoulli", ntrees=50, max_depth=20, nbins=500)

    # First pass uses X1 only, second pass adds the noise column X2.
    aucs = []
    for predictor_names in (["X1"], ["X1", "X2"]):
        model = h2o.gbm(x=frame[predictor_names], y=frame["y"], **gbm_settings)
        model.show()
        performance = model.model_performance(frame)
        aucs.append(performance.auc())
def milsong_checkpoint(ip,port):
    """Build a GBM, save and reload it, then continue training from the checkpoint.

    Three models are built on the milsongs data:
      1. a model with randomly drawn ntrees / max_depth / min_rows,
      2. a continuation of (1) with 50 more trees, started from the restored model,
      3. a one-shot model with the same settings as (2).

    The visible code does not compare models 2 and 3.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    train_frame = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    valid_frame = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    def _fit(ntrees, max_depth, min_rows, **extra):
        # Column 0 is the response; every other column is a predictor.
        return h2o.gbm(x=train_frame[1:], y=train_frame[0], ntrees=ntrees, max_depth=max_depth,
                       min_rows=min_rows, distribution=distribution,
                       validation_x=valid_frame[1:], validation_y=valid_frame[0], **extra)

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = _fit(ntrees1, max_depth1, min_rows1)

    # save the model, reload it, then remove the on-disk copy
    saved_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(saved_path)
    shutil.rmtree("delete_model")

    # continue building the model: same depth / min_rows, 50 more trees
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = _fit(ntrees2, max_depth2, min_rows2, checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = _fit(ntrees2, max_depth2, min_rows2)
def imbalancedGBM(ip,port):
    """Compare the class-6 error of multinomial GBMs with and without balance_classes.

    Both models are trained on covtype.20k with 10 trees and 3-fold
    cross-validation, then scored on the full frame.  The test fails when the
    unbalanced error is below 90% of the balanced error.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    # Baseline: no class balancing (and no seed, as in the balanced run below).
    unbalanced_model = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=False,
                               nfolds=3, distribution="multinomial")
    unbalanced_perf = unbalanced_model.model_performance(covtype)
    unbalanced_perf.show()

    balanced_model = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=True,
                             seed=123, nfolds=3, distribution="multinomial")
    balanced_perf = balanced_model.model_performance(covtype)
    balanced_perf.show()

    # compare error for class 6 (difficult minority): row 5, column 7 of the confusion matrix
    class_6_err_imbalanced = unbalanced_perf.confusion_matrix().cell_values[5][7]
    class_6_err_balanced = balanced_perf.confusion_matrix().cell_values[5][7]

    report = (
        "--------------------",
        "",
        "class_6_err_imbalanced",
        class_6_err_imbalanced,
        "",
        "class_6_err_balanced",
        class_6_err_balanced,
        "",
        "--------------------",
    )
    for line in report:
        print(line)

    threshold = 0.90*class_6_err_balanced
    assert class_6_err_imbalanced >= threshold, "balance_classes makes it at least 10% worse!"
Beispiel #4
0
def get_model_test(ip,port):
    """Check that h2o.get_model returns a model equivalent to the one that was built.

    For regression, binomial and multinomial models the retrieved model must
    report the expected model category and give the same predictions, row by
    row, as the original.  For k-means only the model category is checked.

    :param ip: address passed to h2o.init.
    :param port: port passed to h2o.init.
    """
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # 70/30 split.  Both sides use the same threshold so train and test do not
    # overlap (the test side previously used 0.30, which shared rows with train).
    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    def _assert_same_predictions(expected, actual, message):
        # Compare the first prediction column row by row.
        for row in range(expected.nrow()):
            p1 = expected[row,0]
            p2 = actual[row,0]
            assert p1 == p2, message.format(row, p1, p2)

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    _assert_same_predictions(predictions1, predictions2,
                             "expected regression predictions to be the same for row {0}, but got {1} and {2}")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    _assert_same_predictions(predictions1, predictions2,
                             "expected binomial predictions to be the same for row {0}, but got {1} and {2}")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    _assert_same_predictions(predictions1, predictions2,
                             "expected multinomial predictions to be the same for row {0}, but got {1} and {2}")
def pubdev_1696(ip, port):
    """Check that h2o.gbm rejects a negative nfolds value with an EnvironmentError.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    iris = h2o.import_file(h2o.locate("smalldata/iris/iris.csv"))

    try:
        h2o.gbm(x=iris[0:3], y=iris[3], nfolds=-99)
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # NOTE(review): check_same is defined here but never called in the visible
    # code -- confirm whether it belongs to a different test.
    def check_same(data1, data2):
        """Assert that weighted and unweighted GBMs agree on mse / auc to within 1e-6."""
        plain_regression = h2o.gbm(x=data1[2:20], y=data1[1])
        weighted_regression = h2o.gbm(x=data2[2:21], y=data2[1], weights_column="weights")
        plain_binomial = h2o.gbm(x=data1[1:20], y=data1[0], distribution="bernoulli")
        weighted_binomial = h2o.gbm(x=data2[1:21], y=data2[0], weights_column="weights",
                                    distribution="bernoulli")

        mse_gap = abs(plain_regression.mse() - weighted_regression.mse())
        assert mse_gap < 1e-6, "Expected mse's to be the same, but got {0}, and {1}".format(
            plain_regression.mse(), weighted_regression.mse())

        auc_gap = abs(plain_binomial.auc() - weighted_binomial.auc())
        assert auc_gap < 1e-6, "Expected auc's to be the same, but got {0}, and {1}".format(
            plain_binomial.auc(), weighted_binomial.auc())
def pub_444_spaces_in_filenames(ip,port):
    """Upload CSV files whose names contain spaces and train a small GBM on each.

    Three files are written to the current directory ("foo .csv",
    "b a r .csv" and " ba z.csv"; the last has no header row), each is
    uploaded and used to train a one-tree bernoulli GBM, and the files are
    removed afterwards.  Files are written through context managers and
    removed in a ``finally`` clause, so a failing step leaves neither open
    handles nor stray files behind.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    # tempdir = "smalldata/jira/"
    # if was okay to write to smalldata, it's okay to write to the current directory
    # probably don't want to, but can't find what the standard temp directory is supposed to be. no sandbox?
    tempdir = "./"

    # (file name, header line or None, response column selector)
    cases = (
        ("foo .csv", "response, predictor\n", "response"),
        ("b a r .csv", "response, predictor\n", "response"),
        (" ba z.csv", None, 0),
    )

    def _write_csv(name, header):
        # 30 data rows: two fixed rows and one random row per iteration.
        with open(h2o.locate(tempdir) + name, "w") as out:
            if header is not None:
                out.write(header)
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    def _train_on(name, response):
        train_data = h2o.upload_file(path=h2o.locate(tempdir + name))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data[response].asfactor(), ntrees=1,
                      distribution="bernoulli", min_rows=1)
        gbm.show()

    try:
        # make a few files with spaces in the name
        for name, header, _ in cases:
            _write_csv(name, header)

        for name, _, response in cases:
            _train_on(name, response)
    finally:
        # Remove whatever was created, even if an earlier step failed.
        for name, _, _ in cases:
            path = h2o.locate(tempdir) + name
            if os.path.exists(path):
                os.remove(path)
def distribution_behaviorGBM():
    """Exercise h2o.gbm with the default, gaussian, bernoulli and multinomial distributions.

    Each section trains models on response columns that should be accepted,
    and checks that unsuitable response columns raise an EnvironmentError.
    """
    # ------------------------------
    # Default behavior - gaussian
    # ------------------------------
    eco = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    # 0/1 response: expect gaussian
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"])
    # more than 2 integers for response: expect gaussian
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars.csv"))
    cars_model = h2o.gbm(x=cars[3:7], y=cars["cylinders"])

    # ------------------------------
    # Gaussian behavior
    # ------------------------------
    # 0/1 response: expect gaussian
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"], distribution="gaussian")
    # character response: expect error
    try:
        eco_model = h2o.gbm(x=eco[1:8], y=eco["Method"], distribution="gaussian")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # ------------------------------
    # Bernoulli behavior
    # ------------------------------
    # 0/1 response: expect bernoulli
    eco_model = h2o.gbm(x=eco[2:13], y=eco["Angaus"].asfactor(), distribution="bernoulli")
    # 2 level character response: expect bernoulli
    tree = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/test_tree_minmax.csv"))
    tree_model = h2o.gbm(x=tree[0:3], y=tree["response"], distribution="bernoulli", min_rows=1)
    # more than two integers for response: expect error
    try:
        cars_mod = h2o.gbm(x=cars[3:7], y=cars["cylinders"], distribution="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"
    # more than two character levels for response: expect error
    try:
        eco_model = h2o.gbm(x=eco[0:8], y=eco["Method"], distribution="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"

    # ------------------------------
    # Multinomial behavior
    # ------------------------------
    # more than two integers for response: expect multinomial
    cars_model = h2o.gbm(x=cars[3:7], y=cars["cylinders"].asfactor(), distribution="multinomial")
    # more than two character levels for response: expect multinomial
    eco_model = h2o.gbm(x=eco[0:8], y=eco["Method"], distribution="multinomial")
def pubdev_1829():
    """Check that a checkpoint-continued GBM matches a one-shot GBM on validation metrics.

    Model 1 is trained with 5 trees, model 2 continues it to 10 trees via
    ``checkpoint``, and model 4 is trained with 10 trees directly.  Models 2
    and 4 must report exactly equal validation AUC, Gini coefficient and
    log loss.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_train.csv"))
    valid = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_valid.csv"))

    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy_20mpg"
    train[response_col] = train[response_col].asfactor()
    valid[response_col] = valid[response_col].asfactor()

    def _build(ntrees, **extra):
        # All three models share depth 5, min_rows 10 and per-iteration scoring.
        return h2o.gbm(x=train[predictors],
                       y=train[response_col],
                       ntrees=ntrees,
                       max_depth=5,
                       min_rows=10,
                       score_each_iteration=True,
                       distribution="bernoulli",
                       validation_x=valid[predictors],
                       validation_y=valid[response_col],
                       **extra)

    model1 = _build(5)
    model2 = _build(10, checkpoint=model1._id)
    model4 = _build(10)

    resumed_auc = model2.auc(valid=True)
    oneshot_auc = model4.auc(valid=True)
    assert resumed_auc == oneshot_auc, \
        "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(resumed_auc, oneshot_auc)

    resumed_gini = model2.giniCoef(valid=True)
    oneshot_gini = model4.giniCoef(valid=True)
    assert resumed_gini == oneshot_gini, \
        "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(resumed_gini, oneshot_gini)

    resumed_logloss = model2.logloss(valid=True)
    oneshot_logloss = model4.logloss(valid=True)
    assert resumed_logloss == oneshot_logloss, \
        "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(resumed_logloss, oneshot_logloss)
Beispiel #10
0
def hexdev_394():
    """Train multinomial GBMs on two separate splits of covtype.20k.

    Columns 10-12 are imported as enums.  The frame is split twice, once
    indexing the result of split_frame() and once unpacking it, and a GBM
    with a validation set is trained on each split.  Finally the ``_keep``
    attribute of the original frame is printed.
    """
    path = pyunit_utils.locate("smalldata/covtype/covtype.20k.data")
    column_types = [None] * 55
    for enum_index in (10, 11, 12):
        column_types[enum_index] = "enum"
    train = h2o.import_file(path, col_types=column_types)

    # Every column except the response "C55" is a predictor.
    x_cols = [name for name in train.col_names if name != "C55"]

    def _fit(train_part, valid_part):
        train_x = train_part[x_cols]
        train_y = train_part[54].asfactor()
        valid_x = valid_part[x_cols]
        valid_y = valid_part[54].asfactor()
        return h2o.gbm(
            y=train_y,
            validation_y=valid_y,
            x=train_x,
            validation_x=valid_x,
            distribution="multinomial",
            ntrees=100,
            learn_rate=0.1,
            max_depth=6,
        )

    # First split: index into the returned collection.
    splits = train.split_frame()
    my_gbm = _fit(splits[0], splits[1])

    # Second split: unpack the returned pair directly.
    split1, split2 = train.split_frame()
    my_gbm = _fit(split1, split2)

    print("KEEPING FRAME???")
    print(train._keep)
def pubdev_random_cv():
    """Check that two unseeded GBMs with random fold assignment give different xval MSEs."""
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    response_col = "economy"
    predictors = ["displacement","power","weight","acceleration","year"]

    # Two identical builds; no seed is given for either.
    cv_settings = dict(nfolds=3, distribution="gaussian", fold_assignment="Random")
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], **cv_settings)
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], **cv_settings)

    mse1 = gbm1.mse(xval=True)
    mse2 = gbm2.mse(xval=True)
    failure = "The first model has an MSE of {0} and the second model has an MSE of {1}. Expected the " \
              "first to be different from the second."
    assert mse1 != mse2, failure.format(mse1, mse2)
def checkpoint_new_category_in_response():
    """Check that resuming from a checkpoint fails when the response gains new categories.

    A GBM is trained on the setosa/versicolor subset, then a continuation is
    attempted on the full iris data; that attempt must raise an
    EnvironmentError.
    """
    two_class = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    full_iris = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))

    base_model = h2o.gbm(x=two_class[[0,1,2,3]], y=two_class[4], ntrees=100)

    # attempt to continue building model, but with an expanded categorical response domain.
    # this should fail
    try:
        continued = h2o.gbm(x=full_iris[[0,1,2,3]], y=full_iris[4], ntrees=200,
                            checkpoint=base_model.model_id)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in response"
Beispiel #13
0
def hexdev_394():
    """Train multinomial GBMs on two separate splits of covtype.20k.

    The file is imported lazily and parsed with columns 10-12 forced to ENUM.
    The frame is split twice, once indexing the result of split_frame() and
    once unpacking it, and a GBM with a validation set is trained on each
    split.  Finally the ``_keep`` attribute of the original frame is printed.
    """
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    raw = h2o.lazy_import(path)
    setup = h2o.parse_setup(raw)
    for enum_index in (10, 11, 12):
        setup["column_types"][enum_index] = "ENUM"
    train = h2o.parse_raw(setup)

    # Every column except the response "C55" is a predictor.
    x_cols = [name for name in train.col_names if name != "C55"]

    def _fit(train_part, valid_part):
        train_x = train_part[x_cols]
        train_y = train_part[54].asfactor()
        valid_x = valid_part[x_cols]
        valid_y = valid_part[54].asfactor()
        return h2o.gbm(y=train_y,
                       validation_y=valid_y,
                       x=train_x,
                       validation_x=valid_x,
                       distribution="multinomial",
                       ntrees=100,
                       learn_rate=0.1,
                       max_depth=6)

    # First split: index into the returned collection.
    splits = train.split_frame()
    my_gbm = _fit(splits[0], splits[1])

    # Second split: unpack the returned pair directly.
    split1, split2 = train.split_frame()
    my_gbm = _fit(split1, split2)

    print("KEEPING FRAME???")
    print(train._keep)
def pub_444_spaces_in_filenames(ip,port):
    """Upload CSV files whose names contain spaces and train a small GBM on each.

    Three files are written under smalldata/jira/ ("foo .csv", "b a r .csv"
    and " ba z.csv"; the last has no header row), each is uploaded and used
    to train a one-tree bernoulli GBM, and the files are removed afterwards.
    Files are written through context managers and removed in a ``finally``
    clause, so a failing step leaves neither open handles nor stray files
    behind.

    :param ip: address passed to h2o.init.
    :param port: port passed to h2o.init.
    """
    # Connect to h2o
    h2o.init(ip,port)

    data_dir = "smalldata/jira/"

    # (file name, header line or None, response column selector)
    cases = (
        ("foo .csv", "response, predictor\n", "response"),
        ("b a r .csv", "response, predictor\n", "response"),
        (" ba z.csv", None, 0),
    )

    def _write_csv(name, header):
        # 30 data rows: two fixed rows and one random row per iteration.
        with open(h2o.locate(data_dir) + name, "w") as out:
            if header is not None:
                out.write(header)
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    def _train_on(name, response):
        train_data = h2o.upload_file(path=h2o.locate(data_dir + name))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data[response].asfactor(), ntrees=1,
                      distribution="bernoulli", min_rows=1)
        gbm.show()

    try:
        # make a few files with spaces in the name
        for name, header, _ in cases:
            _write_csv(name, header)

        for name, _, response in cases:
            _train_on(name, response)
    finally:
        # Remove whatever was created, even if an earlier step failed.
        for name, _, _ in cases:
            path = h2o.locate(data_dir) + name
            if os.path.exists(path):
                os.remove(path)
def pubdev_1431(ip, port):
    """Download predictions for the airlines-billion data as CSV and re-import them.

    Only runs when the test is executing inside the H2O internal network
    (HDFS access is required); otherwise a notice is printed and nothing is
    tested.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
    airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
    airlines_billion_1 = h2o.import_file(url)

    airlines_billion_1[30] = airlines_billion_1[30].asfactor()
    gbm = h2o.gbm(x=airlines_billion_1[0:30], y=airlines_billion_1[30], ntrees=1, distribution="bernoulli", max_depth=1)

    predictions = gbm.predict(airlines_billion_1)

    csv = os.path.join(os.getcwd(),"delete.csv")
    try:
        h2o.download_csv(predictions,csv)
        airlines_billion_2 = h2o.import_file(csv)
    finally:
        # Do not leave the downloaded file behind if the download or re-import fails.
        if os.path.exists(csv):
            os.remove(csv)

    # NOTE(review): this compares the dimensions of the source frame with those of
    # the re-imported predictions -- confirm the column counts are expected to match.
    r1, c1 = airlines_billion_1.dim
    r2, c2 = airlines_billion_2.dim
    # The column counts use placeholders {2}/{3}; they previously repeated {0}/{1}
    # and so reported the row counts twice.
    assert r1 == r2 and c1 == c2, "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal c1: {2} " \
                                  "c2: {3}".format(r1,r2,c1,c2)
def fiftycatGBM(ip,port):
    """Train a GBM on 45 categories and score it on a test set containing all 50.

    Per the notes shipped with this test, the training set has only
    categories cat1 through cat45 while the test set has cat1 through cat50.
    Predictions, the confusion matrix and the AUC are computed but not
    asserted against anything.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    # Training set has only 45 categories cat1 through cat45
    train_frame = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train_frame["y"] = train_frame["y"].asfactor()

    # Train H2O GBM model: ntrees = 10, max_depth = 5, nbins = 20
    gbm_model = h2o.gbm(x=train_frame[["x1","x2"]], y=train_frame["y"], distribution="bernoulli",
                        ntrees=10, max_depth=5, nbins=20)
    gbm_model.show()

    # Test dataset has all 50 categories cat1 through cat50
    test_frame = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Predict on test dataset with GBM model
    test_predictions = gbm_model.predict(test_frame)
    test_predictions.show()

    # Get the confusion matrix and AUC
    test_performance = gbm_model.model_performance(test_frame)
    test_cm = test_performance.confusion_matrix()
    test_auc = test_performance.auc()
def offset_gaussian(ip,port):
    """Check a gaussian GBM with an offset column against reference values from R's gbm.

    The offset is log(Holders) on the insurance data.  init_f, the MSE and the
    mean / min / max of the predictions are compared with fixed expected values.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", ntrees=600,
                  max_depth=1, min_rows=1, learn_rate=.1, offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    def _expect_close(expected, actual, tolerance, message):
        assert abs(expected - actual) < tolerance, message.format(expected, actual)

    # Comparison result generated from R's gbm:
    #	fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #   data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    _expect_close(44.33016, gbm._model_json['output']['init_f'], 1e-5,
                  "expected init_f to be {0}, but got {1}")
    _expect_close(1491.135, gbm.mse(), 1e-2,
                  "expected mse to be {0}, but got {1}")
    _expect_close(49.23438, predictions.mean(), 1e-2,
                  "expected prediction mean to be {0}, but got {1}")
    _expect_close(-45.5720659304, predictions.min(), 1e-2,
                  "expected prediction min to be {0}, but got {1}")
    _expect_close(207.387, predictions.max(), 1e-2,
                  "expected prediction max to be {0}, but got {1}")
Beispiel #18
0
def plot_test():
    """Plot ROC curves for a bernoulli GBM on training, validation and test data.

    All plot calls are passed ``server=True``.
    """
    plot_options = {'server': True}

    airlines = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    draw = airlines[0].runif()
    air_train = airlines[draw <= 0.8]
    air_valid = airlines[draw > 0.8]

    predictor_names = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    response_name = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[predictor_names], y=air_train[response_name],
                      validation_x=air_valid[predictor_names], validation_y=air_valid[response_name],
                      distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **plot_options)
    air_gbm.plot(type="roc", valid=True, **plot_options)

    # Plot ROC for test set
    air_test = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    test_performance = air_gbm.model_performance(air_test)
    test_performance.plot(type="roc", **plot_options)
def offset_gamma():
    """Check a gamma GBM with an offset column against reference values from harrysouthworth's gbm.

    The offset is log(Holders) on the insurance data.  init_f and the
    mean / min / max of the predictions are compared with fixed expected values.
    """
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", ntrees=600,
                  max_depth=1, min_rows=1, learn_rate=.1, offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    def _expect_close(expected, actual, tolerance, message):
        assert abs(expected - actual) < tolerance, message.format(expected, actual)

    # Comparison result generated from harrysouthworth's gbm:
    #	fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #           data = Insurance, distribution ="gamma", n.trees = 600)
    #	pr = predict(fit2, Insurance)
    #	pr = exp(pr+log(Insurance$Holders))
    _expect_close(-1.714958, gbm._model_json['output']['init_f'], 1e-5,
                  "expected init_f to be {0}, but got {1}")
    _expect_close(50.1087, predictions.mean(), 1e-2,
                  "expected prediction mean to be {0}, but got {1}")
    _expect_close(0.9133843, predictions.min(), 1e-4,
                  "expected prediction min to be {0}, but got {1}")
    _expect_close(392.6667, predictions.max(), 0.1,
                  "expected prediction max to be {0}, but got {1}")
def offset_tweedie(ip,port):
    """Check a tweedie GBM with an offset column against reference values from harrysouthworth's gbm.

    The offset is log(Holders) on the insurance data.  init_f and the
    mean / min / max of the predictions are compared with fixed expected values.

    :param ip: address passed to h2o.init.
    :param port: port passed to h2o.init.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    insurance = h2o.import_frame(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="tweedie", ntrees=600,
                  max_depth=1, min_rows=1, learn_rate=.1, offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    def _expect_close(expected, actual, tolerance, message):
        assert abs(expected - actual) < tolerance, message.format(expected, actual)

    # Comparison result generated from harrysouthworth's gbm:
    #	fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #           data = Insurance, distribution ="tweedie", n.trees = 600)
    #	pr = predict(fit2, Insurance)
    #	pr = exp(pr+log(Insurance$Holders))
    _expect_close(-1.869702, gbm._model_json['output']['init_f'], 1e-5,
                  "expected init_f to be {0}, but got {1}")
    _expect_close(49.21591, predictions.mean(), 1e-4,
                  "expected prediction mean to be {0}, but got {1}")
    _expect_close(1.0258, predictions.min(), 1e-4,
                  "expected prediction min to be {0}, but got {1}")
    _expect_close(392.4651, predictions.max(), 1e-2,
                  "expected prediction max to be {0}, but got {1}")
def offset_poisson(ip,port):
    """Check a poisson GBM with an offset column against reference values from R's gbm.

    The offset is log(Holders) on the insurance data.  init_f and the
    mean / min / max of the predictions are compared with fixed expected values.

    :param ip: not used in the body.
    :param port: not used in the body.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", ntrees=600,
                  max_depth=1, min_rows=1, learn_rate=.1, offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    def _expect_close(expected, actual, tolerance, message):
        assert abs(expected - actual) < tolerance, message.format(expected, actual)

    # Comparison result generated from R's gbm:
    #fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #           shrinkage = .1,bag.fraction = 1,train.fraction = 1, data = Insurance, distribution ="poisson",
    #           n.trees = 600)
    #link = predict.gbm(fit2, Insurance, n.trees=600, type="link")
    #link.offset = link + log(Insurance$Holders)
    ##for poisson
    #pr = exp(link.offset)
    _expect_close(-2.003262, gbm._model_json['output']['init_f'], 1e-5,
                  "expected init_f to be {0}, but got {1}")
    _expect_close(49.23437, predictions.mean(), 1e-4,
                  "expected prediction mean to be {0}, but got {1}")
    _expect_close(1.077275, predictions.min(), 1e-4,
                  "expected prediction min to be {0}, but got {1}")
    _expect_close(398.0608, predictions.max(), 1e-2,
                  "expected prediction max to be {0}, but got {1}")
def nfold_predict():
    """Predict with each cross-validation model of a 10-fold GBM and show the averaged result."""
    fold_count = 10
    frame = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate_train.csv"))
    gbm_model = h2o.gbm(x=frame[2:], y=frame[1], nfolds=fold_count, ntrees=10)
    fold_models = gbm_model.get_xval_models()

    # A constant "weights" column is added before predicting.
    # NOTE(review): presumably the fold models expect this column -- confirm.
    frame["weights"]=1

    fold_predictions = []
    for fold_model in fold_models:
        fold_predictions.append(fold_model.predict(frame))

    averaged = sum(fold_predictions)/fold_count
    averaged.show()
def offset_bernoulli_cars():
    """Check a bernoulli GBM with a constant 0.5 offset against reference values from R's gbm.

    Rows with a missing "economy_20mpg" are dropped, a constant offset column
    "x1" is appended, and a single depth-1 tree is trained.  init_f and the
    mean / min / max of prediction column 2 are compared with fixed expected values.
    """
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()

    # Constant offset of 0.5 for each of the 398 rows.
    offset_frame = h2o.H2OFrame([[.5] * 398])
    offset_frame.set_names(["x1"])
    cars = cars.cbind(offset_frame)

    gbm = h2o.gbm(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", ntrees=1,
                  max_depth=1, min_rows=1, learn_rate=1, offset_column="x1", training_frame=cars)

    predictions = gbm.predict(cars)

    def _expect_close(expected, actual, tolerance, message):
        assert abs(expected - actual) < tolerance, message.format(expected, actual)

    # Comparison result generated from R's gbm:
    #	gg = gbm(formula = economy_20mpg~cylinders+displacement+power+weight+acceleration+year+offset(rep(.5,398)),
    #            distribution = "bernoulli",data = df,n.trees = 1,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,
    #            train.fraction = 1,bag.fraction = 1)
    #   pr = predict.gbm(object = gg,newdata = df,n.trees = 1,type = "link")
    #   pr = 1/(1+exp(-df$x1 - pr))
    _expect_close(-0.1041234, gbm._model_json['output']['init_f'], 1e-6,
                  "expected init_f to be {0}, but got {1}")
    _expect_close(0.577326, predictions[:,2].mean()[0], 1e-6,
                  "expected prediction mean to be {0}, but got {1}")
    _expect_close(0.1621461, predictions[:,2].min(), 1e-6,
                  "expected prediction min to be {0}, but got {1}")
    _expect_close(0.8506528, predictions[:,2].max(), 1e-6,
                  "expected prediction max to be {0}, but got {1}")
def cv_nfoldsGBM(ip,port):
  """Check that combining nfolds with validation data is rejected.

  Connects to H2O at ``ip``/``port``, trains a 5-fold bernoulli GBM on the
  prostate data, then requests the same model with validation data as well
  and requires that call to raise ``EnvironmentError``.
  """
  h2o.init(ip,port)

  prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

  cv_model = h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds = 5, distribution="bernoulli")
  cv_model.show()

  # Can't specify both nfolds >= 2 and validation data at once
  try:
    h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds=5, validation_y=prostate[1], validation_x=prostate[2:9], distribution="bernoulli")
  except EnvironmentError:
    # This is the expected outcome.
    pass
  else:
    assert False, "expected an error"
def gbm_mean_residual_deviance(ip, port):
    """Check that mean residual deviance is a float for train, valid and xval.

    Splits the cars data at random into training (> 0.2) and validation
    (<= 0.2) parts, trains a 3-fold GBM on ``economy`` and verifies the type
    of each returned metric. ``ip`` and ``port`` are not used in the body.
    """
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    draw = cars[0].runif()
    train = cars[draw > 0.2]
    valid = cars[draw <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    gbm = h2o.gbm(x=train[predictors], y=train[response_col],
                  validation_x=valid[predictors], validation_y=valid[response_col],
                  nfolds=3)
    gbm_mrd = gbm.mean_residual_deviance(train=True, valid=True, xval=True)

    # (metric key, failure message) pairs, checked in this order.
    checks = (
        ("train", "Expected training mean residual deviance to be a float, but got {0}"),
        ("valid", "Expected validation mean residual deviance to be a float, but got {0}"),
        ("xval", "Expected cross-validation mean residual deviance to be a float, but got {0}"),
    )
    for key, message in checks:
        assert isinstance(gbm_mrd[key], float), message.format(type(gbm_mrd[key]))
def smallcatGBM(ip,port):
  """Fit a one-tree, depth-1 GBM on the alphabet data with H2O and with scikit-learn.

  Data description: the training set has 26 categories from A to Z;
  categories A, C, E, G, ... are perfect predictors of y = 1 and
  categories B, D, F, H, ... are perfect predictors of y = 0.
  ``ip`` and ``port`` are not used in the body.
  """
  alphabet = h2o.import_file(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
  alphabet["y"] = alphabet["y"].asfactor()

  def quoted_letter_code(field):
    # Take the text between the first pair of double quotes and return its ordinal.
    return ord(field.split("\"")[1])

  # Prepare data for scikit use: column 0 is the feature, column 1 the response.
  trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"), delimiter=',', skiprows=1,
                         converters={0: quoted_letter_code})
  trainDataResponse = trainData[:,1]
  trainDataFeatures = trainData[:,0]

  # H2O model: ntrees = 1, max_depth = 1, nbins = 100
  h2o_model = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], distribution="bernoulli", ntrees=1, max_depth=1, nbins=100)
  h2o_model.show()

  # scikit model with one estimator of depth 1; the feature vector is passed as a single column.
  sci_model = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
  sci_model.fit(trainDataFeatures[:,np.newaxis],trainDataResponse)
def weights_gamma():
    """Compare a weighted gamma GBM on the moppe data with reference values.

    Trains 20 depth-1 trees with ``antskad`` as the weights column and
    ``medskad`` as the response, then checks ``init_f`` and the min/max/mean
    of the first prediction column using relative tolerances.
    """
    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    for categorical in ("premiekl", "moptva"):
        htable[categorical] = htable[categorical].asfactor()
    # NOTE(review): "zon" is assigned back to itself without conversion, as in the original.
    htable["zon"] = htable["zon"]

    # Reference model in R:
    # gg = gbm(formula = medskad ~ premiekl + moptva + zon,data = table.1.2,distribution = "gamma", weights = table.1.2$antskad ,
    #     n.trees = 20,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,bag.fraction = 1,train.fraction = 1)
    # pr = predict(gg,newdata = table.1.2,type = "response")
    # htable= as.h2o(table.1.2,destination_frame = "htable")
    model = h2o.gbm(x=htable[0:3], y=htable["medskad"], training_frame=htable, distribution="gamma",
                    weights_column="antskad", ntrees=20, max_depth=1, min_rows=1, learn_rate=1)
    scored = model.predict(htable)

    init_f = model._model_json["output"]["init_f"]
    assert abs(8.804447 - init_f) < 1e-6 * 8.804447

    lowest = scored[0].min()
    assert abs(3751.01 - lowest) < 1e-4 * 3751.01

    highest = scored[0].max()
    assert abs(15298.87 - highest) < 1e-4 * 15298.87

    average = scored[0].mean()
    assert abs(8121.98 - average) < 1e-4 * 8121.98
def framesliceGBM():
    """Train a GBM on a column slice of the prostate frame.

    Columns 1..8 of the imported frame are kept; within that slice, column 0
    is the response and columns 1..7 are the predictors.
    """
    full_frame = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    sliced = full_frame[1:9]

    # Running GBM on the sliced frame is the whole check; the model itself is not inspected.
    h2o.gbm(x=sliced[1:8], y=sliced[0])
def cv_nfoldsGBM():
  """Check that nfolds and validation data can be supplied together.

  Trains a 5-fold bernoulli GBM on the prostate data (column 1 converted to
  a factor and used as the response), then trains it again with validation
  data as well. The second call must NOT raise ``EnvironmentError``.

  Raises:
    AssertionError: if the combined nfolds + validation call raises
      ``EnvironmentError``.
  """
  prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  prostate[1] = prostate[1].asfactor()
  prostate.summary()

  prostate_gbm = h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds = 5, distribution="bernoulli")
  prostate_gbm.show()

  # Can specify both nfolds >= 2 and validation data at once
  try:
    h2o.gbm(y=prostate[1], x=prostate[2:9], nfolds=5, validation_y=prostate[1], validation_x=prostate[2:9], distribution="bernoulli")
  except EnvironmentError as err:
    # The old message said "expected an error", which is the opposite of what this test checks.
    assert False, "expected no error when both nfolds and validation data are given, but got: {0}".format(err)
Beispiel #30
0
def ntrain():
    """Train GBM, deep learning and random forest models and write a blended submission.

    Builds the training matrix from the weather and training data, fits the
    three H2O models on columns 1..38 against the boolean response, scores
    the test matrix and writes ``wnvh.csv`` with columns ``Id`` and
    ``WnvPresent``. The blend is 0.35 * DL + 0.3 * RF + 0.35 * GBM.
    """
    h2o.init(ip="zurich.h2o.ai",strict_version_check=False)
    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)

    # H2OFrame is fed plain Python lists, so every row is converted first.
    train_rows = [row.tolist() for row in X]
    y = np.asarray(y, dtype='bool_')

    xtr = H2OFrame(python_obj=train_rows)
    ytr = H2OFrame(python_obj=y.tolist())

    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39], y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)

    dl = h2o.deeplearning(x=xtr[1:39], y=ytr['C40'],
                          variable_importances=True, balance_classes=True,
                          input_dropout_ratio=0.2, rho=0.899,
                          hidden_dropout_ratios=[0.4, 0.4, 0.4, 0.4],
                          activation="Tanh", hidden=[39, 325, 325, 1], epochs=100)

    rf = h2o.random_forest(x=xtr[1:39], y=ytr['C40'],
                           seed=1234, ntrees=600,
                           max_depth=20, balance_classes=False)

    # The test matrix is normalized with the mean/std returned for the training matrix.
    testing = load_testing()
    X_test = assemble_X(testing, weather)
    normalize(X_test, mean, std)

    test_rows = [row.tolist() for row in X_test]
    xts = H2OFrame(python_obj=test_rows)

    dl_pred = dl.predict(xts)
    rf_pred = rf.predict(xts)
    gbm_pred = gb.predict(xts)

    blended = dl_pred*0.35+rf_pred*0.3+gbm_pred*0.35

    blended_local = h2o.as_list(blended)
    row_count = blended.nrow()
    # 1-based row ids as a single column.
    ids = np.arange(1, row_count + 1).reshape(row_count, 1)
    id_frame = pd.DataFrame(ids)
    # getattr(..., "True") is the attribute access `.True`, spelled so it also parses on Python 3.
    submission = pd.concat([id_frame, getattr(blended_local, "True")], axis=1)
    submission.columns = ['Id', 'WnvPresent']
    submission.to_csv("wnvh.csv", index=False)
Beispiel #31
0
def gbm_mean_residual_deviance(ip, port):
    """Verify that the GBM mean residual deviance accessors return floats.

    The cars data is split at random (> 0.2 training, <= 0.2 validation), a
    3-fold GBM is trained on ``economy`` and the train/valid/xval metrics are
    type-checked. ``ip`` and ``port`` are not used in the body.
    """
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    uniform = cars[0].runif()
    train = cars[uniform > 0.2]
    valid = cars[uniform <= 0.2]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    gbm = h2o.gbm(
        x=train[predictors],
        y=train[response_col],
        validation_x=valid[predictors],
        validation_y=valid[response_col],
        nfolds=3,
    )
    gbm_mrd = gbm.mean_residual_deviance(train=True, valid=True, xval=True)

    def require_float(split, message):
        # Fail with the given message, filled in with the actual type, unless the metric is a float.
        value = gbm_mrd[split]
        assert isinstance(value, float), message.format(type(value))

    require_float('train', "Expected training mean residual deviance to be a float, but got {0}")
    require_float('valid', "Expected validation mean residual deviance to be a float, but got {0}")
    require_float('xval', "Expected cross-validation mean residual deviance to be a float, but got {0}")
Beispiel #32
0
def split_fit_predict(data):
    """Split ``data`` into train/test/holdout, fit a GBM and a GLM, and print R2 for each.

    Rows are assigned by a uniform random draw: < 0.6 training, [0.6, 0.9)
    test, >= 0.9 holdout. Both models predict ``bikes`` from all remaining
    columns and use the test split as validation data.
    """
    def emit(*parts):
        # Print the parts separated by single spaces.
        print(" ".join(str(part) for part in parts))

    # Classic Test/Train split
    draw = data['Days'].runif()  # Random UNIForm numbers, one per row
    train = data[draw < 0.6]
    test = data[(0.6 <= draw) & (draw < 0.9)]
    hold = data[0.9 <= draw]
    emit("Training data has", train.ncol(), "columns and", train.nrow(),
         "rows, test has", test.nrow(), "rows, holdout has", hold.nrow())

    # Run GBM
    gbm = h2o.gbm(x=train.drop("bikes"), y=train["bikes"],
                  validation_x=test.drop("bikes"), validation_y=test["bikes"],
                  ntrees=500,  # 500 works well
                  max_depth=6, min_rows=10, nbins=20, learn_rate=0.1)

    # Run GLM
    glm = h2o.glm(x=train.drop("bikes"), y=train["bikes"],
                  validation_x=test.drop("bikes"), validation_y=test["bikes"],
                  dropNA20Cols=True)

    def r2_per_split(model):
        # R2 on the training, test and holdout frames, in that order.
        return [model.model_performance(part).r2() for part in (train, test, hold)]

    # ----------
    # 4- Score on holdout set & report
    gbm_train_r2, gbm_test_r2, gbm_hold_r2 = r2_per_split(gbm)
    emit("GBM R2 TRAIN=", gbm_train_r2, ", R2 TEST=", gbm_test_r2, ", R2 HOLDOUT=", gbm_hold_r2)

    glm_train_r2, glm_test_r2, glm_hold_r2 = r2_per_split(glm)
    emit("GLM R2 TRAIN=", glm_train_r2, ", R2 TEST=", glm_test_r2, ", R2 HOLDOUT=", glm_hold_r2)
def pubdev_1431(ip, port):
    """Score the HDFS airlines-billion dataset and download the predictions as CSV.

    Runs only when ``tests.is_running_internal_to_h2o()`` is true; otherwise
    a message is printed and nothing else happens. The downloaded CSV is
    removed again right away. ``ip`` and ``port`` are not used in the body.
    """
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    dataset_path = "/datasets/airlinesbillion.csv"
    airlines_billion = h2o.import_file("hdfs://{0}{1}".format(name_node, dataset_path))

    # Column 30 is the response and is made categorical; columns 0..29 are the predictors.
    airlines_billion[30] = airlines_billion[30].asfactor()
    stump = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30],
                    ntrees=1, distribution="bernoulli", max_depth=1)
    predictions = stump.predict(airlines_billion)

    # Download into the current working directory, then clean up.
    csv = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(predictions, csv)
    os.remove(csv)
def prep_airlines(ip,port):
    """Derive a binary departure-delay response on the airlines data and fit a GBM.

    Rows with a missing ``DepDelay`` are dropped, a column flagging delays
    above 15 minutes is appended, converted to a factor and named
    ``SynthDepDelayed``, and a bernoulli GBM is trained on it.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    air = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    # Only the column count is used below: it is the index of the appended column.
    _, numCols = air.dim()

    y_col = "SynthDepDelayed"
    x_cols = ["Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime", "UniqueCarrier", "CRSElapsedTime", "Origin", "Dest", "Distance"]

    delays_known = air[air["DepDelay"].isna() == 0]
    print("Dimensions of new dataset: {0}".format(delays_known.dim()))

    minutesOfDelayWeTolerate = 15
    # NOTE(review): the result of cbind is discarded and the next lines read index numCols from the
    # same frame, so this relies on cbind appending in place -- confirm for the h2o version in use.
    delays_known.cbind(delays_known["DepDelay"] > minutesOfDelayWeTolerate)
    delays_known[numCols] = delays_known[numCols].asfactor()
    delays_known._vecs[numCols].setName(y_col)

    model = h2o.gbm(x=delays_known[x_cols], y=delays_known[y_col], distribution="bernoulli")
    model.show()
def deepLearningDemo(ip, port):
    """Train a GBM and a deep learning model on the ecology data and show both.

    The training frame has ``Site`` dropped; ``Angaus`` is the categorical
    response in both the training and the evaluation frame. Both models use
    columns from index 1 on as predictors and the evaluation frame as
    validation data.
    """
    h2o.init(ip, port)

    # Training data
    train_data = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/ecology_model.csv")).drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print(train_data.describe())
    train_data.head()

    # Testing data
    test_data = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run GBM
    gbm_model = h2o.gbm(x=train_data[1:], y=train_data['Angaus'],
                        validation_x=test_data[1:], validation_y=test_data['Angaus'],
                        ntrees=100, distribution="bernoulli")
    gbm_model.show()

    # Run DeepLearning
    dl_model = h2o.deeplearning(x=train_data[1:], y=train_data['Angaus'],
                                validation_x=test_data[1:], validation_y=test_data['Angaus'],
                                loss='CrossEntropy', epochs=1000, hidden=[20, 20, 20])
    dl_model.show()
def fiftycatGBM(ip, port):
    """Train a GBM on the 50-category training file and score the matching test file.

    Data description: the training set has only 45 categories (cat1 through
    cat45) while the test set has all 50 (cat1 through cat50).
    """
    # Connect to h2o
    h2o.init(ip, port)

    train_csv = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train = h2o.import_frame(path=train_csv)
    train["y"] = train["y"].asfactor()

    # H2O GBM with ntrees = 10, max_depth = 5, nbins = 20
    model = h2o.gbm(x=train[["x1", "x2"]], y=train["y"], loss="bernoulli",
                    ntrees=10, max_depth=5, nbins=20)
    model.show()

    test_csv = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test = h2o.import_frame(path=test_csv)

    # Predict on the test dataset
    scored = model.predict(test)
    scored.show()

    # The confusion matrices and AUC are computed to exercise the calls; their values are not checked.
    performance = model.model_performance(test)
    performance.confusion_matrices()
    performance.auc()
def smallcatGBM(ip, port):
    """Fit a single depth-1 tree on the alphabet data with H2O GBM and scikit-learn.

    Data description: the training set has 26 categories from A to Z;
    categories A, C, E, G, ... are perfect predictors of y = 1 and
    categories B, D, F, H, ... are perfect predictors of y = 0.
    """
    # Connect to h2o
    h2o.init(ip, port)

    alphabet = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()

    def category_ordinal(field):
        # Take the text between the first pair of double quotes and return its ordinal.
        return ord(field.split("\"")[1])

    # Prepare data for scikit use: column 0 is the feature, column 1 the response.
    trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                           delimiter=',', skiprows=1, converters={0: category_ordinal})
    trainDataResponse = trainData[:, 1]
    trainDataFeatures = trainData[:, 0]

    # H2O GBM (naive split): ntrees = 1, max_depth = 1, nbins = 100
    h2o_model = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], loss="bernoulli",
                        ntrees=1, max_depth=1, nbins=100)
    h2o_model.show()

    # scikit GBM with one estimator of depth 1
    sci_model = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    sci_model.fit(trainDataFeatures[:, np.newaxis], trainDataResponse)
Beispiel #38
0
def plot_test(ip, port):
    """Plot ROC curves of an airlines GBM for training, validation and test data.

    The training file is split roughly 80/20 at random into training and
    validation frames. Every plot call receives ``server=True``.
    """
    # Connect to h2o
    h2o.init(ip, port)
    plot_options = {'server': True}

    air = h2o.import_frame(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    draw = air[0].runif()
    air_train = air[draw <= 0.8]
    air_valid = air[draw > 0.8]

    myY = "IsDepDelayed"
    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY],
                      validation_x=air_valid[myX], validation_y=air_valid[myY],
                      distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **plot_options)
    air_gbm.plot(type="roc", valid=True, **plot_options)

    # Plot ROC for the separate test file
    air_test = h2o.import_frame(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    test_perf = air_gbm.model_performance(air_test)
    test_perf.plot(type="roc", **plot_options)
Beispiel #39
0
def irisGBM(ip, port):
    """Train a multinomial GBM on the iris data and show its performance.

    The same frame is used for training and validation; ``class`` is the
    response and columns 1..3 are the predictors. ``ip`` and ``port`` are
    not used in the body.
    """
    # Import training data
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.describe()

    # Run GBM
    iris_gbm = h2o.gbm(y=iris["class"], validation_y=iris["class"],
                       x=iris[1:4], validation_x=iris[1:4],
                       ntrees=50, learn_rate=0.1, distribution="multinomial")
    iris_gbm.show()

    # Metrics are computed on the training frame itself.
    metrics = iris_gbm.model_performance(iris)
    metrics.show()
def offset_gaussian(ip, port):
    """Compare a gaussian GBM with a log(Holders) offset to R's gbm on the insurance data.

    Trains 600 depth-1 trees with learn rate 0.1 and checks ``init_f``, the
    model MSE and the mean/min/max of the predictions against reference
    values, each with its own absolute tolerance.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    insurance = h2o.import_frame(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #	fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #   data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    init_f = gbm._model_json['output']['init_f']
    assert abs(44.33016 - init_f) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(44.33016, init_f)

    mse = gbm.mse()
    assert abs(1491.135 - mse) < 1e-3, \
        "expected mse to be {0}, but got {1}".format(1491.135, mse)

    pred_mean = predictions.mean()
    assert abs(49.23438 - pred_mean) < 1e-3, \
        "expected prediction mean to be {0}, but got {1}".format(49.23438, pred_mean)

    pred_min = predictions.min()
    assert abs(-45.54382 - pred_min) < 1e-1, \
        "expected prediction min to be {0}, but got {1}".format(-45.54382, pred_min)

    pred_max = predictions.max()
    assert abs(207.348 - pred_max) < 1e-1, \
        "expected prediction max to be {0}, but got {1}".format(207.348, pred_max)
def pubdev_1431():
    """Score the HDFS airlines-billion dataset and download the predictions as CSV.

    Imports ``/datasets/airlinesbillion.csv`` from the Hadoop name node,
    trains a single depth-1 bernoulli tree with column 30 as the factor
    response and columns 0..29 as predictors, downloads the predictions to
    ``delete.csv`` in the current working directory and removes that file.

    Raises:
        EnvironmentError: if the Hadoop name node is not accessible.
    """
    if not pyunit_utils.hadoop_namenode_is_accessible():
        # Raise an exception instance. The previous `raise (EnvironmentError, "...")` raised a
        # tuple: Python 2 silently drops the message and Python 3 fails with TypeError.
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    airlines_billion_file = "/datasets/airlinesbillion.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
    airlines_billion = h2o.import_file(url)
    airlines_billion[30] = airlines_billion[30].asfactor()
    gbm = h2o.gbm(x=airlines_billion[0:30],
                  y=airlines_billion[30],
                  ntrees=1,
                  distribution="bernoulli",
                  max_depth=1)
    predictions = gbm.predict(airlines_billion)

    # The download itself is what is exercised; the file is removed right away.
    csv = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(predictions, csv)
    os.remove(csv)
def confusion_matrices_check(ip, port):
    """Check that the confusion-matrix counts at the max-F1 threshold sum to the row count.

    Builds a 20-row frame (ten ``[1, 'a']`` rows followed by ten ``[0, 'b']``
    rows), trains a one-tree bernoulli GBM and adds up tps, tns, fps and fns.
    ``ip`` and ``port`` are not used in the body.
    """
    local_data = [[1, 'a'] for _ in range(10)] + [[0, 'b'] for _ in range(10)]
    h2o_data = h2o.H2OFrame(python_obj=local_data)
    h2o_data.setNames(['response', 'predictor'])
    h2o_data.show()

    gbm = h2o.gbm(x=h2o_data[1:], y=h2o_data["response"].asfactor(),
                  ntrees=1, distribution="bernoulli")
    gbm.show()
    perf = gbm.model_performance()

    def count_at_max_f1(metric_name):
        # Value of the named metric at the threshold that maximizes F1.
        threshold = perf.find_threshold_by_max_metric("f1")
        return perf.metric(metric_name, [threshold])[0][1]

    tps = count_at_max_f1("tps")
    tns = count_at_max_f1("tns")
    fps = count_at_max_f1("fps")
    fns = count_at_max_f1("fns")

    total = tps + tns + fps + fns
    assert total == 20, "incorrect confusion matrix computation: tps: {0}, fps: {1}, tns: {2}, fns: " \
                        "{3}. Should sum to 20.".format(tps, fps, tns, fns)
def prostateGBM():
    """Train a bernoulli GBM on the prostate data and show its performance.

    ``ID`` is dropped, zeros in ``VOL`` and ``GLEASON`` are replaced with
    missing values, and ``CAPSULE`` is the categorical response. The same
    frame serves as training and validation data.
    """
    raw = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    raw.describe()

    # Remove ID from training frame
    train = raw.drop("ID")

    # For VOL & GLEASON, a zero really means "missing"
    for column_name in ('VOL', 'GLEASON'):
        column = train[column_name]
        column[column == 0] = None

    # Convert CAPSULE to a logical factor
    train['CAPSULE'] = train['CAPSULE'].asfactor()

    # See that the data is ready
    train.describe()

    # Run GBM
    capsule_gbm = h2o.gbm(y=train["CAPSULE"], validation_y=train["CAPSULE"],
                          x=train[1:], validation_x=train[1:],
                          ntrees=50, learn_rate=0.1, distribution="bernoulli")
    capsule_gbm.show()

    # Metrics are computed on the training frame itself.
    metrics = capsule_gbm.model_performance(train)
    metrics.show()
def offset_bernoulli_cars():
    """Check an offset bernoulli GBM on the cars data against values from R's gbm.

    Rows with a missing ``economy_20mpg`` are removed, a constant 0.5 column
    ``x1`` is appended and used as the offset of one depth-1 tree. ``init_f``
    and the mean/min/max of prediction column 2 must match the reference
    values to within 1e-6.
    """
    def require_close(expected, actual, message):
        # Fail with the formatted message when the values differ by 1e-6 or more.
        assert abs(expected - actual) < 1e-6, message.format(expected, actual)

    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()

    # 398 constant offsets, mirroring rep(.5,398) in the R reference below.
    constant_offset = h2o.H2OFrame([[.5] * 398])
    constant_offset.set_names(["x1"])
    cars = cars.cbind(constant_offset)

    gbm = h2o.gbm(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli",
                  ntrees=1, max_depth=1, min_rows=1, learn_rate=1,
                  offset_column="x1", training_frame=cars)

    predictions = gbm.predict(cars)

    # Comparison result generated from R's gbm:
    #	gg = gbm(formula = economy_20mpg~cylinders+displacement+power+weight+acceleration+year+offset(rep(.5,398)),
    #            distribution = "bernoulli",data = df,n.trees = 1,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,
    #            train.fraction = 1,bag.fraction = 1)
    #   pr = predict.gbm(object = gg,newdata = df,n.trees = 1,type = "link")
    #   pr = 1/(1+exp(-df$x1 - pr))
    require_close(-0.1041234, gbm._model_json['output']['init_f'],
                  "expected init_f to be {0}, but got {1}")
    require_close(0.577326, predictions[:,2].mean()[0],
                  "expected prediction mean to be {0}, but got {1}")
    require_close(0.1621461, predictions[:,2].min(),
                  "expected prediction min to be {0}, but got {1}")
    require_close(0.8506528, predictions[:,2].max(),
                  "expected prediction max to be {0}, but got {1}")
Beispiel #45
0
def offset_poisson():
    """Compare a poisson GBM with a log(Holders) offset to R's gbm on the insurance data.

    Trains 600 depth-1 trees with learn rate 0.1 and checks ``init_f`` and
    the mean/min/max of the predictions against reference values, each with
    its own absolute tolerance.
    """
    insurance = h2o.import_file(tests.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="poisson",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #           shrinkage = .1,bag.fraction = 1,train.fraction = 1, data = Insurance, distribution ="poisson",
    #           n.trees = 600)
    #link = predict.gbm(fit2, Insurance, n.trees=600, type="link")
    #link.offset = link + log(Insurance$Holders)
    ##for poisson
    #pr = exp(link.offset)
    init_f = gbm._model_json['output']['init_f']
    assert abs(-2.003262 - init_f) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(-2.003262, init_f)

    pred_mean = predictions.mean()
    assert abs(49.23437 - pred_mean) < 1e-4, \
        "expected prediction mean to be {0}, but got {1}".format(49.23437, pred_mean)

    pred_min = predictions.min()
    assert abs(1.077275 - pred_min) < 1e-4, \
        "expected prediction min to be {0}, but got {1}".format(1.077275, pred_min)

    pred_max = predictions.max()
    assert abs(398.0608 - pred_max) < 1e-2, \
        "expected prediction max to be {0}, but got {1}".format(398.0608, pred_max)
Beispiel #46
0
def bigcatGBM(ip, port):
    """Train a one-tree, depth-1 bernoulli GBM on the bigcat data and show its performance.

    ``y`` is the categorical response and ``X`` the only predictor. ``ip``
    and ``port`` are not used in the body.
    """
    data_path = h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_file(path=data_path)
    bigcat["y"] = bigcat["y"].asfactor()

    # H2O GBM with ntrees = 1, max_depth = 1, nbins = 100
    stump = h2o.gbm(x=bigcat[["X"]], y=bigcat["y"], distribution="bernoulli",
                    ntrees=1, max_depth=1, nbins=100)
    stump.show()

    # Performance is measured on the training frame itself.
    performance = stump.model_performance(bigcat)
    performance.show()

    # The AUC is computed to exercise the call; its value is not checked.
    performance.auc()
Beispiel #47
0
def metric_accessors(ip, port):

    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]

    # regression
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True, valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True, xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True, valid=True, xval=False)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True, valid=False, xval=True)
    assert "train" in mse.keys() and "xval" in mse.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True, valid=True, xval=True)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ) and "xval" in mse.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ) and isinstance(
        mse["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True, xval=True)
    assert "valid" in mse.keys() and "xval" in mse.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["valid"], float) and isinstance(
        mse["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["valid"]), type(mse["xval"]))

    #   r2
    r21 = gbm.r2(train=True, valid=False, xval=False)
    assert isinstance(r21, float)

    r22 = gbm.r2(train=False, valid=True, xval=False)
    assert isinstance(r22, float)

    r23 = gbm.r2(train=False, valid=False, xval=True)
    assert isinstance(r23, float)

    r2 = gbm.r2(train=True, valid=True, xval=False)
    assert "train" in r2.keys() and "valid" in r2.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["train"], float) and isinstance(
        r2["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(r2["train"]), type(r2["valid"]))
    assert r2["valid"] == r22

    r2 = gbm.r2(train=True, valid=False, xval=True)
    assert "train" in r2.keys() and "xval" in r2.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["train"], float) and isinstance(
        r2["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(r2["train"]), type(r2["xval"]))
    assert r2["xval"] == r23

    r2 = gbm.r2(train=True, valid=True, xval=True)
    assert "train" in r2.keys() and "valid" in r2.keys() and "xval" in r2.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["train"], float) and isinstance(
        r2["valid"], float
    ) and isinstance(
        r2["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(r2["train"]), type(r2["valid"]), type(r2["xval"]))

    r2 = gbm.r2(train=False, valid=False,
                xval=False)  # default: return training metrics
    assert isinstance(r2, float)
    assert r2 == r21

    r2 = gbm.r2(train=False, valid=True, xval=True)
    assert "valid" in r2.keys() and "xval" in r2.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["valid"], float) and isinstance(
        r2["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(r2["valid"]), type(r2["xval"]))

    #   mean_residual_deviance
    mean_residual_deviance1 = gbm.mean_residual_deviance(train=True,
                                                         valid=False,
                                                         xval=False)
    assert isinstance(mean_residual_deviance1, float)

    mean_residual_deviance2 = gbm.mean_residual_deviance(train=False,
                                                         valid=True,
                                                         xval=False)
    assert isinstance(mean_residual_deviance2, float)

    mean_residual_deviance3 = gbm.mean_residual_deviance(train=False,
                                                         valid=False,
                                                         xval=True)
    assert isinstance(mean_residual_deviance3, float)

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=True,
                                                        xval=False)
    assert "train" in mean_residual_deviance.keys(
    ) and "valid" in mean_residual_deviance.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["valid"]))
    assert mean_residual_deviance["valid"] == mean_residual_deviance2

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=False,
                                                        xval=True)
    assert "train" in mean_residual_deviance.keys(
    ) and "xval" in mean_residual_deviance.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["xval"]))
    assert mean_residual_deviance["xval"] == mean_residual_deviance3

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=True,
                                                        xval=True)
    assert "train" in mean_residual_deviance.keys(
    ) and "valid" in mean_residual_deviance.keys(
    ) and "xval" in mean_residual_deviance.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["valid"], float
    ) and isinstance(
        mean_residual_deviance["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["valid"]),
        type(mean_residual_deviance["xval"]))

    mean_residual_deviance = gbm.mean_residual_deviance(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    assert isinstance(mean_residual_deviance, float)
    assert mean_residual_deviance == mean_residual_deviance1

    mean_residual_deviance = gbm.mean_residual_deviance(train=False,
                                                        valid=True,
                                                        xval=True)
    assert "valid" in mean_residual_deviance.keys(
    ) and "xval" in mean_residual_deviance.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["valid"], float) and isinstance(
        mean_residual_deviance["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["valid"]),
        type(mean_residual_deviance["xval"]))

    # binomial
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   auc
    auc1 = gbm.auc(train=True, valid=False, xval=False)
    assert isinstance(auc1, float)

    auc2 = gbm.auc(train=False, valid=True, xval=False)
    assert isinstance(auc2, float)

    auc3 = gbm.auc(train=False, valid=False, xval=True)
    assert isinstance(auc3, float)

    auc = gbm.auc(train=True, valid=True, xval=False)
    assert "train" in auc.keys() and "valid" in auc.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["train"], float) and isinstance(
        auc["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(auc["train"]), type(auc["valid"]))
    assert auc["valid"] == auc2

    auc = gbm.auc(train=True, valid=False, xval=True)
    assert "train" in auc.keys() and "xval" in auc.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["train"], float) and isinstance(
        auc["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(auc["train"]), type(auc["xval"]))
    assert auc["xval"] == auc3

    auc = gbm.auc(train=True, valid=True, xval=True)
    assert "train" in auc.keys() and "valid" in auc.keys(
    ) and "xval" in auc.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["train"], float) and isinstance(
        auc["valid"], float
    ) and isinstance(
        auc["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(auc["train"]), type(auc["valid"]), type(auc["xval"]))

    auc = gbm.auc(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(auc, float)
    assert auc == auc1

    auc = gbm.auc(train=False, valid=True, xval=True)
    assert "valid" in auc.keys() and "xval" in auc.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["valid"], float) and isinstance(
        auc["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(auc["valid"]), type(auc["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True, valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True, xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True, valid=True, xval=False)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True, valid=False, xval=True)
    assert "train" in logloss.keys() and "xval" in logloss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True, valid=True, xval=True)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ) and "xval" in logloss.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ) and isinstance(
        logloss["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False,
                          xval=False)  # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True, xval=True)
    assert "valid" in logloss.keys() and "xval" in logloss.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["valid"], float) and isinstance(
        logloss["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["valid"]), type(logloss["xval"]))

    #   giniCoef
    giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False)
    assert isinstance(giniCoef1, float)

    giniCoef2 = gbm.giniCoef(train=False, valid=True, xval=False)
    assert isinstance(giniCoef2, float)

    giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True)
    assert isinstance(giniCoef3, float)

    giniCoef = gbm.giniCoef(train=True, valid=True, xval=False)
    assert "train" in giniCoef.keys() and "valid" in giniCoef.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["train"]), type(giniCoef["valid"]))
    assert giniCoef["valid"] == giniCoef2

    giniCoef = gbm.giniCoef(train=True, valid=False, xval=True)
    assert "train" in giniCoef.keys() and "xval" in giniCoef.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["train"]), type(giniCoef["xval"]))
    assert giniCoef["xval"] == giniCoef3

    giniCoef = gbm.giniCoef(train=True, valid=True, xval=True)
    assert "train" in giniCoef.keys() and "valid" in giniCoef.keys(
    ) and "xval" in giniCoef.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["valid"], float
    ) and isinstance(
        giniCoef["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(giniCoef["train"]), type(giniCoef["valid"]),
        type(giniCoef["xval"]))

    giniCoef = gbm.giniCoef(train=False, valid=False,
                            xval=False)  # default: return training metrics
    assert isinstance(giniCoef, float)
    assert giniCoef == giniCoef1

    giniCoef = gbm.giniCoef(train=False, valid=True, xval=True)
    assert "valid" in giniCoef.keys() and "xval" in giniCoef.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["valid"], float) and isinstance(
        giniCoef["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["valid"]), type(giniCoef["xval"]))

    #   F1
    F11 = gbm.F1(train=True, valid=False, xval=False)
    F12 = gbm.F1(train=False, valid=True, xval=False)
    F13 = gbm.F1(train=False, valid=False, xval=True)
    F1 = gbm.F1(train=True, valid=True, xval=False)
    F1 = gbm.F1(train=True, valid=False, xval=True)
    F1 = gbm.F1(train=True, valid=True, xval=True)
    F1 = gbm.F1(train=False, valid=False,
                xval=False)  # default: return training metrics
    F1 = gbm.F1(train=False, valid=True, xval=True)

    #   F0point5
    F0point51 = gbm.F0point5(train=True, valid=False, xval=False)
    F0point52 = gbm.F0point5(train=False, valid=True, xval=False)
    F0point53 = gbm.F0point5(train=False, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True, valid=True, xval=False)
    F0point5 = gbm.F0point5(train=True, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True, valid=True, xval=True)
    F0point5 = gbm.F0point5(train=False, valid=False,
                            xval=False)  # default: return training metrics
    F0point5 = gbm.F0point5(train=False, valid=True, xval=True)

    #   F2
    F21 = gbm.F2(train=True, valid=False, xval=False)
    F22 = gbm.F2(train=False, valid=True, xval=False)
    F23 = gbm.F2(train=False, valid=False, xval=True)
    F2 = gbm.F2(train=True, valid=True, xval=False)
    F2 = gbm.F2(train=True, valid=False, xval=True)
    F2 = gbm.F2(train=True, valid=True, xval=True)
    F2 = gbm.F2(train=False, valid=False,
                xval=False)  # default: return training metrics
    F2 = gbm.F2(train=False, valid=True, xval=True)

    #   accuracy
    accuracy1 = gbm.accuracy(train=True, valid=False, xval=False)
    accuracy2 = gbm.accuracy(train=False, valid=True, xval=False)
    accuracy3 = gbm.accuracy(train=False, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True, valid=True, xval=False)
    accuracy = gbm.accuracy(train=True, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True, valid=True, xval=True)
    accuracy = gbm.accuracy(train=False, valid=False,
                            xval=False)  # default: return training metrics
    accuracy = gbm.accuracy(train=False, valid=True, xval=True)

    #   error
    error1 = gbm.error(train=True, valid=False, xval=False)
    error2 = gbm.error(train=False, valid=True, xval=False)
    error3 = gbm.error(train=False, valid=False, xval=True)
    error = gbm.error(train=True, valid=True, xval=False)
    error = gbm.error(train=True, valid=False, xval=True)
    error = gbm.error(train=True, valid=True, xval=True)
    error = gbm.error(train=False, valid=False,
                      xval=False)  # default: return training metrics
    error = gbm.error(train=False, valid=True, xval=True)

    #   precision
    precision1 = gbm.precision(train=True, valid=False, xval=False)
    precision2 = gbm.precision(train=False, valid=True, xval=False)
    precision3 = gbm.precision(train=False, valid=False, xval=True)
    precision = gbm.precision(train=True, valid=True, xval=False)
    precision = gbm.precision(train=True, valid=False, xval=True)
    precision = gbm.precision(train=True, valid=True, xval=True)
    precision = gbm.precision(train=False, valid=False,
                              xval=False)  # default: return training metrics
    precision = gbm.precision(train=False, valid=True, xval=True)

    #   mcc
    mcc1 = gbm.mcc(train=True, valid=False, xval=False)
    mcc2 = gbm.mcc(train=False, valid=True, xval=False)
    mcc3 = gbm.mcc(train=False, valid=False, xval=True)
    mcc = gbm.mcc(train=True, valid=True, xval=False)
    mcc = gbm.mcc(train=True, valid=False, xval=True)
    mcc = gbm.mcc(train=True, valid=True, xval=True)
    mcc = gbm.mcc(train=False, valid=False,
                  xval=False)  # default: return training metrics
    mcc = gbm.mcc(train=False, valid=True, xval=True)

    #   max_per_class_error
    max_per_class_error1 = gbm.max_per_class_error(train=True,
                                                   valid=False,
                                                   xval=False)
    max_per_class_error2 = gbm.max_per_class_error(train=False,
                                                   valid=True,
                                                   xval=False)
    max_per_class_error3 = gbm.max_per_class_error(train=False,
                                                   valid=False,
                                                   xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=True,
                                                  xval=False)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=False,
                                                  xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=True,
                                                  xval=True)
    max_per_class_error = gbm.max_per_class_error(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    max_per_class_error = gbm.max_per_class_error(train=False,
                                                  valid=True,
                                                  xval=True)

    #   confusion_matrix
    confusion_matrix1 = gbm.confusion_matrix(train=True,
                                             valid=False,
                                             xval=False)
    confusion_matrix2 = gbm.confusion_matrix(train=False,
                                             valid=True,
                                             xval=False)
    confusion_matrix3 = gbm.confusion_matrix(train=False,
                                             valid=False,
                                             xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True)
    confusion_matrix = gbm.confusion_matrix(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True)

    # #   plot
    # plot1 = gbm.plot(train=True,  valid=False, xval=False)
    # plot2 = gbm.plot(train=False, valid=True,  xval=False)
    # plot3 = gbm.plot(train=False, valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=False)
    # plot = gbm.plot(train=True,  valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=True)
    # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics
    # plot = gbm.plot(train=False, valid=True,  xval=True)

    # #   tpr
    # tpr1 = gbm.tpr(train=True,  valid=False, xval=False)
    # tpr2 = gbm.tpr(train=False, valid=True,  xval=False)
    # tpr3 = gbm.tpr(train=False, valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=False)
    # tpr = gbm.tpr(train=True,  valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=True)
    # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics
    # tpr = gbm.tpr(train=False, valid=True,  xval=True)
    #
    # #   tnr
    # tnr1 = gbm.tnr(train=True,  valid=False, xval=False)
    # tnr2 = gbm.tnr(train=False, valid=True,  xval=False)
    # tnr3 = gbm.tnr(train=False, valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=False)
    # tnr = gbm.tnr(train=True,  valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=True)
    # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics
    # tnr = gbm.tnr(train=False, valid=True,  xval=True)
    #
    # #   fnr
    # fnr1 = gbm.fnr(train=True,  valid=False, xval=False)
    # fnr2 = gbm.fnr(train=False, valid=True,  xval=False)
    # fnr3 = gbm.fnr(train=False, valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=False)
    # fnr = gbm.fnr(train=True,  valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=True)
    # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics
    # fnr = gbm.fnr(train=False, valid=True,  xval=True)
    #
    # #   fpr
    # fpr1 = gbm.fpr(train=True,  valid=False, xval=False)
    # fpr2 = gbm.fpr(train=False, valid=True,  xval=False)
    # fpr3 = gbm.fpr(train=False, valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=False)
    # fpr = gbm.fpr(train=True,  valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=True)
    # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics
    # fpr = gbm.fpr(train=False, valid=True,  xval=True)

    # multinomial
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    distribution = "multinomial"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True, valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True, xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True, valid=True, xval=False)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True, valid=False, xval=True)
    assert "train" in mse.keys() and "xval" in mse.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True, valid=True, xval=True)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ) and "xval" in mse.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ) and isinstance(
        mse["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True, xval=True)
    assert "valid" in mse.keys() and "xval" in mse.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["valid"], float) and isinstance(
        mse["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["valid"]), type(mse["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True, valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True, xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True, valid=True, xval=False)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True, valid=False, xval=True)
    assert "train" in logloss.keys() and "xval" in logloss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True, valid=True, xval=True)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ) and "xval" in logloss.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ) and isinstance(
        logloss["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False,
                          xval=False)  # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True, xval=True)
    assert "valid" in logloss.keys() and "xval" in logloss.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["valid"], float) and isinstance(
        logloss["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["valid"]), type(logloss["xval"]))

    #   hit_ratio_table
    hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False)
    hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False)
    hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True)

    # clustering
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3)

    #   betweenss
    betweenss1 = km.betweenss(train=True, valid=False, xval=False)
    assert isinstance(betweenss1, float)

    betweenss3 = km.betweenss(train=False, valid=False, xval=True)
    assert isinstance(betweenss3, float)

    betweenss = km.betweenss(train=True, valid=False, xval=True)
    assert "train" in betweenss.keys() and "xval" in betweenss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        betweenss.keys())
    assert len(
        betweenss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        betweenss.keys())
    assert isinstance(betweenss["train"], float) and isinstance(
        betweenss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(betweenss["train"]), type(betweenss["xval"]))
    assert betweenss["xval"] == betweenss3

    betweenss = km.betweenss(train=False, valid=False,
                             xval=False)  # default: return training metrics
    assert isinstance(betweenss, float)
    assert betweenss == betweenss1

    #   totss
    totss1 = km.totss(train=True, valid=False, xval=False)
    assert isinstance(totss1, float)

    totss3 = km.totss(train=False, valid=False, xval=True)
    assert isinstance(totss3, float)

    totss = km.totss(train=True, valid=False, xval=True)
    assert "train" in totss.keys() and "xval" in totss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        totss.keys())
    assert len(
        totss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        totss.keys())
    assert isinstance(totss["train"], float) and isinstance(
        totss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(totss["train"]), type(totss["xval"]))
    assert totss["xval"] == totss3

    totss = km.totss(train=False, valid=False,
                     xval=False)  # default: return training metrics
    assert isinstance(totss, float)
    assert totss == totss1

    #   tot_withinss
    tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False)
    assert isinstance(tot_withinss1, float)

    tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True)
    assert isinstance(tot_withinss3, float)

    tot_withinss = km.tot_withinss(train=True, valid=False, xval=True)
    assert "train" in tot_withinss.keys() and "xval" in tot_withinss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        tot_withinss.keys())
    assert len(
        tot_withinss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        tot_withinss.keys())
    assert isinstance(tot_withinss["train"], float) and isinstance(
        tot_withinss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(tot_withinss["train"]), type(tot_withinss["xval"]))
    assert tot_withinss["xval"] == tot_withinss3

    tot_withinss = km.tot_withinss(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    assert isinstance(tot_withinss, float)
    assert tot_withinss == tot_withinss1

    #   withinss
    withinss1 = km.withinss(train=True, valid=False, xval=False)
    withinss3 = km.withinss(train=False, valid=False, xval=True)
    withinss = km.withinss(train=True, valid=False, xval=True)
    withinss = km.withinss(train=False, valid=False,
                           xval=False)  # default: return training metrics

    #   centroid_stats
    centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False)
    centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True)
    centroid_stats = km.centroid_stats(train=True, valid=False, xval=True)
    centroid_stats = km.centroid_stats(
        train=False, valid=False,
        xval=False)  # default: return training metrics

    #   size
    size1 = km.size(train=True, valid=False, xval=False)
    size3 = km.size(train=False, valid=False, xval=True)
    size = km.size(train=True, valid=False, xval=True)
    size = km.size(train=False, valid=False,
                   xval=False)  # default: return training metrics
Beispiel #48
0
def cars_checkpoint():
    """Check that a GBM resumed from a saved checkpoint matches a one-shot build.

    The cars data is uploaded and split at random into training and
    validation frames, and one of three problem types is drawn at random
    (0: regression, 1: binomial, 2: multinomial).  A 5-tree model is built,
    saved to disk and loaded back; the restored model then serves as the
    checkpoint for a 10-tree model (model 2) and a 60-tree model (model 3).
    Model 2 must report exactly the same validation metrics as a 10-tree
    model built in one shot (model 4).

    Raises:
        AssertionError: if model 2 and model 4 differ in type or in any
            validation metric compared for the drawn problem type.
    """
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(list(range(3)), 1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     score_each_iteration=True,
                     distribution=distribution,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])

    # save the model, then load the model; the "delete_model" directory is
    # removed even when loading fails so a broken run leaves nothing behind
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    try:
        restored_model = h2o.load_model(model_path)
    finally:
        shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 2) with the following parameters:"
    )
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=restored_model._id)

    # continue building the model, but with different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 3) with the following parameters:"
    )
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))
    model3 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees3,
                     max_depth=max_depth3,
                     min_rows=min_rows3,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    print(
        "\n*** Building the equivalent of model 2 (called model 4) in one shot:"
    )
    model4 = h2o.gbm(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col])

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.scoring_history())
    print("\n*** Score History for model 3:")
    print(model3.scoring_history())
    print("\n*** Score History for model 4:")
    print(model4.scoring_history())

    # checks: validation metrics that must be identical between the
    # checkpointed model 2 and the one-shot model 4, keyed by problem type.
    # Each entry is (metric method name, failure message template).
    # Model 3 is only built and printed; no assertion is made about it.
    checks_by_problem = {
        0: [
            ("mse",
             "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}"),
        ],
        1: [
            ("auc",
             "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}"),
            ("logloss",
             "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}"),
            ("giniCoef",
             "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}"),
        ],
        2: [
            ("mse",
             "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}"),
            ("r2",
             "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}"),
        ],
    }

    assert isinstance(model2, type(model4))
    for metric_name, message in checks_by_problem[problem]:
        resumed = getattr(model2, metric_name)(valid=True)
        one_shot = getattr(model4, metric_name)(valid=True)
        assert resumed == one_shot, message.format(resumed, one_shot)
Beispiel #49
0
def ecologyGBM(ip, port):
    """Compare an H2O GBM with a scikit-learn GBM on the ecology data set.

    Both libraries fit a classifier for the "Angaus" column with the same
    settings (100 trees, depth 5, minimum 10 rows per leaf, learning rate
    0.1) and are then scored by AUC on the ecology evaluation file.

    Args:
        ip: not used by this function.
        port: not used by this function.

    Raises:
        AssertionError: if the H2O AUC is lower than the scikit-learn AUC.
    """
    model_csv = "smalldata/gbm_test/ecology_model.csv"
    eval_csv = "smalldata/gbm_test/ecology_eval.csv"
    response = "Angaus"
    feature_names = [
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"
    ]
    # the evaluation file is read as response + features; the model file is
    # read with an extra leading "Site" column
    eval_columns = (response,) + tuple(feature_names)
    model_columns = ("Site",) + eval_columns

    def read_structured(csv_path, column_names):
        # Parse a CSV into a numpy structured array with the given field
        # names, skipping the header row; 'NA' is declared as the missing
        # marker and nan as the fill value.
        return np.genfromtxt(csv_path,
                             delimiter=',',
                             dtype=None,
                             names=column_names,
                             skip_header=1,
                             missing_values='NA',
                             filling_values=np.nan)

    h2o_train = h2o.import_file(path=h2o.locate(model_csv))

    # shared hyper-parameters for both libraries
    n_trees, depth, leaf_rows, rate = 100, 5, 10, 0.1

    # training data in numpy form for scikit
    sk_train = read_structured(h2o.locate(model_csv), model_columns)
    sk_train_y = sk_train[response]
    sk_train_x = sk_train[feature_names]

    # H2O model: the response is converted to a factor and fit as bernoulli
    h2o_train[response] = h2o_train[response].asfactor()
    h2o_model = h2o.gbm(x=h2o_train[2:],
                        y=h2o_train[response],
                        ntrees=n_trees,
                        learn_rate=rate,
                        max_depth=depth,
                        min_rows=leaf_rows,
                        distribution="bernoulli")

    # scikit model
    sk_model = ensemble.GradientBoostingClassifier(learning_rate=rate,
                                                   n_estimators=n_trees,
                                                   max_depth=depth,
                                                   min_samples_leaf=leaf_rows,
                                                   max_features=None)
    sk_model.fit(sk_train_x[:, np.newaxis], sk_train_y)

    # evaluation data, once per library
    h2o_test = h2o.import_file(path=h2o.locate(eval_csv))
    sk_test = read_structured(h2o.locate(eval_csv), eval_columns)
    sk_test_y = sk_test[response]
    sk_test_x = sk_test[feature_names]

    # scikit AUC uses the second column of predict_proba
    sk_probabilities = sk_model.predict_proba(sk_test_x[:, np.newaxis])
    auc_sci = roc_auc_score(sk_test_y, sk_probabilities[:, 1])

    # H2O AUC
    auc_h2o = h2o_model.model_performance(h2o_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Beispiel #50
0
def cv_carsGBM(ip,port):
    """Exercise GBM cross-validation options on the cars data set.

    A problem type is drawn at random (0: regression, 1: binomial,
    2: multinomial) and the following are checked in turn:

    * repeated "Modulo" fold assignment gives matching models, while
      repeated "Random" fold assignment does not;
    * a user-supplied fold column yields one cross-validation model per fold
      and, with keep_cross_validation_predictions, one prediction set per fold;
    * boundary cases (leave-one-out, nfolds=0, cross-validation combined
      with a validation set) build without error;
    * invalid fold settings make the build fail with EnvironmentError.

    Args:
        ip: not used by this function.
        port: not used by this function.

    Raises:
        AssertionError: if any of the expectations above does not hold.
    """
    # read in the dataset and construct training set (and validation set)
    cars =  h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("Distribution: {0}".format(distribution))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3,10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    h2o.check_models(gbm1, gbm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3,10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    try:
        h2o.check_models(gbm1, gbm2, True)
    except AssertionError:
        # expected outcome: the comparison of the two Random runs fails
        pass
    else:
        # Raised outside the try block on purpose: inside it, the handler
        # above would swallow this failure and the check could never fail.
        raise AssertionError("Expected models to be different over repeated Random runs")

    # 3. folds_column
    num_folds = random.randint(2,5)
    fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0,num_folds-1)] for f in range(cars.nrow())])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], training_frame=cars, distribution=distribution, ntrees=5,
                  fold_column="fold_assignments", keep_cross_validation_predictions=True)
    num_cv_models = len(gbm._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                                    "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(gbm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(gbm._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(gbm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1),type(gbm))
    assert isinstance(cv_model2, type(gbm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2),type(gbm))

    # 4. keep_cross_validation_predictions
    # gbm1 was built without keep_cross_validation_predictions
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                                          "as folds, but got {0}".format(len(cv_predictions))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow(), distribution=distribution, ntrees=5,
                  fold_assignment="Modulo")

    # 2. nfolds = 0
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=0, distribution=distribution, ntrees=5)
    # check that this is equivalent to no nfolds
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], distribution=distribution, ntrees=5)
    h2o.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3,10), validation_y=cars[response_col], ntrees=5,
                  validation_x=cars[predictors], distribution=distribution)


    ## error cases
    # Each build below must raise EnvironmentError; reaching the else branch
    # means the build unexpectedly succeeded.
    # 1. nfolds == 1 or < 0
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1,1], 1)[0], ntrees=5,
                      distribution=distribution)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, distribution=distribution, ntrees=5,
                      fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", ntrees=5,
                      distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"

    # 4. fold_column and fold_assignment both specified
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], fold_assignment="Random", fold_column="fold_assignments", ntrees=5,
                      distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
Beispiel #51
0
def milsong_checkpoint():
    """Build a gaussian GBM on the milsongs data, checkpoint it, and resume.

    A first model with randomly drawn ntrees, max_depth and min_rows is
    trained and saved under the "results" directory two levels above this
    file, then loaded back.  The restored model is used as the checkpoint
    for a second model with 50 more trees, and a third model with the same
    final settings is built in one shot.

    NOTE(review): within this function model2 and model3 are built but never
    compared; confirm whether an equality check on their metrics is missing.

    Raises:
        AssertionError: if the save directory or the saved model path is not
            an existing directory.
    """
    milsong_train = h2o.upload_file(
        tests.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        tests.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0])

    # save the model, then load the model
    path = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..",
                     "results"))

    assert os.path.isdir(
        path), "Expected save directory {0} to exist, but it does not.".format(
            path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isdir(
        model_path
    ), "Expected load directory {0} to exist, but it does not.".format(
        model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0])
Beispiel #52
0
def domain_check():
    """Verify the response domain reported by DRF, GBM, Deep Learning and GLM.

    Each model is trained on the airlines training frame with the same seven
    predictors and ``IsDepDelayed`` as the response.  For every model, the
    ``domain`` entry of both the training metrics and the metrics computed on
    the airlines test frame must not contain any level outside
    ``actual_domain``.

    Raises:
        AssertionError: if a model reports a response level that is not part
            of the expected domain.
    """
    air_train = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_train.show()
    air_test = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test.show()

    actual_domain = [u'YES', u'NO']
    print("actual domain of the response: {0}".format(actual_domain))

    predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]

    def announce(title):
        # Banner separating the output of the individual algorithms.
        print("")
        print("-------------- {0}:".format(title))
        print("")

    def check_domain(computed_domain):
        # Only levels that are reported but not expected count as a failure;
        # the check is deliberately one-directional, as in the original test.
        domain_diff = list(set(computed_domain) - set(actual_domain))
        assert not domain_diff, \
            "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    def check_model(model):
        # Domain from the metrics computed during training ...
        check_domain(model._model_json['output']['training_metrics']
                     ._metric_json['domain'])
        # ... and from the metrics of *this* model scored on the test frame.
        # (Previously the GBM and Deep Learning sections re-scored the DRF
        # model here, so their test metrics were never verified.)
        perf = model.model_performance(test_data=air_test)
        check_domain(perf._metric_json['domain'])

    ### DRF ###
    announce("DRF")
    rf = h2o.random_forest(x=air_train[predictors],
                           y=air_train["IsDepDelayed"].asfactor(),
                           training_frame=air_train)
    check_model(rf)

    ### GBM ###
    announce("GBM")
    gbm = h2o.gbm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"].asfactor(),
                  training_frame=air_train,
                  distribution="bernoulli")
    check_model(gbm)

    ### Deeplearning ###
    announce("Deeplearning")
    dl = h2o.deeplearning(x=air_train[predictors],
                          y=air_train["IsDepDelayed"].asfactor(),
                          training_frame=air_train,
                          activation="Tanh",
                          hidden=[2, 2, 2],
                          epochs=10)
    check_model(dl)

    ### GLM ###
    announce("GLM")
    # Unlike the other builders, the GLM call passes the response column
    # without an explicit asfactor() conversion.
    glm = h2o.glm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"],
                  training_frame=air_train,
                  family="binomial")
    check_model(glm)
Beispiel #53
0
def loss_behaviorGBM(ip,port):
  """Check which model type h2o.gbm builds for each ``loss`` / response pairing.

  Connects to the H2O instance at ``ip``:``port`` and then, for the default,
  gaussian, bernoulli and multinomial losses, asserts either the class of the
  returned model or that the build fails with an EnvironmentError.
  """

  def must_fail(build):
    # Run build(); only an EnvironmentError counts as the expected failure,
    # a successful build trips the assertion below.
    try:
      build()
    except EnvironmentError:
      return
    assert False, "expected an error"

  # Connect to h2o
  h2o.init(ip,port)

  # ---------------- Default behavior: gaussian ----------------
  ecology = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))
  # 0/1 response without asfactor(): a regression model is expected
  model = h2o.gbm(x=ecology[2:13], y=ecology["Angaus"])
  assert isinstance(model,h2o.model.regression.H2ORegressionModel)
  # response with more than two integer values: regression model expected
  cars_frame = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
  cars_model = h2o.gbm(x=cars_frame[3:7], y=cars_frame["cylinders"])
  assert isinstance(cars_model,h2o.model.regression.H2ORegressionModel)

  # A character response with the default loss used to be expected to fail;
  # per the original note, AUTO loss works now, so that case is not checked.

  # ---------------- Explicit gaussian ----------------
  # 0/1 response: regression model expected
  model = h2o.gbm(x=ecology[2:13], y=ecology["Angaus"], loss="gaussian")
  assert isinstance(model,h2o.model.regression.H2ORegressionModel)
  # character response: the build must fail
  must_fail(lambda: h2o.gbm(x=ecology[1:8], y=ecology["Method"], loss="gaussian"))

  # ---------------- Explicit bernoulli ----------------
  # 0/1 response converted to a factor: binomial model expected
  model = h2o.gbm(x=ecology[2:13], y=ecology["Angaus"].asfactor(), loss="bernoulli")
  assert isinstance(model,h2o.model.binomial.H2OBinomialModel)
  # two-level character response: binomial model expected
  tree_frame = h2o.import_frame(path=h2o.locate("smalldata/junit/test_tree_minmax.csv"))
  tree_model = h2o.gbm(x=tree_frame[0:3], y=tree_frame["response"], loss="bernoulli", min_rows=1)
  assert isinstance(tree_model,h2o.model.binomial.H2OBinomialModel)
  # response with more than two integer values: the build must fail
  must_fail(lambda: h2o.gbm(x=cars_frame[3:7], y=cars_frame["cylinders"], loss="bernoulli"))
  # response with more than two character levels: the build must fail
  must_fail(lambda: h2o.gbm(x=ecology[0:8], y=ecology["Method"], loss="bernoulli"))

  # ---------------- Explicit multinomial ----------------
  # more than two integer values, converted to a factor: multinomial expected
  cars_model = h2o.gbm(x=cars_frame[3:7], y=cars_frame["cylinders"].asfactor(), loss="multinomial")
  assert isinstance(cars_model,h2o.model.multinomial.H2OMultinomialModel)
  # more than two character levels: multinomial expected
  model = h2o.gbm(x=ecology[0:8], y=ecology["Method"], loss="multinomial")
  assert isinstance(model,h2o.model.multinomial.H2OMultinomialModel)
Beispiel #54
0
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Compare in-cluster H2O predictions with those of the exported Java POJO.

    Builds a model with the requested algorithm, downloads its POJO source and
    ``h2o-genmodel.jar`` into a per-model directory under ``../results``,
    compiles the POJO with ``javac``, runs ``hex.genmodel.tools.PredictCsv``
    on the test predictors and asserts that the first prediction column
    matches H2O's own predictions row by row.

    Args:
        algo: one of "gbm", "random_forest", "deeplearning" or "glm".
        equality: "numeric" (values compared within 1e-4, POJO output parsed
            with ``float.fromhex``) or "class" (values compared with ``==``).
        train: training frame; indexed as ``train[x]`` and ``train[y]``.
        test: frame to predict on; ``test[x]`` is fed to the POJO.
        x: predictor column selector.
        y: response column selector.
        **kwargs: forwarded unchanged to the H2O model builder.

    Raises:
        ValueError: if ``algo`` is not supported, or if ``equality`` is not
            supported (detected when the first row is compared).
        AssertionError: if an expected file is missing or predictions differ.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        # Instantiate the exception; raising a (class, message) tuple drops
        # the message on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(
        out_h2o_csv), "Expected file {0} to exist, but it does not.".format(
            out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; the context manager guarantees the
    # handle is closed even if the rewrite fails.
    with open(in_csv, 'r+') as f:
        unquoted = f.read().replace('"', '')
        f.seek(0)
        f.write(unquoted)
        f.truncate()
    assert os.path.exists(
        in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m",
        java_file
    ]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    # stderr is merged into stdout above, so only the first item carries data.
    o, _ = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(
        out_pojo_csv), "Expected file {0} to exist, but it does not.".format(
            out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(
        hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(
        hc, pc)

    # Value: only the first prediction column is compared.
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(
                hp - pp
            ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        else:
            raise ValueError(
                "equality type {0} is not supported".format(equality))
Beispiel #55
0
def bernoulliGBM(ip, port):
    """Compare the test AUC of an H2O bernoulli GBM with scikit-learn's GBM.

    Both libraries are trained on ``prostate_train.csv`` with matching tree
    count, learning rate, depth and minimum leaf size, then scored on
    ``prostate_test.csv``.  The H2O AUC must be at least the scikit-learn AUC.
    """
    train_csv = "smalldata/logreg/prostate_train.csv"
    test_csv = "smalldata/logreg/prostate_test.csv"

    def as_h2o_frame(relative_path):
        # Import into H2O and turn the CAPSULE response into a factor.
        frame = h2o.import_frame(path=h2o.locate(relative_path))
        frame["CAPSULE"] = frame["CAPSULE"].asfactor()
        return frame

    def as_numpy_xy(relative_path):
        # Load the same CSV for scikit-learn: column 0 is used as the
        # response, the remaining columns as features (header row skipped).
        raw = np.loadtxt(h2o.locate(relative_path), delimiter=',', skiprows=1)
        return raw[:, 1:], raw[:, 0]

    # Connect to h2o
    h2o.init(ip, port)

    h2o_train = as_h2o_frame(train_csv)
    sk_train_x, sk_train_y = as_numpy_xy(train_csv)

    # Hyper-parameters shared by both implementations
    n_trees = 100
    shrinkage = 0.1
    tree_depth = 5
    leaf_min = 10

    # H2O classification GBM
    h2o_model = h2o.gbm(x=h2o_train[1:],
                        y=h2o_train["CAPSULE"],
                        ntrees=n_trees,
                        learn_rate=shrinkage,
                        max_depth=tree_depth,
                        min_rows=leaf_min,
                        distribution="bernoulli")

    # scikit-learn classification GBM with the same settings
    sk_model = ensemble.GradientBoostingClassifier(learning_rate=shrinkage,
                                                   n_estimators=n_trees,
                                                   max_depth=tree_depth,
                                                   min_samples_leaf=leaf_min,
                                                   max_features=None)
    sk_model.fit(sk_train_x, sk_train_y)

    h2o_test = as_h2o_frame(test_csv)
    sk_test_x, sk_test_y = as_numpy_xy(test_csv)

    # Score both models on the held-out data
    positive_proba = sk_model.predict_proba(sk_test_x)[:, 1]
    auc_sci = roc_auc_score(sk_test_y, positive_proba)

    auc_h2o = h2o_model.model_performance(h2o_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Beispiel #56
0
def gbm_demo(interactive, echo, test):
    """Run the scripted GBM demo on the bundled prostate dataset.

    Prints a description, then alternates between echoing the next demo
    command(s) via ``echo_and_interact`` and executing the equivalent call:
    upload, summary, ~70/30 train/test split, factor conversion, GBM build,
    model display, prediction and performance display.

    Args:
        interactive: forwarded to ``echo_and_interact``.
        echo: forwarded to ``echo_and_interact``.
        test: when truthy, the ``h2o.init()`` call is skipped.
    """
    h2o_data_path = system_file("prostate.csv")

    demo_description = [
        '\n-----------------------------------------------------------------',
        'This is a demo of H2O\'s GBM function.',
        'It uploads a dataset to h2o, parses it, and shows a description.',
        'Then, it divides the dataset into training and test sets, ',
        'builds a GBM from the training set, and predicts on the test set.',
        'Finally, default performance metrics are displayed.',
        '-----------------------------------------------------------------'
    ]

    # NOTE: the number of entries per step must stay in sync with the npop
    # values passed to echo_and_interact below.
    demo_commands = [
        '# Connect to h2o', '>>> h2o.init()\n',
        '\n# Upload the prostate dataset that comes included in the h2o python package',
        '>>> prostate = h2o.upload_file(path = "' + h2o_data_path + '")\n',
        '\n# Print a description of the prostate data',
        '>>> prostate.summary()\n',
        '\n# Randomly split the dataset into ~70/30, training/test sets',
        '>>> r = prostate[0].runif()', '>>> train = prostate[r < 0.70]',
        '>>> test = prostate[r >= 0.70]\n',
        '\n# Convert the response columns to factors (for binary classification problems)',
        '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()',
        '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n',
        '\n# Build a (classification) GBM',
        '>>> prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]], '
        'y=train["CAPSULE"], distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, '
        'learn_rate=0.2)\n', '\n# Show the model', '>>> prostate_gbm.show()\n',
        '\n# Predict on the test set and show the first ten predictions',
        '>>> predictions = prostate_gbm.predict(test)',
        '>>> predictions.show()\n', '\n# Show default performance metrics',
        '>>> performance = prostate_gbm.model_performance(test)',
        '>>> performance.show()\n'
    ]

    for line in demo_description:
        print(line)
    print("")

    echo_and_interact(demo_commands, interactive, echo)
    if not test: h2o.init()

    echo_and_interact(demo_commands, interactive, echo)
    prostate = h2o.upload_file(path=h2o_data_path)

    echo_and_interact(demo_commands, interactive, echo)
    prostate.summary()

    echo_and_interact(demo_commands, interactive, echo, npop=4)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # Complementary rows form the test set.  The previous ``r >= 0.30`` made
    # train and test overlap; the split is also kept in its own local so it
    # no longer shadows the boolean ``test`` parameter.
    test_set = prostate[r >= 0.70]

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test_set["CAPSULE"] = test_set["CAPSULE"].asfactor()

    echo_and_interact(demo_commands, interactive, echo)
    prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]],
                           y=train["CAPSULE"],
                           distribution="bernoulli",
                           ntrees=10,
                           max_depth=8,
                           min_rows=10,
                           learn_rate=0.2)

    echo_and_interact(demo_commands, interactive, echo)
    prostate_gbm.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    predictions = prostate_gbm.predict(test_set)
    predictions.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    performance = prostate_gbm.model_performance(test_set)
    performance.show()
def metric_json_check():
    """Assert that model metric JSON contains no keys beyond the expected ones.

    Covers regression and binomial metrics (GBM and GLM), multinomial metrics
    (GBM) and clustering metrics (k-means).  For each case the keys of
    ``model_performance()._metric_json`` are compared with a list of desired
    keys; any key present but not desired fails the check.
    """

    def require_known_keys(metrics, desired_keys, message):
        # Keys that are present in the metric JSON but not in the desired
        # list are reported through the supplied message template.
        current_keys = metrics._metric_json.keys()
        surplus = list(set(current_keys) - set(desired_keys))
        assert not surplus, message.format(current_keys, desired_keys, surplus)

    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    gbm_regression = h2o.gbm(y=prostate["CAPSULE"],
                             x=prostate[3:],
                             training_frame=prostate,
                             distribution="gaussian")
    require_known_keys(
        gbm_regression.model_performance(),
        [
            u'model_category', u'description', u'r2', u'frame',
            u'model_checksum', u'MSE', u'__meta', u'scoring_time',
            u'predictions', u'model', u'duration_in_ms', u'frame_checksum',
            u'mean_residual_deviance'
        ],
        "There's a difference between the current ({0}) and the desired ({1}) regression " \
        "metric json. The difference is {2}")

    # Regression metric json (GLM)
    glm_regression = h2o.glm(y=prostate["CAPSULE"],
                             x=prostate[3:],
                             training_frame=prostate,
                             family="gaussian")
    require_known_keys(
        glm_regression.model_performance(),
        [
            u'model_category', u'description', u'r2',
            u'residual_degrees_of_freedom', u'frame', u'model_checksum',
            u'MSE', u'__meta', u'null_deviance', u'scoring_time',
            u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
            u'duration_in_ms', u'frame_checksum', u'residual_deviance',
            u'mean_residual_deviance'
        ],
        "There's a difference between the current ({0}) and the desired ({1}) glm-regression " \
        "metric json. The difference is {2}")

    # Binomial metric json
    gbm_binomial = h2o.gbm(y=prostate["CAPSULE"].asfactor(),
                           x=prostate[3:],
                           training_frame=prostate,
                           distribution="bernoulli")
    require_known_keys(
        gbm_binomial.model_performance(),
        [
            u'AUC', u'Gini', u'model_category', u'description', u'r2',
            u'frame', u'model_checksum', u'MSE', u'__meta', u'logloss',
            u'scoring_time', u'thresholds_and_metric_scores', u'predictions',
            u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
            u'frame_checksum', u'domain'
        ],
        "There's a difference between the current ({0}) and the desired ({1}) binomial " \
        "metric json. The difference is {2}")

    # Binomial metric json (GLM)
    glm_binomial = h2o.glm(y=prostate["CAPSULE"].asfactor(),
                           x=prostate[3:],
                           training_frame=prostate,
                           family="binomial")
    require_known_keys(
        glm_binomial.model_performance(),
        [
            u'frame', u'residual_deviance', u'max_criteria_and_metric_scores',
            u'MSE', u'frame_checksum', u'AIC', u'logloss', u'Gini',
            u'predictions', u'AUC', u'description', u'model_checksum',
            u'duration_in_ms', u'model_category', u'r2',
            u'residual_degrees_of_freedom', u'__meta', u'null_deviance',
            u'scoring_time', u'null_degrees_of_freedom', u'model',
            u'thresholds_and_metric_scores', u'domain'
        ],
        "There's a difference between the current ({0}) and the desired ({1}) glm-binomial " \
        "metric json. The difference is {2}")

    # Multinomial metric json
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    predictor_names = [
        "Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance",
        "fDayofMonth", "fDayOfWeek"
    ]
    response_name = "fYear"
    gbm_multinomial = h2o.gbm(x=airlines[predictor_names],
                              y=airlines[response_name],
                              training_frame=airlines,
                              distribution="multinomial")
    require_known_keys(
        gbm_multinomial.model_performance(),
        [
            u'cm', u'model_category', u'description', u'r2', u'frame',
            u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
            u'predictions', u'hit_ratio_table', u'model', u'duration_in_ms',
            u'frame_checksum'
        ],
        "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
        "metric json. The difference is {2}")

    # Clustering metric json
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    kmeans_model = h2o.kmeans(x=iris[0:4], k=3, standardize=False)
    require_known_keys(
        kmeans_model.model_performance(),
        [
            u'tot_withinss', u'model_category', u'description', u'frame',
            u'model_checksum', u'MSE', u'__meta', u'scoring_time',
            u'betweenss', u'predictions', u'totss', u'model',
            u'duration_in_ms', u'frame_checksum', u'centroid_stats'
        ],
        "There's a difference between the current ({0}) and the desired ({1}) clustering " \
        "metric json. The difference is {2}")
def all_confusion_matrix_funcs():
    """Exercise every confusion-matrix accessor on binomial/multinomial GBMs.

    Trains a bernoulli GBM (response ``IsDepDelayed``) and a multinomial GBM
    (response ``fDayOfWeek``) on the airlines data with the test frame as
    validation data, then checks the confusion matrices obtained from the
    models and from their metrics objects:

    * binomial matrices must be 2x2, hold int/float cells, and sum to the
      row count of the frame they were computed on;
    * multinomial matrices (first 7x7 cells) must sum to the test row count.

    Raises:
        AssertionError: if any confusion matrix violates the checks above.
    """
    metrics = [
        "min_per_class_accuracy", "absolute_MCC", "precision", "accuracy",
        "f0point5", "f2", "f1"
    ]
    train = [True, False]
    valid = [True, False]

    print("PARSING TRAINING DATA")
    air_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    print("PARSING TESTING DATA")
    air_test = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    bin_predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]
    mult_predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed",
        "fDayofMonth", "fMonth"
    ]

    print()
    print("RUNNING FIRST GBM: ")
    print()
    gbm_bin = h2o.gbm(x=air_train[bin_predictors],
                      y=air_train["IsDepDelayed"].asfactor(),
                      validation_x=air_test[bin_predictors],
                      validation_y=air_test["IsDepDelayed"].asfactor(),
                      distribution="bernoulli")

    print()
    print("RUNNING SECOND GBM: ")
    print()
    gbm_mult = h2o.gbm(x=air_train[mult_predictors],
                       y=air_train["fDayOfWeek"].asfactor(),
                       validation_x=air_test[mult_predictors],
                       validation_y=air_test["fDayOfWeek"].asfactor(),
                       distribution="multinomial")

    def dim_check(cm, m, t, v):
        # A binomial confusion matrix must be exactly 2x2.
        assert len(cm) == 2 and len(cm[0]) == 2 and len(cm[1]) == 2, "incorrect confusion matrix dimensions " \
                                                                     "for metric/thresh: {0}, train: {1}, valid: " \
                                                                     "{2}".format(m, t, v)

    def type_check(cm, m, t, v):
        # All four cells are validated (the old check tested cm[0][0] twice
        # and never looked at cm[1][1]).
        cells = (cm[0][0], cm[0][1], cm[1][0], cm[1][1])
        assert all(isinstance(cell, (int, float)) for cell in cells), \
            "confusion matrix entries should be integers or floats but got {0}, {1}, {2}, {3}. metric/thresh: {4}, " \
            "train: {5}, valid: {6}".format(type(cells[0]), type(cells[1]), type(cells[2]), type(cells[3]), m,
                                            t, v)

    def count_check(cm, m, t, v):
        # Validation matrices cover the test frame, all others the train frame.
        expected_rows = air_test.nrow if v else air_train.nrow
        assert cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1] == expected_rows, \
            "incorrect confusion matrix elements: {0}, {1}, {2}, {3}. Should sum " \
            "to {4}. metric/thresh: {5}, train: {6}, valid: {7}".format(cm[0][0], cm[0][1], cm[1][0], cm[1][1],
                                                                 expected_rows, m, t, v)

    def check_binomial(cm, m, t, v):
        dim_check(cm, m, t, v)
        type_check(cm, m, t, v)
        count_check(cm, m, t, v)

    def multinomial_total(cm):
        # Sum of the 7x7 class cells (the response has seven day-of-week
        # levels per the original hard-coded bounds).
        return sum(cm.cell_values[r][c] for r in range(7) for c in range(7))

    def random_metrics():
        # A random, non-empty subset of the metric names.
        return random.sample(metrics, random.randint(1, len(metrics)))

    # H2OBinomialModel.confusion_matrix() by metric
    for m in metrics:
        for t in train:
            for v in valid:
                if t and v: continue
                cm = gbm_bin.confusion_matrix(metrics=m, train=t, valid=v)
                if cm:
                    check_binomial(cm.to_list(), m, t, v)

    # H2OBinomialModel.confusion_matrix() by threshold
    for _ in range(10):
        for t in train:
            for v in valid:
                if t and v: continue
                thresholds = [
                    gbm_bin.find_threshold_by_max_metric(m, t, v)
                    for m in random_metrics()
                ]
                cms = gbm_bin.confusion_matrix(thresholds=thresholds,
                                               train=t,
                                               valid=v)
                if not isinstance(cms, list): cms = [cms]
                for idx, cm in enumerate(cms):
                    check_binomial(cm.to_list(), thresholds[idx], t, v)

    # H2OMultinomialModel.confusion_matrix()
    cm = gbm_mult.confusion_matrix(data=air_test)
    cm_count = multinomial_total(cm)
    assert cm_count == air_test.nrow, "incorrect confusion matrix elements. Should sum to {0}, but got {1}".\
        format(air_test.nrow, cm_count)

    # H2OBinomialModelMetrics.confusion_matrix() by metric (validation metrics)
    bin_perf = gbm_bin.model_performance(valid=True)
    for metric in metrics:
        cm = bin_perf.confusion_matrix(metrics=metric).to_list()
        check_binomial(cm, metric, False, True)

    # H2OBinomialModelMetrics.confusion_matrix() by threshold (training metrics)
    bin_perf = gbm_bin.model_performance(train=True)
    for _ in range(10):
        # Thresholds are looked up on the training metrics explicitly rather
        # than through loop variables leaked from the sections above.
        thresholds = [
            gbm_bin.find_threshold_by_max_metric(m, True, False)
            for m in random_metrics()
        ]
        cms = bin_perf.confusion_matrix(thresholds=thresholds)
        if not isinstance(cms, list): cms = [cms]
        for idx, cm in enumerate(cms):
            check_binomial(cm.to_list(), thresholds[idx], True, False)

    # H2OMultinomialModelMetrics.confusion_matrix()
    mult_perf = gbm_mult.model_performance(valid=True)
    cm = mult_perf.confusion_matrix()
    cm_count = multinomial_total(cm)
    assert cm_count == air_test.nrow, "incorrect confusion matrix elements. Should sum to {0}, but got {1}". \
        format(air_test.nrow, cm_count)
Beispiel #59
0
def bernoulli_synthetic_data_mediumGBM(ip, port):
    """Compare H2O GBM against scikit-learn GBM on synthetic bernoulli data.

    Training (10000 x 10) and test (2000 x 10) features are drawn from
    independent standard normals. A row is labelled +1 when the sum of its
    squared features exceeds the chi-square median for as many degrees of
    freedom as there are feature columns, and -1 otherwise. Both libraries
    are trained with the same hyper-parameters and their test-set AUCs must
    agree to within 5e-3.

    ip, port -- passed straight to ``h2o.init``.

    Raises AssertionError when the two AUCs differ by 5e-3 or more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    def make_dataset(n_rows, n_cols):
        # Generate variables V1, ... Vn_cols
        features = np.random.randn(n_rows, n_cols)
        # The chi-square median is identical for every row, so look it up
        # once per dataset instead of once per row, and derive the degrees
        # of freedom from the column count rather than a repeated literal.
        chisq_median = scipy.stats.chi2.ppf(0.5, n_cols)
        #  y = +1 if sum_i x_{ij}^2 > chisq median on n_cols df
        row_sq_sums = np.square(features).sum(axis=1)
        labels = np.where(row_sq_sums > chisq_median, 1, -1)
        return features, labels

    # Generate training dataset (adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R)
    train_rows = 10000
    train_cols = 10
    X_train, y_train = make_dataset(train_rows, train_cols)

    # Train scikit gbm
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20  # only passed to H2O below; the scikit model is built without it

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=max_depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Generate testing dataset (drawn after the scikit fit, as before, so the
    # sequence of draws from the global NumPy RNG is unchanged)
    test_rows = 2000
    test_cols = 10
    X_test, y_test = make_dataset(test_rows, test_cols)

    # Score (AUC) the scikit gbm model on the test data
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

    # Compare this result to H2O: the response goes in the first column (C1),
    # followed by the feature columns.
    train_h2o = H2OFrame(np.column_stack((y_train, X_train)).tolist())
    test_h2o = H2OFrame(np.column_stack((y_test, X_test)).tolist())

    gbm_h2o = h2o.gbm(x=train_h2o[1:],
                      y=train_h2o["C1"].asfactor(),
                      distribution=distribution,
                      ntrees=ntrees,
                      min_rows=min_rows,
                      max_depth=max_depth,
                      learn_rate=learn_rate,
                      nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    assert abs(auc_h2o - auc_sci) < 5e-3, "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} " \
                               "scickit auc: {1}".format(auc_h2o, auc_sci)
Beispiel #60
0
def pubdev_1829():
    """Check that a checkpoint-continued GBM matches one trained from scratch.

    A 5-tree bernoulli GBM is trained, then continued to 10 trees through the
    ``checkpoint`` argument. A separate 10-tree GBM is trained directly with
    the same settings. The validation AUC, Gini coefficient and log loss of
    the continued model and the direct model must be exactly equal.

    Raises AssertionError on the first metric that differs.
    """
    train_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_train.csv"))
    valid_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/jira/gbm_checkpoint_valid.csv"))

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    # The response must be categorical in both frames.
    train_frame[response_col] = train_frame[response_col].asfactor()
    valid_frame[response_col] = valid_frame[response_col].asfactor()

    # Depth and minimum leaf size are shared by all three models; only the
    # tree count and the optional checkpoint vary.
    max_depth = 5
    min_rows = 10

    def fit_gbm(ntrees, **extra):
        # Frames are sliced on every call, exactly as separate calls would do.
        return h2o.gbm(x=train_frame[predictors],
                       y=train_frame[response_col],
                       ntrees=ntrees,
                       max_depth=max_depth,
                       min_rows=min_rows,
                       distribution=distribution,
                       score_each_iteration=True,
                       validation_x=valid_frame[predictors],
                       validation_y=valid_frame[response_col],
                       **extra)

    base_model = fit_gbm(5)
    resumed_model = fit_gbm(10, checkpoint=base_model._id)
    scratch_model = fit_gbm(10)

    # (metric accessor name, failure message template)
    comparisons = (
        ("auc",
         "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}"),
        ("giniCoef",
         "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}"),
        ("logloss",
         "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}"),
    )
    for accessor, template in comparisons:
        resumed_value = getattr(resumed_model, accessor)(valid=True)
        scratch_value = getattr(scratch_model, accessor)(valid=True)
        assert resumed_value == scratch_value, template.format(
            resumed_value, scratch_value)