Ejemplo n.º 1
0
def milsong_checkpoint(ip, port):
    """Verify that resuming a DRF from a checkpoint matches a one-shot build.

    A random forest is trained with randomly drawn ``ntrees``, ``max_depth``
    and ``min_rows``, saved to disk and reloaded. Training is then continued
    from the restored model for 50 more trees, and the validation MSE of the
    continued model must equal that of a third model trained in a single run
    with the same final settings and seed.

    :param ip: not used by this body; kept so the signature is unchanged.
    :param port: not used by this body; kept so the signature is unchanged.
    :raises AssertionError: if the continued and one-shot models differ in
        type or in validation MSE.
    """
    milsong_train = h2o.upload_file(
        h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    def build_forest(ntrees, max_depth, min_rows, **extra):
        # Column 0 is used as the response, all remaining columns as
        # predictors; the seed is fixed so the builds are comparable.
        return h2o.random_forest(x=milsong_train[1:],
                                 y=milsong_train[0],
                                 ntrees=ntrees,
                                 max_depth=max_depth,
                                 min_rows=min_rows,
                                 validation_x=milsong_valid[1:],
                                 validation_y=milsong_valid[0],
                                 seed=1234,
                                 **extra)

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = build_forest(ntrees1, max_depth1, min_rows1)

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model: same tree shape, 50 additional trees
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = build_forest(ntrees2, max_depth2, min_rows2,
                          checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = build_forest(ntrees2, max_depth2, min_rows2)

    assert isinstance(model2, type(model3))
    mse2 = model2.mse(valid=True)
    mse3 = model3.mse(valid=True)
    # The message previously named a non-existent "Model 4".
    assert mse2 == mse3, (
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
            mse2, mse3))
    def attack(train, valid, x, y):
        """Build a DRF with a randomly chosen subset of optional parameters.

        Each optional parameter is included on a coin flip and given a random
        value; a validation set is likewise used on a coin flip. The chosen
        configuration is printed before the model is built.

        :param train: frame indexed with ``x`` and ``y`` for training data.
        :param valid: frame indexed with ``x`` and ``y`` for validation data.
        :param x: predictor selector; ``len(x)`` bounds ``mtries``.
        :param y: response selector.
        """
        kwargs = {}

        # Randomly select parameters and their corresponding values. The
        # order of the random draws is kept fixed so seeded runs reproduce.
        if random.randint(0, 1):
            kwargs['mtries'] = random.randint(1, len(x))
        if random.randint(0, 1):
            kwargs['sample_rate'] = random.random()
        if random.randint(0, 1):
            kwargs['build_tree_one_node'] = True
        if random.randint(0, 1):
            kwargs['ntrees'] = random.randint(1, 10)
        if random.randint(0, 1):
            kwargs['max_depth'] = random.randint(1, 5)
        if random.randint(0, 1):
            kwargs['min_rows'] = random.randint(1, 10)
        if random.randint(0, 1):
            kwargs['nbins'] = random.randint(1, 20)
        if random.randint(0, 1):
            kwargs['balance_classes'] = True
            # max_after_balance_size is only drawn when balancing is on
            if random.randint(0, 1):
                kwargs['max_after_balance_size'] = random.uniform(0, 10)
        if random.randint(0, 1):
            kwargs['seed'] = random.randint(1, 10000)
        do_validation = [True, False][random.randint(0, 1)]

        # Display the parameters and their corresponding values.
        print("-----------------------")
        print("x: {0}".format(x))
        print("y: {0}".format(y))
        print("validation: {0}".format(do_validation))
        for k, v in kwargs.items():
            print(k + ": {0}".format(v))

        if do_validation:
            h2o.random_forest(x=train[x], y=train[y],
                              validation_x=valid[x], validation_y=valid[y],
                              **kwargs)
        else:
            h2o.random_forest(x=train[x], y=train[y], **kwargs)
        print("-----------------------")
Ejemplo n.º 3
0
def imbalanced():
    """Compare DRF error on one class with and without class balancing.

    Two 10-tree, 3-fold random forests are trained on the covtype sample,
    one with ``balance_classes=False`` and one with ``balance_classes=True``
    (seed 123). Both are scored on the full frame, and confusion-matrix cell
    ``[5][7]`` of the unbalanced model must be at least 90% of the balanced
    model's value.
    """
    data_path = h2o.locate("smalldata/covtype/covtype.20k.data")
    covtype = h2o.import_file(path=data_path)
    covtype[54] = covtype[54].asfactor()

    def fit_and_score(**options):
        # Train one forest, score it on the training frame, show the result.
        forest = h2o.random_forest(x=covtype[0:54], y=covtype[54],
                                   ntrees=10, nfolds=3, **options)
        performance = forest.model_performance(covtype)
        performance.show()
        return performance

    imbalanced_perf = fit_and_score(balance_classes=False)
    balanced_perf = fit_and_score(balance_classes=True, seed=123)

    # compare error for class 6 (difficult minority)
    class_6_err_imbalanced = imbalanced_perf.confusion_matrix().cell_values[5][7]
    class_6_err_balanced = balanced_perf.confusion_matrix().cell_values[5][7]

    report = [
        "--------------------",
        "",
        "class_6_err_imbalanced",
        class_6_err_imbalanced,
        "",
        "class_6_err_balanced",
        class_6_err_balanced,
        "",
        "--------------------",
    ]
    for entry in report:
        print(entry)

    assert class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!"
Ejemplo n.º 4
0
def swpredsRF(ip, port):
    """Fit DRF on the swpreds data without and then with the noise column.

    For each predictor set a 50-tree forest is trained, shown, scored on the
    training frame, and its AUC printed.

    :param ip: not used by this body.
    :param port: not used by this body.
    """
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)
    csv_path = h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv")
    swpreds = h2o.import_file(path=csv_path)
    swpreds["y"] = swpreds["y"].asfactor()

    # First the predictor column alone, then predictor plus noise column.
    for predictor_names in (["X1"], ["X1", "X2"]):
        forest = h2o.random_forest(x=swpreds[predictor_names],
                                   y=swpreds["y"],
                                   ntrees=50,
                                   max_depth=20,
                                   nbins=500)
        forest.show()
        performance = forest.model_performance(swpreds)
        print(performance.auc())
def swpredsRF():
    """Train DRF on swpreds with only X1, then with X1 and X2, printing AUC.

    Both models use 50 trees, depth 20 and 500 bins, and are scored on the
    same frame they were trained on.
    """
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)
    swpreds = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    def fit_show_and_report(columns):
        # Build one forest on the given predictor columns and print its AUC.
        fitted = h2o.random_forest(x=swpreds[columns], y=swpreds["y"],
                                   ntrees=50, max_depth=20, nbins=500)
        fitted.show()
        print(fitted.model_performance(swpreds).auc())

    # Train H2O DRF without Noise Column
    fit_show_and_report(["X1"])

    # Train H2O DRF Model including Noise Column
    fit_show_and_report(["X1", "X2"])
Ejemplo n.º 6
0
    def attack(train, valid, x, y):
        """Train a random forest with a randomized parameter configuration.

        Every optional parameter is added to the call on a coin flip, with a
        randomly drawn value; whether a validation set is passed is also
        decided on a coin flip. The configuration is printed before building.

        :param train: frame indexed with ``x`` and ``y`` for training data.
        :param valid: frame indexed with ``x`` and ``y`` for validation data.
        :param x: predictor selector; ``len(x)`` is the upper bound of
            ``mtries``.
        :param y: response selector.
        """
        kwargs = {}

        # Randomly select parameters and their corresponding values. The
        # sequence of random draws is unchanged so seeded runs reproduce.
        if random.randint(0, 1):
            kwargs['mtries'] = random.randint(1, len(x))
        if random.randint(0, 1):
            kwargs['sample_rate'] = random.random()
        if random.randint(0, 1):
            kwargs['build_tree_one_node'] = True
        if random.randint(0, 1):
            kwargs['ntrees'] = random.randint(1, 10)
        if random.randint(0, 1):
            kwargs['max_depth'] = random.randint(1, 5)
        if random.randint(0, 1):
            kwargs['min_rows'] = random.randint(1, 10)
        if random.randint(0, 1):
            kwargs['nbins'] = random.randint(1, 20)
        if random.randint(0, 1):
            kwargs['balance_classes'] = True
            # only meaningful together with balance_classes
            if random.randint(0, 1):
                kwargs['max_after_balance_size'] = random.uniform(0, 10)
        if random.randint(0, 1):
            kwargs['seed'] = random.randint(1, 10000)
        do_validation = [True, False][random.randint(0, 1)]

        # Display the parameters and their corresponding values.
        print("-----------------------")
        print("x: {0}".format(x))
        print("y: {0}".format(y))
        print("validation: {0}".format(do_validation))
        for k, v in kwargs.items():
            print(k + ": {0}".format(v))

        if do_validation:
            h2o.random_forest(x=train[x],
                              y=train[y],
                              validation_x=valid[x],
                              validation_y=valid[y],
                              **kwargs)
        else:
            h2o.random_forest(x=train[x], y=train[y], **kwargs)
        print("-----------------------")
Ejemplo n.º 7
0
    def check_same(data1, data2):
        """Assert that weighted and unweighted forests agree.

        Regression and binomial forests are built on ``data1`` without
        weights and on ``data2`` with ``weights_column="weights"``; their MSE
        and AUC respectively must agree to within 1e-6.

        :param data1: frame used for the unweighted models.
        :param data2: frame used for the weighted models.
        """
        plain_regression = h2o.random_forest(x=data1[2:20], y=data1[1])
        weighted_regression = h2o.random_forest(x=data2[2:21], y=data2[1], weights_column="weights")
        plain_binomial = h2o.random_forest(x=data1[1:20], y=data1[0])
        weighted_binomial = h2o.random_forest(x=data2[1:21], y=data2[0], weights_column="weights")

        mse_gap = abs(plain_regression.mse() - weighted_regression.mse())
        assert mse_gap < 1e-6, "Expected mse's to be the same, but got {0}, and {1}".format(
            plain_regression.mse(), weighted_regression.mse())

        auc_gap = abs(plain_binomial.auc() - weighted_binomial.auc())
        assert auc_gap < 1e-6, "Expected auc's to be the same, but got {0}, and {1}".format(
            plain_binomial.auc(), weighted_binomial.auc())
def iris_nfolds():
    """Check that DRF accepts cross-validation together with a validation set.

    A 5-fold, 50-tree forest is built on iris and shown. A second build that
    passes both ``nfolds=5`` and a validation set is expected to succeed.

    :raises AssertionError: if the second build raises ``EnvironmentError``.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4], validation_y=iris[4], validation_x=iris[0:4], ntrees=50, nfolds=5)
    except EnvironmentError:
        # The old message ("expected an error") contradicted this check:
        # an error here is the failure case.
        assert False, "expected no error when both nfolds and a validation set are specified"
def iris_nfolds(ip,port):
    """Check that DRF rejects cross-validation combined with a validation set.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    :raises AssertionError: if the combined build does not raise
        ``EnvironmentError``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris.csv")
    iris = h2o.import_frame(path=iris_path)

    cv_model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    cv_model.show()

    # Can't specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4],
                          validation_y=iris[4], validation_x=iris[0:4],
                          ntrees=50, nfolds=5)
    except EnvironmentError:
        return
    assert False, "expected an error"
Ejemplo n.º 10
0
def iris_nfolds(ip,port):
    """Check that DRF accepts cross-validation together with a validation set.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    :raises AssertionError: if the build with both ``nfolds`` and a
        validation set raises ``EnvironmentError``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4], validation_y=iris[4], validation_x=iris[0:4], ntrees=50, nfolds=5)
    except EnvironmentError:
        # The old message ("expected an error") contradicted this check:
        # an error here is the failure case.
        assert False, "expected no error when both nfolds and a validation set are specified"
def iris_ignore():
    """Build DRF models on iris using the first 1, 2, 3 and 4 columns.

    Each model predicts column 4 with 50 trees and ``max_depth=100`` and is
    shown after training.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris2.csv"))

    for maxx in range(4):
        # Pass an explicit list: on Python 3 a bare range object is not a list.
        predictor_columns = list(range(maxx + 1))
        model = h2o.random_forest(y=iris[4], x=iris[predictor_columns], ntrees=50, max_depth=100)
        model.show()
def iris_nfolds():
    """Check that DRF accepts cross-validation together with a validation frame.

    A 5-fold, 50-tree forest is built on iris and shown. An estimator with
    ``nfolds=5`` trained with a validation frame is then expected to succeed.

    :raises AssertionError: if the estimator raises ``EnvironmentError``.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    # NOTE(review): no training_frame is passed to train() here - confirm
    # that this is intended.
    try:
        H2ORandomForestEstimator(ntrees=50, nfolds=5).train(y=4, x=list(range(4)), validation_frame=iris)
    except EnvironmentError:
        # The old message ("expected an error") contradicted this check:
        # an error here is the failure case.
        assert False, "expected no error when both nfolds and a validation frame are specified"


# This guard was indented into the function body, which made every call to
# iris_nfolds() re-enter itself (directly or via standalone_test) without end.
# It belongs at module level.
if __name__ == "__main__":
    pyunit_utils.standalone_test(iris_nfolds)
else:
    iris_nfolds()
Ejemplo n.º 13
0
def deeplearning_autoencoder():
    """Train a DRF on autoencoder features and check a confusion-matrix cell.

    The training frame is split in two by a seeded uniform random column. An
    autoencoder with one hidden layer of ``nfeatures`` units is trained on one
    half; the other half is mapped to that hidden layer and used to train a
    10-tree random forest. The forest is evaluated on the test frame after the
    same mapping, and confusion-matrix cell ``[10][10]`` must be within 0.001
    of 0.081.
    """
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    def load_with_factor_response(relative_path):
        # Upload a file and convert its response column to a factor.
        frame = h2o.upload_file(pyunit_utils.locate(relative_path))
        frame[resp] = frame[resp].asfactor()
        return frame

    train_hex = load_with_factor_response("bigdata/laptop/mnist/train.csv.gz")
    test_hex = load_with_factor_response("bigdata/laptop/mnist/test.csv.gz")

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    # NOTE(review): the value returned by drop() is not used - confirm
    # whether this call is expected to modify the frame.
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    autoencoder_settings = dict(
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp],
                                **autoencoder_settings)

    # convert train_supervised with autoencoder to lower-dimensional space
    supervised_inputs = train_supervised[0:resp]._frame()
    train_supervised_features = ae_model.deepfeatures(supervised_inputs, 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10,
                                  min_rows=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_inputs = test_hex[0:resp]._frame()
    test_features = ae_model.deepfeatures(test_inputs, 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # expected value 0.081 +/- 0.001
    error_cell = cm.cell_values[10][10]
    assert abs(error_cell - 0.081) < 0.001, "Error. Expected 0.081, but got {0}".format(
        error_cell)
Ejemplo n.º 14
0
def fiftycatRF(ip, port):
    """Train a DRF on the 50_cattest training file and score the test file.

    The model is built on columns ``x1`` and ``x2`` with ``y`` as a factor
    response, then used to predict on the test frame; the performance object
    and its confusion matrix are printed.

    :param ip: not used by this body.
    :param port: not used by this body.
    """
    # Training set has only 45 categories cat1 through cat45
    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train = h2o.import_file(path=train_path)
    train["y"] = train["y"].asfactor()

    # Train H2O DRF Model: ntrees = 50, max_depth = 20, nbins = 500
    forest_options = dict(ntrees=50, max_depth=20, nbins=500)
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], **forest_options)

    # Test dataset has all 50 categories cat1 through cat50
    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test = h2o.import_file(path=test_path)

    # Predict on test dataset with DRF model
    model.predict(test).head()

    # Get the confusion matrix and AUC
    perf = model.model_performance(test)
    perf.show()
    print(perf.confusion_matrix())
Ejemplo n.º 15
0
def vi_toy_test(ip, port):
    """Check the variable-importance ranking of a DRF on the toy data set.

    A 500-tree forest is trained on columns 0-5 with column 6 as a factor
    response. The first column of the model's variable-importance table must
    list the variables in the order V3, V2, V6, V5, V1, V4.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    data_path = h2o.locate("smalldata/gbm_test/toy_data_RF.csv")
    toy_data = h2o.import_frame(path=data_path)

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    forest_options = dict(ntrees=500, max_depth=20, nbins=100, seed=0)
    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]],
                           y=toy_data[6],
                           **forest_options)

    # One importance row per predictor (every column except the response).
    ranking = []
    for row in range(toy_data.ncol() - 1):
        importances = rf._model_json['output']['variable_importances']
        ranking.append(importances.cell_values[row][0])
    print(ranking)

    expected_order = ("V3", "V2", "V6", "V5", "V1", "V4")
    assert tuple(ranking) == expected_order, "expected specific variable importance ranking"
Ejemplo n.º 16
0
def bigcatRF(ip, port):
    """Train and show a single-tree, depth-1 DRF on the bigcat data set.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0
    csv_path = h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_frame(path=csv_path)
    bigcat["y"] = bigcat["y"].asfactor()

    # Train H2O DRF Model (naive split):
    # ntrees = 1, max_depth = 1, nbins = 100, nbins_cats = 10
    naive_split = dict(ntrees=1, max_depth=1, nbins=100, nbins_cats=10)
    forest = h2o.random_forest(x=bigcat[["X"]], y=bigcat["y"], **naive_split)
    forest.show()
def cars_checkpoint():
    """Check which DRF parameters may be changed when resuming a checkpoint.

    A first model is built on the cars data and then continued with one more
    tree and modified ``max_depth``, ``min_rows`` and ``r2_stopping``. After
    that, each of ``mtries``, ``sample_rate``, ``nbins_cats``, ``nbins``,
    ``balance_classes`` and ``nfolds`` is changed in turn, and the build is
    required to fail with ``EnvironmentError``.
    """
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    # build first model
    first_model = h2o.random_forest(x=cars[predictors], y=cars[response_col],
                                    ntrees=10, max_depth=2, min_rows=10)

    # continue building the model
    h2o.random_forest(x=cars[predictors], y=cars[response_col],
                      ntrees=11, max_depth=3, min_rows=9, r2_stopping=0.8,
                      checkpoint=first_model._id)

    # erroneous, not MODIFIABLE_BY_CHECKPOINT_FIELDS
    # PUBDEV-1833
    locked_settings = (
        ("mtries", 2),
        ("sample_rate", 0.5),
        ("nbins_cats", 99),
        ("nbins", 99),
        ("balance_classes", True),
        ("nfolds", 3),
    )
    for setting, value in locked_settings:
        override = {setting: value}
        try:
            h2o.random_forest(y=cars[response_col], x=cars[predictors],
                              checkpoint=first_model._id, **override)
        except EnvironmentError:
            continue
        # Reaching this point means the build was accepted.
        assert False, "Expected model-build to fail because {0} not modifiable by checkpoint".format(setting)
Ejemplo n.º 18
0
def imbalanced(ip, port):
    """Compare DRF error on one class with and without class balancing.

    Two 50-tree, 10-fold random forests are trained on the covtype sample,
    with ``balance_classes`` off and on. Entry 6 of each performance object's
    ``error()`` result is compared: a diagnostic is printed when the balanced
    error is larger, and the unbalanced error must be at least 90% of the
    balanced one.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    data_path = h2o.locate("smalldata/covtype/covtype.20k.data")
    covtype = h2o.import_frame(path=data_path)
    covtype[54] = covtype[54].asfactor()

    shared_options = dict(ntrees=50, nfolds=10)

    unbalanced_model = h2o.random_forest(x=covtype[0:54], y=covtype[54],
                                         balance_classes=False,
                                         **shared_options)
    imbalanced_perf = unbalanced_model.model_performance(covtype)
    imbalanced_perf.show()

    balanced_model = h2o.random_forest(x=covtype[0:54], y=covtype[54],
                                       balance_classes=True,
                                       **shared_options)
    balanced_perf = balanced_model.model_performance(covtype)
    balanced_perf.show()

    # compare error for class 6 (difficult minority)
    class_6_err_imbalanced = imbalanced_perf.error()[6]
    class_6_err_balanced = balanced_perf.error()[6]

    if class_6_err_imbalanced < class_6_err_balanced:
        diagnostic = [
            "--------------------",
            "",
            "FAIL, balanced error greater than imbalanced error",
            "",
            "",
            "class_6_err_imbalanced",
            class_6_err_imbalanced,
            "",
            "class_6_err_balanced",
            class_6_err_balanced,
            "",
            "--------------------",
        ]
        for entry in diagnostic:
            print(entry)

    assert class_6_err_imbalanced >= 0.9 * class_6_err_balanced, "balance_classes makes it at least 10% worse!"
Ejemplo n.º 19
0
	def train(self, x, y):
		"""Fit a random forest on the stored training data and keep it.

		The 'score diff' column is the response and every other column a
		predictor, for both ``self.trainData`` and ``self.valData``. Tree
		count, depth and fold count come from ``self.params[2]``,
		``self.params[3]`` and ``self.params[4]``. The fitted model is
		stored in ``self.model``.

		:param x: not used by this body.
		:param y: not used by this body.
		"""
		target = 'score diff'
		train_predictors = self.trainData.drop(target)
		train_response = self.trainData[target]
		valid_predictors = self.valData.drop(target)
		valid_response = self.valData[target]
		self.model = h2o.random_forest(
			x=train_predictors,
			y=train_response,
			validation_x=valid_predictors,
			validation_y=valid_response,
			ntrees=self.params[2],
			max_depth=self.params[3],
			nfolds=self.params[4])
Ejemplo n.º 20
0
def iris_all(ip,port):
    """Train and show a 50-tree DRF on iris2 using all four predictors.

    :param ip: not used by this body.
    :param port: not used by this body.
    """
    csv_path = h2o.locate("smalldata/iris/iris2.csv")
    iris = h2o.import_file(path=csv_path)

    # Columns 0-3 are the predictors, column 4 the response.
    forest = h2o.random_forest(y=iris[4], x=iris[0:4],
                               ntrees=50, max_depth=100)
    forest.show()
Ejemplo n.º 21
0
def iris_all(ip, port):
    """Connect to H2O, then train and show a 50-tree DRF on iris2.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    csv_path = h2o.locate("smalldata/iris/iris2.csv")
    iris = h2o.import_frame(path=csv_path)

    # Columns 0-3 are the predictors, column 4 the response.
    forest = h2o.random_forest(y=iris[4], x=iris[0:4],
                               ntrees=50, max_depth=100)
    forest.show()
Ejemplo n.º 22
0
def checkpoint_new_category_in_predictor():
    """Check that a checkpoint cannot be continued on the virginica frame.

    A 100-tree model is built on one setosa/versicolor frame and continued to
    200 trees on a second copy of it. Continuing the same checkpoint on the
    virginica frame must raise ``EnvironmentError``.
    """
    sv_path = "smalldata/iris/setosa_versicolor.csv"
    sv1 = h2o.upload_file(tests.locate(sv_path))
    sv2 = h2o.upload_file(tests.locate(sv_path))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    base_model = h2o.random_forest(x=sv1[[0, 1, 2, 4]], y=sv1[3], ntrees=100)

    # Continuing on a frame loaded from the same file is expected to work.
    h2o.random_forest(x=sv2[[0, 1, 2, 4]], y=sv2[3], ntrees=200,
                      checkpoint=base_model.id)

    # attempt to continue building model, but with an expanded categorical predictor domain.
    # this should fail until we figure out proper behavior
    try:
        h2o.random_forest(x=vir[[0, 1, 2, 4]], y=vir[3], ntrees=200,
                          checkpoint=base_model.id)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
def hexdev_422():
    """Train a deep DRF on the z_repro data and download its POJO.

    Column 0 is converted to a factor and used as the response; all remaining
    columns are predictors.
    """
    csv_path = h2o.locate("bigdata/laptop/jira/z_repro.csv.gz")
    fr = h2o.import_file(csv_path)
    fr[0] = fr[0].asfactor()

    forest_options = dict(min_rows=1, ntrees=25, max_depth=45)
    forest = h2o.random_forest(x=fr[1:fr.ncol], y=fr[0], **forest_options)

    h2o.download_pojo(forest)
Ejemplo n.º 24
0
def iris_all():
    """Train and show a 50-tree DRF on iris2 using all four predictors."""
    csv_path = pyunit_utils.locate("smalldata/iris/iris2.csv")
    iris = h2o.import_file(path=csv_path)

    # Columns 0-3 are the predictors, column 4 the response.
    forest = h2o.random_forest(y=iris[4], x=iris[0:4],
                               ntrees=50, max_depth=100)
    forest.show()
Ejemplo n.º 25
0
def ntrain():
    """Train GBM, deep learning and DRF models and write a blended submission.

    Training features are assembled and normalized, three H2O models are fit
    on columns 1-38 of the feature frame, and their predictions on the test
    set are combined with weights 0.35 (deep learning), 0.30 (random forest)
    and 0.35 (GBM). The blended "True" column is written to ``wnvh.csv`` with
    a 1-based ``Id`` column.
    """
    h2o.init(ip="zurich.h2o.ai", strict_version_check=False)
    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)

    # H2OFrame is built from plain Python lists, not array rows.
    xd = [row.tolist() for row in X]
    y = np.asarray(y, dtype='bool_')

    xtr = H2OFrame(python_obj=xd)
    ytr = H2OFrame(python_obj=y.tolist())

    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39], y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)

    dl = h2o.deeplearning(x=xtr[1:39], y=ytr['C40'],
                          variable_importances=True, balance_classes=True,
                          input_dropout_ratio=0.2, rho=0.899,
                          hidden_dropout_ratios=[0.4, 0.4, 0.4, 0.4],
                          activation="Tanh", hidden=[39, 325, 325, 1],
                          epochs=100)

    rf = h2o.random_forest(x=xtr[1:39], y=ytr['C40'],
                           seed=1234, ntrees=600,
                           max_depth=20, balance_classes=False)

    testing = load_testing()
    X_test = assemble_X(testing, weather)
    # Called with the training mean/std; the return value is not used.
    normalize(X_test, mean, std)

    xts = H2OFrame(python_obj=[row.tolist() for row in X_test])

    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)

    # Weighted blend of the three models' predictions.
    gp = dp*0.35 + rp*0.3 + gbp*0.35

    gph = h2o.as_list(gp)
    n_rows = gp.nrow()
    Id = np.arange(1, n_rows + 1).reshape(n_rows, 1)
    df = pd.DataFrame(Id)
    # `gph.True` is a syntax error on Python 3 because True is a keyword;
    # item access selects the same column on both Python 2 and 3.
    df_concat = pd.concat([df, gph["True"]], axis=1)
    df_concat.columns = ['Id', 'WnvPresent']
    df_concat.to_csv("wnvh.csv", index=False)
Ejemplo n.º 26
0
def iris_ignore(ip,port):
    """Build DRF models on iris2 using the first 1, 2, 3 and 4 columns.

    Each model predicts column 4 with 50 trees and ``max_depth=100`` and is
    shown after training.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris2.csv"))

    for maxx in range(4):
        # Pass an explicit list: on Python 3 a bare range object is not a list.
        predictor_columns = list(range(maxx + 1))
        model = h2o.random_forest(y=iris[4], x=iris[predictor_columns], ntrees=50, max_depth=100)
        model.show()
Ejemplo n.º 27
0
def iris_get_model():
    """Train a DRF on iris, then fetch it again by id and show both."""
    csv_path = tests.locate("smalldata/iris/iris.csv")
    iris = h2o.import_file(path=csv_path)

    trained = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50)
    trained.show()

    # Look the same model up through its id.
    fetched = h2o.get_model(trained._id)
    fetched.show()
def deeplearning_autoencoder():
    """Train a DRF on autoencoder features and check a confusion-matrix cell.

    The training frame is split in two by a seeded uniform random column. An
    autoencoder with one hidden layer of ``nfeatures`` units is trained on one
    half; the other half is mapped to that hidden layer and used to train a
    10-tree random forest. The forest is evaluated on the test frame after the
    same mapping, and confusion-matrix cell ``[10][10]`` must be within 0.001
    of 0.086.
    """
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_path = pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz")
    train_hex = h2o.upload_file(train_path)
    train_hex[resp] = train_hex[resp].asfactor()

    test_path = pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz")
    test_hex = h2o.upload_file(test_path)
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    # NOTE(review): the value returned by drop() is not used - confirm
    # whether this call is expected to modify the frame.
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    autoencoder_settings = {
        "activation": "Tanh",
        "autoencoder": True,
        "hidden": [nfeatures],
        "epochs": 1,
        "reproducible": True,  # slow, turn off for real problems
        "seed": 1234,
    }
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp], **autoencoder_settings)

    # convert train_supervised with autoencoder to lower-dimensional space
    supervised_inputs = train_supervised[0:resp]
    train_supervised_features = ae_model.deepfeatures(supervised_inputs, 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_settings = {"ntrees": 10, "min_rows": 10, "seed": 1234}
    drf_model = h2o.random_forest(
        x=train_supervised_features[0:20], y=train_supervised[resp], **drf_settings
    )

    # Test the DRF model on the test set (processed through deep features)
    test_inputs = test_hex[0:resp]
    test_features = ae_model.deepfeatures(test_inputs, 0)
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # expected value 0.086 +/- 0.001
    error_cell = cm.cell_values[10][10]
    assert abs(error_cell - 0.086) < 0.001, "Error. Expected 0.086, but got {0}".format(
        error_cell
    )
Ejemplo n.º 29
0
def iris_ignore(ip, port):
    """Build DRF models on iris2 using the first 1, 2, 3 and 4 columns.

    Each model predicts column 4 with 50 trees and ``max_depth=100`` and is
    shown after training.

    :param ip: not used by this body.
    :param port: not used by this body.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris2.csv"))

    for maxx in range(4):
        # Pass an explicit list: on Python 3 a bare range object is not a list.
        predictor_columns = list(range(maxx + 1))
        model = h2o.random_forest(y=iris[4],
                                  x=iris[predictor_columns],
                                  ntrees=50,
                                  max_depth=100)
        model.show()
Ejemplo n.º 30
0
def deeplearning_autoencoder(ip, port):
    """Train a DRF on autoencoder features and check its total error.

    The training frame is split in two by a seeded uniform random column. An
    autoencoder with one hidden layer of ``nfeatures`` units is trained on one
    half; the other half is mapped to that hidden layer and used to train a
    10-tree random forest. The confusion matrix on the mapped test frame must
    report a ``["Totals", "Error"]`` value within 0.001 of 0.1038.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_path = h2o.locate("bigdata/laptop/mnist/train.csv.gz")
    train_hex = h2o.import_frame(train_path)
    test_path = h2o.locate("bigdata/laptop/mnist/test.csv.gz")
    test_hex = h2o.import_frame(test_path)

    # split data into two parts
    sid = train_hex[1].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    autoencoder_settings = dict(
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )
    ae_model = h2o.deeplearning(
        x=train_unsupervised.drop(resp),
        y=train_unsupervised[resp],  # ignored (pick any non-constant)
        **autoencoder_settings)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()

    feature_count = train_supervised_features.ncol()
    assert feature_count == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features,
                                  y=train_supervised[resp].asfactor(),
                                  ntrees=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_inputs = test_hex.drop(resp)
    test_features = ae_model.deepfeatures(test_inputs, 0)
    # NOTE(review): the value returned by cbind() is not used - confirm
    # whether cbind is expected to modify test_features in place here.
    test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # expected total error 0.1038 +/- 0.001
    total_error = cm["Totals", "Error"]
    assert abs(total_error - 0.1038) < 0.001, "Error not as expected"
Ejemplo n.º 31
0
def javapredict(algo, train, test, x, y, **kwargs):
    """Compare H2O predictions with those of the model's generated Java POJO.

    Trains a model, downloads its POJO into ``../results/<model id>``
    (relative to this file), compiles it and runs it through the
    ``hex.genmodel.tools.PredictCsv`` driver on ``test[x]``, then asserts
    that both prediction sets have the same dimensions and that the first
    prediction column matches row by row.

    :param algo: ``"gbm"`` or ``"random_forest"``.
    :param train: frame used for training.
    :param test: frame to predict on.
    :param x: predictor column selector applied to ``train`` and ``test``.
    :param y: response column selector applied to ``train``.
    :param kwargs: extra arguments forwarded to the H2O model builder.
    :raises ValueError: if ``algo`` is not one of the supported names.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    else:
        # Raise a real exception instance; ``raise (ValueError, msg)`` raises
        # a tuple, which drops the message (Python 2) or is a TypeError (3).
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    here = os.path.dirname(os.path.realpath(__file__))
    tmpdir = os.path.realpath(os.path.join(here, "..", "results", model._id))
    os.makedirs(tmpdir)
    h2o.download_pojo(model, path=tmpdir)

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    h2o.download_csv(predictions, os.path.join(tmpdir, "out_h2o.csv"))

    print("Setting up for Java POJO")
    in_csv_path = os.path.join(tmpdir, "in.csv")
    out_pojo_path = os.path.join(tmpdir, "out_pojo.csv")
    h2o.download_csv(test[x], in_csv_path)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; ``with`` closes it even on failure.
    with open(in_csv_path, 'r+') as f:
        unquoted = f.read().replace('"', '')
        f.seek(0)
        f.write(unquoted)
        f.truncate()

    genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    # os.pathsep is ":" on POSIX (as before) and ";" on Windows.
    classpath = os.pathsep.join([genmodel_jar, tmpdir])
    subprocess.call(["javac", "-cp", genmodel_jar, "-J-Xmx4g",
                     "-J-XX:MaxPermSize=256m",
                     os.path.join(tmpdir, model._id + ".java")],
                    stderr=subprocess.STDOUT)
    subprocess.call(["java", "-ea", "-cp", classpath, "-Xmx4g",
                     "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
                     "hex.genmodel.tools.PredictCsv", "--header",
                     "--model", model._id,
                     "--input", in_csv_path,
                     "--output", out_pojo_path],
                    stderr=subprocess.STDOUT)

    predictions2 = h2o.import_file(out_pojo_path)

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if algo == "gbm":
            # The POJO output for gbm is parsed as a hexadecimal float string.
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            # Only "random_forest" can reach here: algo was validated above.
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
def iris_nfolds_getModel(ip, port):
    """Build a cross-validated DRF on iris and fetch it again by key.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    iris_csv = h2o.locate("smalldata/iris/iris.csv")
    iris_frame = h2o.import_frame(path=iris_csv)

    # Column 4 is the response; columns 0-3 are the predictors.
    response = iris_frame[4]
    predictors = iris_frame[0:4]
    forest = h2o.random_forest(y=response, x=predictors, ntrees=50, nfolds=5)
    forest.show()

    # Look the model up by its key and display the retrieved copy.
    forest = h2o.getModel(forest._key)
    forest.show()
def iris_nfolds_getModel(ip, port):
    """Train a 5-fold DRF on the iris data, then retrieve it via its key.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    iris_frame = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    # Response is column 4; predictors are columns 0 through 3.
    target = iris_frame[4]
    features = iris_frame[0:4]
    trained = h2o.random_forest(y=target, x=features, ntrees=50, nfolds=5)
    trained.show()

    # Fetch the same model by key and show it once more.
    fetched = h2o.getModel(trained._key)
    fetched.show()
def milsong_checkpoint():
    """Check that a checkpoint-restarted DRF matches a one-shot DRF.

    Builds a forest with randomly drawn ``ntrees``/``max_depth``/``min_rows``,
    saves and reloads it, continues training from the restored model for 50
    more trees, and asserts that the validation MSE equals that of a forest
    built in one shot with the same final parameters and seed.
    """
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(list(range(50, 100)), 1)[0]
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1, max_depth=max_depth1,
                               min_rows=min_rows1, validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model: same shape, 50 more trees
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                               min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                               min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    assert isinstance(model2, type(model3))
    # Fetch each MSE once; the message now names the model actually compared.
    mse2 = model2.mse(valid=True)
    mse3 = model3.mse(valid=True)
    assert mse2 == mse3, "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(mse2, mse3)
def deeplearning_autoencoder(ip, port):
    """Train a DRF on autoencoder-extracted MNIST features and check its error.

    The training frame is cut in two: one part trains a single-hidden-layer
    autoencoder, the other is projected through it and used to fit a random
    forest. The forest's total error on the projected test frame must be
    within 0.001 of 0.1038.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    label_col = 784
    n_hidden = 20  # number of features (smallest hidden layer)

    train_path = h2o.locate("bigdata/laptop/mnist/train.csv.gz")
    full_train = h2o.import_frame(train_path)
    test_path = h2o.locate("bigdata/laptop/mnist/test.csv.gz")
    full_test = h2o.import_frame(test_path)

    # runif(1234) values assign each training row to one of the two parts.
    selector = full_train[1].runif(1234)

    # Part used to train the autoencoder (no labels needed).
    ae_part = full_train[selector >= 0.5]
    ae_part.describe()

    # Part used to train the random forest.
    drf_part = full_train[selector < 0.5]
    drf_part.describe()

    encoder = h2o.deeplearning(
        x=ae_part.drop(label_col),
        y=ae_part[label_col],  # ignored (pick any non-constant)
        activation="Tanh",
        autoencoder=True,
        hidden=[n_hidden],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234)

    # Map the forest's training rows into the lower-dimensional space.
    drf_inputs = encoder.deepfeatures(drf_part, 0)
    drf_inputs.describe()

    assert drf_inputs.ncol() == n_hidden, "Dimensionality of reconstruction is wrong!"

    # Fit the forest on the extracted feature space.
    forest = h2o.random_forest(
        x=drf_inputs,
        y=drf_part[label_col].asfactor(),
        ntrees=10,
        seed=1234)

    # Score on the test set after projecting it through the same layer.
    test_inputs = encoder.deepfeatures(full_test.drop(label_col), 0)
    test_inputs.cbind(full_test[label_col])

    matrix = forest.confusionMatrix(test_inputs)
    matrix.show()

    # 10% error +/- 0.001
    observed_error = matrix["Totals", "Error"]
    assert abs(observed_error - 0.1038) < 0.001, "Error not as expected"
def vi_reg():
    """Check the two most important DRF variables on the Boston housing data.

    Trains a regression forest on columns 0-12 against column 13 and asserts
    that the first two entries of the variable-importance table are "rm" and
    "lstat", in that order.
    """
    housing = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))

    forest = h2o.random_forest(x=housing[0:13], y=housing[13], ntrees=100, max_depth=20, nbins=100, seed=0)

    # First cell of each importance row is the variable name.
    importances = forest._model_json['output']['variable_importances']
    ranking = []
    for row in range(housing.ncol - 1):
        ranking.append(importances.cell_values[row][0])
    print(ranking)

    top_two = (ranking[0], ranking[1])
    assert top_two == ("rm", "lstat"), "expected specific variable importance ranking"
Ejemplo n.º 37
0
def vi_reg(ip, port):
    """Verify the leading DRF variable importances for Boston housing.

    A regression forest is fit on columns 0-12 with column 13 as response;
    the importance table must list "rm" first and "lstat" second.

    :param ip: unused here.
    :param port: unused here.
    """
    boston = h2o.import_file(path=h2o.locate("smalldata/gbm_test/BostonHousing.csv"))

    model = h2o.random_forest(x=boston[0:13], y=boston[13], ntrees=100, max_depth=20, nbins=100, seed=0)

    # Collect the variable name (first cell) from every importance row.
    varimp_table = model._model_json['output']['variable_importances']
    ranking = []
    for idx in range(boston.ncol - 1):
        ranking.append(varimp_table.cell_values[idx][0])
    print(ranking)

    leading_pair = (ranking[0], ranking[1])
    assert leading_pair == ("rm", "lstat"), "expected specific variable importance ranking"
Ejemplo n.º 38
0
def imbalanced(ip, port):
    """Compare the class-6 error of DRF with and without ``balance_classes``.

    Two 10-tree, 3-fold forests are trained on covtype, one per setting, and
    the error read from confusion-matrix cell ``[5][7]`` of each is compared:
    the unbalanced error must be at least 90% of the balanced one.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    covtype = h2o.import_frame(
        path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    # Model trained without class balancing.
    plain_model = h2o.random_forest(x=covtype[0:54],
                                    y=covtype[54],
                                    ntrees=10,
                                    balance_classes=False,
                                    nfolds=3)
    plain_perf = plain_model.model_performance(covtype)
    plain_perf.show()

    # Same model with class balancing switched on.
    balanced_model = h2o.random_forest(x=covtype[0:54],
                                       y=covtype[54],
                                       ntrees=10,
                                       balance_classes=True,
                                       nfolds=3)
    balanced_perf = balanced_model.model_performance(covtype)
    balanced_perf.show()

    # Error for class 6 (difficult minority) from each confusion matrix.
    plain_cm = plain_perf.confusion_matrix()
    err_plain = plain_cm.cell_values[5][7]
    balanced_cm = balanced_perf.confusion_matrix()
    err_balanced = balanced_cm.cell_values[5][7]

    report = ("--------------------",
              "",
              "class_6_err_imbalanced",
              err_plain,
              "",
              "class_6_err_balanced",
              err_balanced,
              "",
              "--------------------")
    for line in report:
        print(line)

    assert err_plain >= 0.9 * err_balanced, "balance_classes makes it at least 10% worse!"
Ejemplo n.º 39
0
def czechboardRF(ip, port):
    """Train a DRF classifier on the 300x300 checkerboard data and show it.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    # Training set has checkerboard pattern; C3 is the categorical response.
    csv_path = h2o.locate("smalldata/gbm_test/czechboard_300x300.csv")
    checkerboard = h2o.import_frame(path=csv_path)
    checkerboard["C3"] = checkerboard["C3"].asfactor()
    checkerboard.summary()

    forest = h2o.random_forest(
        x=checkerboard[["C1", "C2"]],
        y=checkerboard["C3"],
        ntrees=50,
        max_depth=20,
        nbins=500)
    forest.show()
Ejemplo n.º 40
0
def czechboardRF(ip, port):
    """Fit and display a DRF classifier on the 300x300 checkerboard data.

    :param ip: unused here.
    :param port: unused here.
    """
    # Training set has checkerboard pattern; C3 is the categorical response.
    csv_path = h2o.locate("smalldata/gbm_test/czechboard_300x300.csv")
    checkerboard = h2o.import_file(path=csv_path)
    checkerboard["C3"] = checkerboard["C3"].asfactor()
    checkerboard.summary()

    forest = h2o.random_forest(
        x=checkerboard[["C1", "C2"]],
        y=checkerboard["C3"],
        ntrees=50,
        max_depth=20,
        nbins=500)
    forest.show()
Ejemplo n.º 41
0
def smallcatRF(ip, port):
    """Compare H2O DRF and scikit-learn random forest AUC on a small
    categorical data set.

    The training set has 26 categories from A to Z: A, C, E, G, ... are
    perfect predictors of y = 1 and B, D, F, H, ... of y = 0. Both libraries
    fit a single depth-1 tree, and the H2O AUC must be at least the
    scikit-learn AUC.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    alphabet = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()

    def letter_code(cell):
        # The first CSV field is a quoted letter; use its character code.
        return ord(cell.split("\"")[1])

    # Same file as a numeric array for scikit: column 0 holds the letter
    # codes, column 1 the response.
    sk_data = np.loadtxt(
        h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        delimiter=',',
        skiprows=1,
        converters={0: letter_code})
    sk_response = sk_data[:, 1]
    sk_features = sk_data[:, 0]

    # H2O random forest: one tree of depth one.
    h2o_forest = h2o.random_forest(x=alphabet[['X']],
                                   y=alphabet["y"],
                                   ntrees=1,
                                   max_depth=1,
                                   nbins=100)

    # scikit random forest with matching size parameters.
    sk_forest = ensemble.RandomForestClassifier(n_estimators=1,
                                                criterion='entropy',
                                                max_depth=1)
    sk_forest.fit(sk_features[:, np.newaxis], sk_response)

    # AUC of the H2O model on its training frame.
    h2o_perf = h2o_forest.model_performance(alphabet)
    auc_h2o = h2o_perf.auc()

    # AUC of the scikit model from its probability for the second class.
    sk_scores = sk_forest.predict_proba(sk_features[:, np.newaxis])[:, 1]
    auc_sci = roc_auc_score(sk_response, sk_scores)

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def imbalanced(ip, port):
    """Compare class-6 error of DRF trained with and without class balancing.

    Two 50-tree, 10-fold forests are trained on covtype, one per
    ``balance_classes`` setting. Entry 6 of each performance object's
    ``error()`` result is compared (presumably the class-6 error -- confirm
    against the H2O metrics API): the unbalanced value must be at least 90%
    of the balanced one.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    # Forest without class balancing.
    plain_model = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=50,
                                    balance_classes=False, nfolds=10)
    plain_perf = plain_model.model_performance(covtype)
    plain_perf.show()

    # Forest with class balancing.
    balanced_model = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=50,
                                       balance_classes=True, nfolds=10)
    balanced_perf = balanced_model.model_performance(covtype)
    balanced_perf.show()

    # Compare error for class 6 (difficult minority).
    err_plain = plain_perf.error()[6]
    err_balanced = balanced_perf.error()[6]

    # Diagnostic dump whenever balancing gave the larger error; the actual
    # pass/fail decision is the tolerance check in the assert below.
    if err_plain < err_balanced:
        report = ("--------------------",
                  "",
                  "FAIL, balanced error greater than imbalanced error",
                  "",
                  "",
                  "class_6_err_imbalanced",
                  err_plain,
                  "",
                  "class_6_err_balanced",
                  err_balanced,
                  "",
                  "--------------------")
        for line in report:
            print(line)

    assert err_plain >= 0.9 * err_balanced, "balance_classes makes it at least 10% worse!"
Ejemplo n.º 43
0
def vi_toy_test(ip, port):
    """Check the full DRF variable-importance ordering on the toy data set.

    Column 6 is converted to a factor and used as the response; the six
    remaining columns must be ranked V3, V2, V6, V5, V1, V4.

    :param ip: unused here.
    :param port: unused here.
    """
    toy = h2o.import_file(path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))

    toy[6] = toy[6].asfactor()
    toy.show()
    forest = h2o.random_forest(
        x=toy[[0, 1, 2, 3, 4, 5]],
        y=toy[6],
        ntrees=500,
        max_depth=20,
        nbins=100,
        seed=0)

    # First cell of each importance row is the variable name.
    importances = forest._model_json['output']['variable_importances']
    ranking = []
    for row in range(toy.ncol - 1):
        ranking.append(importances.cell_values[row][0])
    print(ranking)

    expected = ("V3", "V2", "V6", "V5", "V1", "V4")
    assert tuple(ranking) == expected, "expected specific variable importance ranking"
Ejemplo n.º 44
0
def milsong_checkpoint(ip, port):
    """Check that a checkpoint-restarted DRF matches a one-shot DRF.

    Builds a forest with randomly drawn ``ntrees``/``max_depth``/``min_rows``,
    saves and reloads it, continues training from the restored model for 50
    more trees, and asserts that the validation MSE equals that of a forest
    built in one shot with the same final parameters and seed.

    :param ip: unused here.
    :param port: unused here.
    """
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1, max_depth=max_depth1,
                               min_rows=min_rows1, validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    # save the model, load it back, then remove the saved copy from disk
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model: same shape, 50 more trees
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                               min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                               min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    assert isinstance(model2, type(model3))
    # Fetch each MSE once; the message now names the model actually compared.
    mse2 = model2.mse(valid=True)
    mse3 = model3.mse(valid=True)
    assert mse2 == mse3, "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(mse2, mse3)
def czechboardRF(ip, port):
    """Train and display a DRF classifier on the 300x300 checkerboard data.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    # Training set has checkerboard pattern; C3 is the categorical response.
    csv_path = h2o.locate("smalldata/gbm_test/czechboard_300x300.csv")
    checkerboard = h2o.import_frame(path=csv_path)
    checkerboard["C3"] = checkerboard["C3"].asfactor()

    # 50 trees, depth up to 20, 500 bins.
    forest = h2o.random_forest(
        x=checkerboard[["C1", "C2"]],
        y=checkerboard["C3"],
        ntrees=50,
        max_depth=20,
        nbins=500)
    forest.show()
Ejemplo n.º 46
0
def bigcatRF():
    """Train and display a single-split DRF on a 100-category data set.

    The training set has 100 categories from cat001 to cat100: cat001,
    cat003, ... are perfect predictors of y = 1 and cat002, cat004, ... are
    perfect predictors of y = 0.
    """
    csv_path = h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    categories = h2o.import_file(path=csv_path)
    categories["y"] = categories["y"].asfactor()

    # One tree of depth one, with 100 numeric bins and 10 categorical bins.
    forest = h2o.random_forest(
        x=categories[["X"]],
        y=categories["y"],
        ntrees=1,
        max_depth=1,
        nbins=100,
        nbins_cats=10)
    forest.show()
def rf_mean_residual_deviance(ip, port):
    """Check that DRF reports float mean residual deviance for every split.

    A 3-fold regression forest is trained on the cars data with a held-out
    validation frame; the training, validation and cross-validation mean
    residual deviances must each be a ``float``.

    :param ip: unused here.
    :param port: unused here.
    """
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # Rows with a runif value above 0.2 train; the rest validate.
    draw = cars[0].runif()
    train = cars[draw > 0.2]
    valid = cars[draw <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    forest = h2o.random_forest(x=train[predictors],
                               y=train[response_col],
                               validation_x=valid[predictors],
                               validation_y=valid[response_col],
                               nfolds=3)

    deviances = forest.mean_residual_deviance(train=True, valid=True, xval=True)
    checks = (("train", "training"),
              ("valid", "validation"),
              ("xval", "cross-validation"))
    for key, label in checks:
        value = deviances[key]
        assert isinstance(value, float), \
            "Expected {0} mean residual deviance to be a float, but got {1}".format(label, type(value))
def rf_mean_residual_deviance(ip, port):
    """Verify the type of DRF mean residual deviance on all three data splits.

    Trains a 3-fold regression forest on the cars data with a separate
    validation frame and asserts that the training, validation and
    cross-validation mean residual deviances are all ``float`` values.

    :param ip: unused here.
    :param port: unused here.
    """
    csv_path = h2o.locate("smalldata/junit/cars_20mpg.csv")
    cars = h2o.import_file(path=csv_path)

    # runif values above 0.2 go to training, the remainder to validation.
    selector = cars[0].runif()
    train = cars[selector > 0.2]
    valid = cars[selector <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    model = h2o.random_forest(
        x=train[predictors],
        y=train[response_col],
        validation_x=valid[predictors],
        validation_y=valid[response_col],
        nfolds=3)

    mrd = model.mean_residual_deviance(train=True, valid=True, xval=True)
    labelled_keys = [("train", "training"),
                     ("valid", "validation"),
                     ("xval", "cross-validation")]
    for key, label in labelled_keys:
        metric = mrd[key]
        assert isinstance(metric, float), \
            "Expected {0} mean residual deviance to be a float, but got {1}".format(label, type(metric))
Ejemplo n.º 49
0
def fiftycatRF(ip, port):
    """Train DRF on 45 categories and score it on a test set with all 50.

    The model is fit on ``50_cattest_train.csv`` (only categories cat1
    through cat45) and then used to predict on ``50_cattest_test.csv``
    (categories cat1 through cat50); predictions, performance and confusion
    matrices are printed.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    # Training data: response "y" becomes a factor for classification.
    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train_frame = h2o.import_frame(path=train_path)
    train_frame["y"] = train_frame["y"].asfactor()

    forest = h2o.random_forest(
        x=train_frame[["x1", "x2"]],
        y=train_frame["y"],
        ntrees=50,
        max_depth=20,
        nbins=500)

    # Test data contains categories the model never saw during training.
    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test_frame = h2o.import_frame(path=test_path)

    predictions = forest.predict(test_frame)
    predictions.head()

    # Performance summary and confusion matrices on the test data.
    performance = forest.model_performance(test_frame)
    performance.show()
    matrices = performance.confusion_matrices()
    print(matrices)
Ejemplo n.º 50
0
def _weights_vi_predictor(response, agreement):
    """Return one noisy 0/1 predictor value per label in ``response``.

    For an 'a' label the value is 1 when a uniform draw falls below
    ``agreement`` (else 0); for any other label it is 0 when the draw falls
    below ``agreement`` (else 1). Exactly one draw is made per label.
    """
    values = []
    for label in response:
        agrees = random.uniform(0, 1) < agreement
        if label == 'a':
            values.append(1 if agrees else 0)
        else:
            values.append(0 if agrees else 1)
    return values


def _weights_vi_dataset(agreements, rows_per_class=10000):
    """Build rows ``[response, p1, p2, p3]`` for the given agreement rates."""
    response = ['a'] * rows_per_class + ['b'] * rows_per_class
    # Columns are generated one after another (p1, then p2, then p3).
    columns = [_weights_vi_predictor(response, rate) for rate in agreements]
    return [list(row) for row in zip(response, *columns)]


def _weights_vi_frame(rows, names):
    """Upload ``rows`` as an H2O frame and assign its column names."""
    frame = h2o.H2OFrame(python_obj=rows)
    frame.setNames(names)
    return frame


def _weights_vi_ranking(model):
    """Return the model's variable names in the order ``varimp`` lists them."""
    return tuple(entry[0] for entry in model.varimp(return_list=True))


def _weights_vi_combined_ranking(dataset1, weight1, dataset2, weight2):
    """Rank variables for a weighted DRF over both datasets stacked together."""
    # Build fresh rows so the unweighted datasets stay untouched.
    combined = ([row + [weight1] for row in dataset1] +
                [row + [weight2] for row in dataset2])
    frame = _weights_vi_frame(combined,
                              ["response", "p1", "p2", "p3", "weights"])
    model = h2o.random_forest(x=frame[["p1", "p2", "p3"]],
                              y=frame["response"],
                              training_frame=frame,
                              weights_column="weights")
    return _weights_vi_ranking(model)


def weights_vi(ip, port):
    """Check that observation weights steer DRF variable importances.

    Two synthetic datasets are generated whose predictors agree with the
    response at different rates, so their importance rankings differ. When
    both are stacked and weighted 80/20, the ranking of the combined model
    must follow whichever dataset carries the larger weight.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    names = ["response", "p1", "p2", "p3"]

    # dataset1: p1 predicts response ~90% of the time, p2 ~70%, p3 ~50%
    dataset1_python = _weights_vi_dataset((0.9, 0.7, 0.5))
    dataset1_h2o = _weights_vi_frame(dataset1_python, names)

    # dataset2: p3 predicts response ~90% of the time, p1 ~70%, p2 ~50%
    dataset2_python = _weights_vi_dataset((0.7, 0.5, 0.9))
    dataset2_h2o = _weights_vi_frame(dataset2_python, names)

    # variable importances on each dataset by itself
    model_dataset1 = h2o.random_forest(x=dataset1_h2o[["p1", "p2", "p3"]],
                                       y=dataset1_h2o["response"])
    varimp_dataset1 = _weights_vi_ranking(model_dataset1)
    assert varimp_dataset1 == ('p1', 'p2', 'p3'), "Expected the following relative variable importance on dataset1: " \
                                                  "('p1', 'p2', 'p3'), but got: {0}".format(varimp_dataset1)

    model_dataset2 = h2o.random_forest(x=dataset2_h2o[["p1", "p2", "p3"]],
                                       y=dataset2_h2o["response"])
    varimp_dataset2 = _weights_vi_ranking(model_dataset2)
    assert varimp_dataset2 == ('p3', 'p1', 'p2'), "Expected the following relative variable importance on dataset2: " \
                                                  "('p3', 'p1', 'p2'), but got: {0}".format(varimp_dataset2)

    # Test1: weight the combined dataset 80/20 in favor of dataset 1; the
    # ranking should match dataset1's.
    varimp_combined = _weights_vi_combined_ranking(dataset1_python, 0.8,
                                                   dataset2_python, 0.2)
    assert varimp_combined == ('p1', 'p2', 'p3'), "Expected the following relative variable importance on the combined " \
                                                  "dataset: ('p1', 'p2', 'p3'), but got: {0}".format(varimp_combined)

    # Test2: weight the combined dataset 80/20 in favor of dataset 2; the
    # ranking should match dataset2's.
    varimp_combined = _weights_vi_combined_ranking(dataset1_python, 0.2,
                                                   dataset2_python, 0.8)
    assert varimp_combined == ('p3', 'p1', 'p2'), "Expected the following relative variable importance on the combined " \
                                                  "dataset: ('p3', 'p1', 'p2'), but got: {0}".format(varimp_combined)
Ejemplo n.º 51
0
def cars_checkpoint(ip, port):
    """Check which DRF parameters may change when restarting from a checkpoint.

    A base forest is built on the cars data and then continued with altered
    ``ntrees``/``max_depth``/``min_rows``/``r2_stopping``. Restarting with
    any of ``mtries``, ``sample_rate``, ``nbins_cats``, ``nbins``,
    ``balance_classes`` or ``nfolds`` is expected to raise
    ``EnvironmentError``.

    :param ip: unused here.
    :param port: unused here.
    """
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    # build first model
    base_model = h2o.random_forest(x=cars[predictors],
                                   y=cars[response_col],
                                   ntrees=10,
                                   max_depth=2,
                                   min_rows=10)

    # continue building the model
    continued_model = h2o.random_forest(x=cars[predictors],
                                        y=cars[response_col],
                                        ntrees=11,
                                        max_depth=3,
                                        min_rows=9,
                                        r2_stopping=0.8,
                                        checkpoint=base_model._id)

    # erroneous, not MODIFIABLE_BY_CHECKPOINT_FIELDS (PUBDEV-1833): every one
    # of these must make the model build fail.
    frozen_params = (("mtries", 2),
                     ("sample_rate", 0.5),
                     ("nbins_cats", 99),
                     ("nbins", 99),
                     ("balance_classes", True),
                     ("nfolds", 3))
    for param, value in frozen_params:
        try:
            rejected = h2o.random_forest(y=cars[response_col],
                                         x=cars[predictors],
                                         checkpoint=base_model._id,
                                         **{param: value})
        except EnvironmentError:
            pass
        else:
            assert False, "Expected model-build to fail because {0} not modifiable by checkpoint".format(param)
Ejemplo n.º 52
0
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Build an H2O model, export its POJO, and compare POJO predictions with H2O's.

    The model is trained in H2O, its generated Java source is downloaded
    into a directory named after the model id (under ``../results`` relative
    to this file), compiled with ``javac`` and run through
    ``hex.genmodel.tools.PredictCsv`` on the test predictors. The first
    prediction column of both outputs is then compared row by row.

    Args:
        algo: "gbm", "random_forest", "deeplearning" or "glm".
        equality: "numeric" compares predictions within 1e-4 (the POJO value
            is parsed with ``float.fromhex``); "class" requires exact equality.
        train: frame indexed as ``train[x]`` / ``train[y]`` for training.
        test: frame scored by the model; ``test[x]`` is the POJO input.
        x: predictor selector applied to ``train`` and ``test``.
        y: response selector applied to ``train``.
        **kwargs: forwarded unchanged to the H2O model builder.

    Raises:
        ValueError: if ``algo`` or ``equality`` is not supported.
        AssertionError: if an expected file is missing or predictions differ.
        subprocess.CalledProcessError: if ``javac`` exits with a non-zero status.
    """

    def _require_file(path):
        # Every artifact of the pipeline must exist before the next step runs.
        assert os.path.exists(
            path), "Expected file {0} to exist, but it does not.".format(path)

    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        # Raise a real exception instance; raising a tuple loses the message
        # on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    _require_file(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    _require_file(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    _require_file(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; `with` closes it even if a step fails.
    with open(in_csv, 'r+') as csv_file:
        unquoted = csv_file.read().replace('"', '')
        csv_file.seek(0)
        csv_file.write(unquoted)
        csv_file.truncate()
    _require_file(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m",
        java_file
    ]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    # The classpath separator is platform dependent.
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    # stderr is merged into stdout, so only the first stream carries output.
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, _ = p.communicate()
    print("Java output: {0}".format(o))
    _require_file(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(
        hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(
        hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(
                hp - pp
            ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        else:
            raise ValueError(
                "equality type {0} is not supported".format(equality))
Ejemplo n.º 53
0
def cv_carsRF():
    """Exercise the cross-validation options of H2O random forest on the cars data.

    Covers repeatability of seeded "Modulo" folds, variability of "Random"
    folds, a user supplied fold column, keep_cross_validation_predictions,
    boundary values of nfolds, and argument combinations that are expected
    to be rejected with EnvironmentError.

    Raises:
        AssertionError: if any of the expectations above does not hold.
    """

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3), 1)[0]
    # NOTE(review): the random draw above is immediately overridden, so only
    # the multinomial case ever runs. The draw is kept so the random stream
    # consumed by the calls below is unchanged -- confirm the override is intended.
    problem = 2

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Modulo",
                            seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Modulo",
                            seed=1234)
    pyunit_utils.check_models(rf1, rf2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Random")
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Random")
    # The failure assertion lives in `else`, outside the guarded call, so the
    # AssertionError handler cannot swallow it.
    try:
        pyunit_utils.check_models(rf1, rf2, True)
    except AssertionError:
        pass  # check_models reported a difference, which is what we expect
    else:
        assert False, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(python_obj=[[
        random.randint(0, num_folds - 1) for _ in range(cars.nrow)
    ]])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           training_frame=cars,
                           fold_column="fold_assignments",
                           keep_cross_validation_predictions=True)
    cv_models = rf._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models)
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                                    "{1}".format(num_folds, num_cv_models)
    # Look up the first two cross-validation models; the returned objects are
    # not used, so this only checks that the lookups do not raise.
    for cv_model in cv_models[:2]:
        h2o.get_model(cv_model['name'])

    # 4. keep_cross_validation_predictions
    # rf1 was built without keep_cross_validation_predictions.
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions)

    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                                          "as folds, but got {0}".format(len(cv_predictions))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    h2o.random_forest(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=cars.nrow,
                      fold_assignment="Modulo")

    # 2. nfolds = 0
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=0,
                            seed=1234)
    # check that this is equivalent to no nfolds
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            seed=1234)
    pyunit_utils.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    h2o.random_forest(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=random.randint(3, 10),
                      validation_y=cars[response_col],
                      validation_x=cars[predictors])

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.random_forest(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=random.sample([-1, 1], 1)[0])
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"

    # 2. more folds than observations
    try:
        h2o.random_forest(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=cars.nrow + 1,
                          fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when nfolds > nobs"

    # 3. fold_column and nfolds both specified
    try:
        h2o.random_forest(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=3,
                          fold_column="fold_assignments",
                          training_frame=cars)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
def cars_checkpoint():
    """Check that resuming a random forest from a checkpoint matches a one-shot build.

    A small forest (model 1) is trained on the cars data, saved, reloaded and
    then extended twice from the restored checkpoint (models 2 and 3).
    Model 4 is trained from scratch with model 2's parameters, and the
    validation metrics of models 2 and 4 are required to be equal.

    Raises:
        AssertionError: if model 2 and model 4 disagree on a compared metric.
    """
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    split = cars.runif()
    train = cars[split > .2]
    valid = cars[split <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # Type of model-building exercise: 0 regression, 1 binomial, 2 multinomial.
    problem = random.sample(range(3), 1)[0]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = {1: "economy_20mpg", 2: "cylinders"}.get(problem, "economy")
    if problem in (1, 2):
        # Classification responses must be categorical in both frames.
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()

    print("\n*** Response column: {0}".format(response_col))

    # Tree shape is drawn once and shared by every model below.
    ntrees1 = 5
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]

    def _grow_forest(ntrees, **extra):
        # All four models differ only in tree count and the optional checkpoint.
        return h2o.random_forest(x=train[predictors],
                                 y=train[response_col],
                                 ntrees=ntrees,
                                 max_depth=max_depth1,
                                 min_rows=min_rows1,
                                 score_each_iteration=True,
                                 validation_x=valid[predictors],
                                 validation_y=valid[response_col],
                                 seed=1234,
                                 **extra)

    # Model 1: the base forest.
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))
    model1 = _grow_forest(ntrees1)

    # Round-trip model 1 through disk, then remove the saved copy.
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # Model 2: continue from the checkpoint with five more trees.
    ntrees2 = ntrees1 + 5
    print("\n*** Continuing to build model 1 (now called model 2) with the following parameters:")
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth1))
    print("*** min_rows model 2: {0}".format(min_rows1))
    model2 = _grow_forest(ntrees2, checkpoint=restored_model._id)

    # Model 3: continue from the same checkpoint with a larger tree count.
    ntrees3 = ntrees2 + 50
    print("\n*** Continuing to build model 1 (now called model 3) with the following parameters:")
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth1))
    print("*** min_rows model 3: {0}".format(min_rows1))
    model3 = _grow_forest(ntrees3, checkpoint=restored_model._id)

    # Model 4: the one-shot equivalent of model 2.
    print("\n*** Building the equivalent of model 2 (called model 4) in one shot:")
    model4 = _grow_forest(ntrees2)

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.score_history())
    print("\n*** Score History for model 3:")
    print(model3.score_history())
    print("\n*** Score History for model 4:")
    print(model4.score_history())

    # checks: only model 2 vs model 4 equality is enforced; the matching
    # "model 3 differs from model 4" checks were disabled in the original.
    assert isinstance(model2, type(model4))

    if problem == 1:
        auc2, auc4 = model2.auc(valid=True), model4.auc(valid=True)
        assert auc2 == auc4, "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(auc2, auc4)

        logloss2, logloss4 = model2.logloss(valid=True), model4.logloss(valid=True)
        assert logloss2 == logloss4, "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(logloss2, logloss4)

        gini2, gini4 = model2.giniCoef(valid=True), model4.giniCoef(valid=True)
        assert gini2 == gini4, "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(gini2, gini4)
    else:
        mse2, mse4 = model2.mse(valid=True), model4.mse(valid=True)
        assert mse2 == mse4, "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(mse2, mse4)

        if problem != 0:
            # R2 is only compared in the multinomial case.
            r2_2, r2_4 = model2.r2(valid=True), model4.r2(valid=True)
            assert r2_2 == r2_4, "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(r2_2, r2_4)
Ejemplo n.º 55
0
def weights_vi():
    """Check that observation weights steer random forest variable importances.

    Two synthetic binary datasets are generated with different predictor
    strengths (dataset1: p1 > p2 > p3, dataset2: p3 > p1 > p2). Their rows are
    then stacked with a weights column; whichever dataset carries the 0.8
    weight is expected to determine the importance ranking of the model
    trained on the combination.

    Raises:
        AssertionError: if a variable-importance ranking is not the expected one.
    """
    predictor_names = ["p1", "p2", "p3"]

    def _noisy_predictor(labels, agreement):
        # One uniform draw per row: with probability `agreement` the value
        # encodes the label (1 for 'a', 0 otherwise), else it is flipped.
        column = []
        for label in labels:
            agrees = random.uniform(0, 1) < agreement
            if label == 'a':
                column.append(1 if agrees else 0)
            else:
                column.append(0 if agrees else 1)
        return column

    def _ranking(model):
        # First element of each varimp() entry, in the order returned.
        return tuple([entry[0] for entry in model.varimp()])

    ###### synthetic dataset1: p1 predicts response ~90% of the time, p2 ~70%, p3 ~50%
    response = ['a'] * 10000 + ['b'] * 10000
    p1 = _noisy_predictor(response, 0.9)
    p2 = _noisy_predictor(response, 0.7)
    p3 = _noisy_predictor(response, 0.5)
    dataset1_python = [response, p1, p2, p3]
    dataset1_h2o = h2o.H2OFrame(dataset1_python)
    dataset1_h2o.set_names(["response", "p1", "p2", "p3"])

    ##### synthetic dataset2: p3 predicts response ~90% of the time, p1 ~70%, p2 ~50%
    response = ['a'] * 10000 + ['b'] * 10000
    p1 = _noisy_predictor(response, 0.7)
    p2 = _noisy_predictor(response, 0.5)
    p3 = _noisy_predictor(response, 0.9)
    dataset2_python = [response, p1, p2, p3]
    dataset2_h2o = h2o.H2OFrame(dataset2_python)
    dataset2_h2o.set_names(["response", "p1", "p2", "p3"])

    ##### variable importances on each dataset separately
    model_dataset1 = h2o.random_forest(x=dataset1_h2o[predictor_names],
                                       y=dataset1_h2o["response"])
    varimp_dataset1 = _ranking(model_dataset1)
    assert varimp_dataset1 == ('p1', 'p2', 'p3'), "Expected the following relative variable importance on dataset1: " \
                                                  "('p1', 'p2', 'p3'), but got: {0}".format(varimp_dataset1)

    model_dataset2 = h2o.random_forest(x=dataset2_h2o[predictor_names],
                                       y=dataset2_h2o["response"])
    varimp_dataset2 = _ranking(model_dataset2)
    assert varimp_dataset2 == ('p3', 'p1', 'p2'), "Expected the following relative variable importance on dataset2: " \
                                                  "('p3', 'p1', 'p2'), but got: {0}".format(varimp_dataset2)

    def _combined_ranking(weight1, weight2):
        # Stack dataset1 rows on top of dataset2 rows column by column and
        # append a weights column, then rank predictors on the weighted data.
        columns = [
            first + second
            for first, second in zip(dataset1_python, dataset2_python)
        ]
        columns.append([weight1] * 20000 + [weight2] * 20000)
        combined_h2o = h2o.H2OFrame(columns)
        combined_h2o.set_names(["response", "p1", "p2", "p3", "weights"])
        model = h2o.random_forest(x=combined_h2o[predictor_names],
                                  y=combined_h2o["response"],
                                  training_frame=combined_h2o,
                                  weights_column="weights")
        return _ranking(model)

    ############ Test1 #############
    ##### 80/20 in favor of dataset1: the ranking should follow dataset1
    varimp_combined = _combined_ranking(.8, .2)
    assert varimp_combined == ('p1', 'p2', 'p3'), "Expected the following relative variable importance on the combined " \
                                                  "dataset: ('p1', 'p2', 'p3'), but got: {0}".format(varimp_combined)

    ############ Test2 #############
    ##### 80/20 in favor of dataset2: the ranking should follow dataset2
    varimp_combined = _combined_ranking(.2, .8)
    assert varimp_combined == ('p3', 'p1', 'p2'), "Expected the following relative variable importance on the combined " \
                                                  "dataset: ('p3', 'p1', 'p2'), but got: {0}".format(varimp_combined)
Ejemplo n.º 56
0
def domain_check():
    """Check the response domain reported by DRF, GBM, Deeplearning and GLM metrics.

    Each model is trained on the airlines training file. For its training
    metrics and for its performance on the airlines test file, the reported
    'domain' must not contain any level outside the expected YES/NO domain.
    Only that direction is checked: levels missing from the computed domain
    are not reported.

    Raises:
        AssertionError: if a computed domain contains an unexpected level.
    """
    air_train = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_train.show()
    air_test = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test.show()

    actual_domain = [u'YES', u'NO']
    print("actual domain of the response: {0}".format(actual_domain))

    predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]

    def _banner(title):
        # Blank line, section title, blank line.
        print("")
        print("-------------- {0}:".format(title))
        print("")

    def _assert_domain(computed_domain):
        domain_diff = list(set(computed_domain) - set(actual_domain))
        assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                                "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    def _check_model(model):
        # Check the training metrics, then the metrics computed on the test
        # frame by this same model (not by a previously built one).
        _assert_domain(model._model_json['output'][
            'training_metrics']._metric_json['domain'])
        perf = model.model_performance(test_data=air_test)
        _assert_domain(perf._metric_json['domain'])

    ### DRF ###
    _banner("DRF")
    rf = h2o.random_forest(x=air_train[predictors],
                           y=air_train["IsDepDelayed"].asfactor(),
                           training_frame=air_train)
    _check_model(rf)

    ### GBM ###
    _banner("GBM")
    gbm = h2o.gbm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"].asfactor(),
                  training_frame=air_train,
                  distribution="bernoulli")
    _check_model(gbm)

    ### Deeplearning ###
    _banner("Deeplearning")
    dl = h2o.deeplearning(x=air_train[predictors],
                          y=air_train["IsDepDelayed"].asfactor(),
                          training_frame=air_train,
                          activation="Tanh",
                          hidden=[2, 2, 2],
                          epochs=10)
    _check_model(dl)

    ### GLM ###
    _banner("GLM")
    # Unlike the other builders, GLM is given the response without asfactor().
    glm = h2o.glm(x=air_train[predictors],
                  y=air_train["IsDepDelayed"],
                  training_frame=air_train,
                  family="binomial")
    _check_model(glm)