Code example #1
def checkpoint_new_category_in_predictor():

    sv1 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    m1 = h2o.deeplearning(x=sv1[[0, 1, 2, 4]], y=sv1[3], epochs=100)

    m2 = h2o.deeplearning(x=sv2[[0, 1, 2, 4]],
                          y=sv2[3],
                          epochs=200,
                          checkpoint=m1.model_id)

    # attempt to continue building model, but with an expanded categorical predictor domain.
    # this should fail
    try:
        m3 = h2o.deeplearning(x=vir[[0, 1, 2, 4]],
                              y=vir[3],
                              epochs=200,
                              checkpoint=m1.model_id)
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
    except EnvironmentError:
        pass

    # attempt to predict on new model, but with observations that have expanded categorical predictor domain.
    predictions = m2.predict(vir)
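The snippets in this listing call the legacy module-level h2o.deeplearning(...) wrapper. For reference, here is a minimal sketch of the same checkpoint workflow against the later estimator-style h2o-py API; this is an assumption about the newer API (H2ODeepLearningEstimator and its checkpoint parameter), not part of the original test:

import h2o
from h2o.estimators import H2ODeepLearningEstimator

h2o.init()
sv = h2o.upload_file("smalldata/iris/setosa_versicolor.csv")  # same file as above

# initial model
m1 = H2ODeepLearningEstimator(epochs=100)
m1.train(x=[0, 1, 2, 4], y=3, training_frame=sv)

# continue training from m1's weights; the predictor and response domains
# must match the checkpointed model, as the test above demonstrates
m2 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
m2.train(x=[0, 1, 2, 4], y=3, training_frame=sv)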
Code example #2
def tweedie_weights(ip,port):

    data = h2o.import_file(h2o.locate("smalldata/glm_test/cancar_logIn.csv"))
    data["C1M3"] = ((data["Class"] == 1) & (data["Merit"] == 3)).asfactor()
    data["C3M3"] = ((data["Class"] == 3) & (data["Merit"] == 3)).asfactor()
    data["C4M3"] = ((data["Class"] == 4) & (data["Merit"] == 3)).asfactor()
    data["C1M2"] = ((data["Class"] == 1) & (data["Merit"] == 2)).asfactor()
    data["Merit"] = data["Merit"].asfactor()
    data["Class"] = data["Class"].asfactor()
    loss = data["Cost"] / data["Insured"]
    loss.setName(0,"Loss")
    cancar = loss.cbind(data)

    # Without weights
    myX = ["Merit","Class","C1M3","C4M3"]
    dl = h2o.deeplearning(x = cancar[myX],y = cancar["Loss"],distribution ="tweedie",hidden = [1],epochs = 1000,
                          train_samples_per_iteration = -1,reproducible = True,activation = "Tanh",balance_classes = False,
                          force_load_balance = False, seed = 2353123,tweedie_power = 1.5,score_training_samples = 0,
                          score_validation_samples = 0)

    mean_residual_deviance = dl.mean_residual_deviance()

    # With weights
    dl = h2o.deeplearning(x = cancar[myX],y = cancar["Loss"],distribution ="tweedie",hidden = [1],epochs = 1000,
                          train_samples_per_iteration = -1,reproducible = True,activation = "Tanh",balance_classes = False,
                          force_load_balance = False, seed = 2353123,tweedie_power = 1.5,score_training_samples = 0,
                          score_validation_samples = 0,weights_column = "Insured",training_frame = cancar)
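    # Hedged follow-up (not part of the original test): "Loss" is a per-policy
    # claim rate (Cost / Insured), and the second run passes the exposure back
    # in via weights_column="Insured", so the two deviances are expected to
    # differ because rows now count proportionally to exposure.
    mean_residual_deviance_weighted = dl.mean_residual_deviance()
    print "unweighted MRD: {0}, weighted MRD: {1}".format(mean_residual_deviance,
                                                          mean_residual_deviance_weighted)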
Code example #3
def tweedie_weights():

    data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/cancar_logIn.csv"))
    data["C1M3"] = ((data["Class"] == 1) & (data["Merit"] == 3)).asfactor()
    data["C3M3"] = ((data["Class"] == 3) & (data["Merit"] == 3)).asfactor()
    data["C4M3"] = ((data["Class"] == 4) & (data["Merit"] == 3)).asfactor()
    data["C1M2"] = ((data["Class"] == 1) & (data["Merit"] == 2)).asfactor()
    data["Merit"] = data["Merit"].asfactor()
    data["Class"] = data["Class"].asfactor()
    loss = data["Cost"] / data["Insured"]
    loss.set_name(0,"Loss")
    cancar = loss.cbind(data)

    # Without weights
    myX = ["Merit","Class","C1M3","C4M3"]
    dl = h2o.deeplearning(x = cancar[myX],y = cancar["Loss"],distribution ="tweedie",hidden = [1],epochs = 1000,
                          train_samples_per_iteration = -1,reproducible = True,activation = "Tanh",balance_classes = False,
                          force_load_balance = False, seed = 2353123,tweedie_power = 1.5,score_training_samples = 0,
                          score_validation_samples = 0)

    mean_residual_deviance = dl.mean_residual_deviance()

    # With weights
    dl = h2o.deeplearning(x = cancar[myX],y = cancar["Loss"],distribution ="tweedie",hidden = [1],epochs = 1000,
                          train_samples_per_iteration = -1,reproducible = True,activation = "Tanh",balance_classes = False,
                          force_load_balance = False, seed = 2353123,tweedie_power = 1.5,score_training_samples = 0,
                          score_validation_samples = 0,weights_column = "Insured",training_frame = cancar)
Code example #4
def offsets_and_distributions(ip,port):

    # cars
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)])
    offset.setNames(["x1"])
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)
Code example #5
def offsets_and_distributions():

    # cars
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame([[.5]]*398)
    offset.set_name(0,"x1")
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)
Code example #6
def weights_and_distributions():

    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    htable["zon"] = htable["zon"]

    # gamma
    dl = h2o.deeplearning(
        x=htable[0:3], y=htable["medskad"], training_frame=htable, distribution="gamma", weights_column="antskad"
    )
    predictions = dl.predict(htable)

    # gaussian
    dl = h2o.deeplearning(
        x=htable[0:3], y=htable["medskad"], training_frame=htable, distribution="gaussian", weights_column="antskad"
    )
    predictions = dl.predict(htable)

    # poisson
    dl = h2o.deeplearning(
        x=htable[0:3], y=htable["medskad"], training_frame=htable, distribution="poisson", weights_column="antskad"
    )
    predictions = dl.predict(htable)

    # tweedie
    dl = h2o.deeplearning(
        x=htable[0:3], y=htable["medskad"], training_frame=htable, distribution="tweedie", weights_column="antskad"
    )
    predictions = dl.predict(htable)
Code example #7
def imbalance(ip, port):

    print "Test checks if Deep Learning works fine with an imbalanced dataset"

    covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    hh_imbalanced = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        l1=1e-5,
        activation="Rectifier",
        loss="CrossEntropy",
        hidden=[200, 200],
        epochs=1,
        training_frame=covtype,
        balance_classes=False,
        reproducible=True,
        seed=1234,
    )
    print hh_imbalanced

    hh_balanced = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        l1=1e-5,
        activation="Rectifier",
        loss="CrossEntropy",
        hidden=[200, 200],
        epochs=1,
        training_frame=covtype,
        balance_classes=True,
        reproducible=True,
        seed=1234,
    )
    print hh_balanced

    # compare error for class 6 (difficult minority)
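    # (hedged reading of the confusion-matrix layout: cell_values row 5 is the
    # sixth class, and column 7 is that row's per-class error rate, following
    # the seven predicted-class columns)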
    class_6_err_imbalanced = hh_imbalanced.confusion_matrix(covtype).cell_values[5][7]
    class_6_err_balanced = hh_balanced.confusion_matrix(covtype).cell_values[5][7]

    if class_6_err_imbalanced < class_6_err_balanced:
        print "--------------------"
        print ""
        print "FAIL, balanced error greater than imbalanced error"
        print ""
        print ""
        print "class_6_err_imbalanced"
        print class_6_err_imbalanced
        print ""
        print "class_6_err_balanced"
        print class_6_err_balanced
        print ""
        print "--------------------"

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
Code example #8
def imbalance(ip, port):
    h2o.init(ip, port)

    print "Test checks if Deep Learning works fine with an imbalanced dataset"

    covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    hh_imbalanced = h2o.deeplearning(x=covtype[0:54],
                                     y=covtype[54],
                                     l1=1e-5,
                                     activation="Rectifier",
                                     loss="CrossEntropy",
                                     hidden=[200, 200],
                                     epochs=1,
                                     training_frame=covtype,
                                     balance_classes=False,
                                     reproducible=True,
                                     seed=1234)
    print hh_imbalanced

    hh_balanced = h2o.deeplearning(x=covtype[0:54],
                                   y=covtype[54],
                                   l1=1e-5,
                                   activation="Rectifier",
                                   loss="CrossEntropy",
                                   hidden=[200, 200],
                                   epochs=1,
                                   training_frame=covtype,
                                   balance_classes=True,
                                   reproducible=True,
                                   seed=1234)
    print hh_balanced

    #compare error for class 6 (difficult minority)
    class_6_err_imbalanced = hh_imbalanced.confusion_matrix(
        covtype).cell_values[5][7]
    class_6_err_balanced = hh_balanced.confusion_matrix(
        covtype).cell_values[5][7]

    if class_6_err_imbalanced < class_6_err_balanced:
        print "--------------------"
        print ""
        print "FAIL, balanced error greater than imbalanced error"
        print ""
        print ""
        print "class_6_err_imbalanced"
        print class_6_err_imbalanced
        print ""
        print "class_6_err_balanced"
        print class_6_err_balanced
        print ""
        print "--------------------"

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
Code example #9
def offsets_and_distributions(ip, port):

    # cars
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)])
    offset.setNames(["x1"])
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_frame(
        h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli
    dl = h2o.deeplearning(x=cars[2:8],
                          y=cars["economy_20mpg"],
                          distribution="bernoulli",
                          offset_column="x1",
                          training_frame=cars)
    predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="gamma",
                          offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="gaussian",
                          offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="poisson",
                          offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="tweedie",
                          offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)
Code example #10
File: pyunit_pubdev_2041.py (Project: konor/h2o-3)
def pubdev_2041():

    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))

    s = iris.runif(seed=12345)
    train1 = iris[s >= 0.5]
    train2 = iris[s <  0.5]

    m1 = h2o.deeplearning(x=train1[0:4], y=train1[4], epochs=100)

    # update m1 with new training data
    m2 = h2o.deeplearning(x=train2[0:4], y=train2[4], epochs=200, checkpoint=m1.model_id)
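    # Hedged note: unlike the new-category checkpoint tests elsewhere in this
    # listing, this continuation succeeds because m1 is restarted on new rows
    # with the same columns and the same categorical domains.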
Code example #11
def cv_carsDL(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    # read in the dataset and construct training set (and validation set)
    cars =  h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # randomly choose the type of model-building exercise:
    # 0 = regression, 1 = binomial, 2 = multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1   :
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2 :
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else              :
        response_col = "economy"

    print "Response column: {0}".format(response_col)

    ## cross-validation
    ## basic
    nfolds = random.randint(3,10)
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds)

    ## boundary case
    # nfolds = 0
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    ## error cases
    # 1. nfolds == 1 or < 0
    # TODO: PUBDEV-1696
    try:
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.randint(-10000,-1))
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=1)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. cross-validation and regular validation attempted
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    try:
        dl = h2o.deeplearning(y=train[response_col], x=train[predictors], nfolds=random.randint(3,10),
                               validation_y=valid[1], validation_x=valid[predictors])
        assert False, "Expected model-build to fail when both cross-validation and regular validation is attempted"
    except EnvironmentError:
        assert True
Code example #12
def imbalance():

    print "Test checks if Deep Learning works fine with an imbalanced dataset"

    covtype = h2o.upload_file(
        pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    hh_imbalanced = h2o.deeplearning(x=covtype[0:54],
                                     y=covtype[54],
                                     l1=1e-5,
                                     activation="Rectifier",
                                     loss="CrossEntropy",
                                     hidden=[200, 200],
                                     epochs=1,
                                     training_frame=covtype,
                                     balance_classes=False,
                                     reproducible=True,
                                     seed=1234)
    print hh_imbalanced

    hh_balanced = h2o.deeplearning(x=covtype[0:54],
                                   y=covtype[54],
                                   l1=1e-5,
                                   activation="Rectifier",
                                   loss="CrossEntropy",
                                   hidden=[200, 200],
                                   epochs=1,
                                   training_frame=covtype,
                                   balance_classes=True,
                                   reproducible=True,
                                   seed=1234)
    print hh_balanced

    #compare overall logloss
    class_6_err_imbalanced = hh_imbalanced.logloss()
    class_6_err_balanced = hh_balanced.logloss()

    if class_6_err_imbalanced < class_6_err_balanced:
        print "--------------------"
        print ""
        print "FAIL, balanced error greater than imbalanced error"
        print ""
        print ""
        print "class_6_err_imbalanced"
        print class_6_err_imbalanced
        print ""
        print "class_6_err_balanced"
        print class_6_err_balanced
        print ""
        print "--------------------"

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
Code example #13
def checkpoint_new_category_in_response():

    sv = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    m1 = h2o.deeplearning(x=sv[[0,1,2,3]], y=sv[4], epochs=100)

    # attempt to continue building model, but with an expanded categorical response domain.
    # this should fail
    try:
        m2 = h2o.deeplearning(x=iris[[0,1,2,3]], y=iris[4], epochs=200, checkpoint=m1.model_id)
        assert False, "Expected continued model-building to fail with new categories introduced in response"
    except EnvironmentError:
        pass
Code example #14
def deep_learning_metrics_test():
    # connect to existing cluster

    df = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))

    df.drop("ID")  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    train.tail()
    test.describe()
    test.head()
    test.tail()

    # Run DeepLearning
    print "Train a Deeplearning model: "
    dl = h2o.deeplearning(x=train[1:],
                          y=train['CAPSULE'],
                          epochs=100,
                          hidden=[10, 10, 10],
                          loss='CrossEntropy')
    print "Binomial Model Metrics: "
    print
    dl.show()
    dl.model_performance(test).show()
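    # Hedged follow-up (not in the original test): the binomial metrics object
    # returned by model_performance() also exposes scalar accessors.
    perf = dl.model_performance(test)
    print "test AUC: {0}".format(perf.auc())
    print "test logloss: {0}".format(perf.logloss())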
Code example #15
def deep_learning_metrics_test(ip, port):
    h2o.init(ip, port)  # connect to existing cluster

    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    df.drop("ID")  # remove ID
    df["CAPSULE"] = df["CAPSULE"].asfactor()  # make CAPSULE categorical
    vol = df["VOL"]
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    train.tail()
    test.describe()
    test.head()
    test.tail()

    # Run DeepLearning
    print "Train a Deeplearning model: "
    dl = h2o.deeplearning(x=train[1:], y=train["CAPSULE"], epochs=100, hidden=[10, 10, 10], loss="CrossEntropy")
    print "Binomial Model Metrics: "
    print
    dl.show()
    dl.model_performance(test).show()
Code example #16
def pubdev_2223():

    covtype = h2o.import_file(
        pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    dlmodel = h2o.deeplearning(x=covtype[0:54],
                               y=covtype[54],
                               hidden=[17, 191],
                               epochs=1,
                               training_frame=covtype,
                               balance_classes=False,
                               reproducible=True,
                               seed=1234,
                               export_weights_and_biases=True)

    print(
        "Normalization/Standardization multipliers for numeric predictors: {0}\n"
        .format(dlmodel.normmul()))
    print(
        "Normalization/Standardization offsets for numeric predictors: {0}\n".
        format(dlmodel.normsub()))
    print(
        "Normalization/Standardization multipliers for numeric response: {0}\n"
        .format(dlmodel.respmul()))
    print("Normalization/Standardization offsets for numeric response: {0}\n".
          format(dlmodel.respsub()))
    print("Categorical offsets for one-hot encoding: {0}\n".format(
        dlmodel.catoffsets()))
Code example #17
def deeplearning_autoencoder():

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(
        x=train_unsupervised[0:resp],
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  #slow, turn off for real problems
        seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(
        train_supervised[0:resp]._frame(), 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10,
                                  min_rows=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] -
               0.081) < 0.001, "Error. Expected 0.081, but got {0}".format(
                   cm.cell_values[10][10])
Code example #18
def anomaly():

    print "Deep Learning Anomaly Detection MNIST"

    train = h2o.import_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = range(0,784)
    resp = 784

    # unsupervised -> drop the response column (digit: 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on train_hex
    ae_model = h2o.deeplearning(x=train[predictors], training_frame=train, activation="Tanh", autoencoder=True,
                                hidden=[50], l1=1e-5, ignore_const_cols=False, epochs=1)

    # 2) DETECT OUTLIERS
    # anomaly app computes the per-row reconstruction error for the test data set
    # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Let's look at the test set points with low/median/high reconstruction errors.
    # We will now visualize the original test set points and their reconstructions obtained
    # by propagating them through the narrow neural net.

    # Convert the test data into its autoencoded representation (pass through narrow neural net)
    test_recon = ae_model.predict(test)
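    # Hedged sketch of the inspection step described above (not in the original
    # snippet): summarize the per-row reconstruction MSE so that rows with
    # low/median/high error can be picked out for plotting.
    print "reconstruction error: min {0}, mean {1}, max {2}".format(
        test_rec_error[0].min(), test_rec_error[0].mean(), test_rec_error[0].max())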
Code example #19
def deep_learning_metrics_test(ip, port):
    h2o.init(ip, port)  # connect to existing cluster
    df = h2o.import_frame(path="smalldata/logreg/prostate.csv")

    del df['ID']  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = None  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    test.describe()
    test.head()

    # Run DeepLearning

    print "Train a Deeplearning model: "
    dl = h2o.deeplearning(x=train[1:],
                          y=train['CAPSULE'],
                          epochs=100,
                          hidden=[10, 10, 10])
    print "Binomial Model Metrics: "
    print
    dl.model_performance(test).show()
Code example #20
File: pyunit_get_model.py (Project: cursedninja/h2o-3)
def get_model_test(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected regression predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected binomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)
Code example #21
def deeplearning_basic(ip, port):
    h2o.init(ip, port)

    iris_hex = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    hh = h2o.deeplearning(x=iris_hex[:3],
                          y=iris_hex[4],
                          loss='CrossEntropy')
    hh.show()
Code example #22
def checkpoint_new_category_in_response():

    sv = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))

    m1 = h2o.deeplearning(x=sv[[0, 1, 2, 3]], y=sv[4], epochs=100)

    # attempt to continue building model, but with an expanded categorical response domain.
    # this should fail
    try:
        m2 = h2o.deeplearning(x=iris[[0, 1, 2, 3]],
                              y=iris[4],
                              epochs=200,
                              checkpoint=m1.id)
        assert False, "Expected continued model-building to fail with new categories introduced in response"
    except EnvironmentError:
        pass
Code example #23
def deeplearning_basic():

    iris_hex = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    hh = h2o.deeplearning(x=iris_hex[:3],
                          y=iris_hex[4],
                          loss='CrossEntropy')
    hh.show()
Code example #24
File: h2o_DL.py (Project: Aakash282/1ia)
    def train(self, x, y):
        self.model = h2o.deeplearning(x=self.trainData.drop('score diff'),
                                      y=self.trainData['score diff'],
                                      validation_x=self.valData.drop('score diff'),
                                      validation_y=self.valData['score diff'],
                                      hidden=self.params[2],
                                      epochs=self.params[3],
                                      nfolds=self.params[4])
Code example #25
File: wnvh.py (Project: tanayz/Kaggle)
def ntrain():
    h2o.init(ip="zurich.h2o.ai", strict_version_check=False)
    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)
    xd = []
    for l in X:
        xd.append(l.tolist())

    y = np.asarray(y, dtype='bool_')

    xtr = H2OFrame(python_obj=xd)
    ytr = H2OFrame(python_obj=y.tolist())

    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39], y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)

    dl = h2o.deeplearning(x=xtr[1:39], y=ytr['C40'],
                          variable_importances=True, balance_classes=True,
                          input_dropout_ratio=0.2, rho=0.899,
                          hidden_dropout_ratios=[0.4, 0.4, 0.4, 0.4],
                          activation="Tanh", hidden=[39, 325, 325, 1], epochs=100)

    rf = h2o.random_forest(x=xtr[1:39], y=ytr['C40'],
                           seed=1234, ntrees=600,
                           max_depth=20, balance_classes=False)

    testing = load_testing()
    X_test = assemble_X(testing, weather)
    normalize(X_test, mean, std)

    xd = []
    for l in X_test:
        xd.append(l.tolist())
    xts = H2OFrame(python_obj=xd)

    # gp = gb.predict(xts)
    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)

    gp = dp * 0.35 + rp * 0.3 + gbp * 0.35

    gph = h2o.as_list(gp)
    Id = np.arange(gp.nrow() + 1)[1:].reshape(gp.nrow(), 1)
    df = pd.DataFrame(Id)
    df_concat = pd.concat([df, gph["True"]], axis=1)
    df_concat.columns = ['Id', 'WnvPresent']
    df_concat.to_csv("wnvh.csv", index=False)
Code example #26
def get_model_test():

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._id)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected regression predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._id)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected binomial predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._id)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._id)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)
Code example #27
def weights_and_biases():

    print "Test checks if Deep Learning weights and biases are accessible from Python"

    covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    dlmodel = h2o.deeplearning(
        x=covtype[0:54],
        y=covtype[54],
        hidden=[17, 191],
        epochs=1,
        training_frame=covtype,
        balance_classes=False,
        reproducible=True,
        seed=1234,
        export_weights_and_biases=True,
    )
    print dlmodel

    weights1 = dlmodel.weights(0)
    weights2 = dlmodel.weights(1)
    weights3 = dlmodel.weights(2)

    biases1 = dlmodel.biases(0)
    biases2 = dlmodel.biases(1)
    biases3 = dlmodel.biases(2)

    w1c = weights1.ncol
    w1r = weights1.nrow
    assert w1c == 52, "wrong dimensionality! expected {0}, but got {1}.".format(52, w1c)
    assert w1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w1r)

    w2c = weights2.ncol
    w2r = weights2.nrow
    assert w2c == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w2c)
    assert w2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w2r)

    w3c = weights3.ncol
    w3r = weights3.nrow
    assert w3c == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w3c)
    assert w3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, w3r)

    b1c = biases1.ncol
    b1r = biases1.nrow
    assert b1c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b1c)
    assert b1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, b1r)

    b2c = biases2.ncol
    b2r = biases2.nrow
    assert b2c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b2c)
    assert b2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, b2r)

    b3c = biases3.ncol
    b3r = biases3.nrow
    assert b3c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b3c)
    assert b3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, b3r)
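    # Hedged reading of the shapes checked above: the 54 covtype predictors
    # yield 52 network inputs (presumably two constant columns are dropped by
    # the default ignore_const_cols), the hidden layers have 17 and 191 units,
    # and the response has 7 classes. Equivalently, weight matrix i is
    # (units in layer i+1) x (units in layer i), and each bias is a one-column
    # frame:
    layers = [52, 17, 191, 7]  # assumed topology: inputs, hidden, hidden, outputs
    for i, (w, b) in enumerate([(weights1, biases1), (weights2, biases2), (weights3, biases3)]):
        assert (w.nrow, w.ncol) == (layers[i + 1], layers[i])
        assert (b.nrow, b.ncol) == (layers[i + 1], 1)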
Code example #28
def deeplearning_autoencoder():

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(
        x=train_unsupervised[0:resp],
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(
        x=train_supervised_features[0:20], y=train_supervised[resp], ntrees=10, min_rows=10, seed=1234
    )

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.086) < 0.001, "Error. Expected 0.086, but got {0}".format(
        cm.cell_values[10][10]
    )
Code example #29
def deeplearning_autoencoder(ip, port):
    h2o.init(ip, port)

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(
        h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # split data into two parts
    sid = train_hex[1].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(
        x=train_unsupervised.drop(resp),
        y=train_unsupervised[resp],  #ignored (pick any non-constant)
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  #slow, turn off for real problems
        seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()

    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features,
                                  y=train_supervised[resp].asfactor(),
                                  ntrees=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm["Totals", "Error"] - 0.1038) < 0.001, "Error not as expected"
Code example #30
def checkpoint_new_category_in_predictor():

    sv1 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    m1 = h2o.deeplearning(x=sv1[[0, 1, 2, 4]], y=sv1[3], epochs=100)

    m2 = h2o.deeplearning(x=sv2[[0, 1, 2, 4]], y=sv2[3], epochs=200, checkpoint=m1.id)

    # attempt to continue building model, but with an expanded categorical predictor domain.
    # this should fail
    try:
        m3 = h2o.deeplearning(x=vir[[0, 1, 2, 4]], y=vir[3], epochs=200, checkpoint=m1.id)
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
    except EnvironmentError:
        pass

    # attempt to predict on new model, but with observations that have expanded categorical predictor domain.
    predictions = m2.predict(vir)
Code example #31
def tweedie_offset(ip,port):

    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    # without offset
    dl = h2o.deeplearning(x=insurance[0:3],y=insurance["Claims"],distribution="tweedie",hidden=[1],epochs=1000,
                          train_samples_per_iteration=-1,reproducible=True,activation="Tanh",single_node_mode=False,
                          balance_classes=False,force_load_balance=False,seed=23123,tweedie_power=1.5,
                          score_training_samples=0,score_validation_samples=0)

    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.561641366536-mean_residual_deviance) < 1e-6, "Expected mean residual deviance to be 0.561641366536, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(47.6819999424-predictions[0].mean()) < 1e-6, "Expected mean of predictions to be 47.6819999424, but got " \
                                                          "{0}".format(predictions[0].mean())
    assert abs(1.90409304033-predictions[0].min()) < 1e-6, "Expected min of predictions to be 1.90409304033, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(280.735054543-predictions[0].max()) < 1e-6, "Expected max of predictions to be 280.735054543, but got " \
                                                          "{0}".format(predictions[0].max())

    # with offset
    dl = h2o.deeplearning(x=insurance[0:3],y=insurance["Claims"],distribution="tweedie",hidden=[1],epochs=1000,
                          train_samples_per_iteration=-1,reproducible=True,activation="Tanh",single_node_mode=False,
                          balance_classes=False,force_load_balance=False,seed=23123,tweedie_power=1.5,
                          score_training_samples=0,score_validation_samples=0,offset_column="offset",
                          training_frame=insurance)
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.261065520191-mean_residual_deviance) < 1e-6, "Expected mean residual deviance to be 0.261065520191, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
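    # Hedged note (assumption: the offset enters on the log-link scale, as in
    # a GLM): with offset = log(Holders) the model fits claims proportional to
    # exposure, E[Claims] = Holders * exp(eta), so the exposure-free quantity
    # is the per-holder claim rate.
    claim_rate = predictions[0] / insurance["Holders"]
    print "mean per-holder claim rate: {0}".format(claim_rate.mean())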
    assert abs(49.2939039783-predictions[0].mean()) < 1e-6, "Expected mean of predictions to be 49.2939039783, but got " \
                                                          "{0}".format(predictions[0].mean())
    assert abs(1.07391126487-predictions[0].min()) < 1e-6, "Expected min of predictions to be 1.07391126487, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(397.328758591-predictions[0].max()) < 1e-6, "Expected max of predictions to be 397.328758591, but got " \
                                                          "{0}".format(predictions[0].max())
Code example #32
def tweedie_offset():

    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    # without offset
    dl = h2o.deeplearning(x=insurance[0:3],y=insurance["Claims"],distribution="tweedie",hidden=[1],epochs=1000,
                          train_samples_per_iteration=-1,reproducible=True,activation="Tanh",single_node_mode=False,
                          balance_classes=False,force_load_balance=False,seed=23123,tweedie_power=1.5,
                          score_training_samples=0,score_validation_samples=0)

    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.556 - mean_residual_deviance) < 1e-3, "Expected mean residual deviance to be 0.556, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(47.61-predictions[0].mean()) < 1e-2, "Expected mean of predictions to be 47.61, but got " \
                                                          "{0}".format(predictions[0].mean())
    assert abs(1.94-predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.94, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(284.6-predictions[0].max()) < 28, "Expected max of predictions to be 284.6, but got " \
                                                          "{0}".format(predictions[0].max())

    # with offset
    dl = h2o.deeplearning(x=insurance[0:3],y=insurance["Claims"],distribution="tweedie",hidden=[1],epochs=1000,
                          train_samples_per_iteration=-1,reproducible=True,activation="Tanh",single_node_mode=False,
                          balance_classes=False,force_load_balance=False,seed=23123,tweedie_power=1.5,
                          score_training_samples=0,score_validation_samples=0,offset_column="offset",
                          training_frame=insurance)
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.261-mean_residual_deviance) < 1e-2, "Expected mean residual deviance to be 0.261, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(49.53-predictions[0].mean()) < 1e-1, "Expected mean of predictions to be 49.53, but got " \
                                                          "{0}".format(predictions[0].mean())
    assert abs(1.074-predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.074, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(397.3-predictions[0].max()) < 40, "Expected max of predictions to be 397.3, but got " \
                                                          "{0}".format(predictions[0].max())
Code example #33
def deeplearning_autoencoder(ip, port):
  h2o.init(ip, port)

  resp = 784
  nfeatures = 20 # number of features (smallest hidden layer)


  train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
  test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

  # split data into two parts
  sid = train_hex[1].runif(1234)

  # unsupervised data for autoencoder
  train_unsupervised = train_hex[sid >= 0.5]
  train_unsupervised.describe()

  # supervised data for drf
  train_supervised = train_hex[sid < 0.5]
  train_supervised.describe()

  # train autoencoder
  ae_model = h2o.deeplearning(x=train_unsupervised.drop(resp),
                              y=train_unsupervised[resp], #ignored (pick any non-constant)
                              activation="Tanh",
                              autoencoder=True,
                              hidden=[nfeatures],
                              epochs=1,
                              reproducible=True, #slow, turn off for real problems
                              seed=1234)

  # convert train_supervised with autoencoder to lower-dimensional space
  train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
  train_supervised_features.describe()

  assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

  # Train DRF on extracted feature space
  drf_model = h2o.random_forest(x=train_supervised_features,
                                y=train_supervised[resp].asfactor(),
                                ntrees=10,
                                seed=1234)

  # Test the DRF model on the test set (processed through deep features)
  test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
  test_features.cbind(test_hex[resp])

  # Confusion Matrix and assertion
  cm = drf_model.confusionMatrix(test_features)
  cm.show()

  # 10% error +/- 0.001
  assert abs(cm["Totals", "Error"] - 0.1038) < 0.001, "Error not as expected"
Code example #34
def missing():
    # Connect to a pre-existing cluster

    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = [0, 0, 0, 0, 0, 0]

    for i in range(len(missing_ratios)):
        data = h2o.upload_file(
            pyunit_utils.locate("smalldata/junit/weather.csv"))
        data[15] = data[15].asfactor()  #ChangeTempDir
        data[16] = data[16].asfactor()  #ChangeTempMag
        data[17] = data[17].asfactor()  #ChangeWindDirect
        data[18] = data[18].asfactor()  #MaxWindPeriod
        data[19] = data[19].asfactor()  #RainToday
        data[21] = data[21].asfactor()  #PressureChange
        data[23] = data[23].asfactor()  #RainTomorrow

        print "For missing {0}%".format(missing_ratios[i] * 100)

        # add missing values to the data section of the file (leave the response alone)
        if missing_ratios[i] > 0:
            resp = data[23]
            pred = data[:, range(23) + range(24, data.ncol)]
            data_missing = pred.insert_missing_values(
                fraction=missing_ratios[i])
            data_fin = data_missing.cbind(resp)
        else:
            data_fin = data

        # split into train + test datasets
        ratio = data_fin[0].runif()
        train = data_fin[ratio <= .75]
        test = data_fin[ratio > .75]

        hh = h2o.deeplearning(x=train[2:22],
                              y=train[23],
                              validation_x=test[2:22],
                              validation_y=test[23],
                              epochs=5,
                              reproducible=True,
                              seed=12345,
                              activation='RectifierWithDropout',
                              l1=1e-5,
                              input_dropout_ratio=0.2)

        errors[i] = hh.error()[0][1]

    for i in range(len(missing_ratios)):
        print "missing ratio: {0}% --> classification error: {1}".format(
            missing_ratios[i] * 100, errors[i])

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
Code example #35
def weights_and_distributions():

    htable = h2o.upload_file(
        pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    htable["zon"] = htable["zon"]

    # gamma
    dl = h2o.deeplearning(x=htable[0:3],
                          y=htable["medskad"],
                          training_frame=htable,
                          distribution="gamma",
                          weights_column="antskad")
    predictions = dl.predict(htable)

    # gaussian
    dl = h2o.deeplearning(x=htable[0:3],
                          y=htable["medskad"],
                          training_frame=htable,
                          distribution="gaussian",
                          weights_column="antskad")
    predictions = dl.predict(htable)

    # poisson
    dl = h2o.deeplearning(x=htable[0:3],
                          y=htable["medskad"],
                          training_frame=htable,
                          distribution="poisson",
                          weights_column="antskad")
    predictions = dl.predict(htable)

    # tweedie
    dl = h2o.deeplearning(x=htable[0:3],
                          y=htable["medskad"],
                          training_frame=htable,
                          distribution="tweedie",
                          weights_column="antskad")
    predictions = dl.predict(htable)
Code example #36
def missing():
    # Connect to a pre-existing cluster

    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = [0, 0, 0, 0, 0, 0]

    for i in range(len(missing_ratios)):
        data = h2o.upload_file(h2o.locate("smalldata/junit/weather.csv"))
        data[15] = data[15].asfactor()  # ChangeTempDir
        data[16] = data[16].asfactor()  # ChangeTempMag
        data[17] = data[17].asfactor()  # ChangeWindDirect
        data[18] = data[18].asfactor()  # MaxWindPeriod
        data[19] = data[19].asfactor()  # RainToday
        data[21] = data[21].asfactor()  # PressureChange
        data[23] = data[23].asfactor()  # RainTomorrow

        print "For missing {0}%".format(missing_ratios[i] * 100)

        # add missing values to the data section of the file (leave the response alone)
        if missing_ratios[i] > 0:
            resp = data[23]
            pred = data[:, range(23) + range(24, data.ncol)]
            data_missing = pred.insert_missing_values(fraction=missing_ratios[i])
            data_fin = data_missing.cbind(resp)
        else:
            data_fin = data

        # split into train + test datasets
        ratio = data_fin[0].runif()
        train = data_fin[ratio <= 0.75]
        test = data_fin[ratio > 0.75]

        hh = h2o.deeplearning(
            x=train[2:22],
            y=train[23],
            validation_x=test[2:22],
            validation_y=test[23],
            epochs=5,
            reproducible=True,
            seed=12345,
            activation="RectifierWithDropout",
            l1=1e-5,
            input_dropout_ratio=0.2,
        )

        errors[i] = hh.error()[0][1]

    for i in range(len(missing_ratios)):
        print "missing ratio: {0}% --> classification error: {1}".format(missing_ratios[i] * 100, errors[i])

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
Code example #37
def pubdev_2223():

    covtype = h2o.import_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    dlmodel = h2o.deeplearning(x=covtype[0:54], y=covtype[54], hidden=[17,191],
                               epochs=1, training_frame=covtype,
                               balance_classes=False, reproducible=True, seed=1234,
                               export_weights_and_biases=True)

    print("Normalization/Standardization multipliers for numeric predictors: {0}\n".format(dlmodel.normmul()))
    print("Normalization/Standardization offsets for numeric predictors: {0}\n".format(dlmodel.normsub()))
    print("Normalization/Standardization multipliers for numeric response: {0}\n".format(dlmodel.respmul()))
    print("Normalization/Standardization offsets for numeric response: {0}\n".format(dlmodel.respsub()))
    print("Categorical offsets for one-hot encoding: {0}\n".format(dlmodel.catoffsets()))
Code Example #38
def deeplearning_multi(ip, port):
    print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

    prostate = h2o.import_file(h2o.locate("smalldata/logreg/prostate.csv"))

    prostate[4] = prostate[4].asfactor()

    hh = h2o.deeplearning(x             = prostate[0:2],
                          y             = prostate[4],
                          validation_x  = prostate[0:2],
                          validation_y  = prostate[4],
                          loss          = 'CrossEntropy')
    hh.show()
Code Example #39
def deeplearning_multi():
    print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

    prostate = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))

    prostate[4] = prostate[4].asfactor()

    hh = h2o.deeplearning(x             = prostate[0:2],
                          y             = prostate[4],
                          validation_x  = prostate[0:2],
                          validation_y  = prostate[4],
                          loss          = 'CrossEntropy')
    hh.show()
Code Example #40
def imbalance():
    print "Test checks if Deep Learning works fine with an imbalanced dataset"

    covtype = h2o.upload_file(tests.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    hh_imbalanced = h2o.deeplearning(x=covtype[0:54], y=covtype[54], l1=1e-5, activation="Rectifier", loss="CrossEntropy",
                                     hidden=[200,200], epochs=1, training_frame=covtype, balance_classes=False,
                                     reproducible=True, seed=1234)
    print hh_imbalanced

    hh_balanced = h2o.deeplearning(x=covtype[0:54], y=covtype[54], l1=1e-5, activation="Rectifier", loss="CrossEntropy",
                                   hidden=[200,200], epochs=1, training_frame=covtype, balance_classes=True,
                                   reproducible=True, seed=1234)
    print hh_balanced

    # compare overall logloss
    logloss_imbalanced = hh_imbalanced.logloss()
    logloss_balanced = hh_balanced.logloss()

    if logloss_imbalanced < logloss_balanced:
        print "--------------------"
        print "FAIL: balanced logloss greater than imbalanced logloss"
        print "logloss_imbalanced: {0}".format(logloss_imbalanced)
        print "logloss_balanced:   {0}".format(logloss_balanced)
        print "--------------------"

    assert logloss_imbalanced >= logloss_balanced, "balance_classes makes it worse!"
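Before reaching for balance_classes it helps to see how skewed the response actually is. A quick sketch using the frame from the test above:

# per-class counts of the cover types; balance_classes oversamples the
# minority classes toward an even distribution during training
print covtype[54].table()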
Code Example #41
def weights_and_biases():
    print("Test checks if Deep Learning weights and biases are accessible from R")

    covtype = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    dlmodel = h2o.deeplearning(x=covtype[0:54], y=covtype[54], hidden=[17,191], epochs=1, training_frame=covtype,
                               balance_classes=False, reproducible=True, seed=1234, export_weights_and_biases=True)
    print(dlmodel)

    weights1 = dlmodel.weights(0)
    weights2 = dlmodel.weights(1)
    weights3 = dlmodel.weights(2)

    biases1 = dlmodel.biases(0)
    biases2 = dlmodel.biases(1)
    biases3 = dlmodel.biases(2)

    w1c = weights1.ncol
    w1r = weights1.nrow
    assert w1c == 52, "wrong dimensionality! expected {0}, but got {1}.".format(52, w1c)
    assert w1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w1r)

    w2c = weights2.ncol
    w2r = weights2.nrow
    assert w2c == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w2c)
    assert w2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w2r)

    w3c = weights3.ncol
    w3r = weights3.nrow
    assert w3c == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w3c)
    assert w3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, w3r)

    b1c = biases1.ncol
    b1r = biases1.nrow
    assert b1c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b1c)
    assert b1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, b1r)

    b2c = biases2.ncol
    b2r = biases2.nrow
    assert b2c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b2c)
    assert b2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, b2r)

    b3c = biases3.ncol
    b3r = biases3.nrow
    assert b3c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b3c)
    assert b3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, b3r)
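The weight and bias objects returned above are ordinary H2OFrames, so beyond dimension checks they can be pulled client-side for inspection. A sketch (`as_data_frame` assumes pandas is installed):

w1 = dlmodel.weights(0).as_data_frame()
b1 = dlmodel.biases(0).as_data_frame()
print(w1.shape)  # expected (17, 52): 17 hidden units by 52 network inputs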
Code Example #42
def split_fit_predict_dl(h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon):
  print "Trying h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon values of:", h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon
  dl = h2o.deeplearning(x = train[predictors],
                y = train['EVI'],
                validation_x = test[predictors],
                validation_y = test['EVI'],
                training_frame = train,
                validation_frame = test,
                weights_column = 'PixelReliability',
                hidden = [int(h1), int(h2), int(h3)],
                activation = "RectifierWithDropout",
                hidden_dropout_ratios = [hdr1, hdr2, hdr3],
                fast_mode = True,
                rho = rho, epsilon = epsilon)
  mse = dl.mse(valid=True)
  r2 = dl.r2(valid=True)
  print "Deep learning MSE:", mse
  return [mse, r2]
Code Example #43
def split_fit_predict_dl(h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon):
    print "Trying h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon values of:", h1, h2, h3, hdr1, hdr2, hdr3, rho, epsilon
    dl = h2o.deeplearning(x=train[predictors],
                          y=train['EVI'],
                          validation_x=test[predictors],
                          validation_y=test['EVI'],
                          training_frame=train,
                          validation_frame=test,
                          weights_column='PixelReliability',
                          hidden=[int(h1), int(h2), int(h3)],
                          activation="RectifierWithDropout",
                          hidden_dropout_ratios=[hdr1, hdr2, hdr3],
                          fast_mode=True,
                          rho=rho,
                          epsilon=epsilon)
    mse = dl.mse(valid=True)
    r2 = dl.r2(valid=True)
    print "Deep learning MSE:", mse
    return [mse, r2]
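The caller of this helper is not shown. A hypothetical random-search driver over its hyperparameters might look like the following (ranges are illustrative; `train`, `test`, and `predictors` must already exist as the globals the function reads):

import random

best_mse, best_params = float("inf"), None
for _ in range(10):
    h = [random.randint(10, 200) for _ in range(3)]
    hdr = [random.uniform(0.0, 0.5) for _ in range(3)]
    rho = random.uniform(0.9, 0.999)
    epsilon = 10 ** random.uniform(-10, -4)
    mse, r2 = split_fit_predict_dl(h[0], h[1], h[2], hdr[0], hdr[1], hdr[2], rho, epsilon)
    if mse < best_mse:
        best_mse, best_params = mse, (h, hdr, rho, epsilon)
print "Best MSE:", best_mse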
Code Example #44
def deeplearning_multi():
    print("Test checks if Deep Learning works fine with a categorical dataset")

    # print(locate("smalldata/logreg/prostate.csv"))
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()  #CAPSULE -> Factor
    prostate[2] = prostate[2].asfactor()  #AGE -> Factor
    prostate[3] = prostate[3].asfactor()  #RACE -> Factor
    prostate[4] = prostate[4].asfactor()  #DPROS -> Factor
    prostate[5] = prostate[5].asfactor()  #DCAPS -> Factor
    prostate = prostate.drop('ID')  #remove ID
    prostate.describe()

    hh = h2o.deeplearning(x=prostate.drop('CAPSULE'),
                          y=prostate['CAPSULE'],
                          loss='CrossEntropy',
                          hidden=[10, 10],
                          use_all_factor_levels=False)
    hh.show()
Code Example #45
def deeplearning_multi():
    print("Test checks if Deep Learning works fine with a categorical dataset")

    # print(locate("smalldata/logreg/prostate.csv"))
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()  #CAPSULE -> Factor
    prostate[2] = prostate[2].asfactor()  #AGE -> Factor
    prostate[3] = prostate[3].asfactor()  #RACE -> Factor
    prostate[4] = prostate[4].asfactor()  #DPROS -> Factor
    prostate[5] = prostate[5].asfactor()  #DCAPS -> Factor
    prostate = prostate.drop('ID')        #remove ID
    prostate.describe()


    hh = h2o.deeplearning(x                     = prostate.drop('CAPSULE'),
                          y                     = prostate['CAPSULE'],
                          loss                  = 'CrossEntropy',
                          hidden                = [10, 10],
                          use_all_factor_levels = False)
    hh.show()
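With use_all_factor_levels=False, one reference level of each factor is dropped during one-hot encoding, which narrows the network's input layer. One way to observe this (a hypothetical comparison, reusing export_weights_and_biases and weights() from other examples in this collection):

hh_all = h2o.deeplearning(x=prostate.drop('CAPSULE'),
                          y=prostate['CAPSULE'],
                          loss='CrossEntropy',
                          hidden=[10, 10],
                          use_all_factor_levels=True,
                          export_weights_and_biases=True)
# the first weight matrix has one column per network input, so its ncol
# should exceed that of the use_all_factor_levels=False model above
print(hh_all.weights(0).ncol)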
Code Example #46
def deeplearning_mean_residual_deviance():

    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]
    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy"
    dl = h2o.deeplearning(x=train[predictors],
                          y=train[response_col],
                          validation_x=valid[predictors],
                          validation_y=valid[response_col],
                          nfolds=3)
    dl_mrd = dl.mean_residual_deviance(train=True,valid=True,xval=True)
    assert isinstance(dl_mrd['train'],float), "Expected training mean residual deviance to be a float, but got " \
                                              "{0}".format(type(dl_mrd['train']))
    assert isinstance(dl_mrd['valid'],float), "Expected validation mean residual deviance to be a float, but got " \
                                              "{0}".format(type(dl_mrd['valid']))
    assert isinstance(dl_mrd['xval'],float), "Expected cross-validation mean residual deviance to be a float, but got " \
                                              "{0}".format(type(dl_mrd['xval']))
Code Example #47
def deeplearning_mean_residual_deviance(ip, port):

    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    dl = h2o.deeplearning(x=train[predictors],
                          y=train[response_col],
                          validation_x=valid[predictors],
                          validation_y=valid[response_col],
                          nfolds=3)
    dl_mrd = dl.mean_residual_deviance(train=True, valid=True, xval=True)
    assert isinstance(dl_mrd['train'],float), "Expected training mean residual deviance to be a float, but got " \
                                              "{0}".format(type(dl_mrd['train']))
    assert isinstance(dl_mrd['valid'],float), "Expected validation mean residual deviance to be a float, but got " \
                                              "{0}".format(type(dl_mrd['valid']))
    assert isinstance(dl_mrd['xval'],float), "Expected cross-validation mean residual deviance to be a float, but got " \
                                              "{0}".format(type(dl_mrd['xval']))
Code Example #48
def deepLearningDemo(ip, port):

    # Training data
    train_data = h2o.import_file(path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop("Site")
    train_data["Angaus"] = train_data["Angaus"].asfactor()
    print train_data.describe()
    train_data.head()

    # Testing data
    test_data = h2o.import_file(path=h2o.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data["Angaus"] = test_data["Angaus"].asfactor()
    print test_data.describe()
    test_data.head()

    # Run GBM
    gbm = h2o.gbm(
        x=train_data[1:],
        y=train_data["Angaus"],
        validation_x=test_data[1:],
        validation_y=test_data["Angaus"],
        ntrees=100,
        distribution="bernoulli",
    )

    gbm.show()

    # Run DeepLearning

    dl = h2o.deeplearning(
        x=train_data[1:],
        y=train_data["Angaus"],
        validation_x=test_data[1:],
        validation_y=test_data["Angaus"],
        loss="CrossEntropy",
        epochs=1000,
        hidden=[20, 20, 20],
    )

    dl.show()
Code Example #49
def deepLearningDemo(ip, port):

    h2o.init(ip, port)

    # Training data
    train_data = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print train_data.describe()
    train_data.head()

    # Testing data
    test_data = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print test_data.describe()
    test_data.head()

    # Run GBM
    gbm = h2o.gbm(x=train_data[1:],
                  y=train_data['Angaus'],
                  validation_x=test_data[1:],
                  validation_y=test_data['Angaus'],
                  ntrees=100,
                  distribution="bernoulli")

    gbm.show()

    # Run DeepLearning

    dl = h2o.deeplearning(x=train_data[1:],
                          y=train_data['Angaus'],
                          validation_x=test_data[1:],
                          validation_y=test_data['Angaus'],
                          loss='CrossEntropy',
                          epochs=1000,
                          hidden=[20, 20, 20])

    dl.show()
Code Example #50
def deepLearningDemo():
  # Training data
  train_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
  train_data = train_data.drop('Site')
  train_data['Angaus'] = train_data['Angaus'].asfactor()
  print train_data.describe()
  train_data.head()

  # Testing data
  test_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_eval.csv"))
  test_data['Angaus'] = test_data['Angaus'].asfactor()
  print test_data.describe()
  test_data.head()


  # Run GBM
  gbm = h2o.gbm(x=train_data[1:],
                y=train_data['Angaus'],
                validation_x=test_data[1:],
                validation_y=test_data['Angaus'],
                ntrees=100,
                distribution="bernoulli")

  gbm.show()

  # Run DeepLearning

  dl = h2o.deeplearning(x=train_data[1:],
                        y=train_data['Angaus'],
                        validation_x=test_data[1:],
                        validation_y=test_data['Angaus'],
                        loss='CrossEntropy',
                        epochs=1000,
                        hidden=[20, 20, 20])

  dl.show()
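To turn the two show() calls into a direct comparison, both models can be scored on the evaluation frame. A sketch (the auc() accessor on the binomial performance object is an assumption here):

gbm_perf = gbm.model_performance(test_data)
dl_perf = dl.model_performance(test_data)
print "GBM AUC:", gbm_perf.auc()
print "DL AUC:", dl_perf.auc()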
Code Example #51
def deeplearning_basic():

    iris_hex = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    hh = h2o.deeplearning(x=iris_hex[:3], y=iris_hex[4], loss='CrossEntropy')
    hh.show()
Code Example #52
def tweedie_offset(ip, port):

    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    # without offset
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="tweedie",
                          hidden=[1],
                          epochs=1000,
                          train_samples_per_iteration=-1,
                          reproducible=True,
                          activation="Tanh",
                          single_node_mode=False,
                          balance_classes=False,
                          force_load_balance=False,
                          seed=23123,
                          tweedie_power=1.5,
                          score_training_samples=0,
                          score_validation_samples=0)

    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.561641366536-mean_residual_deviance) < 1e-6, "Expected mean residual deviance to be 0.561641366536, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(47.6819999424-predictions[0].mean()) < 1e-6, "Expected mean of predictions to be 47.6819999424, but got " \
                                                          "{0}".format(predictions[0].mean())
    assert abs(1.90409304033-predictions[0].min()) < 1e-6, "Expected min of predictions to be 1.90409304033, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(280.735054543-predictions[0].max()) < 1e-6, "Expected max of predictions to be 280.735054543, but got " \
                                                          "{0}".format(predictions[0].max())

    # with offset
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="tweedie",
                          hidden=[1],
                          epochs=1000,
                          train_samples_per_iteration=-1,
                          reproducible=True,
                          activation="Tanh",
                          single_node_mode=False,
                          balance_classes=False,
                          force_load_balance=False,
                          seed=23123,
                          tweedie_power=1.5,
                          score_training_samples=0,
                          score_validation_samples=0,
                          offset_column="offset",
                          training_frame=insurance)
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.261065520191-mean_residual_deviance) < 1e-6, "Expected mean residual deviance to be 0.261065520191, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(49.2939039783-predictions[0].mean()) < 1e-6, "Expected mean of predictions to be 49.2939039783, but got " \
                                                          "{0}".format(predictions[0].mean())
    assert abs(1.07391126487-predictions[0].min()) < 1e-6, "Expected min of predictions to be 1.07391126487, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(397.328758591-predictions[0].max()) < 1e-6, "Expected max of predictions to be 397.328758591, but got " \
                                                          "{0}".format(predictions[0].max())
Code Example #53
def tweedie_offset():

    insurance = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    # without offset
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="tweedie",
                          hidden=[1],
                          epochs=1000,
                          train_samples_per_iteration=-1,
                          reproducible=True,
                          activation="Tanh",
                          single_node_mode=False,
                          balance_classes=False,
                          force_load_balance=False,
                          seed=23123,
                          tweedie_power=1.5,
                          score_training_samples=0,
                          score_validation_samples=0)

    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.556 - mean_residual_deviance) < 1e-3, "Expected mean residual deviance to be 0.556, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(47.61-predictions[0].mean()[0]) < 1e-2, "Expected mean of predictions to be 47.61, but got " \
                                                          "{0}".format(predictions[0].mean()[0])
    assert abs(1.94-predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.94, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(284.6-predictions[0].max()) < 28, "Expected max of predictions to be 284.6, but got " \
                                                          "{0}".format(predictions[0].max())

    # with offset
    dl = h2o.deeplearning(x=insurance[0:3],
                          y=insurance["Claims"],
                          distribution="tweedie",
                          hidden=[1],
                          epochs=1000,
                          train_samples_per_iteration=-1,
                          reproducible=True,
                          activation="Tanh",
                          single_node_mode=False,
                          balance_classes=False,
                          force_load_balance=False,
                          seed=23123,
                          tweedie_power=1.5,
                          score_training_samples=0,
                          score_validation_samples=0,
                          offset_column="offset",
                          training_frame=insurance)
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.261-mean_residual_deviance) < 1e-2, "Expected mean residual deviance to be 0.261, but got " \
                                                         "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(49.53-predictions[0].mean()[0]) < 1e-1, "Expected mean of predictions to be 49.53, but got " \
                                                          "{0}".format(predictions[0].mean()[0])
    assert abs(1.074-predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.074, but got " \
                                                          "{0}".format(predictions[0].min())
    assert abs(397.3-predictions[0].max()) < 40, "Expected max of predictions to be 397.3, but got " \
                                                          "{0}".format(predictions[0].max())
Code Example #54
File: demo.py (Project: xc35/h2o-3)
def deeplearning_demo(interactive, echo, test):
    h2o_data_path = system_file("prostate.csv")

    demo_description = [
        '\n-----------------------------------------------------------------',
        'This is a demo of H2O\'s Deeplearning function.',
        'It uploads a dataset to h2o, parses it, and shows a description.',
        'Then, it divides the dataset into training and test sets, ',
        'builds a model from the training set, and predicts on the test set.',
        'Finally, default performance metrics are displayed.',
        '-----------------------------------------------------------------'
    ]

    demo_commands = [
        '# Connect to h2o', '>>> h2o.init()\n',
        '\n# Upload the prostate dataset that comes included in the h2o python package',
        '>>> prostate = h2o.upload_file(path = ' + h2o_data_path + ')\n',
        '\n# Print a description of the prostate data',
        '>>> prostate.summary()\n',
        '\n# Randomly split the dataset into ~70/30, training/test sets',
        '>>> r = prostate[0].runif()', '>>> train = prostate[r < 0.70]',
        '>>> test = prostate[r >= 0.70]\n',
        '\n# Convert the response columns to factors (for binary classification problems)',
        '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()',
        '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n',
        '\n# Build a (classification) Deeplearning model',
        '>>> prostate_dl = h2o.deeplearning(x=train[list(set(prostate.col_names)-set(["ID","CAPSULE"]))]'
        ', y=train["CAPSULE"], activation="Tanh", hidden=[10, 10, 10], epochs=10000)\n',
        '\n# Show the model', '>>> prostate_dl.show()\n',
        '\n# Predict on the test set and show the first ten predictions',
        '>>> predictions = prostate_dl.predict(test)',
        '>>> predictions.show()\n', '\n# Show default performance metrics',
        '>>> performance = prostate_dl.model_performance(test)',
        '>>> performance.show()\n'
    ]

    for line in demo_description:
        print line
    print

    echo_and_interact(demo_commands, interactive, echo)
    if not test: h2o.init()

    echo_and_interact(demo_commands, interactive, echo)
    prostate = h2o.upload_file(path=h2o_data_path)

    echo_and_interact(demo_commands, interactive, echo)
    prostate.summary()

    echo_and_interact(demo_commands, interactive, echo, npop=4)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    echo_and_interact(demo_commands, interactive, echo)
    prostate_dl = h2o.deeplearning(
        x=train[list(set(prostate.col_names) - set(["ID", "CAPSULE"]))],
        y=train["CAPSULE"],
        activation="Tanh",
        hidden=[10, 10, 10],
        epochs=10000)

    echo_and_interact(demo_commands, interactive, echo)
    prostate_dl.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    predictions = prostate_dl.predict(test)
    predictions.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    performance = prostate_dl.model_performance(test)
    performance.show()
Code Example #55
def domain_check():

    air_train = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_train.show()
    air_test = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test.show()

    actual_domain = [u'YES', u'NO']
    print "actual domain of the response: {0}".format(actual_domain)

    ### DRF ###
    print
    print "-------------- DRF:"
    print
    rf = h2o.random_forest(x=air_train[[
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]],
                           y=air_train["IsDepDelayed"].asfactor(),
                           training_frame=air_train)
    computed_domain = rf._model_json['output'][
        'training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = rf.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    ### GBM ###
    print
    print "-------------- GBM:"
    print
    gbm = h2o.gbm(x=air_train[[
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]],
                  y=air_train["IsDepDelayed"].asfactor(),
                  training_frame=air_train,
                  distribution="bernoulli")
    computed_domain = gbm._model_json['output'][
        'training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = gbm.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    ### Deeplearning ###
    print
    print "-------------- Deeplearning:"
    print
    dl = h2o.deeplearning(x=air_train[[
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]],
                          y=air_train["IsDepDelayed"].asfactor(),
                          training_frame=air_train,
                          activation="Tanh",
                          hidden=[2, 2, 2],
                          epochs=10)
    computed_domain = dl._model_json['output'][
        'training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = dl.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    ### GLM ###
    print
    print "-------------- GLM:"
    print
    glm = h2o.glm(x=air_train[[
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]],
                  y=air_train["IsDepDelayed"],
                  training_frame=air_train,
                  family="binomial")
    computed_domain = glm._model_json['output'][
        'training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = glm.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
                            "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)
Code Example #56
def deeplearning_basic(ip, port):
    h2o.init(ip, port)

    iris_hex = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    hh = h2o.deeplearning(x=iris_hex[:3], y=iris_hex[4], loss='CrossEntropy')
    hh.show()
Code Example #57
File: dl_h2o.py (Project: akiratu/deep-domino)
import sys
sys.prefix = "/usr/local"

# Start up H2O
import h2o
h2o.init(start_h2o=True)

# Load the dataset
prostate = h2o.upload_file(path=h2o.locate("datasets/prostate.csv"))
prostate.describe()

# Set the CAPSULE column to be a factor column then build the model
prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
model = h2o.deeplearning(x=prostate[list(set(prostate.col_names) - set(["ID", "CAPSULE"]))],
                         y = prostate["CAPSULE"],
                         training_frame=prostate,
                         activation="Tanh",
                         hidden=[10, 10, 10],
                         epochs=10000)
model.show()

# Make predictions with the trained model
predictions = model.predict(prostate)
predictions.show()

# Check performance of the classification model
performance = model.model_performance(prostate)
performance.show()

# Domino Diagnostic Statistics
r2 = performance.r2()
mse = performance.mse()
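For a binomial model like this one, the same performance object also exposes classification metrics. A sketch (the auc() and logloss() accessors on the metrics object are assumptions here, not part of the original script):

print("AUC: {0}".format(performance.auc()))
print("logloss: {0}".format(performance.logloss()))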
Code Example #58
File: utilsPY.py (Project: konor/h2o-3)
# Imports assumed by this helper; the original utilsPY.py snippet begins at the
# function definition, but the body uses os, re, sys, subprocess and h2o.
import os
import re
import subprocess
import sys
from subprocess import PIPE, STDOUT

import h2o


def javapredict(algo, equality, train, test, x, y, **kwargs):
    print "Creating model in H2O"
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))
    print model

    print "Downloading Java prediction model code from H2O"
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print "h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar)
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print "java code saved in {0}".format(java_file)

    print "Predicting in H2O"
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(
        out_h2o_csv), "Expected file {0} to exist, but it does not.".format(
            out_h2o_csv)
    print "H2O Predictions saved in {0}".format(out_h2o_csv)

    print "Setting up for Java POJO"
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    f = open(in_csv, 'r+')
    csv = f.read()
    csv = re.sub('\"', '', csv)
    f.seek(0)
    f.write(csv)
    f.truncate()
    f.close()
    assert os.path.exists(
        in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print "Input CSV to PredictCsv saved in {0}".format(in_csv)

    print "Compiling Java Pojo"
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m",
        java_file
    ]
    subprocess.check_call(javac_cmd)

    print "Running PredictCsv Java Program"
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, e = p.communicate()
    print "Java output: {0}".format(o)
    assert os.path.exists(
        out_pojo_csv), "Expected file {0} to exist, but it does not.".format(
            out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print "Pojo predictions saved in {0}".format(out_pojo_csv)

    print "Comparing predictions between H2O and Java POJO"
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(
        hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(
        hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(
                hp - pp
            ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        else:
            raise ValueError(
                "equality type {0} is not supported".format(equality))