Code Example #1
File: pyunit_ks_metric.py Project: zoudongyang/h2o-3
def kolmogorov_smirnov():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=20)
    model.train(x=["Origin", "Distance"], y="IsDepDelayed", training_frame=airlines)
    verify_ks(model, airlines)

    model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=5)
    model.train(x=["Origin", "Distance"], y="IsDepDelayed", training_frame=airlines)
    ks = model.kolmogorov_smirnov()
    print(ks)
    ks_verification = ks_metric(model, airlines)
    print(ks_verification)
    assert round(ks, 5) == round(ks_verification, 5)

    model = H2OXGBoostEstimator(gainslift_bins=10)
    model.train(x=["Origin", "Distance"], y="IsDepDelayed", training_frame=airlines)
    print(model.gains_lift())
    ks = model.kolmogorov_smirnov()
    assert ks is not None
    assert 0 < ks < 1

    # Test gains/lift table is None when gainslift_bins = 0
    model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=0)
    model.train(x=["Origin", "Distance"], y="IsDepDelayed", training_frame=airlines)
    assert model.gains_lift() is None
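The verify_ks and ks_metric helpers referenced above are not part of this snippet. A minimal sketch of an independent check, assuming a scipy-based two-sample KS on the scored frame (ks_by_scipy is a hypothetical stand-in, not the actual pyunit helper):

from scipy.stats import ks_2samp

def ks_by_scipy(model, frame, response="IsDepDelayed"):
    # Compare P(YES) between the two observed classes; the two-sample KS
    # statistic should agree with model.kolmogorov_smirnov() up to the
    # resolution of the gainslift_bins binning.
    scored = model.predict(frame).cbind(frame[response]).as_data_frame()
    positives = scored.loc[scored[response] == "YES", "YES"]
    negatives = scored.loc[scored[response] == "NO", "YES"]
    return ks_2samp(positives, negatives).statistic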
Code Example #2
def gbm_monotone_quantile_test():
    # generate data
    x = np.atleast_1d(np.random.uniform(0, 10.0, size=100)).T

    y = f(x).ravel()

    dy = 1.5 + 1.0 * np.random.random(y.shape)
    noise = np.random.normal(0, dy)
    y += noise

    train = h2o.H2OFrame({'x': x.tolist(), 'y': y.tolist()})

    # train a model with 1 constraint on x
    gbm_mono = H2OGradientBoostingEstimator(seed=42,
                                            distribution="quantile",
                                            monotone_constraints={"x": 1})
    gbm_mono.train(y='y', training_frame=train)

    mono_pred = gbm_mono.predict(train).as_data_frame().iloc[:, 0].tolist()
    x_sorted, mono_pred_sorted = zip(*sorted(zip(x, mono_pred)))
    assert all(a <= b for a, b in zip(mono_pred_sorted, mono_pred_sorted[1:])), \
        "The predictions should be monotonically non-decreasing."

    # train a model with -1 constraint on x
    gbm_adverse = H2OGradientBoostingEstimator(seed=42,
                                               distribution="quantile",
                                               monotone_constraints={"x": -1})
    gbm_adverse.train(y='y', training_frame=train)

    adverse_pred = gbm_adverse.predict(train).as_data_frame().iloc[:, 0].tolist()
    x_sorted, adverse_pred_sorted = zip(*sorted(zip(x, adverse_pred)))
    assert all(a >= b for a, b in zip(adverse_pred_sorted, adverse_pred_sorted[1:])), \
        "The predictions should be monotonically non-increasing."
Code Example #3
def checkpointing_test():
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines, validation_frame=airlines)
    
    checkpointed_gbm = H2OGradientBoostingEstimator(ntrees=2, checkpoint=gbm)
    checkpointed_gbm.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines, validation_frame=airlines)
    assert checkpointed_gbm.checkpoint == gbm

    checkpointed_gbm = H2OGradientBoostingEstimator(ntrees=2, checkpoint=gbm.model_id)
    checkpointed_gbm.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines, validation_frame=airlines)
    assert checkpointed_gbm.checkpoint == gbm.model_id
Code Example #4
def test_frames_can_be_passed_to_constructor():
    ds = import_dataset()

    gbm = H2OGradientBoostingEstimator(ntrees=10,
                                       nfolds=0,
                                       seed=seed,
                                       training_frame=ds['train'],
                                       validation_frame=ds['valid'])
    gbm.train(y=ds['target'])

    rf = H2ORandomForestEstimator(ntrees=10,
                                  nfolds=0,
                                  seed=seed,
                                  training_frame=ds['train'],
                                  validation_frame=ds['valid'])
    rf.train(y=ds['target'])

    se = H2OStackedEnsembleEstimator(base_models=[gbm, rf],
                                     seed=seed,
                                     training_frame=ds['train'],
                                     validation_frame=ds['valid'],
                                     blending_frame=ds['blend'])
    se.train(y=ds['target'])

    assert se.auc() > 0
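import_dataset and the module-level seed are helpers of the surrounding test module and are not shown. A hypothetical stand-in built from pieces already used elsewhere in these examples (load_dataset, asfactor, split_frame):

import h2o

seed = 42  # the tests reference a module-level seed; the value here is arbitrary

def import_dataset():
    # Hypothetical helper: any frame with a binary target works.
    df = h2o.load_dataset("prostate")
    target = "CAPSULE"
    df[target] = df[target].asfactor()
    train, valid, blend = df.split_frame(ratios=[0.6, 0.2], seed=seed)
    return {'train': train, 'valid': valid, 'blend': blend, 'target': target}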
Code Example #5
def mojo_conveniece():
    
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)
    
    # Load the model from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)
    
    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO UPLOAD TEST
    #####

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)
    # Load the model from the temporary file
    mojo_model = h2o.upload_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
Code Example #6
def partial_plots():
    data = h2o.import_file(
        pyunit_utils.locate('smalldata/prostate/prostate.csv'))

    x = ['AGE', 'RACE']
    y = 'CAPSULE'
    data[y] = data[y].asfactor()
    data['RACE'] = data['RACE'].asfactor()

    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05)
    gbm_model.train(x=x, y=y, training_frame=data)

    # test saving:
    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)
        test_plot_result_saving(
            gbm_model.partial_plot(data=data,
                                   cols=['AGE'],
                                   server=True,
                                   plot=True,
                                   row_index=1), path2,
            gbm_model.partial_plot(data=data,
                                   cols=['AGE'],
                                   server=True,
                                   plot=True,
                                   row_index=1,
                                   save_plot_path=path1), path1)
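test_plot_result_saving comes from the test utilities and is not shown. A hypothetical sketch consistent with the call above, where only the second partial_plot call receives save_plot_path:

import os

def test_plot_result_saving(result_not_saved, path_not_written, result_saved, path_written):
    # Only the call that was given save_plot_path should have written a file.
    assert result_not_saved is not None
    assert not os.path.exists(path_not_written)
    assert result_saved is not None
    assert os.path.exists(path_written)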
Code Example #7
def tree_test():

    # GBM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    tree = H2OTree(gbm, 0, "NO")  # Indexing from 0 in Python. There is exactly one tree built
    check_tree(tree, 0, "NO")
    assert tree.root_node.left_levels is not None  # Only categoricals in the model, guaranteed to have categorical split
    assert tree.root_node.right_levels is not None  # Only categoricals in the model, guaranteed to have categorical split

    # DRF
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_nice_header.csv"))
    drf = H2ORandomForestEstimator(ntrees=2)
    drf.train(x=["power", "acceleration"], y="cylinders", training_frame=cars)

    drf_tree = H2OTree(drf, 1, None)
    check_tree(drf_tree, 1)

    # ISOFOR
    ecg_discord = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))
    isofor = H2OIsolationForestEstimator(ntrees=3, seed=12, sample_size=5)
    isofor.train(training_frame=ecg_discord)

    if_tree = H2OTree(isofor, 2)
    check_tree(if_tree, 2)
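check_tree is a helper of the test module and is not shown. A hypothetical minimal version, assuming only documented H2OTree attributes (tree_number, tree_class, root_node, len()):

def check_tree(tree, tree_number, tree_class=None):
    # Sanity-check the fetched tree's metadata and structure.
    assert tree is not None
    assert len(tree) > 0                 # number of nodes
    assert tree.root_node is not None
    assert tree.tree_number == tree_number
    if tree_class is not None:
        assert tree.tree_class == tree_class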
Code Example #8
def download_model():
    prostate = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                                ntrees=10,
                                                max_depth=8,
                                                min_rows=10,
                                                learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE",
                       training_frame=prostate)

    path = pyunit_utils.locate("results")

    downloaded_model_path = prostate_gbm.download_model(path=path)
    assert os.path.isfile(downloaded_model_path), \
        "Expected load file {0} to exist, but it does not.".format(downloaded_model_path)

    loaded_model = h2o.load_model(downloaded_model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator), \
        "Expected an H2OGradientBoostingEstimator, but got {0}".format(downloaded_model_path)

    uploaded_model = h2o.upload_model(downloaded_model_path)
    assert isinstance(uploaded_model, H2OGradientBoostingEstimator), \
        "Expected an H2OGradientBoostingEstimator, but got {0}".format(downloaded_model_path)
Code Example #9
    def test_custom_metric(self):
        from custom_metric_class import WeightedFalseNegativeLossMetric
        train_path = "file://" + unit_test_utils.locate("smalldata/loan.csv")
        train = h2o.import_file(train_path, destination_frame="loan_train")
        train["bad_loan"] = train["bad_loan"].asfactor()

        y = "bad_loan"
        x = train.col_names
        x.remove(y)
        x.remove("int_rate")

        train["weight"] = train["loan_amnt"]

        weighted_false_negative_loss_func = h2o.upload_custom_metric(
            WeightedFalseNegativeLossMetric,
            func_name="WeightedFalseNegativeLoss",
            func_file="weighted_false_negative_loss.py")
        from h2o.estimators import H2OGradientBoostingEstimator
        gbm = H2OGradientBoostingEstimator(
            model_id="gbm.hex",
            custom_metric_func=weighted_false_negative_loss_func)
        gbm.train(y=y, x=x, training_frame=train, weights_column="weight")

        perf = gbm.model_performance()
        self.assertEqual(perf.custom_metric_name(),
                         "WeightedFalseNegativeLoss")
        self.assertEqual(perf.custom_metric_value(), 0.24579011595430142)
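The custom_metric_class module is not shown. H2O custom metrics are classes with map/reduce/metric methods uploaded via h2o.upload_custom_metric; the sketch below is a plausible shape for a weighted false-negative loss, with the cost constant and threshold being assumptions rather than the definition behind the expected value 0.24579011595430142:

class WeightedFalseNegativeLossMetric:
    def map(self, predicted, actual, weight, offset, model):
        # predicted is [label, p0, p1] for a binomial model
        cost_of_fn = 5000.0            # assumed per-unit cost of a false negative
        y = actual[0]
        p1 = predicted[2]
        loss = weight * cost_of_fn if (y == 1 and p1 < 0.5) else 0.0
        return [loss, weight]

    def reduce(self, left, right):
        return [left[0] + right[0], left[1] + right[1]]

    def metric(self, state):
        return state[0] / state[1]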
Code Example #10
def test_frames_can_be_overridden_in_train_method():
    ds = import_dataset()

    dummy_frame = h2o.H2OFrame([1, 2, 3])

    gbm = H2OGradientBoostingEstimator(ntrees=10, nfolds=0, seed=seed,
                                       training_frame=dummy_frame,
                                       validation_frame=dummy_frame)
    gbm.train(y=ds['target'],
              training_frame=ds['train'],
              validation_frame=ds['valid'])

    rf = H2ORandomForestEstimator(ntrees=10, nfolds=0, seed=seed,
                                  training_frame=dummy_frame,
                                  validation_frame=dummy_frame)
    rf.train(y=ds['target'],
             training_frame=ds['train'],
             validation_frame=ds['valid'])

    se = H2OStackedEnsembleEstimator(base_models=[gbm, rf], seed=seed,
                                     training_frame=dummy_frame,
                                     validation_frame=dummy_frame,
                                     blending_frame=dummy_frame)
    se.train(y=ds['target'],
             training_frame=ds['train'],
             validation_frame=ds['valid'],
             blending_frame=ds['blend'])

    assert se.auc() > 0
Code Example #11
def test_frames_can_be_passed_as_key():
    ds = import_dataset()

    kw_args = [
        dict(training_frame=ds['train'].frame_id),
        dict(training_frame=ds['train'], validation_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], blending_frame=ds['blend'].frame_id),
    ]

    # Constructor validation
    for kwargs in kw_args:
        H2OStackedEnsembleEstimator(base_models=[], **kwargs)

    # train method validation
    base_model_params = dict(ntrees=3, nfolds=3, seed=seed, keep_cross_validation_predictions=True)
    for kwargs in kw_args:
        base_training_args = {k: v for k, v in kwargs.items() if k != 'blending_frame'}
        base_training_args['y'] = ds['target']
        gbm = H2OGradientBoostingEstimator(**base_model_params)
        gbm.train(**base_training_args)
        rf = H2ORandomForestEstimator(**base_model_params)
        rf.train(**base_training_args)
        
        se = H2OStackedEnsembleEstimator(base_models=[gbm, rf])
        se.train(y=ds['target'], **kwargs)
Code Example #12
def gbm_model_build():
    """
    Train a GBM model.
    :return: model, training frame
    """
    prostate_train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10
    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees,
                                           learn_rate=learning_rate,
                                           max_depth=depth,
                                           min_rows=min_rows,
                                           distribution="bernoulli")
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)),
                  y="CAPSULE",
                  training_frame=prostate_train)

    # Doing PFI on test data vs train data: In the end, you need to decide whether you want to know how much the
    # model relies on each feature for making predictions (-> training data) or how much the feature contributes to
    # the performance of the model on unseen data (-> test data). To the best of my knowledge, there is no research
    # addressing the question of training vs. test data
    return gbm_h2o, prostate_train
Code Example #13
    def demo_body(go):
        """
        Demo of H2O's Gradient Boosting estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GBM
        from the training set, and makes predictions for the test set.
        Finally, default performance metrics are displayed.
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a description of the prostate data
        prostate.describe()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        train, test = prostate.split_frame(ratios=[0.70])

        go()
        # Convert the response columns to factors (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GBM
        from h2o.estimators import H2OGradientBoostingEstimator
        prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=8,
                                                    min_rows=10, learn_rate=0.2)
        prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE", training_frame=train)

        go()
        # Show the model
        prostate_gbm.show()

        go()
        # Predict on the test set and show the first ten predictions
        predictions = prostate_gbm.predict(test)
        predictions.show()

        go()
        # Fetch a tree, print number of tree nodes, show root node description
        from h2o.tree import H2OTree, H2ONode
        tree = H2OTree(prostate_gbm, 0, "0")
        print(len(tree))
        print(tree.left_children)
        print(tree.right_children)
        tree.root_node.show()

        go()
        # Show default performance metrics
        performance = prostate_gbm.model_performance(test)
        performance.show()
Code Example #14
def generic_blank_constructor():

    # Train a model
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"],
                y="IsDepDelayed",
                training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.download_mojo(original_model_filename)

    # Load the model from the temporary file using an empty constructor
    mojo_model = H2OGenericEstimator()
    mojo_model.path = original_model_filename
    mojo_model.train()
    assert isinstance(mojo_model, H2OGenericEstimator)

    assert mojo_model._model_json["output"][
        "original_model_identifier"] == "gbm"
    assert mojo_model._model_json["output"][
        "original_model_full_name"] == "Gradient Boosting Machine"

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
Code Example #15
def gradient_boosting(name):
    """
    Get the Gradient Boosting model.
    :param name: model name; determines the filename
    :return:
    """
    params = get_params("gradient_boosting")
    return H2OGradientBoostingEstimator(model_id=name, **params)
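get_params is a project-specific helper and is not shown. A hypothetical version reading hyperparameters from a JSON config keyed by algorithm name (the filename and keys are assumptions):

import json

def get_params(algorithm):
    # e.g. {"gradient_boosting": {"ntrees": 100, "max_depth": 5, "learn_rate": 0.1}}
    with open("model_params.json") as fh:
        return json.load(fh)[algorithm]

With that in place, gradient_boosting("my_gbm") would return an estimator whose model_id (and hence any saved filename) is "my_gbm".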
Code Example #16
    def demo_body(go):
        """
        Demo of H2O's Gradient Boosting estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GBM
        from the training set, and makes predictions for the test set.
        Finally, default performance metrics are displayed.
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.upload_file(data_file("h2o_data/prostate.csv"))

        go()
        # Print a description of the prostate data
        prostate.summary()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        r = prostate[0].runif()
        train = prostate[r < 0.70]
        test = prostate[r >= 0.70]

        go()
        # Convert the response columns to factors (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GBM
        from h2o.estimators import H2OGradientBoostingEstimator
        prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                                    ntrees=10,
                                                    max_depth=8,
                                                    min_rows=10,
                                                    learn_rate=0.2)
        prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE",
                           training_frame=train)

        go()
        # Show the model
        prostate_gbm.show()

        go()
        # Predict on the test set and show the first ten predictions
        predictions = prostate_gbm.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_gbm.model_performance(test)
        performance.show()
Code Example #17
File: stars_gbm_recipe.py Project: h2oai/mojoland
 def bake(self) -> H2OGradientBoostingEstimator:
     fr = stars_frame()
     assert fr.type("distance") == "int"
     model = H2OGradientBoostingEstimator(ntrees=100,
                                          distribution="gaussian")
     model.train(y="distance",
                 training_frame=fr,
                 ignored_columns=["name1", "name2"])
     return model
Code Example #18
File: names_gbm_recipe.py Project: h2oai/mojoland
 def bake(self) -> H2OGradientBoostingEstimator:
     fr = names_frame()
     fr = fr[:5000, :]
     fr["name"] = fr["name"].ascharacter().asfactor()  # trim nlevels()
     assert 256 < fr["name"].nlevels()[0] < 500
     model = H2OGradientBoostingEstimator(ntrees=100,
                                          distribution="bernoulli")
     model.train(y="sex", training_frame=fr)
     return model
Code Example #19
File: example.py Project: kibernetika-ai/h2o
def main():
    args = parse_args()
    h2o.init(ip=args.host, port=args.port)

    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    # Print a description of the prostate data
    prostate.describe()

    # Randomly split the dataset into ~70/30, training/test sets
    client.update_task_info({
        'test_train': 0.7,
        'learn_rate': 0.2,
    })

    train, test = prostate.split_frame(ratios=[0.70])

    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    # Build a (classification) GBM
    from h2o.estimators import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                                ntrees=10,
                                                max_depth=8,
                                                min_rows=10,
                                                learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE",
                       training_frame=train)

    # Show the model
    prostate_gbm.show()

    # Predict on the test set and show the first ten predictions
    predictions = prostate_gbm.predict(test)
    predictions.show()

    # Fetch a tree and show the root node description
    from h2o.tree import H2OTree, H2ONode
    tree = H2OTree(prostate_gbm, 0, "0")
    tree.root_node.show()

    # Show default performance metrics
    performance = prostate_gbm.model_performance(test)
    performance.show()

    client.update_task_info({
        'mse': performance.mse(),
        'rmse': performance.rmse(),
        'auc': performance.auc(),
        'gini': performance.gini(),
        'logloss': performance.logloss(),
    })
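parse_args and client (a task-tracking client) belong to the surrounding project and are not shown. A minimal argparse stand-in for parse_args, covering only the host/port used above:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="H2O GBM example")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=54321)
    return parser.parse_args()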
Code Example #20
File: pyunit_tree.py Project: zoudongyang/h2o-3
def tree_test():

    # GBM
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    tree = H2OTree(
        gbm, 0,
        "NO")  # Indexing from 0 in Python. There is exactly one tree built
    check_tree(tree, 0, "NO")
    assert tree.root_node.left_levels is not None  # Only categoricals in the model, guaranteed to have categorical split
    assert tree.root_node.right_levels is not None  # Only categoricals in the model, guaranteed to have categorical split
    assert tree.left_cat_split is not None and tree.right_cat_split is not None
    assert len(tree.left_cat_split) == len(tree.right_cat_split)

    # Only categorical splits are present; leaf nodes should have None cat
    # splits, all other nodes should have non-None cat splits
    for i in range(0, len(tree.left_cat_split)):
        if tree.left_children[i] == -1 and tree.right_children[i] == -1:
            # Leaf nodes have no categorical split values
            assert tree.left_cat_split[i] is None
            assert tree.right_cat_split[i] is None
        else:
            assert tree.left_cat_split[i] is not None
            assert tree.right_cat_split[i] is not None

    # DRF
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_nice_header.csv"))
    drf = H2ORandomForestEstimator(ntrees=2)
    drf.train(x=["power", "acceleration"], y="cylinders", training_frame=cars)

    drf_tree = H2OTree(drf, 1, None)
    check_tree(drf_tree, 1)

    # ISOFOR
    ecg_discord = h2o.import_file(
        pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))
    isofor = H2OIsolationForestEstimator(ntrees=3, seed=12, sample_size=5)
    isofor.train(training_frame=ecg_discord)

    if_tree = H2OTree(isofor, 2)

    # There are no categorical splits, so every cat split entry must be None
    for i in range(0, len(if_tree.node_ids)):
        assert if_tree.left_cat_split[i] is None
        assert if_tree.right_cat_split[i] is None
        if if_tree.left_children[i] == -1 and if_tree.right_children[i] == -1:
            # Leaf nodes don't have split thresholds
            assert if_tree.thresholds[i] is None
        else:
            # All other nodes should have split thresholds
            assert if_tree.thresholds[i] is not None
    check_tree(if_tree, 2)
Code Example #21
def gbm_monotone_tweedie_test():
    data = h2o.import_file(
        "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/autoclaims.csv"
    )
    data = data.drop(
        ['POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_FLAG', 'IN_YY'])
    train, test = data.split_frame([0.8], seed=123)
    response = "CLM_AMT5"

    monotone_constraints = {
        "MVR_PTS": 1,
    }

    gbm_regular = H2OGradientBoostingEstimator(seed=42, distribution="tweedie")
    gbm_regular.train(y=response, training_frame=train, validation_frame=test)
    print(gbm_regular.varimp(use_pandas=True))
    top_3_vars_regular = gbm_regular.varimp(
        use_pandas=True).loc[:, 'variable'].head(3).tolist()
    assert "MVR_PTS" in top_3_vars_regular

    gbm_mono = H2OGradientBoostingEstimator(
        monotone_constraints=monotone_constraints,
        seed=42,
        distribution="tweedie")
    gbm_mono.train(y=response, training_frame=train, validation_frame=test)
    print(gbm_mono.varimp(use_pandas=True))
    top_3_vars_mono = gbm_mono.varimp(
        use_pandas=True).loc[:, 'variable'].head(3).tolist()

    # monotone constraints shouldn't affect the variable importance
    assert top_3_vars_mono == top_3_vars_regular

    # train a model with opposite constraint on MVR_PTS
    gbm_adverse = H2OGradientBoostingEstimator(
        seed=42, distribution="tweedie", monotone_constraints={"MVR_PTS": -1})
    gbm_adverse.train(y=response, training_frame=train, validation_frame=test)

    # the constrained variable should become the least important to the model
    assert ["MVR_PTS"] == gbm_adverse.varimp(
        use_pandas=True).loc[:, 'variable'].tail(1).tolist()
Code Example #22
def download_model_filename():
    fr = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default filename is model_id
    model_path = model.download_model()
    # It should be saved in server working directory
    assert model_path.endswith(
        model.model_id), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Default filename is model_id
    tmpdir = tempfile.mkdtemp()
    model_path = model.download_model(tmpdir)
    assert_equals(os.path.join(tmpdir, model.model_id), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename="gbm_prostate")
    assert_equals(os.path.join(tmpdir, "gbm_prostate"), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename="gbm_prostate.model")
    assert_equals(os.path.join(tmpdir, "gbm_prostate.model"), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir,
                                      filename=os.path.join(
                                          "not-existing-folder",
                                          "gbm_prostate.model"))
    assert_equals(
        os.path.join(tmpdir, "not-existing-folder", "gbm_prostate.model"),
        model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with default path
    model_path = model.download_model(filename="gbm_prostate2.model")
    assert model_path.endswith(
        "gbm_prostate2.model"), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)
Code Example #23
def test_train_returns_the_trained_model():
    fr = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    target = "CAPSULE"
    fr[target] = fr[target].asfactor()

    gbm = H2OGradientBoostingEstimator(model_id="py_gbm_train_result", seed=42)
    model = gbm.train(y=target, training_frame=fr)

    assert isinstance(model, ModelBase)
    assert model is gbm
    model.predict(fr)
Code Example #24
def test_mojo_ids():

    # Train a model
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"],
                y="IsDepDelayed",
                training_frame=airlines,
                verbose=False)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)

    original_model_id = model.model_id
    print(original_model_id)

    # Import MOJO from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename,
                                 model_id=original_model_id)
    print(mojo_model.model_id)
    assert_equals(mojo_model.model_id, original_model_id,
                  "Ids should be the same.")

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Upload MOJO from the temporary file
    mojo_model_up = h2o.upload_mojo(original_model_filename,
                                    model_id=original_model_id)
    print(mojo_model_up.model_id)
    assert_equals(mojo_model_up.model_id, original_model_id,
                  "Ids should be the same.")

    # Load MOJO model from file
    mojo_model_from_file = H2OGenericEstimator.from_file(
        original_model_filename, original_model_id)
    print(mojo_model_from_file.model_id)
    assert_equals(mojo_model_from_file.model_id, original_model_id,
                  "Ids should be the same.")

    # Test that model_id is initialized from the path
    mojo_model_up_wid = h2o.upload_mojo(original_model_filename)
    print(mojo_model_up_wid.model_id)
    assert_equals(mojo_model_up_wid.model_id, original_model_id,
                  "Ids should be the same.")

    mojo_model_im_wid = h2o.import_mojo(original_model_filename)
    print(mojo_model_im_wid.model_id)
    assert_equals(mojo_model_im_wid.model_id, original_model_id,
                  "Ids should be the same.")
Code Example #25
File: h2oModeling.py Project: JeremyLG/RinseOverRun
def train_gbm(train, valid):
    hf, vf = convert_frames(train, valid)
    gbm = H2OGradientBoostingEstimator(model_id="Ayaya_gbm",
                                       seed=1337,
                                       ntrees=500,
                                       stopping_metric="custom",
                                       stopping_rounds=10,
                                       stopping_tolerance=0.001,
                                       custom_metric_func=mape_func)
    gbm.train(training_frame=hf,
              y="final_rinse_total_turbidity_liter",
              validation_frame=vf)
    return gbm
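convert_frames and mape_func come from the surrounding project; mape_func is presumably a reference returned by h2o.upload_custom_metric (as in Code Example #9). A minimal convert_frames stand-in, assuming train/valid arrive as pandas DataFrames:

import h2o

def convert_frames(train, valid):
    # Convert in-memory DataFrames to H2OFrames for training/validation.
    return h2o.H2OFrame(train), h2o.H2OFrame(valid)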
Code Example #26
def retain_keys_test():
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    h2o.remove_all([airlines.frame_id, gbm.model_id])

    assert h2o.get_frame(airlines.frame_id) is not None
    assert h2o.get_model(gbm.model_id) is not None

    ## Test that the key is not retained when unspecified
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    h2o.remove_all([airlines.frame_id])
    h2o.ls()
    try:
        h2o.get_model(gbm.model_id)
        assert False
    except h2o.exceptions.H2OResponseError as e:
        assert e.args[0].dev_msg.find("not found for argument: key") != -1
Code Example #27
def mojo_conveniece():
    # Train a model
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"],
                y="IsDepDelayed",
                training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)

    # Load the model from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO UPLOAD TEST
    #####

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)
    # Load the model from the temporary file
    mojo_model = h2o.upload_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO to POJO Conversion test with POJO re-import
    #####

    pojo_directory = os.path.join(pyunit_utils.locate("results"),
                                  model.model_id + ".java")
    pojo_path = model.download_pojo(path=pojo_directory)
    mojo2_model = h2o.import_mojo(pojo_path)

    predictions2 = mojo2_model.predict(airlines)
    assert predictions2 is not None
    assert predictions2.nrows == 24421
    assert_frame_equal(predictions.as_data_frame(),
                       predictions2.as_data_frame())
Code Example #28
def test_checkpointing_gives_equal_model_summary():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    predictors = ["ID", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    response = "CAPSULE"    
    gbm = H2OGradientBoostingEstimator(ntrees=50, seed=1111)
    gbm.train(x=predictors, y=response, training_frame=prostate)
    
    checkpointed_gbm = H2OGradientBoostingEstimator(ntrees=100, seed=1111, checkpoint=gbm.model_id)
    checkpointed_gbm.train(x=predictors, y=response, training_frame=prostate)

    gbm_ref = H2OGradientBoostingEstimator(ntrees=100, seed=1111)
    gbm_ref.train(x=predictors, y=response, training_frame=prostate)
    assert checkpointed_gbm.checkpoint == gbm.model_id

    checkpoint_summary = checkpointed_gbm._model_json["output"]["model_summary"]
    expected_summary = gbm_ref._model_json["output"]["model_summary"]
    print(checkpoint_summary)
    print(expected_summary)
    assert abs(expected_summary["model_size_in_bytes"][0] - checkpoint_summary["model_size_in_bytes"][0]) <= 20, "Not expected size of model created from checkpoint"
    assert_equals(expected_summary["mean_depth"][0], checkpoint_summary["mean_depth"][0])
    assert_equals(expected_summary["min_leaves"][0], checkpoint_summary["min_leaves"][0])
    assert_equals(expected_summary["max_leaves"][0], checkpoint_summary["max_leaves"][0])
    assert_equals(expected_summary["mean_leaves"][0], checkpoint_summary["mean_leaves"][0])
Code Example #29
def trainAndTestH2OPythonGbm(hc, dataset):
    h2oframe = hc.asH2OFrame(dataset)
    label = "CAPSULE"
    gbm = H2OGradientBoostingEstimator(seed=42)
    gbm.train(y=label, training_frame=h2oframe)
    directoryName = tempfile.mkdtemp(prefix="")
    try:
        mojoPath = gbm.download_mojo(directoryName)
        settings = H2OMOJOSettings(withDetailedPredictionCol=True)
        model = H2OMOJOModel.createFromMojo("file://" + mojoPath, settings)
        return model.transform(dataset).select(
            "prediction", "detailed_prediction.probabilities.0",
            "detailed_prediction.probabilities.1")
    finally:
        shutil.rmtree(directoryName)
Code Example #30
def stackedensemble_mojo_model_test():
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    rf = H2ORandomForestEstimator(nfolds=nfolds,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True)
    rf.train(x=x, y=y, training_frame=train)
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, rf.model_id])
    se.train(x=x, y=y, training_frame=train)
    print(se)
    with Capturing() as original_output:
        se.show()

    original_model_filename = tempfile.mkdtemp()
    original_model_filename = se.download_mojo(original_model_filename)

    key = h2o.lazy_import(original_model_filename)
    fr = h2o.get_frame(key[0])
    generic_mojo_model = H2OGenericEstimator(model_key=fr)
    generic_mojo_model.train()
    compare_params(se, generic_mojo_model)

    predictions = generic_mojo_model.predict(test)
    assert predictions is not None

    # Test constructor generating the model from existing MOJO file
    generic_mojo_model_from_file = H2OGenericEstimator.from_file(
        original_model_filename)
    assert generic_mojo_model_from_file is not None
    predictions = generic_mojo_model_from_file.predict(test)
    assert predictions is not None

    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(
        path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(
        original_model_filename)