def test_infogram_personal_loan():
    """
    Check the warnings raised by an infogram with protected columns.

    The Bank Personal Loan data set is loaded, the response is converted to
    a factor, and an H2OInfogram with ``protected_columns`` set is trained
    while ``net_information_threshold`` and ``total_information_threshold``
    are also supplied.  The test passes when exactly two warnings are
    recorded and ``pyunit_utils.contains_warning`` accepts the fragment
    'information_threshold for fair infogram runs.'.
    """
    csv_path = pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv")
    loan_frame = h2o.import_file(path=csv_path)

    response = "Personal Loan"
    loan_frame[response] = loan_frame[response].asfactor()

    predictors = [
        "Experience",
        "Income",
        "Family",
        "CCAvg",
        "Education",
        "Mortgage",
        "Securities Account",
        "CD Account",
        "Online",
        "CreditCard",
    ]

    # Both the training and the assertions run while warnings are recorded.
    with pyunit_utils.catch_warnings() as caught:
        infogram_model = H2OInfogram(
            seed=12345,
            protected_columns=["Age", "ZIP Code"],
            top_n_features=len(predictors),
            net_information_threshold=0.2,
            total_information_threshold=0.2,
        )
        infogram_model.train(x=predictors, y=response, training_frame=loan_frame)

        assert len(caught) == 2, \
            "Expected two warnings but received {0} warnings instead.".format(len(caught))
        assert pyunit_utils.contains_warning(
            caught, 'information_threshold for fair infogram runs.')
def test_infogram_iris_wrong_thresholds():
    """
    Check the warnings raised by an infogram without protected columns.

    The iris data set is loaded, ``Species`` is converted to a factor and
    every other column is used as a predictor.  An H2OInfogram is trained
    with ``safety_index_threshold`` and ``relevance_index_threshold``
    supplied.  The test passes when exactly two warnings are recorded and
    ``pyunit_utils.contains_warning`` accepts the fragment
    'index_threshold for core infogram runs.'.
    """
    csv_path = pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv")
    iris_frame = h2o.import_file(path=csv_path)

    response = "Species"
    iris_frame[response] = iris_frame[response].asfactor()

    # Predictors are all frame columns except the response.
    predictors = iris_frame.names
    predictors.remove(response)

    with pyunit_utils.catch_warnings() as caught:
        infogram_model = H2OInfogram(
            seed=12345,
            distribution='multinomial',
            safety_index_threshold=0.2,
            relevance_index_threshold=0.2,
            top_n_features=len(predictors),
        )
        infogram_model.train(x=predictors, y=response, training_frame=iris_frame)

        assert len(caught) == 2, \
            "Expected two warnings but received {0} warnings instead.".format(len(caught))
        assert pyunit_utils.contains_warning(
            caught, 'index_threshold for core infogram runs.')
Beispiel #3
0
def test_binomial_response_warning():
    """
    Check that GBM warns about a numeric response with two unique values.

    A single-tree GBM is trained on the titanic data with the numeric
    ``survived`` column as the response, and the recorded warnings must
    contain the expected message.  The rows where the response equals 0 are
    then overwritten with -1 and the same check is repeated, so the warning
    is expected for the rewritten response values as well.
    """
    expected_warning = (
        'We have detected that your response column has only 2 unique values (0/1). '
        'If you wish to train a binary model instead of a regression model, '
        'convert your target column to categorical before training.'
    )

    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
    response = "survived"
    predictors = ["name", "sex"]

    def train_and_expect_warning():
        # Train a fresh one-tree GBM and assert while warnings are recorded.
        with pyunit_utils.catch_warnings() as caught:
            gbm = H2OGradientBoostingEstimator(ntrees=1)
            gbm.train(x=predictors, y=response, training_frame=titanic)
            assert pyunit_utils.contains_warning(caught, expected_warning)

    train_and_expect_warning()

    # Replace every 0 in the response by -1, then check again.
    titanic[titanic[response] == 0, response] = -1
    train_and_expect_warning()
    def test_reproducible_early_stopping_warning():
        """
        Check the warning about non-reproducible early stopping.

        ``estimator`` is taken from the enclosing scope (not visible in this
        block).  A model is built with ``stopping_rounds=1`` and
        ``stopping_metric="mse"`` and trained on the Boston housing data
        using columns 0-12 as predictors and column 13 as the response.  The
        recorded warnings must contain the expected message.
        """
        expected_message = (
            'early stopping is enabled but neither score_tree_interval or '
            'score_each_iteration are defined. Early stopping will not be reproducible!'
        )
        boston = h2o.import_file(
            pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))

        with pyunit_utils.catch_warnings() as caught:
            model = estimator(stopping_rounds=1, stopping_metric="mse")
            model.train(x=list(range(13)), y=13, training_frame=boston)
            assert pyunit_utils.contains_warning(caught, expected_message)
    def test_no_warning_score_each_iteration():
        """
        Check that no warning is recorded when ``score_each_iteration`` is on.

        ``estimator`` is taken from the enclosing scope (not visible in this
        block).  A model is built with ``stopping_rounds=1``,
        ``stopping_metric="mse"`` and ``score_each_iteration=True`` and
        trained on the Boston housing data using columns 0-12 as predictors
        and column 13 as the response.  ``pyunit_utils.no_warnings`` must
        accept the recorded warnings.
        """
        boston = h2o.import_file(
            pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))

        with pyunit_utils.catch_warnings() as caught:
            model = estimator(
                stopping_rounds=1,
                stopping_metric="mse",
                score_each_iteration=True,
            )
            model.train(x=list(range(13)), y=13, training_frame=boston)
            assert pyunit_utils.no_warnings(caught)
Beispiel #6
0
def test_lambda_warning():
    """
    Check the GLM warning raised when lambda search meets a user lambda.

    A Gaussian GLM with ``lambda_search=True`` and ``Lambda=[0.01]`` is
    trained on the Boston housing data (columns 0-12 as predictors, column
    13 as the response).  The model is first built and trained once outside
    the warning recorder, then built and trained again inside it; the
    recorded warnings must contain the fragment
    'disabled when user specified any lambda value(s)'.
    """
    boston = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))
    response_index = 13
    predictor_indices = list(range(13))

    def build_glm():
        # A fresh estimator (and a fresh lambda list) for every fit.
        return H2OGeneralizedLinearEstimator(
            family="Gaussian", lambda_search=True, Lambda=[0.01])

    # Initial fit whose warnings are not recorded.
    model = build_glm()
    model.train(x=predictor_indices, y=response_index, training_frame=boston)

    with pyunit_utils.catch_warnings() as caught:
        model = build_glm()
        model.train(x=predictor_indices, y=response_index, training_frame=boston)

        assert pyunit_utils.contains_warning(
            caught, 'disabled when user specified any lambda value(s)')
Beispiel #7
0
def test_binomial_response_warning():
    """
    Check that GBM warns about a numeric response of cardinality two.

    A single-tree GBM is trained on the titanic data with the numeric
    ``survived`` column as the response, and the recorded warnings must
    contain the expected message.  The rows where the response equals 0 are
    then overwritten with -1 and the same check is repeated.
    """
    expected_warning = (
        'Response is numeric, so the regression model will be trained. '
        'However, the cardinality is equaled to two, so if you want to train a '
        'classification model, convert the response column to categorical '
        'before training.'
    )

    titanic = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
    response = "survived"
    predictors = ["name", "sex"]

    def train_and_expect_warning():
        # Train a fresh one-tree GBM and assert while warnings are recorded.
        with pyunit_utils.catch_warnings() as caught:
            gbm = H2OGradientBoostingEstimator(ntrees=1)
            gbm.train(x=predictors, y=response, training_frame=titanic)
            assert pyunit_utils.contains_warning(caught, expected_warning)

    train_and_expect_warning()

    # Replace every 0 in the response by -1, then check again.
    titanic[titanic[response] == 0, response] = -1
    train_and_expect_warning()
def gbm_reweight_tree():
    """
    Exercise ``update_tree_weights`` on a GBM model trained on prostate data.

    Steps:
      1. Train a default GBM and record the prediction contributions,
         checking the contribution column names.
      2. Add a constant ``weights`` column (value 2), update the tree
         weights with the full frame and require the contributions to be
         equal to the recorded ones.
      3. Update the tree weights with only the first 10 rows; the minimum
         ``BiasTerm`` must differ from the original one and a
         ``UserWarning`` mentioning zero-weight nodes must be recorded.
    """
    prostate_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for column in ("RACE", "CAPSULE"):
        prostate_frame[column] = prostate_frame[column].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'
    weights_column = "weights"

    gbm_model = H2OGradientBoostingEstimator()
    gbm_model.train(x=predictors, y=response, training_frame=prostate_frame)

    # Step 1: baseline contributions.
    baseline = gbm_model.predict_contributions(prostate_frame)
    assert baseline.col_names == [
        u'AGE', u'RACE', u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON',
        u'BiasTerm',
    ]

    # Step 2: uniform weights, contributions are expected to be unchanged.
    prostate_frame[weights_column] = 2
    gbm_model.update_tree_weights(prostate_frame, weights_column)
    reweighted = gbm_model.predict_contributions(prostate_frame)
    reweighted_df = reweighted.as_data_frame()
    baseline_df = baseline.as_data_frame()
    assert_frame_equal(reweighted_df, baseline_df)

    def is_zero_weight_warning(warning):
        # A recorded UserWarning whose text reports zero-weight nodes.
        if not issubclass(warning.category, UserWarning):
            return False
        return 'Some of the updated nodes have zero weights' in str(warning.message)

    # Step 3: re-weight on a 10-row subset, contributions are expected to change.
    with pyunit_utils.catch_warnings() as caught:
        subset = prostate_frame.head(10)
        gbm_model.update_tree_weights(subset, weights_column)
        subset_contribs = gbm_model.predict_contributions(subset)
        subset_bias_min = subset_contribs["BiasTerm"].min()
        baseline_bias_min = baseline["BiasTerm"].min()
        assert subset_bias_min != baseline_bias_min
        assert any(is_zero_weight_warning(w) for w in caught)
def xgboost_reweight_tree():
    """
    Exercise ``update_tree_weights`` on an XGBoost model trained on prostate data.

    Steps:
      0. Train a default XGBoost model and keep the printed MOJO.
      1. Record the prediction contributions and check the column names.
      2. Add a constant ``weights`` column, update the tree weights with the
         full frame and require the contributions to match the recorded
         ones (compared with ``check_less_precise=3``).
      3. Update the tree weights with only the first 10 rows; the minimum
         ``BiasTerm`` must differ from the original one and a
         ``UserWarning`` mentioning zero-weight nodes must be recorded.
      4. Print the MOJO again, require it to differ from the original and
         compare the root weight of the first tree in both MOJOs against
         values derived from the row counts.
    """
    prostate_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for column in ("RACE", "CAPSULE"):
        prostate_frame[column] = prostate_frame[column].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'
    weights_column = "weights"

    xgb_model = H2OXGBoostEstimator()
    xgb_model.train(x=predictors, y=response, training_frame=prostate_frame)

    # Step 0: keep the MOJO of the freshly trained model.
    orig_mojo_path = xgb_model.download_mojo()
    orig_mojo_str = h2o.print_mojo(orig_mojo_path)

    # Step 1: baseline contributions.
    baseline = xgb_model.predict_contributions(prostate_frame)
    assert baseline.col_names == [
        u'RACE.0', u'RACE.1', u'RACE.2', u'RACE.missing(NA)', u'AGE', u'DPROS',
        u'DCAPS', u'PSA', u'VOL', u'GLEASON', u'BiasTerm',
    ]

    # Step 2: uniform weights, contributions are expected to be unchanged.
    weights_scale = 2
    prostate_frame[weights_column] = weights_scale
    xgb_model.update_tree_weights(prostate_frame, weights_column)
    reweighted = xgb_model.predict_contributions(prostate_frame)
    reweighted_df = reweighted.as_data_frame()
    baseline_df = baseline.as_data_frame()
    # NOTE(review): pandas deprecated ``check_less_precise`` in 1.1.0 in
    # favour of ``rtol``/``atol`` - confirm against the pandas version this
    # test suite is pinned to.
    assert_frame_equal(reweighted_df, baseline_df, check_less_precise=3)

    def is_zero_weight_warning(warning):
        # A recorded UserWarning whose text reports zero-weight nodes.
        if not issubclass(warning.category, UserWarning):
            return False
        return 'Some of the updated nodes have zero weights' in str(warning.message)

    # Step 3: re-weight on a 10-row subset, contributions are expected to change.
    with pyunit_utils.catch_warnings() as caught:
        prostate_subset = prostate_frame.head(10)
        xgb_model.update_tree_weights(prostate_subset, weights_column)
        subset_contribs = xgb_model.predict_contributions(prostate_subset)
        subset_bias_min = subset_contribs["BiasTerm"].min()
        baseline_bias_min = baseline["BiasTerm"].min()
        assert subset_bias_min != baseline_bias_min
        assert any(is_zero_weight_warning(w) for w in caught)

    # Step 4: MOJO of the re-weighted model.
    reweighted_mojo_path = xgb_model.download_mojo()
    reweighted_mojo_str = h2o.print_mojo(reweighted_mojo_path)

    # Sanity check: re-weighting must be visible in the printed MOJO.
    assert orig_mojo_str != reweighted_mojo_str

    # Root weight of the first tree; the coefficient is p * (1 - p) with
    # p = 1 / (1 + exp(0)).
    init_f = 1 / (1 + math.exp(0))
    hess_coef = init_f * (1 - init_f)

    orig_root = json.loads(orig_mojo_str)["trees"][0]["root"]
    assert orig_root["weight"] == prostate_frame.nrow * hess_coef

    reweighted_root = json.loads(reweighted_mojo_str)["trees"][0]["root"]
    assert reweighted_root["weight"] == prostate_subset.nrow * hess_coef * weights_scale