Ejemplo n.º 1
0
def test_explanation_list_of_models_binomial_classification():
    """Check the explanation helpers when given a plain list of models.

    Uploads ``smalldata/logreg/prostate.csv``, turns ``CAPSULE`` into a
    factor, trains an AutoML run (at most 5 models) plus one explicitly named
    GBM, and asserts that every explanation helper called with the resulting
    list of models returns an object of the expected type.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Get at most one column from each type.  The response is skipped
    # unconditionally so it can never be selected as a predictor, even if it
    # happens to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # Resolve every leaderboard entry to its model object; each row of the
    # header-less, non-pandas frame dump carries the model id in position 0.
    leaderboard_rows = aml.leaderboard["model_id"].as_data_frame(
        use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_rows]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models.append(gbm)

    # test variable importance heatmap plot
    assert isinstance(
        h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(
        h2o.model_correlation_heatmap(models, train).figure(),
        matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(
            h2o.pd_multi_plot(models, train, col).figure(),
            matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test learning curve
    for model in models:
        assert isinstance(model.learning_curve_plot().figure(),
                          matplotlib.pyplot.Figure)
    # One figure is opened per model above; release them all in one go.
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(models, train, 1, render=False),
                      H2OExplanation)
Ejemplo n.º 2
0
def test_explanation_automl_multinomial_classification():
    """Check the explanation helpers on an AutoML run for a multinomial target.

    Uploads ``smalldata/iris/iris2.csv``, turns ``response`` into a factor,
    trains an AutoML run (at most 5 models) and asserts that the explanation
    methods of the AutoML object, as well as ``h2o.explain`` /
    ``h2o.explain_row`` on a leaderboard slice, return the expected types.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris2.csv"))
    y = "response"
    train[y] = train[y].asfactor()

    # Get at most one column from each type.  The response is skipped
    # unconditionally so it can never be selected as a predictor, even if it
    # happens to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order
    # => represented by just one vector
    assert len(aml.model_correlation(train, use_pandas=False)) == 2
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(aml.pd_multi_plot(train, col, target="setosa").figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # Leaderboard slices work: keep only the rows whose model id does not
    # start with "Stacked" and feed that frame to the module-level helpers.
    is_stacked = aml.leaderboard["model_id"].grep("^Stacked", output_logical=True)
    leaderboard_without_SE = aml.leaderboard[~is_stacked, :]

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
Ejemplo n.º 3
0
def test_explanation_list_of_models_multinomial_classification():
    """Check the explanation helpers on a list of multinomial models.

    Uploads ``smalldata/iris/iris2.csv``, turns ``response`` into a factor,
    trains an AutoML run (at most 5 models), resolves the leaderboard into a
    list of model objects and asserts that every explanation helper called
    with that list returns an object of the expected type.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris2.csv"))
    y = "response"
    train[y] = train[y].asfactor()

    # Get at most one column from each type.  The response is skipped
    # unconditionally so it can never be selected as a predictor, even if it
    # happens to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # Each row of the header-less, non-pandas frame dump carries the model id
    # in position 0.
    leaderboard_rows = aml.leaderboard["model_id"].as_data_frame(
        use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_rows]

    # NOTE(review): the plot helpers are unwrapped with ``.figure()`` here, the
    # same way the other explanation tests in this file do it; this assumes
    # the h2o version under test returns wrapped plot results -- confirm.

    # test variable importance heatmap plot
    assert isinstance(h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(models, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(h2o.pd_multi_plot(models, train, col, target="setosa").figure(),
                          matplotlib.pyplot.Figure)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(models, train, 1, render=False), H2OExplanation)
Ejemplo n.º 4
0
def test_explanation_automl_binomial_classification():
    """Check the explanation helpers on an AutoML run for a binomial target.

    Uploads ``smalldata/logreg/prostate.csv``, turns ``CAPSULE`` into a
    factor, trains an AutoML run (at most 5 models) and asserts the return
    types of the explanation methods, first on the AutoML object itself and
    then on a leaderboard slice that drops the rows whose model id starts
    with "Stacked".
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Get at most one column from each type.  The response is skipped
    # unconditionally so it can never be selected as a predictor, even if it
    # happens to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test that num_of_features is propagated: the first axes of the heatmap
    # must carry exactly one y tick per requested feature
    for n_features in [1, 3, 5]:
        heatmap_axes = aml.varimp_heatmap(num_of_features=n_features).figure().get_axes()[0]
        assert n_features == len(heatmap_axes.get_yticks())
        matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order
    # => represented by just one vector
    assert len(aml.model_correlation(train, use_pandas=False)) == 2
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(aml.pd_multi_plot(train, col).figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # Leaderboard slices work
    is_stacked = aml.leaderboard["model_id"].grep("^Stacked", output_logical=True)
    leaderboard_without_SE = aml.leaderboard[~is_stacked, :]

    # test variable importance heatmap plot -- on the slice itself, so that
    # this section really covers leaderboard input instead of repeating the
    # AutoML-object check made earlier in this test
    assert isinstance(h2o.varimp_heatmap(leaderboard_without_SE).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(leaderboard_without_SE, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order
    # => represented by just one vector
    assert len(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=False)) == 2
    assert isinstance(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    assert isinstance(h2o.pd_multi_plot(leaderboard_without_SE, train, cols_to_test[0]).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
Ejemplo n.º 5
0
def test_explanation_automl_regression():
    """Check the explanation helpers on an AutoML run for a regression target.

    Uploads ``smalldata/titanic/titanic_expanded.csv``, converts ``name`` to a
    factor, trains an AutoML run (at most 5 models) predicting ``fare`` and
    asserts the return types of the explanation methods.  It also verifies
    that ``_shorten_model_ids`` keeps ids unique while making each of them
    strictly shorter, and that leaderboard slices are accepted by
    ``h2o.explain`` / ``h2o.explain_row``.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    # Get at most one column from each type.  The response is skipped
    # unconditionally so it can never be selected as a predictor, even if it
    # happens to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order
    # => represented by just one vector
    assert len(aml.model_correlation(train, use_pandas=False)) == 2
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences; a ValueError is tolerated only for "name"
    for col in cols_to_test:
        try:
            pd_figure = aml.pd_multi_plot(train, col).figure()
        except ValueError:
            assert col == "name", "'name' is a string column which is not supported."
        else:
            assert isinstance(pd_figure, matplotlib.pyplot.Figure)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # test shortening model ids work correctly: ids stay unique and every
    # shortened id is strictly shorter than the id it was derived from
    from h2o.explanation._explain import _shorten_model_ids
    model_ids = aml.leaderboard.as_data_frame()["model_id"]
    shortened_model_ids = _shorten_model_ids(model_ids)
    assert len(set(model_ids)) == len(set(shortened_model_ids))
    for model_id, shortened_id in zip(model_ids, shortened_model_ids):
        assert len(model_id) > len(shortened_id)

    # Leaderboard slices work: keep only the rows whose model id does not
    # start with "Stacked" and feed that frame to the module-level helpers.
    is_stacked = aml.leaderboard["model_id"].grep("^Stacked", output_logical=True)
    leaderboard_without_SE = aml.leaderboard[~is_stacked, :]

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)