Example #1
0
def test_iforest_average_path_length():
    """Non-regression test for #8549 (wrong average path length formula).

    Only integer inputs are exercised, first one at a time as scalars and
    then packed together into a single array.
    """
    # Reference values written out by hand for inputs 5 and 999.
    expected_for_5 = 2. * (np.log(4.) + euler_gamma) - 2. * 4. / 5.
    expected_for_999 = 2. * (np.log(998.) + euler_gamma) - 2. * 998. / 999.

    # Scalar inputs, checked in the same order as they appear in the array.
    scalar_cases = ((1, 1.), (5, expected_for_5), (999, expected_for_999))
    for n_samples, expected in scalar_cases:
        assert_almost_equal(_average_path_length(n_samples), expected,
                            decimal=10)

    # The same inputs passed in one call as an array.
    sizes = np.array([1, 5, 999])
    assert_array_almost_equal(_average_path_length(sizes),
                              [1., expected_for_5, expected_for_999],
                              decimal=10)
def test_iforest_average_path_length():
    """Guard against a regression of #8549 (wrong average path length).

    Restricted to integer inputs. Each input is checked as a scalar, then
    all of them are checked again through a single array call.
    """
    inputs = [1, 5, 999]
    references = [
        1.,
        2. * (np.log(4.) + euler_gamma) - 2. * 4. / 5.,
        2. * (np.log(998.) + euler_gamma) - 2. * 998. / 999.,
    ]

    # One scalar call per input.
    for size, reference in zip(inputs, references):
        assert_almost_equal(_average_path_length(size), reference,
                            decimal=10)

    # One array call covering every input.
    assert_array_almost_equal(_average_path_length(np.array(inputs)),
                              references, decimal=10)
def test_isolation_forest():
    """Check that IsolationForest scores can be rebuilt from SHAP values.

    For each sample, the SHAP values are summed and added to the explainer's
    expected value. That total is divided by the average path length for
    ``max_samples_`` and mapped through ``-2 ** (-x)``. The result must
    agree with ``score_samples`` to within ``atol=1e-7``.
    """
    import shap
    import numpy as np
    from sklearn.ensemble import IsolationForest
    from sklearn.ensemble.iforest import _average_path_length

    features, _ = shap.datasets.boston()
    forest = IsolationForest(behaviour='new', contamination='auto')
    forest.fit(features)

    tree_explainer = shap.TreeExplainer(forest)
    contributions = tree_explainer.shap_values(features)

    # Per-sample total: summed SHAP values plus the explainer's baseline.
    per_sample_total = (np.sum(contributions, axis=1)
                        + tree_explainer.expected_value)
    normaliser = _average_path_length(np.array([forest.max_samples_]))[0]
    rebuilt_scores = -(2 ** (-per_sample_total / normaliser))

    expected_scores = forest.score_samples(features)
    assert np.allclose(expected_scores, rebuilt_scores, atol=1e-7)
Example #4
0
def test_iforest_average_path_length():
    """Non-regression test for #8549 (wrong average path length formula).

    Strictly covers the integer case. Also checks the values returned for
    inputs of at most 2 (issue #11839) and that the result does not
    decrease over ``np.arange(5)``.
    """
    expected_for_5 = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
    expected_for_999 = (2. * (np.log(998.) + np.euler_gamma)
                        - 2. * 998. / 999.)

    # Small inputs (<= 2) have fixed expected values.
    for n_samples, expected in ((0, 0), (1, 0), (2, 1)):
        assert _average_path_length(n_samples) == pytest.approx(expected)

    # Larger scalar inputs are compared against the hand-written references.
    assert_allclose(_average_path_length(5), expected_for_5)
    assert_allclose(_average_path_length(999), expected_for_999)

    # Array input must give the same values element-wise.
    batched = _average_path_length(np.array([1, 2, 5, 999]))
    assert_allclose(batched, [0., 1., expected_for_5, expected_for_999])

    # Sorting must leave the output unchanged, i.e. it is already ordered.
    path_lengths = _average_path_length(np.arange(5))
    assert_array_equal(path_lengths, np.sort(path_lengths))
def test_iforest_average_path_length():
    """Non-regression test for #8549 (wrong average path length formula).

    Strictly covers the integer case. Also checks the values returned for
    inputs of at most 2 (issue #11839) and that the result does not
    decrease over ``np.arange(5)``.
    """
    expected_for_5 = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    expected_for_999 = (
        2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
    )

    # Each input is wrapped in a one-element list, as is each expectation.
    singleton_cases = (
        (0, 0.0),
        (1, 0.0),
        (2, 1.0),
        (5, expected_for_5),
        (999, expected_for_999),
    )
    for n_samples, expected in singleton_cases:
        assert_allclose(_average_path_length([n_samples]), [expected])

    # Several inputs in one array call.
    batched = _average_path_length(np.array([1, 2, 5, 999]))
    assert_allclose(batched, [0.0, 1.0, expected_for_5, expected_for_999])

    # Sorting must leave the output unchanged, i.e. it is already ordered.
    path_lengths = _average_path_length(np.arange(5))
    assert_array_equal(path_lengths, np.sort(path_lengths))
Example #6
0
def test_iforest_average_path_length():
    """Guard against a regression of #8549 (wrong average path length).

    Integer inputs only. Includes the small-input values (<= 2) from issue
    #11839 and a check that the output over ``np.arange(5)`` is already in
    sorted order.
    """
    sizes = [0, 1, 2, 5, 999]
    references = [
        0.0,
        0.0,
        1.0,
        2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0,
        2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0,
    ]

    # One call per input, each passed as a one-element list.
    for size, reference in zip(sizes, references):
        assert_allclose(_average_path_length([size]), [reference])

    # A single array call; input 0 is left out of this check.
    assert_allclose(
        _average_path_length(np.array(sizes[1:])),
        references[1:],
    )

    # _average_path_length is increasing
    lengths = _average_path_length(np.arange(5))
    assert_array_equal(lengths, np.sort(lengths))
Example #7
0
def test_pyod_isolation_forest():
    """Check that PyOD IForest scores can be rebuilt from SHAP values.

    Runs once for each ``max_features`` setting (1.0 and 0.75). For each
    sample, the SHAP values are summed and added to the explainer's expected
    value. That total is divided by the average path length for
    ``max_samples_`` and mapped through ``-2 ** (-x)``. The result must
    agree with ``detector_.score_samples`` to within ``atol=1e-7``.
    """
    import shap
    import numpy as np
    from pyod.models.iforest import IForest
    from sklearn.ensemble.iforest import _average_path_length

    features, _ = shap.datasets.boston()
    for feature_fraction in [1.0, 0.75]:
        detector = IForest(max_features=feature_fraction)
        detector.fit(features)

        tree_explainer = shap.TreeExplainer(detector)
        contributions = tree_explainer.shap_values(features)

        # Per-sample total: summed SHAP values plus the explainer's baseline.
        per_sample_total = (np.sum(contributions, axis=1)
                            + tree_explainer.expected_value)
        normaliser = _average_path_length(
            np.array([detector.max_samples_]))[0]
        rebuilt_scores = -(2 ** (-per_sample_total / normaliser))

        # Reference scores come from the fitted model exposed as ``detector_``.
        reference_scores = detector.detector_.score_samples(features)
        assert np.allclose(reference_scores, rebuilt_scores, atol=1e-7)