Example #1
def test_value_error_threshold(dataset):
    with pytest.raises(ValueError):
        GMMOutlierDetector(threshold=10).fit(dataset)
    with pytest.raises(ValueError):
        GMMOutlierDetector(threshold=-10).fit(dataset)
    with pytest.raises(ValueError):
        GMMOutlierDetector(megatondinosaurhead=1).fit(dataset)
    with pytest.raises(ValueError):
        GMMOutlierDetector(method="dinosaurhead").fit(dataset)
    with pytest.raises(ValueError):
        GMMOutlierDetector(threshold=-10, method="stddev").fit(dataset)
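For contrast, here is a minimal sketch of constructions that should not raise, assuming the validation implied by the test above (quantile thresholds lie in (0, 1] and stddev thresholds must be strictly positive) and the usual sklego import path:

import numpy as np
from sklego.mixture import GMMOutlierDetector

dataset = np.random.normal(0, 1, (2000, 2))
GMMOutlierDetector(threshold=0.99, method="quantile").fit(dataset)  # quantile threshold in (0, 1]
GMMOutlierDetector(threshold=2, method="stddev").fit(dataset)       # positive stddev multiplier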
Example #2
def test_estimator_checks(test_fn):
    clf_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    test_fn(GMMOutlierDetector.__name__ + "_quantile", clf_quantile)

    clf_stddev = GMMOutlierDetector(threshold=2, method="stddev")
    test_fn(GMMOutlierDetector.__name__ + "_stddev", clf_stddev)

    bayes_clf_quantile = BayesianGMMOutlierDetector(threshold=0.999, method="quantile")
    test_fn(BayesianGMMOutlierDetector.__name__ + "_quantile", bayes_clf_quantile)

    bayes_clf_stddev = BayesianGMMOutlierDetector(threshold=2, method="stddev")
    test_fn(BayesianGMMOutlierDetector.__name__ + "_stddev", bayes_clf_stddev)
Example #3
def test_estimator_checks(test_fn):
    gmm_remover = OutlierRemover(outlier_detector=GMMOutlierDetector(),
                                 refit=True)
    test_fn(OutlierRemover.__name__, gmm_remover)

    isolation_forest_remover = OutlierRemover(
        outlier_detector=IsolationForest(), refit=True)
    test_fn(OutlierRemover.__name__, isolation_forest_remover)
Example #4
def create_outlier_detector(data,
                            component_count=1,
                            covariance_type='full',
                            init_params='kmeans',
                            max_iteration_count=DEFAULT_MAX_ITERATION_COUNT,
                            method='quantile',
                            threshold=DEFAULT_CONFIDENCE_LEVEL):
    """Creates a detector based on a Gaussian mixture with the specified number of components and
	fits the specified data with the expectation-maximization (EM) algorithm. Note that the
	variational inference model is using all the components."""
    model = GMMOutlierDetector(n_components=component_count,
                               covariance_type=covariance_type,
                               init_params=init_params,
                               max_iter=max_iteration_count,
                               method=method,
                               threshold=threshold)
    return model.fit(data)
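A hypothetical call sketch for the helper above; the dataset and argument values are illustrative assumptions, not part of the original module:

import numpy as np

data = np.random.normal(0, 1, (1000, 2))
detector = create_outlier_detector(data,
                                   component_count=2,
                                   max_iteration_count=100,
                                   method="stddev",
                                   threshold=2)
flags = detector.predict(data)  # the helper returns the already-fitted GMMOutlierDetector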
Example #5
def test_obvious_usecase_quantile(dataset):
    mod_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    clf_quantile = OutlierClassifier(mod_quantile)
    X = dataset
    y = (dataset.max(axis=1) > 3).astype(int)
    clf_quantile.fit(X, y)
    assert clf_quantile.predict([[10, 10]]) == np.array([1])
    assert clf_quantile.predict([[0, 0]]) == np.array([0])
    assert isinstance(clf_quantile.score(X, y), float)
Example #6
def test_pipeline_integration():
    np.random.seed(42)
    dataset = np.concatenate([np.random.normal(0, 1, (2000, 2))])
    isolation_forest_remover = OutlierRemover(outlier_detector=IsolationForest())
    gmm_remover = OutlierRemover(outlier_detector=GMMOutlierDetector())
    pipeline = Pipeline([
        ("isolation_forest_remover", isolation_forest_remover),
        ('gmm_remover', gmm_remover),
        ('kmeans', KMeans())])
    pipeline.fit(dataset)
    pipeline.transform(dataset)
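A follow-up note, sketched under the assumption that OutlierRemover drops the rows its detector flags when transforming the training data: the transformed array can end up with fewer rows than the input, while KMeans.transform contributes one distance column per cluster.

transformed = pipeline.transform(dataset)  # rows flagged as outliers may be removed before KMeans.transform
assert transformed.shape[0] <= dataset.shape[0]
assert transformed.shape[1] == pipeline.named_steps["kmeans"].n_clusters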
Example #7

@pytest.mark.parametrize("estimator", [
    RandomRegressor(strategy="uniform"),
    RandomRegressor(strategy="normal"),
    DeadZoneRegressor(effect="linear", n_iter=100),
    DeadZoneRegressor(effect="quadratic", n_iter=100),
],
                         ids=id_func)
def test_shape_regression(estimator, random_xy_dataset_regr):
    X, y = random_xy_dataset_regr
    assert estimator.fit(X, y).predict(X).shape[0] == y.shape[0]
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', estimator)])
    assert pipe.fit(X, y).predict(X).shape[0] == y.shape[0]


@pytest.mark.parametrize("estimator", [
    GMMClassifier(),
    BayesianGMMClassifier(),
    GMMOutlierDetector(threshold=0.999, method="quantile"),
    GMMOutlierDetector(threshold=2, method="stddev"),
    BayesianGMMOutlierDetector(threshold=0.999, method="quantile"),
    BayesianGMMOutlierDetector(threshold=2, method="stddev")
],
                         ids=id_func)
def test_shape_classification(estimator, random_xy_dataset_clf):
    X, y = random_xy_dataset_clf
    assert estimator.fit(X, y).predict(X).shape[0] == y.shape[0]
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', estimator)])
    assert pipe.fit(X, y).predict(X).shape[0] == y.shape[0]
Example #8
def test_estimator_checks(test_fn):
    mod_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    clf_quantile = OutlierClassifier(mod_quantile)
    test_fn('OutlierClassifier', clf_quantile)
Example #9
def test_estimator_checks(test_fn):
    mod_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    clf_quantile = OutlierClassifier(mod_quantile)
    test_fn('OutlierClassifier', clf_quantile)


@pytest.fixture
def dataset():
    np.random.seed(42)
    return np.random.normal(0, 1, (2000, 2))


@pytest.mark.parametrize(
    'outlier_model',
    [GMMOutlierDetector(),
     OneClassSVM(nu=0.05),
     IsolationForest()])
def test_obvious_usecase(dataset, outlier_model):
    outlier_clf = OutlierClassifier(outlier_model)
    X = dataset
    y = (dataset.max(axis=1) > 3).astype(int)
    outlier_clf.fit(X, y)
    assert outlier_clf.predict([[10, 10]]) == np.array([1])
    assert outlier_clf.predict([[0, 0]]) == np.array([0])
    np.testing.assert_array_almost_equal(outlier_clf.predict_proba([[0, 0]]),
                                         np.array([[1, 0]]),
                                         decimal=3)
    np.testing.assert_allclose(outlier_clf.predict_proba([[10, 10]]),
                               np.array([[0, 1]]),
                               atol=0.2)
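A small usage sketch inferred from the assertions above (not from separate documentation): class 1 marks outliers, so the second column of predict_proba can be read as the outlier probability.

proba = outlier_clf.predict_proba([[10, 10]])
outlier_probability = proba[0, 1]  # close to 1 for an obvious outlier, per the assertion above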
Example #10
def test_thresh_effect_quantile(dataset):
    mod1 = GMMOutlierDetector(threshold=0.90, method="quantile").fit(dataset)
    mod2 = GMMOutlierDetector(threshold=0.95, method="quantile").fit(dataset)
    mod3 = GMMOutlierDetector(threshold=0.99, method="quantile").fit(dataset)
    assert mod1.predict(dataset).sum() > mod2.predict(dataset).sum()
    assert mod2.predict(dataset).sum() > mod3.predict(dataset).sum()
Example #11
def test_thresh_effect_stddev(dataset):
    mod1 = GMMOutlierDetector(threshold=1, method="stddev").fit(dataset)
    mod2 = GMMOutlierDetector(threshold=2, method="stddev").fit(dataset)
    mod3 = GMMOutlierDetector(threshold=3, method="stddev").fit(dataset)
    assert mod1.predict(dataset).sum() > mod2.predict(dataset).sum()
    assert mod2.predict(dataset).sum() > mod3.predict(dataset).sum()
Example #12
def test_obvious_usecase_stddev(dataset):
    mod = GMMOutlierDetector(n_components=2, threshold=2, method="stddev").fit(dataset)
    assert mod.predict([[10, 10], [-10, -10]]).all()
    assert (mod.predict([[0, 0]]) == np.array([-1])).all()
Example #13
def test_estimator_checks(test_fn):
    clf_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    test_fn(GMMOutlierDetector.__name__ + '_quantile', clf_quantile)

    clf_stddev = GMMOutlierDetector(threshold=2, method="stddev")
    test_fn(GMMOutlierDetector.__name__ + '_stddev', clf_stddev)