def test_value_error_threshold(dataset):
    """Check that misconfigured ``GMMOutlierDetector`` instances raise ``ValueError``.

    Each entry in ``rejected_configs`` is a set of constructor keyword
    arguments; building the detector with them and calling ``fit`` on
    ``dataset`` is expected to raise ``ValueError``.
    """
    rejected_configs = [
        dict(threshold=10),
        dict(threshold=-10),
        dict(megatondinosaurhead=1),
        dict(method="dinosaurhead"),
        dict(threshold=-10, method="stddev"),
    ]
    for config in rejected_configs:
        # Construction happens inside the ``raises`` block, so an error from
        # either the constructor or ``fit`` satisfies the expectation.
        with pytest.raises(ValueError):
            GMMOutlierDetector(**config).fit(dataset)
def test_estimator_checks(test_fn):
    """Run ``test_fn`` against each GMM-based outlier detector configuration.

    ``test_fn`` is called once per case with a display name (the class name
    plus a method suffix) and a freshly constructed, unfitted detector.
    """
    cases = [
        (GMMOutlierDetector, "_quantile", 0.999, "quantile"),
        (GMMOutlierDetector, "_stddev", 2, "stddev"),
        (BayesianGMMOutlierDetector, "_quantile", 0.999, "quantile"),
        (BayesianGMMOutlierDetector, "_stddev", 2, "stddev"),
    ]
    for detector_cls, suffix, threshold, method in cases:
        detector = detector_cls(threshold=threshold, method=method)
        test_fn(detector_cls.__name__ + suffix, detector)
def test_estimator_checks(test_fn):
    """Run ``test_fn`` against ``OutlierRemover`` wrapped around two detectors.

    Both removers are created with ``refit=True`` and both are reported under
    the same name, ``OutlierRemover.__name__``.
    """
    for detector_cls in (GMMOutlierDetector, IsolationForest):
        # Instantiate the detector only when its turn comes, one case at a time.
        remover = OutlierRemover(outlier_detector=detector_cls(), refit=True)
        test_fn(OutlierRemover.__name__, remover)
def create_outlier_detector(data, component_count=1, covariance_type='full',
                            init_params='kmeans',
                            max_iteration_count=DEFAULT_MAX_ITERATION_COUNT,
                            method='quantile',
                            threshold=DEFAULT_CONFIDENCE_LEVEL):
    """Create a Gaussian-mixture outlier detector and fit it to ``data``.

    The arguments are forwarded to ``GMMOutlierDetector``:
    ``component_count`` becomes ``n_components``, ``max_iteration_count``
    becomes ``max_iter``, and ``covariance_type``, ``init_params``, ``method``
    and ``threshold`` are passed under their own names.

    Returns the result of ``GMMOutlierDetector.fit(data)``.

    Note: per the original author's remark, the variational inference model
    uses all the components. NOTE(review): this function builds
    ``GMMOutlierDetector``; confirm that the remark applies here.
    """
    settings = {
        'n_components': component_count,
        'covariance_type': covariance_type,
        'init_params': init_params,
        'max_iter': max_iteration_count,
        'method': method,
        'threshold': threshold,
    }
    detector = GMMOutlierDetector(**settings)
    return detector.fit(data)
def test_obvious_usecase_quantile(dataset):
    """Check ``OutlierClassifier`` on a quantile-based GMM detector.

    Rows of ``dataset`` whose largest coordinate exceeds 3 are labelled 1 and
    all other rows 0. After fitting, the point ``[10, 10]`` must be predicted
    as 1, ``[0, 0]`` as 0, and ``score`` must return a ``float``.
    """
    mod_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    clf_quantile = OutlierClassifier(mod_quantile)
    X = dataset
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24, so the builtin is used to keep the test running.
    y = (dataset.max(axis=1) > 3).astype(int)
    clf_quantile.fit(X, y)
    assert clf_quantile.predict([[10, 10]]) == np.array([1])
    assert clf_quantile.predict([[0, 0]]) == np.array([0])
    assert isinstance(clf_quantile.score(X, y), float)
def test_pipeline_integration():
    """Smoke-test two ``OutlierRemover`` steps followed by ``KMeans`` in a pipeline.

    The test contains no assertions: it passes when fitting the pipeline and
    transforming the same seeded random data complete without raising.
    """
    np.random.seed(42)
    samples = np.concatenate([np.random.normal(0, 1, (2000, 2))])
    steps = [
        ("isolation_forest_remover",
         OutlierRemover(outlier_detector=IsolationForest())),
        ('gmm_remover', OutlierRemover(outlier_detector=GMMOutlierDetector())),
        ('kmeans', KMeans()),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(samples)
    pipeline.transform(samples)
@pytest.mark.parametrize("estimator", [
    RandomRegressor(strategy="uniform"),
    RandomRegressor(strategy="normal"),
    DeadZoneRegressor(effect="linear", n_iter=100),
    DeadZoneRegressor(effect="quadratic", n_iter=100),
], ids=id_func)
def test_shape_regression(estimator, random_xy_dataset_regr):
    """Check that regressors predict one value per target row.

    The check runs twice: on the bare estimator and on the same estimator
    placed after a ``StandardScaler`` in a ``Pipeline``.
    """
    X, y = random_xy_dataset_regr
    direct_predictions = estimator.fit(X, y).predict(X)
    assert direct_predictions.shape[0] == y.shape[0]
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', estimator)])
    piped_predictions = pipe.fit(X, y).predict(X)
    assert piped_predictions.shape[0] == y.shape[0]


@pytest.mark.parametrize("estimator", [
    GMMClassifier(),
    BayesianGMMClassifier(),
    GMMOutlierDetector(threshold=0.999, method="quantile"),
    GMMOutlierDetector(threshold=2, method="stddev"),
    BayesianGMMOutlierDetector(threshold=0.999, method="quantile"),
    BayesianGMMOutlierDetector(threshold=2, method="stddev")
], ids=id_func)
def test_shape_classification(estimator, random_xy_dataset_clf):
    """Check that classifiers and detectors predict one value per target row.

    The check runs twice: on the bare estimator and on the same estimator
    placed after a ``StandardScaler`` in a ``Pipeline``.
    """
    X, y = random_xy_dataset_clf
    direct_predictions = estimator.fit(X, y).predict(X)
    assert direct_predictions.shape[0] == y.shape[0]
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', estimator)])
    piped_predictions = pipe.fit(X, y).predict(X)
    assert piped_predictions.shape[0] == y.shape[0]
def test_estimator_checks(test_fn):
    """Run ``test_fn`` against an ``OutlierClassifier`` wrapping a GMM detector.

    The wrapped detector uses the quantile method with a 0.999 threshold and
    is passed to ``test_fn`` under the name ``'OutlierClassifier'``.
    """
    test_fn(
        'OutlierClassifier',
        OutlierClassifier(
            GMMOutlierDetector(threshold=0.999, method="quantile")
        ),
    )
])) def test_estimator_checks(test_fn): mod_quantile = GMMOutlierDetector(threshold=0.999, method="quantile") clf_quantile = OutlierClassifier(mod_quantile) test_fn('OutlierClassifier', clf_quantile) @pytest.fixture def dataset(): np.random.seed(42) return np.random.normal(0, 1, (2000, 2)) @pytest.mark.parametrize( 'outlier_model', [GMMOutlierDetector(), OneClassSVM(nu=0.05), IsolationForest()]) def test_obvious_usecase(dataset, outlier_model): outlier_clf = OutlierClassifier(outlier_model) X = dataset y = (dataset.max(axis=1) > 3).astype(np.int) outlier_clf.fit(X, y) assert outlier_clf.predict([[10, 10]]) == np.array([1]) assert outlier_clf.predict([[0, 0]]) == np.array([0]) np.testing.assert_array_almost_equal(outlier_clf.predict_proba([[0, 0]]), np.array([[1, 0]]), decimal=3) np.testing.assert_allclose(outlier_clf.predict_proba([[10, 10]]), np.array([[0, 1]]), atol=0.2)
def test_thresh_effect_quantile(dataset):
    """Check that a higher quantile threshold lowers the summed predictions.

    Detectors are fitted with thresholds 0.90, 0.95 and 0.99; for each
    neighbouring pair, the sum of predictions on ``dataset`` must be strictly
    larger for the lower threshold.
    """
    fitted = [
        GMMOutlierDetector(threshold=level, method="quantile").fit(dataset)
        for level in (0.90, 0.95, 0.99)
    ]
    for lower, higher in zip(fitted, fitted[1:]):
        assert lower.predict(dataset).sum() > higher.predict(dataset).sum()
def test_thresh_effect_stddev(dataset):
    """Check that a higher stddev threshold lowers the summed predictions.

    Detectors are fitted with thresholds 1, 2 and 3; for each neighbouring
    pair, the sum of predictions on ``dataset`` must be strictly larger for
    the lower threshold.
    """
    fitted = [
        GMMOutlierDetector(threshold=level, method="stddev").fit(dataset)
        for level in (1, 2, 3)
    ]
    for lower, higher in zip(fitted, fitted[1:]):
        assert lower.predict(dataset).sum() > higher.predict(dataset).sum()
def test_obvious_usecase_stddev(dataset):
    """Check stddev-based predictions for two far-away points and the origin.

    A two-component detector with a stddev threshold of 2 is fitted on
    ``dataset``; the prediction for ``[0, 0]`` must equal ``-1``.
    """
    detector = GMMOutlierDetector(n_components=2, threshold=2, method="stddev")
    fitted = detector.fit(dataset)
    far_points = [[10, 10], [-10, -10]]
    # NOTE(review): ``.all()`` only verifies that every predicted label is
    # non-zero; confirm this is the intended check for the far points.
    assert fitted.predict(far_points).all()
    origin_label = fitted.predict([[0, 0]])
    assert (origin_label == np.array([-1])).all()
def test_estimator_checks(test_fn):
    """Run ``test_fn`` against both ``GMMOutlierDetector`` threshold methods.

    ``test_fn`` is called once per case with the class name plus a method
    suffix and a freshly constructed, unfitted detector.
    """
    cases = [
        ('_quantile', 0.999, "quantile"),
        ('_stddev', 2, "stddev"),
    ]
    for suffix, threshold, method in cases:
        detector = GMMOutlierDetector(threshold=threshold, method=method)
        test_fn(GMMOutlierDetector.__name__ + suffix, detector)