Example #1
    def test_check_univariate(self):
        with assert_raises(ValueError):
            MAD().fit(X=[[0.0, 0.0],
                         [0.0, 0.0]])
        with assert_raises(ValueError):
            MAD().decision_function(X=[[0.0, 0.0],
                                       [0.0, 0.0]])
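
(Note: the snippets on this page are extracted from larger PyOD test and example files and omit their imports. A self-contained version of the test-style examples would need roughly the following; exact module paths may differ across PyOD versions, so treat this as a sketch.)

import unittest

import numpy as np
from numpy.testing import (assert_allclose, assert_array_less, assert_equal,
                           assert_raises)
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score

from pyod.models.mad import MAD
from pyod.utils.data import generate_data, evaluate_print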
Example #2
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42)

        self.clf = MAD()
        self.clf.fit(self.X_train)
Example #3
    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        with assert_raises(TypeError):
            MAD(threshold='str')
Example #4
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        # generate data and fit model without missing or infinite values:
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42)
        self.clf = MAD()
        self.clf.fit(self.X_train)
        # generate data and fit model with missing value:
        self.X_train_nan, self.X_test_nan, self.y_train_nan, self.y_test_nan = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42,
            n_nan=1)
        self.clf_nan = MAD()
        self.clf_nan.fit(self.X_train_nan)
        # generate data and fit model with infinite value:
        self.X_train_inf, self.X_test_inf, self.y_train_inf, self.y_test_inf = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42,
            n_inf=1)
        self.clf_inf = MAD()
        self.clf_inf.fit(self.X_train_inf)
Example #5
def pyod_anomaly_detection(type, contamination):
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)
    if type == 'MAD':
        # train MAD detector
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        # the data is univariate; stack the single feature twice to make it 2-D for visualization only
        visualize(clf_name,
                  np.hstack((X_train, X_train)),
                  y_train,
                  np.hstack((X_test, X_test)),
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'ABOD':
        # train ABOD detector
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        visualize(clf_name,
                  X_train,
                  y_train,
                  X_test,
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'AutoEncoder':
        # train AutoEncoder detector
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
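
A call into the dispatcher above might look like the lines below. Note that data(...) is a helper defined elsewhere in the originating project (not part of PyOD), and the import paths for ABOD, AutoEncoder, and visualize are the usual PyOD locations, assumed here rather than shown in the snippet.

import numpy as np

from pyod.models.abod import ABOD
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.example import visualize  # used for the 2-D scatter plots

pyod_anomaly_detection(type='MAD', contamination=0.1)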
Example #6
class TestMAD(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        # generate data and fit model without missing or infinite values:
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42)
        self.clf = MAD()
        self.clf.fit(self.X_train)
        # generate data and fit model with missing value:
        self.X_train_nan, self.X_test_nan, self.y_train_nan, self.y_test_nan = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42,
            n_nan=1)
        self.clf_nan = MAD()
        self.clf_nan.fit(self.X_train_nan)
        # generate data and fit model with infinite value:
        self.X_train_inf, self.X_test_inf, self.y_train_inf, self.y_test_inf = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42,
            n_inf=1)
        self.clf_inf = MAD()
        self.clf_inf.fit(self.X_train_inf)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        with assert_raises(TypeError):
            MAD(threshold='str')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_with_nan(self):
        pred_labels = self.clf_nan.fit_predict(self.X_train_nan)
        assert_equal(pred_labels.shape, self.y_train_nan.shape)

    def test_fit_predict_with_inf(self):
        pred_labels = self.clf_inf.fit_predict(self.X_train_inf)
        assert_equal(pred_labels.shape, self.y_train_inf.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_with_nan(self):
        pred_scores = self.clf_nan.decision_function(self.X_test_nan)
        pred_ranks = self.clf_nan._predict_rank(self.X_test_nan)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train_nan.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_with_inf(self):
        pred_scores = self.clf_inf.decision_function(self.X_test_inf)
        pred_ranks = self.clf_inf._predict_rank(self.X_test_inf)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train_inf.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized_with_nan(self):
        pred_scores = self.clf_nan.decision_function(self.X_test_nan)
        pred_ranks = self.clf_nan._predict_rank(self.X_test_nan, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized_with_inf(self):
        pred_scores = self.clf_inf.decision_function(self.X_test_inf)
        pred_ranks = self.clf_inf._predict_rank(self.X_test_inf, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_check_univariate(self):
        with assert_raises(ValueError):
            MAD().fit(X=[[0.0, 0.0],
                         [0.0, 0.0]])
        with assert_raises(ValueError):
            MAD().decision_function(X=[[0.0, 0.0],
                                       [0.0, 0.0]])

    def test_detect_anomaly(self):
        X_test = [[10000]]
        score = self.clf.decision_function(X_test)
        anomaly = self.clf.predict(X_test)
        self.assertGreaterEqual(score[0], self.clf.threshold_)
        self.assertEqual(anomaly[0], 1)

    def test_detect_anomaly_with_nan(self):
        X_test = [[10000]]
        score = self.clf_nan.decision_function(X_test)
        anomaly = self.clf_nan.predict(X_test)
        self.assertGreaterEqual(score[0], self.clf_nan.threshold_)
        self.assertEqual(anomaly[0], 1)

    def test_detect_anomaly_with_inf(self):
        X_test = [[10000]]
        score = self.clf_inf.decision_function(X_test)
        anomaly = self.clf_inf.predict(X_test)
        self.assertGreaterEqual(score[0], self.clf_inf.threshold_)
        self.assertEqual(anomaly[0], 1)

    # todo: fix clone issue
    def test_model_clone(self):
        pass
        # clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example #7
class TestMAD(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=1,
            contamination=self.contamination, random_state=42)

        self.clf = MAD()
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        with assert_raises(TypeError):
            MAD(threshold='str')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_check_univariate(self):
        with assert_raises(ValueError):
            MAD().fit(X=[[0.0, 0.0],
                         [0.0, 0.0]])
        with assert_raises(ValueError):
            MAD().decision_function(X=[[0.0, 0.0],
                                       [0.0, 0.0]])

    def tearDown(self):
        pass
Example #8
if __name__ == "__main__":
    contamination = 0.1  # fraction of outliers in the generated data
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=1,
                      contamination=contamination,
                      random_state=42)

    # train MAD detector
    clf_name = 'MAD'
    clf = MAD(threshold=3.5)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
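
For reference, the statistic behind the MAD detector used throughout these examples is the modified z-score of Iglewicz and Hoaglin: 0.6745 * |x - median| / MAD, with points flagged when the score exceeds a threshold (3.5 by convention, matching MAD(threshold=3.5) above). The NumPy sketch below only illustrates that idea; it is not PyOD's exact implementation (input validation, reshaping, and zero-MAD handling are omitted), and the helper name mad_scores is purely illustrative.

import numpy as np

def mad_scores(x, x_train):
    # modified z-score: 0.6745 * |x - median| / MAD of the training sample
    x = np.asarray(x, dtype=float).ravel()
    x_train = np.asarray(x_train, dtype=float).ravel()
    median = np.median(x_train)
    mad = np.median(np.abs(x_train - median))  # median absolute deviation
    return 0.6745 * np.abs(x - median) / mad

rng = np.random.RandomState(42)
x_train = rng.normal(size=200)
scores = mad_scores([0.1, 10000.0], x_train)  # raw outlier scores
labels = (scores > 3.5).astype(int)           # 0: inlier, 1: outlier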