def test_check_univariate(self): with assert_raises(ValueError): MAD().fit(X=[[0.0, 0.0], [0.0, 0.0]]) with assert_raises(ValueError): MAD().decision_function(X=[[0.0, 0.0], [0.0, 0.0]])
def setUp(self): self.n_train = 100 self.n_test = 50 self.contamination = 0.1 self.roc_floor = 0.8 self.X_train, self.y_train, self.X_test, self.y_test = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42) self.clf = MAD() self.clf.fit(self.X_train)
def test_parameters(self): assert (hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None) assert (hasattr(self.clf, 'threshold_') and self.clf.threshold_ is not None) with assert_raises(TypeError): MAD(threshold='str')
def setUp(self): self.n_train = 100 self.n_test = 50 self.contamination = 0.1 self.roc_floor = 0.8 # generate data and fit model without missing or infinite values: self.X_train, self.X_test, self.y_train, self.y_test = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42) self.clf = MAD() self.clf.fit(self.X_train) # generate data and fit model with missing value: self.X_train_nan, self.X_test_nan, self.y_train_nan, self.y_test_nan = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42, n_nan=1) self.clf_nan = MAD() self.clf_nan.fit(self.X_train_nan) # generate data and fit model with infinite value: self.X_train_inf, self.X_test_inf, self.y_train_inf, self.y_test_inf = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42, n_inf=1) self.clf_inf = MAD() self.clf_inf.fit(self.X_train_inf)
def pyod_anomaly_detection(type, contamination): X_train, y_train, X_test, y_test = data(type=type, contamination=contamination) if type == 'MAD': # train MAD detector clf_name = 'MAD' clf = MAD(threshold=3.5) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results # making dimensions = 2 for visualising purpose only. By repeating same data each dimension. visualize(clf_name, np.hstack((X_train, X_train)), y_train, np.hstack((X_test, X_test)), y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False) elif type == 'ABOD': # train ABOD detector clf_name = 'ABOD' clf = ABOD() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False) elif type == 'AutoEncoder': # train AutoEncoder detector clf_name = 'AutoEncoder' clf = AutoEncoder(epochs=30, contamination=contamination) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores)
class TestMAD(unittest.TestCase): def setUp(self): self.n_train = 100 self.n_test = 50 self.contamination = 0.1 self.roc_floor = 0.8 # generate data and fit model without missing or infinite values: self.X_train, self.X_test, self.y_train, self.y_test = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42) self.clf = MAD() self.clf.fit(self.X_train) # generate data and fit model with missing value: self.X_train_nan, self.X_test_nan, self.y_train_nan, self.y_test_nan = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42, n_nan=1) self.clf_nan = MAD() self.clf_nan.fit(self.X_train_nan) # generate data and fit model with infinite value: self.X_train_inf, self.X_test_inf, self.y_train_inf, self.y_test_inf = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42, n_inf=1) self.clf_inf = MAD() self.clf_inf.fit(self.X_train_inf) def test_parameters(self): assert (hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None) assert (hasattr(self.clf, 'threshold_') and self.clf.threshold_ is not None) with assert_raises(TypeError): MAD(threshold='str') def test_train_scores(self): assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0]) def test_prediction_scores(self): pred_scores = self.clf.decision_function(self.X_test) # check score shapes assert_equal(pred_scores.shape[0], self.X_test.shape[0]) # check performance assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor) def test_prediction_labels(self): pred_labels = self.clf.predict(self.X_test) assert_equal(pred_labels.shape, self.y_test.shape) def test_prediction_proba(self): pred_proba = self.clf.predict_proba(self.X_test) assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) def test_prediction_proba_linear(self): pred_proba = self.clf.predict_proba(self.X_test, method='linear') assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) def test_prediction_proba_unify(self): pred_proba = self.clf.predict_proba(self.X_test, method='unify') assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) def test_prediction_proba_parameter(self): with assert_raises(ValueError): self.clf.predict_proba(self.X_test, method='something') def test_prediction_labels_confidence(self): pred_labels, confidence = self.clf.predict(self.X_test, return_confidence=True) assert_equal(pred_labels.shape, self.y_test.shape) assert_equal(confidence.shape, self.y_test.shape) assert (confidence.min() >= 0) assert (confidence.max() <= 1) def test_prediction_proba_linear_confidence(self): pred_proba, confidence = self.clf.predict_proba(self.X_test, method='linear', return_confidence=True) assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) assert_equal(confidence.shape, self.y_test.shape) assert (confidence.min() >= 0) assert (confidence.max() <= 1) def test_fit_predict(self): pred_labels = self.clf.fit_predict(self.X_train) assert_equal(pred_labels.shape, self.y_train.shape) def test_fit_predict_with_nan(self): pred_labels = self.clf_nan.fit_predict(self.X_train_nan) assert_equal(pred_labels.shape, self.y_train_nan.shape) def test_fit_predict_with_inf(self): pred_labels = self.clf_inf.fit_predict(self.X_train_inf) assert_equal(pred_labels.shape, self.y_train_inf.shape) def test_fit_predict_score(self): self.clf.fit_predict_score(self.X_test, self.y_test) self.clf.fit_predict_score(self.X_test, self.y_test, scoring='roc_auc_score') self.clf.fit_predict_score(self.X_test, self.y_test, scoring='prc_n_score') with assert_raises(NotImplementedError): self.clf.fit_predict_score(self.X_test, self.y_test, scoring='something') def test_predict_rank(self): pred_scores = self.clf.decision_function(self.X_test) pred_ranks = self.clf._predict_rank(self.X_test) print(pred_ranks) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, self.X_train.shape[0] + 1) assert_array_less(-0.1, pred_ranks) def test_predict_rank_with_nan(self): pred_scores = self.clf_nan.decision_function(self.X_test_nan) pred_ranks = self.clf_nan._predict_rank(self.X_test_nan) print(pred_ranks) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, self.X_train_nan.shape[0] + 1) assert_array_less(-0.1, pred_ranks) def test_predict_rank_with_inf(self): pred_scores = self.clf_inf.decision_function(self.X_test_inf) pred_ranks = self.clf_inf._predict_rank(self.X_test_inf) print(pred_ranks) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, self.X_train_inf.shape[0] + 1) assert_array_less(-0.1, pred_ranks) def test_predict_rank_normalized(self): pred_scores = self.clf.decision_function(self.X_test) pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, 1.01) assert_array_less(-0.1, pred_ranks) def test_predict_rank_normalized_with_nan(self): pred_scores = self.clf_nan.decision_function(self.X_test_nan) pred_ranks = self.clf_nan._predict_rank(self.X_test_nan, normalized=True) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, 1.01) assert_array_less(-0.1, pred_ranks) def test_predict_rank_normalized_with_inf(self): pred_scores = self.clf_inf.decision_function(self.X_test_inf) pred_ranks = self.clf_inf._predict_rank(self.X_test_inf, normalized=True) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, 1.01) assert_array_less(-0.1, pred_ranks) def test_check_univariate(self): with assert_raises(ValueError): MAD().fit(X=[[0.0, 0.0], [0.0, 0.0]]) with assert_raises(ValueError): MAD().decision_function(X=[[0.0, 0.0], [0.0, 0.0]]) def test_detect_anomaly(self): X_test = [[10000]] score = self.clf.decision_function(X_test) anomaly = self.clf.predict(X_test) self.assertGreaterEqual(score[0], self.clf.threshold_) self.assertEqual(anomaly[0], 1) def test_detect_anomaly_with_nan(self): X_test = [[10000]] score = self.clf_nan.decision_function(X_test) anomaly = self.clf_nan.predict(X_test) self.assertGreaterEqual(score[0], self.clf_nan.threshold_) self.assertEqual(anomaly[0], 1) def test_detect_anomaly_with_inf(self): X_test = [[10000]] score = self.clf_inf.decision_function(X_test) anomaly = self.clf_inf.predict(X_test) self.assertGreaterEqual(score[0], self.clf_inf.threshold_) self.assertEqual(anomaly[0], 1) # todo: fix clone issue def test_model_clone(self): pass # clone_clf = clone(self.clf) def tearDown(self): pass
class TestMAD(unittest.TestCase): def setUp(self): self.n_train = 100 self.n_test = 50 self.contamination = 0.1 self.roc_floor = 0.8 self.X_train, self.y_train, self.X_test, self.y_test = generate_data( n_train=self.n_train, n_test=self.n_test, n_features=1, contamination=self.contamination, random_state=42) self.clf = MAD() self.clf.fit(self.X_train) def test_parameters(self): assert (hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None) assert (hasattr(self.clf, 'threshold_') and self.clf.threshold_ is not None) with assert_raises(TypeError): MAD(threshold='str') def test_train_scores(self): assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0]) def test_prediction_scores(self): pred_scores = self.clf.decision_function(self.X_test) # check score shapes assert_equal(pred_scores.shape[0], self.X_test.shape[0]) # check performance assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor) def test_prediction_labels(self): pred_labels = self.clf.predict(self.X_test) assert_equal(pred_labels.shape, self.y_test.shape) def test_prediction_proba(self): pred_proba = self.clf.predict_proba(self.X_test) assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1) def test_prediction_proba_linear(self): pred_proba = self.clf.predict_proba(self.X_test, method='linear') assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1) def test_prediction_proba_unify(self): pred_proba = self.clf.predict_proba(self.X_test, method='unify') assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1) def test_prediction_proba_parameter(self): with assert_raises(ValueError): self.clf.predict_proba(self.X_test, method='something') def test_fit_predict(self): pred_labels = self.clf.fit_predict(self.X_train) assert_equal(pred_labels.shape, self.y_train.shape) def test_fit_predict_score(self): self.clf.fit_predict_score(self.X_test, self.y_test) self.clf.fit_predict_score(self.X_test, self.y_test, scoring='roc_auc_score') self.clf.fit_predict_score(self.X_test, self.y_test, scoring='prc_n_score') with assert_raises(NotImplementedError): self.clf.fit_predict_score(self.X_test, self.y_test, scoring='something') def test_predict_rank(self): pred_scores = self.clf.decision_function(self.X_test) pred_ranks = self.clf._predict_rank(self.X_test) print(pred_ranks) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2) assert_array_less(pred_ranks, self.X_train.shape[0] + 1) assert_array_less(-0.1, pred_ranks) def test_predict_rank_normalized(self): pred_socres = self.clf.decision_function(self.X_test) pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) # assert the order is reserved assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) assert_array_less(pred_ranks, 1.01) assert_array_less(-0.1, pred_ranks) def test_check_univariate(self): with assert_raises(ValueError): MAD().fit(X=[[0.0, 0.0], [0.0, 0.0]]) with assert_raises(ValueError): MAD().decision_function(X=[[0.0, 0.0], [0.0, 0.0]]) def tearDown(self): pass
if __name__ == "__main__": contamination = 0.1 # percentage of outliers n_train = 200 # number of training points n_test = 100 # number of testing points # Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=1, contamination=contamination, random_state=42) # train MAD detector clf_name = 'MAD' clf = MAD(threshold=3.5) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores)