def anomaly_detection(features, labels):
    """Try anomaly detection (multivariate Gaussian) to identify POIs.

    Rows with ``labels == 0`` (non-POIs) are divided into a training set and
    a held-out remainder; the remainder is divided again into test and
    cross-validation parts.  Rows with ``labels == 1`` (POIs) are divided
    between the test and cross-validation sets only.  An ``EllipticEnvelope``
    is fitted on the non-POI training rows, and its predictions and score on
    the cross-validation set are printed.

    Args:
        features: NumPy array with one row per sample; it is indexed with
            boolean masks derived from ``labels`` and stacked with
            ``np.vstack`` (assumes a 2-D array -- TODO confirm with caller).
        labels: NumPy array aligned with the rows of ``features``; 0 selects
            non-POI rows and 1 selects POI rows.

    Returns:
        None.  All results are printed.
    """
    non_pois = features[labels == 0]
    pois = features[labels == 1]
    print("non poi size %s %s %s"
          % (non_pois.shape, pois.shape, features.shape))

    # Splitting data into train, test and cross-validation sets.
    # produce_spliting_array is defined elsewhere; its result is used as a
    # mask where 1 selects the first part and 0 the second (presumably the
    # second argument is the fraction of ones -- verify against the helper).
    split1 = produce_spliting_array(non_pois.shape[0], .75)
    X_train = non_pois[split1 == 1]
    X_intermediate = non_pois[split1 == 0]
    print("size intermediate %s" % (X_intermediate.shape,))

    split2 = produce_spliting_array(X_intermediate.shape[0], .5)
    non_pois_test = X_intermediate[split2 == 1]
    non_pois_cv = X_intermediate[split2 == 0]

    split3 = produce_spliting_array(pois.shape[0], .5)
    pois_test = pois[split3 == 1]
    pois_cv = pois[split3 == 0]

    X_test = np.vstack((non_pois_test, pois_test))
    X_cv = np.vstack((non_pois_cv, pois_cv))

    # Labels: -1 for non-POI rows, +1 for POI rows.  Counts are taken from
    # the selected rows so labels and rows always have matching lengths, and
    # the builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
    # NOTE(review): EllipticEnvelope.predict uses +1 for inliers and -1 for
    # outliers; since the detector is fitted on non-POIs, this labelling
    # looks inverted relative to that convention -- confirm it is intended.
    label_test = np.hstack((
        np.full(non_pois_test.shape[0], -1, dtype=int),
        np.ones(pois_test.shape[0], dtype=int),
    ))
    label_cv = np.hstack((
        np.full(non_pois_cv.shape[0], -1, dtype=int),
        np.ones(pois_cv.shape[0], dtype=int),
    ))

    print("size X_train %s" % (X_train.shape,))
    print("size test data %s %s" % (X_test.shape, label_test.shape))
    print("size cv data %s %s" % (X_cv.shape, label_cv.shape))
    print("size splits %s %s %s" % (len(split1), len(split2), len(split3)))

    from sklearn.covariance import EllipticEnvelope

    # NOTE(review): contamination=.85 is kept from the original; newer
    # scikit-learn releases document the valid range as (0, 0.5] -- confirm
    # against the installed version.
    detector = EllipticEnvelope(contamination=.85)
    detector.fit(X_train)
    pred_cv = detector.predict(X_cv)
    print(pred_cv)
    print(label_cv)
    print(detector.score(X_cv, label_cv))
def test_outlier_detection():
    """Check EllipticEnvelope raw decision values and score on random data."""
    n_samples, n_features = 100, 10
    data = np.random.RandomState(0).randn(n_samples, n_features)

    envelope = EllipticEnvelope(contamination=0.1)
    envelope.fit(data)
    predictions = envelope.predict(data)

    # The raw decision values must match the Mahalanobis distances computed
    # on the samples shifted by the fitted location.
    raw_decision = envelope.decision_function(data, raw_mahalanobis=True)
    shifted_distances = envelope.mahalanobis(data - envelope.location_)
    assert_array_almost_equal(raw_decision, shifted_distances)

    # Scored against all-ones labels, the result must equal the fraction of
    # samples that were not predicted as -1.
    accuracy = envelope.score(data, np.ones(n_samples))
    flagged = predictions[predictions == -1]
    expected_accuracy = (n_samples - flagged.size) / float(n_samples)
    assert_almost_equal(accuracy, expected_accuracy)
def test_outlier_detection():
    """Verify EllipticEnvelope decision values and score after fitting.

    Fits on 100 x 10 samples drawn from a seeded ``RandomState`` and checks
    the raw decision function against ``mahalanobis`` and the score against
    the share of samples not predicted as -1.
    """
    sample_count = 100
    rng = np.random.RandomState(0)
    samples = rng.randn(sample_count, 10)

    detector = EllipticEnvelope(contamination=0.1)
    detector.fit(samples)
    labels_pred = detector.predict(samples)

    # Raw decision values versus distances of the location-shifted samples.
    assert_array_almost_equal(
        detector.decision_function(samples, raw_mahalanobis=True),
        detector.mahalanobis(samples - detector.location_),
    )

    # Score with all-ones targets versus the share not predicted as -1.
    observed = detector.score(samples, np.ones(sample_count))
    minus_one_count = labels_pred[labels_pred == -1].size
    assert_almost_equal(
        observed,
        (sample_count - minus_one_count) / float(sample_count),
    )
def test_outlier_detection():
    """Check EllipticEnvelope before and after fitting on random data."""
    n_samples = 100
    data = np.random.RandomState(0).randn(n_samples, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Both entry points must raise NotFittedError before fit() is called.
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, data)

    envelope.fit(data)
    predictions = envelope.predict(data)
    raw_decision = envelope.decision_function(data, raw_values=True)
    shifted_decision = envelope.decision_function(data, raw_values=False)

    # Raw decision values equal the Mahalanobis distances, which in turn
    # equal the distances stored on the estimator.
    assert_array_almost_equal(raw_decision, envelope.mahalanobis(data))
    assert_array_almost_equal(envelope.mahalanobis(data), envelope.dist_)

    # Score with all-ones targets equals the share not predicted as -1.
    accuracy = envelope.score(data, np.ones(n_samples))
    flagged = predictions[predictions == -1]
    assert_almost_equal(accuracy,
                        (n_samples - flagged.size) / float(n_samples))

    # The number of -1 predictions matches the number of negative
    # non-raw decision values.
    n_predicted_minus_one = sum(predictions == -1)
    n_negative_decisions = sum(shifted_decision < 0)
    assert n_predicted_minus_one == n_negative_decisions
def test_outlier_detection():
    """Exercise EllipticEnvelope: unfitted errors, decisions, score, counts."""
    sample_count = 100
    rng = np.random.RandomState(0)
    samples = rng.randn(sample_count, 10)
    detector = EllipticEnvelope(contamination=0.1)

    # An unfitted estimator must refuse to predict or score samples.
    for method in (detector.predict, detector.decision_function):
        assert_raises(NotFittedError, method, samples)

    detector.fit(samples)
    labels_pred = detector.predict(samples)
    decision_raw = detector.decision_function(samples, raw_values=True)
    decision_offset = detector.decision_function(samples, raw_values=False)

    # Raw decisions, freshly computed distances and stored distances agree.
    assert_array_almost_equal(decision_raw, detector.mahalanobis(samples))
    assert_array_almost_equal(detector.mahalanobis(samples), detector.dist_)

    # Score with all-ones targets equals the share not predicted as -1.
    observed = detector.score(samples, np.ones(sample_count))
    minus_one_count = labels_pred[labels_pred == -1].size
    expected = (sample_count - minus_one_count) / float(sample_count)
    assert_almost_equal(observed, expected)

    # As many -1 predictions as negative non-raw decision values.
    assert sum(labels_pred == -1) == sum(decision_offset < 0)
def test_elliptic_envelope():
    """Check EllipticEnvelope scores, decisions and accuracy on random data."""
    n_samples = 100
    data = np.random.RandomState(0).randn(n_samples, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Both entry points must raise NotFittedError before fit() is called.
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, data)

    envelope.fit(data)
    predictions = envelope.predict(data)
    sample_scores = envelope.score_samples(data)
    decision_values = envelope.decision_function(data)

    # Sample scores are the negated Mahalanobis distances, and those
    # distances equal the ones stored on the estimator.
    assert_array_almost_equal(sample_scores, -envelope.mahalanobis(data))
    assert_array_almost_equal(envelope.mahalanobis(data), envelope.dist_)

    # Score with all-ones targets equals the share not predicted as -1.
    accuracy = envelope.score(data, np.ones(n_samples))
    flagged = predictions[predictions == -1]
    assert_almost_equal(accuracy,
                        (n_samples - flagged.size) / float(n_samples))

    # The number of -1 predictions matches the number of negative decisions.
    n_predicted_minus_one = sum(predictions == -1)
    n_negative_decisions = sum(decision_values < 0)
    assert n_predicted_minus_one == n_negative_decisions
def test_elliptic_envelope():
    """Exercise EllipticEnvelope: unfitted errors, scores, decisions, counts."""
    sample_count = 100
    rng = np.random.RandomState(0)
    samples = rng.randn(sample_count, 10)
    detector = EllipticEnvelope(contamination=0.1)

    # An unfitted estimator must refuse to predict or evaluate samples.
    for method in (detector.predict, detector.decision_function):
        assert_raises(NotFittedError, method, samples)

    detector.fit(samples)
    labels_pred = detector.predict(samples)
    per_sample_scores = detector.score_samples(samples)
    decision_values = detector.decision_function(samples)

    # Scores versus negated distances, then distances versus stored ones.
    assert_array_almost_equal(per_sample_scores,
                              -detector.mahalanobis(samples))
    assert_array_almost_equal(detector.mahalanobis(samples), detector.dist_)

    # Score with all-ones targets equals the share not predicted as -1.
    observed = detector.score(samples, np.ones(sample_count))
    minus_one_count = labels_pred[labels_pred == -1].size
    expected = (sample_count - minus_one_count) / float(sample_count)
    assert_almost_equal(observed, expected)

    # As many -1 predictions as negative decision values.
    assert sum(labels_pred == -1) == sum(decision_values < 0)