def test_outlier_detection():
    """Check EllipticEnvelope's raw decision values, stored distances and score.

    Fits the estimator on 100 seeded Gaussian samples and verifies that:

    * ``decision_function(X, raw_values=True)`` equals ``mahalanobis(X)``;
    * ``mahalanobis(X)`` equals the fitted ``dist_`` attribute;
    * ``score`` against an all-ones target equals the fraction of samples
      that were not predicted as ``-1``.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)

    envelope = EllipticEnvelope(contamination=0.1)
    envelope.fit(X)
    predictions = envelope.predict(X)

    raw_decision = envelope.decision_function(X, raw_values=True)
    assert_array_almost_equal(raw_decision, envelope.mahalanobis(X))
    assert_array_almost_equal(envelope.mahalanobis(X), envelope.dist_)

    # Every sample is labelled +1 in the target, so the score is the share
    # of samples the model did not flag as -1.
    n_flagged = predictions[predictions == -1].size
    assert_almost_equal(envelope.score(X, np.ones(100)),
                        (100 - n_flagged) / 100.)
def test_outlier_detection():
    """Exercise EllipticEnvelope before and after fitting.

    Before ``fit``, ``predict`` and ``decision_function`` must raise
    ``NotFittedError``.  After ``fit`` on 100 seeded Gaussian samples, the
    raw decision values must match ``mahalanobis(X)`` (and ``dist_``), the
    score against an all-ones target must match the share of samples not
    predicted ``-1``, and the number of ``-1`` predictions must equal the
    number of negative transformed decision values.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    detector = EllipticEnvelope(contamination=0.1)

    # Neither method may be used on an unfitted estimator.
    for unfitted_call in (detector.predict, detector.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    detector.fit(X)
    labels = detector.predict(X)
    raw = detector.decision_function(X, raw_values=True)
    shifted = detector.decision_function(X, raw_values=False)

    assert_array_almost_equal(raw, detector.mahalanobis(X))
    assert_array_almost_equal(detector.mahalanobis(X), detector.dist_)

    expected_score = (100 - labels[labels == -1].size) / 100.0
    assert_almost_equal(detector.score(X, np.ones(100)), expected_score)

    n_predicted_outliers = sum(labels == -1)
    n_negative_decisions = sum(shifted < 0)
    assert n_predicted_outliers == n_negative_decisions
def test_outlier_detection():
    """Verify EllipticEnvelope predictions, distances and score agree.

    Steps:

    1. ``predict`` / ``decision_function`` raise ``NotFittedError`` when the
       estimator has not been fitted.
    2. After fitting, ``decision_function(X, raw_values=True)`` equals
       ``mahalanobis(X)``, which in turn equals ``dist_``.
    3. ``score`` with an all-ones target equals the fraction of samples not
       predicted ``-1``.
    4. The count of ``-1`` predictions equals the count of negative values
       from ``decision_function(X, raw_values=False)``.
    """
    n_samples = 100
    X = np.random.RandomState(0).randn(n_samples, 10)
    model = EllipticEnvelope(contamination=0.1)

    assert_raises(NotFittedError, model.predict, X)
    assert_raises(NotFittedError, model.decision_function, X)

    model.fit(X)
    y_hat = model.predict(X)
    raw_scores = model.decision_function(X, raw_values=True)
    offset_scores = model.decision_function(X, raw_values=False)

    mahalanobis_dist = model.mahalanobis(X)
    assert_array_almost_equal(raw_scores, mahalanobis_dist)
    assert_array_almost_equal(model.mahalanobis(X), model.dist_)

    outlier_mask = y_hat == -1
    assert_almost_equal(
        model.score(X, np.ones(100)),
        (100 - y_hat[outlier_mask].size) / 100.,
    )
    assert sum(outlier_mask) == sum(offset_scores < 0)
def test_elliptic_envelope():
    """Check the EllipticEnvelope scoring API on seeded Gaussian data.

    Asserts that ``predict`` and ``decision_function`` raise
    ``NotFittedError`` before ``fit``; that ``score_samples(X)`` is the
    negated ``mahalanobis(X)``; that ``mahalanobis(X)`` matches ``dist_``;
    that ``score`` against an all-ones target equals the fraction of
    samples not predicted ``-1``; and that the number of ``-1`` predictions
    equals the number of negative ``decision_function`` values.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    for not_ready in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, not_ready, X)

    envelope.fit(X)
    predicted = envelope.predict(X)
    sample_scores = envelope.score_samples(X)
    decision_values = envelope.decision_function(X)

    assert_array_almost_equal(sample_scores, -envelope.mahalanobis(X))
    assert_array_almost_equal(envelope.mahalanobis(X), envelope.dist_)

    n_outliers = predicted[predicted == -1].size
    assert_almost_equal(envelope.score(X, np.ones(100)),
                        (100 - n_outliers) / 100.)
    assert sum(predicted == -1) == sum(decision_values < 0)
def calc(self, outliers_fraction):
    """Score rows by Mahalanobis distance and select candidate outliers.

    Fits an ``EllipticEnvelope`` on the ``Tbandwidth``, ``Tlatency`` and
    ``Tframerate`` columns of the frame returned by ``self.get_data()``,
    stores the result of ``clf.mahalanobis`` in a new ``MDist`` column, and
    filters the groups returned by ``chi2_outliers``.

    Parameters
    ----------
    outliers_fraction : float
        Forwarded to ``EllipticEnvelope`` as ``contamination``.

    Returns
    -------
    tuple
        ``(outliers, dqs, data)`` where ``outliers`` is a list of filtered
        frames (one per item yielded by ``chi2_outliers``), ``dqs`` is the
        second value from ``self.get_data()`` returned untouched, and
        ``data`` is the scored frame.  Both ``outliers`` entries and
        ``data`` are sorted by ``MDist`` descending, de-duplicated, and
        have the three feature columns dropped.
    """
    feature_columns = ['Tbandwidth', 'Tlatency', 'Tframerate']

    # The third value returned by get_data() is not used here.
    data, dqs, _raw = self.get_data()
    clf = EllipticEnvelope(contamination=outliers_fraction)

    # zip() is a lazy, single-use iterator on Python 3; materialise it so
    # that both fit() and mahalanobis() receive the same list of rows.
    X = list(zip(data['Tbandwidth'], data['Tlatency'], data['Tframerate']))
    clf.fit(X)
    data['MDist'] = clf.mahalanobis(X)

    # picking "bad" outliers, not good ones
    # NOTE(review): the meaning of the [.8, .9, .95] and 3 arguments depends
    # on chi2_outliers, which is defined outside this block.
    outliers = chi2_outliers(data, [.8, .9, .95], 3)
    outliers = [i[i['Tbandwidth'] < i['Tlatency']] for i in outliers]

    # making sure we don't remove aberrantly good framerates
    outliers = [
        k[k['Tframerate'] < (k['Tframerate'].mean() + k['Tframerate'].std())]
        for k in outliers
    ]
    outliers = [
        t.sort_values(by='MDist', ascending=False)
         .drop_duplicates()
         .drop(feature_columns, axis=1)
        for t in outliers
    ]

    scored = (
        data.sort_values(by='MDist', ascending=False)
            .drop_duplicates()
            .drop(feature_columns, axis=1)
    )
    return outliers, dqs, scored
def test_elliptic_envelope():
    """Validate EllipticEnvelope outputs against each other.

    The estimator must refuse ``predict`` and ``decision_function`` with
    ``NotFittedError`` until fitted.  Once fitted on 100 seeded Gaussian
    samples: ``score_samples`` equals minus ``mahalanobis``; ``mahalanobis``
    equals ``dist_``; ``score`` with an all-ones target equals the share of
    samples not predicted ``-1``; and ``-1`` predictions are as numerous as
    negative decision values.
    """
    n_samples = 100
    X = np.random.RandomState(0).randn(n_samples, 10)
    ee = EllipticEnvelope(contamination=0.1)

    assert_raises(NotFittedError, ee.predict, X)
    assert_raises(NotFittedError, ee.decision_function, X)

    ee.fit(X)
    y_hat = ee.predict(X)
    neg_distances = ee.score_samples(X)
    margins = ee.decision_function(X)

    assert_array_almost_equal(neg_distances, -ee.mahalanobis(X))
    assert_array_almost_equal(ee.mahalanobis(X), ee.dist_)

    is_outlier = y_hat == -1
    assert_almost_equal(
        ee.score(X, np.ones(100)),
        (100 - y_hat[is_outlier].size) / 100.,
    )
    assert sum(is_outlier) == sum(margins < 0)
def test_outlier_detection():
    """Check EllipticEnvelope's raw Mahalanobis output and its score.

    After fitting on 100 seeded Gaussian samples, asserts that
    ``decision_function(X, raw_mahalanobis=True)`` equals
    ``mahalanobis(X - location_)`` and that ``score`` against an all-ones
    target equals the fraction of samples not predicted ``-1``.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)

    estimator = EllipticEnvelope(contamination=0.1)
    estimator.fit(X)
    predicted = estimator.predict(X)

    raw_mahalanobis = estimator.decision_function(X, raw_mahalanobis=True)
    centred_distances = estimator.mahalanobis(X - estimator.location_)
    assert_array_almost_equal(raw_mahalanobis, centred_distances)

    n_flagged = predicted[predicted == -1].size
    assert_almost_equal(estimator.score(X, np.ones(100)),
                        (100 - n_flagged) / 100.)
def elliptEnvMethod(data, uniqueTrains, **kwargs):
    """Label anomalies using Mahalanobis distance with an automatic cut-off.

    An ``EllipticEnvelope(support_fraction=1)`` is fitted on ``data`` and the
    square root of ``mahalanobis(data)`` is used as the per-row distance.
    The cut-off is the first distance in the upper half of the sorted
    distances that exceeds its predecessor by more than a factor of 1.3; if
    there is no such jump the cut-off is 100.0.  The cut-off is never
    allowed below 4.0.  Rows with distance >= cut-off are labelled 1.

    Parameters
    ----------
    data :
        Samples passed to ``fit_predict`` and ``mahalanobis``.
    uniqueTrains :
        Iterable whose items are positionally matched against the labels.
    **kwargs :
        ``dates`` (optional): indexable whose entries are positionally
        matched against the labels.

    Returns
    -------
    tuple
        ``(anomalies, labels, anomalyDates)``: the items of ``uniqueTrains``
        labelled 1, the 0/1 integer label array, and the entries of
        ``dates`` labelled 1 (an empty list when ``dates`` is absent or
        ``None``).
    """
    envelope = EllipticEnvelope(support_fraction=1)
    envelope.fit_predict(data)

    # mahalanobis() yields squared distances (the same values as the fitted
    # "dist_" attribute); take the root to get regular distances.
    squared_distances = envelope.mahalanobis(data)
    distances = np.sqrt(squared_distances)

    # Automated cut-off: look for the first big relative jump in the upper
    # half of the sorted distances.
    SCORE_INCREASE_RATIO = 1.3
    ordered = np.sort(distances)
    upper_half = ordered[len(ordered) // 2:]
    step_ratios = np.array(
        [current / previous
         for previous, current in zip(upper_half[:-1], upper_half[1:])]
    )
    jump_positions = np.where(step_ratios > SCORE_INCREASE_RATIO)[0]
    if jump_positions.size >= 1:
        # step_ratios[k] compares upper_half[k + 1] with upper_half[k], so
        # the distance that made the jump sits one position further on.
        cutoff = upper_half[jump_positions[0] + 1]
    else:
        # Arbitrary high value: there is no big jump to cut at.
        cutoff = 100.0
    cutoff = max(cutoff, 4.0)  # keep the cut-off from being too low

    labels = (distances >= cutoff).astype(int)

    # Disabled debug output, kept as in the original.
    if False:
        print('.dist_ = {m1}\t .m(data)={}'.format(m2))
        print("Sigma labels: {}".format(labels))
        print("\nCovariance = {}".format(envelope.covariance_))

    # Disabled diagnostic plots, kept as in the original.
    # NOTE(review): plt.show() is assumed to belong to this disabled block;
    # the original indentation was lost - confirm.
    if False:
        labelColours = 'white'
        fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
        colours = np.array(['blue', 'red'])
        ax[0].set_title('Elliptical envelope - Mahalanobis distance',
                        color='white')
        ax[0].plot(distances, 'bo', alpha=0.4)
        ax[1].scatter(data[:, 0], data[:, 1], s=20,
                      color=colours[labels], alpha=0.4)
        ax[1].set_title(
            'Elliptical envelope (adjusted cutoff={})'.format(cutoff),
            color='white')
        for i, a in enumerate(ax.flat):
            ax[i].set_xlabel('Mean normalised')
            ax[i].set_ylabel('Standard deviation normalised')
        plt.show()

    anomalies = [
        train
        for position, train in enumerate(uniqueTrains)
        if labels[position] == 1
    ]

    anomalyDates = []
    dates = kwargs.get('dates', None)
    if dates is not None:
        anomalyDates = [
            dates[position]
            for position, flag in enumerate(labels)
            if flag == 1
        ]

    return anomalies, labels, anomalyDates
# Show the loadings of the first two components of the fitted `pcac` object,
# indexed by the column names of `cfat`.
rot = pcac.components_
pd.DataFrame(rot[0, :], index=cfat.columns).round(3).T
#
pd.DataFrame(rot[1, :], index=cfat.columns).round(3).T
#
# Fit an elliptic envelope on `cfat`.
from sklearn.covariance import EllipticEnvelope
ee = EllipticEnvelope()
ee.fit(cfat)
#
# Plot the sorted distances (square root of ee.mahalanobis) against normal
# quantiles evaluated at (n + i) / (2n + 1), i = 1..n.
md = np.sqrt(ee.mahalanobis(cfat))
n = len(md)
ix = np.arange(1, n + 1)
# No trailing comma here: it would turn `halfq` into a 1-tuple, not an array.
halfq = sp.stats.norm.ppf((n + ix) / (2 * n + 1))
plt.scatter(halfq, np.sort(md))
# NOTE(review): the x values come from norm.ppf, while the label below says
# chi-squared quantiles - confirm which is intended.
plt.xlabel(r'$\chi^2$ quantiles')
plt.ylabel('Mahalanobis distances')
#
# Regress fat.brozek on `cfat` plus an intercept column.
xmat = sm.add_constant(cfat)
lmod = sm.OLS(fat.brozek, xmat).fit()
lmod.sumary()
#
import numpy as np
from scipy.io import loadmat
from sklearn.covariance import EmpiricalCovariance, EllipticEnvelope
from sklearn.metrics import accuracy_score, classification_report

# Number of outliers in the cardio data.
N_OUTLIERS = 176

cardio_data = loadmat('cardio.mat')

# --- Empirical covariance -------------------------------------------------
estimator = EmpiricalCovariance()
cov = estimator.fit(cardio_data['X'])
mahal_cov = cov.mahalanobis(cardio_data['X'])

# Flag the N_OUTLIERS samples with the largest Mahalanobis distance.
# A negative kth makes argpartition place the N_OUTLIERS largest values in
# the last N_OUTLIERS positions; a positive kth would only guarantee that
# the tail holds values >= the kth smallest, not the largest ones.
indexes = np.argpartition(mahal_cov, -N_OUTLIERS)[-N_OUTLIERS:]
y_pred = np.zeros(cardio_data['y'].shape)
y_pred[indexes] = 1
print(classification_report(cardio_data['y'], y_pred))
print(accuracy_score(cardio_data['y'], y_pred))

# --- Elliptic envelope ----------------------------------------------------
# fit() expects the samples themselves (n_samples, n_features), not the
# X^T X scatter matrix, so fit on the same data the distances are taken on.
cov = EllipticEnvelope().fit(cardio_data['X'])
mahal_cov = cov.mahalanobis(cardio_data['X'])
indexes = np.argpartition(mahal_cov, -N_OUTLIERS)[-N_OUTLIERS:]
y_pred = np.zeros(cardio_data['y'].shape)
y_pred[indexes] = 1
print(classification_report(cardio_data['y'], y_pred))
print(accuracy_score(cardio_data['y'], y_pred))
fig = plt.figure() ax = fig.add_subplot(111) arr[:, 0] /= arr[:, 2] arr[:, 1] /= arr[:, 2] a = arr[:, 0:2] ee = EllipticEnvelope() try: ee.fit(a) except: continue dsts = ee.decision_function(a).ravel() m1, m2 = min(dsts), max(dsts) maaa = ee.mahalanobis(a) min_x, max_x = min(arr[:, 0]), max(arr[:, 0]) min_y, max_y = min(arr[:, 1]), max(arr[:, 1]) min_x -= 10 max_x += 10 min_y -= 10 max_y += 10 xx, yy = np.meshgrid(np.linspace(min_x, max_x, 500), np.linspace(min_y, max_y, 500)) Z = ee.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contour(xx,