def MCD_Score(train_a, test_a, test_b):
    """Fit a robust covariance model and print Mahalanobis scores.

    A ``MinCovDet`` estimator is fitted on ``train_a``; the values returned
    by its ``mahalanobis`` method for ``test_a`` and ``test_b`` are printed
    as the "ano" and "normal" scores respectively. Nothing is returned.
    """
    estimator = MinCovDet()
    estimator.fit(train_a)
    # Tuple elements are evaluated left to right: test_a first, then test_b.
    scores = (estimator.mahalanobis(test_a), estimator.mahalanobis(test_b))
    print("mcd ano score {} mcd normal score {}".format(*scores))
def MCD_ano_score():
    """Print MCD Mahalanobis scores for the module-level test sets.

    Fits ``MinCovDet`` on the global ``train_normal`` and prints the values
    returned by ``mahalanobis`` for the globals ``test_ano`` (reported as
    the anomaly score) and ``test_normal`` (reported as the normal score).
    Nothing is returned.
    """
    print("マハラノビス距離(each MCD) ano score")
    mcd = MinCovDet()
    mcd.fit(train_normal)
    # The anomaly score must come from the anomalous test set and the normal
    # score from the normal one; the two inputs were previously swapped, so
    # the printed labels did not match the data they described.
    mcd_anoscore = mcd.mahalanobis(test_ano)
    mcd_normalscore = mcd.mahalanobis(test_normal)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
def detect(train_data: np.ndarray, test_data: np.ndarray) -> list:
    """Flag test samples that lie farther out than any training sample.

    A ``MinCovDet`` model is fitted on ``train_data``. The largest value its
    ``mahalanobis`` method yields on the training data is used as the
    cut-off.

    Returns:
        A list with one entry per row of ``test_data``: ``0`` when the
        sample's value is ``<=`` the cut-off, ``1`` otherwise.
    """
    model = MinCovDet().fit(train_data)
    cutoff = np.max(model.mahalanobis(train_data))
    flags = []
    for value in model.mahalanobis(test_data):
        within = value <= cutoff
        flags.append(0 if within else 1)
    return flags
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc,
                          tol_cov, tol_support):
    """Check MinCovDet estimates on synthetic data with planted outliers.

    Standard-normal data of shape ``(n_samples, n_features)`` is generated
    from a ``RandomState(0)``; ``n_outliers`` randomly chosen rows are
    shifted by +/-5 in every feature. ``MinCovDet`` is fitted on the
    contaminated data and its location, covariance and support are compared
    against the uncontaminated rows using the given tolerances.

    Raises:
        AssertionError: if any tolerance is violated or if
            ``mahalanobis(data)`` does not match ``dist_``.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(n_samples, n_features)

    # Plant the outliers (the order of RNG draws is kept as in the original).
    contaminated_rows = rng.permutation(n_samples)[:n_outliers]
    shift = 10. * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[contaminated_rows] += shift

    is_inlier = np.ones(n_samples, dtype=bool)
    is_inlier[contaminated_rows] = False
    clean = data[is_inlier]

    # Fit the robust estimator on the contaminated data.
    fitted = MinCovDet(random_state=rng).fit(data)
    location = fitted.location_
    covariance = fitted.covariance_
    support = fitted.support_

    # Compare with the estimates learnt from the inliers only.
    location_mse = np.mean((clean.mean(0) - location) ** 2)
    assert location_mse < tol_loc
    covariance_mse = np.mean((empirical_covariance(clean) - covariance) ** 2)
    assert covariance_mse < tol_cov
    assert np.sum(support) >= tol_support
    assert_array_almost_equal(fitted.mahalanobis(data), fitted.dist_)
def outliers_finder(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Find and remove multivariate outliers from ``data_frame``.

    The feature part returned by ``splitting_dataset`` is standardised and
    projected with PCA; a ``MinCovDet`` estimator is fitted on (at most) the
    first five score columns. Each row's squared Mahalanobis distance is
    turned into a chi-square p-value and rows with ``p <= 0.001`` are
    dropped.

    The frame is modified in place: the columns ``'mahalanobis'``, ``'p'``
    and ``'Drops'`` are added and outlier rows are removed. The number of
    removed rows is printed.

    :param data_frame: input data; mutated in place.
    :return: the same ``data_frame`` object after the outliers were dropped.
    """
    df_X, _ = splitting_dataset(data_frame)

    # PCA scores of the standardised features.
    scores = PCA().fit_transform(StandardScaler().fit_transform(df_X.values))
    leading_scores = scores[:, :5]

    # Robust (MCD) squared Mahalanobis distance of every row.
    robust_cov = MinCovDet().fit(leading_scores)
    data_frame['mahalanobis'] = robust_cov.mahalanobis(leading_scores)

    # The squared distance is referred to a chi-square distribution whose
    # degrees of freedom equal the number of dimensions actually used.
    # The previous hard-coded value of 3 did not match the 5 components.
    dof = leading_scores.shape[1]
    data_frame['p'] = 1 - chi2.cdf(data_frame['mahalanobis'], dof)

    data_frame['Drops'] = data_frame['p'] <= 0.001
    indexNames = data_frame.index[data_frame['Drops']]
    print(indexNames.size)
    data_frame.drop(indexNames, inplace=True)
    return data_frame
def find_outliers_mahalanobis(featMatProjected, extremeness=2.,
                              figsize=[8, 8], saveto=None):
    """Return indices of outlier rows based on robust Mahalanobis distance.

    A ``MinCovDet`` estimator is fitted on the leading principal components
    (up to 10, fewer if the input has fewer columns). A row is an outlier
    when its value from ``mahalanobis`` is at least
    ``mean + extremeness * std`` of all values (extremeness 2 ~ "extreme",
    3 ~ "very extreme", following the 68-95-99.7 rule).

    A PC1/PC2 scatter plot coloured by distance is always produced; it is
    written to ``saveto`` when given, otherwise shown interactively.

    Parameters
    ----------
    featMatProjected : 2-D array of PCA scores, one row per sample, with at
        least two columns.
    extremeness : multiplier of the standard deviation in the threshold.
    figsize : figure size passed to ``plt.subplots``.
    saveto : ``str`` or ``pathlib.Path`` of the output figure, or ``None``.

    Returns
    -------
    numpy.ndarray of integer row indices flagged as outliers.
    """
    import numpy as np
    import seaborn as sns
    from pathlib import Path
    from sklearn.covariance import MinCovDet
    from matplotlib import pyplot as plt

    # Euclidean distance over-weights correlated variables, and ordinary
    # covariance estimates are themselves skewed by the outliers we want to
    # find; a robust (MCD) covariance estimate avoids both problems.
    n_pcs = min(10, featMatProjected.shape[1])
    pcs = featMatProjected[:, :n_pcs]
    robust_cov = MinCovDet().fit(pcs)
    MahalanobisDist = robust_cov.mahalanobis(pcs)

    plt.ioff() if saveto else plt.ion()
    plt.close('all')
    plt.style.use(CUSTOM_STYLE)
    sns.set_style('ticks')
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_facecolor('#F7FFFF')
    # Colour the PC1/PC2 scatter by Mahalanobis distance.
    plt.scatter(pcs[:, 0], pcs[:, 1], c=MahalanobisDist)
    plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
    plt.colorbar()
    ax.grid(False)

    if saveto:
        # Accept plain strings as well as Path objects.
        saveto = Path(saveto)
        saveto.parent.mkdir(exist_ok=True, parents=True)
        suffix = saveto.suffix.strip('.')
        plt.savefig(saveto, format=suffix or None, dpi=300)
    else:
        plt.show()

    upper_t = np.mean(MahalanobisDist) + np.std(MahalanobisDist) * extremeness
    outliers = np.where(MahalanobisDist >= upper_t)[0]
    print("Outliers found: %d" % len(outliers))
    return outliers
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc,
                          tol_cov, tol_support):
    """Fit MinCovDet on contaminated Gaussian data and assert its accuracy.

    Draws ``(n_samples, n_features)`` standard-normal values from a
    ``RandomState(0)``, offsets ``n_outliers`` random rows by +/-5 per
    feature, fits ``MinCovDet`` on the result and asserts that:

    * the mean squared error of the robust location w.r.t. the mean of the
      untouched rows is below ``tol_loc``;
    * the mean squared error of the robust covariance w.r.t. the empirical
      covariance of the untouched rows is below ``tol_cov``;
    * the support mask selects at least ``tol_support`` samples;
    * ``mahalanobis(data)`` agrees with the stored ``dist_``.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(n_samples, n_features)

    # Add some outliers (same sequence of RNG draws as before).
    outlier_rows = rng.permutation(n_samples)[:n_outliers]
    offsets = 10. * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outlier_rows] += offsets

    keep = np.ones(n_samples, dtype=bool)
    keep[outlier_rows] = False
    pure_data = data[keep]

    # Compute the MCD by fitting an estimator object.
    mcd = MinCovDet(random_state=rng).fit(data)
    robust_location = mcd.location_
    robust_covariance = mcd.covariance_
    support_mask = mcd.support_

    # Compare with the estimates learnt from the inliers.
    squared_loc_diff = (pure_data.mean(0) - robust_location) ** 2
    assert np.mean(squared_loc_diff) < tol_loc
    squared_cov_diff = (empirical_covariance(pure_data) - robust_covariance) ** 2
    assert np.mean(squared_cov_diff) < tol_cov
    assert np.sum(support_mask) >= tol_support
    assert_array_almost_equal(mcd.mahalanobis(data), mcd.dist_)
def getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate robust Mahalanobis distances of the rows of ``dat``.

    A ``MinCovDet`` estimator is fitted on ``dat[good_rows]`` and the square
    root of its ``mahalanobis`` output is computed for every NaN-free row of
    ``dat``; rows containing NaN get a distance of NaN (and are therefore
    never flagged).

    Parameters:
        dat: 2-D float array, one sample per row.
        critical_alpha: tail probability used for the chi-square limit.
        good_rows: rows used for fitting (mask or indices). When empty, all
            rows without any NaN are used.

    Returns:
        (mahalanobis_dist, outliers, maha_lim): the per-row distances, a
        boolean array ``mahalanobis_dist > maha_lim`` and the limit
        ``chi2.ppf(1 - critical_alpha, n_columns)``. If the estimator raises
        ``ValueError`` the distances are all zero.
    '''
    # Rows usable for distance computation: no NaN in any column. The
    # previous ``np.any(~isnan)`` test let partially-NaN rows through, which
    # makes the estimator reject the whole input.
    finite_rows = ~np.any(np.isnan(dat), axis=1)
    if good_rows.size == 0:
        good_rows = finite_rows

    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.full(dat.shape[0], np.nan)
        if finite_rows.any():
            # Only NaN-free rows are passed on, so a single bad row no longer
            # sends every sample to the all-zero fallback below.
            mahalanobis_dist[finite_rows] = np.sqrt(
                robust_cov.mahalanobis(dat[finite_rows]))
    except ValueError:
        # The fit fails when the covariance matrix is singular, e.g. when the
        # data is not a unimodal symmetric distribution (too many small noisy
        # particles). Take the safe option and return zero distances.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # Critical value from the chi-square distribution.
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    # NOTE(review): chi2.ppf is a limit for the *squared* distance while
    # mahalanobis_dist holds its square root; kept unchanged because callers
    # may be tuned to it - confirm whether np.sqrt(maha_lim) was intended.
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim
    return mahalanobis_dist, outliers, maha_lim
def _h_getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate robust Mahalanobis distances of the rows of ``dat``.

    A ``MinCovDet`` estimator is fitted on ``dat[good_rows]`` and the square
    root of its ``mahalanobis`` output is computed for every NaN-free row of
    ``dat``; rows containing NaN get a distance of NaN (and are therefore
    never flagged).

    Parameters:
        dat: 2-D float array, one sample per row.
        critical_alpha: tail probability used for the chi-square limit.
        good_rows: rows used for fitting (mask or indices). When empty, all
            rows without any NaN are used.

    Returns:
        (mahalanobis_dist, outliers, maha_lim): the per-row distances, a
        boolean array ``mahalanobis_dist > maha_lim`` and the limit
        ``chi2.ppf(1 - critical_alpha, n_columns)``. If the estimator raises
        ``ValueError`` the distances are all zero.

    Raises:
        AssertionError: if caller-supplied ``good_rows`` select rows that
            contain NaN.
    '''
    # Rows usable for distance computation: no NaN in any column. The
    # previous ``np.any(~isnan)`` default let partially-NaN rows through and
    # tripped the assertion below.
    finite_rows = ~np.any(np.isnan(dat), axis=1)
    if good_rows.size == 0:
        good_rows = finite_rows

    try:
        dat2fit = dat[good_rows]
        assert not np.any(np.isnan(dat2fit))
        robust_cov = MinCovDet().fit(dat2fit)
        mahalanobis_dist = np.full(dat.shape[0], np.nan)
        if finite_rows.any():
            # Only NaN-free rows are passed on, so a single bad row no longer
            # sends every sample to the all-zero fallback below.
            mahalanobis_dist[finite_rows] = np.sqrt(
                robust_cov.mahalanobis(dat[finite_rows]))
    except ValueError:
        # The fit fails when the covariance matrix is singular, e.g. when the
        # data is not a unimodal symmetric distribution (too many small noisy
        # particles). Take the safe option and return zero distances.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # Critical value from the chi-square distribution.
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    # NOTE(review): chi2.ppf is a limit for the *squared* distance while
    # mahalanobis_dist holds its square root; kept unchanged because callers
    # may be tuned to it - confirm whether np.sqrt(maha_lim) was intended.
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim
    return mahalanobis_dist, outliers, maha_lim
class ActionDetector(object):
    """Publish whether the robot is in action or not, by MT method.

    A ``MinCovDet`` model is fitted on stored "no action" spectra; each
    incoming spectrogram is averaged over time and the value returned by the
    model's ``mahalanobis`` method is compared with ``~anormal_threshold``.

    NOTE
    Before action detection starts, some waiting time is required to fit the
    covariance model. Detection reacts a little late because the spectrum is
    the mean of the whole spectrogram, not its right edge.
    """

    def __init__(self):
        """Load the noise data, fit the model and set up ROS pub/sub."""
        # Config for loading the no-action spectrum (noise data).
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(
            rospack.get_path('decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)

        # Keep about 100 samples. Floor division is required: a float slice
        # step (the result of ``/`` on Python 3) raises TypeError.
        divide = max(1, len(no_action_data) // 100)
        no_action_data = no_action_data[::divide]

        # Model used to decide "in action" via Mahalanobis distance.
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)

    def cb(self, msg):
        """Spectrogram callback: publish whether the robot is in action.

        The spectrogram is averaged along axis 1, scored with the fitted
        model and compared against ``self.anormal_threshold``; the result is
        stored in ``self.in_action``, logged and published as ``Bool``.
        """
        # spectrogram.shape is (height, width) = (spectrum, time)
        spectrogram = self.bridge.imgmsg_to_cv2(msg)
        self.current_spectrum = np.average(spectrogram, axis=1)

        # ``[None]`` adds a leading axis so the estimator sees one sample.
        spectrum = self.current_spectrum[None]
        dist = self.mcd.mahalanobis(spectrum)[0]
        info_message = '(mahalanobis distance, threshold) = ({}, {})'.format(
            dist, self.anormal_threshold)
        if dist < self.anormal_threshold:
            self.in_action = False
            rospy.loginfo('No action\n' + info_message + '\n')
        else:
            self.in_action = True
            rospy.loginfo('### In action ###\n' + info_message + '\n')
        pub_msg = Bool(data=self.in_action)
        self.pub.publish(pub_msg)
def mahalanobis_calculate(data, num_pcs):
    """Return robust Mahalanobis values of ``data`` in PCA score space.

    ``data`` is projected with ``PCA(num_pcs)``; a ``MinCovDet`` estimator is
    fitted on the scores and the result of its ``mahalanobis`` method for the
    same scores is returned.
    """
    scores = PCA(num_pcs).fit_transform(data)
    estimator = MinCovDet().fit(scores)
    return estimator.mahalanobis(scores)
def as7262_outliers(data, scatter_correction=None):
    """Explore outliers of the AS7262 channels via robust Mahalanobis values.

    Selects the ``as7262_wavelengths`` columns of ``data``, optionally applies
    a scatter correction ("SNV" or "MSC" from ``processing``), fits
    ``MinCovDet`` and z-scores the values returned by ``mahalanobis``. The
    function only prints diagnostics and draws plots; it returns ``None``.
    """
    data_columns = data[as7262_wavelengths]
    print(data_columns)
    plt.show()
    if scatter_correction == "SNV":
        data_columns = processing.snv(data_columns)
    elif scatter_correction == "MSC":
        data_columns, _ = processing.msc(data_columns)
    # Robust covariance fit and per-row Mahalanobis values.
    robust_cov = MinCovDet().fit(data_columns)
    mahal_dist = robust_cov.mahalanobis(data_columns)
    print(mahal_dist)
    # NOTE(review): the result of this call is discarded.
    zscore(data_columns)
    print('+++++')
    mean = np.mean(mahal_dist)
    # Despite the name, ``std`` holds three standard deviations.
    std = 3*np.std(mahal_dist)
    print(mean, std)
    print(mean - std, mean + std)
    # z-score of the Mahalanobis values themselves.
    zscore_mahal = (mahal_dist - mean) / np.std(mahal_dist)
    print('pppp')
    print(data_columns)
    print(zscore_mahal.argmax())
    # NOTE(review): the index of all rows with z > 3 is computed and then
    # immediately overwritten by the label of the single most extreme row;
    # only that one row is highlighted below - confirm which was intended.
    outliers = data_columns.loc[zscore_mahal > 3].index
    outliers = data_columns.iloc[zscore_mahal.argmax()].name
    rows = data_columns.loc[outliers]
    print(data_columns.shape)
    print(rows)
    # Histogram of the z-scored values.
    n, bins, _ = plt.hist(zscore_mahal, bins=40)
    plt.show()
    # All spectra in black, the selected row in red, the mean in blue.
    plt.plot(data_columns.T, c="black")
    plt.plot(rows.T, c="red")
    plt.plot(data_columns.mean(), c="blue", lw=4)
    plt.show()
def mahalanobisDistances(dm):
    """Plot empirical vs. robust Mahalanobis contours of ``dm`` in 2-D.

    ``dm`` is reduced to two PCA components. ``EmpiricalCovariance`` and
    ``MinCovDet`` are fitted on the reduced data and the square roots of
    their ``mahalanobis`` outputs are drawn as dashed / dotted contours over
    a scatter plot of the points. Two empty lower panels are created as in
    the original layout. The figure is shown; nothing is returned.
    """
    reduced_data = PCA(n_components=2).fit_transform(dm)
    robust_cov = MinCovDet().fit(reduced_data)
    emp_cov = EmpiricalCovariance().fit(reduced_data)

    plt.figure()
    plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

    # Top panel: the data set itself.
    subfig1 = plt.subplot(3, 1, 1)
    subfig1.scatter(reduced_data[:, 0], reduced_data[:, 1],
                    color='black', label='inliers')
    subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
    subfig1.set_title("Mahalanobis distances of a contaminated data set:")

    # Evaluate both distance functions on a grid spanning the axes.
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                         np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                    cmap=plt.cm.PuBu_r, linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                    cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    plt.xticks(())
    plt.yticks(())

    # The per-point score arrays that used to be computed here were never
    # plotted, and they subtracted the location before calling
    # ``mahalanobis`` (which already centres its input). That dead code was
    # removed; the two lower panels are kept as empty placeholders.
    plt.subplot(2, 2, 3)
    plt.yticks(())
    plt.subplot(2, 2, 4)
    plt.yticks(())

    plt.show()
def mahal_plot(e):
    """Plot lag-1 pairs of ``e`` with empirical and robust distance contours.

    The pairs ``(e[1:-1], e[2:])`` form a two-column data set on which
    ``MinCovDet`` and ``EmpiricalCovariance`` are fitted. The square roots of
    their ``mahalanobis`` outputs are drawn as contours over a scatter plot,
    and the correlation matrix of the two columns is printed.

    Returns:
        tuple: ``(robust_cov, emp_cov)``, the two fitted estimators.
    """
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.transpose(np.array([first_half, second_half]))

    # Fit a Minimum Covariance Determinant (MCD) robust estimator and the
    # ordinary empirical covariance for comparison.
    robust_cov = MinCovDet().fit(X)
    emp_cov = EmpiricalCovariance().fit(X)

    plt.figure()

    # Show the data set.
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half, second_half, color='black',
                                  label='daily diff in homes passed')
    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions.
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r,
                                      linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    # ``color``/``linewidth`` are not contour keywords (and a single colour
    # cannot be combined with ``cmap``); the valid ``linewidths`` is used.
    robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r, linewidths=3)

    # ``ContourSet.collections`` was removed from matplotlib;
    # ``legend_elements`` provides one legend handle per contour level.
    emp_handles, _ = emp_cov_contour.legend_elements()
    robust_handles, _ = robust_contour.legend_elements()
    subfig1.legend([emp_handles[1], robust_handles[1], inlier_plot],
                   ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right", borderaxespad=0)

    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)
def get_outliers(X, chi2thr=0.975, plot=False, figurename=None):
    """Detect outliers by robust Mahalanobis distance.

    ``MinCovDet(random_state=100)`` is fitted on ``X``; a sample is an
    outlier when its value from ``mahalanobis`` exceeds the ``chi2thr``
    quantile of a chi-square distribution with ``X.shape[1]`` degrees of
    freedom. ``plot`` and ``figurename`` are accepted but not used.

    Returns:
        (outlierpercent, y_pred, MD): the fraction of flagged samples (a
        ratio, not a percentage), the boolean outlier mask and the
        Mahalanobis values.
    """
    MD = MinCovDet(random_state=100).fit(X).mahalanobis(X)
    cutoff = stats.chi2.ppf(chi2thr, X.shape[1])
    y_pred = MD > cutoff
    outlierpercent = sum(y_pred) / float(len(MD))
    return outlierpercent, y_pred, MD
def l_ratio(X, labels):
    '''
    Measure how well each cluster is separated from the remaining points.

    For every cluster a robust covariance (MinCovDet) is fitted on its
    members, the Mahalanobis values of all points outside the cluster are
    computed, and their chi-square tail probabilities are summed and divided
    by the cluster size.

    ATTENTION: the covariance matrix is estimated with the robust covariance
    (outliers not taken into account).

    Parameters
    ----------
    X : ndarray
        Data (assumed to be multivariate normal distributed)
    labels : ndarray
        Labels; the label -1 is not treated as a cluster.

    Returns
    -------
    lr : list, size(number of clusters)
        L-ratio for each cluster, or the scalar -1 when there is no cluster
        label other than -1.
    '''
    cluster_ids = set(labels).difference([-1])
    if len(cluster_ids) == 0:
        # No real cluster: signal it with -1 instead of a list.
        return -1

    # Degrees of freedom = number of features.
    n_dims = len(X[0])

    ratios = list()
    for cluster_id in cluster_ids:
        members = X[(labels == cluster_id)]
        non_members = X[(labels != cluster_id)]
        robust_model = MinCovDet().fit(members)
        distances = robust_model.mahalanobis(non_members)
        tail_mass = np.sum(1 - chi2.cdf(distances, n_dims))
        ratios.append(tail_mass / len(members))
    return ratios
def RejectOutliers(data, threshold=3):
    """Split sample indices into inliers and outliers by robust distance.

    A ``MinCovDet`` estimator is fitted on ``data``; samples whose value
    from ``mahalanobis`` is ``>= threshold`` are outliers, the ones with a
    value ``< threshold`` are inliers.

    NOTE(review): scikit-learn documents ``mahalanobis`` as returning
    squared distances, so ``threshold`` is compared against squared values -
    confirm this is intended.

    Returns:
        (inliers, outliers): two integer index arrays.
    """
    from sklearn.covariance import MinCovDet

    estimator = MinCovDet()
    estimator.fit(data)
    distances = estimator.mahalanobis(data)
    is_outlier = distances >= threshold
    is_inlier = distances < threshold
    return np.where(is_inlier)[0], np.where(is_outlier)[0]
def __init__(self, lab_coords_x, lab_coords_y, data, i_panel, delta_scalar, params, verbose=False):
    """Filter observations of one panel with a robust (MCD) covariance model.

    Builds a training set of ``(x, y, data)`` triples, fits an empirical and
    a robust covariance on it, and keeps the observations selected by the
    ``params.residuals.mcd_filter`` settings. The selected values are stored
    on ``self`` together with the input/output counts and both estimators.
    """
    # Training set: drop points farther than 10 * delta_scalar from the mean
    # in x or y, and points whose |data| exceeds 1.
    training_data = []
    mean_x = flex.mean(lab_coords_x)
    mean_y = flex.mean(lab_coords_y)
    limit=delta_scalar * 10
    for ix in range(len(data)):
        if abs(lab_coords_x[ix] - mean_x) > limit: continue
        if abs(lab_coords_y[ix] - mean_y) > limit: continue
        if abs(data[ix])>1: continue
        training_data.append((lab_coords_x[ix],lab_coords_y[ix],data[ix]))
    if verbose: print("Training data is less",len(lab_coords_x) - len(training_data),end=" ")
    # Full (unfiltered) set of triples; every one of these gets a distance.
    colorcode_set = []
    for ix in range(len(data)):
        colorcode_set.append((lab_coords_x[ix],lab_coords_y[ix],data[ix]))

    from sklearn.covariance import EmpiricalCovariance, MinCovDet
    # Ordinary covariance estimate, kept for comparison with the robust one.
    emp_cov = EmpiricalCovariance(assume_centered=False, store_precision=True).fit(X=training_data)
    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data.
    robust_cov = MinCovDet(assume_centered=False, store_precision=True).fit(X=training_data)

    features = ["Δx","Δy","ΔΨ(deg)"]
    if verbose:
        print("%3d"%i_panel,end=" ")
        print("%4d items "%(len(training_data),),end=" ")
    # Report location +/- standard deviation per feature (empirical fit).
    for idx_report in range(len(features)):
        feature = features[idx_report]
        diag_elem = math.sqrt(emp_cov.covariance_[idx_report,idx_report])
        if verbose: print( "%s=%7.2f±%6.2f"%(feature, emp_cov.location_[idx_report], diag_elem),end=" ")

    if verbose: print("%4d items:"%(flex.bool(robust_cov.support_).count(True)),end=" ")
    # Same report for the robust fit.
    for idx_report in range(len(features)):
        feature = features[idx_report]
        diag_elem = math.sqrt(robust_cov.covariance_[idx_report,idx_report])
        if verbose: print( "%s=%7.2f±%6.2f"%(feature, robust_cov.location_[idx_report], diag_elem),end=" ")

    disc = flex.double(robust_cov.mahalanobis(X=colorcode_set)) # this metric represents mahalanobis ** 2
    # Compare against the squared cut-off because ``disc`` is squared.
    disc_select = disc < (params.residuals.mcd_filter.mahalanobis_distance)**2
    if params.residuals.mcd_filter.keep == "outliers":
        # Invert the selection to keep the outliers instead.
        disc_select = (disc_select==False)
    # NOTE(review): the selection covers all observations but the percentage
    # is taken relative to the training-set size - confirm the denominator.
    if verbose: print("OK %4.1f%%"%(100*(disc_select.count(True))/len(training_data)))
    self.lab_coords_x = lab_coords_x.select(disc_select)
    self.lab_coords_y = lab_coords_y.select(disc_select)
    self.data = data.select(disc_select)
    self.n_input = len(lab_coords_x)
    self.n_output = len(self.lab_coords_x)
    self.emp_cov = emp_cov
    self.rob_cov = robust_cov
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """Scatter the first two columns with empirical/robust distance contours.

    See
    http://scikit-learn.org/0.13/modules/outlier_detection.html#fitting-an-elliptic-envelop
    for details.

    Parameters:
        ctry: country key; used to load the data when ``df`` is not given
            and to look up the plot title in ``country_code``.
        df: data frame to plot; loaded via ``load_res(ctry, ...)`` if None.
        weighted: forwarded to ``load_res``.
        inliers: when true, the frame is first passed through
            ``get_inliers``.

    Returns:
        tuple: ``(fig, ax1, ctry)``.

    Raises:
        ValueError: if neither ``ctry`` nor ``df`` is supplied.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)

    X = df.values
    robust_cov = MinCovDet().fit(X)
    # Empirical (MLE) estimate for comparison with the robust one.
    emp_cov = EmpiricalCovariance().fit(X)

    # Display results.
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

    # Show the data set.
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    if ctry is not None:
        # Only a country can be looked up; a bare data frame has no title.
        ax1.set_title(country_code[ctry])

    # Show contours of the distance functions.
    xx, yy = np.meshgrid(
        np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1], 100),
        np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    # ``ContourSet.collections`` was removed from matplotlib;
    # ``legend_elements`` provides one legend handle per contour level.
    emp_handles, _ = emp_cov_contour.legend_elements()
    robust_handles, _ = robust_contour.legend_elements()
    ax1.legend([emp_handles[1], robust_handles[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
def calcurate_mahalabinos_distance(TCKname):
    """Return robust Mahalanobis values for the streamlines of ``TCKname``.

    Three tab-delimited files are read: ``<TCKname>-stat.txt`` (length),
    ``<TCKname>-tdi.txt`` (its reciprocal is used) and
    ``<TCKname>-cur.txt``. They are stacked as the columns of an
    ``(n_samples, 3)`` matrix on which ``MinCovDet`` is fitted.

    Returns:
        numpy.ndarray: the output of ``MinCovDet.mahalanobis`` for every row.
    """
    length = np.loadtxt(TCKname + "-stat.txt", delimiter='\t')
    tdi = np.reciprocal(np.loadtxt(TCKname + "-tdi.txt", delimiter='\t'))
    cur = np.loadtxt(TCKname + "-cur.txt", delimiter='\t')

    # One column per feature; reshape raises if the files disagree in length.
    n_samples = length.shape[0]
    columns = [np.reshape(col, (n_samples, 1)) for col in (length, tdi, cur)]
    X = np.hstack(columns)

    robust_cov = MinCovDet().fit(X)
    # ``mahalanobis`` centres its input on ``location_`` itself; subtracting
    # the location beforehand (as was done previously) shifts every sample
    # twice and yields wrong distances.
    return robust_cov.mahalanobis(X)
def ComputeMahalanobisDistance(data):
    """Compute robust Mahalanobis distances and return them as a DataFrame.

    Parameters:
        data (DataFrame): Pandas DataFrame, one sample per row.

    Returns:
        DataFrame: a single column ``'distance'`` holding the square root of
        ``MinCovDet.mahalanobis(data)``, indexed by the index values of
        ``data``.
    """
    rob_cov = MinCovDet().fit(data)
    distances = np.sqrt(rob_cov.mahalanobis(data))
    # Column labels are given as a list: a set literal is unordered and is
    # not a sequence type pandas documents for ``columns``.
    return pd.DataFrame(data=distances, columns=['distance'],
                        index=data.index.values)
def pre_screen(self, var, disp, thresh=10):
    """Uses Minimum Covariance Determinant / Mahalanobis distance ideas to
    detect outliers, loosely based on :cite:`chawla_k-means:_2013`.

    Each column of ``var`` is summarised by its mean and standard deviation;
    ``MinCovDet`` is fitted on these two features and columns whose
    ``mahalanobis`` value exceeds ``thresh`` are outlier candidates. Of
    those, the ones listed in ``disp`` (or matching the positions in
    ``{0, var.shape[1]}``) are removed.

    Side effects: sets ``self.dispensable`` to the entries of ``disp`` that
    were not removed and prints the ``'file'`` level of every removed column.

    Returns:
        ``var`` without the removed columns.
    """
    fx = var.columns.names.index('file')
    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects positional
    # arguments after ``objs``.
    feat = pd.concat((var.mean(), var.std()), axis=1)
    mcd = MinCovDet().fit(feat)
    md = mcd.mahalanobis(feat)
    s = set(np.where(md > thresh)[0])
    # NOTE(review): positions in ``s`` range over 0..var.shape[1]-1, so
    # ``var.shape[1]`` can never match; ``var.shape[1] - 1`` (the last
    # column) may have been intended - confirm before changing.
    k = s.intersection(disp).union(s.intersection({0, var.shape[1]}))
    self.dispensable = list(set(disp) - k)
    if k:
        print(
            '\n\nThe following files have been removed from the concatenation as unnecessary outliers:\n'
        )
        for i in k:
            print(var.columns[i][fx])
    return var.drop(var.columns[list(k)], axis=1)
def MahalanobisOutliers(featMatProjected, extremeness=2., showplot=True):
    """Return indices of outlier rows based on robust Mahalanobis distance.

    A ``MinCovDet`` estimator is fitted on the leading principal components
    (up to 10, fewer if the input has fewer columns). A row is an outlier
    when its value from ``mahalanobis`` is at least
    ``mean + extremeness * std`` of all values
    [extreme_values=2, very_extreme_values=3 --> 68-95-99.7 rule].

    Parameters:
        featMatProjected: 2-D array of PCA scores, one row per sample.
        extremeness: multiplier of the standard deviation in the threshold.
        showplot: when true, draw PC1 vs PC2 coloured by distance (requires
            at least two columns).

    Returns:
        numpy.ndarray of integer row indices flagged as outliers.
    """
    # Euclidean distance over-weights correlated variables, and ordinary
    # covariance estimates are themselves skewed by the outliers we want to
    # find; a robust (MCD) covariance estimate avoids both problems.
    n_pcs = min(10, featMatProjected.shape[1])
    pcs = featMatProjected[:, :n_pcs]
    robust_cov = MinCovDet().fit(pcs)
    MahalanobisDist = robust_cov.mahalanobis(pcs)

    # Colour PCA by Mahalanobis distance.
    if showplot:
        plt.close('all')
        plt.rc('xtick', labelsize=15)
        plt.rc('ytick', labelsize=15)
        fig, ax = plt.subplots(figsize=[10, 10])
        ax.set_facecolor('white')
        # Plot this call's own scores; the previous code read an undefined
        # ``projectedTable`` here.
        plt.scatter(pcs[:, 0], pcs[:, 1], c=MahalanobisDist)
        plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
        plt.colorbar()

    upper_t = np.mean(MahalanobisDist) + np.std(MahalanobisDist) * extremeness
    outliers = np.where(MahalanobisDist >= upper_t)[0]
    print("Outliers found: %d" % len(outliers))
    return outliers
def MahalanobisOutliers(featMatProjected, extremeness=2., showplot=False):
    """Return indices of outlier rows based on robust Mahalanobis distance.

    A ``MinCovDet`` estimator is fitted on the leading principal components
    (up to 10, fewer if the input has fewer columns). A row is an outlier
    when its value from ``mahalanobis`` is at least
    ``mean + extremeness * std`` of all values
    [extreme_values=2, very_extreme_values=3 --> 68-95-99.7 rule].

    Parameters:
        featMatProjected: 2-D array of PCA scores, one row per sample.
        extremeness: multiplier of the standard deviation in the threshold.
        showplot: when true, draw PC1 vs PC2 coloured by distance (requires
            at least two columns).

    Returns:
        numpy.ndarray of integer row indices flagged as outliers.
    """
    # Euclidean distance over-weights correlated variables, and ordinary
    # covariance estimates are themselves skewed by the outliers we want to
    # find; a robust (MCD) covariance estimate avoids both problems.
    # Using at most the available columns avoids the shape error the fixed
    # 10-column table raised for narrower inputs.
    n_pcs = min(10, featMatProjected.shape[1])
    pcs = featMatProjected[:, :n_pcs]
    robust_cov = MinCovDet().fit(pcs)
    MahalanobisDist = robust_cov.mahalanobis(pcs)

    # Colour PCA by Mahalanobis distance.
    if showplot:
        plt.close('all')
        plt.rc('xtick', labelsize=15)
        plt.rc('ytick', labelsize=15)
        fig, ax = plt.subplots(figsize=[10, 10])
        ax.set_facecolor('#F7FFFF')
        plt.scatter(pcs[:, 0], pcs[:, 1], c=MahalanobisDist)
        plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
        plt.colorbar()

    upper_t = np.mean(MahalanobisDist) + np.std(MahalanobisDist) * extremeness
    outliers = np.where(MahalanobisDist >= upper_t)[0]
    print("Outliers found: %d" % len(outliers))
    return outliers
def outlier_measure(X, method="robust_covar"):
    """Score the samples of ``X`` with one of three outlier detectors.

    Parameters:
        X: 2-D array-like of samples.
        method: ``"robust_covar"`` (square root of the MinCovDet Mahalanobis
            output, offset fixed at 3), ``"isolation_forest"`` or
            ``"local_outlier_detection"`` (negated scores and negated
            ``offset_`` of the fitted estimator).

    Returns:
        (measure, offset, assignment): the per-sample measure, the cut-off
        it is compared with, and an array holding 1 where
        ``measure < offset`` and 0 elsewhere.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method == "robust_covar":
        robust_cov = MinCovDet().fit(X)
        measure = np.sqrt(robust_cov.mahalanobis(X))
        offset = 3
    elif method == "isolation_forest":
        try:
            # Older scikit-learn releases need this flag for the new
            # decision-function behaviour ...
            clf = IsolationForest(behaviour="new", contamination="auto")
        except TypeError:
            # ... newer releases removed the parameter altogether.
            clf = IsolationForest(contamination="auto")
        clf.fit(X)
        measure = -clf.score_samples(X)
        offset = -clf.offset_
    elif method == "local_outlier_detection":
        clf = LOF(contamination="auto")
        clf.fit_predict(X)
        measure = -clf.negative_outlier_factor_
        offset = -clf.offset_
    else:
        # Previously an unknown name fell through to an unbound local.
        raise ValueError("unknown outlier method: {!r}".format(method))

    assignment = np.where(measure < offset, 1, 0)
    return measure, offset, assignment
def estimateGaussian(nb_objects_init, nb_objects_final, thr, who, genes, siRNA,
                     loadingFolder='../resultData/thrivisions/predictions',
                     threshold=0.05,):
    """Score experiments against the control distribution and call hits.

    The features ``(thr, nb_objects_init, nb_objects_final)`` are stacked
    per experiment; siRNAs with a single experiment are removed. A
    ``MinCovDet`` model is fitted on the rows whose gene is ``'ctrl'`` and
    each row's ``mahalanobis`` value is signed by whether its first feature
    lies above or below the mean. Empirical p/q-values of the positive
    non-control rows are obtained from ``empiricalPvalues``; an siRNA is a
    hit when at least half of its experiments have ``qval < threshold``.

    Returns:
        (robdist, pval, qval, hits, gene_hits): signed distances, p- and
        q-values, the list of hit siRNAs and a ``Counter`` of genes of the
        hit siRNAs.
    """
    arr = np.vstack((thr, nb_objects_init, nb_objects_final)).T

    # Delete siRNAs that have only one experiment.
    # print() with one argument behaves identically on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    print(len(siRNA))
    all_ = Counter(siRNA)
    siRNA = np.array(siRNA)
    toDelsi = [si for si in all_ if all_[si] == 1]
    toDelInd = []
    for si in toDelsi:
        toDelInd.extend(np.where(siRNA == si)[0])
    print(len(toDelInd))

    arr, who, genes, siRNA = [np.delete(values, toDelInd, 0)
                              for values in (arr, who, genes, siRNA)]
    print(arr.shape)

    arr_ctrl = arr[np.where(np.array(genes) == 'ctrl')]
    ctrlcov = MinCovDet().fit(arr_ctrl)
    # Sign the value by the side of the mean of the first feature.
    robdist = ctrlcov.mahalanobis(arr) * np.sign(arr[:, 0] - np.mean(arr[:, 0]))

    positive_non_ctrl = np.where((genes != 'ctrl') & (robdist > 0))
    new_siRNA = np.array(siRNA)[positive_non_ctrl]
    pval, qval = empiricalPvalues(
        np.absolute(robdist[np.where(genes == 'ctrl')])[:, np.newaxis],
        robdist[positive_non_ctrl][:, np.newaxis],
        folder=loadingFolder, name="thrivision", sup=True, also_pval=True)
    assert new_siRNA.shape == qval.shape

    hit_counts = Counter(new_siRNA[np.where(qval < threshold)[0]])
    # A list comprehension instead of filter(): on Python 3 filter is lazy
    # and its lambda would have looked up the already rebound name.
    hits = [si for si in hit_counts
            if float(hit_counts[si]) / all_[si] >= 0.5]

    siRNA_list = list(siRNA)
    gene_hits = Counter(genes[siRNA_list.index(el)] for el in hits)
    return robdist, pval, qval, hits, gene_hits
color='red', label='outliers') subfig1.set_xlim(subfig1.get_xlim()[0], 11.) subfig1.set_title("Mahalanobis distances of a contaminated data set:") # Show contours of the distance functions xx, yy = np.meshgrid(np.linspace(pl.xlim()[0], pl.xlim()[1], 100), np.linspace(pl.ylim()[0], pl.ylim()[1], 100)) zz = np.c_[xx.ravel(), yy.ravel()] mahal_emp_cov = emp_cov.mahalanobis(zz) mahal_emp_cov = mahal_emp_cov.reshape(xx.shape) emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov), cmap=pl.cm.PuBu_r, linestyles='dashed') mahal_robust_cov = robust_cov.mahalanobis(zz) mahal_robust_cov = mahal_robust_cov.reshape(xx.shape) robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov), cmap=pl.cm.YlOrBr_r, linestyles='dotted') subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1], inlier_plot, outlier_plot], ['MLE dist', 'robust dist', 'inliers', 'outliers'], loc="upper right", borderaxespad=0) pl.xticks(()) pl.yticks(()) # Plot the scores for each point emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33) subfig2 = pl.subplot(2, 2, 3) subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
color='red', label='outliers') subfig1.set_xlim(subfig1.get_xlim()[0], 11.) subfig1.set_title("Mahalanobis distances of a contaminated data set:") # Show contours of the distance functions xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100), np.linspace(plt.ylim()[0], plt.ylim()[1], 100)) zz = np.c_[xx.ravel(), yy.ravel()] mahal_emp_cov = emp_cov.mahalanobis(zz) mahal_emp_cov = mahal_emp_cov.reshape(xx.shape) emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov), cmap=plt.cm.PuBu_r, linestyles='dashed') mahal_robust_cov = robust_cov.mahalanobis(zz) mahal_robust_cov = mahal_robust_cov.reshape(xx.shape) robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, linestyles='dotted') subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1], inlier_plot, outlier_plot], ['MLE dist', 'robust dist', 'inliers', 'outliers'], loc="upper right", borderaxespad=0) plt.xticks(()) plt.yticks(()) # Plot the scores for each point # emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33) # subfig2 = plt.subplot(2, 2, 3) # subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
def _plot_scores(points, heights, colors, plotdims, zlabel, title):
    """Draw one figure of per-point scores.

    With ``plotdims == 2`` the points are drawn as a scatter plot coloured by
    ``heights``. Otherwise ``heights`` is drawn as a triangulated 3-D surface
    over the points, coloured by ``colors``, and the z axis is labelled with
    ``zlabel``. The x/y labels and the title are set in both cases.
    """
    fig = plt.figure()
    if plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        # The line width is a number, not the string '0'.
        ax.scatter(points[:, 0], points[:, 1], cmap=plt.cm.jet, c=heights,
                   s=60, linewidth=0)
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(points[:, 0], points[:, 1], heights,
                        cmap=plt.cm.jet, color=colors)
        ax.set_zlabel(zlabel)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(title)


def main():
    """Compare Mahalanobis distances with autoencoder reconstruction error.

    Reads a CSV dataset named on the command line, reduces it to two
    components with PCA when it has more than two columns, and then plots,
    either for the dataset itself (``--plot train``) or for a 50 x 50 grid
    spanning 1.5x its value range (``--plot grid``):

    * the Mahalanobis distance under a ``MinCovDet`` fit,
    * the Mahalanobis distance under an ``EmpiricalCovariance`` fit,
    * the reconstruction error of one autoencoder per encoder/decoder
      activation pair in ``enc_dec``.

    Pearson correlations between the distances and the errors are printed.
    The dataset is saved next to this script as ``outliers.npy`` for
    ``train_autoencoder`` and removed again, together with ``outliers.pkl``,
    before the final blocking ``plt.show``.
    """
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument('dataset', type=argparse.FileType('r'),
                        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument('--plotdims', type=int, choices=[2, 3], default=2,
                        help='the number of dimensions to plot')
    args = parser.parse_args()

    # argparse.FileType leaves the handle open; close it once it is read.
    with args.dataset as dataset_file:
        X = np.loadtxt(dataset_file, delimiter=',')

    plt.figure()
    if X.shape[1] > 2:
        X = PCA(n_components=2).fit_transform(X)
    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth=0)
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    dir_of_script = os.path.dirname(os.path.realpath(__file__))
    dataset_path = os.path.join(dir_of_script, 'outliers.npy')
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence.  Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    # Extend the plotted span by half the data range on either side.
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array(list(product(seq, seq)))
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    # Rescale both distance vectors to [0, 1] for plotting.
    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    _plot_scores(Xplot2d, robust_md01, robust_md01, args.plotdims,
                 'Mahalanobis distance',
                 'Mahalanobis distance (robust covariance)')
    _plot_scores(Xplot2d, empirical_md01, empirical_md01, args.plotdims,
                 'Mahalanobis distance',
                 'Mahalanobis distance (empirical covariance)')

    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs don't seem to match Mahalanobis
        # distance very well.  Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]

    for act in enc_dec:
        enc, dec = act
        # train_autoencoder is given None for a linear decoder.
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path, act_enc=enc, act_dec=dec,
                                  nvis=X.shape[1], nhid=16)

        Xshared = theano.shared(np.asarray(Xplot, dtype=theano.config.floatX),
                                borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)

        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)

        encdec_type = ', '.join(act)
        # The raw error gives the 2-D colours and the 3-D height; the
        # rescaled error is used only as the 3-D surface colour.
        _plot_scores(Xplot2d, error, error01, args.plotdims,
                     'Reconstruction error',
                     'Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
          str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')

    plt.show(block=True)
# Overlay the iso-distance contours of both covariance estimates.
# The evaluation grid spans the current axis limits, 100 samples per axis.
x_min, x_max = plt.xlim()
y_min, y_max = plt.ylim()
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
# One (x, y) row per grid node, the layout mahalanobis() is called with.
zz = np.column_stack((xx.ravel(), yy.ravel()))

# Empirical estimate: dashed contours of the square-rooted distances.
mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

# Robust estimate: dotted contours of the square-rooted distances.
mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r,
                                 linestyles='dotted')

# One contour line from each set stands in for that set in the legend.
legend_handles = [
    emp_cov_contour.collections[1],
    robust_contour.collections[1],
    inlier_plot,
    outlier_plot,
]
legend_labels = ['MLE dist', 'robust dist', 'inliers', 'outliers']
subfig1.legend(legend_handles, legend_labels,
               loc="upper right", borderaxespad=0)

# Remove the tick marks on both axes.
plt.xticks(())
plt.yticks(())
class Grid:
    """Noisy 2-D point grid whose angle and spacing are estimated robustly.

    The points are produced by ``create_grid``.  ``analyze`` describes each
    point by the vector to the point returned by ``closest_point``, fits a
    ``MinCovDet`` estimator to those vectors and derives from it:

    * ``theta`` and ``step_size``: the grid angle and spacing,
    * ``inlier_points`` / ``inlier_indicies``: points whose score is below
      the tolerance, and their indices into ``points``,
    * ``normalized_points``: inliers rotated by ``-theta``, divided by
      ``step_size``, shifted to an origin point and snapped to integers,
    * ``bounds``: ``[min_x, min_y, max_x, max_y]`` of the snapped points.
    """

    def __init__(self, dim=10, noise=0.1, outliers=0):
        """Create the points and allocate the result buffers.

        ``dim``, ``noise`` and ``outliers`` are passed to ``create_grid``
        unchanged.
        """
        self.points = create_grid(dim, noise, outliers)
        n_points = len(self.points)
        self.polardata = np.zeros((n_points, 2))
        self.polar_cov = 0
        self.inlier_points = np.zeros((n_points, 2))
        self.inlier_indicies = np.zeros((n_points, 1))
        self.normalized_points = np.zeros((n_points, 2))
        self.theta = 0
        self.step_size = 1
        # Three rows per point: the point, its neighbour and a separator row.
        self.linedata = np.zeros((3 * n_points, 2))
        self.normalized_point_ids = []
        self.bounds = [0, 0, 0, 0]

    def step(self, rotation=0):
        """Replace ``points`` with ``rotate(points, rotation)``."""
        self.points = rotate(self.points, rotation)

    def analyze(self, mahalanobis_tolerance=2):
        """Estimate grid angle and spacing, then snap inliers to the grid.

        Parameters
        ----------
        mahalanobis_tolerance : number, default 2
            A point counts as an inlier when its ``mahalanobis(...) ** 0.33``
            score is strictly below this value.
        """
        n_points = len(self.points)
        self.inlier_points = np.zeros((n_points, 2))
        for id1 in range(n_points):
            # presumably the nearest other point, with id1 excluded from the
            # search; verify against closest_point
            id2 = closest_point(self.points, self.points[id1], id1)[0]
            point = self.points[id1]
            neighbour = self.points[id2]

            # keep lines for plotting purposes
            self.linedata[3 * id1] = point
            self.linedata[3 * id1 + 1] = neighbour
            self.linedata[3 * id1 + 2] = [None, None]

            # we are repeating every pi/2, so we compress the angle space by 4x
            a = 4 * math.atan2(point[1] - neighbour[1],
                               point[0] - neighbour[0])
            r = np.linalg.norm(point - neighbour)
            self.polardata[id1] = [r * math.cos(a), r * math.sin(a)]

        # find the minimal covariance inlier cluster
        self.polar_cov = MinCovDet().fit(self.polardata)
        location = self.polar_cov.location_

        # extract the grid angle and size. angle is divided by 4 because
        # we previously scaled it up to repeat every 90 deg
        self.theta = math.atan2(-location[1], location[0]) / 4
        self.step_size = np.linalg.norm(location)

        # extract inlier points
        polar_mahal = self.polar_cov.mahalanobis(self.polardata) ** (0.33)
        inlier_count = 0
        for i, score in enumerate(polar_mahal):
            if score < mahalanobis_tolerance:  # stdev tolerance to outliers
                self.inlier_points[inlier_count] = self.points[i]
                self.inlier_indicies[inlier_count] = i
                inlier_count += 1

        self.normalized_points = rotate(self.inlier_points[:inlier_count],
                                        -self.theta) / self.step_size

        # enumerate grid IDs
        # NOTE(review): np.mean without an axis yields one scalar over both
        # coordinates; a per-axis centroid (axis=0) may have been intended.
        # Confirm against closest_point before changing.
        origin_id = closest_point(self.normalized_points,
                                  np.mean(self.normalized_points))[0]
        self.normalized_points = (self.normalized_points -
                                  self.normalized_points[origin_id])

        inlier_count = 0
        # sys.maxint exists only on Python 2; fall back to sys.maxsize so the
        # sentinel also works on Python 3.
        limit = getattr(sys, 'maxint', sys.maxsize)
        self.bounds = [limit, limit, -limit, -limit]
        for p in self.normalized_points:
            x = round(p[0])
            y = round(p[1])
            d = np.linalg.norm(p - [x, y])
            if d < 0.4:  # tolerance from unit position
                # Compact in place: kept points overwrite the leading rows.
                self.normalized_points[inlier_count] = [x, y]
                self.bounds[0] = min(self.bounds[0], x)
                self.bounds[2] = max(self.bounds[2], x)
                self.bounds[1] = min(self.bounds[1], y)
                self.bounds[3] = max(self.bounds[3], y)
                inlier_count += 1
        self.normalized_points = self.normalized_points[:inlier_count]
subfig1.set_title("Mahalanobis distances of a contaminated data set:") # Show contours of the distance functions xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800), np.linspace(plt.ylim()[0], plt.ylim()[1], 100)) zz = np.c_[xx.ravel(), yy.ravel()] mahal_emp_cov = emp_cov.mahalanobis(zz) mahal_emp_cov = mahal_emp_cov.reshape(xx.shape) emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov), cmap=plt.cm.PuBu_r, linestyles='dashed') mahal_robust_cov = robust_cov.mahalanobis(zz) mahal_robust_cov = mahal_robust_cov.reshape(xx.shape) robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, linestyles='dotted') subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1], inlier_plot], ['MLE dist', 'robust dist', 'kpis'], loc="upper right", borderaxespad=0) print(np.corrcoef(first_half,second_half)) #%% full_data = full_data.drop(['Year', 'Month', 'Day', 'Data Quality','Max Temp (°C)', 'Max Temp Flag', 'Min Temp (°C)', 'Min Temp Flag', 'Mean Temp Flag', 'Heat Deg Days Flag',
# Highlight the last n_outliers rows of X in red on the scatter panel.
subfig1.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")


def _score_panel(position, scores, title):
    """Draw inlier/outlier box plots of ``scores`` in a 2x2 subplot cell.

    The last ``n_outliers`` entries of ``scores`` form the outlier group and
    the rest the inlier group.  Each group is also drawn as '+' markers
    slightly to the right of its box.  Returns the subplot.
    """
    inlier_scores = scores[:-n_outliers]
    outlier_scores = scores[-n_outliers:]
    panel = pl.subplot(2, 2, position)
    panel.boxplot([inlier_scores, outlier_scores], widths=.25)
    panel.plot(1.26 * np.ones(n_samples - n_outliers), inlier_scores,
               '+k', markeredgewidth=1)
    panel.plot(2.26 * np.ones(n_outliers), outlier_scores,
               '+k', markeredgewidth=1)
    panel.axes.set_xticklabels(('inliers', 'outliers'), size=11)
    panel.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
    panel.set_title(title)
    return panel


# Lower-left panel: scores from the empirical covariance estimate.
emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = _score_panel(
    3, emp_mahal, "1. from non-robust estimates\n(Maximum Likelihood)")

# Lower-right panel: scores from the robust covariance estimate.
robust_mahal = robust_cov.mahalanobis(X) ** (0.33)
subfig3 = _score_panel(
    4, robust_mahal,
    "2. from robust estimates\n(Minimum Covariance Determinant)")

pl.show()
def _plot_scores(points, heights, colors, plotdims, zlabel, title):
    """Draw one figure of per-point scores.

    With ``plotdims == 2`` the points are drawn as a scatter plot coloured by
    ``heights``. Otherwise ``heights`` is drawn as a triangulated 3-D surface
    over the points, coloured by ``colors``, and the z axis is labelled with
    ``zlabel``. The x/y labels and the title are set in both cases.
    """
    fig = plt.figure()
    if plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        # The line width is a number, not the string '0'.
        ax.scatter(points[:, 0], points[:, 1], cmap=plt.cm.jet, c=heights,
                   s=60, linewidth=0)
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(points[:, 0], points[:, 1], heights,
                        cmap=plt.cm.jet, color=colors)
        ax.set_zlabel(zlabel)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(title)


def main():
    """Compare Mahalanobis distances with autoencoder reconstruction error.

    Reads a CSV dataset named on the command line, reduces it to two
    components with PCA when it has more than two columns, and then plots,
    either for the dataset itself (``--plot train``) or for a 50 x 50 grid
    spanning 1.5x its value range (``--plot grid``):

    * the Mahalanobis distance under a ``MinCovDet`` fit,
    * the Mahalanobis distance under an ``EmpiricalCovariance`` fit,
    * the reconstruction error of one autoencoder per encoder/decoder
      activation pair in ``enc_dec``.

    Pearson correlations between the distances and the errors are printed.
    The dataset is saved next to this script as ``outliers.npy`` for
    ``train_autoencoder`` and removed again, together with ``outliers.pkl``,
    before the final blocking ``plt.show``.
    """
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')
    args = parser.parse_args()

    # argparse.FileType leaves the handle open; close it once it is read.
    with args.dataset as dataset_file:
        X = np.loadtxt(dataset_file, delimiter=',')

    plt.figure()
    if X.shape[1] > 2:
        X = PCA(n_components=2).fit_transform(X)
    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth=0)
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    dir_of_script = os.path.dirname(os.path.realpath(__file__))
    dataset_path = os.path.join(dir_of_script, 'outliers.npy')
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence.  Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    # Extend the plotted span by half the data range on either side.
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array(list(product(seq, seq)))
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    # Rescale both distance vectors to [0, 1] for plotting.
    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    _plot_scores(Xplot2d, robust_md01, robust_md01, args.plotdims,
                 'Mahalanobis distance',
                 'Mahalanobis distance (robust covariance)')
    _plot_scores(Xplot2d, empirical_md01, empirical_md01, args.plotdims,
                 'Mahalanobis distance',
                 'Mahalanobis distance (empirical covariance)')

    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs don't seem to match Mahalanobis
        # distance very well.  Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]

    for act in enc_dec:
        enc, dec = act
        # train_autoencoder is given None for a linear decoder.
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path, act_enc=enc, act_dec=dec,
                                  nvis=X.shape[1], nhid=16)

        Xshared = theano.shared(
            np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)

        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)

        encdec_type = ', '.join(act)
        # The raw error gives the 2-D colours and the 3-D height; the
        # rescaled error is used only as the 3-D surface colour.
        _plot_scores(Xplot2d, error, error01, args.plotdims,
                     'Reconstruction error',
                     'Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
          str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')

    plt.show(block=True)
class Grid:
    """Noisy 2-D point grid whose angle and spacing are estimated robustly.

    The points are produced by ``create_grid``.  ``analyze`` describes each
    point by the vector to the point returned by ``closest_point``, fits a
    ``MinCovDet`` estimator to those vectors and derives from it:

    * ``theta`` and ``step_size``: the grid angle and spacing,
    * ``inlier_points`` / ``inlier_indicies``: points whose score is below
      the tolerance, and their indices into ``points``,
    * ``normalized_points``: inliers rotated by ``-theta``, divided by
      ``step_size``, shifted to an origin point and snapped to integers,
    * ``bounds``: ``[min_x, min_y, max_x, max_y]`` of the snapped points.
    """

    def __init__(self, dim=10, noise=0.1, outliers=0):
        """Create the points and allocate the result buffers.

        ``dim``, ``noise`` and ``outliers`` are passed to ``create_grid``
        unchanged.
        """
        self.points = create_grid(dim, noise, outliers)
        n_points = len(self.points)
        self.polardata = np.zeros((n_points, 2))
        self.polar_cov = 0
        self.inlier_points = np.zeros((n_points, 2))
        self.inlier_indicies = np.zeros((n_points, 1))
        self.normalized_points = np.zeros((n_points, 2))
        self.theta = 0
        self.step_size = 1
        # Three rows per point: the point, its neighbour and a separator row.
        self.linedata = np.zeros((3*n_points, 2))
        self.normalized_point_ids = []
        self.bounds = [0, 0, 0, 0]

    def step(self, rotation=0):
        """Replace ``points`` with ``rotate(points, rotation)``."""
        self.points = rotate(self.points, rotation)

    def analyze(self, mahalanobis_tolerance=2):
        """Estimate grid angle and spacing, then snap inliers to the grid.

        Parameters
        ----------
        mahalanobis_tolerance : number, default 2
            A point counts as an inlier when its ``mahalanobis(...) ** 0.33``
            score is strictly below this value.
        """
        n_points = len(self.points)
        self.inlier_points = np.zeros((n_points, 2))
        for id1 in range(n_points):
            # presumably the nearest other point, with id1 excluded from the
            # search; verify against closest_point
            id2 = closest_point(self.points, self.points[id1], id1)[0]
            point = self.points[id1]
            neighbour = self.points[id2]

            # keep lines for plotting purposes
            self.linedata[3*id1] = point
            self.linedata[3*id1+1] = neighbour
            self.linedata[3*id1+2] = [None, None]

            # we are repeating every pi/2, so we compress the angle space by 4x
            a = 4*math.atan2(point[1] - neighbour[1],
                             point[0] - neighbour[0])
            r = np.linalg.norm(point - neighbour)
            self.polardata[id1] = [r*math.cos(a), r*math.sin(a)]

        # find the minimal covariance inlier cluster
        self.polar_cov = MinCovDet().fit(self.polardata)
        location = self.polar_cov.location_

        # extract the grid angle and size. angle is divided by 4 because
        # we previously scaled it up to repeat every 90 deg
        self.theta = math.atan2(-location[1], location[0])/4
        self.step_size = np.linalg.norm(location)

        # extract inlier points
        polar_mahal = self.polar_cov.mahalanobis(self.polardata)**(0.33)
        inlier_count = 0
        for i, score in enumerate(polar_mahal):
            if score < mahalanobis_tolerance:  # stdev tolerance to outliers
                self.inlier_points[inlier_count] = self.points[i]
                self.inlier_indicies[inlier_count] = i
                inlier_count += 1

        self.normalized_points = rotate(self.inlier_points[:inlier_count],
                                        -self.theta)/self.step_size

        # enumerate grid IDs
        # NOTE(review): np.mean without an axis yields one scalar over both
        # coordinates; a per-axis centroid (axis=0) may have been intended.
        # Confirm against closest_point before changing.
        origin_id = closest_point(self.normalized_points,
                                  np.mean(self.normalized_points))[0]
        self.normalized_points = (self.normalized_points -
                                  self.normalized_points[origin_id])

        inlier_count = 0
        # sys.maxint exists only on Python 2; fall back to sys.maxsize so the
        # sentinel also works on Python 3.
        limit = getattr(sys, 'maxint', sys.maxsize)
        self.bounds = [limit, limit, -limit, -limit]
        for p in self.normalized_points:
            x = round(p[0])
            y = round(p[1])
            d = np.linalg.norm(p-[x, y])
            if d < 0.4:  # tolerance from unit position
                # Compact in place: kept points overwrite the leading rows.
                self.normalized_points[inlier_count] = [x, y]
                self.bounds[0] = min(self.bounds[0], x)
                self.bounds[2] = max(self.bounds[2], x)
                self.bounds[1] = min(self.bounds[1], y)
                self.bounds[3] = max(self.bounds[3], y)
                inlier_count += 1
        self.normalized_points = self.normalized_points[:inlier_count]
# Fit an OLS model of word_diff on Age with Centre_ID as a categorical
# term, restricted to the rows selected by `subset`, and print the summary.
lm2 = ols('word_diff ~ Age + C(Centre_ID)', data=clean_st,subset=subset).fit()
print(lm2.summary())

# <markdowncell>

# # Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:,['norm_diff','Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)

# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)
# Pearson correlation between the two retained columns; the result is a
# bare expression, so it is only displayed as the notebook cell output.
pearsonr(clean.iloc[:,0],clean.iloc[:,1])

# <codecell>

# Distance of every retained row from the robust fit, sorted in place in
# ascending order and displayed as the cell output.
# NOTE(review): scikit-learn documents mahalanobis() as returning squared
# distances; confirm for the installed version before thresholding on it.
d = mcd.mahalanobis(clean)
d.sort()
d
import numpy as np
from sklearn.covariance import MinCovDet
from heapq import nlargest
from load_data import data, norm_data, reformat

# Number of highest-scoring samples to report.
NUMBER_OF_ANOMALIES = 10

# Robust (Minimum Covariance Determinant) fit on the normalised data.
robust_cov = MinCovDet().fit(norm_data)

# Pair each distance with its row index so the index survives the selection.
mahal_robust_cov = enumerate(robust_cov.mahalanobis(norm_data))

# Keep the (index, distance) pairs with the largest distances, largest first.
anomalies = nlargest(
    NUMBER_OF_ANOMALIES,
    mahal_robust_cov,
    key=lambda indexed_distance: indexed_distance[1],
)
print(anomalies)
# Build one feature row per chosen item by concatenating the three lists
# returned by extractStats, and collect the rows in `ab`.
for k1 in choosen:
    a1, a2, a3 = extractStats(k1[2], fr)
    a1.extend(a2)
    a1.extend(a3)
    ab.append(a1)
rr = np.array(ab)
#print(dataNameList_f)
# Refit the estimator only when new data names were found.  The fit uses
# every row except the last nrAnalysis + 1.
if dataNameList_f:
    print('fitting ' + str(len(dataNameList_f)) + ' new data')
    mcd.fit(rr[:-1 * nrAnalysis - 1, :])
else:
    print('no new data')
# arn scores nrAnalysis rows near the end of rr (the final row is excluded
# by the slice); aro scores the rows used for fitting.
# NOTE(review): the rows are centred on mcd.location_ before the call.
# scikit-learn documents mahalanobis() as measuring from the fitted location
# itself, so this may centre twice; confirm against the installed version.
arn = mcd.mahalanobis(rr[-1 * nrAnalysis - 1:-1, :] - mcd.location_)**(0.33)
aro = mcd.mahalanobis(rr[:-1 * nrAnalysis - 1, :] - mcd.location_)**(0.33)
# Median score over the rows in the estimator's support mask.
print(np.median(aro[mcd.support_]))
# Redraw the axes: fitting rows as '+', rows under analysis as red circles,
# using feature columns 0 and 3.
ax1.clear()
ax1.scatter(rr[:-1 * nrAnalysis - 1, [0]], rr[:-1 * nrAnalysis - 1, [3]], marker='+')
ax1.scatter(rr[-1 * nrAnalysis - 1:-1, [0]], rr[-1 * nrAnalysis - 1:-1, [3]], marker='o', c='r')
#ax1.scatter(*mcd.location_,c='r',s=4)
class MCD(BaseDetector):
    """Outlier detector built on the Minimum Covariance Determinant (MCD).

    A robust covariance estimator (``MinCovDet``) is fitted to the data and
    the Mahalanobis distance it reports for each sample is used as that
    sample's outlier score.

    The MCD estimator is intended for Gaussian-distributed data, but can
    still be relevant for data drawn from a unimodal, symmetric
    distribution.  It is not meant for multi-modal data, where fitting a
    ``MinCovDet`` object is likely to fail; consider projection pursuit
    methods for such datasets.

    See :cite:`rousseeuw1999fast,hardin2004outlier` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        Proportion of outliers in the data set.  Used when fitting to
        define the threshold on the decision function.

    store_precision : bool
        Whether the estimated precision is stored.

    assume_centered : bool
        If True, the support of the robust location and covariance
        estimates is computed and a covariance estimate is recomputed from
        it without centering the data.  Useful for data whose mean is close
        to, but not exactly, zero.  If False, the robust location and
        covariance are computed directly with the FastMCD algorithm without
        additional treatment.

    support_fraction : float, 0 < support_fraction < 1
        Proportion of points to include in the support of the raw MCD
        estimate.  Default is None, which implies that the minimum value
        of support_fraction will be used within the algorithm:
        [n_sample + n_features + 1] / 2

    random_state : int, RandomState instance or None, optional (default=None)
        If int, the seed used by the random number generator; if a
        RandomState instance, the random number generator itself; if None,
        the RandomState instance used by `np.random`.

    Attributes
    ----------
    raw_location_ : array-like, shape (n_features,)
        Raw robust estimated location before correction and re-weighting.

    raw_covariance_ : array-like, shape (n_features, n_features)
        Raw robust estimated covariance before correction and re-weighting.

    raw_support_ : array-like, shape (n_samples,)
        Mask of the observations used to compute the raw robust estimates
        of location and shape, before correction and re-weighting.

    location_ : array-like, shape (n_features,)
        Estimated robust location.

    covariance_ : array-like, shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : array-like, shape (n_features, n_features)
        Estimated pseudo inverse matrix
        (stored only if store_precision is True).

    support_ : array-like, shape (n_samples,)
        Mask of the observations used to compute the robust estimates of
        location and shape.

    decision_scores_ : numpy array of shape (n_samples,)
        Outlier scores of the training data: the Mahalanobis distances of
        the observations on which :meth:`fit` was called.  The higher, the
        more abnormal.  Available once the detector is fitted.

    threshold_ : float
        Threshold derived from ``contamination``: it separates the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_`` and is used to generate binary outlier labels.

    labels_ : int, either 0 or 1
        Binary labels of the training data, 0 for inliers and 1 for
        outliers/anomalies, obtained by applying ``threshold_`` to
        ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, store_precision=True,
                 assume_centered=False, support_fraction=None,
                 random_state=None):
        super(MCD, self).__init__(contamination=contamination)
        # Settings handed to MinCovDet when fit() is called.
        self.random_state = random_state
        self.support_fraction = support_fraction
        self.assume_centered = assume_centered
        self.store_precision = store_precision

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit the detector on X; y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        estimator_settings = dict(
            store_precision=self.store_precision,
            assume_centered=self.assume_centered,
            support_fraction=self.support_fraction,
            random_state=self.random_state,
        )
        self.detector_ = MinCovDet(**estimator_settings)
        self.detector_.fit(X=X, y=y)

        # The training distances reported by the estimator are the scores.
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Return the raw anomaly score of each sample in X.

        The score is the Mahalanobis distance reported by the fitted
        estimator.  For consistency across detectors, outliers are given
        larger scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.  Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        fitted_attributes = ['decision_scores_', 'threshold_', 'labels_']
        check_is_fitted(self, fitted_attributes)
        samples = check_array(X)

        # Compute the Mahalanobis distance of the samples
        return self.detector_.mahalanobis(samples)

    @property
    def raw_location_(self):
        """Raw robust estimated location before correction and re-weighting.

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.raw_location_

    @property
    def raw_covariance_(self):
        """Raw robust estimated covariance before correction and re-weighting.

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.raw_covariance_

    @property
    def raw_support_(self):
        """Mask of the observations used for the raw robust estimates of
        location and shape, before correction and re-weighting.

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.raw_support_

    @property
    def location_(self):
        """Estimated robust location.

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.location_

    @property
    def covariance_(self):
        """Estimated robust covariance matrix.

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.covariance_

    @property
    def precision_(self):
        """Estimated pseudo inverse matrix
        (stored only if store_precision is True).

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.precision_

    @property
    def support_(self):
        """Mask of the observations used to compute the robust estimates of
        location and shape.

        Read-only view of the fitted MinCovDet attribute.
        """
        return self.detector_.support_