def _h_getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate the Mahalanobis distance from the sample vector.'''
    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)
    try:
        dat2fit = dat[good_rows]
        assert not np.any(np.isnan(dat2fit))
        robust_cov = MinCovDet().fit(dat2fit)
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # this step will fail if the covariance matrix is singular. This happens if the data is not
        # a unimodal symmetric distribution, e.g. when there are too many small noisy particles.
        # Take the safe option and return zeros for the Mahalanobis distance in that case.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # critical value of the Mahalanobis distance using the chi-square distribution
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
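# Usage sketch for the function above (a hedged example, not from the original
# source: the 2-D data, the imports and the planted outliers are assumptions):
import numpy as np
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
dat = rng.randn(200, 2)
dat[:5] += 8.  # plant a few obvious outliers

dist, outliers, lim = _h_getMahalanobisRobust(dat, critical_alpha=0.01)
print(outliers.sum(), 'points beyond the cutoff', lim)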
def _naiveMCD(self, dataset, thresh=3):
    types = LoLTypeInference().getDataTypes(dataset)
    qdataset = [[d[i] for i, t in enumerate(types) if t == 'numerical']
                for d in dataset]
    X = featurize(qdataset, [t for t in types if t == 'numerical'])
    xshape = np.shape(X)

    # add jitter for conditioning problems with the estimate
    Xsamp = X + 0.01 * np.random.randn(xshape[0], xshape[1])

    m = MinCovDet()
    m.fit(Xsamp)
    sigma = np.linalg.inv(m.covariance_)
    mu = np.mean(X, axis=0)

    results = []
    for i in range(0, xshape[0]):
        val = np.squeeze((X[i, :] - mu) * sigma * (X[i, :] - mu).T)[0, 0]
        results.append([str(val)])

    e = ErrorDetector(results,
                      modules=[QuantitativeErrorModule],
                      config=[{'thresh': thresh}])
    e.fit()

    return set([error['cell'][0] for error in e])
def launch_mcd_on_dataset(n_samples, n_features, n_outliers,
                          tol_loc, tol_cov, tol_support):
    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert error_location < tol_loc
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert error_cov < tol_cov
    assert np.sum(H) >= tol_support
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
def outliers_finder(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Find and remove outliers.
    :param data_frame:
    :return:
    """
    (df_X, df_y) = splitting_dataset(data_frame)
    # Define the PCA object
    pca = PCA()
    # Run PCA on scaled data and obtain the scores array
    T = pca.fit_transform(StandardScaler().fit_transform(df_X.values))
    # fit a Minimum Covariance Determinant (MCD) robust estimator to the first 5 scores
    robust_cov = MinCovDet().fit(T[:, :5])
    # Get the Mahalanobis distance
    m = robust_cov.mahalanobis(T[:, :5])
    data_frame['mahalanobis'] = m
    # calculate the p-value for each Mahalanobis distance
    # (note: df=3 here even though 5 components are used above)
    data_frame['p'] = 1 - chi2.cdf(data_frame['mahalanobis'], 3)
    # sort_values returns a new frame, so the result must be assigned back
    data_frame = data_frame.sort_values('p', ascending=False)
    data_frame['Drops'] = (data_frame['p'] <= 0.001)
    indexNames = data_frame[data_frame['Drops'] == True].index
    print(indexNames.size)
    data_frame.drop(indexNames, inplace=True)
    return data_frame
def obtenerOutliersMinCovarianza(self, datosOriginales, datosATestear):
    # note: MinCovDet itself has no predict(); EllipticEnvelope wraps the same
    # MCD estimate and adds inlier/outlier prediction
    clf = EllipticEnvelope().fit(datosOriginales)
    resultadoValoresATestear = clf.predict(datosATestear)
    listaOutliers, listaInliers = self.getListasOutliersInliers(
        resultadoValoresATestear, datosATestear)
    return listaOutliers, listaInliers
def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)
def getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate the Mahalanobis distance from the sample vector.'''
    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)
    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # this step will fail if the covariance matrix is singular. This happens if the data is not
        # a unimodal symmetric distribution, e.g. when there are too many small noisy particles.
        # Take the safe option and return zeros for the Mahalanobis distance in that case.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # critical value of the Mahalanobis distance using the chi-square distribution
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
def fit(self, X, y=None):
    """Fit detector. y is optional for unsupervised methods.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels).
    """
    # Validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    self.detector_ = MinCovDet(store_precision=self.store_precision,
                               assume_centered=self.assume_centered,
                               support_fraction=self.support_fraction,
                               random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # Use Mahalanobis distance as the outlier score
    self.decision_scores_ = self.detector_.dist_
    self._process_decision_scores()
    return self
def test_mcd_increasing_det_warning():
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.
    X = [[5.1, 3.5, 1.4, 0.2],
         [4.9, 3.0, 1.4, 0.2],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2],
         [5.0, 3.6, 1.4, 0.2],
         [4.6, 3.4, 1.4, 0.3],
         [5.0, 3.4, 1.5, 0.2],
         [4.4, 2.9, 1.4, 0.2],
         [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2],
         [4.8, 3.4, 1.6, 0.2],
         [4.8, 3.0, 1.4, 0.1],
         [4.3, 3.0, 1.1, 0.1],
         [5.1, 3.5, 1.4, 0.3],
         [5.7, 3.8, 1.7, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [4.6, 3.6, 1.0, 0.2],
         [5.0, 3.0, 1.6, 0.2],
         [5.2, 3.5, 1.5, 0.2]]

    mcd = MinCovDet(random_state=1)
    warn_msg = "Determinant has increased"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        mcd.fit(X)
def find_outliers_mahalanobis(featMatProjected, extremeness=2.,
                              figsize=[8, 8], saveto=None):
    """ Return a list of outlier indices using the Mahalanobis distance.
        Outlier threshold = std(Mahalanobis distance) * extremeness degree
        [extreme_values=2, very_extreme_values=3 --> according to 68-95-99.7 rule]
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from pathlib import Path
    from sklearn.covariance import MinCovDet
    from matplotlib import pyplot as plt

    # NB: Euclidean distance puts more weight than it should on correlated variables.
    # Chicken-and-egg situation: we can't know which points are outliers until we
    # calculate the stats of the distribution, but the stats of the distribution
    # are skewed by outliers! Mahalanobis gets around this by using a robust
    # estimate of the covariance matrix.

    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(featMatProjected[:, :10])  # Use the first 10 principal components

    # Get the Mahalanobis distance
    MahalanobisDist = robust_cov.mahalanobis(featMatProjected[:, :10])

    projectedTable = pd.DataFrame(featMatProjected[:, :10],
                                  columns=['PC' + str(n + 1) for n in range(10)])

    plt.ioff() if saveto else plt.ion()
    plt.close('all')
    plt.style.use(CUSTOM_STYLE)
    sns.set_style('ticks')
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_facecolor('#F7FFFF')
    plt.scatter(np.array(projectedTable['PC1']),
                np.array(projectedTable['PC2']),
                c=MahalanobisDist)  # colour PCA by Mahalanobis distance
    plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
    plt.colorbar()
    ax.grid(False)

    if saveto:
        saveto.parent.mkdir(exist_ok=True, parents=True)
        suffix = Path(saveto).suffix.strip('.')
        plt.savefig(saveto, format=suffix, dpi=300)
    else:
        plt.show()

    k = np.std(MahalanobisDist) * extremeness
    upper_t = np.mean(MahalanobisDist) + k
    outliers = []
    for i in range(len(MahalanobisDist)):
        if MahalanobisDist[i] >= upper_t:
            outliers.append(i)
    print("Outliers found: %d" % len(outliers))

    return np.array(outliers)
def fit(self, X, y=None):
    """Fit detector. y is ignored in unsupervised methods.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # Validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    self.detector_ = MinCovDet(store_precision=self.store_precision,
                               assume_centered=self.assume_centered,
                               support_fraction=self.support_fraction,
                               random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # Use Mahalanobis distance as the outlier score
    self.decision_scores_ = self.detector_.dist_
    self._process_decision_scores()
    return self
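# A minimal standalone sketch of the same scoring idea (assumptions: plain
# numpy data and no detector framework; `dist_` holds the squared Mahalanobis
# distances of the training points under the robust MCD fit):
import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(42)
X = np.vstack([rng.randn(100, 3), rng.randn(5, 3) + 6.])
scores = MinCovDet(random_state=42).fit(X).dist_
print('top-5 outlier scores:', np.sort(scores)[-5:])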
class ActionDetector(object):
    """
    Publish to a rostopic whether the robot is in action or not, by the MT method.

    NOTE Before starting to detect action, some waiting time is required.
    This is preparation time to calculate the Mahalanobis distance.
    Reaction speed for action detection is a bit slow because the spectrum
    is the mean of the spectrogram, not its right edge.
    """

    def __init__(self):
        # Config for loading the no-action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path('decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # extract about 100 samples from no_action_data
        # (integer division, so the slice step stays an int under Python 3)
        divide = max(1, len(no_action_data) // 100)
        no_action_data = no_action_data[::divide]
        # Detect whether in action or not by Mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)

    def cb(self, msg):
        """
        Main process of the NoiseSaver class.
        Publish whether the robot is in action or not.
        """
        # spectrogram.shape is (height, width) = (spectrum, time)
        spectrogram = self.bridge.imgmsg_to_cv2(msg)
        self.current_spectrum = np.average(spectrogram, axis=1)
        # Check whether the current spectrum is in action or not
        spectrum = self.current_spectrum[None]
        dist = self.mcd.mahalanobis(spectrum)[0]
        info_message = '(mahalanobis distance, threshold) = ({}, {})'.format(
            dist, self.anormal_threshold)
        if dist < self.anormal_threshold:
            self.in_action = False
            rospy.loginfo('No action\n' + info_message + '\n')
        else:
            self.in_action = True
            rospy.loginfo('### In action ###\n' + info_message + '\n')
        pub_msg = Bool(data=self.in_action)
        self.pub.publish(pub_msg)
def mahalanobis_calculate(data, num_pcs):
    pca = PCA(num_pcs)
    T = pca.fit_transform(data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(T)
    # Get the Mahalanobis distance
    m = robust_cov.mahalanobis(T)
    return m
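# Usage sketch for mahalanobis_calculate (hedged: the synthetic data and the
# choice of 2 principal components are assumptions, not from the source):
import numpy as np
from sklearn.decomposition import PCA
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
data = rng.randn(300, 10)
dists = mahalanobis_calculate(data, num_pcs=2)  # squared distances in PC space
print('largest squared distance:', dists.max())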
def as7262_outliers(data, scatter_correction=None):
    data_columns = data[as7262_wavelengths]
    print(data_columns)
    # data_columns.T.plot()
    # plt.plot(data_columns.T)
    plt.show()
    if scatter_correction == "SNV":
        data_columns = processing.snv(data_columns)
    elif scatter_correction == "MSC":
        data_columns, _ = processing.msc(data_columns)
    # svm = OneClassSVM().fit_predict(snv_data)
    # print(svm)
    robust_cov = MinCovDet().fit(data_columns)
    mahal_dist = robust_cov.mahalanobis(data_columns)
    # mahal_dist = MahalanobisDist(np.array(data_columns), verbose=True)
    print(mahal_dist)
    zscore(data_columns)
    print('+++++')
    mean = np.mean(mahal_dist)
    std = 3 * np.std(mahal_dist)
    print(mean, std)
    print(mean - std, mean + std)
    zscore_mahal = (mahal_dist - mean) / np.std(mahal_dist)
    # print(zscore_mahal)
    # print(zscore_mahal.max(), zscore_mahal.argmax(), data_columns.loc[zscore_mahal.argmax()])
    print('pppp')
    print(data_columns)
    print(zscore_mahal.argmax())
    outliers = data_columns.loc[zscore_mahal > 3].index
    outliers = data_columns.iloc[zscore_mahal.argmax()].name  # overwrites: keeps only the single most extreme point
    # print(data_columns.loc[zscore_mahal > 3].index)
    rows = data_columns.loc[outliers]
    # print(data_columns.loc[zscore_mahal.argmax()].name)
    print(data_columns.shape)
    print(rows)
    # print((mahal_dist - mahal_dist.mean()).std())
    # print(mahal_dist.std())
    # print(mahal_dist.mean() + 3 * mahal_dist.std())
    # mahal_dist2 = MahalanobisDist(np.array(data_columns), verbose=True)

    n, bins, _ = plt.hist(zscore_mahal, bins=40)
    plt.show()
    # x_hist = np.linspace(min(mahal_dist), max(mahal_dist), 100)
    # popt, pcov = curve_fit(gauss_function, bins[:len(n)], n, maxfev=100000, p0=[300, 0, 20])
    # new_fit = gauss_function(x_hist, *popt)
    # plt.plot(x_hist, new_fit, 'r--')

    # color = data_columns.shape[0] * ["#000000"]
    # color[data_columns.loc[zscore_mahal.argmax()].name] = "#FF0000"
    plt.plot(data_columns.T, c="black")
    plt.plot(rows.T, c="red")
    plt.plot(data_columns.mean(), c="blue", lw=4)
    # snv_data.T.plot(color=color)
    plt.show()
def detect(train_data: np.ndarray, test_data: np.ndarray) -> list:
    estimated_covariance = MinCovDet().fit(train_data)
    train_dist = estimated_covariance.mahalanobis(train_data)
    np_max = np.max(train_dist)
    return [
        0 if data <= np_max else 1
        for data in estimated_covariance.mahalanobis(test_data)
    ]
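# Usage sketch for detect() (hedged: synthetic data; the cutoff is simply the
# largest Mahalanobis distance seen in training, as the function implements):
import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(1)
train = rng.randn(200, 2)
test = np.vstack([rng.randn(10, 2), [[10., 10.]]])
print(detect(train, test))  # 1 marks points farther than any training point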
def mahal_plot(e):
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.array([first_half, second_half])
    X = np.transpose(X)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(X)
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)

    fig = plt.figure()
    # Show data set
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half, second_half, color='black',
                                  label='daily diff in homes passed')
    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r, linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    # contour takes `linewidths`, not `color`/`linewidth`
    robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r, linewidths=3)

    subfig1.legend([emp_cov_contour.collections[1],
                    robust_contour.collections[1],
                    inlier_plot],
                   ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right", borderaxespad=0)
    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)
def leverage(self, X):
    mcd = MinCovDet()
    mcd.fit(X)
    loc, cov = mcd.location_, mcd.covariance_
    inversed_cov = np.linalg.inv(cov)
    result = np.zeros(X.shape[0])
    for i, element in enumerate(X):
        h = np.sqrt(
            np.transpose(element - loc) @ inversed_cov @ (element - loc))
        result[i] = h
    return result
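# The per-row loop above can be collapsed into the estimator's own
# mahalanobis(), which returns squared distances; a minimal sketch of a
# numerically equivalent vectorized version (an assumption, not in the source):
import numpy as np
from sklearn.covariance import MinCovDet

def leverage_vectorized(X):
    mcd = MinCovDet().fit(X)
    return np.sqrt(mcd.mahalanobis(X))  # sqrt of squared distances, as in the loop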
def __init__(self, estimator='ML', tol=1e-6):
    if estimator == 'ML':
        self.estimator_ = EmpiricalCovariance(store_precision=True,
                                              assume_centered=False)
    elif estimator == 'MCD':
        self.estimator_ = MinCovDet(store_precision=True,
                                    assume_centered=False,
                                    support_fraction=None,
                                    random_state=0)
    else:
        self.estimator_ = None
    self.tol_ = tol
def l_ratio(X, labels):
    '''
    A measure of how far a cluster is from neighbouring clusters, based on
    the Mahalanobis distances of the points that do not belong to the cluster.

    ATTENTION: the covariance matrix is estimated with the robust covariance
    (outliers not taken into account)

    Parameters
    ----------
    X : ndarray
        Data (assumed to be multivariate normal distributed)
    labels : ndarray
        Labels

    Returns
    -------
    lr : list, size(number of clusters)
        L-ratio for each cluster
    '''
    lr = list()
    # unique labels (ignoring the noise label -1)
    unique_l = set(labels).difference([-1])
    # if the set is empty, return -1
    if len(unique_l) == 0:
        return -1
    # degrees of freedom
    df = len(X[0])
    # for each cluster
    for label in unique_l:
        # points in the cluster
        Xi = X[(labels == label)]
        # number of spikes in the cluster
        n = len(Xi)
        # points outside the cluster
        outliers = X[(labels != label)]
        # estimate robust covariance
        mcd = MinCovDet().fit(Xi)
        # compute the Mahalanobis distance of the outside points
        Dmcd = mcd.mahalanobis(outliers)
        # compute the L-ratio
        lr.append(np.sum(1 - chi2.cdf(Dmcd, df)) / n)
    return lr
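# Usage sketch for l_ratio (hedged: the blobs and the KMeans labelling are
# assumptions; -1 is reserved for noise, as the function expects):
import numpy as np
from scipy.stats import chi2
from sklearn.cluster import KMeans
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + 5.])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
print(l_ratio(X, labels))  # one value per cluster; lower = better separated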
def get_outliers(X, chi2thr=0.975, plot=False, figurename=None):
    """ Detect outliers by Mahalanobis distance """
    robust_cov = MinCovDet(random_state=100).fit(X)
    MD = robust_cov.mahalanobis(X)
    n_samples = len(MD)
    chi2 = stats.chi2
    degrees_of_freedom = X.shape[1]
    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
    y_pred = MD > threshold
    outlierpercent = sum(y_pred) / float(n_samples)
    return outlierpercent, y_pred, MD
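# Usage sketch for get_outliers (hedged: synthetic data with a shifted block
# of outliers; the function needs scipy.stats importable as `stats`):
import numpy as np
from scipy import stats
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(190, 3), rng.randn(10, 3) + 5.])
frac, flags, MD = get_outliers(X, chi2thr=0.975)
print('flagged fraction:', frac)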
def RejectOutliers(data, threshold=3):
    """ Reject nodal outliers, flagging points whose Mahalanobis
        distance exceeds :threshold: """
    from sklearn.covariance import MinCovDet
    clf = MinCovDet()
    clf.fit(data)

    # note: MinCovDet.mahalanobis() returns *squared* distances, so the
    # threshold is applied on the squared scale
    distances = clf.mahalanobis(data)

    outliers = np.where(distances >= threshold)[0]
    inliers = np.where(distances < threshold)[0]
    return inliers, outliers
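# Usage sketch for RejectOutliers (hedged, synthetic data). Because the
# distances are squared, a cutoff of ~9 corresponds to 3 "sigmas" along one
# axis, so the default threshold of 3 is quite aggressive:
import numpy as np

rng = np.random.RandomState(0)
data = np.vstack([rng.randn(100, 2), [[6., 6.]]])
inliers, outliers = RejectOutliers(data, threshold=9)
print(len(inliers), 'inliers,', len(outliers), 'outliers')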
def __init__(self, lab_coords_x, lab_coords_y, data, i_panel, delta_scalar,
             params, verbose=False):
    training_data = []
    mean_x = flex.mean(lab_coords_x)
    mean_y = flex.mean(lab_coords_y)
    limit = delta_scalar * 10
    for ix in range(len(data)):
        if abs(lab_coords_x[ix] - mean_x) > limit:
            continue
        if abs(lab_coords_y[ix] - mean_y) > limit:
            continue
        if abs(data[ix]) > 1:
            continue
        training_data.append((lab_coords_x[ix], lab_coords_y[ix], data[ix]))
    if verbose:
        print("Training data is less", len(lab_coords_x) - len(training_data), end=" ")

    colorcode_set = []
    for ix in range(len(data)):
        colorcode_set.append((lab_coords_x[ix], lab_coords_y[ix], data[ix]))

    from sklearn.covariance import EmpiricalCovariance, MinCovDet
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance(assume_centered=False,
                                  store_precision=True).fit(X=training_data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet(assume_centered=False,
                           store_precision=True).fit(X=training_data)

    features = ["Δx", "Δy", "ΔΨ(deg)"]
    if verbose:
        print("%3d" % i_panel, end=" ")
        print("%4d items " % (len(training_data),), end=" ")
    for idx_report in range(len(features)):
        feature = features[idx_report]
        diag_elem = math.sqrt(emp_cov.covariance_[idx_report, idx_report])
        if verbose:
            print("%s=%7.2f±%6.2f" % (feature, emp_cov.location_[idx_report],
                                      diag_elem), end=" ")
    if verbose:
        print("%4d items:" % (flex.bool(robust_cov.support_).count(True),), end=" ")
    for idx_report in range(len(features)):
        feature = features[idx_report]
        diag_elem = math.sqrt(robust_cov.covariance_[idx_report, idx_report])
        if verbose:
            print("%s=%7.2f±%6.2f" % (feature, robust_cov.location_[idx_report],
                                      diag_elem), end=" ")

    disc = flex.double(robust_cov.mahalanobis(X=colorcode_set))
    # this metric represents Mahalanobis distance ** 2
    disc_select = disc < (params.residuals.mcd_filter.mahalanobis_distance) ** 2
    if params.residuals.mcd_filter.keep == "outliers":
        disc_select = (disc_select == False)
    if verbose:
        print("OK %4.1f%%" % (100 * (disc_select.count(True)) / len(training_data)))
    self.lab_coords_x = lab_coords_x.select(disc_select)
    self.lab_coords_y = lab_coords_y.select(disc_select)
    self.data = data.select(disc_select)
    self.n_input = len(lab_coords_x)
    self.n_output = len(self.lab_coords_x)
    self.emp_cov = emp_cov
    self.rob_cov = robust_cov
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
fitting-an-elliptic-envelop for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])
    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1], 100),
                         np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
def fit(self, X, y=None):
    """Fit detector.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : Ignored
        Not used, present for API consistency by convention.
    """
    self.X_train = check_array(X)
    self.mcd = MinCovDet(store_precision=self.store_precision,
                         assume_centered=self.assume_centered,
                         support_fraction=self.support_fraction,
                         random_state=self.random_state)
    # fit on the validated array; y was previously undefined here
    self.mcd.fit(X=self.X_train, y=y)
    return self
def wcorr(x, y, w=None, robust=False):
    '''Weighted correlation coefficient

    Calculate the Pearson linear correlation coefficient of x and y using
    weights w. This is derived from the weighted covariance and weighted
    variance.

    Args:
        x, y : array of values
        w : array of weights for each element of x; can be omitted if robust=True
        robust : (boolean) robust weights will be internally calculated using
            FastMCD; only used if robust=True and w is omitted

    Returns:
        scalar : weighted correlation coefficient
    '''
    n = len(x)
    assert len(y) == n, 'y must be the same length as x'
    if w is None:
        if robust:
            # Use FastMCD to calculate weights; another method could be used here
            w = MinCovDet().fit(np.array([x, y]).T).support_
        else:
            raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == n, 'w must be the same length as x and y'
    w = wscale(w)
    return wcov(x, y, w) / np.sqrt(wvar(x, w) * wvar(y, w))
def wcov(x, y, w=None, ddof=1, robust=False):
    '''Weighted covariance

    Calculate the covariance of x and y using weights w. If ddof=1 (default),
    then the result is the unbiased (sample) covariance when w=1.

    Implements weighted covariance as defined by NIST Dataplot
    (https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf)

    Args:
        x, y : array of values
        w : array of weights for each element of x; can be omitted if robust=True
        ddof : scalar differential degrees of freedom (Default ddof=1)
        robust : (boolean) robust weights will be internally calculated using
            FastMCD; only used if robust=True and w is omitted

    Returns:
        scalar : weighted covariance
    '''
    n = len(x)
    assert len(y) == n, 'y must be the same length as x'
    if w is None:
        if robust:
            # Use FastMCD to calculate weights; another method could be used here
            w = MinCovDet().fit(np.array([x, y]).T).support_
        else:
            raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == n, 'w must be the same length as x and y'
    w = wscale(w)
    nw = np.count_nonzero(w)
    return np.sum((x - wmean(x, w)) * (y - wmean(y, w)) * w) / \
        (np.sum(w) / nw * (nw - ddof))
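# Quick sanity check for wcov (hedged: assumes wscale(), which is not shown
# here, leaves an all-ones weight vector unchanged). With unit weights and
# ddof=1 the formula reduces to the usual unbiased sample covariance:
import numpy as np

rng = np.random.RandomState(0)
x, y = rng.randn(50), rng.randn(50)
print(np.isclose(wcov(x, y, np.ones(50)), np.cov(x, y)[0, 1]))  # expect True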
def test_mcd_issue3367():
    # Check that MCD completes when the covariance matrix is singular,
    # i.e. one of the rows and columns are all zeros
    rand_gen = np.random.RandomState(0)

    # Think of these as the values for X and Y -> 10 values between -5 and 5
    data_values = np.linspace(-5, 5, 10).tolist()
    # Get the cartesian product of all possible coordinate pairs from above set
    data = np.array(list(itertools.product(data_values, data_values)))

    # Add a third column that's all zeros to make our data a set of points
    # within a plane, which means that the covariance matrix will be singular
    data = np.hstack((data, np.zeros((data.shape[0], 1))))

    # The below line of code used to raise an exception because the covariance
    # matrix is singular. As a further test, since we have points in XYZ, the
    # principal components (Eigenvectors) of these directly relate to the
    # geometry of the points. Since it's a plane, we should be able to test
    # that the Eigenvector that corresponds to the smallest Eigenvalue is the
    # plane normal, specifically [0, 0, 1], since everything is in the XY plane
    # (as I've set it up above). To do this one would start by:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # After which we need to assert that our `normal` is equal to [0, 0, 1].
    # Do note that there is floating point error associated with this, so it's
    # best to subtract the two and then compare some small tolerance (e.g.
    # 1e-12).
    MinCovDet(random_state=rand_gen).fit(data)
def wmean(x, w=None, robust=False):
    '''Weighted mean

    Calculate the mean of x using weights w.

    Args:
        x : array of values to be averaged
        w : array of weights for each element of x; can be omitted if robust=True
        robust : (boolean) robust weights will be internally calculated using
            FastMCD; only used if robust=True and w is omitted

    Returns:
        scalar : weighted mean
    '''
    if w is None:
        if robust:
            # Use FastMCD to calculate weights; another method could be used here
            w = MinCovDet().fit(np.array([x, x]).T).support_
        else:
            raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == len(x), 'w must be the same length as x'
    return np.sum(x * w) / np.sum(w)
def robust_mahalanobis_method(x=None, data=None):
    # Minimum covariance determinant method
    rng = np.random.RandomState(0)
    real_cov = np.cov(data.values.T)
    X = rng.multivariate_normal(mean=np.mean(data, axis=0),
                                cov=real_cov, size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_            # robust covariance metric
    robust_mean = cov.location_      # robust mean
    inv_covmat = sp.linalg.inv(mcd)  # inverse covariance metric

    # Calculate MD with the minimum covariance determinant method
    x_minus_mu = x - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = mahal.diagonal()

    # Compare rMD with the threshold and flag outliers
    outlier = []
    C = chi2.ppf((1 - 0.001), df=x.shape[1])  # degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
    return outlier, md
def compute_MCD_weft(weftsPickled, target_path):
    weft_points_list = floatPointList()
    for pickled_path in weftsPickled:
        weft_points_list.extend(pickle.load(open(pickled_path, "rb")))

    x_vals = [fp.x for fp in weft_points_list]
    y_vals = [fp.y for fp in weft_points_list]

    mean_hor_dist = weft_points_list.getMedianWeftDist()
    min_x = min(x_vals) + 1.5 * mean_hor_dist
    max_x = max(x_vals) - 1.5 * mean_hor_dist
    min_y = min(y_vals) + 1.5 * mean_hor_dist
    max_y = max(y_vals) - 1.5 * mean_hor_dist

    inner_points = floatPointList()
    for pt in weft_points_list:
        if min_x < pt.x < max_x and min_y < pt.y < max_y:
            inner_points.append(pt)

    X = np.zeros([len(inner_points), 3])
    for idx, pt in enumerate(inner_points):
        X[idx, 0] = pt.area
        X[idx, 1] = pt.right_dist
        X[idx, 2] = pt.left_dist
    # drop rows with non-positive entries
    Y = X[~(X <= 0).any(axis=1)]

    robust_cov = MinCovDet(support_fraction=0.8).fit(Y)
    pickle.dump(robust_cov, open(target_path, "wb"))
def robust_mahalanobis_method(df):
    # Minimum covariance determinant
    rng = np.random.RandomState(0)
    real_cov = np.cov(df.values.T)
    X = rng.multivariate_normal(mean=np.mean(df, axis=0),
                                cov=real_cov, size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_            # robust covariance metric
    robust_mean = cov.location_      # robust mean
    inv_covmat = sp.linalg.inv(mcd)  # inverse covariance metric

    # Robust M-Distance
    x_minus_mu = df - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = np.sqrt(mahal.diagonal())

    # Flag as outlier
    outlier = []
    C = np.sqrt(chi2.ppf((1 - 0.001), df=df.shape[1]))  # degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
    return outlier, md
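# Usage sketch for the DataFrame variant above (hedged: the imports of scipy
# as sp, scipy.stats.chi2 and pandas are assumed, and the data is synthetic):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(300, 3), columns=['a', 'b', 'c'])
df.iloc[0] = [8., 8., 8.]  # plant one outlier
outliers, md = robust_mahalanobis_method(df)
print(outliers)  # row positions flagged at the 0.001 chi-square level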
def analyze(self, mahalanobis_tolerance=2):
    self.inlier_points = np.zeros((len(self.points), 2))

    for id1 in range(len(self.points)):
        id2 = closest_point(self.points, self.points[id1], id1)[0]

        # keep lines for plotting purposes
        self.linedata[3 * id1] = self.points[id1]
        self.linedata[3 * id1 + 1] = self.points[id2]
        self.linedata[3 * id1 + 2] = [None, None]

        # we are repeating every pi/2, so we compress the angle space by 4x
        a = 4 * math.atan2((self.points[id1, 1] - self.points[id2, 1]),
                           (self.points[id1, 0] - self.points[id2, 0]))
        r = np.linalg.norm(self.points[id1] - self.points[id2])
        self.polardata[id1] = [r * math.cos(a), r * math.sin(a)]

    # find the minimal covariance inlier cluster
    self.polar_cov = MinCovDet().fit(self.polardata)

    # extract the grid angle and size. angle is divided by 4 because
    # we previously scaled it up to repeat every 90 deg
    self.theta = math.atan2(-self.polar_cov.location_[1],
                            self.polar_cov.location_[0]) / 4
    self.step_size = np.linalg.norm(self.polar_cov.location_)

    # extract inlier points
    polar_mahal = self.polar_cov.mahalanobis(self.polardata) ** 0.33
    inlier_count = 0
    for i in range(len(polar_mahal)):
        if polar_mahal[i] < mahalanobis_tolerance:  # stdev tolerance to outliers
            self.inlier_points[inlier_count] = self.points[i]
            self.inlier_indicies[inlier_count] = i
            inlier_count += 1

    self.normalized_points = rotate(self.inlier_points[:inlier_count],
                                    -self.theta) / self.step_size

    # enumerate grid IDs
    origin_id = closest_point(self.normalized_points,
                              np.mean(self.normalized_points))[0]
    self.normalized_points = self.normalized_points - self.normalized_points[origin_id]
    inlier_count = 0
    # sys.maxsize replaces the Python 2-only sys.maxint
    self.bounds = [sys.maxsize, sys.maxsize, -sys.maxsize, -sys.maxsize]
    for p in self.normalized_points:
        x = round(p[0])
        y = round(p[1])
        d = np.linalg.norm(p - [x, y])
        if d < 0.4:  # tolerance from unit position
            self.normalized_points[inlier_count] = [x, y]
            if x < self.bounds[0]:
                self.bounds[0] = x
            if x > self.bounds[2]:
                self.bounds[2] = x
            if y < self.bounds[1]:
                self.bounds[1] = y
            if y > self.bounds[3]:
                self.bounds[3] = y
            inlier_count += 1
    self.normalized_points = self.normalized_points[:inlier_count]
def estimateGaussian(nb_objects_init, nb_objects_final, thr, who, genes, siRNA,
                     loadingFolder='../resultData/thrivisions/predictions',
                     threshold=0.05):
    arr = np.vstack((thr, nb_objects_init, nb_objects_final)).T

    # delete siRNAs that have only one experiment
    print(len(siRNA))
    all_ = Counter(siRNA)
    siRNA = np.array(siRNA)
    toDelsi = [x for x in all_ if all_[x] == 1]
    toDelInd = []
    for si in toDelsi:
        toDelInd.extend(np.where(siRNA == si)[0])
    print(len(toDelInd))

    dd = dict(zip(range(4), [arr, who, genes, siRNA]))
    for array_ in dd:
        dd[array_] = np.delete(dd[array_], toDelInd, 0)
    arr, who, genes, siRNA = [dd[el] for el in range(4)]
    print(arr.shape)

    arr_ctrl = arr[np.where(np.array(genes) == 'ctrl')]
    ctrlcov = MinCovDet().fit(arr_ctrl)
    robdist = ctrlcov.mahalanobis(arr) * np.sign(arr[:, 0] - np.mean(arr[:, 0]))

    new_siRNA = np.array(siRNA)[np.where((genes != 'ctrl') & (robdist > 0))]
    pval, qval = empiricalPvalues(
        np.absolute(robdist[np.where(genes == 'ctrl')])[:, np.newaxis],
        robdist[np.where((genes != 'ctrl') & (robdist > 0))][:, np.newaxis],
        folder=loadingFolder, name="thrivision", sup=True, also_pval=True)
    assert new_siRNA.shape == qval.shape

    hits = Counter(new_siRNA[np.where(qval < threshold)[0]])
    hits = [x for x in hits if float(hits[x]) / all_[x] >= 0.5]
    gene_hits = [genes[list(siRNA).index(el)] for el in hits]
    gene_hits = Counter(gene_hits)
    return robdist, pval, qval, hits, gene_hits
# X1 = preprocessing.scale(X2)
n_samples = len(X)
n_outliers = int(n_samples * 0.05)  # int, so it can be used as a slice index below
n_features = 2

# generate data
# gen_cov = np.eye(n_features)
# gen_cov[0, 0] = 2.
# X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# # add some outliers
# outliers_cov = np.eye(n_features)
# outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
# X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = plt.figure()
plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
n_samples = 125
n_outliers = 25
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = pl.figure()
pl.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
def main():
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')

    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()

    xformer = NullTransformer()
    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
        X = xformer.fit_transform(X)
    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence. Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                   cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
                        cmap=plt.cm.jet, color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                   cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
                        cmap=plt.cm.jet, color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')

    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs doesn't seem to match Mahalanobis
        # distance very well. Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]

    for i, act in enumerate(enc_dec):
        enc, dec = act
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path,
                                  act_enc=enc, act_dec=dec,
                                  nvis=X.shape[1], nhid=16)

        Xshared = theano.shared(
            np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)

        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)

        fig = plt.figure()
        if args.plotdims == 2:
            ax = fig.add_subplot(1, 1, 1)
            ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                       cmap=plt.cm.jet, c=error, s=60, linewidth='0')
        else:
            ax = fig.add_subplot(1, 1, 1, projection='3d')
            ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error,
                            cmap=plt.cm.jet, color=error01)
            ax.set_zlabel('Reconstruction error')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        encdec_type = ', '.join(act)
        ax.set_title('Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
          str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')

    plt.show(block=True)
n_samples = 125
n_outliers = 25
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

# Display results
fig = pl.figure()

# Show data set
subfig1 = pl.subplot(3, 1, 1)
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
class Outlier_detection(object):

    def __init__(self, support_fraction=0.95, verbose=True, chi2_percentile=0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision=True, support_fraction=support_fraction)
        self.chi2_percentile = chi2_percentile

    def fit(self, X):
        """Prints some summary stats (if verbose is on) and returns the indices
        of what it considers to be extreme"""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_,
                                                     self.mcd.precision_)
        d = np.array([mahalanobis(p) for p in X])  # Mahalanobis distance values
        self.d2 = d ** 2  # MD squared
        n, self.degrees_of_freedom_ = X.shape
        # use the configured percentile rather than a hard-coded 0.995
        self.iextreme_values = (self.d2 > self.chi2.ppf(self.chi2_percentile,
                                                        self.degrees_of_freedom_))
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, "
                  % (self.iextreme_values.sum() / float(n), self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self, log=False, sort=False):
        """
        Cause plotting is always fun.

        log: transform the distance-sq to a log ( distance-sq )
        sort: sort the data according to distance before plotting
        ifollow: a set of indices to mark with yellow, useful for seeing where
                 data lies across views.
        """
        n = self.d2.shape[0]
        fig = plt.figure()

        x = np.arange(n)
        ax = fig.add_subplot(111)

        transform = (lambda x: x) if not log else (lambda x: np.log(x))

        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)
        chi_line = transform(chi_line)
        d2 = transform(self.d2)
        if sort:
            isort = np.argsort(d2)
            ax.scatter(x, d2[isort], alpha=0.7, facecolors='none')
            plt.plot(x, transform(self.chi2.ppf(np.linspace(0, 1, n),
                                                self.degrees_of_freedom_)),
                     c="r", label="distribution assuming normal")
        else:
            ax.scatter(x, d2)
            extreme_values = d2[self.iextreme_values]
            ax.scatter(x[self.iextreme_values], extreme_values, color="r")

        ax.hlines(chi_line, 0, n,
                  label="%.1f%% $\\chi^2$ quantile" % (100 * self.chi2_percentile),
                  linestyles="dotted")

        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])

        plt.show()
lm2 = ols('word_diff ~ Age + C(Centre_ID)',
          data=clean_st, subset=subset).fit()
print(lm2.summary())

# <markdowncell>

# Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:, ['norm_diff', 'Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)

# calculate a robust covariance estimate, then see what's too far away
mcd = MinCovDet()
mcd.fit(clean)

pearsonr(clean.iloc[:, 0], clean.iloc[:, 1])

# <codecell>

d = mcd.mahalanobis(clean)
d.sort()
d
# computation
for i, n_outliers in enumerate(range_n_outliers):
    for j in range(repeat):
        # generate data
        X = np.random.randn(n_samples, n_features)
        # add some outliers
        outliers_index = np.random.permutation(n_samples)[:n_outliers]
        outliers_offset = 10. * \
            (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
        X[outliers_index] += outliers_offset
        inliers_mask = np.ones(n_samples).astype(bool)
        inliers_mask[outliers_index] = False

        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        S = MinCovDet().fit(X)
        # compare raw robust estimates with the true location and covariance
        err_loc_mcd[i, j] = np.sum(S.location_ ** 2)
        err_cov_mcd[i, j] = S.error_norm(np.eye(n_features))
        # compare estimators learnt from the full data set with true parameters
        err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
        err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
            np.eye(n_features))
        # compare with an empirical covariance learnt from a pure data set
        # (i.e. "perfect" MCD)
        pure_X = X[inliers_mask]
        pure_location = pure_X.mean(0)
        pure_emp_cov = EmpiricalCovariance().fit(pure_X)
        err_loc_emp_pure[i, j] = np.sum(pure_location ** 2)
        err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features))