def _calc_mean(self, data=None):
    if data is None:
        data = self._data
    if self._min_cov_det:
        lg.details("Use MCD for expectation value estimation")
        # MCD expects observations in rows, hence the transpose
        mcd = MCD(support_fraction=self._mcd_supp_frac).fit(data.transpose())
        ydata = mcd.location_
    else:
        ydata = np.mean(data, axis=1)
    return ydata
def _calc_cov(self, data=None):
    if data is None:
        data = self._data
    if self._min_cov_det:
        lg.details("Use MCD for covariance estimation")
        mcd = MCD(support_fraction=self._mcd_supp_frac).fit(data.transpose())
        cov = mcd.covariance_
    else:
        cov = calc_cov(data)
    if not self._sample_data:
        cov /= self._nconfs  # For the fit we have to normalize like an error
    return cov
def _calc_cov_and_mean(self, data=None):
    if data is None:
        data = self._data
    if self._min_cov_det:
        lg.details("Use MCD for covariance and mean estimation")
        mcd = MCD(support_fraction=self._mcd_supp_frac).fit(data.transpose())
        ydata = mcd.location_
        cov = mcd.covariance_
    else:
        ydata = std_mean(data, axis=1)
        cov = calc_cov(data)
    if not self._sample_data:
        cov /= self._nconfs  # For the fit we have to normalize like an error
    edata = np.sqrt(np.diag(cov))
    return ydata, edata, cov
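# Illustration (not part of the class above): a minimal standalone sketch of the
# same pattern, assuming MCD is sklearn's MinCovDet and that the data matrix
# stores one observable per row and one configuration per column, which is why
# the methods above transpose before fitting.
import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.default_rng(0)
data = rng.normal(size=(4, 200))          # 4 observables, 200 configurations
data[:, :5] += 25.0                       # a few contaminated configurations

mcd = MinCovDet(support_fraction=0.75).fit(data.T)
robust_mean, robust_cov = mcd.location_, mcd.covariance_

naive_mean = np.mean(data, axis=1)        # pulled towards the outliers
naive_cov = np.cov(data)                  # inflated by the outliers
err = np.sqrt(np.diag(robust_cov) / data.shape[1])   # error-like normalization, cf. cov /= nconfs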
    return corr


if has_sklearn:
    from sklearn.covariance import LedoitWolf, OAS, MCD

    lw = LedoitWolf(store_precision=False)
    lw.fit(rr, assume_centered=False)
    cov_lw = lw.covariance_
    corr_lw = cov2corr(cov_lw)

    oas = OAS(store_precision=False)
    oas.fit(rr, assume_centered=False)
    cov_oas = oas.covariance_
    corr_oas = cov2corr(cov_oas)

    mcd = MCD()  # .fit(rr, reweight=None)
    mcd.fit(rr, assume_centered=False)
    cov_mcd = mcd.covariance_
    corr_mcd = cov2corr(cov_mcd)

    titles = ['raw correlation', 'lw', 'oas', 'mcd']
    normcolor = None
    fig = plt.figure()
    for i, c in enumerate([rrcorr, corr_lw, corr_oas, corr_mcd]):
        # for i, c in enumerate([np.cov(rr, rowvar=0), cov_lw, cov_oas, cov_mcd]):
        ax = fig.add_subplot(2, 2, i + 1)
        plot_corr(c, xnames=None, title=titles[i], normcolor=normcolor, ax=ax)

    images = [c for ax in fig.axes for c in ax.get_children()
              if isinstance(c, mpl.image.AxesImage)]
    fig.subplots_adjust(bottom=0.1, right=0.9, top=0.9)
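# Note: the block above targets an older scikit-learn API in which the robust
# estimator was named MCD and assume_centered was passed to fit(). In current
# scikit-learn the class is MinCovDet and assume_centered is a constructor
# argument. A minimal self-contained sketch of the same comparison with the
# modern API; the synthetic X stands in for `rr`, and _cov2corr is a local
# stand-in for the cov2corr helper used above.
import numpy as np
from sklearn.covariance import LedoitWolf, OAS, MinCovDet

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))

def _cov2corr(cov):
    # normalize a covariance matrix to a correlation matrix
    d = np.sqrt(np.diag(cov))
    return cov / np.outer(d, d)

cov_lw = LedoitWolf(store_precision=False, assume_centered=False).fit(X).covariance_
cov_oas = OAS(store_precision=False, assume_centered=False).fit(X).covariance_
cov_mcd = MinCovDet(assume_centered=False).fit(X).covariance_
corrs = {name: _cov2corr(c) for name, c in
         [('lw', cov_lw), ('oas', cov_oas), ('mcd', cov_mcd)]}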
def RobustMD_flsification(d_var, d_obs, plt_OrNot, Q_quantile):
    '''
    This function falsifies the prior using the Robust Mahalanobis Distance (RMD).

    d_var:      the data variable, (n x p)
    d_obs:      the observed data, (1 x p)
    plt_OrNot:  True or False, whether to plot the distribution of the calculated RMDs
    Q_quantile: the quantile of the RMD distribution; 95 or 97.5 is suggested

    Example: RobustMD_flsification(d_pri, d_obs, True, 95) prints RMD_obs and the
    95th quantile of the prior RMDs, plots them, and returns (RMD_obs, RMD_Q95).
    '''
    mcd = MCD(random_state=0).fit(d_var)
    new_obs = d_obs - mcd.location_
    md_obs = np.sqrt(new_obs.dot(np.linalg.inv(mcd.covariance_)).dot(new_obs.T))
    print('Robust Mahalanobis Distance of d_obs = ',
          md_obs[0, 0].round(decimals=3))

    md_samples = []
    for i in range(len(d_var)):
        sample = d_var[i:i + 1, :] - mcd.location_
        md_samp = np.sqrt(sample.dot(np.linalg.inv(mcd.covariance_)).dot(sample.T))[0, 0]
        md_samples.append(md_samp)
    md_samples = np.asarray(md_samples)
    print(str(Q_quantile) + 'th Quantile of Robust Mahalanobis Distance is',
          stats.scoreatpercentile(md_samples, Q_quantile).round(decimals=3))

    if plt_OrNot:
        plt.figure(figsize=(6, 5))
        plt.scatter(np.arange(1, len(d_var) + 1), md_samples, c=abs(md_samples),
                    cmap='winter_r', s=50, vmax=md_samples.max(),
                    vmin=md_samples.min(), linewidths=1, edgecolor='k')
        plt.scatter([0], md_obs, c=md_obs, cmap='winter_r', marker='D', s=110,
                    vmax=md_samples.max(), vmin=md_samples.min(),
                    linewidths=3, edgecolor='red')
        plt.ylabel('Robust Mahalanobis dist', fontsize=12)
        plt.xlabel('realization No.', fontsize=12)
        plt.xlim(-8, 259)
        plt.hlines(y=stats.scoreatpercentile(md_samples, Q_quantile),
                   xmin=-10, xmax=259, colors='red', linewidths=2, linestyles='--')
        cbar = plt.colorbar(fraction=0.035)
        cbar.ax.set_ylabel('RMD')
        plt.title('Prior falsification using Robust Mahalanobis Distance outlier detection',
                  fontsize=18, loc='left', style='italic')

    return (md_obs[0, 0].round(decimals=3),
            stats.scoreatpercentile(md_samples, Q_quantile).round(decimals=3))
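# Hypothetical usage sketch (not from the original source): build a synthetic
# prior ensemble and a synthetic observation, then run the falsification check.
# It assumes MCD is sklearn.covariance.MinCovDet and that numpy, scipy.stats and
# matplotlib.pyplot are available as np, stats and plt, as in the function above.
import numpy as np

rng = np.random.default_rng(1)
d_pri = rng.normal(size=(250, 10))       # 250 prior realizations, 10 data dimensions
d_obs = rng.normal(size=(1, 10))         # one observed data vector

rmd_obs, rmd_q95 = RobustMD_flsification(d_pri, d_obs, True, 95)
if rmd_obs > rmd_q95:
    print('d_obs lies outside the 95th RMD quantile: the prior is falsified.')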
n_samples = 125
n_outliers = 25
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MCD().fit(X, reweight=None)
# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

# Display results
fig = pl.figure()
# variables and parameters for cosmetics
offset_left = fig.subplotpars.left
offset_bottom = fig.subplotpars.bottom
width = fig.subplotpars.right - offset_left
subfig1 = pl.subplot(3, 1, 1)
subfig2 = pl.subplot(3, 1, 2)
subfig3 = pl.subplot(3, 1, 3)
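# Note: `MCD().fit(X, reweight=None)` reflects an older scikit-learn API; in the
# current API the estimator is MinCovDet, fit() takes no reweight argument, and
# the default fit already corresponds to the reweighted estimate. A minimal
# sketch of the same fit-and-compare step under that assumption, measuring each
# estimate against the true generating covariance via error_norm:
import numpy as np
from sklearn.covariance import MinCovDet, EmpiricalCovariance

rng = np.random.default_rng(42)
gen_cov = np.diag([2.0, 1.0])
X = rng.normal(size=(125, 2)) @ gen_cov
X[-25:] = rng.normal(size=(25, 2)) @ np.diag([1.0, 7.0])

robust = MinCovDet().fit(X)
empirical = EmpiricalCovariance().fit(X)

true_cov = gen_cov.T @ gen_cov   # covariance of Z @ gen_cov with Z ~ N(0, I)
err_robust = robust.error_norm(true_cov)        # robust estimate vs truth
err_empirical = empirical.error_norm(true_cov)  # empirical estimate vs truth
print(err_robust, err_empirical)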
# computation
for i, n_outliers in enumerate(range_n_outliers):
    for j in range(repeat):
        # generate data
        X = np.random.randn(n_samples, n_features)
        # add some outliers
        outliers_index = np.random.permutation(n_samples)[:n_outliers]
        outliers_offset = 10. * \
            (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
        X[outliers_index] += outliers_offset
        inliers_mask = np.ones(n_samples).astype(bool)
        inliers_mask[outliers_index] = False

        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        S = MCD().fit(X, reweight=None)
        # compare robust estimates with the true location and covariance
        err_loc_mcd[i, j] = np.sum(S.location_ ** 2)
        err_cov_mcd[i, j] = S.error_norm(np.eye(n_features))

        # fit a reweighted MCD robust estimator to data
        S = MCD().fit(X)
        # compare robust estimates with the true location and covariance
        err_loc_mcd_reweighted[i, j] = np.sum(S.location_ ** 2)
        err_cov_mcd_reweighted[i, j] = S.error_norm(np.eye(n_features))

        # compare estimators learnt from the full data set with true parameters
        err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
        err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
            np.eye(n_features))

        # compare with an empirical covariance learnt from a pure data set
        # (i.e. "perfect" MCD)
        pure_X = X[inliers_mask]
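# Sketch (assumption: current scikit-learn API). With MinCovDet a single fit
# yields both the raw and the reweighted estimate, so the two fits above
# (`reweight=None` vs default) collapse into one: the raw estimate is exposed
# as raw_location_/raw_covariance_. As in the loop above, the true location is
# zero and the true covariance is the identity.
import numpy as np
from sklearn.covariance import MinCovDet, EmpiricalCovariance

def one_trial(n_samples=80, n_features=5, n_outliers=20, seed=0):
    rng = np.random.default_rng(seed)
    X = rng.normal(size=(n_samples, n_features))
    idx = rng.permutation(n_samples)[:n_outliers]
    X[idx] += 10.0 * (rng.integers(2, size=(n_outliers, n_features)) - 0.5)

    S = MinCovDet().fit(X)
    err_loc_raw = np.sum(S.raw_location_ ** 2)
    err_cov_raw = np.sum((S.raw_covariance_ - np.eye(n_features)) ** 2)
    err_loc_rw = np.sum(S.location_ ** 2)
    err_cov_rw = S.error_norm(np.eye(n_features))
    err_cov_emp = EmpiricalCovariance().fit(X).error_norm(np.eye(n_features))
    return err_loc_raw, err_cov_raw, err_loc_rw, err_cov_rw, err_cov_emp

print(one_trial())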