def __init__(self, store_precision=True, assume_centered=False, h=None,
             correction=None):
    EmpiricalCovariance.__init__(
        self, store_precision=store_precision,
        assume_centered=assume_centered)
    self.h = h
    self.correction = correction
def test_suffstat_sk_tied():
    # use equation Nk * Sk / N = S_tied
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]

    covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    covars_pred_full = np.sum(nk[:, np.newaxis, np.newaxis] *
                              covars_pred_full, 0) / n_samples

    covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0)

    ecov = EmpiricalCovariance()
    ecov.covariance_ = covars_pred_full
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, 'tied')
    precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T)
    precs_est = linalg.inv(covars_pred_tied)
    assert_array_almost_equal(precs_est, precs_pred)
def test_suffstat_sk_diag():
    # test against 'full' case
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]

    precs_pred_full = _estimate_gaussian_precisions_cholesky_full(resp, X,
                                                                  nk, xk, 0)
    covars_pred_full = [linalg.inv(np.dot(precision_chol, precision_chol.T))
                        for precision_chol in precs_pred_full]
    precs_pred_diag = _estimate_gaussian_precisions_cholesky_diag(resp, X,
                                                                  nk, xk, 0)
    covars_pred_diag = np.array([np.diag(1. / d) ** 2
                                 for d in precs_pred_diag])

    ecov = EmpiricalCovariance()
    for (cov_full, cov_diag) in zip(covars_pred_full, covars_pred_diag):
        ecov.covariance_ = np.diag(np.diag(cov_full))
        assert_almost_equal(ecov.error_norm(cov_diag, norm='frobenius'), 0)
        assert_almost_equal(ecov.error_norm(cov_diag, norm='spectral'), 0)
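# NOTE: the tests above reuse EmpiricalCovariance.error_norm as a generic
# matrix comparator by assigning covariance_ directly instead of calling
# fit(). A minimal sketch of that trick on made-up matrices:
import numpy as np
from sklearn.covariance import EmpiricalCovariance

A = np.array([[2.0, 0.3],
              [0.3, 1.0]])
B = A + 1e-9  # a nearly identical matrix

ecov = EmpiricalCovariance()
ecov.covariance_ = A  # bypass fit(); error_norm only reads covariance_
print(ecov.error_norm(B, norm='frobenius'))  # ~0
print(ecov.error_norm(B, norm='spectral'))   # ~0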
class CovEmbedding(BaseEstimator, TransformerMixin):
    """Transformer that returns the coefficients on a flat space to
    perform the analysis.
    """

    def __init__(self, base_estimator=None, kind='tangent'):
        self.base_estimator = base_estimator
        self.kind = kind

    def fit(self, X, y=None):
        if self.base_estimator is None:
            self.base_estimator_ = EmpiricalCovariance(assume_centered=True)
        else:
            self.base_estimator_ = clone(self.base_estimator)

        if self.kind == 'tangent':
            # Euclidean mean as an approximation to the geodesic mean
            covs = [self.base_estimator_.fit(x).covariance_ for x in X]
            covs = my_stack(covs)
            mean_cov = np.mean(covs, axis=0)
            self.whitening_ = inv_sqrtm(mean_cov)
        return self

    def transform(self, X):
        """Apply transform to covariances.

        Parameters
        ----------
        X : list of array
            list of covariance matrices, shape (n_rois, n_rois)

        Returns
        -------
        list of array
            transformed covariance matrices,
            shape (n_rois * (n_rois + 1) / 2,)
        """
        covs = [self.base_estimator_.fit(x).covariance_ for x in X]
        covs = my_stack(covs)
        p = covs.shape[-1]
        if self.kind == 'tangent':
            id_ = np.identity(p)
            covs = [self.whitening_.dot(c.dot(self.whitening_)) - id_
                    for c in covs]
        elif self.kind == 'partial correlation':
            covs = [cov_to_corr(inv(g)) for g in covs]
        elif self.kind == 'correlation':
            covs = [cov_to_corr(g) for g in covs]
        return np.array([sym_to_vec(c) for c in covs])
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator.

    Uses a covariance estimate to compute the Mahalanobis distance
    of observations from the model.

    Parameters
    ----------
    robust : bool
        Whether to use a robust estimator based on the Minimum
        Covariance Determinant computation.
    """

    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance \
                as CovarianceEstimator
        else:
            from sklearn.covariance import MinCovDet \
                as CovarianceEstimator
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fit the covariance model according to the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Compute the Mahalanobis distances of the given observations.

        The provided observations are assumed to be centered. One may
        want to center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The observations for which we compute the Mahalanobis
            distances.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """
        # return self.model.score(X, assume_centered=True)
        return -self.model.mahalanobis(X - self.model.location_) ** 0.33
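# Hypothetical usage sketch for the Mahalanobis wrapper above; the data and
# shapes are made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0],
                            cov=[[2.0, 0.3], [0.3, 1.0]], size=200)
m = Mahalanobis(robust=True).fit(X)  # MinCovDet under the hood
scores = m.score(X)  # more negative = farther from the fitted model
print(scores[:5])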
def __init__(self, store_precision=True, assume_centered=False, h=None,
             contamination=0.1, pvalue_correction="fwer", no_fit=False):
    EmpiricalCovariance.__init__(
        self, store_precision=store_precision,
        assume_centered=assume_centered)
    CovarianceOutlierDetectionMixin.__init__(
        self, contamination=contamination,
        pvalue_correction=pvalue_correction)
    self.no_fit = no_fit
def printSciKitCovarianceMatrixs():
    # does not work: ValueError: setting an array element with a sequence.
    xMaker = RSTCovarianceMatrixMaker()
    nums, data, ilabels = getLabeledRSTData(False)
    for i, d in enumerate(data):
        d['ratio'] = ilabels[i]
    xMaker.setInstanceNums(nums)
    xMaker.fit(data)
    X = xMaker.transform(data)

    correlator = EmpiricalCovariance()
    correlator.fit(X)
    print(correlator.covariance_)
class CovEmbedding(BaseEstimator, TransformerMixin):
    """Transformer that returns the coefficients on a flat space to
    perform the analysis.
    """

    def __init__(self, cov_estimator=None, kind='tangent'):
        self.cov_estimator = cov_estimator
        self.kind = kind

    def fit(self, X, y=None):
        if self.cov_estimator is None:
            self.cov_estimator_ = EmpiricalCovariance(assume_centered=True)
        else:
            self.cov_estimator_ = clone(self.cov_estimator)

        if self.kind == 'tangent':
            covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
            self.mean_cov_ = spd_mfd.frechet_mean(covs, max_iter=30, tol=1e-7)
            self.whitening_ = spd_mfd.inv_sqrtm(self.mean_cov_)
        return self

    def transform(self, X):
        """Apply transform to covariances.

        Parameters
        ----------
        X : list of array
            list of covariance matrices, shape (n_rois, n_rois)

        Returns
        -------
        list of array
            transformed covariance matrices,
            shape (n_rois * (n_rois + 1) / 2,)
        """
        covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
        covs = spd_mfd.my_stack(covs)
        if self.kind == 'tangent':
            covs = [spd_mfd.logm(self.whitening_.dot(c).dot(self.whitening_))
                    for c in covs]
        elif self.kind == 'precision':
            covs = [spd_mfd.inv(g) for g in covs]
        elif self.kind == 'partial correlation':
            covs = [prec_to_partial(spd_mfd.inv(g)) for g in covs]
        elif self.kind == 'correlation':
            covs = [cov_to_corr(g) for g in covs]
        else:
            raise ValueError("Unknown connectivity measure.")
        return np.array([sym_to_vec(c) for c in covs])
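# The tangent-space branch above hinges on two matrix functions: an inverse
# square root for whitening and a matrix logarithm for the projection. The
# spd_mfd helpers are project-specific; this is a minimal stand-in sketch
# using scipy.linalg, with a Euclidean mean replacing the Frechet mean (as
# the earlier variant of this class does).
import numpy as np
from scipy.linalg import inv, logm, sqrtm

covs = np.array([np.eye(3) * s for s in (1.0, 2.0, 4.0)])  # toy SPD matrices
mean_cov = covs.mean(axis=0)        # Euclidean approximation of the mean
whitening = inv(sqrtm(mean_cov))    # mean_cov ** (-1/2)
tangent = [logm(whitening.dot(c).dot(whitening)) for c in covs]
# each projected matrix is symmetric, i.e. it lives in a flat vector space
print(np.allclose(tangent[0], tangent[0].T))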
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
fitting-an-elliptic-envelop for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)

    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])

    # Show contours of the distance functions
    xx, yy = np.meshgrid(
        np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1], 100),
        np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    ax1.legend([emp_cov_contour.collections[1],
                robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
def outlier_rejection(feat, prob):
    """Keep observations whose squared Mahalanobis distance from the
    median is below the chi-squared quantile at `prob`."""
    import numpy
    import scipy.stats
    from sklearn.covariance import EmpiricalCovariance  # MinCovDet for robust

    robust_cov = EmpiricalCovariance().fit(feat)
    dist = robust_cov.mahalanobis(feat - numpy.median(feat, 0))
    cut = scipy.stats.chi2.ppf(prob, feat.shape[1])
    return dist < cut
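# Toy run of outlier_rejection on synthetic data (the planted offset is
# arbitrary); the farthest points should fall outside the cut.
import numpy

rng = numpy.random.RandomState(0)
feat = rng.randn(300, 4)
feat[:5] += 8.0  # plant a few obvious outliers
keep = outlier_rejection(feat, prob=0.975)
print(int(keep.sum()), "of", len(keep), "observations kept")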
def test_gaussian_mixture_fit():
    # recover the ground truth
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    n_features = rand_data.n_features
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        g = GaussianMixture(n_components=n_components, n_init=20,
                            reg_covar=0, random_state=rng,
                            covariance_type=covar_type)
        g.fit(X)

        # needs more data to pass the test with rtol=1e-7
        assert_allclose(np.sort(g.weights_), np.sort(rand_data.weights),
                        rtol=0.1, atol=1e-2)

        arg_idx1 = g.means_[:, 0].argsort()
        arg_idx2 = rand_data.means[:, 0].argsort()
        assert_allclose(g.means_[arg_idx1], rand_data.means[arg_idx2],
                        rtol=0.1, atol=1e-2)

        if covar_type == 'full':
            prec_pred = g.precisions_
            prec_test = rand_data.precisions['full']
        elif covar_type == 'tied':
            prec_pred = np.array([g.precisions_] * n_components)
            prec_test = np.array([rand_data.precisions['tied']] *
                                 n_components)
        elif covar_type == 'spherical':
            prec_pred = np.array([np.eye(n_features) * c
                                  for c in g.precisions_])
            prec_test = np.array([np.eye(n_features) * c
                                  for c in rand_data.precisions['spherical']])
        elif covar_type == 'diag':
            prec_pred = np.array([np.diag(d) for d in g.precisions_])
            prec_test = np.array([np.diag(d)
                                  for d in rand_data.precisions['diag']])

        arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort()
        arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort()
        for k, h in zip(arg_idx1, arg_idx2):
            ecov = EmpiricalCovariance()
            ecov.covariance_ = prec_test[h]
            # the accuracy depends on the amount of data and on the rng
            assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.1)
class OneClassMahalanobis(BaseClassifier):
    _fit_params = ['perc_keep']
    _predict_params = []

    def __init__(self, *args, **kwargs):
        self.perc_keep = kwargs["perc_keep"]

    def fit(self, data):
        nu = 0.01
        n_sample = data.shape[0]
        n_feature = data.shape[1]

        # trim the extreme nu/2 tails of every feature before fitting
        exclude = set()
        for d in range(n_feature):
            feature = data[:, d]
            s_feature = feature.copy()
            s_feature.sort()
            low = s_feature[int(n_sample * nu / 2)]
            upp = s_feature[n_sample - int(n_sample * nu / 2)]
            exld = numpy.nonzero(numpy.logical_or(feature > upp,
                                                  feature < low))[0]
            exclude.update(exld)

        use = numpy.array([f for f in range(n_sample) if f not in exclude])
        data_ = data[use, :]

        self.cov = EmpiricalCovariance().fit(data_)
        dist = self.cov.mahalanobis(data)
        self.cutoff = numpy.percentile(dist, self.perc_keep)
        print(self.cutoff)

    def predict(self, data):
        mahal_dist = self.cov.mahalanobis(data)
        self.mahal_dist = mahal_dist
        print(mahal_dist.min(), mahal_dist.max(), self.cutoff,
              (mahal_dist > self.cutoff).sum(), "of", len(mahal_dist))
        # int32 so the -2 scaling yields +1/-1 labels without unsigned overflow
        return (mahal_dist > self.cutoff).astype(numpy.int32) * -2 + 1

    def decision_function(self, data=None):
        return self.mahal_dist
def test_suffstat_sk_tied():
    # use equation Nk * Sk / N = S_tied
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]

    covars_pred_full = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    covars_pred_full = np.sum(nk[:, np.newaxis, np.newaxis] *
                              covars_pred_full, 0) / n_samples

    covars_pred_tied = _estimate_gaussian_covariance_tied(resp, X, nk, xk, 0)

    ecov = EmpiricalCovariance()
    ecov.covariance_ = covars_pred_full
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='spectral'), 0)
def launch_mcd_on_dataset(n_samples, n_features, n_outliers,
                          tol_loc, tol_cov, tol_support, correction):
    data = np.random.randn(n_samples, n_features)
    # add some outliers
    outliers_index = np.random.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    # compute MCD directly
    T, S, H = fast_mcd(data, correction=correction)
    # compare with the estimates learnt from the inliers
    pure_data = data[inliers_mask]
    error_location = np.sum((pure_data.mean(0) - T) ** 2)
    assert error_location < tol_loc
    emp_cov = EmpiricalCovariance().fit(pure_data)
    assert emp_cov.error_norm(S) < tol_cov
    assert np.sum(H) > tol_support
    # check improvement
    if (n_outliers / float(n_samples) > 0.1) and (n_features > 1):
        error_bad_location = np.sum((data.mean(0) - T) ** 2)
        assert error_bad_location > error_location
        bad_emp_cov = EmpiricalCovariance().fit(data)
        assert emp_cov.error_norm(S) < bad_emp_cov.error_norm(S)

    # compute MCD by fitting an object
    mcd_fit = MCD().fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.sum((pure_data.mean(0) - T) ** 2)
    assert error_location < tol_loc
    assert emp_cov.error_norm(S) < tol_cov
    assert np.sum(H) > tol_support
    # check improvement
    if (n_outliers / float(n_samples) > 0.1) and (n_features > 1):
        error_bad_location = np.sum((data.mean(0) - T) ** 2)
        assert error_bad_location > error_location
        bad_emp_cov = EmpiricalCovariance().fit(data)
        assert emp_cov.error_norm(S) < bad_emp_cov.error_norm(S)
def test_suffstat_sk_full():
    # compare the precision matrix computed from
    # EmpiricalCovariance.covariance_ fitted on X * sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)
def test_suffstat_sk_diag():
    # test against 'full' case
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]

    covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0)

    ecov = EmpiricalCovariance()
    for (cov_full, cov_diag) in zip(covars_pred_full, covars_pred_diag):
        ecov.covariance_ = np.diag(np.diag(cov_full))
        cov_diag = np.diag(cov_diag)
        assert_almost_equal(ecov.error_norm(cov_diag, norm='frobenius'), 0)
        assert_almost_equal(ecov.error_norm(cov_diag, norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, 'diag')
    assert_almost_equal(covars_pred_diag, 1. / precs_chol_pred ** 2)
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation

    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)
    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_

    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)

    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)

    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
class OneClassMahalanobis(BaseClassifier):
    _fit_params = []

    def __init__(self, *args, **kwargs):
        pass

    def fit(self, data):
        # self.cov = MinCovDet().fit(data)  # robust alternative
        self.cov = EmpiricalCovariance().fit(data)

    def predict(self, data):
        mahal_emp_cov = self.cov.mahalanobis(data)
        d = data.shape[1]
        thres = scipy.stats.chi2.ppf(0.95, d)
        self.mahal_emp_cov = mahal_emp_cov
        return (mahal_emp_cov > thres).astype(numpy.int32) * -2 + 1

    def decision_function(self, data):
        return self.mahal_emp_cov
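# The fixed chi-squared cutoff above works because, for Gaussian data, squared
# Mahalanobis distances follow a chi2 distribution with n_features degrees of
# freedom, so the 0.95 quantile flags about 5% of clean points. Sketch below
# assumes BaseClassifier tolerates no-argument construction and that numpy
# and scipy.stats are imported at module level.
import numpy

rng = numpy.random.RandomState(0)
data = rng.randn(500, 3)
clf = OneClassMahalanobis()
clf.fit(data)
labels = clf.predict(data)    # +1 inlier, -1 outlier
print((labels == -1).mean())  # close to 0.05 by construction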
import numpy as np
from sklearn.covariance import EmpiricalCovariance

real_cov = np.array([[.8, .3],
                     [.3, .4]])
rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
print(X)
print(X.shape)
cov = EmpiricalCovariance().fit(X)
print(cov.covariance_.shape)
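# Follow-up sketch, continuing the snippet above (cov and real_cov in scope):
# error_norm quantifies how far the fit is from the generating matrix.
print(cov.covariance_)           # approaches real_cov as the sample grows
print(cov.error_norm(real_cov))  # scaled, squared Frobenius error; near 0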
def calc_suggestions(self):
    """Used to normalize."""
    suggs = []
    preds = []
    ents = []

    # Quick fix till I come up with an idea to approximate the manifold...
    inv_cov = np.linalg.inv(
        EmpiricalCovariance(assume_centered=True)
        .fit(self.N_samples, self.M_samples).covariance_
        + np.eye(self.N) * 1e-6)

    # Exploitation
    for p in range(self.samples):
        gradients, inv_hess = self.S.run(
            (self.IN_grads, self.IN_hessian),
            feed_dict={self.IN: [self.N_samples[p]]})
        suggestion = self.N_samples[p].reshape(-1) - (
            inv_hess @ gradients).reshape(-1)
        if self.minis and self.maxis:
            suggestion = np.clip(suggestion, self.minis, self.maxis)
        pred = self.predict([suggestion])
        ent = 1000000
        for point in self.N_samples:
            ent = min(
                ent,
                np.sqrt((point - suggestion).reshape(1, -1) @ inv_cov @ (
                    point - suggestion).reshape(-1, 1)))
        suggs.append(suggestion)
        preds.append(pred)
        ents.append(ent)

    # Exploration
    for p in range(10):
        suggestion = self.S.run(self.N_gen).reshape(-1)
        pred = self.predict([suggestion])
        ent = 1000000
        for point in self.N_samples:
            ent = min(
                ent,
                np.sqrt((point - suggestion).reshape(1, -1) @ inv_cov @ (
                    point - suggestion).reshape(-1, 1)))
        suggs.append(suggestion)
        preds.append(pred)
        ents.append(ent)

    suggs = np.array(suggs)
    preds = np.array(preds).reshape(-1)
    ents = np.array(ents).reshape(-1)
    mean_pred = np.mean(preds)
    stdev_pred = np.std(preds)
    mean_ent = np.mean(ents)
    stdev_ent = np.std(ents)
    preds = (preds - mean_pred) / stdev_pred
    ents = (ents - mean_ent) / stdev_ent
    return suggs, preds, ents
def plot_contours(self, ax, show=False):
    # slice the empirical covariance down to the first two coordinates
    COV = self.emp_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = np.array([COV.location_[0], COV.location_[1]])
    COV_slice.covariance_ = np.array([
        COV.covariance_[0, 0], COV.covariance_[0, 1],
        COV.covariance_[1, 0], COV.covariance_[1, 1]]).reshape((2, 2))
    COV_slice.precision_ = np.array([
        COV.precision_[0, 0], COV.precision_[0, 1],
        COV.precision_[1, 0], COV.precision_[1, 1]]).reshape((2, 2))

    # Show contours of the distance functions
    xx, yy = np.meshgrid(
        np.linspace(COV_slice.location_[0]
                    - 5 * math.sqrt(COV_slice.covariance_[0, 0]),
                    COV_slice.location_[0]
                    + 5 * math.sqrt(COV_slice.covariance_[0, 0]), 100),
        np.linspace(COV_slice.location_[1]
                    - 5 * math.sqrt(COV_slice.covariance_[1, 1]),
                    COV_slice.location_[1]
                    + 5 * math.sqrt(COV_slice.covariance_[1, 1]), 100),
    )
    zz = np.c_[xx.ravel(), yy.ravel()]

    # Empirical fit is not so good; don't plot this.
    if False:  # keep for debugging
        mahal_emp_cov = COV_slice.mahalanobis(zz)
        mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
        emp_cov_contour = ax.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                     levels=[1., 2., 3., 4., 5.],
                                     cmap=plt.cm.cool_r,
                                     linestyles='dashed')

    # slice the robust covariance the same way
    COV = self.rob_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = np.array([COV.location_[0], COV.location_[1]])
    COV_slice.covariance_ = np.array([
        COV.covariance_[0, 0], COV.covariance_[0, 1],
        COV.covariance_[1, 0], COV.covariance_[1, 1]]).reshape((2, 2))
    COV_slice.precision_ = np.array([
        COV.precision_[0, 0], COV.precision_[0, 1],
        COV.precision_[1, 0], COV.precision_[1, 1]]).reshape((2, 2))
    self.robust_model_XY = COV_slice

    # robust is better
    if show:
        mahal_robust_cov = COV_slice.mahalanobis(zz)
        mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
        robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                    levels=[1., 2., 3., 4., 5.],
                                    cmap=plt.cm.spring_r,
                                    linestyles='dotted')
def __init__(self, lab_coords_x, lab_coords_y, data, i_panel, delta_scalar,
             params, verbose=True):
    training_data = []
    mean_x = flex.mean(lab_coords_x)
    mean_y = flex.mean(lab_coords_y)
    limit = delta_scalar * 10
    for ix in range(len(data)):
        if abs(lab_coords_x[ix] - mean_x) > limit:
            continue
        if abs(lab_coords_y[ix] - mean_y) > limit:
            continue
        if abs(data[ix]) > 1:
            continue
        training_data.append((lab_coords_x[ix], lab_coords_y[ix], data[ix]))
    if verbose:
        print("Training data is less",
              len(lab_coords_x) - len(training_data), end=" ")
    colorcode_set = []
    for ix in range(len(data)):
        colorcode_set.append((lab_coords_x[ix], lab_coords_y[ix], data[ix]))

    from sklearn.covariance import EmpiricalCovariance, MinCovDet
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance(
        assume_centered=False, store_precision=True).fit(X=training_data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet(
        assume_centered=False, store_precision=True).fit(X=training_data)

    features = ["Δx", "Δy", "ΔΨ(deg)"]
    if verbose:
        print("%3d" % i_panel, end=" ")
        print("%4d items " % (len(training_data),), end=" ")
    for idx_report in range(len(features)):
        feature = features[idx_report]
        diag_elem = math.sqrt(emp_cov.covariance_[idx_report, idx_report])
        if verbose:
            print("%s=%7.2f±%6.2f" % (feature,
                                      emp_cov.location_[idx_report],
                                      diag_elem), end=" ")
    if verbose:
        print("%4d items:" % (flex.bool(robust_cov.support_).count(True)),
              end=" ")
    for idx_report in range(len(features)):
        feature = features[idx_report]
        diag_elem = math.sqrt(robust_cov.covariance_[idx_report, idx_report])
        if verbose:
            print("%s=%7.2f±%6.2f" % (feature,
                                      robust_cov.location_[idx_report],
                                      diag_elem), end=" ")

    # this metric represents mahalanobis ** 2
    disc = flex.double(robust_cov.mahalanobis(X=colorcode_set))
    disc_select = disc < (params.residuals.mcd_filter.mahalanobis_distance) ** 2
    if params.residuals.mcd_filter.keep == "outliers":
        disc_select = (disc_select == False)
    if verbose:
        print("OK %4.1f%%" % (100 * (disc_select.count(True)) /
                              len(training_data)))
    self.lab_coords_x = lab_coords_x.select(disc_select)
    self.lab_coords_y = lab_coords_y.select(disc_select)
    self.data = data.select(disc_select)
    self.n_input = len(lab_coords_x)
    self.n_output = len(self.lab_coords_x)
    self.emp_cov = emp_cov
    self.rob_cov = robust_cov
###### Likelihood Computation ######
# Fold the angles in params into the proper range, such that
# they are centered at the mean.
N_CYCLE_FOLD_ANGLE = 10
for j in range(N_CYCLE_FOLD_ANGLE):
    mean = np.mean(params, axis=0)
    for i in range(3, 6):  # index 3, 4, 5 are angles, others are distances
        params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi
        params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi
        if PARAMS_TLR[i] > mean[i] + np.pi:
            PARAMS_TLR[i] += 2 * np.pi
        if PARAMS_TLR[i] < mean[i] - np.pi:
            PARAMS_TLR[i] -= 2 * np.pi

est = EmpiricalCovariance(store_precision=True, assume_centered=False)
est.fit(params)
log_likelihood = est.score(PARAMS_TLR[None, :])
KT = 0.59
free_e = -log_likelihood * KT
print('Log likelihood score:', log_likelihood)
print('Free energy:', free_e)

###### Output the best conformer to pdb ######
def generate_bp_par_file(params, bps, out_name):
    assert len(params) == len(bps)
    n_bp = len(params)
    # convert from radians to degrees
    params[:, 3:] = np.degrees(params[:, 3:])
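# The key call above is score(): for a fitted EmpiricalCovariance it returns
# the mean Gaussian log-likelihood of the query points under the estimated
# location and covariance. Self-contained sketch of that step on synthetic
# data (sizes and the thermal factor mirror the snippet above):
import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
params = rng.randn(1000, 6)  # stand-in for the sampled conformer parameters
est = EmpiricalCovariance().fit(params)

query = rng.randn(1, 6)            # one candidate configuration
log_likelihood = est.score(query)  # mean log-likelihood of the query
KT = 0.59                          # kT in kcal/mol, as above
print('Free energy estimate:', -log_likelihood * KT)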
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features),
                         outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = pl.figure()
pl.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
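# Follow-up sketch, continuing the snippet above (X, n_outliers, emp_cov and
# robust_cov in scope): the robust fit separates the planted outliers more
# sharply than the MLE fit.
d_emp = emp_cov.mahalanobis(X)
d_rob = robust_cov.mahalanobis(X)
print("outlier mean sq. distance (MLE):   ", d_emp[-n_outliers:].mean())
print("outlier mean sq. distance (robust):", d_rob[-n_outliers:].mean())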
def fit(self, X, y=None): """Fits the GraphLasso covariance model to X. Closely follows sklearn.covariance.graph_lasso.GraphLassoCV. Parameters ---------- X : ndarray, shape (n_samples, n_features) Data from which to compute the covariance estimate """ # initialize X = check_array(X, ensure_min_features=2, estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, X, y, classifier=False) self.init_coefs(X) # get path if isinstance(self.lams, collections.Sequence): path = self.lams n_refinements = 1 else: n_refinements = self.n_refinements lam_1 = self.lam_scale_ lam_0 = 1e-2 * lam_1 path = np.logspace(np.log10(lam_0), np.log10(lam_1), self.lams)[::-1] # run this thing a bunch results = list() t0 = time.time() for rr in range(n_refinements): # parallel version this_result = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, )( delayed(_quic_path)( X[train], path, X_test=X[test], lam=self.lam, tol=self.tol, max_iter=self.max_iter, Theta0=self.Theta0, Sigma0=self.Sigma0, method=self.method, verbose=self.verbose, score_metric=self.score_metric, init_method=self.init_method) for train, test in cv) # Little dance to transform the list in what we need covs, _, scores = zip(*this_result) covs = zip(*covs) scores = zip(*scores) results.extend(zip(path, scores, covs)) results = sorted(results, key=operator.itemgetter(0), reverse=True) # Find the maximum (avoid using built in 'max' function to # have a fully-reproducible selection of the smallest alpha # in case of equality) best_score = -np.inf last_finite_idx = 0 for index, (lam, scores, _) in enumerate(results): # sometimes we get -np.inf in the result (in kl-loss) scores = [s for s in scores if not np.isinf(s)] if len(scores) == 0: this_score = -np.inf else: this_score = np.mean(scores) if this_score >= .1 / np.finfo(np.float64).eps: this_score = np.nan if np.isfinite(this_score): last_finite_idx = index if this_score >= best_score: best_score = this_score best_index = index # Refine the grid if best_index == 0: # We do not need to go back: we have chosen # the highest value of lambda for which there are # non-zero coefficients lam_1 = results[0][0] lam_0 = results[1][0] elif (best_index == last_finite_idx and not best_index == len(results) - 1): # We have non-converged models on the upper bound of the # grid, we need to refine the grid there lam_1 = results[best_index][0] lam_0 = results[best_index + 1][0] elif best_index == len(results) - 1: lam_1 = results[best_index][0] lam_0 = 0.01 * results[best_index][0] else: lam_1 = results[best_index - 1][0] lam_0 = results[best_index + 1][0] if not isinstance(self.lams, collections.Sequence): path = np.logspace(np.log10(lam_1), np.log10(lam_0), self.lams + 2) path = path[1:-1] if self.verbose and n_refinements > 1: print('[GraphLassoCV] Done refinement % 2i out of %i: % 3is' % (rr + 1, n_refinements, time.time() - t0)) results = list(zip(*results)) grid_scores = list(results[1]) lams = list(results[0]) # Finally, compute the score with lambda = 0 lams.append(0) grid_scores.append(cross_val_score(EmpiricalCovariance(), X, cv=cv, n_jobs=self.n_jobs)) self.grid_scores = np.array(grid_scores) self.lam_ = self.lam * lams[best_index] self.cv_lams_ = [self.lam * l for l in lams] # Finally fit the model with the selected lambda if self.method == 'quic': (self.precision_, self.covariance_, self.opt_, self.cputime_, self.iters_, self.duality_gap_) = quic(self.sample_covariance_, self.lam_, mode='default', tol=self.tol, max_iter=self.max_iter, Theta0=self.Theta0, 
Sigma0=self.Sigma0, path=None, msg=self.verbose) else: raise NotImplementedError( "Only method='quic' has been implemented.") self.is_fitted = True return self
outliers_offset = 10. * \
    (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
X[outliers_index] += outliers_offset
inliers_mask = np.ones(n_samples).astype(bool)
inliers_mask[outliers_index] = False

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
mcd = MinCovDet().fit(X)
# compare raw robust estimates with the true location and covariance
err_loc_mcd[i, j] = np.sum(mcd.location_ ** 2)
err_cov_mcd[i, j] = mcd.error_norm(np.eye(n_features))

# compare estimators learned from the full data set with true
# parameters
err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
    np.eye(n_features))

# compare with an empirical covariance learned from a pure data set
# (i.e. "perfect" mcd)
pure_X = X[inliers_mask]
pure_location = pure_X.mean(0)
pure_emp_cov = EmpiricalCovariance().fit(pure_X)
err_loc_emp_pure[i, j] = np.sum(pure_location ** 2)
err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features))

# Display results
font_prop = matplotlib.font_manager.FontProperties(size=11)
plt.subplot(2, 1, 1)
lw = 2
plt.errorbar(range_n_outliers, err_loc_mcd.mean(1),
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # FIXME I don't know what this test does
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amax(mahal_dist) < 250
    assert np.amin(mahal_dist) > 50

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
def _sample_covariance(self, X):
    return EmpiricalCovariance().fit(X).covariance_
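# For reference: with assume_centered=False, the maximum-likelihood estimate
# above coincides with NumPy's biased (divide-by-n) sample covariance.
import numpy as np
from sklearn.covariance import EmpiricalCovariance

X = np.random.RandomState(0).randn(100, 3)
S_sklearn = EmpiricalCovariance().fit(X).covariance_
S_numpy = np.cov(X, rowvar=False, bias=True)
print(np.allclose(S_sklearn, S_numpy))  # True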
def test_connectivity_measure_outputs(): n_subjects = 10 n_features = 49 n_samples = 200 # Generate signals and compute covariances emp_covs = [] ledoit_covs = [] signals = [] random_state = check_random_state(0) ledoit_estimator = LedoitWolf() for k in range(n_subjects): signal = random_state.randn(n_samples, n_features) signals.append(signal) signal -= signal.mean(axis=0) emp_covs.append((signal.T).dot(signal) / n_samples) ledoit_covs.append(ledoit_estimator.fit(signal).covariance_) kinds = ["correlation", "tangent", "precision", "partial correlation"] # Check outputs properties for cov_estimator, covs in zip( [EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]): input_covs = copy.copy(covs) for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind, cov_estimator=cov_estimator) connectivities = conn_measure.fit_transform(signals) # Generic assert_true(isinstance(connectivities, np.ndarray)) assert_equal(len(connectivities), len(covs)) for k, cov_new in enumerate(connectivities): assert_array_equal(input_covs[k], covs[k]) assert (is_spd(covs[k], decimal=7)) # Positive definiteness if expected and output value checks if kind == "tangent": assert_array_almost_equal(cov_new, cov_new.T) gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_) assert (is_spd(gmean_sqrt, decimal=7)) assert (is_spd(conn_measure.whitening_, decimal=7)) assert_array_almost_equal( conn_measure.whitening_.dot(gmean_sqrt), np.eye(n_features)) assert_array_almost_equal( gmean_sqrt.dot(_map_eigenvalues( np.exp, cov_new)).dot(gmean_sqrt), covs[k]) elif kind == "precision": assert (is_spd(cov_new, decimal=7)) assert_array_almost_equal(cov_new.dot(covs[k]), np.eye(n_features)) elif kind == "correlation": assert (is_spd(cov_new, decimal=7)) d = np.sqrt(np.diag(np.diag(covs[k]))) if cov_estimator == EmpiricalCovariance(): assert_array_almost_equal( d.dot(cov_new).dot(d), covs[k]) assert_array_almost_equal(np.diag(cov_new), np.ones((n_features))) elif kind == "partial correlation": prec = linalg.inv(covs[k]) d = np.sqrt(np.diag(np.diag(prec))) assert_array_almost_equal( d.dot(cov_new).dot(d), -prec + 2 * np.diag(np.diag(prec)))
def test_connectivity_measure_outputs(): n_subjects = 10 n_features = 49 # Generate signals and compute covariances emp_covs = [] ledoit_covs = [] signals = [] ledoit_estimator = LedoitWolf() for k in range(n_subjects): n_samples = 200 + k signal, _, _ = generate_signals(n_features=n_features, n_confounds=5, length=n_samples, same_variance=False) signals.append(signal) signal -= signal.mean(axis=0) emp_covs.append((signal.T).dot(signal) / n_samples) ledoit_covs.append(ledoit_estimator.fit(signal).covariance_) kinds = [ "covariance", "correlation", "tangent", "precision", "partial correlation" ] # Check outputs properties for cov_estimator, covs in zip( [EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]): input_covs = copy.copy(covs) for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind, cov_estimator=cov_estimator) connectivities = conn_measure.fit_transform(signals) # Generic assert_true(isinstance(connectivities, np.ndarray)) assert_equal(len(connectivities), len(covs)) for k, cov_new in enumerate(connectivities): assert_array_equal(input_covs[k], covs[k]) assert (is_spd(covs[k], decimal=7)) # Positive definiteness if expected and output value checks if kind == "tangent": assert_array_almost_equal(cov_new, cov_new.T) gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_) assert (is_spd(gmean_sqrt, decimal=7)) assert (is_spd(conn_measure.whitening_, decimal=7)) assert_array_almost_equal( conn_measure.whitening_.dot(gmean_sqrt), np.eye(n_features)) assert_array_almost_equal( gmean_sqrt.dot(_map_eigenvalues( np.exp, cov_new)).dot(gmean_sqrt), covs[k]) elif kind == "precision": assert (is_spd(cov_new, decimal=7)) assert_array_almost_equal(cov_new.dot(covs[k]), np.eye(n_features)) elif kind == "correlation": assert (is_spd(cov_new, decimal=7)) d = np.sqrt(np.diag(np.diag(covs[k]))) if cov_estimator == EmpiricalCovariance(): assert_array_almost_equal( d.dot(cov_new).dot(d), covs[k]) assert_array_almost_equal(np.diag(cov_new), np.ones((n_features))) elif kind == "partial correlation": prec = linalg.inv(covs[k]) d = np.sqrt(np.diag(np.diag(prec))) assert_array_almost_equal( d.dot(cov_new).dot(d), -prec + 2 * np.diag(np.diag(prec))) # Check the mean_ for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind) conn_measure.fit_transform(signals) assert_equal((conn_measure.mean_).shape, (n_features, n_features)) if kind != 'tangent': assert_array_almost_equal( conn_measure.mean_, np.mean(conn_measure.transform(signals), axis=0)) # Check that the mean isn't modified in transform conn_measure = ConnectivityMeasure(kind='covariance') conn_measure.fit(signals[:1]) mean = conn_measure.mean_ conn_measure.transform(signals[1:]) assert_array_equal(mean, conn_measure.mean_) # Check vectorization option for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind) connectivities = conn_measure.fit_transform(signals) conn_measure = ConnectivityMeasure(vectorize=True, kind=kind) vectorized_connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal(vectorized_connectivities, sym_matrix_to_vec(connectivities)) # Check not fitted error assert_raises_regex(ValueError, 'has not been fitted. 
', ConnectivityMeasure().inverse_transform, vectorized_connectivities) # Check inverse transformation kinds.remove('tangent') for kind in kinds: # without vectorization: input matrices are returned with no change conn_measure = ConnectivityMeasure(kind=kind) connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal( conn_measure.inverse_transform(connectivities), connectivities) # with vectorization: input vectors are reshaped into matrices # if diagonal has not been discarded conn_measure = ConnectivityMeasure(kind=kind, vectorize=True) vectorized_connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal( conn_measure.inverse_transform(vectorized_connectivities), connectivities) # with vectorization if diagonal has been discarded for kind in ['correlation', 'partial correlation']: connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals) conn_measure = ConnectivityMeasure(kind=kind, vectorize=True, discard_diagonal=True) vectorized_connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal( conn_measure.inverse_transform(vectorized_connectivities), connectivities) for kind in ['covariance', 'precision']: connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals) conn_measure = ConnectivityMeasure(kind=kind, vectorize=True, discard_diagonal=True) vectorized_connectivities = conn_measure.fit_transform(signals) diagonal = np.array( [np.diagonal(conn) / sqrt(2) for conn in connectivities]) inverse_transformed = conn_measure.inverse_transform( vectorized_connectivities, diagonal=diagonal) assert_array_almost_equal(inverse_transformed, connectivities) assert_raises_regex(ValueError, 'can not reconstruct connectivity matrices', conn_measure.inverse_transform, vectorized_connectivities) # for 'tangent' kind, covariance matrices are reconstructed # without vectorization tangent_measure = ConnectivityMeasure(kind='tangent') displacements = tangent_measure.fit_transform(signals) covariances = ConnectivityMeasure(kind='covariance').fit_transform(signals) assert_array_almost_equal(tangent_measure.inverse_transform(displacements), covariances) # with vectorization # when diagonal has not been discarded tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True) vectorized_displacements = tangent_measure.fit_transform(signals) assert_array_almost_equal( tangent_measure.inverse_transform(vectorized_displacements), covariances) # when diagonal has been discarded tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True, discard_diagonal=True) vectorized_displacements = tangent_measure.fit_transform(signals) diagonal = np.array( [np.diagonal(matrix) / sqrt(2) for matrix in displacements]) inverse_transformed = tangent_measure.inverse_transform( vectorized_displacements, diagonal=diagonal) assert_array_almost_equal(inverse_transformed, covariances) assert_raises_regex(ValueError, 'can not reconstruct connectivity matrices', tangent_measure.inverse_transform, vectorized_displacements)
hPurity_disc.GetYaxis().SetRangeUser(0, 1.3)
hPurity_disc.Divide(hPurity_discDen)
hPurity_disc.Draw()
c.Print("purity_disc.png")

hMVAdisc_pt.Draw("colz")
c.Print("discriminator_vs_candPt.png")

from sklearn.covariance import EmpiricalCovariance

npRocInput = numpy.array(rocInput)
npRocAnswers = numpy.array(rocScore)
slimNpData0 = npRocInput[npRocAnswers == 0]
slimNpData1 = npRocInput[npRocAnswers == 1]
ecv = EmpiricalCovariance()
ecv.fit(slimNpData0)

from scipy.linalg import fractional_matrix_power

def diagElements(m):
    size = m.shape[0]
    return numpy.matrix(numpy.diag([m[i, i] for i in range(size)]))

def corrMat(m):
    sqrt_diag = fractional_matrix_power(diagElements(m), -0.5)
    return numpy.array(sqrt_diag * m * sqrt_diag)
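# Toy check of corrMat: rescaling a covariance by the inverse square roots of
# its diagonal yields a correlation matrix with unit diagonal (values made up;
# np.matrix input so that `*` inside corrMat is a matrix product).
cov_toy = numpy.matrix([[4.0, 1.2],
                        [1.2, 1.0]])
corr = corrMat(cov_toy)
print(corr)                                   # [[1.0, 0.6], [0.6, 1.0]]
print(numpy.allclose(numpy.diag(corr), 1.0))  # True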
def __init__(self, cov_estimator=EmpiricalCovariance(assume_centered=True),
             kind='covariance'):
    self.cov_estimator = cov_estimator
    self.kind = kind
class Matching(IndividualOutcomeEstimator): def __init__( self, propensity_transform=None, caliper=None, with_replacement=True, n_neighbors=1, matching_mode="both", metric="mahalanobis", knn_backend="sklearn", estimate_observed_outcome=False, ): """Match treatment and control samples with similar covariates. Args: propensity_transform (causallib.transformers.PropensityTransformer): an object for data preprocessing which adds the propensity score as a feature (default: None) caliper (float) : maximal distance for a match to be accepted. If not defined, all matches will be accepted. If defined, some samples may not be matched and their outcomes will not be estimated. (default: None) with_replacement (bool): whether samples can be used multiple times for matching. If set to False, the matching process will optimize the linear sum of distances between pairs of treatment and control samples and only `min(N_treatment, N_control)` samples will be estimated. Matching with no replacement does not make use of the `fit` data and is therefore not implemented for out-of-sample data (default: True) n_neighbors (int) : number of nearest neighbors to include in match. Must be 1 if `with_replacement` is `False.` If larger than 1, the estimate is calculated using the `regress_agg_function` or `classify_agg_function` across the `n_neighbors`. Note that when the `caliper` variable is set, some samples will have fewer than `n_neighbors` matches. (default: 1). matching_mode (str) : Direction of matching: `treatment_to_control`, `control_to_treatment` or `both` to indicate which set should be matched to which. All sets are cross-matched in `match` and when `with_replacement` is `False` all matching modes coincide. With replacement there is a difference. metric (str) : Distance metric string for calculating distance between samples. Note: if an external built `knn_backend` object with a different metric is supplied, `metric` needs to be changed to reflect that, because `Matching` will set its inverse covariance matrix if "mahalanobis" is set. (default: "mahalanobis", also supported: "euclidean") knn_backend (str or callable) : Backend to use for nearest neighbor search. Options are "sklearn" or a callable which returns an object implementing `fit`, `kneighbors` and `set_params` like the sklearn `NearestNeighbors` object. (default: "sklearn"). estimate_observed_outcome (bool) : Whether to allow a match of a sample to a sample other than itself when looking within its own treatment value. If True, the estimated potential outcome for the observed outcome may differ from the true observed outcome. (default: False) Attributes: classify_agg_function (callable) : Aggregating function for outcome estimation when classifying. (default: majority_rule) Usage is determined by type of `y` during `fit` regress_agg_function (callable) : Aggregating function for outcome estimation when regressing or predicting prob_a. (default: np.mean) Usage is determined by type of `y` during `fit` treatments_ (pd.DataFrame) : DataFrame of treatments (created after `fit`) outcomes_ (pd.DataFrame) : DataFrame of outcomes (created after `fit`) match_df_ (pd.DataFrame) : Dataframe of most recently calculated matches. For details, see `match`. (created after `match`) samples_used_ (pd.Series) : Series with count of samples used during most recent match. Series includes a count for each treatment value. 
(created after `match`) """ self.propensity_transform = propensity_transform self.covariance_conditioner = EmpiricalCovariance() self.caliper = caliper self.with_replacement = with_replacement self.n_neighbors = n_neighbors self.matching_mode = matching_mode self.metric = metric # if classify task, default aggregation function is majority self.classify_agg_function = majority_rule # if regress task, default aggregation function is mean self.regress_agg_function = np.mean self.knn_backend = knn_backend self.estimate_observed_outcome = estimate_observed_outcome def fit(self, X, a, y, sample_weight=None): """Load the treatments and outcomes and fit search trees. Applies transform to covariates X, initializes search trees for each treatment value for performing nearest neighbor searches. Note: Running `fit` a second time overwrites any information from previous `fit or `match` and re-fits the propensity_transform object. Args: X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates for n samples. a (pd.Series): Series of shape (n,) containing discrete treatment values for the n samples. y (pd.Series): Series of shape (n,) containing outcomes for the n samples. sample_weight: IGNORED In signature for compatibility with other estimators. Note: `X`, `a` and `y` must share the same index. Returns: self (Matching) the fitted object """ self._clear_post_fit_variables() self.outcome_ = y.copy() self.treatments_ = a.copy() if self.propensity_transform: self.propensity_transform.fit(X, a) X = self.propensity_transform.transform(X) self.conditioned_covariance_ = self._calculate_covariance(X) self.treatment_knns_ = {} for a in self.treatments_.unique(): haystack = X[self.treatments_ == a] self.treatment_knns_[a] = self._fit_sknn(haystack) return self def _execute_matching(self, X, a): """Execute matching of samples in X according to the treatment values in a. Returns a DataFrame including all the results, which is also set as the attribute `self.match_df_`. The arguments `X` and `a` define the "needle" where the "haystack" is the data that was previously passed to fit, for matching with replacement. As such, treatment and control samples from within `X` will not be matched with each other, unless the same `X` and `a` were passed to `fit`. For matching without replacement, the `X` and `a` passed to `match` provide the "needle" and the "haystack". If the attribute `caliper` is set, the matches are limited to those with a distance less than `caliper`. This function ignores the existing `match_df_` and will overwrite it. It is thus useful for if you have changed the settings and need to rematch the samples. For most applications, the `match` function is more convenient. Args: X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates for n samples. a (pd.Series): Series of shape (n,) containing discrete treatment values for the n samples. Note: The args are assumed to share the same index. Returns: match_df: The resulting matches DataFrame is indexed so that ` match_df.loc[treatment_value, sample_id]` has columns `matches` and `distances` containing lists of indices to samples and the respective distances for the matches discovered for `sample_id` from within the fitted samples with the given `treatment_value`. The indices in the `matches` column are from the fitted data, not the X argument in `match`. If `sample_id` had no match, `match_df.loc[treatment_value, sample_id].matches = []`. The DataFrame has shape (n* len(a.unique()), 2 ). 
Raises: NotImplementedError: Raised when with_replacement is False and n_neighbors is not 1. """ if self.n_neighbors != 1 and not self.with_replacement: raise NotImplementedError( "Matching with more than one neighbor is only implemented " "for matching with replacement") if self.propensity_transform: X = self.propensity_transform.transform(X) if self.with_replacement: self.match_df_ = self._withreplacement_match(X, a) else: self.match_df_ = self._noreplacement_match(X, a) sample_id_name = X.index.name if X.index.name is not None else "sample_id" self.match_df_.index.set_names(["match_to_treatment", sample_id_name], inplace=True) # we record the number of samples that were successfully matched of # each treatment value self.samples_used_ = self._count_samples_used_by_treatment_value(a) return self.match_df_ def estimate_individual_outcome(self, X, a, y=None, treatment_values=None, predict_proba=True, dropna=True): """ Calculate the potential outcome for each sample and treatment value. Execute match and calculate, for each treatment value and each sample, the expected outcome. Note: Out of sample estimation for matching without replacement requires passing a `y` vector here. If no `y` is passed here, the values received by `fit` are used, and if the estimation indices are not a subset of the fitted indices, the estimation will fail. If the attribute `estimate_observed_outcome` is `True`, estimates will be calculated for the observed outcomes as well. If not, then the observed outcome will be passed through from the corresponding element of `y` passed to `fit`. Args: X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates for n samples. a (pd.Series): Series of shape (n,) containing discrete treatment values for the n samples. y (pd.Series): Series of shape (n,) containing outcome values for n samples. This is only used when `with_replacement=False`. Otherwise, the outcome values passed to `fit` are used. predict_proba (bool) : whether to output classifications or probabilities for a classification task. If set to False and data is non-integer, a warning is issued. (default True) dropna (bool) : For samples that were unmatched due to caliper restrictions, drop from outcome_df leading to a potentially smaller sized output, or include them as NaN. (default: True) treatment_values : IGNORED Note: The args are assumed to share the same index. Returns: outcome_df (pd.DataFrame) """ match_df = self.match(X, a, use_cached_result=True) outcome_df = self._aggregate_match_df_to_generate_outcome_df( match_df, a, predict_proba) outcome_df = self._filter_outcome_df_by_matching_mode(outcome_df, a) if outcome_df.isna().all(axis=None): raise ValueError("Matching was not successful and no outcomes can " "be estimated. Check caliper value.") if dropna: outcome_df = outcome_df.dropna() return outcome_df def match(self, X, a, use_cached_result=True, successful_matches_only=False): """Matching the samples in X according to the treatment values in a. Returns a DataFrame including all the results, which is also set as the attribute `self.match_df_`. The arguments `X` and `a` define the "needle" where the "haystack" is the data that was previously passed to fit, for matching with replacement. As such, treatment and control samples from within `X` will not be matched with each other, unless the same `X` and `a` were passed to `fit`. For matching without replacement, the `X` and `a` passed to `match` provide the "needle" and the "haystack".
If the attribute `caliper` is set, the matches are limited to those with a distance less than `caliper`. Args: X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates for n samples. a (pd.Series): Series of shape (n,) containing discrete treatment values for the n samples. use_cached_result (bool): Whether or not to return the `match_df` from the most recent matching operation. The cached result will only be used if the sample indices of `X` and those of `match_df` are identical, otherwise it will rematch. successful_matches_only (bool): Whether or not to filter the matches to those which matched successfully. If set to `False`, the resulting DataFrame will have shape (n * len(a.unique()), 2), otherwise it may have a smaller shape due to unsuccessful matches. Note: The args are assumed to share the same index. Returns: match_df: The resulting matches DataFrame is indexed so that `match_df.loc[treatment_value, sample_id]` has columns `matches` and `distances` containing lists of indices to samples and the respective distances for the matches discovered for `sample_id` from within the fitted samples with the given `treatment_value`. The indices in the `matches` column are from the fitted data, not the X argument in `match`. If `sample_id` had no match, `match_df.loc[treatment_value, sample_id].matches = []`. The DataFrame has shape (n * len(a.unique()), 2), if `successful_matches_only` is set to `False`. Raises: NotImplementedError: Raised when with_replacement is False and n_neighbors is not 1. """ cached_result_available = (hasattr(self, "match_df_") and X.index.equals(self.match_df_.loc[0].index)) if not (use_cached_result and cached_result_available): self._execute_matching(X, a) return self._get_match_df( successful_matches_only=successful_matches_only) def matches_to_weights(self, match_df=None): """Calculate weights based on a given set of matches. For each matching from one treatment value to another, a weight vector is generated. The weights are calculated as the number of times a sample was selected in a matching, with each occurrence weighted according to the number of other samples in that matching. The weights can be used to estimate outcomes or to check covariate balancing. The function can only be called after `match` has been run. Args: match_df (pd.DataFrame) : a DataFrame of matches returned from `match`. If not supplied, will use the `match_df_` attribute if available, else raises ValueError. Will not execute `match` to generate a `match_df`. Returns: weights_df (pd.DataFrame): DataFrame of shape (n,M) where M is the number of permutations of `a.unique()`. """ if match_df is None: match_df = self._get_match_df(successful_matches_only=False) match_permutations = sorted(permutations(self.treatments_.unique())) weights_df = pd.DataFrame([ self._matches_to_weights_single_matching(s, t, match_df) for s, t in match_permutations ], ).T return weights_df def get_covariates_of_matches(self, s, t, covariates): """ Look up covariates of closest matches for a given matching. Using `self.match_df_` and the supplied `covariates`, look up the covariates of the nearest match. The function can only be called after `match` has been run. Args: s (int) : source treatment value t (int) : target treatment value covariates (pd.DataFrame) : The same covariates which were passed to `fit`.
Returns: covariate_df (pd.DataFrame) : a DataFrame of size (n_matched_samples, n_covariates * 3 + 2) with the covariate values of the sample, covariates of its match, calculated distance and number of neighbors found within the given caliper (with no caliper this will equal self.n_neighbors) """ match_df = self._get_match_df() subdf = match_df.loc[s][self.treatments_ == t] sample_id_name = subdf.index.name def get_covariate_difference_from_nearest_match(source_row_index): j = subdf.loc[source_row_index].matches[0] delta_series = pd.Series(covariates.loc[source_row_index] - covariates.loc[j]) source_row = covariates.loc[j].copy() source_row.at[sample_id_name] = j target_row = covariates.loc[source_row_index].copy() covariate_differences = pd.concat({ t: target_row, s: source_row, "delta": delta_series, "outcomes": pd.Series({ t: self.outcome_.loc[source_row_index], s: self.outcome_.loc[j] }), "match": pd.Series( dict( n_neighbors=len(subdf.loc[source_row_index].matches), distance=subdf.loc[source_row_index].distances[0], )), }) return covariate_differences covdf = pd.DataFrame(data=[ get_covariate_difference_from_nearest_match(i) for i in subdf.index ], index=subdf.index) covdf = covdf.reset_index() cols = covdf.columns covdf.columns = pd.MultiIndex.from_tuples([(t, sample_id_name)] + list(cols[1:])) return covdf def _clear_post_fit_variables(self): for var in list(vars(self)): if var[-1] == "_": self.__delattr__(var) def _calculate_covariance(self, X): if len(X.shape) > 1 and X.shape[1] > 1: V_list = [] for a in self.treatments_.unique(): X_at_a = X[self.treatments_ == a].copy() current_V = self.covariance_conditioner.fit(X_at_a).covariance_ V_list.append(current_V) # following Imbens & Rubin, we average across treatment groups V = np.mean(V_list, axis=0) else: # for 1d data revert to euclidean metric V = np.array(1).reshape(1, 1) return V def _aggregate_match_df_to_generate_outcome_df(self, match_df, a, predict_proba): agg_function = self._get_agg_function(predict_proba) def outcome_from_matches_by_idx(x): return agg_function(self.outcome_.loc[x]) outcomes = {} for i in sorted(a.unique()): outcomes[i] = match_df.loc[i].matches.apply( outcome_from_matches_by_idx) outcome_df = pd.DataFrame(outcomes) return outcome_df def _get_match_df(self, successful_matches_only=True): if not hasattr(self, "match_df_") or self.match_df_ is None: raise NotFittedError("You need to run `match` first") match_df = self.match_df_.copy() if successful_matches_only: match_df = match_df[match_df.matches.apply(bool)] if match_df.empty: raise ValueError( "Matching was not successful and no outcomes can be " "estimated. Check caliper value.") return match_df def _filter_outcome_df_by_matching_mode(self, outcome_df, a): if self.matching_mode == "treatment_to_control": outcome_df = outcome_df[a == 1] elif self.matching_mode == "control_to_treatment": outcome_df = outcome_df[a == 0] elif self.matching_mode == "both": pass else: raise NotImplementedError( "Matching mode {} is not implemented. Please select one of " "'treatment_to_control', 'control_to_treatment', " "or 'both'.".format(self.matching_mode)) return outcome_df def _get_agg_function(self, predict_proba): if predict_proba: agg_function = self.regress_agg_function else: agg_function = self.classify_agg_function try: isoutputinteger = np.allclose(self.outcome_.apply(int), self.outcome_) if not isoutputinteger: warnings.warn("Classifying non-categorical outcomes. 
" "This is probably a mistake.") except: warnings.warn( "Unable to detect whether outcome is integer-like. ") return agg_function def _instantiate_nearest_neighbors_object(self): backend = self.knn_backend if backend == "sklearn": backend_instance = NearestNeighbors(algorithm="auto") elif callable(backend): backend_instance = backend() self.metric = backend_instance.metric elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"): backend_instance = sk_clone(backend) self.metric = backend_instance.metric else: raise NotImplementedError( "`knn_backend` must be either an NearestNeighbors-like object," " a callable returning such an object, or the string \"sklearn\"" ) backend_instance.set_params(**self._get_metric_dict()) return backend_instance def _fit_sknn(self, target_df): """ Fit scikit-learn NearestNeighbors object with samples in target_df. Fits object, adds metric parameters and returns namedtuple which also includes DataFrame indices so that identities can looked up. Args: target_df (pd.DataFrame) : DataFrame of covariates to fit Returns: KNN (namedtuple) : Namedtuple with members `learner` and `index` containing the fitted sklearn object and an index lookup vector, respectively. """ target_array = target_df.values sknn = self._instantiate_nearest_neighbors_object() target_array = self._ensure_array_columnlike(target_array) sknn.fit(target_array) return KNN(sknn, target_df.index) @staticmethod def _ensure_array_columnlike(target_array): if len(target_array.shape) < 2 or target_array.shape[1] == 1: target_array = target_array.reshape(-1, 1) return target_array def _get_metric_dict( self, VI_in_metric_params=True, ): metric_dict = dict(metric=self.metric) if self.metric == "mahalanobis": VI = np.linalg.inv(self.conditioned_covariance_) if VI_in_metric_params: metric_dict["metric_params"] = {"VI": VI} else: metric_dict["VI"] = VI return metric_dict def _kneighbors(self, knn, source_df): """Lookup neighbors in knn object. Args: knn (namedtuple) : knn named tuple to look for neighbors in. The object has `learner` and `index` attributes to reference the original df index. source_df (pd.DataFrame) : a DataFrame of source data points to use as "needles" for the knn "haystack." Returns: match_df (pd.DataFrame) : a DataFrame of matches """ source_array = source_df.values # 1d data must be in shape (-1, 1) for sklearn.knn source_array = self._ensure_array_columnlike(source_array) distances, neighbor_array_indices = knn.learner.kneighbors( source_array, n_neighbors=self.n_neighbors) return self._generate_match_df(source_df, knn.index, distances, neighbor_array_indices) def _generate_match_df(self, source_df, target_df_index, distances, neighbor_array_indices): """ Take results of matching and build into match_df DataFrame. For clarity we'll call the samples that are being matched "needles" and the set of samples that they looked for matches in the "haystack". Args: source_df (pd.DataFrame) : Covariate dataframe of N "needles" target_df_index (np.array) : An array of M indices of the haystack samples in their original dataframe. distances (np.array) : An array of N arrays of floats of length K where K is `self.n_neighbors`. neighbor_array_indices (np.array) : An array of N arrays of ints of length K where K is `self.n_neighbors`. 
""" # target is the haystack, source is the needle(s) # translate array indices back to original indices matches_dict = {} for source_idx, distance_row, neighbor_array_index_row in zip( source_df.index, distances, neighbor_array_indices): neighbor_df_indices = \ target_df_index[neighbor_array_index_row.flatten()] if self.caliper is not None: neighbor_df_indices = [ n for i, n in enumerate(neighbor_df_indices) if distance_row[i] < self.caliper ] distance_row = [d for d in distance_row if d < self.caliper] matches_dict[source_idx] = dict(matches=list(neighbor_df_indices), distances=list(distance_row)) # convert dict of dicts like { 1: {'matches':[], 'distances':[]}} to df return pd.DataFrame(matches_dict).T def _matches_to_weights_single_matching(self, s, t, match_df): """ For a given match, calculate the resulting weight vector. The weight vector adds a count each time a sample is used, weighted by the number of other neighbors when it was used. This is necessary to make the weighted sum return the correct effect estimate. """ weights = pd.Series(self.treatments_.copy() * 0) name = {0: "control", 1: "treatment"} weights.name = "{s}_to_{t}".format(s=name[s], t=name[t]) s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches for source_idx, matches_list in s_to_t_matches.iteritems(): if matches_list: weights.loc[source_idx] += 1 for match in matches_list: weights.loc[match] += 1 / len(matches_list) return weights def _get_distance_matrix(self, source_df, target_df): """ Create distance matrix for no replacement match. Combines metric, caliper and source/target data into a precalculated distance matrix which can be passed to scipy.optimize.linear_sum_assignment. """ cdist_args = dict( XA=self._ensure_array_columnlike(source_df.values), XB=self._ensure_array_columnlike(target_df.values), ) cdist_args.update(self._get_metric_dict(False)) distance_matrix = distance.cdist(**cdist_args) if self.caliper is not None: distance_matrix[distance_matrix > self.caliper] = VERY_LARGE_NUMBER return distance_matrix def _withreplacement_match(self, X, a): matches = {} # maps treatment value to list of matches TO that value for treatment_value, knn in self.treatment_knns_.items(): matches[treatment_value] = self._kneighbors(knn, X) # when producing potential outcomes we may want to force the # value of the observed outcome to be the actual observed # outcome, and not an average of the k nearest samples. 
if not self.estimate_observed_outcome: def limit_within_treatment_matches_to_self_only(row): if (a.loc[row.name] == treatment_value and row.name in row.matches): row.matches = [row.name] row.distances = [0] return row matches[treatment_value] = matches[treatment_value].apply( limit_within_treatment_matches_to_self_only, axis=1) return pd.concat(matches, sort=True) def _noreplacement_match(self, X, a): match_combinations = sorted(combinations(a.unique(), 2)) matches = {} for s, t in match_combinations: distance_matrix = self._get_distance_matrix(X[a == s], X[a == t]) source_array, neighbor_array_indices, distances = \ self._optimally_match_distance_matrix(distance_matrix) source_df = X[a == s].iloc[np.array(source_array)] target_df = X[a == t].iloc[np.array(neighbor_array_indices)] if t in matches or s in matches: warnings.warn("No-replacement matching for more than " "2 treatment values is not supported") matches[t] = self._create_match_df_for_no_replacement( a, source_df, target_df, distances) matches[s] = self._create_match_df_for_no_replacement( a, target_df, source_df, distances) match_df = pd.concat(matches, sort=True) return match_df def _optimally_match_distance_matrix(self, distance_matrix): source_array, neighbor_array_indices = linear_sum_assignment( distance_matrix) distances = [[ distance_matrix[s_idx, t_idx] ] for s_idx, t_idx in zip(source_array, neighbor_array_indices)] source_array, neighbor_array_indices, distances = \ self._filter_noreplacement_matches_using_caliper( source_array, neighbor_array_indices, distances) return source_array, neighbor_array_indices, distances def _filter_noreplacement_matches_using_caliper(self, source_array, neighbor_array_indices, distances): if self.caliper is None: return source_array, neighbor_array_indices, distances keep_indices = [ i for i, d in enumerate(distances) if d[0] <= self.caliper ] source_array = source_array[keep_indices] neighbor_array_indices = neighbor_array_indices[keep_indices] distances = [distances[i] for i in keep_indices] if not keep_indices: warnings.warn("No matches found, check caliper. " "No estimation possible.") return source_array, neighbor_array_indices, distances @staticmethod def _create_match_df_for_no_replacement(base_series, source_df, target_df, distances): match_sub_df = pd.DataFrame( index=base_series.index, columns=[ "matches", "distances", ], data=base_series.apply(lambda x: pd.Series([[], []])).values, dtype="object", ) # matching from source to target: read distances match_sub_df.loc[source_df.index] = pd.DataFrame( data=dict( matches=[[tidx] for tidx in target_df.index], distances=distances, ), index=source_df.index, ) # matching from target to target: fill with zeros match_sub_df.loc[target_df.index] = pd.DataFrame( data=dict( matches=[[tidx] for tidx in target_df.index], distances=[[0]] * len(distances), ), index=target_df.index, ) return match_sub_df def _count_samples_used_by_treatment_value(self, a): # we record the number of samples that were successfully matched of # each treatment value samples_used = { treatment_value: self.match_df_.loc[treatment_value][ a != treatment_value].matches.apply(bool).sum() for treatment_value in sorted(a.unique(), reverse=True) } return pd.Series(samples_used)
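# Illustrative usage sketch for the Matching estimator above (not from the
# original source): synthetic data, binary treatment, default Mahalanobis
# metric. Variable names and sizes are assumptions for the example only.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_toy = pd.DataFrame(rng.normal(size=(100, 3)), columns=["x1", "x2", "x3"])
a_toy = pd.Series(rng.binomial(1, 0.5, size=100), index=X_toy.index)
y_toy = pd.Series(X_toy["x1"] + 2.0 * a_toy + rng.normal(size=100),
                  index=X_toy.index)

matcher = Matching(with_replacement=True, n_neighbors=1)
matcher.fit(X_toy, a_toy, y_toy)
po = matcher.estimate_individual_outcome(X_toy, a_toy)  # columns: 0 and 1
ate_estimate = (po[1] - po[0]).mean()  # naive effect estimate from the matches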
def clipped(X, alpha=None, return_covariance=False): """Clips the eigenvalues of an empirical correlation matrix E in order to provide a cleaned estimator E_clipped of the underlying correlation matrix. Proceeds by keeping the [N * alpha] top eigenvalues and shrinking the remaining ones by a trace-preserving constant (i.e. Tr(E_clipped) = Tr(E)). Parameters ---------- X: design matrix, of shape (T, N), where T denotes the number of samples (think measurements in a time series), while N stands for the number of features (think of stock tickers). alpha: type float or derived from numbers.Real (default: None) Parameter between 0 and 1, inclusive, determining the fraction to keep of the top eigenvalues of an empirical correlation matrix. If left unspecified, alpha is chosen so as to keep all the empirical eigenvalues greater than the upper limit of the support of the Marcenko-Pastur spectrum. Indeed, such eigenvalues can be considered as associated with some signal, whereas the ones falling inside the Marcenko-Pastur range should be considered as corrupted with noise and indistinguishable from the spectrum of the correlation of a random matrix. This ignores finite-size effects that make it possible for the eigenvalues to exceed the upper and lower edges defined by the Marcenko-Pastur spectrum (cf. a set of results revolving around the Tracy-Widom distribution) return_covariance: type bool (default: False) If set to True, compute the standard deviations of each individual feature across observations, clean the underlying matrix of pairwise correlations, then re-apply the standard deviations and return a cleaned variance-covariance matrix. Returns ------- E_clipped: type numpy.ndarray, shape (N, N) Cleaned estimator of the true correlation matrix C underlying a noisy, in-sample estimate E (empirical correlation matrix estimated from X). This cleaned estimator proceeds through a simple eigenvalue clipping procedure (cf. reference below). If return_covariance=True, E_clipped corresponds to a cleaned variance-covariance matrix. Reference --------- "Financial Applications of Random Matrix Theory: a short review", J.-P. Bouchaud and M.
Potters arXiv: 0910.1205 [q-fin.ST] """ if alpha is not None and not (isinstance(alpha, Real) and 0 <= alpha <= 1): raise ValueError("alpha must be a real number in [0, 1]") if not isinstance(return_covariance, bool): raise TypeError("return_covariance must be a bool") T, N, transpose_flag = checkDesignMatrix(X) if transpose_flag: X = X.T if not return_covariance: X = StandardScaler(with_mean=False, with_std=True).fit_transform(X) ec = EmpiricalCovariance(store_precision=False, assume_centered=True) ec.fit(X) E = ec.covariance_ if return_covariance: inverse_std = 1./np.sqrt(np.diag(E)) E *= inverse_std E *= inverse_std.reshape(-1, 1) eigvals, eigvecs = np.linalg.eigh(E) eigvecs = eigvecs.T if alpha is None: (lambda_min, lambda_max), _ = marcenkoPastur(X) xi_clipped = np.where(eigvals >= lambda_max, eigvals, np.nan) else: xi_clipped = np.full(N, np.nan) threshold = int(ceil(alpha * N)) if threshold > 0: xi_clipped[-threshold:] = eigvals[-threshold:] gamma = float(E.trace() - np.nansum(xi_clipped)) gamma /= np.isnan(xi_clipped).sum() xi_clipped = np.where(np.isnan(xi_clipped), gamma, xi_clipped) E_clipped = np.zeros((N, N), dtype=float) for xi, eigvec in zip(xi_clipped, eigvecs): eigvec = eigvec.reshape(-1, 1) E_clipped += xi * eigvec.dot(eigvec.T) tmp = 1./np.sqrt(np.diag(E_clipped)) E_clipped *= tmp E_clipped *= tmp.reshape(-1, 1) if return_covariance: std = 1./inverse_std E_clipped *= std E_clipped *= std.reshape(-1, 1) return E_clipped
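# Minimal sanity check for `clipped`, assuming the helpers it calls
# (checkDesignMatrix, marcenkoPastur) are importable alongside it. For pure
# i.i.d. noise nearly all eigenvalues fall inside the Marcenko-Pastur bulk and
# are shrunk to a common value; the result stays on the correlation scale.
import numpy as np

rng = np.random.RandomState(42)
X_noise = rng.standard_normal((1000, 100))  # T=1000 samples, N=100 features
E_clipped = clipped(X_noise)  # alpha=None: keep only eigenvalues above the MP edge
assert np.allclose(np.diag(E_clipped), 1.0)   # unit diagonal by construction
assert np.isclose(np.trace(E_clipped), 100.0)  # trace preserved on that scale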
def __init__(self): super().__init__() self.estimator = EmpiricalCovariance()
def correlation(self, estimator="maximum_likelihood", assume_centered=False): if estimator == "maximum_likelihood": correlation_measure = ConnectivityMeasure(kind="correlation", cov_estimator=EmpiricalCovariance(assume_centered=assume_centered)) elif estimator == "ledoit_wolf": correlation_measure = ConnectivityMeasure(kind="correlation", cov_estimator=LedoitWolf(assume_centered=assume_centered)) else: raise ValueError("Estimator should be 'maximum_likelihood' or 'ledoit_wolf'") R = np.nan_to_num(correlation_measure.fit_transform(self.ts)) return R
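# Context for the method above: a self-contained sketch of the same
# computation, assuming `self.ts` holds a list of (n_timepoints, n_regions)
# arrays as nilearn's ConnectivityMeasure expects. Synthetic data only.
import numpy as np
from nilearn.connectome import ConnectivityMeasure
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
ts = [rng.normal(size=(120, 10)) for _ in range(5)]  # 5 subjects, 10 ROIs
measure = ConnectivityMeasure(
    kind="correlation",
    cov_estimator=EmpiricalCovariance(assume_centered=False))
R = np.nan_to_num(measure.fit_transform(ts))  # shape (5, 10, 10)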
# Fixing the positive definiteness of the precision matrix gam = 1 c = np.array([4, -1, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1]) c = c + gam/len(c) C = circulant(c) Q = np.dot(C.T, C) + gam*np.ones(len(c)) mu = np.zeros((len(c),))  # dimension is len(c); `d` was undefined in this snippet theta = sampler_circulant(a=np.reshape(Q[:, 0], (len(c), 1)), M=1, N=len(c), mu=mu, mode="precision", size=10000) mu_hat = np.mean(theta, axis=1) Q_hat = EmpiricalCovariance().fit(theta.T).precision_ print(np.linalg.norm(mu - mu_hat)) print(np.linalg.norm(Q - Q_hat)/np.linalg.norm(Q)) mu = np.array([0., 1.]) Sigma = np.array([[ 1. , -0.9], [-0.9, 1]]) Q = np.linalg.inv(Sigma) def matvec_fun(x): return Q.dot(x) lam_l = 0 lam_u = np.max(np.sum(np.abs(Q), 0)) tol = 1e-3 theta, K = sampler_squareRootApprox(mu, matvec_fun, lam_l, lam_u, tol, K=100, mode="precision", size=1, info=True)
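# The same recovery check can be run without the custom samplers; a sketch
# using numpy's Gaussian sampler with a known precision matrix:
import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
Sigma_true = np.array([[1.0, -0.9], [-0.9, 1.0]])
Q_true = np.linalg.inv(Sigma_true)
samples = rng.multivariate_normal(np.zeros(2), Sigma_true, size=20000)
Q_hat = EmpiricalCovariance().fit(samples).precision_
# relative error shrinks as the sample size grows
print(np.linalg.norm(Q_true - Q_hat) / np.linalg.norm(Q_true))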
def optimalShrinkage(X, return_covariance=False, method='rie'): """This function computes a cleaned, optimal shrinkage, rotationally-invariant estimator (RIE) of the true correlation matrix C underlying the noisy, in-sample estimate E = 1/T X * transpose(X) associated to a design matrix X of shape (T, N) (T measurements and N features). One approach to getting a cleaned estimator that predates the optimal shrinkage, RIE estimator consists in inverting the Marcenko-Pastur equation so as to replace the eigenvalues from the spectrum of E by an estimation of the true ones. This approach is known to be numerically unstable, in addition to failing to account for the overlap between the sample eigenvectors and the true eigenvectors. How to compute such overlaps was first explained by Ledoit and Peche (cf. reference below). Their procedure was extended by Bun, Bouchaud and Potters, who also correct for a systematic downward bias in small eigenvalues. It is this debiased, optimal shrinkage, rotationally-invariant estimator that the function at hand implements. In addition to the above method, this function also provides access to: - The finite N regularization of the optimal RIE for small eigenvalues as provided in section 8.1 of [3], a.k.a. the Inverse Wishart (IW) regularization. - The direct kernel method of O. Ledoit and M. Wolf in their 2017 paper [4]. This is a direct port of their Matlab code. Parameters ---------- X: design matrix, of shape (T, N), where T denotes the number of samples (think measurements in a time series), while N stands for the number of features (think of stock tickers). return_covariance: type bool (default: False) If set to True, compute the standard deviations of each individual feature across observations, clean the underlying matrix of pairwise correlations, then re-apply the standard deviations and return a cleaned variance-covariance matrix. method: type string, optional (default="rie") - If "rie" : optimal shrinkage in the manner of Bun et al. with no regularization - If "iw" : optimal shrinkage in the manner of Bun et al. with the so-called Inverse Wishart regularization - If 'kernel': Direct kernel method of Ledoit and Wolf. Returns ------- E_RIE: type numpy.ndarray, shape (N, N) Cleaned estimator of the true correlation matrix C. A sample estimator of C is the empirical covariance matrix E estimated from X. E is corrupted by in-sample noise. E_RIE is the optimal shrinkage, rotationally-invariant estimator (RIE) of C computed following the procedure of Joel Bun and colleagues (cf. references below). If return_covariance=True, E_RIE corresponds to a cleaned variance-covariance matrix. References ---------- 1 "Eigenvectors of some large sample covariance matrix ensembles", O. Ledoit and S. Peche Probability Theory and Related Fields, Vol. 151 (1), pp 233-264 2 "Rotational invariant estimator for general noisy matrices", J. Bun, R. Allez, J.-P. Bouchaud and M. Potters arXiv: 1502.06736 [cond-mat.stat-mech] 3 "Cleaning large Correlation Matrices: tools from Random Matrix Theory", J. Bun, J.-P. Bouchaud and M. Potters arXiv: 1610.08104 [cond-mat.stat-mech] 4 "Direct Nonlinear Shrinkage Estimation of Large-Dimensional Covariance Matrices (September 2017)", O. Ledoit and M.
Wolf https://ssrn.com/abstract=3047302 or http://dx.doi.org/10.2139/ssrn.3047302 """ if not isinstance(return_covariance, bool): raise TypeError("return_covariance must be a bool") T, N, transpose_flag = checkDesignMatrix(X) if transpose_flag: X = X.T if not return_covariance: X = StandardScaler(with_mean=False, with_std=True).fit_transform(X) ec = EmpiricalCovariance(store_precision=False, assume_centered=True) ec.fit(X) E = ec.covariance_ if return_covariance: inverse_std = 1./np.sqrt(np.diag(E)) E *= inverse_std E *= inverse_std.reshape(-1, 1) eigvals, eigvecs = np.linalg.eigh(E) eigvecs = eigvecs.T q = N / float(T) lambda_N = eigvals[0] # The smallest empirical eigenvalue, # given that the function used to compute # the spectrum of a Hermitian or symmetric # matrix - namely np.linalg.eigh - returns # the eigenvalues in ascending order. lambda_hats = None if method != 'kernel': use_inverse_wishart = (method == 'iw') xis = map(lambda x: xiHelper(x, q, E), eigvals) Gammas = map(lambda x: gammaHelper(x, q, N, lambda_N, inverse_wishart=use_inverse_wishart), eigvals) xi_hats = map(lambda a, b: a * b if b > 1 else a, xis, Gammas) lambda_hats = xi_hats else: lambda_hats = directKernel(q, T, N, eigvals) E_RIE = np.zeros((N, N), dtype=float) for lambda_hat, eigvec in zip(lambda_hats, eigvecs): eigvec = eigvec.reshape(-1, 1) E_RIE += lambda_hat * eigvec.dot(eigvec.T) tmp = 1./np.sqrt(np.diag(E_RIE)) E_RIE *= tmp E_RIE *= tmp.reshape(-1, 1) if return_covariance: std = 1./inverse_std E_RIE *= std E_RIE *= std.reshape(-1, 1) return E_RIE
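# Hedged usage sketch for optimalShrinkage, assuming the helpers it references
# (xiHelper, gammaHelper, directKernel, checkDesignMatrix) ship alongside it.
# All three variants return a cleaned matrix on the correlation scale.
import numpy as np

rng = np.random.RandomState(7)
X_demo = rng.standard_normal((500, 50))
for variant in ("rie", "iw", "kernel"):
    E_cleaned = optimalShrinkage(X_demo, method=variant)
    assert np.allclose(np.diag(E_cleaned), 1.0)  # unit diagonal by construction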
def test_covariance(): """Tests Covariance module on a simple dataset. """ # test covariance fit from data cov = EmpiricalCovariance() cov.fit(X) emp_cov = empirical_covariance(X) assert_array_almost_equal(emp_cov, cov.covariance_, 4) assert_almost_equal(cov.error_norm(emp_cov), 0) assert_almost_equal( cov.error_norm(emp_cov, norm='spectral'), 0) assert_almost_equal( cov.error_norm(emp_cov, norm='frobenius'), 0) assert_almost_equal( cov.error_norm(emp_cov, scaling=False), 0) assert_almost_equal( cov.error_norm(emp_cov, squared=False), 0) assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo') # Mahalanobis distances computation test mahal_dist = cov.mahalanobis(X) print(np.amin(mahal_dist), np.amax(mahal_dist)) assert np.amin(mahal_dist) > 0 # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) cov = EmpiricalCovariance() cov.fit(X_1d) assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0) assert_almost_equal( cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0) # test with one sample X_1sample = np.arange(5) cov = EmpiricalCovariance() with warnings.catch_warnings(record=True): cov.fit(X_1sample) # test integer type X_integer = np.asarray([[0, 1], [1, 0]]) result = np.asarray([[0.25, -0.25], [-0.25, 0.25]]) assert_array_almost_equal(empirical_covariance(X_integer), result) # test centered case cov = EmpiricalCovariance(assume_centered=True) cov.fit(X) assert_equal(cov.location_, np.zeros(X.shape[1]))
def rand_pts_overall_cov_init(X, n_components, cov_est_method='LW', covariance_type='full', random_state=None): """ Sets the means to randomly selected points. Sets the covariances to the overall covariance matrix. Parameters ---------- X: (n_samples, n_features) n_components: int cov_est_method: str Must be one of ['empirical', 'LW', 'OAS'] for empirical covariance matrix estimate, LedoitWolf and Oracle Approximating Shrinkage Estimator. See sklearn.covariance for details. covariance_type: str Must be one of ['full', 'diag', 'tied', 'spherical']. random_state: None, int, random seed Random seed. """ assert cov_est_method in ['empirical', 'LW', 'OAS'] assert covariance_type in ['full', 'diag', 'tied', 'spherical'] n_samples = X.shape[0] # randomly select data points to start cluster centers from rng = check_random_state(random_state) # estimate global covariance if cov_est_method == 'empirical': cov_estimator = EmpiricalCovariance(store_precision=False) elif cov_est_method == 'LW': cov_estimator = LedoitWolf(store_precision=False) elif cov_est_method == 'OAS': cov_estimator = OAS(store_precision=False) cov_estimator.fit(X) cov_est = cov_estimator.covariance_ # set covariance matrix for each cluster if covariance_type == 'tied': covs = cov_est elif covariance_type == 'full': covs = np.stack([cov_est for _ in range(n_components)]) elif covariance_type == 'diag': # each component gets the diagonal of the estimated covariance matrix covs = np.diag(cov_est) covs = np.repeat(covs.reshape(1, -1), repeats=n_components, axis=0) elif covariance_type == 'spherical': # each component gets the average of the variances covs = np.diag(cov_est).mean() covs = np.repeat(covs, repeats=n_components) # set means to random data points rand_idxs = rng.choice(range(n_samples), replace=False, size=n_components) means = [X[pt_idx, ] for pt_idx in rand_idxs] means = np.array(means) return means, covs
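# Shape check for the initializer above on synthetic data; the expected shapes
# follow directly from the covariance_type branches.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(200, 4))
for cov_type, expected in [("full", (3, 4, 4)), ("tied", (4, 4)),
                           ("diag", (3, 4)), ("spherical", (3,))]:
    means, covs = rand_pts_overall_cov_init(
        X_demo, n_components=3, cov_est_method="LW",
        covariance_type=cov_type, random_state=0)
    assert means.shape == (3, 4)
    assert np.asarray(covs).shape == expected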
inliers_mask[outliers_index] = False # fit a Minimum Covariance Determinant (MCD) robust estimator to data S = MinCovDet().fit(X) # compare raw robust estimates with the true location and covariance err_loc_mcd[i, j] = np.sum(S.location_ ** 2) err_cov_mcd[i, j] = S.error_norm(np.eye(n_features)) # compare estimators learnt from the full data set with true parameters err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2) err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm( np.eye(n_features)) # compare with an empirical covariance learnt from a pure data set # (i.e. "perfect" MCD) pure_X = X[inliers_mask] pure_location = pure_X.mean(0) pure_emp_cov = EmpiricalCovariance().fit(pure_X) err_loc_emp_pure[i, j] = np.sum(pure_location ** 2) err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features)) # Display results font_prop = matplotlib.font_manager.FontProperties(size=11) pl.subplot(2, 1, 1) pl.errorbar(range_n_outliers, err_loc_mcd.mean(1), yerr=err_loc_mcd.std(1) / np.sqrt(repeat), label="Robust location", color='m') pl.errorbar(range_n_outliers, err_loc_emp_full.mean(1), yerr=err_loc_emp_full.std(1) / np.sqrt(repeat), label="Full data set mean", color='green') pl.errorbar(range_n_outliers, err_loc_emp_pure.mean(1), yerr=err_loc_emp_pure.std(1) / np.sqrt(repeat), label="Pure data set mean", color='black')
def main(out_data: str = 'chexpert'): models = wsl_model_dir.glob('*') # all_configs = [] for idx, path in enumerate(models): if 'debug' in str(path): # Debugging model continue elif not (path / 'configs.json').exists(): # Model not completed continue else: with open(path / 'configs.json') as f: configs = json.load(f) # print(configs) print(f'Model {idx} : {path}') # ------------------------------------------------------ train_dataset = Loader(data=configs['data'], split='train', extension=configs['extension'], classes=configs['classes'], column=configs['column'], regression=configs['regression']) train_loader = DataLoader( # type: ignore train_dataset, batch_size=configs['batchsize'], num_workers=4, pin_memory=True, shuffle=True) valid_dataset = Loader(data=configs['data'], split='valid', extension=configs['extension'], classes=configs['classes'], column=configs['column'], regression=configs['regression']) valid_loader = DataLoader( # type: ignore valid_dataset, batch_size=configs['batchsize'], num_workers=4, pin_memory=True, shuffle=True) out_dataset = Loader(data=out_data, split='valid', extension=configs['extension'], classes=configs['classes'], column=configs['column'], regression=configs['regression']) out_loader = DataLoader( # type: ignore out_dataset, batch_size=configs['batchsize'], num_workers=4, pin_memory=True, shuffle=True) checkpoint = torch.load(path / 'best.pt', map_location='cuda:0' if torch.cuda.is_available() else 'cpu') checkpoint['model'] = checkpoint['model'].module checkpoint['model'].network = configs['network'] checkpoint['model'].get_map = False checkpoint['model'].eval() # sigmoid = torch.nn.Sigmoid() group_lasso = EmpiricalCovariance(assume_centered=False) layer_names = {} # ------------------------------------------------------ def get_mean_precision(loader): print('building hook function...') features = {} def hook(layer, inp, out): name = layer_names[layer] if name not in features: features[name] = out.detach().data.view(out.size(0), out.size(1), -1).mean(dim=-1) else: features[name] = torch.cat((features[name], out.detach().data.view(out.size(0), out.size(1), -1).mean(dim=-1)), dim=0) handles = checkpoint['model'].register_forward_hooks(checkpoint['model'], hook, layer_names) start = time.time() with torch.set_grad_enabled(False): for idx, data in enumerate(loader): imgs = data[0].cuda().float() _ = data[1] _ = checkpoint['model'](imgs) speed = configs['batchsize'] * idx // (time.time() - start) print('Iter:', idx, 'Speed:', int(speed), 'img/s', end='\r', flush=True) if idx > 20: break print('Total time:', time.time() - start, 'secs') print('calculating sample mean...') mean = {} precision = {} for key, value in features.items(): mean[key] = value.mean(dim=0) features[key] -= mean[key] group_lasso.fit(features[key].cpu().numpy()) precision[key] = torch.from_numpy(group_lasso.precision_).float().cuda() for handle in handles: handle.remove() return mean, precision train_mean, train_precision = get_mean_precision(train_loader) # ------------------------------------------------------ def get_mahalanobis_score(loader: Any, features: Any, magnitude: float): scores = {} gaussian = {} for layer, name in layer_names.items(): checkpoint['optimizer'].zero_grad() def hook(layer, inp, out): zero_feat = out.view(out.size(0), out.size(1), -1).mean(dim=-1) - train_mean[name] gaussian[name] = -0.5 * torch.mm(torch.mm(zero_feat, train_precision[name]), zero_feat.t()).diag() handle = layer.register_forward_hook(hook) start = time.time() for idx, data in enumerate(loader): 
with torch.set_grad_enabled(True): imgs = data[0].cuda().float() _ = data[1] imgs.requires_grad = True _ = checkpoint['model'](imgs) loss = gaussian[name].mean() loss.backward() gradient = torch.ge(imgs.grad.data, 0) gradient = (gradient.float() - 0.5) * 2 with torch.set_grad_enabled(False): noisy_imgs = torch.add(imgs.data, gradient, alpha=-magnitude) _ = checkpoint['model'](noisy_imgs) if name not in scores: scores[name] = gaussian[name].detach().data else: scores[name] = torch.cat((scores[name], gaussian[name].detach().data), dim=0) print(scores[name].mean()) checkpoint['optimizer'].zero_grad() speed = configs['batchsize'] * idx // (time.time() - start) print(name, 'Iter:', idx, 'Speed:', int(speed), 'img/s', end='\r', flush=True) handle.remove() print() return scores print('get mahalanobis scores...') magnitudes = [0.0, 0.01, 0.005, 0.002, 0.0014, 0.001, 0.0005] maha_valid_scores = {} maha_out_scores = {} for magnitude in magnitudes: print('Noise:', magnitude) print('Data - Assumed negative class:', configs['data']) maha_valid_scores[magnitude] = get_mahalanobis_score(valid_loader, layer_names, magnitude) print('Data - Assumed positive class:', out_data) maha_out_scores[magnitude] = get_mahalanobis_score(out_loader, layer_names, magnitude) print() print('merge mahalanobis scores...')
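# The loop above follows the Mahalanobis OOD score of Lee et al. (2018):
# score(x) = -0.5 * (f(x) - mu)^T P (f(x) - mu) per feature layer. A distilled
# numpy sketch with synthetic features standing in for pooled activations:
import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
train_feats = rng.normal(size=(1000, 64))
mu = train_feats.mean(axis=0)
P = EmpiricalCovariance().fit(train_feats - mu).precision_

def mahalanobis_score(feats):
    z = feats - mu
    return -0.5 * np.einsum("ij,jk,ik->i", z, P, z)  # higher = more in-distribution

in_scores = mahalanobis_score(rng.normal(size=(10, 64)))
out_scores = mahalanobis_score(rng.normal(loc=3.0, size=(10, 64)))
assert in_scores.mean() > out_scores.mean()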
def test_suffstat_sk_full(): # compare the EmpiricalCovariance.covariance fitted on X*sqrt(resp) # with _sufficient_sk_full, n_components=1 rng = np.random.RandomState(0) n_samples, n_features = 500, 2 # special case 1, assuming data is "centered" X = rng.rand(n_samples, n_features) resp = rng.rand(n_samples, 1) X_resp = np.sqrt(resp) * X nk = np.array([n_samples]) xk = np.zeros((1, n_features)) covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0) ecov = EmpiricalCovariance(assume_centered=True) ecov.fit(X_resp) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0) # special case 2, assuming resp are all ones resp = np.ones((n_samples, 1)) nk = np.array([n_samples]) xk = X.mean(axis=0).reshape((1, -1)) covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0) ecov = EmpiricalCovariance(assume_centered=False) ecov.fit(X) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)
def extract_vector(self): if self.level == 2: self.df_cifti_load = pd.DataFrame( self.fmri_data_np_arr.mean(axis=2)) if isinstance(self.seed_ROI_name, list) and len(self.seed_ROI_name) > 1: if self.seed_analysis_output == 'parcellated': self.df_cifti_load = pd.DataFrame( self.parcellated_cifti_load.get_fdata()) self.df_cifti_load.columns = self.parcel_labels self.df_cifti_load['avg'] = self.df_cifti_load[ self.seed_ROI_name].mean(axis=1) self.parcel_labels = self.df_cifti_load.columns.to_list() else: self.df_cifti_load = pd.DataFrame(self.cifti_load.get_fdata()) df_parcellated_cifti_load = pd.DataFrame( self.parcellated_cifti_load.get_fdata()) df_parcellated_cifti_load.columns = self.parcel_labels self.df_cifti_load['avg'] = df_parcellated_cifti_load[ self.seed_ROI_name].mean(axis=1) self.seed_ROI_name = 'avg' else: if self.seed_analysis_output == 'dense': self.df_cifti_load = pd.DataFrame(self.cifti_load.get_fdata()) df_parcellated_cifti_load = pd.DataFrame( self.parcellated_cifti_load.get_fdata()) df_parcellated_cifti_load.columns = self.parcel_labels self.df_cifti_load[ self.seed_ROI_name] = df_parcellated_cifti_load[ self.seed_ROI_name] else: self.df_cifti_load = pd.DataFrame( self.parcellated_cifti_load.get_fdata()) cifti_np_array = self.df_cifti_load.to_numpy() if self.method == 'correlation': # Pearson correlation coefficients with LedoitWolf covariance estimator # measure = ConnectivityMeasure(kind='correlation', cov_estimator='LedoitWolf') # Pearson correlation coefficients based on empirical covariance (i.e. standard) measure = ConnectivityMeasure(kind='correlation', cov_estimator=EmpiricalCovariance()) elif self.method == 'covariance': # LedoitWolf estimator measure = ConnectivityMeasure(kind='covariance') elif self.method == 'partial_correlation': # Partial correlation with LedoitWolf covariance estimator measure = ConnectivityMeasure(kind='partial correlation') elif self.method == 'precision': measure = ConnectivityMeasure(kind='precision') elif 'sparse' in self.method: measure = GraphicalLassoCV() if 'sparse' in self.method: measure.fit(cifti_np_array) if 'covariance' in self.method: network_matrix = measure.covariance_ elif 'precision' in self.method: network_matrix = measure.precision_ else: network_matrix = measure.fit_transform([cifti_np_array])[0] df_network_matrix = pd.DataFrame(network_matrix) df_network_matrix.columns = self.parcel_labels if self.seed_ROI_name == 'avg': # take everything except last element, i.e. avg. Need to do this because downstream this object must match grayordinate_file self.r_functional_vector = df_network_matrix[ self.seed_ROI_name][:-1].to_numpy() else: self.r_functional_vector = np.squeeze( df_network_matrix[self.seed_ROI_name].to_numpy()) self.z_functional_vector = 0.5 * ( np.log(1 + self.r_functional_vector) - np.log(1 - self.r_functional_vector))
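# The final line above implements the Fisher z-transform; a quick equivalence
# check against the closed-form arctanh:
import numpy as np

r = np.array([-0.5, 0.0, 0.3, 0.9])
assert np.allclose(0.5 * (np.log(1 + r) - np.log(1 - r)), np.arctanh(r))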
def fit(self, X): ''' Copula fit using Gaussian copula with marginals evaluated by Gaussian KDE Precision matrix is evaluated using specified method, default to graphical LASSO :param X: input dataset :return: estimated precision matrix rho ''' N, d = X.shape if self.scaler is not None: X_scale = self.scaler.fit_transform(X) else: X_scale = X if len(self.vertexes) == 0: self.vertexes = [str(id) for id in range(d)] self.theta = 1.0 / N cum_marginals = np.zeros_like(X) inv_norm_cdf = np.zeros_like(X) # inv_norm_cdf_scaled = np.zeros_like(X) self.kernels = list([]) # TODO: complexity O(Nd) is high if self.verbose: print(colored('>> Computing marginals', color='blue')) for j in range(cum_marginals.shape[1]): self.kernels.append(gaussian_kde(X_scale[:, j])) cum_pdf_overall = self.kernels[-1].integrate_box_1d( X_scale[:, j].min(), X_scale[:, j].max()) for i in range(cum_marginals.shape[0]): cum_marginals[i, j] = self.kernels[-1].integrate_box_1d( X_scale[:, j].min(), X_scale[i, j]) / cum_pdf_overall # truncate cumulative marginals if cum_marginals[i, j] < self.theta: cum_marginals[i, j] = self.theta elif cum_marginals[i, j] > 1 - self.theta: cum_marginals[i, j] = 1 - self.theta # inverse of normal CDF: \Phi(F_j(x))^{-1} inv_norm_cdf[i, j] = norm.ppf(cum_marginals[i, j]) # scaled to preserve mean and variance: u_j + \sigma_j*\Phi(F_j(x))^{-1} # inv_norm_cdf_scaled[i, j] = X_scale[:, j].mean() + X_scale[:, j].std() * inv_norm_cdf[i, j] if self.method == 'mle': # maximum-likelihood estimator empirical_cov = EmpiricalCovariance() empirical_cov.fit(inv_norm_cdf) if self.verbose: print(colored('>> Running MLE to estimate precision matrix', color='blue')) self.est_cov = empirical_cov.covariance_ self.corr = scale_matrix(self.est_cov) self.precision_ = inv(empirical_cov.covariance_) if self.method == 'glasso': if self.verbose: print(colored('>> Running glasso to estimate precision matrix', color='blue')) empirical_cov = EmpiricalCovariance() empirical_cov.fit(inv_norm_cdf) # shrunk covariance to avoid numerical instability shrunk_cov = shrunk_covariance(empirical_cov.covariance_, shrinkage=0.8) self.est_cov, self.precision_ = graph_lasso(emp_cov=shrunk_cov, alpha=self.penalty, verbose=self.verbose, max_iter=self.max_iter) self.corr = scale_matrix(self.est_cov) if self.method == 'ledoit_wolf': if self.verbose: print(colored( '>> Running ledoit_wolf to estimate precision matrix', color='blue')) self.est_cov, _ = ledoit_wolf(inv_norm_cdf) self.corr = scale_matrix(self.est_cov) self.precision_ = linalg.inv(self.est_cov) if self.method == 'spectral': '''L2 method, use paper Inverse covariance estimation for high dimension data in linear time and space :formula: in paper eq(8) ''' if self.verbose: print(colored( '>> Running Riccati to estimate precision matrix', color='blue')) # TODO: note estimated cov is sample cov self.est_cov, self.precision_ = spectral(inv_norm_cdf, rho=2 * self.penalty, assume_centered=False) self.corr = scale_matrix(self.est_cov) if self.method == 'pc': clf = pgmlearner.PGMLearner() data_list = list([]) for row_id in range(X_scale.shape[0]): instance = dict() for i, n in enumerate(self.vertexes): instance[n] = X_scale[row_id, i] data_list.append(instance) graph = clf.lg_constraint_estimatestruct(data=data_list, pvalparam=self.pval, bins=self.bins) dag = np.zeros(shape=(len(graph.V), len(graph.V))) for e in graph.E: dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1 self.conditional_independences_ = dag if self.method == 'ic': df = dict() variable_types = dict() for j in
range(X_scale.shape[1]): df[self.vertexes[j]] = X_scale[:, j] variable_types[self.vertexes[j]] = 'c' data = pd.DataFrame(df) # run the search ic_algorithm = IC(RobustRegressionTest, data, variable_types, alpha=self.pval) graph = ic_algorithm.search() dag = np.zeros(shape=(X_scale.shape[1], X_scale.shape[1])) for e in graph.edges(data=True): i = self.vertexes.index(e[0]) j = self.vertexes.index(e[1]) dag[i, j] = 1 dag[j, i] = 1 arrows = set(e[2]['arrows']) head_len = len(arrows) if head_len > 0: head = arrows.pop() if head_len == 1 and head == e[0]: dag[i, j] = 0 if head_len == 1 and head == e[1]: dag[j, i] = 0 self.conditional_independences_ = dag # finally we fit the structure self.fit_structure(self.precision_)
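# A compact sketch of the marginal step described in the docstring above: push
# each feature through its KDE-estimated CDF, truncate, then apply the
# standard normal quantile, which Gaussianizes the marginal before covariance
# estimation. Names here are illustrative, not from the original class.
import numpy as np
from scipy.stats import gaussian_kde, norm

rng = np.random.RandomState(0)
x = rng.exponential(size=500)    # a skewed marginal
kde = gaussian_kde(x)
theta = 1.0 / len(x)             # truncation avoids ppf(0) and ppf(1)
total = kde.integrate_box_1d(x.min(), x.max())
cdf = np.array([kde.integrate_box_1d(x.min(), xi) for xi in x]) / total
z = norm.ppf(np.clip(cdf, theta, 1 - theta))  # approximately standard normal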
# --------------------- # # Below, we fit MCD and MLE based covariance estimators to our data and print # the estimated covariance matrices. Note that the estimated variance of # feature 2 is much higher with the MLE based estimator (7.5) than # that of the MCD robust estimator (1.2). This shows that the MCD based # robust estimator is much more resistant to the outlier samples, which were # designed to have a much larger variance in feature 2. import matplotlib.pyplot as plt from sklearn.covariance import EmpiricalCovariance, MinCovDet # fit a MCD robust estimator to data robust_cov = MinCovDet().fit(X) # fit a MLE estimator to data emp_cov = EmpiricalCovariance().fit(X) print('Estimated covariance matrix:\n' 'MCD (Robust):\n{}\n' 'MLE:\n{}'.format(robust_cov.covariance_, emp_cov.covariance_)) # %% # To better visualize the difference, we plot contours of the # Mahalanobis distances calculated by both methods. Notice that the robust # MCD based Mahalanobis distances fit the inlier black points much better, # whereas the MLE based distances are more influenced by the outlier # red points. fig, ax = plt.subplots(figsize=(10, 5)) # Plot data set inlier_plot = ax.scatter(X[:, 0], X[:, 1], color='black', label='inliers') outlier_plot = ax.scatter(X[:, 0][-n_outliers:],
def _get_cov(X): # calculates cov matrix from sklearn.covariance import EmpiricalCovariance cov = EmpiricalCovariance().fit(X) return cov
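# Typical use of the helper above: the fitted object exposes the covariance
# matrix and squared Mahalanobis distances of new points.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(50, 3))
cov = _get_cov(X_demo)
print(cov.covariance_.shape)        # (3, 3)
print(cov.mahalanobis(X_demo)[:5])  # squared distances to the fitted mean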
def main(): parser = argparse.ArgumentParser( description='Plot outlier-like distances for a 2-dimensional dataset') parser.add_argument( 'dataset', type=argparse.FileType('r'), help='a CSV file containing the dataset') parser.add_argument( '--plot', type=str, choices=['train', 'grid'], default='grid', help='plot the dataset or a grid evenly distributed over its span') parser.add_argument( '--plotdims', type=int, choices=[2, 3], default=2, help='the number of dimensions to plot') args = parser.parse_args() X = np.loadtxt(args.dataset, delimiter=',') fig = plt.figure() xformer = NullTransformer() if X.shape[1] > 2: xformer = PCA(n_components=2) X = xformer.fit_transform(X) if args.plotdims == 2: plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0') else: plt.scatter(X[:, 0], X[:, 1]) plt.show(block=False) path_to_script = os.path.realpath(__file__) dir_of_script = os.path.dirname(path_to_script) dataset_path = dir_of_script + '/outliers.npy' np.save(dataset_path, X) ########################################################################### # Train autoencoder with the n samples until convergence. Run # evenly distributed samples through the autoencoder and compute # their reconstruction error. ########################################################################### maxseq_orig = np.max(X) minseq_orig = np.min(X) seqrange = np.abs(maxseq_orig - minseq_orig) maxseq = maxseq_orig + 0.5 * seqrange minseq = minseq_orig - 0.5 * seqrange print("minseq", minseq, "maxseq", maxseq) if args.plot == 'grid': seq = np.linspace(minseq, maxseq, num=50, endpoint=True) Xplot = np.array([_ for _ in product(seq, seq)]) else: Xplot = X robust_cov = MinCovDet().fit(X) robust_md = robust_cov.mahalanobis(Xplot) empirical_cov = EmpiricalCovariance().fit(X) empirical_md = empirical_cov.mahalanobis(Xplot) # Assume Xplot is at least 2-dimensional. if Xplot.shape[1] > 2: Xplot2d = bh_sne(Xplot) else: Xplot2d = Xplot robust_md01 = robust_md - np.nanmin(robust_md) robust_md01 = robust_md01 / np.nanmax(robust_md01) empirical_md01 = empirical_md - np.nanmin(empirical_md) empirical_md01 = empirical_md01 / np.nanmax(empirical_md01) fig = plt.figure() if args.plotdims == 2: ax = fig.add_subplot(1, 1, 1) ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0') else: ax = fig.add_subplot(1, 1, 1, projection='3d') ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01, cmap=plt.cm.jet, color=robust_md01) ax.set_zlabel('Mahalanobis distance') ax.set_xlabel('x') ax.set_ylabel('y') ax.set_title('Mahalanobis distance (robust covariance)') fig = plt.figure() if args.plotdims == 2: ax = fig.add_subplot(1, 1, 1) ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0') else: ax = fig.add_subplot(1, 1, 1, projection='3d') ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01, cmap=plt.cm.jet, color=empirical_md01) ax.set_zlabel('Mahalanobis distance') ax.set_xlabel('x') ax.set_ylabel('y') ax.set_title('Mahalanobis distance (empirical covariance)') enc_dec = [ # tanh encoder, linear decoder ['tanh', 'linear'], # sigmoid encoder, linear decoder ['sigmoid', 'linear'], ####################################################################### # The reconstruction error of the autoencoders trained with the # remaining commented-out pairs don't seem to match Mahalanobis # distance very well. Feel free to uncomment them to see for # yourself. 
# linear encoder, linear decoder # ['linear', 'linear'], # tanh encoder, tanh decoder # ['tanh', 'tanh'], # tanh encoder, sigmoid decoder # ['tanh', 'sigmoid'], # sigmoid encoder, tanh decoder # ['sigmoid', 'tanh'], # sigmoid encoder, sigmoid decoder # ['sigmoid', 'sigmoid'] ####################################################################### ] for i, act in enumerate(enc_dec): enc, dec = act if dec == 'linear': dec = None model = train_autoencoder(dataset_path, act_enc=enc, act_dec=dec, nvis=X.shape[1], nhid=16) Xshared = theano.shared( np.asarray(Xplot, dtype=theano.config.floatX), borrow=True) f = theano.function([], outputs=model.reconstruct(Xshared)) fit = f() error = reconstruction_error(Xplot, fit) error01 = error - np.nanmin(error) error01 = error01 / np.nanmax(error01) fig = plt.figure() if args.plotdims == 2: ax = fig.add_subplot(1, 1, 1) ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], cmap=plt.cm.jet, c=error, s=60, linewidth='0') else: ax = fig.add_subplot(1, 1, 1, projection='3d') ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error, cmap=plt.cm.jet, color=error01) ax.set_zlabel('Reconstruction error') ax.set_xlabel('x') ax.set_ylabel('y') encdec_type = ', '.join(act) ax.set_title('Reconstruction error (' + encdec_type + ')') print("Correlation of robust MD and reconstruction error (" + str(encdec_type) + ") " + str(pearsonr(robust_md, error))) print("Correlation of empirical MD and reconstruction error (" + str(encdec_type) + ") " + str(pearsonr(empirical_md, error))) print("Correlation of robust MD and empirical MD " + str(pearsonr(robust_md, empirical_md))) os.remove(dataset_path) os.remove('outliers.pkl') plt.show(block=True)
bins=[i / 2 for i in range(0, 12, 1)], rwidth=1, color='#b1dbd0', edgecolor='black', align='left') fig.tight_layout(pad=5.0) ax.locator_params(axis='y', nbins=7) for tick in ax.get_xticklabels(): tick.set_fontname("Arial") for tick in ax.get_yticklabels(): tick.set_fontname("Arial") plt.show() A = EmpiricalCovariance().fit(np.array((x, dx))).covariance_ A = (A > 0) * A A = A - np.diag(np.diag(A)) X = networkx.from_numpy_array(A) F = networkx.Graph() ps = networkx.spring_layout(X, scale=5, k=1 / len(A)**(1 / 40000)) labels = (z[:, 0].astype(str)) l = {i: labels[i] for i in range(len(labels))} networkx.draw_networkx_nodes(F, ps, nodelist=X.nodes, node_color='maroon',
n_features = 2 # generate data # gen_cov = np.eye(n_features) # gen_cov[0, 0] = 2. # X = np.dot(np.random.randn(n_samples, n_features), gen_cov) # # add some outliers # outliers_cov = np.eye(n_features) # outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7. # X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov) # fit a Minimum Covariance Determinant (MCD) robust estimator to data robust_cov = MinCovDet().fit(X) # compare estimators learnt from the full data set with true parameters emp_cov = EmpiricalCovariance().fit(X) ############################################################################### # Display results fig = plt.figure() plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05) # Show data set subfig1 = plt.subplot(3, 1, 1) inlier_plot = subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers') outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:], color='red', label='outliers') subfig1.set_xlim(subfig1.get_xlim()[0], 11.) subfig1.set_title("Mahalanobis distances of a contaminated data set:")
def test_covariance(): # Tests Covariance module on a simple dataset. # test covariance fit from data cov = EmpiricalCovariance() cov.fit(X) emp_cov = empirical_covariance(X) assert_array_almost_equal(emp_cov, cov.covariance_, 4) assert_almost_equal(cov.error_norm(emp_cov), 0) assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0) assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0) assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0) assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0) with pytest.raises(NotImplementedError): cov.error_norm(emp_cov, norm='foo') # Mahalanobis distances computation test mahal_dist = cov.mahalanobis(X) assert np.amin(mahal_dist) > 0 # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) cov = EmpiricalCovariance() cov.fit(X_1d) assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0) assert_almost_equal( cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0) # test with one sample # Create X with 1 sample and 5 features X_1sample = np.arange(5).reshape(1, 5) cov = EmpiricalCovariance() warn_msg = ( "Only one sample available. You may want to reshape your data array") with pytest.warns(UserWarning, match=warn_msg): cov.fit(X_1sample) assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test integer type X_integer = np.asarray([[0, 1], [1, 0]]) result = np.asarray([[0.25, -0.25], [-0.25, 0.25]]) assert_array_almost_equal(empirical_covariance(X_integer), result) # test centered case cov = EmpiricalCovariance(assume_centered=True) cov.fit(X) assert_array_equal(cov.location_, np.zeros(X.shape[1]))