def test_covariance():
    """Check EmpiricalCovariance against empirical_covariance on dataset X."""
    # Fit on the full dataset and compare with the function-level estimate.
    estimator = EmpiricalCovariance()
    estimator.fit(X)
    reference = empirical_covariance(X)
    assert_array_almost_equal(reference, estimator.covariance_, 4)

    # The error norm of the estimate against itself must vanish for every
    # combination of options exercised here.
    error_norm_options = (
        {},
        {'norm': 'spectral'},
        {'norm': 'frobenius'},
        {'scaling': False},
        {'squared': False},
    )
    for options in error_norm_options:
        assert_almost_equal(estimator.error_norm(reference, **options), 0)

    # Mahalanobis distances must stay inside the expected range.
    distances = estimator.mahalanobis(X)
    assert np.amax(distances) < 250
    assert np.amin(distances) > 50

    # Same checks with a single feature (first column kept as 2-D).
    first_column = X[:, 0].reshape((-1, 1))
    estimator = EmpiricalCovariance()
    estimator.fit(first_column)
    reference_1d = empirical_covariance(first_column)
    assert_array_almost_equal(reference_1d, estimator.covariance_, 4)
    assert_almost_equal(estimator.error_norm(reference_1d), 0)
    assert_almost_equal(
        estimator.error_norm(reference_1d, norm='spectral'), 0)

    # Integer input must yield the floating-point covariance.
    integer_data = np.asarray([[0, 1], [1, 0]])
    expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(integer_data), expected)
def test_suffstat_sk_full():
    """Compare _estimate_gaussian_covariance_full with EmpiricalCovariance.

    Uses a single component (n_components=1) and checks that the predicted
    covariance has zero Frobenius and spectral error norm with respect to an
    EmpiricalCovariance fit, in two special cases.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered": the estimator is fitted
    # on X * sqrt(resp) with assume_centered=True and a zero mean is passed.
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    # Per-feature mean: axis=0 is required, otherwise X.mean() collapses to a
    # single scalar and xk ends up with shape (1, 1) instead of
    # (1, n_features).
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)
def fit(self, X, n_jobs=-1):
    """Fit the empirical covariance on X, then optionally set the threshold.

    The threshold step (CovarianceOutlierDetectionMixin.set_threshold) is
    skipped when ``self.no_fit`` is truthy. Returns ``self``.
    """
    EmpiricalCovariance.fit(self, X)
    if self.no_fit:
        return self
    CovarianceOutlierDetectionMixin.set_threshold(self, X, n_jobs=n_jobs)
    return self
def calc_full_covs(net, trainloader, n_classes, layers):
    """Compute per-layer class centers and a shared precision matrix.

    For every layer index in ``range(layers)`` the activations returned by
    ``net.intermediate_forward(inputs, layer_index=...)`` are collected over
    the whole loader, the mean activation of each class is computed, and an
    EmpiricalCovariance is fitted on the class-centered activations.

    Parameters
    ----------
    net : object
        Model exposing ``eval()`` and ``intermediate_forward``.
    trainloader : iterable
        Yields ``(inputs, targets)`` batches; both are moved to the
        module-level ``device``.
    n_classes : int
        Number of classes; targets are compared against ``range(n_classes)``.
    layers : int
        Number of layer indices to process.

    Returns
    -------
    layers_precisions : list of torch.Tensor
        One float precision matrix per layer.
    layers_centers : list of torch.Tensor
        One ``(n_classes, x_dim)`` tensor of class means per layer.

    Notes
    -----
    Result tensors are created on the device of the activations instead of a
    hard-coded ``.cuda()``, so the function stays consistent with ``device``.
    """
    net.eval()
    layers_centers = []
    layers_precisions = []
    for layer_index in range(layers):
        outputs_list = []
        target_list = []
        with torch.no_grad():
            for (inputs, targets) in trainloader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net.intermediate_forward(
                    inputs, layer_index=layer_index)
                outputs_list.append(outputs)
                target_list.append(targets)
        outputs = torch.cat(outputs_list, dim=0)
        all_targets = torch.cat(target_list)

        x_dim = outputs.size(1)
        centers = torch.zeros(n_classes, x_dim, device=outputs.device)
        centered_outputs = []
        for c in range(n_classes):
            class_points = outputs[all_targets == c]
            centers[c] = torch.mean(class_points, dim=0)
            # Broadcasting subtracts the class mean from every row.
            centered_outputs.append(class_points - centers[c])
        centered_outputs = torch.cat(centered_outputs, dim=0).cpu()

        cov_estimator = EmpiricalCovariance(assume_centered=False)
        cov_estimator.fit(centered_outputs.numpy())
        precision = torch.from_numpy(
            cov_estimator.precision_).float().to(outputs.device)

        layers_centers.append(centers)
        layers_precisions.append(precision)
    return layers_precisions, layers_centers
def test_covariance():
    """Check EmpiricalCovariance against empirical_covariance on dataset X."""
    # Fit on the full dataset and compare with the function-level estimate.
    model = EmpiricalCovariance()
    model.fit(X)
    expected_cov = empirical_covariance(X)
    assert_array_almost_equal(expected_cov, model.covariance_, 4)

    # error_norm of the estimate against itself is zero for each option set.
    for extra in ({}, {'norm': 'spectral'}, {'norm': 'frobenius'},
                  {'scaling': False}, {'squared': False}):
        assert_almost_equal(model.error_norm(expected_cov, **extra), 0)

    # Mahalanobis distances must stay inside the expected range.
    mahalanobis_values = model.mahalanobis(X)
    assert np.amax(mahalanobis_values) < 250
    assert np.amin(mahalanobis_values) > 50

    # Single-feature case: first column reshaped to (n_samples, 1).
    column = X[:, 0].reshape((-1, 1))
    model = EmpiricalCovariance()
    model.fit(column)
    expected_cov_1d = empirical_covariance(column)
    assert_array_almost_equal(expected_cov_1d, model.covariance_, 4)
    assert_almost_equal(model.error_norm(expected_cov_1d), 0)
    assert_almost_equal(
        model.error_norm(expected_cov_1d, norm='spectral'), 0)

    # Integer input must yield the floating-point covariance.
    int_samples = np.asarray([[0, 1], [1, 0]])
    int_expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(int_samples), int_expected)
class CovEmbedding(BaseEstimator, TransformerMixin):
    """Transformer returning covariance coefficients expressed in a flat space.

    Parameters
    ----------
    base_estimator : estimator or None
        Covariance estimator cloned at fit time. When None, an
        ``EmpiricalCovariance(assume_centered=True)`` is used.
    kind : str
        One of 'tangent', 'partial correlation' or 'correlation'; any other
        value leaves the covariances untransformed before vectorization.
    """

    def __init__(self, base_estimator=None, kind='tangent'):
        self.base_estimator = base_estimator
        self.kind = kind

    def fit(self, X, y=None):
        """Prepare the covariance estimator and, for 'tangent', the whitening.

        Each element of X is passed to the covariance estimator's ``fit``.
        Returns ``self``.
        """
        if self.base_estimator is not None:
            self.base_estimator_ = clone(self.base_estimator)
        else:
            self.base_estimator_ = EmpiricalCovariance(assume_centered=True)

        if self.kind != 'tangent':
            return self

        # Euclidean mean of the covariances, used as an approximation to the
        # geodesic mean.
        stacked = my_stack(
            [self.base_estimator_.fit(x).covariance_ for x in X])
        self.whitening_ = inv_sqrtm(np.mean(stacked, axis=0))
        return self

    def transform(self, X):
        """Estimate one covariance per element of X and vectorize it.

        Parameters
        ----------
        X : iterable
            Each element is passed to the fitted covariance estimator's
            ``fit``; the resulting ``covariance_`` is transformed according
            to ``kind``.

        Returns
        -------
        array
            One ``sym_to_vec`` output per element of X.
        """
        estimator = self.base_estimator_
        matrices = my_stack([estimator.fit(x).covariance_ for x in X])
        n_dims = matrices.shape[-1]
        kind = self.kind
        if kind == 'tangent':
            identity = np.identity(n_dims)
            whitening = self.whitening_
            matrices = [whitening.dot(m.dot(whitening)) - identity
                        for m in matrices]
        elif kind == 'partial correlation':
            matrices = [cov_to_corr(inv(m)) for m in matrices]
        elif kind == 'correlation':
            matrices = [cov_to_corr(m) for m in matrices]
        return np.array([sym_to_vec(m) for m in matrices])
def printSciKitCovarianceMatrixs():
    """Fit an EmpiricalCovariance on the RST feature matrix and print it.

    Loads the labeled RST data, stores each label under the 'ratio' key of
    its instance, builds the feature matrix with RSTCovarianceMatrixMaker and
    prints the fitted ``covariance_``.
    """
    # does not work, ValueError: setting an array element with a sequence.
    xMaker = RSTCovarianceMatrixMaker()
    nums, data, ilabels = getLabeledRSTData(False)
    for i, d in enumerate(data):
        d['ratio'] = ilabels[i]
    xMaker.setInstanceNums(nums)
    xMaker.fit(data)
    X = xMaker.transform(data)
    correlator = EmpiricalCovariance()
    correlator.fit(X)
    # Single-argument parenthesised form prints identically on Python 2 and
    # is valid syntax on Python 3 (the bare print statement is not).
    print(correlator.covariance_)
class CovEmbedding(BaseEstimator, TransformerMixin):
    """Transformer returning covariance coefficients expressed in a flat space.

    Parameters
    ----------
    cov_estimator : estimator or None
        Covariance estimator cloned at fit time. When None, an
        ``EmpiricalCovariance(assume_centered=True)`` is used.
    kind : str
        One of 'tangent', 'precision', 'partial correlation' or
        'correlation'.
    """

    def __init__(self, cov_estimator=None, kind='tangent'):
        self.cov_estimator = cov_estimator
        self.kind = kind

    def fit(self, X, y=None):
        """Prepare the covariance estimator and, for 'tangent', the whitening.

        Each element of X is passed to the covariance estimator's ``fit``.
        Returns ``self``.
        """
        if self.cov_estimator is not None:
            self.cov_estimator_ = clone(self.cov_estimator)
        else:
            self.cov_estimator_ = EmpiricalCovariance(assume_centered=True)

        if self.kind != 'tangent':
            return self

        # Reference point of the tangent space: Frechet mean of the
        # per-element covariances, then its inverse square root.
        per_element = [self.cov_estimator_.fit(x).covariance_ for x in X]
        self.mean_cov_ = spd_mfd.frechet_mean(per_element, max_iter=30,
                                              tol=1e-7)
        self.whitening_ = spd_mfd.inv_sqrtm(self.mean_cov_)
        return self

    def transform(self, X):
        """Estimate one covariance per element of X and vectorize it.

        Parameters
        ----------
        X : iterable
            Each element is passed to the fitted covariance estimator's
            ``fit``; the resulting ``covariance_`` is transformed according
            to ``kind``.

        Returns
        -------
        array
            One ``sym_to_vec`` output per element of X.

        Raises
        ------
        ValueError
            If ``kind`` is not one of the supported measures.
        """
        estimator = self.cov_estimator_
        matrices = spd_mfd.my_stack(
            [estimator.fit(x).covariance_ for x in X])

        kind = self.kind
        if kind == 'tangent':
            whitening = self.whitening_

            def embed(m):
                return spd_mfd.logm(whitening.dot(m).dot(whitening))
        elif kind == 'precision':
            def embed(m):
                return spd_mfd.inv(m)
        elif kind == 'partial correlation':
            def embed(m):
                return prec_to_partial(spd_mfd.inv(m))
        elif kind == 'correlation':
            def embed(m):
                return cov_to_corr(m)
        else:
            raise ValueError("Unknown connectivity measure.")

        embedded = [embed(m) for m in matrices]
        return np.array([sym_to_vec(m) for m in embedded])
def init_w_kmeans(data):
    """Calculate initialization values using K-Means.

    Runs KMeans with ``CLUSTERS`` clusters on ``data`` and returns the
    cluster centers together with one empirical covariance per cluster.

    Returns
    -------
    means : array
        ``km.cluster_centers_``.
    covs : array, shape (CLUSTERS, DIMENSIONS, DIMENSIONS)
        Covariance of the samples assigned to each label. Entries for labels
        that never occur are left uninitialized (``np.empty``).
    """
    # initialize means
    km = KMeans(n_clusters=CLUSTERS)
    labs = km.fit_predict(data)
    means = km.cluster_centers_

    # initialize covariances
    covs = np.empty((CLUSTERS, DIMENSIONS, DIMENSIONS))
    for l in np.unique(labs.ravel()):
        ce = EmpiricalCovariance()
        # Fit on this cluster's samples only. A second fit on the full data
        # used to follow and overwrote this result, giving every cluster the
        # same global covariance.
        ce.fit(data[labs == l, :])
        covs[l, :, :] = ce.covariance_

    return means, covs
def main():
    """Run the cross-validation step, then evaluate on the held-out rows.

    After calling ``mahanalobis()`` (which reads the first 228000 data rows
    of creditcard.csv), the remaining rows are loaded, scaled with the
    module-level ``scalar``, scored with Mahalanobis distances from an
    EmpiricalCovariance fit, thresholded at log10(distance) > 1.838, and the
    AUC / false-positive rate / recall are printed and plotted.
    """
    print ("Running CV on Mahalanobis Distance based approach.")
    mahanalobis()
    start_time = time.time()
    totalX = []
    totalY = []
    # Number of leading data rows reserved for the CV step; only rows after
    # them are "separate data". The counter used to start at this value,
    # which made the filter always true and pulled in every row.
    train_rows = 228000
    print ("\n\nNow testing on separate data.")
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        next(data, None)  # skip the header row
        for row_index, row in enumerate(data):
            if row_index < train_rows:  # CV on 80% of data
                continue
            totalX.append([float(i) for i in row[:-1]])
            totalY.append(int(row[-1]))
    print ("Data Loaded")
    totalX = scalar.fit_transform(totalX)
    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)
    # Flag a sample as positive when its log10 distance exceeds the cut-off.
    Y = [1 if np.log10(distance) > 1.838 else 0 for distance in distances]
    print("%s seconds" % (time.time() - start_time))
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print ("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print ("Recall : " + str(recall[1]))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange',
             label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator.

    Uses a covariance estimate to compute the Mahalanobis distance of the
    observations from the model.

    Parameters
    ----------
    robust : boolean
        Whether to use the robust estimator based on the Minimum Covariance
        Determinant computation (``MinCovDet``) instead of
        ``EmpiricalCovariance``.
    """

    def __init__(self, robust=False):
        # Keep the constructor argument as an attribute so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.robust = robust
        if not robust:
            from sklearn.covariance import (
                EmpiricalCovariance as CovarianceEstimator)
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fit the covariance model on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Return a score that decreases with the Mahalanobis distance.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The observations to score.

        Returns
        -------
        score : array, shape = [n_observations,]
            ``-(m ** 0.33)`` where ``m`` is ``self.model.mahalanobis`` of the
            observations after subtracting ``self.model.location_``.
        """
        # NOTE(review): X is centered here before calling mahalanobis();
        # confirm the installed scikit-learn version does not also subtract
        # location_ internally.
        return -self.model.mahalanobis(X - self.model.location_) ** 0.33
def phd(X, Y, **kwargs):
    """Estimate projection directions from a response-weighted covariance.

    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Y : array-like, shape = [n_samples]
        Response variable, where n_samples is the number of samples

    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'residuals' : If True, creates PHDs from the residuals of linear
                      regression (defaults to False)
        'return_mat' : Boolean whether key PHD matrix should be returned
                       (defaults to False).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        Eigenvectors of the generalized problem (weighted covariance,
        covariance of all samples) with the ``d`` largest eigenvalue
        magnitudes. When 'return_mat' is true, the weighted covariance matrix
        is returned as a second value.
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    residuals = kwargs.get('residuals', False)
    return_mat = kwargs.get('return_mat', False)
    N = X.shape[1]

    # Covariance of all samples
    cov_all = EmpiricalCovariance().fit(X.T).covariance_

    if residuals:
        # Replace the response by the residuals of a linear fit on X.
        linreg = LinearRegression().fit(X.T, Y)
        Y = Y - linreg.predict(X.T)

    # weighted_cov = 1/N * sum_i (Y_i - mean(Y)) * x_i x_i^T with x_i the
    # centered i-th sample, computed as one matrix product instead of a
    # Python loop of outer products.
    weights = np.ravel(np.asarray(Y) - np.mean(Y))
    X_centered = X - np.mean(X, axis=1, keepdims=True)
    weighted_cov = np.dot(X_centered * weights, X_centered.T) / float(N)

    vals, vecs = eig(weighted_cov, cov_all)
    order = np.argsort(np.abs(vals))[::-1]
    proj_vecs = vecs[:, order[:d]]
    if return_mat:
        return proj_vecs, weighted_cov
    else:
        return proj_vecs
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator.

    Uses a covariance estimate to compute the Mahalanobis distance of the
    observations from the model.

    Parameters
    ----------
    robust : boolean
        Whether to use the robust estimator based on the Minimum Covariance
        Determinant computation (``MinCovDet``) instead of
        ``EmpiricalCovariance``.
    """

    def __init__(self, robust=False):
        # Keep the constructor argument as an attribute so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.robust = robust
        if robust:
            from sklearn.covariance import MinCovDet as CovarianceEstimator
        else:
            from sklearn.covariance import (
                EmpiricalCovariance as CovarianceEstimator)
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fit the covariance model on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Return a score that decreases with the Mahalanobis distance.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The observations to score.

        Returns
        -------
        score : array, shape = [n_observations,]
            ``-(m ** 0.33)`` where ``m`` is ``self.model.mahalanobis`` of the
            observations after subtracting ``self.model.location_``.
        """
        # NOTE(review): X is centered here before calling mahalanobis();
        # confirm the installed scikit-learn version does not also subtract
        # location_ internally.
        return -self.model.mahalanobis(X - self.model.location_) ** 0.33
def test_suffstat_sk_full():
    # Compare the covariance and precision derived from
    # _estimate_gaussian_covariances_full (single component) with an
    # EmpiricalCovariance fit, in two special cases.
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    def check_against(estimator, covariances):
        # Zero error norm against the fitted estimator.
        for norm in ('frobenius', 'spectral'):
            assert_almost_equal(
                estimator.error_norm(covariances[0], norm=norm), 0)
        # Precision rebuilt from its Cholesky factor must match the direct
        # inverse of the covariance.
        chol_factors = _compute_precision_cholesky(covariances, 'full')
        rebuilt = np.array([np.dot(f, f.T) for f in chol_factors])
        inverted = np.array([linalg.inv(c) for c in covariances])
        assert_array_almost_equal(inverted, rebuilt)

    # Case 1: data treated as centered, estimator fitted on X * sqrt(resp).
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    weighted_X = np.sqrt(resp) * X
    counts = np.array([n_samples])
    zero_mean = np.zeros((1, n_features))
    predicted = _estimate_gaussian_covariances_full(
        resp, X, counts, zero_mean, 0)
    centered_estimator = EmpiricalCovariance(assume_centered=True)
    centered_estimator.fit(weighted_X)
    check_against(centered_estimator, predicted)

    # Case 2: all responsibilities equal to one, per-feature mean supplied.
    resp = np.ones((n_samples, 1))
    counts = np.array([n_samples])
    feature_mean = X.mean(axis=0).reshape((1, -1))
    predicted = _estimate_gaussian_covariances_full(
        resp, X, counts, feature_mean, 0)
    plain_estimator = EmpiricalCovariance(assume_centered=False)
    plain_estimator.fit(X)
    check_against(plain_estimator, predicted)
def test_suffstat_sk_full():
    # Compare the covariance and precision derived from
    # _estimate_gaussian_covariances_full (single component) with an
    # EmpiricalCovariance fit, in two special cases.
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    def assert_matches(fitted, covs):
        # Zero error norm against the fitted estimator.
        for norm_name in ('frobenius', 'spectral'):
            assert_almost_equal(
                fitted.error_norm(covs[0], norm=norm_name), 0)
        # Precision rebuilt from its Cholesky factor must match the direct
        # inverse of the covariance.
        factors = _compute_precision_cholesky(covs, 'full')
        from_factors = np.array([np.dot(f, f.T) for f in factors])
        from_inverse = np.array([linalg.inv(c) for c in covs])
        assert_array_almost_equal(from_inverse, from_factors)

    # Case 1: data treated as centered, estimator fitted on X * sqrt(resp).
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    scaled_X = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covs = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    fitted = EmpiricalCovariance(assume_centered=True)
    fitted.fit(scaled_X)
    assert_matches(fitted, covs)

    # Case 2: all responsibilities equal to one, per-feature mean supplied.
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covs = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    fitted = EmpiricalCovariance(assume_centered=False)
    fitted.fit(X)
    assert_matches(fitted, covs)
def peakSSV(X, Y, **kwargs):
    """Estimate directions from the samples with the largest responses.

    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Y : array-like, shape = [n_samples]
        Response variable, where n_samples is the number of samples

    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'n_samples' : Number of samples around the maximum Y to take.
        'rescale' : Boolean whether standardization should be performed
                    (True for yes).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        The last ``d`` PCA components of the selected samples; with
        'rescale', mapped back through the inverse square root of the
        covariance of all samples and orthonormalized by QR.
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    n_samples = kwargs['n_samples']
    rescale = kwargs['rescale']

    # Covariance of all samples, needed for the inverse transformation.
    cov_all = EmpiricalCovariance().fit(X.T).covariance_

    # Work on standardized data when requested. The standardized and the
    # sorted data used to be computed and then ignored, so PCA ran on the
    # last columns of the unsorted input.
    if rescale:
        data = StandardScaler().fit_transform(X.T).T
    else:
        data = X

    # Keep the n_samples columns with the largest responses.
    order = np.argsort(Y)
    peak_samples = data[:, order[-n_samples:]]

    pca_model = PCA().fit(peak_samples.T)
    U = pca_model.components_[-d:, :].T
    if rescale:
        # Apply inverse transformation
        vecs = sqrtm(scipy.linalg.inv(cov_all)).dot(U[:, :d])
        proj_vecs, _ = np.linalg.qr(vecs)
    else:
        proj_vecs = U[:, :d]
    return proj_vecs
def shape_(data):
    """Build an ellipse polygon from the empirical covariance of ``data``.

    A unit-radius buffer around the mean of ``data`` is scaled by
    ``2 * sqrt(2) * sqrt(eigenvalue)`` along x and y and then rotated by the
    orientation of the first eigenvector.

    Parameters
    ----------
    data : array-like
        Samples in rows; the mean is taken over axis 0.
        NOTE(review): the code indexes two eigen-components, so 2-D points
        are presumably expected - confirm with callers.

    Returns
    -------
    The rotated geometry returned by ``affinity.rotate``.
    """
    ec = EC()
    centre = np.mean(data, axis=0)
    covar = ec.fit(data).covariance_
    v, w = linalg.eigh(covar)
    v = 2. * np.sqrt(2.) * np.sqrt(v)
    # eigh returns eigenvectors as COLUMNS: w[:, i] belongs to v[i]. Taking
    # the row w[0] mirrors the angle whenever w is a rotation matrix.
    u = w[:, 0]
    # arctan2 avoids dividing by zero for a vertical eigenvector; it may
    # differ from arctan by 180 degrees, which leaves an ellipse unchanged.
    angle = np.degrees(np.arctan2(u[1], u[0]))
    circ = geometry.Point(centre).buffer(1)
    ellipse = affinity.scale(circ, float(v[0]), float(v[1]))
    return affinity.rotate(ellipse, angle)
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')

    # Mahalanobis distances computation test
    # (a leftover debug print statement, invalid on Python 3, was removed)
    mahal_dist = cov.mahalanobis(X)
    assert(np.amin(mahal_dist) > 0)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample; warnings raised by the fit are captured
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_equal(cov.location_, np.zeros(X.shape[1]))
def test_covariance():
    # Exercise EmpiricalCovariance / empirical_covariance on dataset X.

    # Fit on the full dataset and compare with the function-level estimate.
    estimator = EmpiricalCovariance()
    estimator.fit(X)
    reference = empirical_covariance(X)
    assert_array_almost_equal(reference, estimator.covariance_, 4)

    # error_norm of the estimate against itself is zero for each option set.
    for options in ({}, {'norm': 'spectral'}, {'norm': 'frobenius'},
                    {'scaling': False}, {'squared': False}):
        assert_almost_equal(estimator.error_norm(reference, **options), 0)
    # An unknown norm is rejected.
    with pytest.raises(NotImplementedError):
        estimator.error_norm(reference, norm='foo')

    # Mahalanobis distances are strictly positive on the training data.
    distances = estimator.mahalanobis(X)
    assert np.amin(distances) > 0

    # Single-feature case: first column reshaped to (n_samples, 1).
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = EmpiricalCovariance()
    estimator.fit(single_feature)
    reference_1d = empirical_covariance(single_feature)
    assert_array_almost_equal(reference_1d, estimator.covariance_, 4)
    assert_almost_equal(estimator.error_norm(reference_1d), 0)
    assert_almost_equal(
        estimator.error_norm(reference_1d, norm='spectral'), 0)

    # One sample with 5 features: fitting warns and the covariance is zero.
    single_sample = np.arange(5).reshape(1, 5)
    estimator = EmpiricalCovariance()
    expected_warning = (
        "Only one sample available. You may want to reshape your data array")
    with pytest.warns(UserWarning, match=expected_warning):
        estimator.fit(single_sample)
    assert_array_almost_equal(estimator.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # Integer input must yield the floating-point covariance.
    integer_data = np.asarray([[0, 1], [1, 0]])
    expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(integer_data), expected)

    # With assume_centered=True the location is all zeros.
    estimator = EmpiricalCovariance(assume_centered=True)
    estimator.fit(X)
    assert_array_equal(estimator.location_, np.zeros(X.shape[1]))
class MahalanobisDistance(DimReducer):
    """
    Mahalanobis distance of each person from a reference group.

    The mean and covariance are estimated (with sklearn's
    EmpiricalCovariance) from the people whose age lies in
    [age_lower, age_upper]; the projection is the square root of the value
    returned by the model's ``mahalanobis``.
    """

    def __init__(self, age_lower, age_upper):
        self.age_lower = age_lower
        self.age_upper = age_upper
        self.need_ages = True
        self.k = 1

    def _fit_from_processed_data(self, X, ages):
        # Boolean mask of the reference group (both bounds inclusive).
        in_reference_group = ((ages >= self.age_lower)
                              & (ages <= self.age_upper))
        group_size = in_reference_group.sum()
        print("%i people between %s and %s used for mean/cov calculation" % (
            group_size, self.age_lower, self.age_upper))
        # The estimate is only trusted with more than 1000 people.
        assert group_size > 1000
        self.model = EmpiricalCovariance(assume_centered=False)
        self.model.fit(X[in_reference_group, :])

    def _get_projections_from_processed_data(self, X):
        # Return the distances as a single column.
        distances = np.sqrt(self.model.mahalanobis(X))
        return distances.reshape([-1, 1])
def mahanalobis():
    """Score the first 228000 data rows of creditcard.csv.

    Rows are scaled with the module-level ``scalar``, an EmpiricalCovariance
    is fitted on them, and a row is predicted positive when the log10 of its
    Mahalanobis value exceeds 1.838.

    Returns
    -------
    (auc, fpr[1], recall[1]) computed from the labels and the predictions.
    """
    features = []
    labels = []
    with open("creditcard.csv", "rb") as f:
        for line_number, row in enumerate(csv.reader(f)):
            if line_number == 0:
                continue  # header row
            if line_number > 228000:  # test on 20% of data
                break
            features.append([float(value) for value in row[:-1]])
            labels.append(int(row[-1]))

    features = scalar.fit_transform(features)
    print ("Data Loaded")

    estimator = EmpiricalCovariance()
    estimator.fit(features)
    distances = estimator.mahalanobis(features)
    predictions = [1 if np.log10(distances[i]) > 1.838 else 0
                   for i in range(len(labels))]

    print ("Results")
    auc = roc_auc_score(labels, predictions)
    print(auc)
    fpr, _, _ = roc_curve(labels, predictions)
    print (fpr[1])
    _, recall, _ = precision_recall_curve(labels, predictions)
    print (recall[1])
    return auc, fpr[1], recall[1]
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    """Return the names of channels whose covariance profile is near zero.

    The channel covariance of the picked data is robustly scaled, then each
    channel is summarized by the median and the median absolute deviation of
    its column; channels whose combined, normalized distance is below
    ``threshold`` are reported.

    Parameters
    ----------
    inst : object
        Must provide ``pick_types``, ``_data`` and ``ch_names``.
    pick_types : dict or None
        Keyword arguments for ``inst.pick_types``; defaults to
        ``dict(meg='mag')``.
    threshold : float
        Distance below which a channel is reported.

    Returns
    -------
    list
        Names of the detected channels.
    """
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation

    picks = dict(meg='mag') if pick_types is None else pick_types
    picked = inst.pick_types(copy=True, **picks)

    estimator = EmpiricalCovariance()
    estimator.fit(picked._data.T)
    channel_cov = estimator.covariance_

    # center
    scaled = RobustScaler().fit_transform(channel_cov).T
    scaled /= median_abs_deviation(scaled)
    scaled -= np.median(scaled)

    # compute robust summary metrics
    mu = np.median(scaled, axis=0)
    sigma = median_abs_deviation(scaled, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)

    bad_indices = np.where(distance < threshold)[0]
    return [picked.ch_names[index] for index in bad_indices]
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    """Return the names of channels whose covariance profile is near zero.

    The channel covariance of the picked data is robustly scaled, then each
    channel is summarized by the median and the median absolute deviation of
    its column; channels whose combined, normalized distance is below
    ``threshold`` are reported.

    Parameters
    ----------
    inst : object
        Must provide ``pick_types``, ``_data`` and ``ch_names``.
    pick_types : dict or None
        Keyword arguments for ``inst.pick_types``; defaults to
        ``dict(meg='mag')``.
    threshold : float
        Distance below which a channel is reported.

    Returns
    -------
    list
        Names of the detected channels.
    """
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation

    if pick_types is None:
        selection = dict(meg='mag')
    else:
        selection = pick_types
    subset = inst.pick_types(copy=True, **selection)

    cov_model = EmpiricalCovariance()
    cov_model.fit(subset._data.T)
    cov_matrix = cov_model.covariance_

    # center
    robust_scaler = RobustScaler()
    profile = robust_scaler.fit_transform(cov_matrix).T
    profile /= median_abs_deviation(profile)
    profile -= np.median(profile)

    # compute robust summary metrics
    center = np.median(profile, axis=0)
    spread = median_abs_deviation(profile, axis=0)
    center /= median_abs_deviation(center)
    spread /= median_abs_deviation(spread)
    distance = np.sqrt(center ** 2 + spread ** 2)

    flagged = np.where(distance < threshold)[0]
    names = [subset.ch_names[ch] for ch in flagged]
    return names
def pca(X, **kwargs):
    """Return the leading principal directions of X.

    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'rescale' : Boolean whether standardization should be performed
                    (True for yes).
        'return_mat' : Boolean whether the matrix X.dot(X.T) should be
                       returned as well (defaults to False).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        The first ``d`` PCA components; with 'rescale', computed on the
        standardized data, mapped back through the inverse square root of
        the covariance of all samples and orthonormalized by QR.
    """
    d = kwargs['d']
    # Optional, as documented: a missing key used to raise KeyError.
    return_mat = kwargs.get('return_mat', False)
    rescale = kwargs['rescale']
    if rescale:
        # Covariance of all samples
        cov_all = EmpiricalCovariance().fit(X.T).covariance_
        Z = StandardScaler().fit_transform(X.T).T
        pca_model = PCA(svd_solver='full').fit(Z.T)
        proj_vecs = pca_model.components_[:d, :].T
        # Apply inverse transformation
        vecs = sqrtm(scipy.linalg.inv(cov_all)).dot(proj_vecs)
        proj_vecs, _ = np.linalg.qr(vecs)
    else:
        pca_model = PCA(svd_solver='full').fit(X.T)
        proj_vecs = pca_model.components_[:d, :].T
    if return_mat:
        return proj_vecs, X.dot(X.T)
    else:
        return proj_vecs
class ChangeDetector(object):
    """
    Joint Gaussian change detector using a scikit-learn style interface.

    This class is a thin wrapper around the scikit-learn covariance
    estimators (robust ``MinCovDet`` or ``EmpiricalCovariance``) and their
    Mahalanobis distance computation.

    Parameters
    ----------
    method : str
        'robust' (MinCovDet) or 'empirical' (EmpiricalCovariance).
    estimator_kw_args : dict or None
        Keyword arguments forwarded to the estimator constructor; None means
        no extra arguments.

    Raises
    ------
    ValueError
        If ``method`` is neither 'robust' nor 'empirical'.
    """

    def __init__(self, method='robust', estimator_kw_args=None):
        kw_args = {} if estimator_kw_args is None else estimator_kw_args
        # Compare by value: identity (`is`) on strings only works by accident
        # of interning and fails for strings built at runtime.
        if method == 'robust':
            self.covariance_estimator_ = MinCovDet(**kw_args)
        elif method == 'empirical':
            self.covariance_estimator_ = EmpiricalCovariance(**kw_args)
        else:
            raise ValueError(
                "{} is not a valid method. Must be one of 'robust' or 'empirical'"
                .format(method))

    def fit(self, X):
        """
        Fits the estimator.

        Parameters:
        -----------
        X - array of time series, shape (n_series, len_series)

        Returns self.
        """
        self.covariance_estimator_ = self.covariance_estimator_.fit(X)
        return self

    def predict(self, X, threshold):
        """
        Returns true for each time series predicted as change, together with
        the Mahalanobis distances.

        parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        threshold - float

        returns:
        y_pred - shape (n_time_series), true where the distance exceeds
                 the threshold
        distances - shape (n_time_series). The values returned by the fitted
                    estimator's ``mahalanobis`` for each time series
        """
        distances = self.covariance_estimator_.mahalanobis(X)
        return distances > threshold, distances
def test_covariance():
    # Exercise EmpiricalCovariance / empirical_covariance on dataset X.

    # Fit on the full dataset and compare with the function-level estimate.
    model = EmpiricalCovariance()
    model.fit(X)
    expected_cov = empirical_covariance(X)
    assert_array_almost_equal(expected_cov, model.covariance_, 4)

    # error_norm of the estimate against itself is zero for each option set.
    for extra in ({}, {'norm': 'spectral'}, {'norm': 'frobenius'},
                  {'scaling': False}, {'squared': False}):
        assert_almost_equal(model.error_norm(expected_cov, **extra), 0)
    # An unknown norm is rejected.
    assert_raises(NotImplementedError,
                  model.error_norm, expected_cov, norm='foo')

    # Mahalanobis distances are strictly positive on the training data.
    mahalanobis_values = model.mahalanobis(X)
    assert_greater(np.amin(mahalanobis_values), 0)

    # Single-feature case: first column reshaped to (n_samples, 1).
    column = X[:, 0].reshape((-1, 1))
    model = EmpiricalCovariance()
    model.fit(column)
    expected_cov_1d = empirical_covariance(column)
    assert_array_almost_equal(expected_cov_1d, model.covariance_, 4)
    assert_almost_equal(model.error_norm(expected_cov_1d), 0)
    assert_almost_equal(
        model.error_norm(expected_cov_1d, norm='spectral'), 0)

    # One sample with 5 features: fitting warns and the covariance is zero.
    lone_sample = np.arange(5).reshape(1, 5)
    model = EmpiricalCovariance()
    assert_warns(UserWarning, model.fit, lone_sample)
    assert_array_almost_equal(model.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # Integer input must yield the floating-point covariance.
    int_samples = np.asarray([[0, 1], [1, 0]])
    int_expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(int_samples), int_expected)

    # With assume_centered=True the location is all zeros.
    model = EmpiricalCovariance(assume_centered=True)
    model.fit(X)
    assert_array_equal(model.location_, np.zeros(X.shape[1]))
def save(X, Y, **kwargs):
    """Estimate a sufficient dimension subspace with the SAVE key matrix.

    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Y : array-like, shape = [n_samples]
        Response variable, where n_samples is the number of samples

    **kwargs
        'd' : intrinsic dimension (int). Required.
        'n_levelsets' : number of slices to use (int). Required.
        'rescale' : Boolean whether standardization should be performed
            (True for yes). Required.
        'return_mat' : Boolean whether the key SAVE matrix should be
            returned as well (defaults to False).

    Returns
    -------
    proj_vecs : array-like, shape = [n_features, d]
        Orthonormal system spanning the sufficient dimension subspace, where
        d refers to the intrinsic dimension.
    M : array-like, shape = [n_features, n_features]
        The key matrix; only returned (as second element of a tuple) when
        'return_mat' is True.

    Raises
    ------
    KeyError
        If one of the required entries 'd', 'n_levelsets' or 'rescale' is
        missing from kwargs.
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    n_levelsets = kwargs['n_levelsets']
    rescale = kwargs['rescale']
    return_mat = kwargs.get('return_mat', False)
    D, N = X.shape

    # Covariance of all samples (estimators expect samples along axis 0)
    emc = EmpiricalCovariance()
    emc = emc.fit(X.T)
    cov_all = emc.covariance_
    if rescale:
        # Standardize X feature-wise
        Z = StandardScaler().fit_transform(X.T).T

    # The helper may adjust the number of slices, hence the reassignment
    labels, n_levelsets = split_statistically_equivalent_blocks(
        X, Y, n_levelsets)

    M = np.zeros((D, D))  # Key matrix in SAVE
    empirical_probabilities = np.zeros(n_levelsets)
    for i in range(n_levelsets):
        in_slice = labels == i
        empirical_probabilities[i] = (
            float(np.count_nonzero(in_slice)) / float(N))
        if rescale:
            # Covariance of the standardized samples in slice i
            cov_sub = emc.fit(Z[:, in_slice].T).covariance_
            # NOTE(review): cov_all is the covariance of the unscaled X
            # while cov_sub comes from the standardized Z - confirm that
            # mixing the two is intended.
            M += empirical_probabilities[i] * \
                (np.eye(D) - cov_sub).dot(cov_all - cov_sub)
        else:
            # Covariance of the raw samples in slice i
            cov_sub = emc.fit(X[:, in_slice].T).covariance_
            M += empirical_probabilities[i] * \
                (cov_all - cov_sub).dot(cov_all - cov_sub)

    # Only the left singular vectors are needed
    U = np.linalg.svd(M)[0]
    if rescale:
        # Apply inverse transformation, then re-orthonormalize
        vecs = sqrtm(scipy.linalg.inv(cov_all)).dot(U[:, :d])
        proj_vecs, _ = np.linalg.qr(vecs)
    else:
        proj_vecs = U[:, :d]

    if return_mat:
        return proj_vecs, M
    return proj_vecs
def optimalShrinkage(X, return_covariance=False, method='rie'):
    """This function computes a cleaned, optimal shrinkage,
       rotationally-invariant estimator (RIE) of the true correlation
       matrix C underlying the noisy, in-sample estimate
       E = 1/T X * transpose(X)
       associated to a design matrix X of shape (T, N) (T measurements
       and N features).

       One approach to getting a cleaned estimator that predates the
       optimal shrinkage, RIE estimator consists in inverting the
       Marcenko-Pastur equation so as to replace the eigenvalues
       from the spectrum of E by an estimation of the true ones.

       This approach is known to be numerically-unstable, in addition
       to failing to account for the overlap between the sample eigenvectors
       and the true eigenvectors. How to compute such overlaps was first
       explained by Ledoit and Peche (cf. reference below). Their procedure
       was extended by Bun, Bouchaud and Potters, who also correct
       for a systematic downward bias in small eigenvalues.

       It is this debiased, optimal shrinkage, rotationally-invariant
       estimator that the function at hand implements.

       In addition to the above method, this function also provides access to:
       - The finite N regularization of the optimal RIE for small eigenvalues
         as provided in section 8.1 of [3] a.k.a. the inverse Wishart (IW)
         regularization.
       - The direct kernel method of O. Ledoit and M. Wolf in their 2017
         paper [4]. This is a direct port of their Matlab code.

       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).

       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.

       method: type string, optional (default="rie")
           - If "rie" : optimal shrinkage in the manner of Bun & al.
             with no regularisation
           - If "iw" : optimal shrinkage in the manner of Bun & al.
             with the so called Inverse Wishart regularization
           - If 'kernel': Direct kernel method of Ledoit Wolf.
           Any other value is handled like "rie".

       Returns
       -------
       E_RIE: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C. A sample
           estimator of C is the empirical covariance matrix E
           estimated from X. E is corrupted by in-sample noise.
           E_RIE is the optimal shrinkage, rotationally-invariant estimator
           (RIE) of C computed following the procedure of Joel Bun
           and colleagues (cf. references below).

           If return_covariance=True, E_RIE corresponds to a cleaned
           variance-covariance matrix.

       Raises
       ------
       SystemExit
           With exit code 1 when return_covariance is not a bool.

       References
       ----------
       1 "Eigenvectors of some large sample covariance matrix ensembles",
         O. Ledoit and S. Peche
         Probability Theory and Related Fields, Vol. 151 (1), pp 233-264
       2 "Rotational invariant estimator for general noisy matrices",
         J. Bun, R. Allez, J.-P. Bouchaud and M. Potters
         arXiv: 1502.06736 [cond-mat.stat-mech]
       3 "Cleaning large Correlation Matrices: tools from Random Matrix Theory",
         J. Bun, J.-P. Bouchaud and M. Potters
         arXiv: 1610.08104 [cond-mat.stat-mech]
       4 "Direct Nonlinear Shrinkage Estimation of Large-Dimensional
         Covariance Matrices (September 2017)",
         O. Ledoit and M. Wolf https://ssrn.com/abstract=3047302 or
         http://dx.doi.org/10.2139/ssrn.3047302
    """

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    # NOTE(review): terminating the interpreter from library code is unusual;
    # the exit-with-status-1 behaviour is kept for backward compatibility.
    if not isinstance(return_covariance, bool):
        sys.exit(1)

    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T

    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_

    if return_covariance:
        # Turn the covariance into a correlation matrix (in place); the
        # standard deviations are re-applied at the very end.
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T  # one eigenvector per row

    q = N / float(T)
    lambda_N = eigvals[0]  # The smallest empirical eigenvalue,
                           # given that the function used to compute
                           # the spectrum of a Hermitian or symmetric
                           # matrix - namely np.linalg.eigh - returns
                           # the eigenvalues in ascending order.

    # Compare by value: `is not 'kernel'` depended on string interning and
    # could silently send a 'kernel' request down the RIE path.
    if method != 'kernel':
        use_inverse_wishart = (method == 'iw')
        xis = [xiHelper(x, q, E) for x in eigvals]
        Gammas = [gammaHelper(x, q, N, lambda_N,
                              inverse_wishart=use_inverse_wishart)
                  for x in eigvals]
        # Apply the correction factor only where it exceeds 1
        lambda_hats = [xi * gamma if gamma > 1 else xi
                       for xi, gamma in zip(xis, Gammas)]
    else:
        lambda_hats = directKernel(q, T, N, eigvals)

    # Rebuild the matrix from the cleaned spectrum and the sample eigenvectors
    E_RIE = np.zeros((N, N), dtype=float)
    for lambda_hat, eigvec in zip(lambda_hats, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_RIE += lambda_hat * eigvec.dot(eigvec.T)

    # Rescale so that the diagonal is exactly 1
    tmp = 1./np.sqrt(np.diag(E_RIE))
    E_RIE *= tmp
    E_RIE *= tmp.reshape(-1, 1)

    if return_covariance:
        std = 1./inverse_std
        E_RIE *= std
        E_RIE *= std.reshape(-1, 1)

    return E_RIE
def clipped(X, alpha=None, return_covariance=False):
    """Clips the eigenvalues of an empirical correlation matrix E
       in order to provide a cleaned estimator E_clipped of the
       underlying correlation matrix.
       Proceeds by keeping the [N * alpha] top eigenvalues and shrinking
       the remaining ones by a trace-preserving constant
       (i.e. Tr(E_clipped) = Tr(E)).

       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).

       alpha: type float or derived from numbers.Real (default: None)
           Parameter between 0 and 1, inclusive, determining the fraction
           to keep of the top eigenvalues of an empirical correlation matrix.

           If left unspecified, alpha is chosen so as to keep all the
           empirical eigenvalues greater than the upper limit of
           the support to the Marcenko-Pastur spectrum. Indeed, such
           eigenvalues can be considered as associated with some signal,
           whereas the ones falling inside the Marcenko-Pastur range
           should be considered as corrupted with noise and indistinguishable
           from the spectrum of the correlation of a random matrix.

           This ignores finite-size effects that make it possible
           for the eigenvalues to exceed the upper and lower edges
           defined by the Marcenko-Pastur spectrum (cf. a set of results
           revolving around the Tracy-Widom distribution)

       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.

       Returns
       -------
       E_clipped: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C underlying
           a noisy, in-sample estimate E (empirical correlation matrix
           estimated from X). This cleaned estimator proceeds through
           a simple eigenvalue clipping procedure (cf. reference below).

           If return_covariance=True, E_clipped corresponds to a cleaned
           variance-covariance matrix.

       Raises
       ------
       SystemExit
           With exit code 1 when alpha is given but is not a real number
           in [0, 1], or when return_covariance is not a bool.

       Reference
       ---------
       "Financial Applications of Random Matrix Theory: a short review",
       J.-P. Bouchaud and M. Potters
       arXiv: 0910.1205 [q-fin.ST]
    """

    # Explicit checks instead of `assert`, which is stripped under
    # `python -O`.
    # NOTE(review): terminating the interpreter from library code is unusual;
    # the exit-with-status-1 behaviour is kept for backward compatibility.
    alpha_is_valid = alpha is None or (
        isinstance(alpha, Real) and 0 <= alpha <= 1)
    if not alpha_is_valid or not isinstance(return_covariance, bool):
        sys.exit(1)

    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T

    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_

    if return_covariance:
        # Turn the covariance into a correlation matrix (in place); the
        # standard deviations are re-applied at the very end.
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    # eigh returns the eigenvalues in ascending order
    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T  # one eigenvector per row

    # NaN marks the eigenvalues that are to be replaced by gamma
    if alpha is None:
        (lambda_min, lambda_max), _ = marcenkoPastur(X)
        xi_clipped = np.where(eigvals >= lambda_max, eigvals, np.nan)
    else:
        xi_clipped = np.full(N, np.nan)
        threshold = int(ceil(alpha * N))
        if threshold > 0:
            xi_clipped[-threshold:] = eigvals[-threshold:]

    to_replace = np.isnan(xi_clipped)
    n_replaced = to_replace.sum()
    # When every eigenvalue is kept there is nothing to redistribute; skip
    # the division, which used to evaluate 0/0 and emit a RuntimeWarning.
    if n_replaced:
        # Trace-preserving constant shared by all clipped eigenvalues
        gamma = float(E.trace() - np.nansum(xi_clipped)) / n_replaced
        xi_clipped = np.where(to_replace, gamma, xi_clipped)

    # Rebuild the matrix from the clipped spectrum and sample eigenvectors
    E_clipped = np.zeros((N, N), dtype=float)
    for xi, eigvec in zip(xi_clipped, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_clipped += xi * eigvec.dot(eigvec.T)

    # Rescale so that the diagonal is exactly 1
    tmp = 1./np.sqrt(np.diag(E_clipped))
    E_clipped *= tmp
    E_clipped *= tmp.reshape(-1, 1)

    if return_covariance:
        std = 1./inverse_std
        E_clipped *= std
        E_clipped *= std.reshape(-1, 1)

    return E_clipped
class Matching(IndividualOutcomeEstimator):
    """Outcome estimator based on nearest-neighbor matching of samples."""

    def __init__(
        self,
        propensity_transform=None,
        caliper=None,
        with_replacement=True,
        n_neighbors=1,
        matching_mode="both",
        metric="mahalanobis",
        knn_backend="sklearn",
        estimate_observed_outcome=False,
    ):
        """Match treatment and control samples with similar covariates.

        Args:
            propensity_transform (causallib.transformers.PropensityTransformer):
                an object for data preprocessing which adds the propensity
                score as a feature (default: None)
            caliper (float) : maximal distance for a match to be accepted. If
                not defined, all matches will be accepted. If defined, some
                samples may not be matched and their outcomes will not be
                estimated. (default: None)
            with_replacement (bool): whether samples can be used multiple times
                for matching. If set to False, the matching process will optimize
                the linear sum of distances between pairs of treatment and
                control samples and only `min(N_treatment, N_control)` samples
                will be estimated. Matching with no replacement does not make
                use of the `fit` data and is therefore not implemented for
                out-of-sample data (default: True)
            n_neighbors (int) : number of nearest neighbors to include in match.
                Must be 1 if `with_replacement` is `False`. If larger than 1, the
                estimate is calculated using the `regress_agg_function` or
                `classify_agg_function` across the `n_neighbors`. Note that when
                the `caliper` variable is set, some samples will have fewer than
                `n_neighbors` matches. (default: 1).
            matching_mode (str) : Direction of matching: `treatment_to_control`,
                `control_to_treatment` or `both` to indicate which set should
                be matched to which. All sets are cross-matched in `match`
                and when `with_replacement` is `False` all matching modes
                coincide. With replacement there is a difference.
            metric (str) : Distance metric string for calculating distance
                between samples. Note: if an external built `knn_backend`
                object with a different metric is supplied, `metric` needs to
                be changed to reflect that, because `Matching` will set its
                inverse covariance matrix if "mahalanobis" is set. (default:
                "mahalanobis", also supported: "euclidean")
            knn_backend (str or callable) : Backend to use for nearest neighbor
                search. Options are "sklearn" or a callable which returns an
                object implementing `fit`, `kneighbors` and `set_params`
                like the sklearn `NearestNeighbors` object. (default: "sklearn").
            estimate_observed_outcome (bool) : Whether to allow a match of a
                sample to a sample other than itself when looking within its own
                treatment value. If True, the estimated potential outcome for the
                observed outcome may differ from the true observed outcome.
                (default: False)

        Attributes:
            classify_agg_function (callable) : Aggregating function for outcome
                estimation when classifying. (default: majority_rule)
                Usage is determined by type of `y` during `fit`
            regress_agg_function (callable) : Aggregating function for outcome
                estimation when regressing or predicting prob_a. (default:
                np.mean)
                Usage is determined by type of `y` during `fit`
            treatments_ : copy of the treatment assignment `a` passed to
                `fit` (created after `fit`)
            outcome_ : copy of the outcomes `y` passed to `fit`
                (created after `fit`)
            match_df_ (pd.DataFrame) : Dataframe of most recently calculated
                matches. For details, see `match`. (created after `match`)
            samples_used_ (pd.Series) : Series with count of samples used
                during most recent match. Series includes a count for each
                treatment value. (created after `match`)
        """
        self.propensity_transform = propensity_transform
        self.covariance_conditioner = EmpiricalCovariance()
        self.caliper = caliper
        self.with_replacement = with_replacement
        self.n_neighbors = n_neighbors
        self.matching_mode = matching_mode
        self.metric = metric
        # if classify task, default aggregation function is majority
        self.classify_agg_function = majority_rule
        # if regress task, default aggregation function is mean
        self.regress_agg_function = np.mean
        self.knn_backend = knn_backend
        self.estimate_observed_outcome = estimate_observed_outcome

    def fit(self, X, a, y, sample_weight=None):
        """Load the treatments and outcomes and fit search trees.

        Applies transform to covariates X, initializes search trees for each
        treatment value for performing nearest neighbor searches.

        Note: Running `fit` a second time overwrites any information from
        previous `fit` or `match` and re-fits the propensity_transform object.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcomes for
                the n samples.
            sample_weight: IGNORED In signature for compatibility with other
                estimators.

        Note: `X`, `a` and `y` must share the same index.

        Returns:
            self (Matching) the fitted object
        """
        self._clear_post_fit_variables()
        self.outcome_ = y.copy()
        self.treatments_ = a.copy()

        if self.propensity_transform:
            self.propensity_transform.fit(X, a)
            X = self.propensity_transform.transform(X)

        self.conditioned_covariance_ = self._calculate_covariance(X)

        # one nearest-neighbor search object per treatment value
        self.treatment_knns_ = {}
        for treatment_value in self.treatments_.unique():
            haystack = X[self.treatments_ == treatment_value]
            self.treatment_knns_[treatment_value] = self._fit_sknn(haystack)

        return self

    def _execute_matching(self, X, a):
        """Execute matching of samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        This function ignores the existing `match_df_` and will overwrite it.
        It is thus useful for if you have changed the settings and need to
        rematch the samples. For most applications, the `match` function is
        more convenient.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
                ` match_df.loc[treatment_value, sample_id]` has columns `matches`
                and `distances` containing lists of indices to samples and the
                respective distances for the matches discovered for `sample_id`
                from within the fitted samples with the given `treatment_value`.
                The indices in the `matches` column are from the fitted data,
                not the X argument in `match`. If `sample_id` had no match,
                `match_df.loc[treatment_value, sample_id].matches = []`.
                The DataFrame has shape (n* len(a.unique()), 2 ).

        Raises:
            NotImplementedError: Raised when with_replacement is False and
                n_neighbors is not 1.
        """
        if self.n_neighbors != 1 and not self.with_replacement:
            # The previous text claimed the opposite of this condition and
            # lacked a space between the concatenated literals.
            raise NotImplementedError(
                "Matching more than one neighbor is only implemented for "
                "matching with replacement")

        if self.propensity_transform:
            X = self.propensity_transform.transform(X)

        if self.with_replacement:
            self.match_df_ = self._withreplacement_match(X, a)
        else:
            self.match_df_ = self._noreplacement_match(X, a)

        sample_id_name = X.index.name if X.index.name is not None else "sample_id"
        self.match_df_.index.set_names(["match_to_treatment", sample_id_name],
                                       inplace=True)

        # we record the number of samples that were successfully matched of
        # each treatment value
        self.samples_used_ = self._count_samples_used_by_treatment_value(a)
        return self.match_df_

    def estimate_individual_outcome(self, X, a, y=None, treatment_values=None,
                                    predict_proba=True, dropna=True):
        """
        Calculate the potential outcome for each sample and treatment value.

        Execute match and calculate, for each treatment value and each sample,
        the expected outcome.

        Note: Out of sample estimation for matching without replacement requires
        passing a `y` vector here. If no 'y' is passed here, the values received
        by `fit` are used, and if the estimation indices are not a subset of the
        fitted indices, the estimation will fail.

        If the attribute `estimate_observed_outcome` is
        `True`, estimates will be calculated for the observed outcomes as well.
        If not, then the observed outcome will be passed through from the
        corresponding element of `y` passed to `fit`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcome values for
                n samples. This is only used when `with_replacement=False`.
                Otherwise, the outcome values passed to `fit` are used.
                NOTE(review): this method's body never reads `y` - confirm
                whether it is consumed elsewhere.
            predict_proba (bool) : whether to output classifications or
                probabilities for a classification task. If set to False and
                data is non-integer, a warning is issued. (default True)
            dropna (bool) : For samples that were unmatched due to caliper
                restrictions, drop from outcome_df leading to a potentially
                smaller sized output, or include them as NaN. (default: True)
            treatment_values : IGNORED

        Note: The args are assumed to share the same index.

        Returns:
            outcome_df (pd.DataFrame)

        Raises:
            ValueError: if every estimated outcome is missing.
        """
        match_df = self.match(X, a, use_cached_result=True)
        outcome_df = self._aggregate_match_df_to_generate_outcome_df(
            match_df, a, predict_proba)
        outcome_df = self._filter_outcome_df_by_matching_mode(outcome_df, a)

        if outcome_df.isna().all(axis=None):
            raise ValueError("Matching was not successful and no outcomes can "
                             "be estimated. Check caliper value.")
        if dropna:
            outcome_df = outcome_df.dropna()
        return outcome_df

    def match(self, X, a, use_cached_result=True, successful_matches_only=False):
        """Matching the samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            use_cached_result (bool): Whether or not to return the `match_df`
                from the most recent matching operation. The cached result
                will only be used if the sample indices of `X` and those of
                `match_df` are identical, otherwise it will rematch.
            successful_matches_only (bool): Whether or not to filter the matches
                to those which matched successfully. If set to `False`, the
                resulting DataFrame will have shape (n* len(a.unique()), 2 ),
                otherwise it may have a smaller shape due to unsuccessful
                matches.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
                ` match_df.loc[treatment_value, sample_id]` has columns `matches`
                and `distances` containing lists of indices to samples and the
                respective distances for the matches discovered for `sample_id`
                from within the fitted samples with the given `treatment_value`.
                The indices in the `matches` column are from the fitted data,
                not the X argument in `match`. If `sample_id` had no match,
                `match_df.loc[treatment_value, sample_id].matches = []`.
                The DataFrame has shape (n* len(a.unique()), 2 ), if
                `successful_matches_only` is set to `False`.

        Raises:
            NotImplementedError: Raised when with_replacement is False and
                n_neighbors is not 1.
        """
        # NOTE(review): the cache check looks up treatment value 0, so it
        # assumes 0 is one of the matched treatment values - confirm.
        cached_result_available = (hasattr(self, "match_df_")
                                   and X.index.equals(self.match_df_.loc[0].index))
        if not (use_cached_result and cached_result_available):
            self._execute_matching(X, a)
        return self._get_match_df(
            successful_matches_only=successful_matches_only)

    def matches_to_weights(self, match_df=None):
        """Calculate weights based on a given set of matches.

        For each matching from one treatment value to another, a weight vector
        is generated. The weights are calculated as the number of times a
        sample was selected in a matching, with each occurrence weighted
        according to the number of other samples in that matching. The weights
        can be used to estimate outcomes or to check covariate balancing. The
        function can only be called after `match` has been run.

        Args:
            match_df (pd.DataFrame) : a DataFrame of matches returned from
                `match`. If not supplied, will use the `match_df_` attribute if
                available, else raises ValueError. Will not execute `match` to
                generate a `match_df`.

        Returns:
            weights_df (pd.DataFrame): DataFrame of shape (n,M) where M is the
                number of permutations of `a.unique()`.
        """
        if match_df is None:
            match_df = self._get_match_df(successful_matches_only=False)

        match_permutations = sorted(permutations(self.treatments_.unique()))
        weights_df = pd.DataFrame([
            self._matches_to_weights_single_matching(s, t, match_df)
            for s, t in match_permutations
        ], ).T
        return weights_df

    def get_covariates_of_matches(self, s, t, covariates):
        """
        Look up covariates of closest matches for a given matching.

        Using `self.match_df_` and the supplied `covariates`, look up
        the covariates of the last match. The function can only be called
        after `match` has been run.

        Args:
            s (int) : source treatment value
            t (int) : target treatment value
            covariates (pd.DataFrame) : The same covariates which were
                passed to `fit`.

        Returns:
            covariate_df (pd.DataFrame) :
                a DataFrame of size (n_matched_samples, n_covariates * 3 + 2)
                with the covariate values of the sample, covariates of its match,
                calculated distance and number of neighbors found within the
                given caliper (with no caliper this will equal self.n_neighbors )
        """
        match_df = self._get_match_df()
        subdf = match_df.loc[s][self.treatments_ == t]
        sample_id_name = subdf.index.name

        def get_covariate_difference_from_nearest_match(source_row_index):
            # only the first (nearest) match of the sample is reported
            j = subdf.loc[source_row_index].matches[0]
            delta_series = pd.Series(covariates.loc[source_row_index] -
                                     covariates.loc[j])
            source_row = covariates.loc[j].copy()
            source_row.at[sample_id_name] = j
            target_row = covariates.loc[source_row_index].copy()
            covariate_differences = pd.concat({
                t: target_row,
                s: source_row,
                "delta": delta_series,
                "outcomes": pd.Series({
                    t: self.outcome_.loc[source_row_index],
                    s: self.outcome_.loc[j]
                }),
                "match": pd.Series(
                    dict(
                        n_neighbors=len(subdf.loc[source_row_index].matches),
                        distance=subdf.loc[source_row_index].distances[0],
                    )),
            })
            return covariate_differences

        covdf = pd.DataFrame(data=[
            get_covariate_difference_from_nearest_match(i) for i in subdf.index
        ], index=subdf.index)
        covdf = covdf.reset_index()
        cols = covdf.columns
        covdf.columns = pd.MultiIndex.from_tuples([(t, sample_id_name)] +
                                                  list(cols[1:]))
        return covdf

    def _clear_post_fit_variables(self):
        """Delete every instance attribute whose name ends with "_"."""
        # iterate over a snapshot: attributes are removed while looping
        for var in list(vars(self)):
            if var.endswith("_"):
                delattr(self, var)

    def _calculate_covariance(self, X):
        """Return the covariance averaged over treatment groups.

        For data with a single column a 1x1 array holding 1 is returned.
        """
        if len(X.shape) > 1 and X.shape[1] > 1:
            V_list = []
            for a in self.treatments_.unique():
                X_at_a = X[self.treatments_ == a].copy()
                current_V = self.covariance_conditioner.fit(X_at_a).covariance_
                V_list.append(current_V)
            # following Imbens&Rubin, we average across treatment groups
            V = np.mean(V_list, axis=0)
        else:
            # for 1d data revert to euclidean metric
            V = np.array(1).reshape(1, 1)
        return V

    def _aggregate_match_df_to_generate_outcome_df(self, match_df, a,
                                                   predict_proba):
        """Aggregate the fitted outcomes of each sample's matches.

        Returns a DataFrame with one column per treatment value in `a`.
        """
        agg_function = self._get_agg_function(predict_proba)

        def outcome_from_matches_by_idx(x):
            return agg_function(self.outcome_.loc[x])

        outcomes = {}
        for i in sorted(a.unique()):
            outcomes[i] = match_df.loc[i].matches.apply(
                outcome_from_matches_by_idx)
        outcome_df = pd.DataFrame(outcomes)
        return outcome_df

    def _get_match_df(self, successful_matches_only=True):
        """Return a copy of `match_df_`, optionally without empty matches.

        Raises:
            NotFittedError: if no matching has been executed yet.
            ValueError: if filtering leaves no successful match.
        """
        if not hasattr(self, "match_df_") or self.match_df_ is None:
            raise NotFittedError("You need to run `match` first")
        match_df = self.match_df_.copy()
        if successful_matches_only:
            # an empty `matches` list is falsy, i.e. an unsuccessful match
            match_df = match_df[match_df.matches.apply(bool)]
            if match_df.empty:
                raise ValueError(
                    "Matching was not successful and no outcomes can be "
                    "estimated. Check caliper value.")
        return match_df

    def _filter_outcome_df_by_matching_mode(self, outcome_df, a):
        """Keep the rows of `outcome_df` selected by `matching_mode`."""
        if self.matching_mode == "treatment_to_control":
            outcome_df = outcome_df[a == 1]
        elif self.matching_mode == "control_to_treatment":
            outcome_df = outcome_df[a == 0]
        elif self.matching_mode == "both":
            pass
        else:
            raise NotImplementedError(
                "Matching mode {} is not implemented. Please select one of "
                "'treatment_to_control', 'control_to_treatment', "
                "or 'both'.".format(self.matching_mode))
        return outcome_df

    def _get_agg_function(self, predict_proba):
        """Select the aggregation function for the outcomes of matches.

        Warns when classification is requested for outcomes that do not
        look integer-valued.
        """
        if predict_proba:
            agg_function = self.regress_agg_function
        else:
            agg_function = self.classify_agg_function
            try:
                isoutputinteger = np.allclose(self.outcome_.apply(int),
                                              self.outcome_)
                if not isoutputinteger:
                    warnings.warn("Classifying non-categorical outcomes. "
                                  "This is probably a mistake.")
            except Exception:
                # Best-effort check only; `Exception` (instead of a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                warnings.warn(
                    "Unable to detect whether outcome is integer-like. ")
        return agg_function

    def _instantiate_nearest_neighbors_object(self):
        """Build the nearest-neighbor backend and apply the metric settings."""
        backend = self.knn_backend
        if backend == "sklearn":
            backend_instance = NearestNeighbors(algorithm="auto")
        elif callable(backend):
            backend_instance = backend()
            self.metric = backend_instance.metric
        elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
            backend_instance = sk_clone(backend)
            self.metric = backend_instance.metric
        else:
            raise NotImplementedError(
                "`knn_backend` must be either an NearestNeighbors-like object,"
                " a callable returning such an object, or the string \"sklearn\""
            )
        backend_instance.set_params(**self._get_metric_dict())
        return backend_instance

    def _fit_sknn(self, target_df):
        """
        Fit scikit-learn NearestNeighbors object with samples in target_df.

        Fits object, adds metric parameters and returns namedtuple which
        also includes DataFrame indices so that identities can be looked up.

        Args:
            target_df (pd.DataFrame) : DataFrame of covariates to fit

        Returns:
            KNN (namedtuple) : Namedtuple with members `learner` and `index`
                containing the fitted sklearn object and an index lookup vector,
                respectively.
        """
        target_array = target_df.values
        sknn = self._instantiate_nearest_neighbors_object()
        target_array = self._ensure_array_columnlike(target_array)
        sknn.fit(target_array)
        return KNN(sknn, target_df.index)

    @staticmethod
    def _ensure_array_columnlike(target_array):
        """Reshape 1d or single-column input to shape (-1, 1)."""
        if len(target_array.shape) < 2 or target_array.shape[1] == 1:
            target_array = target_array.reshape(-1, 1)
        return target_array

    def _get_metric_dict(
        self,
        VI_in_metric_params=True,
    ):
        """Return the metric keyword arguments for the distance backend.

        For the "mahalanobis" metric the inverse of the conditioned
        covariance is added, either nested under "metric_params" or as a
        top-level "VI" entry depending on `VI_in_metric_params`.
        """
        metric_dict = dict(metric=self.metric)
        if self.metric == "mahalanobis":
            VI = np.linalg.inv(self.conditioned_covariance_)
            if VI_in_metric_params:
                metric_dict["metric_params"] = {"VI": VI}
            else:
                metric_dict["VI"] = VI
        return metric_dict

    def _kneighbors(self, knn, source_df):
        """Lookup neighbors in knn object.

        Args:
            knn (namedtuple) : knn named tuple to look for neighbors in. The
                object has `learner` and `index` attributes to reference the
                original df index.
            source_df (pd.DataFrame) : a DataFrame of source data points to use
                as "needles" for the knn "haystack."

        Returns:
            match_df (pd.DataFrame) : a DataFrame of matches
        """
        source_array = source_df.values
        # 1d data must be in shape (-1, 1) for sklearn.knn
        source_array = self._ensure_array_columnlike(source_array)
        distances, neighbor_array_indices = knn.learner.kneighbors(
            source_array, n_neighbors=self.n_neighbors)
        return self._generate_match_df(source_df, knn.index, distances,
                                       neighbor_array_indices)

    def _generate_match_df(self, source_df, target_df_index, distances,
                           neighbor_array_indices):
        """
        Take results of matching and build into match_df DataFrame.

        For clarity we'll call the samples that are being matched "needles" and
        the set of samples that they looked for matches in the "haystack".

        Args:
            source_df (pd.DataFrame) : Covariate dataframe of N "needles"
            target_df_index (np.array) : An array of M indices of the haystack
                samples in their original dataframe.
            distances (np.array) : An array of N arrays of floats of length K
                where K is `self.n_neighbors`.
            neighbor_array_indices (np.array) : An array of N arrays of ints of
                length K where K is `self.n_neighbors`.
        """
        # target is the haystack, source is the needle(s)
        # translate array indices back to original indices
        matches_dict = {}
        for source_idx, distance_row, neighbor_array_index_row in zip(
                source_df.index, distances, neighbor_array_indices):
            neighbor_df_indices = \
                target_df_index[neighbor_array_index_row.flatten()]
            if self.caliper is not None:
                # keep only neighbors strictly closer than the caliper
                neighbor_df_indices = [
                    n for i, n in enumerate(neighbor_df_indices)
                    if distance_row[i] < self.caliper
                ]
                distance_row = [d for d in distance_row if d < self.caliper]
            matches_dict[source_idx] = dict(matches=list(neighbor_df_indices),
                                            distances=list(distance_row))
        # convert dict of dicts like { 1: {'matches':[], 'distances':[]}} to df
        return pd.DataFrame(matches_dict).T

    def _matches_to_weights_single_matching(self, s, t, match_df):
        """
        For a given match, calculate the resulting weight vector.

        The weight vector adds a count each time a sample is used, weighted by
        the number of other neighbors when it was used. This is necessary to
        make the weighted sum return the correct effect estimate.
        """
        weights = pd.Series(self.treatments_.copy() * 0)
        name = {0: "control", 1: "treatment"}
        weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
        s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
        # `Series.items()` replaces `iteritems()`, which pandas 2.0 removed
        for source_idx, matches_list in s_to_t_matches.items():
            if matches_list:
                weights.loc[source_idx] += 1
            for match in matches_list:
                weights.loc[match] += 1 / len(matches_list)
        return weights

    def _get_distance_matrix(self, source_df, target_df):
        """
        Create distance matrix for no replacement match.

        Combines metric, caliper and source/target data into a
        precalculated distance matrix which can be passed to
        scipy.optimize.linear_sum_assignment.
        """
        cdist_args = dict(
            XA=self._ensure_array_columnlike(source_df.values),
            XB=self._ensure_array_columnlike(target_df.values),
        )
        cdist_args.update(self._get_metric_dict(False))
        distance_matrix = distance.cdist(**cdist_args)
        if self.caliper is not None:
            # pairs beyond the caliper get a prohibitive cost
            distance_matrix[distance_matrix > self.caliper] = VERY_LARGE_NUMBER
        return distance_matrix

    def _withreplacement_match(self, X, a):
        """Match every sample in X against each fitted treatment group."""
        matches = {}  # maps treatment value to list of matches TO that value

        for treatment_value, knn in self.treatment_knns_.items():
            matches[treatment_value] = self._kneighbors(knn, X)
            # when producing potential outcomes we may want to force the
            # value of the observed outcome to be the actual observed
            # outcome, and not an average of the k nearest samples.
            if not self.estimate_observed_outcome:

                def limit_within_treatment_matches_to_self_only(row):
                    if (a.loc[row.name] == treatment_value
                            and row.name in row.matches):
                        row.matches = [row.name]
                        row.distances = [0]
                    return row

                matches[treatment_value] = matches[treatment_value].apply(
                    limit_within_treatment_matches_to_self_only, axis=1)

        return pd.concat(matches, sort=True)

    def _noreplacement_match(self, X, a):
        """Optimally pair samples of each pair of treatment values in X."""
        match_combinations = sorted(combinations(a.unique(), 2))
        matches = {}

        for s, t in match_combinations:
            distance_matrix = self._get_distance_matrix(X[a == s], X[a == t])
            source_array, neighbor_array_indices, distances = \
                self._optimally_match_distance_matrix(distance_matrix)
            source_df = X[a == s].iloc[np.array(source_array)]
            target_df = X[a == t].iloc[np.array(neighbor_array_indices)]
            if t in matches or s in matches:
                warnings.warn("No-replacement matching for more than "
                              "2 treatment values is not supported")
            matches[t] = self._create_match_df_for_no_replacement(
                a, source_df, target_df, distances)
            matches[s] = self._create_match_df_for_no_replacement(
                a, target_df, source_df, distances)

        match_df = pd.concat(matches, sort=True)
        return match_df

    def _optimally_match_distance_matrix(self, distance_matrix):
        """Solve the assignment problem and apply the caliper filter."""
        source_array, neighbor_array_indices = linear_sum_assignment(
            distance_matrix)
        # each distance is wrapped in a list to mirror the k-neighbors layout
        distances = [[
            distance_matrix[s_idx, t_idx]
        ] for s_idx, t_idx in zip(source_array, neighbor_array_indices)]
        source_array, neighbor_array_indices, distances = \
            self._filter_noreplacement_matches_using_caliper(
                source_array, neighbor_array_indices, distances)
        return source_array, neighbor_array_indices, distances

    def _filter_noreplacement_matches_using_caliper(self, source_array,
                                                    neighbor_array_indices,
                                                    distances):
        """Drop assigned pairs whose distance exceeds the caliper."""
        if self.caliper is None:
            return source_array, neighbor_array_indices, distances
        keep_indices = [
            i for i, d in enumerate(distances) if d[0] <= self.caliper
        ]
        source_array = source_array[keep_indices]
        neighbor_array_indices = neighbor_array_indices[keep_indices]
        distances = [distances[i] for i in keep_indices]
        if not keep_indices:
            warnings.warn("No matches found, check caliper. "
                          "No estimation possible.")
        return source_array, neighbor_array_indices, distances

    @staticmethod
    def _create_match_df_for_no_replacement(base_series, source_df, target_df,
                                            distances):
        """Build the per-treatment match frame for no-replacement matching.

        Rows not touched below keep empty `matches` and `distances` lists.
        """
        match_sub_df = pd.DataFrame(
            index=base_series.index,
            columns=[
                "matches",
                "distances",
            ],
            data=base_series.apply(lambda x: pd.Series([[], []])).values,
            dtype="object",
        )

        # matching from source to target: read distances
        match_sub_df.loc[source_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=distances,
            ),
            index=source_df.index,
        )

        # matching from target to target: fill with zeros
        match_sub_df.loc[target_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=[[0]] * len(distances),
            ),
            index=target_df.index,
        )
        return match_sub_df

    def _count_samples_used_by_treatment_value(self, a):
        """Count, per treatment value, the other-group samples matched to it."""
        # we record the number of samples that were successfully matched of
        # each treatment value
        samples_used = {
            treatment_value:
            self.match_df_.loc[treatment_value][
                a != treatment_value].matches.apply(bool).sum()
            for treatment_value in sorted(a.unique(), reverse=True)
        }
        return pd.Series(samples_used)
###### Likelyhood Computation ###### # Fold the angles in params into proper range, such that # they centered at the mean. N_CYCLE_FOLD_ANGLE = 10 for j in xrange(N_CYCLE_FOLD_ANGLE): mean = np.mean(params, axis=0) for i in xrange(3, 6): # index 3,4,5 are angles, others are distances params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi if PARAMS_TLR[i] > mean[i] + np.pi: PARAMS_TLR[i] += 2 * np.pi if PARAMS_TLR[i] < mean[i] - np.pi: PARAMS_TLR[i] -= 2 * np.pi est = EmpiricalCovariance(True, False) est.fit(params) log_likelyhood = est.score(PARAMS_TLR[None, :]) KT = 0.59 free_e = -log_likelyhood * KT print 'Log likelyhood score:', log_likelyhood print 'Free energy:', free_e ###### Output the best conformer to pdb ###### def generate_bp_par_file(params, bps, out_name): assert(len(params) == len(bps)) n_bp = len(params) # convert from radians to degrees params[:, 3:] = np.degrees(params[:, 3:])
import numpy as np
from scipy.io import loadmat
from sklearn.covariance import EmpiricalCovariance, EllipticEnvelope
from sklearn.metrics import accuracy_score, classification_report

# number of outliers in cardio data = 176
N_OUTLIERS = 176

cardio_data = loadmat('cardio.mat')

# --- Mahalanobis distances from the empirical covariance -------------------
estimator = EmpiricalCovariance()
cov = estimator.fit(cardio_data['X'])
mahal_cov = cov.mahalanobis(cardio_data['X'])
# Indices of the N_OUTLIERS largest distances. `kth` must be counted from
# the end: with kth=N_OUTLIERS the last N_OUTLIERS positions are only
# guaranteed to be >= the kth smallest value, not to be the largest ones.
indexes = np.argpartition(mahal_cov, -N_OUTLIERS)[-N_OUTLIERS:]
y_pred = np.zeros(cardio_data['y'].shape)
y_pred[indexes] = 1
print(classification_report(cardio_data['y'], y_pred))
print(accuracy_score(cardio_data['y'], y_pred))

# --- Mahalanobis distances from the robust (elliptic envelope) fit ---------
# NOTE(review): the estimator is fitted on X^T X rather than on X itself;
# left unchanged -- confirm this is intended.
cov = EllipticEnvelope().fit(np.dot(cardio_data['X'].T, cardio_data['X']))
mahal_cov = cov.mahalanobis(cardio_data['X'])
indexes = np.argpartition(mahal_cov, -N_OUTLIERS)[-N_OUTLIERS:]
y_pred = np.zeros(cardio_data['y'].shape)
y_pred[indexes] = 1
print(classification_report(cardio_data['y'], y_pred))
print(accuracy_score(cardio_data['y'], y_pred))
# #################################### # PLSR - Marco # #################################### plsr = PLSR(X, Y) plsr.Initialize() plsr.EvaluateComponents() weights = plsr.GetWeights() comps = plsr.ReturnComponents() print('Covariance X (XX\'):\n %s' % str(X.dot(X.T).shape)) print('Covariance X (plsr):\n %s' % str(plsr._covX.shape)) print('Covariance X (numpy):\n %s' % np.cov(X, rowvar=False)) print('Covariance X (pandas):\n %s' % dfx.cov().values) cov = EmpiricalCovariance(assume_centered=True) cov.fit(X) print('Covariance X (sklearn):\n %s' % cov.covariance_) # print('weigths 0:\n %s' % weights[0]) # print('weigths 1:\n %s' % weights[1]) # print('Y Scores:\n %s' % comps[0]) # print('Components Y:\n %s' % comps[1]) # #################################### # PLSR - SKLEARN # #################################### # print('\n\nPLS-SVD') # plsr = PLSSVD(n_components=2, scale=False) # plsr.fit(X,Y)
def get_covariance(X):
    """Return the empirical covariance matrix of ``X``.

    The estimator is created with ``assume_centered=True``, so the data is
    used as given and is not centered before the covariance is computed.
    """
    estimator = EmpiricalCovariance(assume_centered=True)
    return estimator.fit(X).covariance_
def fit(self, X, n_jobs=-1):
    """Fit the empirical covariance on ``X`` and, unless disabled, the
    outlier threshold.

    The threshold step is skipped when ``self.no_fit`` is truthy. ``n_jobs``
    is forwarded unchanged to ``set_threshold``. Returns ``self``.
    """
    EmpiricalCovariance.fit(self, X)
    if self.no_fit:
        return self
    CovarianceOutlierDetectionMixin.set_threshold(self, X, n_jobs=n_jobs)
    return self
# Finish and save the purity histogram, then the discriminator-vs-pt map.
hPurity_disc.Divide(hPurity_discDen)
hPurity_disc.Draw()
c.Print("purity_disc.png")
hMVAdisc_pt.Draw("colz")
c.Print("discriminator_vs_candPt.png")

from sklearn.covariance import EmpiricalCovariance
# Split the inputs by their answer label (0 / 1).
npRocInput = numpy.array(rocInput)
npRocAnswers = numpy.array(rocScore)
slimNpData0 = npRocInput[npRocAnswers == 0]
slimNpData1 = npRocInput[npRocAnswers == 1]
# Covariance of the label-0 subset only.
ecv = EmpiricalCovariance()
ecv.fit(slimNpData0)

from scipy.linalg import fractional_matrix_power
def diagElements(m):
    """Return a numpy.matrix holding only the diagonal entries of `m`."""
    size = m.shape[0]
    return numpy.matrix(numpy.diag([m[i, i] for i in xrange(size)]))

def corrMat(m):
    """Return D^-1/2 * m * D^-1/2 as an array, D being the diagonal of `m`.

    NOTE(review): `*` is a matrix product only because `m` is passed in as a
    numpy.matrix (see the call below) -- confirm for any other caller.
    """
    sqrt_diag = fractional_matrix_power(diagElements(m), -0.5)
    return numpy.array(sqrt_diag * m * sqrt_diag)

# Correlation matrix of the label-0 subset.
corr0 = corrMat(numpy.matrix(ecv.covariance_))
###### Likelyhood Computation ###### # Fold the angles in params into proper range, such that # they centered at the mean. N_CYCLE_FOLD_ANGLE = 10 for j in xrange(N_CYCLE_FOLD_ANGLE): mean = np.mean(params, axis=0) for i in xrange(3, 6): # index 3,4,5 are angles, others are distances params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi if PARAMS_TLR[i] > mean[i] + np.pi: PARAMS_TLR[i] += 2 * np.pi if PARAMS_TLR[i] < mean[i] - np.pi: PARAMS_TLR[i] -= 2 * np.pi est = EmpiricalCovariance(True, False) est.fit(params) log_likelyhood = est.score(PARAMS_TLR[None, :]) KT = 0.59 free_e = -log_likelyhood * KT print 'Log likelyhood score:', log_likelyhood print 'Free energy:', free_e ###### Output the best conformer to pdb ###### def generate_bp_par_file(params, bps, out_name): assert (len(params) == len(bps)) n_bp = len(params) # convert from radians to degrees params[:, 3:] = np.degrees(params[:, 3:])
def fit(self, X):
    '''
    Fit a Gaussian copula with marginals evaluated by Gaussian KDE.

    Every column of X is mapped through its KDE-estimated CDF and then
    through the inverse standard-normal CDF; the dependence structure is
    estimated from the transformed data with the method named in
    ``self.method`` ('mle', 'glasso', 'ledoit_wolf', 'spectral', 'pc' or
    'ic').

    :param X: input dataset of shape (N, d)
    :return: None. Results are stored on the instance (``est_cov``,
        ``corr`` and ``precision_`` for the covariance-based methods,
        ``conditional_independences_`` for 'pc' and 'ic') and
        ``self.fit_structure`` is called with ``self.precision_``.
    '''
    N, d = X.shape
    if self.scaler is not None:
        X_scale = self.scaler.fit_transform(X)
    else:
        X_scale = X
    if len(self.vertexes) == 0:
        self.vertexes = [str(idx) for idx in range(d)]

    # marginals are truncated to [theta, 1 - theta] so that the inverse
    # normal CDF below stays finite
    self.theta = 1.0 / N
    # Explicit float dtype: with an integer-typed X, `zeros_like` would
    # silently truncate the CDF values written below.
    cum_marginals = np.zeros_like(X, dtype=float)
    inv_norm_cdf = np.zeros_like(X, dtype=float)
    # inv_norm_cdf_scaled = np.zeros_like(X)
    self.kernels = []
    # TODO: complexity O(Nd) is high
    if self.verbose:
        # the message used to be built but never printed
        print(colored('>> Computing marginals', color='blue'))
    for j in range(cum_marginals.shape[1]):
        self.kernels.append(gaussian_kde(X_scale[:, j]))
        # KDE mass inside the observed range, used to normalise the CDF
        cum_pdf_overall = self.kernels[-1].integrate_box_1d(
            X_scale[:, j].min(), X_scale[:, j].max())
        for i in range(cum_marginals.shape[0]):
            cum_marginals[i, j] = self.kernels[-1].integrate_box_1d(
                X_scale[:, j].min(), X_scale[i, j]) / cum_pdf_overall
            # truncate cumulative marginals
            if cum_marginals[i, j] < self.theta:
                cum_marginals[i, j] = self.theta
            elif cum_marginals[i, j] > 1 - self.theta:
                cum_marginals[i, j] = 1 - self.theta
            # inverse of normal CDF: \Phi(F_j(x))^{-1}
            inv_norm_cdf[i, j] = norm.ppf(cum_marginals[i, j])
            # scaled to preserve mean and variance: u_j + \sigma_j*\Phi(F_j(x))^{-1}
            # inv_norm_cdf_scaled[i, j] = X_scale[:, j].mean() + X_scale[:, j].std() * inv_norm_cdf[i, j]

    if self.method == 'mle':
        # maximum-likelihood estimator
        empirical_cov = EmpiricalCovariance()
        empirical_cov.fit(inv_norm_cdf)
        if self.verbose:
            print(colored('>> Running MLE to estiamte precision matrix',
                          color='blue'))
        self.est_cov = empirical_cov.covariance_
        self.corr = scale_matrix(self.est_cov)
        self.precision_ = inv(empirical_cov.covariance_)

    if self.method == 'glasso':
        if self.verbose:
            print(colored('>> Running glasso to estiamte precision matrix',
                          color='blue'))
        empirical_cov = EmpiricalCovariance()
        empirical_cov.fit(inv_norm_cdf)
        # shrunk covariance to avoid numerical instability
        shrunk_cov = shrunk_covariance(empirical_cov.covariance_,
                                       shrinkage=0.8)
        self.est_cov, self.precision_ = graph_lasso(emp_cov=shrunk_cov,
                                                    alpha=self.penalty,
                                                    verbose=self.verbose,
                                                    max_iter=self.max_iter)
        self.corr = scale_matrix(self.est_cov)

    if self.method == 'ledoit_wolf':
        if self.verbose:
            print(colored(
                '>> Running ledoit_wolf to estiamte precision matrix',
                color='blue'))
        self.est_cov, _ = ledoit_wolf(inv_norm_cdf)
        self.corr = scale_matrix(self.est_cov)
        self.precision_ = linalg.inv(self.est_cov)

    if self.method == 'spectral':
        # L2 method, from the paper "Inverse covariance estimation for high
        # dimension data in linear time and space" (formula: eq. (8)).
        if self.verbose:
            print(colored(
                '>> Running Riccati to estiamte precision matrix',
                color='blue'))
        # TODO: note estimated cov is sample cov
        self.est_cov, self.precision_ = spectral(inv_norm_cdf,
                                                 rho=2 * self.penalty,
                                                 assume_centered=False)
        self.corr = scale_matrix(self.est_cov)

    if self.method == 'pc':
        clf = pgmlearner.PGMLearner()
        # one {vertex name: value} dict per sample
        data_list = []
        for row_id in range(X_scale.shape[0]):
            instance = dict()
            for i, n in enumerate(self.vertexes):
                instance[n] = X_scale[row_id, i]
            data_list.append(instance)
        graph = clf.lg_constraint_estimatestruct(data=data_list,
                                                 pvalparam=self.pval,
                                                 bins=self.bins)
        dag = np.zeros(shape=(len(graph.V), len(graph.V)))
        for e in graph.E:
            dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1
        self.conditional_independences_ = dag

    if self.method == 'ic':
        df = dict()
        variable_types = dict()
        for j in range(X_scale.shape[1]):
            df[self.vertexes[j]] = X_scale[:, j]
            variable_types[self.vertexes[j]] = 'c'
        data = pd.DataFrame(df)
        # run the search
        ic_algorithm = IC(RobustRegressionTest, data, variable_types,
                          alpha=self.pval)
        graph = ic_algorithm.search()
        dag = np.zeros(shape=(X_scale.shape[1], X_scale.shape[1]))
        for e in graph.edges(data=True):
            i = self.vertexes.index(e[0])
            j = self.vertexes.index(e[1])
            # start from an undirected edge, then clear one direction when
            # the edge carries exactly one arrow head
            dag[i, j] = 1
            dag[j, i] = 1
            arrows = set(e[2]['arrows'])
            head_len = len(arrows)
            if head_len > 0:
                head = arrows.pop()
                if head_len == 1 and head == e[0]:
                    dag[i, j] = 0
                if head_len == 1 and head == e[1]:
                    dag[j, i] = 0
        self.conditional_independences_ = dag

    # finally we fit the structure
    # NOTE(review): the 'pc' and 'ic' branches above do not assign
    # `self.precision_`; confirm it is set elsewhere for those methods.
    self.fit_structure(self.precision_)
def optimalShrinkage(X, return_covariance=False):
    """This function computes a cleaned, optimal shrinkage,
       rotationally-invariant estimator (RIE) of the true correlation
       matrix C underlying the noisy, in-sample estimate
       E = 1/T X * transpose(X)
       associated to a design matrix X of shape (T, N) (T measurements
       and N features).

       One approach to getting a cleaned estimator that predates the
       optimal shrinkage, RIE estimator consists in inverting the
       Marcenko-Pastur equation so as to replace the eigenvalues
       from the spectrum of E by an estimation of the true ones.

       This approach is known to be numerically-unstable, in addition
       to failing to account for the overlap between the sample eigenvectors
       and the true eigenvectors. How to compute such overlaps was first
       explained by Ledoit and Peche (cf. reference below). Their procedure
       was extended by Bun, Bouchaud and Potters, who also correct
       for a systematic downward bias in small eigenvalues.

       It is this debiased, optimal shrinkage, rotationally-invariant
       estimator that the function at hand implements.

       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).

       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.
           Any non-bool value terminates the program with exit status 1.

       Returns
       -------
       E_RIE: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C. A sample
           estimator of C is the empirical covariance matrix E
           estimated from X. E is corrupted by in-sample noise.
           E_RIE is the optimal shrinkage, rotationally-invariant estimator
           (RIE) of C computed following the procedure of Joel Bun
           and colleagues (cf. references below).

           If return_covariance=True, E_RIE corresponds to a cleaned
           variance-covariance matrix.

       References
       ----------
       * "Eigenvectors of some large sample covariance matrix ensembles",
         O. Ledoit and S. Peche
         Probability Theory and Related Fields, Vol. 151 (1), pp 233-264
       * "Rotational invariant estimator for general noisy matrices",
         J. Bun, R. Allez, J.-P. Bouchaud and M. Potters
         arXiv: 1502.06736 [cond-mat.stat-mech]
       * "Cleaning large Correlation Matrices: tools from Random Matrix Theory",
         J. Bun, J.-P. Bouchaud and M. Potters
         arXiv: 1610.08104 [cond-mat.stat-mech]
    """
    # Explicit check rather than `assert`, which is stripped under
    # `python -O`; the outcome on bad input (exit status 1) is unchanged.
    if not isinstance(return_covariance, bool):
        sys.exit(1)

    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T

    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_

    if return_covariance:
        # turn the covariance into a correlation matrix (in place); the
        # standard deviations are re-applied at the very end
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T  # one eigenvector per row

    q = N / float(T)
    lambda_N = eigvals[0]  # The smallest empirical eigenvalue,
                           # given that the function used to compute
                           # the spectrum of a Hermitian or symmetric
                           # matrix - namely np.linalg.eigh - returns
                           # the eigenvalues in ascending order.

    xis = [xiHelper(x, q, E) for x in eigvals]
    Gammas = [gammaHelper(x, q, N, lambda_N) for x in eigvals]
    # the correction factor is applied only where it exceeds 1
    xi_hats = [a * b if b > 1 else a for a, b in zip(xis, Gammas)]

    # rebuild the matrix from the cleaned eigenvalues and the sample
    # eigenvectors
    E_RIE = np.zeros((N, N), dtype=float)
    for xi_hat, eigvec in zip(xi_hats, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_RIE += xi_hat * eigvec.dot(eigvec.T)

    # rescale so that the diagonal is exactly one
    tmp = 1./np.sqrt(np.diag(E_RIE))
    E_RIE *= tmp
    E_RIE *= tmp.reshape(-1, 1)

    if return_covariance:
        std = 1./inverse_std
        E_RIE *= std
        E_RIE *= std.reshape(-1, 1)

    return E_RIE
def rand_pts_overall_cov_init(X, n_components, cov_est_method='LW',
                              covariance_type='full', random_state=None):
    """
    Sets the means to randomly selected points. Sets the covariances to the
    overall covariance matrix.

    Parameters
    ----------
    X: (n_samples, n_features)

    n_components: int

    cov_est_method: str
        Must be one of ['empirical', 'LW', 'OAS'] for empirical covariance
        matrix estimate, LedoitWolf and Oracle Approximating Shrinkage
        Estimator. See sklearn.covariance for details.

    covariance_type: str
        Must be one of ['full', 'diag', 'tied', 'spherical']. Determines
        the layout of the returned covariances (see Returns).

    random_state: None, int, random seed
        Random seed.

    Returns
    -------
    means: array, (n_components, n_features)
        Rows of X drawn without replacement.

    covs: array
        The overall covariance estimate laid out per `covariance_type`:
        'tied' (n_features, n_features);
        'full' (n_components, n_features, n_features);
        'diag' (n_components, n_features), the diagonal repeated;
        'spherical' (n_components, ), the mean variance repeated.
    """
    assert cov_est_method in ['empirical', 'LW', 'OAS'], \
        "cov_est_method must be one of 'empirical', 'LW', 'OAS'"
    assert covariance_type in ['full', 'diag', 'tied', 'spherical'], \
        "covariance_type must be one of 'full', 'diag', 'tied', 'spherical'"

    n_samples = X.shape[0]

    rng = check_random_state(random_state)

    # estimate global covariance
    if cov_est_method == 'empirical':
        cov_estimator = EmpiricalCovariance(store_precision=False)
    elif cov_est_method == 'LW':
        cov_estimator = LedoitWolf(store_precision=False)
    elif cov_est_method == 'OAS':
        cov_estimator = OAS(store_precision=False)

    cov_estimator.fit(X)
    cov_est = cov_estimator.covariance_

    # set covariance matrix for each cluster
    if covariance_type == 'tied':
        covs = cov_est
    elif covariance_type == 'full':
        covs = np.stack([cov_est for _ in range(n_components)])
    elif covariance_type == 'diag':
        # each component gets the diagonal of the estimated covariance matrix
        covs = np.diag(cov_est)
        covs = np.repeat(covs.reshape(1, -1), repeats=n_components, axis=0)
    elif covariance_type == 'spherical':
        # each component gets the average of the variances
        covs = np.diag(cov_est).mean()
        covs = np.repeat(covs, repeats=n_components)

    # set means to random data points (drawn after the covariance estimate,
    # so the random stream is consumed in the same order as before)
    rand_idxs = rng.choice(range(n_samples), replace=False, size=n_components)
    means = np.array([X[pt_idx, ] for pt_idx in rand_idxs])

    return means, covs
# save for heuristic correction
age = df_test['var15']
# replace var15 by its empirical CDF value, estimated on the training set
age_ecdf = ECDF(df_train['var15'])
df_train['var15'] = age_ecdf(df_train['var15'])
df_test['var15'] = age_ecdf(df_test['var15'])

# feature engineering
df_train.loc[df_train['var3'] == -999999.000000, 'var3'] = 2.0
df_train['num_zeros'] = (df_train == 0).sum(axis=1)
# The test-set sentinel must be located in the test set itself; the mask
# used to be computed from df_train, whose rows do not correspond to
# df_test's rows.
df_test.loc[df_test['var3'] == -999999.000000, 'var3'] = 2.0
df_test['num_zeros'] = (df_test == 0).sum(axis=1)

# outliers: drop training rows (and their targets) whose squared
# Mahalanobis distance is 40000 or more
ec = EmpiricalCovariance()
ec = ec.fit(df_train)
m2 = ec.mahalanobis(df_train)
inlier_mask = m2 < 40000
df_train = df_train[inlier_mask]
df_target = df_target[inlier_mask]

# clip
# df_test = df_test.clip(df_train.min(), df_train.max(), axis=1)

# standard preprocessing
prep = Pipeline([
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler())
])

X_train = prep.fit_transform(df_train)
X_test = prep.transform(df_test)