def lw(data, alphas):
    """
    Estimates the graph with the Ledoit-Wolf estimator.

    Parameters
    ----------
    data: numpy ndarray
        The input data to reconstruct/estimate a graph on.
        Features as columns and observations as rows.
    alphas: float
        The threshold on the precision matrix to determine edges.

    Returns
    -------
    adjacency matrix : the estimated adjacency matrix.
    """
    alpha = alphas
    # Standardize the data before estimating the covariance
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    cov = LedoitWolf().fit(data)
    precision_matrix = cov.get_precision()
    n_features, _ = precision_matrix.shape
    # Threshold the precision matrix to obtain edges
    mask1 = np.abs(precision_matrix) > alpha
    mask0 = np.abs(precision_matrix) <= alpha
    adjacency_matrix = np.zeros((n_features, n_features))
    adjacency_matrix[mask1] = 1
    adjacency_matrix[mask0] = 0
    # No self-loops
    adjacency_matrix[np.diag_indices_from(adjacency_matrix)] = 0
    return adjacency_matrix
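# A minimal usage sketch for the lw() graph estimator above. The synthetic data,
# the 0.1 threshold and the repeated imports are illustrative assumptions, not
# values taken from the original code.
import numpy as np
from sklearn.covariance import LedoitWolf
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(200, 10))     # observations as rows, features as columns
adjacency = lw(X_demo, alphas=0.1)      # edge wherever |precision entry| exceeds the threshold
print(adjacency.shape, int(adjacency.sum()), "edges")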
def maximization(self):
    # mean maximization
    for i in range(self._K):
        mu[i] = mu_ss[i] / ndata_ss

    # covariance maximization
    for i in range(self._K):
        for j in range(self._K):
            cov[i, j] = (1.0 / ndata_ss) * cov_ss[i, j] + ndata_ss * mu[i] * mu[j] \
                - mu_ss[i] * mu[j] - mu_ss[j] * mu[i]

    # covariance shrinkage (assume_centered is a constructor argument of
    # LedoitWolf, not a fit() argument)
    lw = LedoitWolf(assume_centered=True)
    cov_result = lw.fit(cov).covariance_
    inv_cov = np.linalg.inv(cov_result)
    log_det_inv_cov = np.log(np.linalg.det(inv_cov))

    # topic maximization
    for i in range(self._K):
        sum_m = 0
        for j in range(self._W):
            sum_m += beta_ss[i, j]
        if sum_m == 0:
            sum_m = -1000 * self._W
        else:
            sum_m = np.log(sum_m)
        for j in range(self._W):
            # normalize in log space: log(beta) - log(row sum)
            log_beta[i, j] = np.log(beta_ss[i, j]) - sum_m
def test_ledoit_wolf_small(): # Compare our blocked implementation to the naive implementation X_small = X[:, :4] lw = LedoitWolf() lw.fit(X_small) shrinkage_ = lw.shrinkage_ assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small))
def LW_est(X): ''' Ledoit-Wolf optimal shrinkage coefficient estimate X_size = (n_samples, n_features) ''' lw = LedoitWolf() cov_lw = lw.fit(X).covariance_ return cov_lw
def covarianceEstimation(daily_returns, cov_estimator):
    lw = LedoitWolf()
    if cov_estimator == "shrinkage":
        return lw.fit(daily_returns).covariance_
    elif cov_estimator == "empirical":
        return daily_returns.cov()
    elif cov_estimator == "multifactor":
        # FIXME
        return None
    else:
        raise Exception("cov_estimator must be one of [shrinkage, empirical, multifactor]")
def shrinked_covariance(returns, price_data=False, shrinkage_type='basic', assume_centered=False, basic_shrinkage=0.1): """ Calculates the Covariance estimator with shrinkage for a dataframe of asset prices or returns. This function allows three types of shrinkage - Basic, Ledoit-Wolf and Oracle Approximating Shrinkage. It is a wrap of the sklearn's ShrunkCovariance, LedoitWolf and OAS classes. According to the scikit-learn User Guide on Covariance estimation: "Sometimes, it even occurs that the empirical covariance matrix cannot be inverted for numerical reasons. To avoid such an inversion problem, a transformation of the empirical covariance matrix has been introduced: the shrinkage. Mathematically, this shrinkage consists in reducing the ratio between the smallest and the largest eigenvalues of the empirical covariance matrix". Link to the documentation: <https://scikit-learn.org/stable/modules/covariance.html>`_ If a dataframe of prices is given, it is transformed into a dataframe of returns using the calculate_returns method from the ReturnsEstimators class. :param returns: (pd.DataFrame) Dataframe where each column is a series of returns or prices for an asset. :param price_data: (bool) Flag if prices of assets are used and not returns. (False by default) :param shrinkage_type: (str) Type of shrinkage to use. (``basic`` by default, ``lw``, ``oas``, ``all``) :param assume_centered: (bool) Flag for data with mean almost, but not exactly zero. (Read documentation for chosen shrinkage class, False by default) :param basic_shrinkage: (float) Between 0 and 1. Coefficient in the convex combination for basic shrinkage. (0.1 by default) :return: (np.array) Estimated covariance matrix. Tuple of covariance matrices if shrinkage_type = ``all``. """ # Calculating the series of returns from series of prices if price_data: # Class with returns calculation function ret_est = ReturnsEstimators() # Calculating returns returns = ret_est.calculate_returns(returns) # Calculating the covariance matrix for the chosen method if shrinkage_type == 'basic': cov_matrix = ShrunkCovariance(assume_centered=assume_centered, shrinkage=basic_shrinkage).fit( returns).covariance_ elif shrinkage_type == 'lw': cov_matrix = LedoitWolf(assume_centered=assume_centered).fit(returns).covariance_ elif shrinkage_type == 'oas': cov_matrix = OAS(assume_centered=assume_centered).fit(returns).covariance_ else: cov_matrix = ( ShrunkCovariance(assume_centered=assume_centered, shrinkage=basic_shrinkage).fit(returns).covariance_, LedoitWolf(assume_centered=assume_centered).fit(returns).covariance_, OAS(assume_centered=assume_centered).fit(returns).covariance_) return cov_matrix
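# A minimal usage sketch for shrinked_covariance() above. The synthetic returns
# DataFrame and the asset names are illustrative assumptions; any of 'basic',
# 'lw', 'oas' or 'all' can be passed as shrinkage_type.
import numpy as np
import pandas as pd

demo_returns = pd.DataFrame(
    np.random.RandomState(0).normal(0.0, 0.01, size=(250, 3)),
    columns=['asset_a', 'asset_b', 'asset_c'])
cov_lw = shrinked_covariance(demo_returns, price_data=False, shrinkage_type='lw')
cov_all = shrinked_covariance(demo_returns, shrinkage_type='all')  # (basic, lw, oas) tuple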
def similarity_measure_mahalanobis(ds_tar, ds_src, results, p_value=0.95):
    print('Computing Mahalanobis similarity...')
    # TODO: The function parameters must be the two datasets,
    # TODO: src is the one with parameter calculation, second is the similarity one

    # Get classifier from results
    classifier = results['fclf']

    # Make prediction on training set, to understand data distribution
    ## TODO: Evaluate if it is correct!
    classifier_predictions_src = classifier.predict(ds_src)
    prediction_mask = np.array(classifier_predictions_src) == ds_src.targets
    example_dist = dict()

    # Extract feature selected from each dataset
    if isinstance(classifier, FeatureSelectionClassifier):
        f_selection = results['fclf'].mapper
        ds_tar = f_selection(ds_tar)
        ds_src = f_selection(ds_src)

    '''
    Get class distribution information: mean and covariance
    '''
    for label in np.unique(ds_src.targets):
        # Get examples correctly classified
        mask = ds_src.targets == label
        example_dist[label] = dict()
        true_ex = ds_src.samples[mask * prediction_mask]

        # Get Mean and Covariance to draw the distribution
        # We evaluate mean and cov only on well-classified examples
        mean_ = np.mean(true_ex, axis=0)
        example_dist[label]['mean'] = mean_

        print('Estimation of covariance matrix for ' + label + ' class...')
        print(true_ex.shape)
        try:
            #cov_ = MinCovDet().fit(true_ex)
            cov_ = LedoitWolf().fit(true_ex)
            #cov_ = EmpiricalCovariance().fit(true_ex)
            #cov_ = GraphLasso(alpha=0.5).fit(true_ex)
            #cov_ = OAS(alpha=0.1).fit(true_ex)
        except MemoryError as err:
            print('Method is LedoitWolf')
            cov_ = LedoitWolf(block_size=15000).fit(true_ex)

        example_dist[label]['i_cov'] = cov_.precision_
        print('Inverted covariance estimated...')
def partial_corrconn(activity_matrix, estimator='EmpiricalCovariance', target_ts=None):
    """
    activity_matrix: Activity matrix should be nodes X time
    target_ts: Optional, used when only a single target time series (returns 1 X nnodes matrix)
    estimator: either 'EmpiricalCovariance' (the default) or 'LedoitWolf' for
        partial correlation with Ledoit-Wolf shrinkage

    Output: connectivity_mat, formatted targets X sources

    Credit goes to nilearn connectivity_matrices.py which contains code that was
    simplified for this use.
    """
    nnodes = activity_matrix.shape[0]
    timepoints = activity_matrix.shape[1]
    if nnodes > timepoints:
        print('activity_matrix shape: ', np.shape(activity_matrix))
        raise Exception('More nodes (regressors) than timepoints! Use regularized regression')
    if 2 * nnodes > timepoints:
        print('activity_matrix shape: ', np.shape(activity_matrix))
        print('Consider using a shrinkage method')

    if target_ts is None:
        connectivity_mat = np.zeros((nnodes, nnodes))
        # calculate covariance (compare strings with ==, not identity)
        if estimator == 'LedoitWolf':
            cov_estimator = LedoitWolf(store_precision=False)
        elif estimator == 'EmpiricalCovariance':
            cov_estimator = EmpiricalCovariance(store_precision=False)
        covariance = cov_estimator.fit(activity_matrix.T).covariance_
        # calculate precision
        precision = linalg.inv(covariance)
        # precision to partial corr
        diagonal = np.atleast_2d(1. / np.sqrt(np.diag(precision)))
        correlation = precision * diagonal * diagonal.T
        # Force exact 0. on diagonal
        np.fill_diagonal(correlation, 0.)
        connectivity_mat = -correlation
    else:
        # Computing values for a single target node
        connectivity_mat = np.zeros((nnodes, 1))
        X = activity_matrix.T
        y = target_ts
        # Note: LinearRegression fits intercept by default (intercept beta not included in coef_ output)
        reg = LinearRegression().fit(X, y)
        connectivity_mat = reg.coef_
    return connectivity_mat
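# A small self-contained sketch of the precision-to-partial-correlation step used in
# partial_corrconn() above; the 3-node random time series is an illustrative assumption.
import numpy as np
from scipy import linalg
from sklearn.covariance import LedoitWolf

ts_demo = np.random.RandomState(1).normal(size=(3, 500))           # nodes x time
precision_demo = linalg.inv(LedoitWolf().fit(ts_demo.T).covariance_)
d = np.atleast_2d(1. / np.sqrt(np.diag(precision_demo)))
partial_corr_demo = -(precision_demo * d * d.T)                     # negated, normalized precision
np.fill_diagonal(partial_corr_demo, 0.)                              # zero the diagonal as above
print(partial_corr_demo)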
def test_ledoit_wolf_large(): # test that ledoit_wolf doesn't error on data that is wider than block_size rng = np.random.RandomState(0) # use a number of features that is larger than the block-size X = rng.normal(size=(10, 20)) lw = LedoitWolf(block_size=10).fit(X) # check that covariance is about diagonal (random normal noise) assert_almost_equal(lw.covariance_, np.eye(20), 0) cov = lw.covariance_ # check that the result is consistent with not splitting data into blocks. lw = LedoitWolf(block_size=25).fit(X) assert_almost_equal(lw.covariance_, cov)
def __init__(self, k=2, gamma=1.0, covariance_estimator='ledoit-wolf'): self.k = float(k) self.gamma = gamma self.covariance_estimator = covariance_estimator if covariance_estimator == 'empirical': self.cov = EmpiricalCovariance(store_precision=False) elif covariance_estimator == 'ledoit-wolf': self.cov = LedoitWolf(store_precision=False) else: raise NotImplementedError('%s is not implemented' % covariance_estimator) self.x0 = None self.x1 = None
def max_IC_weight(ic_df, factors_dict, holding_period, covariance_type="shrink"):
    """
    Given ic_df (a matrix of IC time series), a specified holding period and rolling
    window, returns the corresponding multi-factor portfolio weights.

    :param factors_dict: dictionary (dict) of factors, of the form:
        {"factor_name_1": factor_1, "factor_name_2": factor_2}
        Each factor value is a pd.DataFrame indexed by date, with assets as columns.
    :param ic_df: matrix of IC time series (pd.DataFrame), indexed by datetime,
        with the factor names as columns. For example:

                            BP       CFP        EP  ILLIQUIDITY    REVS20      SRMI     VOL20
        date
        2016-06-24    0.165260  0.002198  0.085632    -0.078074  0.173832  0.214377  0.068445
        2016-06-27    0.165537  0.003583  0.063299    -0.048674  0.180890  0.202724  0.081748
        2016-06-28    0.135215  0.010403  0.059038    -0.034879  0.111691  0.122554  0.042489
        2016-06-29    0.068774  0.019848  0.058476    -0.049971  0.042805  0.053339  0.079592
        2016-06-30    0.039431  0.012271  0.037432    -0.027272  0.010902  0.077293 -0.050667

    :param holding_period: holding period (int)
    :param covariance_type: "shrink"/"simple", how the covariance matrix is estimated:
        Ledoit-Wolf shrinkage estimate or simple sample estimate
    :return: weight_df: factor weights (pd.DataFrame) obtained with the chosen covariance
        estimation method, indexed by datetime, with the names of the factors to be
        combined as columns.
    """
    weight_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
    lw = LedoitWolf()

    # Maximizing day t's IC uses data up to t + holding_period (to compute returns);
    # the resulting weights are applied to the factors at t + holding_period.
    for dt in ic_df.index:
        f_dt = pd.concat([
            factors_dict[factor_name].loc[dt] for factor_name in ic_df.columns
        ], axis=1).dropna()
        if len(f_dt) == 0:
            continue
        if covariance_type == "shrink":
            try:
                f_cov_mat = lw.fit(f_dt.values).covariance_
            except Exception:
                f_cov_mat = np.mat(np.cov(f_dt.T.values).astype(float))
        else:
            f_cov_mat = np.mat(np.cov(f_dt.T.values).astype(float))
        inv_f_cov_mat = np.linalg.inv(f_cov_mat)
        weight = inv_f_cov_mat * np.mat(ic_df.loc[dt].values).reshape(
            len(inv_f_cov_mat), 1)
        weight = np.array(weight.reshape(len(weight), ))[0]
        weight_df.loc[dt] = weight / np.sum(np.abs(weight))
    return weight_df.shift(holding_period)
def maximization(self):
    '''
    M-step of the EM algorithm; uses scikit-learn's LedoitWolf method to perform
    covariance matrix shrinkage.

    Arguments: sufficient statistics, i.e. model parameters
    Returns: the updated sufficient statistics; everything is stored on self,
        so there are no return values
    '''
    logger.info("running maximization function")

    logger.info("mean maximization")
    mu = np.divide(self.mu, self.ndata)

    logger.info("covariance maximization")
    for i in range(self._K):
        for j in range(self._K):
            self.cov[i, j] = (1.0 / self.ndata) * self.cov[i, j] + self.ndata * mu[i] * mu[j] \
                - self.mu[i] * mu[j] - self.mu[j] * mu[i]

    logger.info("performing covariance shrinkage using sklearn module")
    # assume_centered is a constructor argument of LedoitWolf, not a fit() argument
    lw = LedoitWolf(assume_centered=True)
    cov_result = lw.fit(self.cov).covariance_
    self.inv_cov = np.linalg.inv(cov_result)
    self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

    logger.info("topic maximization")
    for i in range(self._K):
        # normalizer for topic i: log of the row sum of beta
        sum_m = np.sum(self.beta[i, :])
        if sum_m == 0:
            sum_m = -1000 * self._W
        else:
            sum_m = np.log(sum_m)
        for j in range(self._W):
            # normalize in log space: log(beta) - log(row sum)
            self.log_beta[i, j] = math_utli.safe_log(self.beta[i, j]) - sum_m

    logger.info("write model parameters to file")
    logger.info("write gaussian")
    with open('ctm_nu', 'w') as ctm_nu_dump:
        cPickle.dump(self.nu, ctm_nu_dump)
    with open('ctm_cov', 'w') as ctm_cov_dump:
        cPickle.dump(self.cov, ctm_cov_dump)
    with open('ctm_inv_cov', 'w') as ctm_inv_cov_dump:
        cPickle.dump(self.inv_cov, ctm_inv_cov_dump)
    with open('ctm_log_det_inv_cov', 'w') as ctm_log_det_inv_cov_dump:
        cPickle.dump(self.log_det_inv_cov, ctm_log_det_inv_cov_dump)
    logger.info("write topic matrix")
    with open('ctm_log_beta', 'w') as ctm_log_beta_dump:
        cPickle.dump(self.log_beta, ctm_log_beta_dump)
def estimatorLedoitWolf(self): #remove Date column for this function trimmedData = self.data.drop('Date', axis=1) cov = LedoitWolf().fit(trimmedData).covariance_ #centers the data assert cov.shape == self.expectedCovShape self.cov = cov return self.cov
def filter_W_fromVcv(vcv, variance_perc=1.0): '''vcv is a filtered value for the Vcv, with shapes T,N,N. It filters init_W,init_df that are the initial distribution parameters for W's posterior. W is the diffusion matrix of the components of the cholesky-decomposition of vcv. It filters also init_vcv_std, the standard deviations of this components' posteriors. ''' [T, N, _] = vcv.shape num_tril = int(N * (N + 1) / 2) chol_vcv = np.zeros([T, int(N * (N + 1) / 2)]) ind = indexes_librarian(N) for t in range(T): cvcv = np.linalg.cholesky(vcv[t]) chol_vcv[t, ind.spiral_diag] = inv_softplus(cvcv[ind.diag[0], ind.diag[1]]) chol_vcv[t, ind.spiral_udiag] = cvcv[ind.udiag[0], ind.udiag[1]] cov = LedoitWolf().fit(chol_vcv[1:, :] - chol_vcv[:-1, :]) init_W = cov.covariance_ try: np.linalg.cholesky(init_W) except: #adds a constant term if init_W is singular print('W resulted singular, a correction term (I*1e-4) is added') init_W += np.eye(num_tril) * 1e-4 init_df = np.max([4 * num_tril / variance_perc, num_tril]) init_W *= 2 init_vcv_std = np.abs(chol_vcv) * 0.1 / N * variance_perc #init_vcv_std=np.tile(np.reshape(np.abs(vcv).mean(axis=0),[1,N,N]),[T,1,1])/np.sqrt(N)*variance_perc return np.float32(init_W), np.float32(init_df), np.float32(init_vcv_std)
def query_samples_and_probabilities(pydc, query, evidence, var, std=False):
    pydc.queryWithSamples(NUM_SAMPLES, query, evidence, var, FLAG, BIGNUM)
    parsed_samples = ast.literal_eval(pydc.samples)
    values = []
    weights = []
    for sample in parsed_samples:
        x, w = sample[0], sample[1]
        values += [x]
        weights += [w]
    values, weights = np.array(values), np.array(weights)
    if std:
        avg, std = weighted_avg_and_std(values, weights)
        return avg, std
    else:
        #values = values + 1e-5*np.random.rand(*values.shape)
        avg, cov = weighted_avg_and_cov(values, weights)
        print(avg)
        #X = np.random.multivariate_normal(mean=avg,cov=cov,size=100)
        #shcov = LedoitWolf().fit(X)
        # assert cov is positive-semidefinite; if not, resample and shrink
        try:
            assert(np.all(np.linalg.eigvals(cov) >= 0))
        except AssertionError:
            X = np.random.multivariate_normal(mean=avg, cov=cov, size=100)
            shcov = LedoitWolf().fit(X)
            avg, cov = shcov.location_, shcov.covariance_
            #assert(np.all(np.linalg.eigvals(cov) >= 0))
        return (avg, cov)
def _simulate_covariance(mu_vector, cov_matrix, num_obs, lw_shrinkage=False): """ Derives an empirical vector of means and an empirical covariance matrix. Based on the set of true means vector and covariance matrix of X distributions, the function generates num_obs observations for every X. Based on these observations simulated vector of means and the simulated covariance matrix are obtained. :param mu_vector: (np.array) True means vector for X distributions :param cov_matrix: (np.array) True covariance matrix for X distributions :param num_obs: (int) Number of observations to draw for every X :param lw_shrinkage: (bool) Flag to apply Ledoit-Wolf shrinkage to X (False by default) :return: (np.array, np.array) Empirical means vector, empirical covariance matrix """ # Generating a matrix of num_obs observations for X distributions observations = np.random.multivariate_normal(mu_vector.flatten(), cov_matrix, size=num_obs) # Empirical means vector calculation mu_simulated = observations.mean(axis=0).reshape(-1, 1) if lw_shrinkage: # If applying Ledoit-Wolf shrinkage cov_simulated = LedoitWolf().fit(observations).covariance_ else: # Simple empirical covariance matrix cov_simulated = np.cov(observations, rowvar=False) return mu_simulated, cov_simulated
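# A hedged usage sketch for _simulate_covariance() above. The 2x2 true mean vector,
# true covariance matrix and observation count are illustrative assumptions only.
import numpy as np

true_mu = np.array([[0.05], [0.02]])
true_cov = np.array([[0.04, 0.01],
                     [0.01, 0.09]])
mu_hat, cov_hat = _simulate_covariance(true_mu, true_cov, num_obs=1000, lw_shrinkage=True)
print(mu_hat.ravel(), cov_hat)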
def compute_connectivity_subject(conn, masker, func, confound=None): """ Returns connectivity of one fMRI for a given atlas """ ts = do_mask_img(masker, func, confound) if conn == 'gl': fc = GraphLassoCV(max_iter=1000) elif conn == 'lw': fc = LedoitWolf() elif conn == 'oas': fc = OAS() elif conn == 'scov': fc = ShrunkCovariance() fc = Bunch(covariance_=0, precision_=0) if conn == 'corr' or conn == 'pcorr': fc = Bunch(covariance_=0, precision_=0) fc.covariance_ = np.corrcoef(ts) fc.precision_ = partial_corr(ts) else: fc.fit(ts) ind = np.tril_indices(ts.shape[1], k=-1) return fc.covariance_[ind], fc.precision_[ind]
def weight_opt(returns,benchmark, lower = 0, upper = 1, ph=2**7, cov_method='sample', seed = 123): np.random.seed(seed) n_asset, n_sample = returns.shape rets = np.asmatrix(returns) #N = 10 #phs = [2**(t-2) for t in range(N)] # Convert to cvxopt matrices if cov_method == 'sample': Cov = opt.matrix(np.cov(rets,benchmark)) elif cov_method == 'lw': Cov = opt.matrix(LedoitWolf().fit(np.append(np.transpose(rets),benchmark.reshape(n_sample,1), axis=1)).covariance_) else: raise ValueError('cov_method should be in {}'.format({'sample', 'lw'})) S = Cov[:n_asset,:n_asset] r_mean = opt.matrix(np.nanmean(rets, axis=1)) # n*1 Cb = Cov[:n_asset,n_asset] # Create constraint matrices G = opt.matrix(np.append(np.eye(n_asset),-np.eye(n_asset),axis = 0)) # 2n x n identity matrix h = opt.matrix(np.append(upper*np.ones((n_asset,1)),-lower*np.ones((n_asset,1)),axis = 0)) A = opt.matrix(1.0, (1, n_asset)) b = opt.matrix(1.0) # Calculate efficient frontier weights using quadratic programming x = solvers.qp(ph*S, -ph*Cb-r_mean, G, h, A, b)['x'] #portfolios = [solvers.qp(ph*S, -ph*Cb-r_mean, G, h, A, b)['x'] # for ph in phs] # CALCULATE RISKS AND RETURNS FOR FRONTIER ret = blas.dot(r_mean, x) #[blas.dot(r_mean, x) for x in portfolios] errors = blas.dot(x, S*x)+Cov[n_asset,n_asset]-2*blas.dot(Cb,x) #[blas.dot(x, S*x)+Cov[n_asset,n_asset]-2*blas.dot(Cb,x) for x in portfolios] return np.transpose(np.array(x))[0], ret, errors#, ret_opt, risk_opt
def compute_network_connectivity_subject(conn, func, masker, rois): """ Returns connectivity of one fMRI for a given atlas """ ts = masker.fit_transform(func) ts = np.asarray(ts)[:, rois] if conn == 'gl': fc = GraphLassoCV(max_iter=1000) elif conn == 'lw': fc = LedoitWolf() elif conn == 'oas': fc = OAS() elif conn == 'scov': fc = ShrunkCovariance() fc = Bunch(covariance_=0, precision_=0) if conn == 'corr' or conn == 'pcorr': fc = Bunch(covariance_=0, precision_=0) fc.covariance_ = np.corrcoef(ts) fc.precision_ = partial_corr(ts) else: fc.fit(ts) ind = np.tril_indices(ts.shape[1], k=-1) return fc.covariance_[ind], fc.precision_[ind]
def test_lda_predict(): # Test LDA classification. # This checks that LDA implements fit and predict and returns correct # values for simple toy data. for test_case in solver_shrinkage: solver, shrinkage = test_case clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage) y_pred = clf.fit(X, y).predict(X) assert_array_equal(y_pred, y, "solver %s" % solver) # Assert that it works with 1D data y_pred1 = clf.fit(X1, y).predict(X1) assert_array_equal(y_pred1, y, "solver %s" % solver) # Test probability estimates y_proba_pred1 = clf.predict_proba(X1) assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver) y_log_proba_pred1 = clf.predict_log_proba(X1) assert_allclose( np.exp(y_log_proba_pred1), y_proba_pred1, rtol=1e-6, atol=1e-6, err_msg="solver %s" % solver, ) # Primarily test for commit 2f34950 -- "reuse" of priors y_pred3 = clf.fit(X, y3).predict(X) # LDA shouldn't be able to separate those assert np.any(y_pred3 != y3), "solver %s" % solver clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto") with pytest.raises(NotImplementedError): clf.fit(X, y) clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=0.1, covariance_estimator=ShrunkCovariance()) with pytest.raises( ValueError, match=("covariance_estimator and shrinkage " "parameters are not None. " "Only one of the two can be set."), ): clf.fit(X, y) # test bad solver with covariance_estimator clf = LinearDiscriminantAnalysis(solver="svd", covariance_estimator=LedoitWolf()) with pytest.raises(ValueError, match="covariance estimator is not supported with svd"): clf.fit(X, y) # test bad covariance estimator clf = LinearDiscriminantAnalysis(solver="lsqr", covariance_estimator=KMeans( n_clusters=2, n_init="auto")) with pytest.raises(ValueError): clf.fit(X, y)
def __call__(self, train_list, rest_list, clear_after_use=False): print("Apply Whitening...") if clear_after_use: self.sigma_neg_sqrt = None self.shrinkage_parameter = None if self.sigma_neg_sqrt is None: train_stacked = np.concatenate([d.x for d in train_list], axis=0) # Fit LedoitWolf for covariance estimation lw = LedoitWolf().fit(train_stacked) self.shrinkage_parameter = lw.shrinkage_ print(" Estimated shrinkage-parameter={:.3f}".format( self.shrinkage_parameter)) # estimated covariance matrix sigma = lw.covariance_ # eigenvalue decomposition eig_values, eig_vectors = np.linalg.eig(sigma) # negative square root of eigenvalues eig_values_neg_sqrt = np.diag(1 / np.sqrt(eig_values + self.eps)) # negative square root of sigma self.sigma_neg_sqrt = np.dot( np.dot(eig_vectors, eig_values_neg_sqrt), eig_vectors.T) def tensor_whiten(data): x = data.x x = np.dot(x, self.sigma_neg_sqrt) return RawData.create_from_ref(data, x=x) return self.transform(tensor_whiten, train_list, rest_list)
def postProcessing(nifti_file, subject_key, spheres_masker): """Perform post processing param nifti_file: string. path to the nifty file param subject_key: string. subject's key return: dictionary raw. key: subject's key . value: {"time_series" : matrix of time series (time_points,rois), "covariance" : covariance matrix of atlas rois (rois, rois), "correlation" : correlation matrix of atlas rois (rois, rois)} """ try: print("subject_key: " + subject_key) print("Extract timeseries") # Extract the time series print(nifti_file) timeseries = spheres_masker.fit_transform(nifti_file, confounds=None) print("Extract covariance matrix") cov_measure = ConnectivityMeasure(cov_estimator=LedoitWolf( assume_centered=False, block_size=1000, store_precision=False), kind='covariance') cov = [] cor = [] cov = cov_measure.fit_transform([timeseries])[0, :, :] print("Extract correlation matrix") cor = nilearn.connectome.cov_to_corr(cov) except: raise Exception("subject_key: %s \n" % subject_key + traceback.format_exc()) return (subject_key, { "time_series": timeseries, "covariance": cov, "correlation": cor })
def __init__(self, sharpes, returns): """ Initialize AuthorModelBuilder object. Parameters ---------- sharpes : pd.DataFrame Long-format DataFrame of in-sample Sharpe ratios (from user-run backtests), indexed by user, algorithm and code ID. Note that currently, backtests are deduplicated based on code id. See fit_authors for more information. """ self.num_authors = sharpes.meta_user_id.nunique() self.num_algos = sharpes.meta_algorithm_id.nunique() # For num_backtests, nunique() and count() should be the same self.num_backtests = sharpes.meta_code_id.nunique() # Which algos correspond to which authors? df = (sharpes.loc[:, ['meta_user_id', 'meta_algorithm_id']]. drop_duplicates( subset='meta_algorithm_id', keep='first').reset_index().meta_user_id.astype(str)) self.author_to_algo_encoding = LabelEncoder().fit_transform(df) # Which backtests correspond to which algos? df = sharpes.meta_algorithm_id.astype(str) self.algo_to_backtest_encoding = LabelEncoder().fit_transform(df) # Which backtests correspond to which authors? df = sharpes.meta_user_id.astype(str) self.author_to_backtest_encoding = LabelEncoder().fit_transform(df) # Construct correlation matrix. # 0 is a better estimate for mean returns than the sample mean! returns_ = returns / returns.std() self.corr = LedoitWolf(assume_centered=True).fit(returns_).covariance_ self.model = self._build_model(sharpes, self.corr) self.coords = { 'meta_user_id': sharpes.meta_user_id.drop_duplicates().values, 'meta_algorithm_id': sharpes.meta_algorithm_id.drop_duplicates().values, 'meta_code_id': sharpes.meta_code_id.values } self.dims = { 'mu_global': (), 'mu_author': ('meta_user_id', ), 'mu_author_raw': ('meta_user_id', ), 'mu_author_sd': (), 'mu_algo': ('meta_algorithm_id', ), 'mu_algo_raw': ('meta_algorithm_id', ), 'mu_algo_sd': (), 'mu_backtest': ('meta_code_id', ), 'sigma_backtest': ('meta_code_id', ), 'alpha_author': ('meta_user_id', ), 'alpha_algo': ('meta_algorithm_id', ) }
def LedoitWolf_covMatrix(X):
    logger.info('Computing the covariance matrix with shrinkage')
    cov = LedoitWolf().fit(X)
    cov_matrix = cov.covariance_
    mean_vector = cov.location_
    return cov_matrix, mean_vector
def GetModelParams(DataFrame, ColumnIndex): cDataSet = DataFrame cData0 = cDataSet[cDataSet['target'] == 0] cData1 = cDataSet[cDataSet['target'] == 1] bData0 = np.array(cData0[ColumnIndex]) bData1 = np.array(cData1[ColumnIndex]) Cov0 = LedoitWolf(assume_centered=False).fit(bData0) Cov1 = LedoitWolf(assume_centered=False).fit(bData1) Mean0 = bData0.mean(axis=0) Mean1 = bData1.mean(axis=0) return Cov0.covariance_, Cov1.covariance_, Mean0, Mean1
def prior_vector_variability(x): """ Estimate the covariance matrix of x with the LedoitWolf estimator :param x: an array of dim (t,n) :return: The estimated covariance matrix """ dx = LedoitWolf().fit(x).covariance_ return dx
def simCovMu(mu0, cov0, nObs, shrink=False): x = np.random.multivariate_normal(mu0.flatten(), cov0, size = nObs) #print(x.shape) mu1 = x.mean(axis = 0).reshape(-1,1) #calc mean of columns of rand matrix #print(mu1.shape) if shrink: cov1 = LedoitWolf().fit(x).covariance_ else: cov1 = np.cov(x, rowvar=0) return mu1, cov1
def max_IR_weight(ic_df, holding_period, rollback_period=120, covariance_type="shrink"):
    """
    Given ic_df (a matrix of IC time series), a specified holding period and rolling
    window, returns the corresponding multi-factor portfolio weights.

    :param ic_df: matrix of IC time series (pd.DataFrame), indexed by datetime,
        with the factor names as columns. For example:

                            BP       CFP        EP  ILLIQUIDITY    REVS20      SRMI     VOL20
        date
        2016-06-24    0.165260  0.002198  0.085632    -0.078074  0.173832  0.214377  0.068445
        2016-06-27    0.165537  0.003583  0.063299    -0.048674  0.180890  0.202724  0.081748
        2016-06-28    0.135215  0.010403  0.059038    -0.034879  0.111691  0.122554  0.042489
        2016-06-29    0.068774  0.019848  0.058476    -0.049971  0.042805  0.053339  0.079592
        2016-06-30    0.039431  0.012271  0.037432    -0.027272  0.010902  0.077293 -0.050667

    :param holding_period: holding period (int)
    :param rollback_period: rolling window, i.e. when computing the factor weights for
        each day, the IC time series of the previous rollback_period days is used to
        compute the IC mean vector and IC covariance matrix (int).
    :param covariance_type: "shrink"/"simple", how the covariance matrix is estimated:
        Ledoit-Wolf shrinkage estimate or simple sample estimate
    :return: weight_df: factor weights (pd.DataFrame) obtained with the chosen covariance
        estimation method, indexed by datetime, with the names of the factors to be
        combined as columns.
    """
    # Maximizing the IC-IR over days t-n to t uses data up to t + holding_period
    # (to compute returns); the resulting weights are applied to the factors at
    # t + holding_period.
    n = rollback_period
    weight_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
    lw = LedoitWolf()
    for dt in ic_df.index:
        ic_dt = ic_df[ic_df.index <= dt].tail(n)
        if len(ic_dt) < n:
            continue
        if covariance_type == "shrink":
            try:
                ic_cov_mat = lw.fit(ic_dt.values).covariance_
            except Exception:
                ic_cov_mat = np.mat(np.cov(ic_dt.T.values).astype(float))
        else:
            ic_cov_mat = np.mat(np.cov(ic_dt.T.values).astype(float))
        inv_ic_cov_mat = np.linalg.inv(ic_cov_mat)
        weight = inv_ic_cov_mat * np.mat(ic_dt.mean().values).reshape(
            len(inv_ic_cov_mat), 1)
        weight = np.array(weight.reshape(len(weight), ))[0]
        weight_df.loc[dt] = weight / np.sum(np.abs(weight))
    return weight_df.shift(holding_period)
def untangle(X: Iterable, y: Iterable, n_clusters: int = None, get_connectivity: bool = True, compute_distances: bool = True, kind: str = 'correlation', agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration: from nilearn.connectome import ConnectivityMeasure as CM from sklearn.cluster import FeatureAgglomeration from sklearn.covariance import LedoitWolf from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import mutual_info_classif agglo_defs = dict(affinity='euclidean', compute_full_tree='auto', linkage='ward', pooling_func=np.mean, distance_threshold=None, compute_distances=compute_distances) if get_connectivity is True: connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0] else: connect_mat = None if n_clusters is None: n_clusters = divmod(X.shape[1], 2)[0] - 1 if n_clusters == 0: n_clusters = 1 if agglo_kws is None: agglo_kws = {} agglo_defs.update(agglo_kws) agglo = FeatureAgglomeration(n_clusters=n_clusters, connectivity=connect_mat, **agglo_defs) if not isinstance(y, pd.Series): y = pd.Series(y) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) agglo.fit(X, y) setattr( agglo, 'cluster_indexes_', pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_), columns=['cluster', 'feature']).groupby('cluster').feature) skb = SelectKBest(k=1, score_func=mutual_info_classif) factor_leaders_ = [ skb.fit(X[itm[1]], y).get_feature_names_out()[0] for itm in tuple(agglo.cluster_indexes_) ] setattr(agglo, 'factor_leaders_', factor_leaders_) return agglo
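# A minimal usage sketch for untangle() above, assuming a small synthetic binary-target
# dataset and that nilearn and scikit-learn are installed (the function imports nilearn
# at call time); get_connectivity=False skips the connectivity matrix so only the
# feature agglomeration itself is exercised.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.normal(size=(80, 6)), columns=[f'feat_{i}' for i in range(6)])
y_demo = pd.Series(rng.randint(0, 2, size=80))
agglo = untangle(X_demo, y_demo, n_clusters=2, get_connectivity=False)
print(agglo.factor_leaders_)   # one representative feature per cluster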
def connectivity(subjects_ts, kinds=kinds, saveas='file'): """ Estimates Functional Connectivity using several estimation models Parameters ---------- subjects_ts: array-like , 2-D (n_subjects,n_regions) Array of BOLD time-series kinds: list of kinds of connectivity measure to be computed . kinds include : ' correlation ' , ' partial correlation', ' tangent' , 'covariance' . saveas : Destination to save and load output (.npz) Returns --------- mean_connectivity_matrix: dictionary , {'kind' : (n_regions,n_regions)} Group-level functional connectivity matrix individual_connectivity_matrix: dictionary , {'kind' : (n_subjects,n_regions,n_regions)} Subject-level functional connectivity matrices """ individual_connectivity_matrices = dict() mean_connectivity_matrix = dict() if os.path.exists(saveas): data = np.load(saveas) individual_connectivity_matrices = data['arr_0'].flatten()[0] mean_connectivity_matrix = data['arr_1'].flatten()[0] else: for kind in kinds: # Computing individual functional connectivity conn_measure = ConnectivityMeasure(cov_estimator=LedoitWolf( assume_centered=True, store_precision=True), kind=kind, vectorize=False, discard_diagonal=False) individual_connectivity_matrices[ kind] = conn_measure.fit_transform(subjects_ts) # Computing group functional connectivity if kind == 'tangent': mean_connectivity_matrix[kind] = conn_measure.mean_ else: mean_connectivity_matrix[kind] = \ individual_connectivity_matrices[kind].mean(axis=0) np.savez(saveas, individual_connectivity_matrices, mean_connectivity_matrix) return mean_connectivity_matrix, individual_connectivity_matrices
def __init__(self, cov_estimator=LedoitWolf(store_precision=False), kind='covariance', vectorize=False, discard_diagonal=False): self.cov_estimator = cov_estimator self.kind = kind self.vectorize = vectorize self.discard_diagonal = discard_diagonal
def simulateLogNormal(data, covtype='Estimate', nsamples=2000, **kwargs): """ :param data: :param covtype: Type of covariance matrix estimator. Allowed types are: - Estimate (default): - Diagonal: - Shrinkage OAS: :param int nsamples: Number of simulated samples to draw :return: simulated data and empirical covariance est """ try: # Offset data to make sure there are no 0 values for log transform offset = np.min(data) + 1 offdata = data + offset # log on the offsetted data logdata = np.log(offdata) # Get the means meanslog = np.mean(logdata, axis=0) # Specify covariance # Regular covariance estimator if covtype == "Estimate": covlog = np.cov(logdata, rowvar=0) # Shrinkage covariance estimator, using LedoitWolf elif covtype == "ShrinkageLedoitWolf": scov = LedoitWolf() scov.fit(logdata) covlog = scov.covariance_ elif covtype == "ShrinkageOAS": scov = OAS() scov.fit(logdata) covlog = scov.covariance_ # Diagonal covariance matrix (no between variable correlation) elif covtype == "Diagonal": covlogdata = np.var( logdata, axis=0) #get variance of log data by each column covlog = np.diag( covlogdata ) #generate a matrix with diagonal of variance of log Data else: raise ValueError('Unknown Covariance type') simData = np.random.multivariate_normal(meanslog, covlog, nsamples) simData = np.exp(simData) simData -= offset ##Set to 0 negative values simData[np.where(simData < 0)] = 0 # work out the correlation of matrix by columns, each column is a variable corrMatrix = np.corrcoef(simData, rowvar=0) return simData, corrMatrix except Exception as exp: raise exp
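# A minimal usage sketch for simulateLogNormal() above, assuming a small strictly
# positive data matrix; the sample size and the 'ShrinkageLedoitWolf' covtype are
# illustrative assumptions.
import numpy as np

data_demo = np.abs(np.random.RandomState(0).normal(loc=5.0, scale=1.0, size=(100, 4)))
sim_data, corr_matrix = simulateLogNormal(data_demo, covtype='ShrinkageLedoitWolf', nsamples=500)
print(sim_data.shape, corr_matrix.shape)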
def prepareProblem(filePath, shrinkage=False, subset=False, subsetSize=0): # Import data from .csv df = pd.read_csv(filePath, sep=';') df.index = df.date df = df.drop('date', axis=1) # Subset, if called via subset == True if subset == True: df = df.tail(subsetSize) # Estimate covariance using Empirical/MLE # Expected input is returns, hence set: assume_centered = True mleFitted = empirical_covariance(X=df, assume_centered=True) sigma = mleFitted if shrinkage == True: # Estimate covariance using LedoitWolf, first create instance of object lw = LedoitWolf(assume_centered=True) lwFitted = lw.fit(X=df).covariance_ sigma = lwFitted return sigma
base_X_train = np.random.normal(size=(n_samples, n_features)) base_X_test = np.random.normal(size=(n_samples, n_features)) # Color samples coloring_matrix = np.random.normal(size=(n_features, n_features)) X_train = np.dot(base_X_train, coloring_matrix) X_test = np.dot(base_X_test, coloring_matrix) ############################################################################### # Compute Ledoit-Wolf and Covariances on a grid of shrinkages from sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \ log_likelihood, empirical_covariance # Ledoit-Wolf optimal shrinkage coefficient estimate lw = LedoitWolf() loglik_lw = lw.fit(X_train, assume_centered=True).score( X_test, assume_centered=True) # OAS coefficient estimate oa = OAS() loglik_oa = oa.fit(X_train, assume_centered=True).score( X_test, assume_centered=True) # spanning a range of possible shrinkage coefficient values shrinkages = np.logspace(-3, 0, 30) negative_logliks = [-ShrunkCovariance(shrinkage=s).fit( X_train, assume_centered=True).score(X_test, assume_centered=True) \ for s in shrinkages] # getting the likelihood under the real model
def main():
    '''
    Constructs a co-occurence network from gene expression data.
    Main entry point to code.
    '''
    # Read in the data
    if os.path.isfile(DATA_PICKLE):
        print("reading previously saved data from pickle %s" % (DATA_PICKLE))
        with open(DATA_PICKLE, 'rb') as file:
            df = pickle.load(file)
            lwe = pickle.load(file)
            pmat = pickle.load(file)
            pcor_indices = pickle.load(file)
            pcor = pickle.load(file)
            lfdr_pcor = pickle.load(file)
            #prob = pickle.load(file)
    else:
        print("reading in data from %s" % (FILENAME))
        df = pd.read_csv(FILENAME, sep='\t')
        print("found %d rows and %d columns" % (df.shape[0], df.shape[1]))

        # compute the row means and sort the data frame by descending means
        df['row_means'] = df.mean(axis=1)
        df.sort_values('row_means', axis=0, ascending=False, inplace=True)
        df.drop('row_means', axis=1, inplace=True)

        # take the most abundant genes
        df = df.head(PRUNE_GENES)

        # Ledoit-Wolf optimal shrinkage coefficient estimate
        print("computing Ledoit-Wolf optimal shrinkage coefficient estimate")
        lwe = LedoitWolf().fit(df.transpose())
        pmat = lwe.get_precision()

        # Convert symmetric matrix to array, first by getting indices
        # of the off diagonal elements, second by pulling them into
        # separate array (pcor).
        print("extracting off diagonal elements of precision matrix")
        pcor_indices = np.triu_indices(pmat.shape[0], 1)
        pcor = pmat[pcor_indices]

        # Determine edges by computing lfdr of pcor.
        print("computing lfdr of partial correlations")
        fdrtool = importr('fdrtool')
        lfdr_pcor = fdrtool.fdrtool(FloatVector(pcor), statistic="correlation", plot=False)
        #prob = 1-lfdr_pcor['lfdr']

        with open(DATA_PICKLE, 'wb') as file:
            pickle.dump(df, file, pickle.HIGHEST_PROTOCOL)
            pickle.dump(lwe, file, pickle.HIGHEST_PROTOCOL)
            pickle.dump(pmat, file, pickle.HIGHEST_PROTOCOL)
            pickle.dump(pcor_indices, file, pickle.HIGHEST_PROTOCOL)
            pickle.dump(pcor, file, pickle.HIGHEST_PROTOCOL)
            pickle.dump(lfdr_pcor, file, pickle.HIGHEST_PROTOCOL)
            #pickle.dump(prob, file, pickle.HIGHEST_PROTOCOL)

    print("making 1-lfdr vs. pcor plot")
    prob = 1 - np.array(lfdr_pcor.rx2('lfdr'))
    with PdfPages(PDF_FILENAME) as pdf:
        plt.figure(figsize=(3, 3))
        plt.plot(range(7), [3, 1, 4, 1, 5, 9, 2], 'r-o')
        plt.title('Page One')
        pdf.savefig()  # saves the current figure into a pdf page
        plt.close()

        plt.plot(pcor[0:10000:10], prob[0:10000:10], 'o', markeredgecolor='k', markersize=3)
        plt.title("THIS IS A PLOT TITLE, YOU BET")
        plt.xlabel('partial correlation')
        plt.ylabel('lfdr')
        pdf.savefig()
        plt.close()
# Remove data not analysed mask_block=block==block for x in range(label.shape[0]): if label[x,2]!=label[x-1,2]: mask_block[x]=False elif label[x,2]!=label[x-2,2]: mask_block[x]=False c_des_out=np.logical_not(label[:,2]== b'des') tmp_out= np.logical_and(c_des_out,mask_block) c_rest_out=np.logical_not(label[:,0]== b'rest') cond_out= np.logical_and(tmp_out,c_rest_out) y=label[cond_out,2] labels=np.unique(y) # Prepare correlation estimator = LedoitWolf() scaler=StandardScaler() # Create np array result_matrix = np.empty([len(names),motor_region.shape[0],labels.shape[0],labels.shape[0]]) #Analysis for each subject for i,n in enumerate(sorted(names)): roi_name=fold_g+'mni4060/asymroi_'+smt+'_'+n+'.npz' roi=np.load(roi_name)['roi'][cond_out] roi=roi[:,motor_region-1] for j in range(motor_region.shape[0]): roi_j=roi[:,j] roi_mat=np.zeros(((y==b'imp').sum(),len(labels))) for z,lab in enumerate(sorted(labels)): roi_mat[:,z]=roi_j[y==lab] roi_sc=scaler.fit_transform(roi_mat)
# stack a random subset of image patches, 125K
X_unlab_patches = []
random.seed(42)
print("Gathering examples...")
# Use subsample of 200K for k-means and covariance estimates
for i in random.sample(range(0, unlab_X.shape[2]), 200000):
    patches = view_as_windows(unlab_X[:, :, i], (w, w), step=s)
    re_shaped = numpy.reshape(patches, (patches.shape[0] * patches.shape[0], w * w))
    # normalize the patches, per sample
    re_shaped = preprocessing.scale(re_shaped, axis=1)
    X_unlab_patches.append(re_shaped)
X_unlab_patches = numpy.vstack(X_unlab_patches)

# build whitening transform matrix
print("Fitting ZCA Whitening Transform...")
cov = LedoitWolf()
cov.fit(X_unlab_patches)  # fit covariance estimate
D, U = numpy.linalg.eigh(cov.covariance_)
V = numpy.sqrt(numpy.linalg.inv(numpy.diag(D + zca_eps)))
Wh = numpy.dot(numpy.dot(U, V), U.T)
mu = numpy.mean(X_unlab_patches, axis=0)
X_unlab_patches = numpy.dot(X_unlab_patches - mu, Wh)

# run k-means on unlabelled data
print("Starting k-means...")
clustr = sklearn.cluster.MiniBatchKMeans(n_clusters=n_clust, compute_labels=False, batch_size=300)
k_means = clustr.fit(X_unlab_patches)
def test_ledoit_wolf(): """Tests LedoitWolf module on a simple dataset. """ # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X, assume_centered=True) assert_almost_equal(lw.shrinkage_, 0.00192, 4) assert_almost_equal(lw.score(X, assume_centered=True), -2.89795, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X, assume_centered=True) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d, assume_centered=True) lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X_1d, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X, assume_centered=True) assert_almost_equal(lw.score(X, assume_centered=True), -2.89795, 4) assert(lw.precision_ is None) # Same tests without assuming centered data # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X) assert_almost_equal(lw.shrinkage_, 0.007582, 4) assert_almost_equal(lw.score(X), 2.243483, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d) lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X_1d) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X) assert_almost_equal(lw.score(X), 2.2434839, 4) assert(lw.precision_ is None)
def threshold_from_simulations(self, X, precision=2000, verbose=False, n_jobs=-1):
    """Estimate in-/outlier decision-function distributions from Gaussian simulations."""
    import multiprocessing as mp
    if n_jobs < 1:
        n_jobs = mp.cpu_count()
    n_samples, n_features = X.shape
    n = n_samples
    p = n_features
    h = self.support_.sum()
    lw = LedoitWolf()
    ref_covariance = lw.fit(X[self.support_]).covariance_
    c = sp.stats.chi2(p + 2).cdf(
        sp.stats.chi2(p).ppf(float(h) / n)) / (float(h) / n)
    sigma_root = np.linalg.cholesky(ref_covariance / c)
    all_h = []
    # inliers distribution
    dist_in = np.array([], ndmin=1)
    max_i = max(1, int(precision / float(self.support_.sum())))
    for i in range(max_i):
        if verbose and max_i > 4 and (i % (max_i / 4) == 0):
            print("\t", 50 * i / float(max_i), "%")
        #sigma_root = np.diag(np.sqrt(eigenvalues))
        #sigma_root = np.eye(n_features)
        X1, _ = dg.generate_gaussian(
            n_samples, n_features, np.zeros(n_features), cov_root=sigma_root)
        # learn location and shape
        clf = EllipticEnvelopeRMCDl1(
            correction=self.correction, shrinkage=self.shrinkage,
            h=self.support_.sum() / float(n_samples), no_fit=True).fit(X1)
        X2 = X1 - clf.location_
        dist_in = np.concatenate(
            (dist_in, clf.decision_function(X2[clf.support_], raw_values=True)))
        all_h.append(clf.h)
    # outliers distribution
    dist_out = np.array([], ndmin=1)
    max_i = max(1, int(precision / float(n_samples - self.support_.sum())))
    for i in range(max_i):
        if verbose and max_i > 4 and (i % (max_i / 4) == 0):
            print("\t", 50 * (1. + i / float(max_i)), "%")
        X1, _ = dg.generate_gaussian(
            n_samples, n_features, np.zeros(n_features), cov_root=sigma_root)
        # learn location and shape
        clf = EllipticEnvelopeRMCDl1(
            correction=self.correction, shrinkage=self.shrinkage,
            h=self.support_.sum() / float(n_samples), no_fit=True).fit(X1)
        X2 = X1 - clf.location_
        dist_out = np.concatenate(
            (dist_out, clf.decision_function(X2[~clf.support_], raw_values=True)))
        all_h.append(clf.h)
    self.dist_in = np.sort(dist_in)
    self.dist_out = np.sort(dist_out)
    self.h_mean = np.mean(all_h)
    return self.dist_out
def test_ledoit_wolf(): # Tests LedoitWolf module on a simple dataset. # test shrinkage coeff on a simple data set X_centered = X - X.mean(axis=0) lw = LedoitWolf(assume_centered=True) lw.fit(X_centered) shrinkage_ = lw.shrinkage_ score_ = lw.score(X_centered) assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_) assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), shrinkage_) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) scov.fit(X_centered) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf(assume_centered=True) lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X_centered) assert_almost_equal(lw.score(X_centered), score_, 4) assert(lw.precision_ is None) # Same tests without assuming centered data # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X) assert_almost_equal(lw.shrinkage_, shrinkage_, 4) assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) assert_almost_equal(lw.score(X), score_, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) # test with one sample # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) lw = LedoitWolf() assert_warns(UserWarning, lw.fit, X_1sample) assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X) assert_almost_equal(lw.score(X), score_, 4) assert(lw.precision_ is None)
def test_ledoit_wolf(): """Tests LedoitWolf module on a simple dataset. """ # test shrinkage coeff on a simple data set X_centered = X - X.mean(axis=0) lw = LedoitWolf(assume_centered=True) lw.fit(X_centered) shrinkage_ = lw.shrinkage_ score_ = lw.score(X_centered) assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_) assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), shrinkage_) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X_centered, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) scov.fit(X_centered) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf(assume_centered=True) lw.fit(X_1d) lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X_1d, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X_centered) assert_almost_equal(lw.score(X_centered), score_, 4) assert(lw.precision_ is None) # (too) large data set X_large = np.ones((20, 200)) assert_raises(MemoryError, ledoit_wolf, X_large, block_size=100) # Same tests without assuming centered data # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X) assert_almost_equal(lw.shrinkage_, shrinkage_, 4) assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) assert_almost_equal(lw.score(X), score_, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d) lw_cov_from_mle, lw_shinkrage_from_mle = ledoit_wolf(X_1d) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shinkrage_from_mle, lw.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) # test with one sample X_1sample = np.arange(5) lw = LedoitWolf() with warnings.catch_warnings(record=True): lw.fit(X_1sample) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X) assert_almost_equal(lw.score(X), score_, 4) assert(lw.precision_ is None)
# under the ground-truth model, which we would not have access to in real # settings real_cov = np.dot(coloring_matrix.T, coloring_matrix) emp_cov = empirical_covariance(X_train) loglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov)) # ############################################################################# # Compare different approaches to setting the parameter # GridSearch for an optimal shrinkage coefficient tuned_parameters = [{'shrinkage': shrinkages}] cv = GridSearchCV(ShrunkCovariance(), tuned_parameters, cv=5) cv.fit(X_train) # Ledoit-Wolf optimal shrinkage coefficient estimate lw = LedoitWolf() loglik_lw = lw.fit(X_train).score(X_test) # OAS coefficient estimate oa = OAS() loglik_oa = oa.fit(X_train).score(X_test) # ############################################################################# # Plot results fig = plt.figure() plt.title("Regularized covariance: likelihood and shrinkage coefficient") plt.xlabel('Regularization parameter: shrinkage coefficient') plt.ylabel('Error: negative log-likelihood on test data') # range shrinkage curve plt.loglog(shrinkages, negative_logliks, label="Negative log-likelihood")
def test_connectivity_measure_outputs(): n_subjects = 10 n_features = 49 n_samples = 200 # Generate signals and compute covariances emp_covs = [] ledoit_covs = [] signals = [] random_state = check_random_state(0) ledoit_estimator = LedoitWolf() for k in range(n_subjects): signal = random_state.randn(n_samples, n_features) signals.append(signal) signal -= signal.mean(axis=0) emp_covs.append((signal.T).dot(signal) / n_samples) ledoit_covs.append(ledoit_estimator.fit(signal).covariance_) kinds = ["correlation", "tangent", "precision", "partial correlation"] # Check outputs properties for cov_estimator, covs in zip([EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]): input_covs = copy.copy(covs) for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind, cov_estimator=cov_estimator) connectivities = conn_measure.fit_transform(signals) # Generic assert_true(isinstance(connectivities, np.ndarray)) assert_equal(len(connectivities), len(covs)) for k, cov_new in enumerate(connectivities): assert_array_equal(input_covs[k], covs[k]) assert(is_spd(covs[k], decimal=7)) # Positive definiteness if expected and output value checks if kind == "tangent": assert_array_almost_equal(cov_new, cov_new.T) gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_) assert(is_spd(gmean_sqrt, decimal=7)) assert(is_spd(conn_measure.whitening_, decimal=7)) assert_array_almost_equal(conn_measure.whitening_.dot( gmean_sqrt), np.eye(n_features)) assert_array_almost_equal(gmean_sqrt.dot( _map_eigenvalues(np.exp, cov_new)).dot(gmean_sqrt), covs[k]) elif kind == "precision": assert(is_spd(cov_new, decimal=7)) assert_array_almost_equal(cov_new.dot(covs[k]), np.eye(n_features)) elif kind == "correlation": assert(is_spd(cov_new, decimal=7)) d = np.sqrt(np.diag(np.diag(covs[k]))) if cov_estimator == EmpiricalCovariance(): assert_array_almost_equal(d.dot(cov_new).dot(d), covs[k]) assert_array_almost_equal(np.diag(cov_new), np.ones((n_features))) elif kind == "partial correlation": prec = linalg.inv(covs[k]) d = np.sqrt(np.diag(np.diag(prec))) assert_array_almost_equal(d.dot(cov_new).dot(d), -prec + 2 * np.diag(np.diag(prec)))
def shrink(X): lw = LedoitWolf(store_precision=False, assume_centered=False) lw.fit(X) return lw.covariance_
class DCS_kd(BaseEstimator): def __init__(self, k=2, gamma=1.0, covariance_estimator='ledoit-wolf'): self.k = float(k) self.gamma = gamma self.covariance_estimator = covariance_estimator if covariance_estimator == 'empirical': self.cov = EmpiricalCovariance(store_precision=False) elif covariance_estimator == 'ledoit-wolf': self.cov = LedoitWolf(store_precision=False) else: raise NotImplementedError('%s is not implemented' % covariance_estimator) self.x0 = None self.x1 = None def fit(self, x, y): self.x0 = x[y == min(y)] self.x1 = x[y == max(y)] def __str__(self): return 'Analytical Cauchy-Schwarz Divergence in {}-d'.format(self.k) def value(self, v): # We need matrix, not vector v = v.reshape(-1, self.k) ipx0 = self._ipx(self.x0, self.x0, v) ipx1 = self._ipx(self.x1, self.x1, v) ipx2 = self._ipx(self.x0, self.x1, v) return np.log(ipx0) + np.log(ipx1) - 2 * np.log(ipx2) def derivative(self, v): # We need matrix, not vector v = v.reshape(-1, self.k) ret = (self._d_ipx(self.x0, self.x0, v) / self._ipx(self.x0, self.x0, v) + self._d_ipx(self.x1, self.x1, v) / self._ipx(self.x1, self.x1, v) - 2 * self._d_ipx(self.x0, self.x1, v) / self._ipx(self.x0, self.x1, v)) return ret.reshape(-1) def _H(self, X0, X1): n = (4.0 / (self.k + 2)) ** (2.0 / (self.k + 4)) p = (-2.0 / (self.k + 4)) return n * (X0.shape[0] ** p * self.cov.fit(X0).covariance_ + X1.shape[0] ** p * self.cov.fit(X1).covariance_) def _f1(self, X0, X1, v): Hxy = self.gamma * self.gamma * self._H(X0, X1) vHv = v.T.dot(Hxy).dot(v) # return 1.0 / np.sqrt(la.det(vHv)) return 1.0 / (X0.shape[0] * X1.shape[0] * np.sqrt(la.det(vHv)) * (2 * np.pi) ** (self.k / 2)) def _g1(self, X0, X1, v): Hxy = self.gamma * self.gamma * self._H(X0, X1) vHv = v.T.dot(Hxy).dot(v) return - self._f1(X0, X1, v) * Hxy.dot(v).dot(la.inv(vHv)) def _f2(self, X0, X1, v): Hxy = self.gamma * self.gamma * self._H(X0, X1) vHv = v.T.dot(Hxy).dot(v) vHv_inv = la.inv(vHv) vx0 = X0.dot(v) vx1 = X1.dot(v) vx0c = vx0.dot(vHv_inv) vx1c = vx1.dot(vHv_inv) ret = 0.0 for i in range(X0.shape[0]): ret += np.exp(-0.5 * ((vx0c[i] - vx1c) * (vx0[i] - vx1)).sum(axis=1)).sum() return ret def _g2(self, X0, X1, v): Hxy = self.gamma * self.gamma * self._H(X0, X1) vHv = v.T.dot(Hxy).dot(v) vHv_inv = la.inv(vHv) # k x k vx0 = X0.dot(v) vx1 = X1.dot(v) vx0c = vx0.dot(vHv_inv) vx1c = vx1.dot(vHv_inv) eye = np.eye(v.shape[0]) right_expr = (eye - Hxy.dot(v).dot(vHv_inv).dot(v.T)) # d x d d = v.shape[0] k = int(self.k) ret = 0.0 for i in range(X0.shape[0]): f2_vals = np.exp(-0.5 * ((vx0c[i] - vx1c) * (vx0[i] - vx1)).sum(axis=1)).reshape(-1, 1) ws = (X0[i] - X1).reshape(X1.shape[0], d, 1) vxdiffs = (- f2_vals * (vx0[i] - vx1)).reshape(X1.shape[0], 1, k) ret += np.tensordot(ws, vxdiffs, ([0, 2], [0, 1])) return right_expr.dot(ret).dot(vHv_inv) def _ipx(self, X0, X1, v): return self._f1(X0, X1, v) * self._f2(X0, X1, v) def _d_ipx(self, X0, X1, v): return self._f1(X0, X1, v) * self._g2(X0, X1, v) + self._f2(X0, X1, v) * self._g1(X0, X1, v)
r = 0.1 real_cov = toeplitz(r**np.arange(n_features)) coloring_matrix = cholesky(real_cov) n_samples_range = np.arange(6, 31, 1) repeat = 100 lw_mse = np.zeros((n_samples_range.size, repeat)) oa_mse = np.zeros((n_samples_range.size, repeat)) lw_shrinkage = np.zeros((n_samples_range.size, repeat)) oa_shrinkage = np.zeros((n_samples_range.size, repeat)) for i, n_samples in enumerate(n_samples_range): for j in range(repeat): X = np.dot( np.random.normal(size=(n_samples, n_features)), coloring_matrix.T) lw = LedoitWolf(store_precision=False) lw.fit(X, assume_centered=True) lw_mse[i,j] = lw.error_norm(real_cov, scaling=False) lw_shrinkage[i,j] = lw.shrinkage_ oa = OAS(store_precision=False) oa.fit(X, assume_centered=True) oa_mse[i,j] = oa.error_norm(real_cov, scaling=False) oa_shrinkage[i,j] = oa.shrinkage_ # plot MSE pl.subplot(2,1,1) pl.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1), label='Ledoit-Wolf', color='g') pl.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1), label='OAS', color='r')
def plot_psds(psd_file, data_dir='/auto/tdrive/mschachter/data'): # read PairwiseCF file pcf_file = os.path.join(data_dir, 'aggregate', 'pairwise_cf.h5') pcf = AggregatePairwiseCF.load(pcf_file) # pcf.zscore_within_site() g = pcf.df.groupby(['bird', 'block', 'segment', 'electrode']) nsamps_electrodes = len(g) i = pcf.df.cell_index != -1 g = pcf.df[i].groupby(['bird', 'block', 'segment', 'electrode', 'cell_index']) nsamps_cells = len(g) print '# of electrodes: %d' % nsamps_electrodes print '# of cells: %d' % nsamps_cells print '# of lfp samples: %d' % (pcf.lfp_psds.shape[0]) print '# of spike psd samples: %d' % (pcf.spike_psds.shape[0]) # compute the LFP mean and std lfp_psds = deepcopy(pcf.lfp_psds) print 'lfp_psds_ind: max=%f, q99=%f' % (lfp_psds.max(), np.percentile(lfp_psds.ravel(), 99)) log_transform(lfp_psds) print 'lfp_psds_ind: max=%f, q99=%f' % (lfp_psds.max(), np.percentile(lfp_psds.ravel(), 99)) nz = lfp_psds.sum(axis=1) > 0 lfp_psds = lfp_psds[nz, :] lfp_psd_mean = lfp_psds.mean(axis=0) lfp_psd_std = lfp_psds.std(axis=0, ddof=1) nsamps_lfp = lfp_psds.shape[0] # get the spike rate spike_rate = pcf.df.spike_rate.values # plt.figure() # plt.hist(spike_rate, bins=20, color='g', alpha=0.7) # plt.title('Spike Rate Histogram, q1=%0.3f, q5=%0.3f, q10=%0.3f, q50=%0.3f, q99=%0.3f' % # (np.percentile(spike_rate, 1), np.percentile(spike_rate, 5), np.percentile(spike_rate, 10), # np.percentile(spike_rate, 50), np.percentile(spike_rate, 99))) # plt.show() # compute the covariance lfp_psd_z = deepcopy(lfp_psds) lfp_psd_z -= lfp_psd_mean lfp_psd_z /= lfp_psd_std lfp_and_spike_cov_est = LedoitWolf() lfp_and_spike_cov_est.fit(lfp_psd_z) lfp_and_spike_cov = lfp_and_spike_cov_est.covariance_ """ # read CRCNS file cell_data = dict() hf = h5py.File(psd_file, 'r') cnames = hf.attrs['col_names'] for c in cnames: cell_data[c] = np.array(hf[c]) crcns_psds = np.array(hf['psds']) freqs = hf.attrs['freqs'] hf.close() cell_df = pd.DataFrame(cell_data) print 'regions=',cell_df.superregion.unique() name_map = {'brainstem':'MLd', 'thalamus':'OV', 'cortex':'Field L+CM'} """ # resample the lfp mean and std freq_rs = np.linspace(pcf.freqs.min(), pcf.freqs.max(), 1000) lfp_mean_cs = interp1d(pcf.freqs, lfp_psd_mean, kind='cubic') lfp_mean_rs = lfp_mean_cs(freq_rs) lfp_std_cs = interp1d(pcf.freqs, lfp_psd_std, kind='cubic') lfp_std_rs = lfp_std_cs(freq_rs) # concatenate the lfp psd and log spike rate lfp_psd_and_spike_rate = list() for k,(li,si) in enumerate(zip(pcf.df['lfp_index'], pcf.df['spike_index'])): lpsd = pcf.lfp_psds[li, :] srate,sstd = pcf.spike_rates[si, :] if srate > 0: lfp_psd_and_spike_rate.append(np.hstack([lpsd, np.log(srate)])) lfp_psd_and_spike_rate = np.array(lfp_psd_and_spike_rate) nfreqs = len(pcf.freqs) lfp_rate_cc = np.zeros([nfreqs]) for k in range(nfreqs): lfp_rate_cc[k] = np.corrcoef(lfp_psd_and_spike_rate[:, k], lfp_psd_and_spike_rate[:, -1])[0, 1] fig = plt.figure(figsize=(24, 12)) fig.subplots_adjust(left=0.05, right=0.95, wspace=0.30, hspace=0.30) nrows = 2 ncols = 100 gs = plt.GridSpec(nrows, ncols) ax = plt.subplot(gs[0, :35]) plt.errorbar(freq_rs, lfp_mean_rs, yerr=lfp_std_rs, c='k', linewidth=9.0, elinewidth=3.0, ecolor='#D8D8D8', alpha=0.5, capthick=0.) 
plt.axis('tight') plt.xlabel('Frequency (Hz)') plt.ylabel('Power (dB)') # plt.ylim(0, 1) plt.title('Mean LFP PSD') ax = plt.subplot(gs[1, :35]) plt.plot(pcf.freqs, lfp_rate_cc, '-', c=COLOR_BLUE_LFP, linewidth=9.0, alpha=0.7) plt.axhline(0, c='k') plt.axis('tight') plt.xlabel('Frequency (Hz)') plt.ylabel('Correlation Coefficient') plt.ylim(-0.05, 0.25) plt.title('LFP Power vs log Spike Rate') """ fi = freqs < 200 ax = plt.subplot(gs[1, :35]) clrs = ['k', '#d60036', COLOR_YELLOW_SPIKE] alphas = [0.8, 0.8, 0.6] for k,reg in enumerate(['brainstem', 'thalamus', 'cortex']): i = cell_df.superregion == reg indices = cell_df['index'][i].values psds = crcns_psds[indices, :] log_psds = deepcopy(psds) log_transform(log_psds) # compute the mean and sd of the power spectra psd_mean = log_psds.mean(axis=0) psd_std = log_psds.std(axis=0, ddof=1) psd_cv = psd_std / psd_mean # plot the mean power spectrum on the left plt.plot(freqs[fi], psd_mean[fi], c=clrs[k], linewidth=9.0, alpha=alphas[k]) plt.ylabel('Power (dB)') plt.xlabel('Frequency (Hz)') plt.axis('tight') plt.ylim(0, 1.0) plt.legend(['MLd', 'OV', 'Field L+CM'], fontsize='x-small', loc='upper right') plt.title('Mean PSTH PSDs (CRCNS Data)') """ ax = plt.subplot(gs[:, 40:]) plt.imshow(lfp_and_spike_cov, aspect='auto', interpolation='nearest', origin='lower', cmap=magma, vmin=0, vmax=1) plt.colorbar(label='Correlation Coefficient') xy = np.arange(len(pcf.freqs)) lbls = ['%d' % f for f in pcf.freqs] plt.xticks(xy, lbls, rotation=0) plt.yticks(xy, lbls) plt.axhline(nfreqs-0.5, c='w') plt.axvline(nfreqs-0.5, c='w') plt.xlabel('Frequency (Hz)') plt.ylabel('Frequency (Hz)') plt.title('LFP PSD Correlation Matrix') fname = os.path.join(get_this_dir(), 'crcns_data.svg') plt.savefig(fname, facecolor='w', edgecolor='none') plt.show()
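# Stripped of file handling and plotting, the covariance step in plot_psds
# amounts to z-scoring the (log-transformed) PSD matrix and fitting a
# Ledoit-Wolf estimator.  A minimal, self-contained sketch on random stand-in
# data (the real code uses the aggregated LFP PSDs):
import numpy as np
from sklearn.covariance import LedoitWolf

rng = np.random.RandomState(0)
psds = rng.lognormal(size=(500, 20))           # stand-in for the LFP PSD matrix
log_psds = np.log(psds)                        # the real code uses log_transform()

# z-score each frequency bin, as in plot_psds
psd_z = (log_psds - log_psds.mean(axis=0)) / log_psds.std(axis=0, ddof=1)

cov = LedoitWolf().fit(psd_z).covariance_
# because the columns are z-scored, this is (up to shrinkage) a correlation
# matrix, which is what the heat map in the figure displays
print(cov.shape)                               # (20, 20)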
def lda_train_scaled(fv, shrink=False):
    """Train the LDA classifier.

    Parameters
    ----------
    fv : ``Data`` object
        the feature vector must have 2 dimensional data, the first
        dimension being the class axis. The unique class labels must be
        0 and 1 otherwise a ``ValueError`` will be raised.
    shrink : Boolean, optional
        use shrinkage. This scaled variant requires ``shrink=True``
        (enforced by the assert below); the covariance is then estimated
        with the Ledoit-Wolf estimator.

    Returns
    -------
    w : 1d array
    b : float

    Raises
    ------
    ValueError : if the class labels are not exactly 0s and 1s

    Examples
    --------
    >>> clf = lda_train_scaled(fv_train, shrink=True)
    >>> out = lda_apply(fv_test, clf)

    See Also
    --------
    lda_apply

    """
    assert shrink is True
    x = fv.data
    y = fv.axes[0]
    if len(np.unique(y)) != 2:
        raise ValueError('Should only have two unique class labels, instead got'
                         ': {labels}'.format(labels=np.unique(y)))
    # Use sorted labels
    labels = np.sort(np.unique(y))
    mu1 = np.mean(x[y == labels[0]], axis=0)
    mu2 = np.mean(x[y == labels[1]], axis=0)
    # x' = x - m
    m = np.empty(x.shape)
    m[y == labels[0]] = mu1
    m[y == labels[1]] = mu2
    x2 = x - m
    # w = cov(x)^-1(mu2 - mu1)
    if shrink:
        estimator = LW()
        covm = estimator.fit(x2).covariance_
    else:
        covm = np.cov(x2.T)
    w = np.dot(np.linalg.pinv(covm), (mu2 - mu1))

    # From matlab bbci toolbox:
    # https://github.com/bbci/bbci_public/blob/fe6caeb549fdc864a5accf76ce71dd2a926ff12b/classification/train_RLDAshrink.m#L133-L134
    # C.w = C.w/(C.w'*diff(C_mean, 1, 2))*2;
    # C.b = -C.w' * mean(C_mean, 2);
    w = (w / np.dot(w.T, (mu2 - mu1))) * 2
    b = np.dot(-w.T, np.mean((mu1, mu2), axis=0))
    assert not np.any(np.isnan(w))
    assert not np.isnan(b)
    return w, b
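# Without the ``Data`` wrapper, the core of lda_train_scaled is a
# shrinkage-LDA weight computation on plain arrays.  A minimal sketch,
# assuming ``LW`` is the usual alias for sklearn's LedoitWolf and using
# synthetic two-class data:
import numpy as np
from sklearn.covariance import LedoitWolf as LW

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(50, 8)),
               rng.normal(1, 1, size=(50, 8))])
y = np.hstack([np.zeros(50), np.ones(50)])

mu1, mu2 = X[y == 0].mean(axis=0), X[y == 1].mean(axis=0)
Xc = X - np.where(y[:, None] == 0, mu1, mu2)     # remove the class means

covm = LW().fit(Xc).covariance_                  # shrinkage covariance
w = np.linalg.pinv(covm).dot(mu2 - mu1)

# same rescaling as in the function above
w = w / w.dot(mu2 - mu1) * 2
b = -w.dot((mu1 + mu2) / 2)
print(w.shape, b)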
time_series = masker.fit_transform(func_filename, confounds=[confound_filename]) ########################################################################## # Display time series import matplotlib.pyplot as plt for time_serie, label in zip(time_series.T, labels): plt.plot(time_serie, label=label) plt.title('Default Mode Network Time Series') plt.xlabel('Scan number') plt.ylabel('Normalized signal') plt.legend() plt.tight_layout() ########################################################################## # Compute precision matrices from sklearn.covariance import LedoitWolf cve = LedoitWolf() cve.fit(time_series) ########################################################################## # Display connectome from nilearn import plotting plotting.plot_connectome(cve.precision_, dmn_coords, title="Default Mode Network Connectivity") plotting.show()
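# Optional post-processing, not part of the original example: for display it
# is common to rescale the Ledoit-Wolf precision matrix into partial
# correlations before plotting.  A sketch using the objects defined above:
import numpy as np

prec = cve.precision_
d = np.sqrt(np.diag(prec))
partial_corr = -prec / np.outer(d, d)
np.fill_diagonal(partial_corr, 1.0)      # convention: ones on the diagonal

plotting.plot_connectome(partial_corr, dmn_coords,
                         title="Default Mode Network partial correlations")
plotting.show()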
def test_connectivity_measure_outputs(): n_subjects = 10 n_features = 49 # Generate signals and compute covariances emp_covs = [] ledoit_covs = [] signals = [] ledoit_estimator = LedoitWolf() for k in range(n_subjects): n_samples = 200 + k signal, _, _ = generate_signals(n_features=n_features, n_confounds=5, length=n_samples, same_variance=False) signals.append(signal) signal -= signal.mean(axis=0) emp_covs.append((signal.T).dot(signal) / n_samples) ledoit_covs.append(ledoit_estimator.fit(signal).covariance_) kinds = ["covariance", "correlation", "tangent", "precision", "partial correlation"] # Check outputs properties for cov_estimator, covs in zip([EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]): input_covs = copy.copy(covs) for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind, cov_estimator=cov_estimator) connectivities = conn_measure.fit_transform(signals) # Generic assert_true(isinstance(connectivities, np.ndarray)) assert_equal(len(connectivities), len(covs)) for k, cov_new in enumerate(connectivities): assert_array_equal(input_covs[k], covs[k]) assert(is_spd(covs[k], decimal=7)) # Positive definiteness if expected and output value checks if kind == "tangent": assert_array_almost_equal(cov_new, cov_new.T) gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_) assert(is_spd(gmean_sqrt, decimal=7)) assert(is_spd(conn_measure.whitening_, decimal=7)) assert_array_almost_equal(conn_measure.whitening_.dot( gmean_sqrt), np.eye(n_features)) assert_array_almost_equal(gmean_sqrt.dot( _map_eigenvalues(np.exp, cov_new)).dot(gmean_sqrt), covs[k]) elif kind == "precision": assert(is_spd(cov_new, decimal=7)) assert_array_almost_equal(cov_new.dot(covs[k]), np.eye(n_features)) elif kind == "correlation": assert(is_spd(cov_new, decimal=7)) d = np.sqrt(np.diag(np.diag(covs[k]))) if cov_estimator == EmpiricalCovariance(): assert_array_almost_equal(d.dot(cov_new).dot(d), covs[k]) assert_array_almost_equal(np.diag(cov_new), np.ones((n_features))) elif kind == "partial correlation": prec = linalg.inv(covs[k]) d = np.sqrt(np.diag(np.diag(prec))) assert_array_almost_equal(d.dot(cov_new).dot(d), -prec + 2 * np.diag(np.diag(prec))) # Check the mean_ for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind) conn_measure.fit_transform(signals) assert_equal((conn_measure.mean_).shape, (n_features, n_features)) if kind != 'tangent': assert_array_almost_equal( conn_measure.mean_, np.mean(conn_measure.transform(signals), axis=0)) # Check that the mean isn't modified in transform conn_measure = ConnectivityMeasure(kind='covariance') conn_measure.fit(signals[:1]) mean = conn_measure.mean_ conn_measure.transform(signals[1:]) assert_array_equal(mean, conn_measure.mean_) # Check vectorization option for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind) connectivities = conn_measure.fit_transform(signals) conn_measure = ConnectivityMeasure(vectorize=True, kind=kind) vectorized_connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal(vectorized_connectivities, sym_matrix_to_vec(connectivities)) # Check not fitted error assert_raises_regex( ValueError, 'has not been fitted. 
', ConnectivityMeasure().inverse_transform, vectorized_connectivities) # Check inverse transformation kinds.remove('tangent') for kind in kinds: # without vectorization: input matrices are returned with no change conn_measure = ConnectivityMeasure(kind=kind) connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal( conn_measure.inverse_transform(connectivities), connectivities) # with vectorization: input vectors are reshaped into matrices # if diagonal has not been discarded conn_measure = ConnectivityMeasure(kind=kind, vectorize=True) vectorized_connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal( conn_measure.inverse_transform(vectorized_connectivities), connectivities) # with vectorization if diagonal has been discarded for kind in ['correlation', 'partial correlation']: connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals) conn_measure = ConnectivityMeasure(kind=kind, vectorize=True, discard_diagonal=True) vectorized_connectivities = conn_measure.fit_transform(signals) assert_array_almost_equal( conn_measure.inverse_transform(vectorized_connectivities), connectivities) for kind in ['covariance', 'precision']: connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals) conn_measure = ConnectivityMeasure(kind=kind, vectorize=True, discard_diagonal=True) vectorized_connectivities = conn_measure.fit_transform(signals) diagonal = np.array([np.diagonal(conn) / sqrt(2) for conn in connectivities]) inverse_transformed = conn_measure.inverse_transform( vectorized_connectivities, diagonal=diagonal) assert_array_almost_equal(inverse_transformed, connectivities) assert_raises_regex(ValueError, 'can not reconstruct connectivity matrices', conn_measure.inverse_transform, vectorized_connectivities) # for 'tangent' kind, covariance matrices are reconstructed # without vectorization tangent_measure = ConnectivityMeasure(kind='tangent') displacements = tangent_measure.fit_transform(signals) covariances = ConnectivityMeasure(kind='covariance').fit_transform( signals) assert_array_almost_equal( tangent_measure.inverse_transform(displacements), covariances) # with vectorization # when diagonal has not been discarded tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True) vectorized_displacements = tangent_measure.fit_transform(signals) assert_array_almost_equal( tangent_measure.inverse_transform(vectorized_displacements), covariances) # when diagonal has been discarded tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True, discard_diagonal=True) vectorized_displacements = tangent_measure.fit_transform(signals) diagonal = np.array([np.diagonal(matrix) / sqrt(2) for matrix in displacements]) inverse_transformed = tangent_measure.inverse_transform( vectorized_displacements, diagonal=diagonal) assert_array_almost_equal(inverse_transformed, covariances) assert_raises_regex(ValueError, 'can not reconstruct connectivity matrices', tangent_measure.inverse_transform, vectorized_displacements)
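# Outside of the test, the happy path exercised above is short.  A minimal
# sketch of ConnectivityMeasure with a Ledoit-Wolf covariance estimator on a
# list of per-subject signal arrays (random data here; the import paths assume
# a recent nilearn):
import numpy as np
from sklearn.covariance import LedoitWolf
from nilearn.connectome import ConnectivityMeasure

rng = np.random.RandomState(0)
subjects = [rng.normal(size=(200, 10)) for _ in range(5)]   # 5 subjects, 10 regions

conn = ConnectivityMeasure(kind='partial correlation',
                           cov_estimator=LedoitWolf())
matrices = conn.fit_transform(subjects)
print(matrices.shape)                                       # (5, 10, 10)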
has_sklearn = True
try:
    import sklearn
except ImportError:
    has_sklearn = False
    print('sklearn not available')


def cov2corr(cov):
    std_ = np.sqrt(np.diag(cov))
    corr = cov / np.outer(std_, std_)
    return corr


if has_sklearn:
    from sklearn.covariance import LedoitWolf, OAS, MinCovDet

    # assume_centered=False is the default of these estimators, so it is
    # not passed explicitly; fit() only takes the data
    lw = LedoitWolf(store_precision=False)
    lw.fit(rr)
    cov_lw = lw.covariance_
    corr_lw = cov2corr(cov_lw)

    oas = OAS(store_precision=False)
    oas.fit(rr)
    cov_oas = oas.covariance_
    corr_oas = cov2corr(cov_oas)

    mcd = MinCovDet()  # .fit(rr, reweight=None)
    mcd.fit(rr)
    cov_mcd = mcd.covariance_
    corr_mcd = cov2corr(cov_mcd)

    titles = ['raw correlation', 'lw', 'oas', 'mcd']
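# cov2corr simply rescales a covariance matrix by its standard deviations;
# a quick sanity check on a small hand-made matrix:
import numpy as np

cov = np.array([[4.0, 1.0],
                [1.0, 9.0]])
print(cov2corr(cov))
# [[1.         0.16666667]
#  [0.16666667 1.        ]]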
r = 0.1 real_cov = toeplitz(r ** np.arange(n_features)) coloring_matrix = cholesky(real_cov) n_samples_range = np.arange(6, 31, 1) repeat = 100 lw_mse = np.zeros((n_samples_range.size, repeat)) oa_mse = np.zeros((n_samples_range.size, repeat)) lw_shrinkage = np.zeros((n_samples_range.size, repeat)) oa_shrinkage = np.zeros((n_samples_range.size, repeat)) for i, n_samples in enumerate(n_samples_range): for j in range(repeat): X = np.dot( np.random.normal(size=(n_samples, n_features)), coloring_matrix.T) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X) lw_mse[i, j] = lw.error_norm(real_cov, scaling=False) lw_shrinkage[i, j] = lw.shrinkage_ oa = OAS(store_precision=False, assume_centered=True) oa.fit(X) oa_mse[i, j] = oa.error_norm(real_cov, scaling=False) oa_shrinkage[i, j] = oa.shrinkage_ # plot MSE plt.subplot(2, 1, 1) plt.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1), label='Ledoit-Wolf', color='g') plt.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1), label='OAS', color='r')
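# The lw_shrinkage and oa_shrinkage arrays filled above are not used in this
# excerpt; presumably they feed a second panel.  A sketch of what that panel
# could look like (not the original script's code):
plt.subplot(2, 1, 2)
plt.errorbar(n_samples_range, lw_shrinkage.mean(1), yerr=lw_shrinkage.std(1),
             label='Ledoit-Wolf', color='g')
plt.errorbar(n_samples_range, oa_shrinkage.mean(1), yerr=oa_shrinkage.std(1),
             label='OAS', color='r')
plt.xlabel('n_samples')
plt.ylabel('Shrinkage')
plt.legend(loc='upper right')
plt.show()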