def test_reconstruction(self):
    """
    The full reconstruction of a PCA fit with default arguments should
    reproduce the original data matrix (up to np.allclose tolerance).
    """
    fitted = PCA().fit(self.X)
    rebuilt = fitted.predict_reconstruction()
    self.assertTrue(np.allclose(self.X, rebuilt))
def __init__(self, joint, individual, noise, obs_names=None, var_names=None,
             block_name=None, m=None, shape=None):
    """
    Stores the joint, individual and noise estimates for one data block.

    Parameters
    ----------
    joint, individual: dict
        Each is read through the keys 'rank', 'scores', 'loadings',
        'svals' and 'full'. The first four are handed to
        PCA.from_precomputed; 'full' (may be None) is stored as the
        full_ attribute of the resulting PCA object.

    noise: array-like, None
        Noise estimate; wrapped in a DataFrame unless it is None.

    obs_names, var_names: optional
        Row / column labels used for every DataFrame built here and
        forwarded to PCA.from_precomputed.

    block_name: optional
        Stored on the instance as block_name.

    m, shape: optional
        Forwarded unchanged to PCA.from_precomputed.
    """

    def labelled(values):
        # None is kept as None, anything else becomes a labelled DataFrame
        if values is None:
            return None
        return pd.DataFrame(values, index=obs_names, columns=var_names)

    # keyword arguments common to both PCA objects
    shared = dict(obs_names=obs_names, var_names=var_names, m=m, shape=shape)

    # joint component
    self.joint = PCA.from_precomputed(n_components=joint['rank'],
                                      scores=joint['scores'],
                                      loadings=joint['loadings'],
                                      svals=joint['svals'],
                                      **shared)
    if joint['rank'] != 0:
        self.joint.set_comp_names(
            list(map('joint_comp_{}'.format, range(self.joint.rank))))
    self.joint.full_ = labelled(joint['full'])

    # individual component
    self.individual = PCA.from_precomputed(n_components=individual['rank'],
                                           scores=individual['scores'],
                                           loadings=individual['loadings'],
                                           svals=individual['svals'],
                                           **shared)
    if individual['rank'] != 0:
        self.individual.set_comp_names(
            list(map('indiv_comp_{}'.format, range(self.individual.rank))))
    self.individual.full_ = labelled(individual['full'])

    # noise estimate and bookkeeping
    self.noise_ = labelled(noise)
    self.block_name = block_name
def test_frob_norm(self):
    """
    The stored Frobenius norm should equal the Frobenius norm of the
    centered data, both for the full PCA and for a partial PCA.
    """
    expected = np.linalg.norm(self.X_cent, ord='fro')

    # TODO: the partial case (n_components=3) is failing, it could be a
    # numerical issue.
    for n_comp in (None, 3):
        fitted = PCA(n_components=n_comp).fit(self.X)
        self.assertTrue(np.allclose(fitted.frob_norm_, expected))
def test_centering(self):
    """
    The fitted centers should equal the column means of the data, and
    center=False should leave the center unset and predict scores as a
    plain projection onto the loadings.
    """
    column_means = self.X.mean(axis=0)
    self.assertTrue(np.allclose(self.pca.m_, column_means))

    # no centering
    uncentered = PCA(n_components=4, center=False).fit(self.X)
    self.assertTrue(uncentered.m_ is None)

    n_features = self.X.shape[1]
    new_data = np.random.normal(size=(20, n_features))
    loadings = uncentered.loadings_.values
    projected = np.dot(new_data, loadings)
    self.assertTrue(np.allclose(uncentered.predict_scores(new_data),
                                projected))
def svals(data, joint):
    """
    Heuristic count based on the singular values of a PCA fit to data.

    Walks through PCA().fit(data).svals_ in order and counts every value
    whose drop from the previous value is smaller than 20% of the drop
    before it. Returns the larger of that count and joint.
    """
    count = 0
    prev_sv = 0
    prev_drop = 0
    # NOTE(review): state starts at zero, so a positive first singular value
    # always gives a negative drop and is counted -- confirm this is intended.
    for sv in PCA().fit(data).svals_:
        drop = prev_sv - sv
        if drop < 0.2 * prev_drop:
            count += 1
        prev_sv, prev_drop = sv, drop
    return max(count, joint)
def setUp(self):
    """
    Builds a random 100 x 20 labelled data matrix, its column-centered
    version and a 10 component PCA fit, and stores them on the test case.
    """
    self.n = 100
    self.d = 20
    self.n_components = 10

    self.obs_names = list(map('sample_{}'.format, range(self.n)))
    self.var_names = list(map('var_{}'.format, range(self.d)))

    raw = np.random.normal(size=(self.n, self.d))
    self.X = pd.DataFrame(raw, index=self.obs_names, columns=self.var_names)
    self.X_cent = self.X - self.X.mean(axis=0)

    self.pca = PCA(n_components=self.n_components).fit(self.X)
def __init__(
        self,
        joint,
        individual,
        noise,
        CNS,
        obs_names=None,
        var_names=None,
        block_name=None,
        m=None,
        shape=None,
        zero_index_names=True,
        init_signal_svd=None,
        X=None):
    """
    Stores the joint, individual and noise estimates for one data block
    and computes two sets of "common" loadings for the block.

    Parameters
    ----------
    joint, individual: dict
        Each is read through the keys 'rank', 'scores', 'loadings',
        'svals' and 'full'. The first four are handed to
        PCA.from_precomputed; 'full' (may be None) is stored as the
        full_ attribute of the resulting PCA object.

    noise: array-like, None
        Noise estimate; wrapped in a DataFrame unless it is None.

    CNS: array-like
        Indexed as a 2-d array (CNS.shape[1], CNS[:, j]); one set of
        common loadings is computed per column.
        NOTE(review): presumably the common normalized scores -- confirm
        against the caller.

    obs_names, var_names: optional
        Row / column labels for the DataFrames built here.

    block_name: optional
        If not None, it is used as a prefix for the joint, individual and
        common component names.

    m, shape: optional
        Forwarded unchanged to PCA.from_precomputed.

    zero_index_names: bool
        Forwarded to the component naming helpers.

    init_signal_svd: dict
        Read through the keys 'scores', 'svals' and 'loadings'. Although
        it defaults to None, it is indexed unconditionally.

    X: array-like
        Design matrix for the per-column linear regressions of CNS on X.
        Although it defaults to None, it is passed to
        LinearRegression.fit unconditionally.
    """

    def base_name(kind):
        # prefix component names with the block name when one is available
        if block_name is None:
            return kind
        return '{}_{}'.format(block_name, kind)

    def labelled(values):
        # None is kept as None, anything else becomes a labelled DataFrame
        if values is None:
            return None
        return pd.DataFrame(values, index=obs_names, columns=var_names)

    # joint component
    self.joint = PCA.from_precomputed(n_components=joint['rank'],
                                      scores=joint['scores'],
                                      loadings=joint['loadings'],
                                      svals=joint['svals'],
                                      obs_names=obs_names,
                                      var_names=var_names,
                                      m=m,
                                      shape=shape)
    if joint['rank'] != 0:
        self.joint.set_comp_names(base=base_name('joint'),
                                  zero_index=zero_index_names)
    self.joint.full_ = labelled(joint['full'])

    # individual component
    self.individual = PCA.from_precomputed(n_components=individual['rank'],
                                           scores=individual['scores'],
                                           loadings=individual['loadings'],
                                           svals=individual['svals'],
                                           obs_names=obs_names,
                                           var_names=var_names,
                                           m=m,
                                           shape=shape)
    if individual['rank'] != 0:
        self.individual.set_comp_names(base=base_name('indiv'),
                                       zero_index=zero_index_names)
    self.individual.full_ = labelled(individual['full'])

    # noise estimate and bookkeeping
    self.noise_ = labelled(noise)
    self.block_name = block_name

    # common loadings from the initial signal SVD: V diag(1 / D) U^T CNS
    U = init_signal_svd['scores']
    D = init_signal_svd['svals']
    V = init_signal_svd['loadings']
    common_loadings = V.dot(np.multiply(U, 1.0 / D).T.dot(CNS))

    # The prefix is only added when a block name exists, matching the
    # joint and individual component names above (the previous condition
    # was inverted and produced names such as 'None_common').
    comp_names = get_comp_names(base=base_name('common'),
                                num=CNS.shape[1],
                                zero_index=zero_index_names)
    self.common_loadings_ = pd.DataFrame(common_loadings,
                                         index=var_names,
                                         columns=comp_names)

    # regression on X: one linear model per column of CNS, coefficients
    # stacked so that rows index variables and columns index components
    coefs = [LinearRegression().fit(X, CNS[:, j]).coef_
             for j in range(CNS.shape[1])]
    common_loadings_reg_X = np.array(coefs).T
    self.common_loadings_reg_X = pd.DataFrame(common_loadings_reg_X,
                                              index=var_names,
                                              columns=comp_names)
def fit(self, blocks, precomp_init_svd=None):
    """
    Fits the AJIVE decomposition.

    Parameters
    ----------
    blocks: list, dict
        The data matrices. If dict, will name blocks by keys, otherwise
        blocks are named by 0, 1, ...K. Data matrices must have
        observations on the rows and have the same number of observations
        i.e. the kth data matrix is shape (n_samples, n_features[k]).

    precomp_init_svd: {list, dict, None}, optional
        Precomputed initial SVD. Must have one entry for each data block.
        The SVD should be a 3 tuple (scores, svals, loadings), see output
        of jive.utils.svd_wrapper for formatting details. After argument
        checking, each non-None entry is read here through the keys
        'scores', 'svals' and 'loadings'.

    Returns
    -------
    self

    Notes
    -----
    Among others, sets the attributes centers_, sv_threshold_,
    all_joint_svals_, common and blocks. If joint_rank was None it is
    replaced by the estimated joint rank, and any individual rank that was
    None for a dense block is replaced by its estimate.
    """
    blocks, self.init_signal_ranks, self.indiv_ranks, precomp_init_svd,\
        self.center, obs_names, var_names, self.shapes_ = \
        arg_checker(blocks,
                    self.init_signal_ranks,
                    self.joint_rank,
                    self.indiv_ranks,
                    precomp_init_svd,
                    self.center)

    block_names = list(blocks.keys())
    num_obs = list(blocks.values())[0].shape[0]

    # center blocks
    self.centers_ = {}
    for bn in block_names:
        blocks[bn], self.centers_[bn] = centering(blocks[bn],
                                                  method=self.center[bn])

    ################################################################
    # step 1: initial signal space extraction by SVD on each block #
    ################################################################

    init_signal_svd = {}
    self.sv_threshold_ = {}
    for bn in block_names:
        signal_rank = self.init_signal_ranks[bn]

        # compute rank init_signal_ranks[bn] + 1 SVD of the data block
        if precomp_init_svd[bn] is None:
            # signal rank + 1 to get individual rank sv threshold
            U, D, V = svd_wrapper(blocks[bn], signal_rank + 1)
        else:
            U = precomp_init_svd[bn]['scores']
            D = precomp_init_svd[bn]['svals']
            V = precomp_init_svd[bn]['loadings']

        # The SV threshold is halfway between the init_signal_ranks[bn]th
        # and init_signal_ranks[bn] + 1 st singular value. Recall that
        # python is zero indexed.
        self.sv_threshold_[bn] = (D[signal_rank - 1] + D[signal_rank]) / 2

        init_signal_svd[bn] = {'scores': U[:, 0:signal_rank],
                               'svals': D[0:signal_rank],
                               'loadings': V[:, 0:signal_rank]}

    ##################################
    # step 2: joint space estimation #
    ##################################
    # this step estimates the joint rank and computes the common
    # joint space basis

    # SVD of joint signal matrix
    joint_scores_matrix = np.bmat(
        [init_signal_svd[bn]['scores'] for bn in block_names])
    joint_scores, joint_svals, joint_loadings = svd_wrapper(
        joint_scores_matrix)
    self.all_joint_svals_ = deepcopy(joint_svals)

    # estimate joint rank using wedin bound and random direction if a
    # joint rank estimate has not already been provided
    # TODO: maybe make this into an object or function
    if self.joint_rank is None:

        # if the random sv samples are not already provided compute them
        if self.random_sv_samples_ is None:
            self.random_sv_samples_ = \
                sample_randdir(num_obs,
                               signal_ranks=list(
                                   self.init_signal_ranks.values()),
                               R=self.n_randdir_samples,
                               n_jobs=self.n_jobs)

        # if the wedin samples are not already provided compute them
        if self.wedin_samples_ is None:
            self.wedin_samples_ = {}
            for bn in block_names:
                self.wedin_samples_[bn] = \
                    get_wedin_samples(X=blocks[bn],
                                      U=init_signal_svd[bn]['scores'],
                                      D=init_signal_svd[bn]['svals'],
                                      V=init_signal_svd[bn]['loadings'],
                                      rank=self.init_signal_ranks[bn],
                                      R=self.n_wedin_samples,
                                      n_jobs=self.n_jobs)

            self.wedin_sv_samples_ = len(blocks) - \
                np.array([sum(self.wedin_samples_[bn][i] ** 2
                              for bn in block_names)
                          for i in range(self.n_wedin_samples)])

        # given the wedin and random bound samples, compute the joint rank
        # SV cutoff
        self.wedin_cutoff_ = np.percentile(self.wedin_sv_samples_,
                                           self.wedin_percentile)
        self.rand_cutoff_ = np.percentile(self.random_sv_samples_,
                                          self.randdir_percentile)
        self.svalsq_cutoff_ = max(self.wedin_cutoff_, self.rand_cutoff_)
        self.joint_rank_wedin_est_ = sum(
            joint_svals**2 > self.svalsq_cutoff_)
        self.joint_rank = deepcopy(self.joint_rank_wedin_est_)

    # check identifiability constraint and possibly remove some
    # joint components
    if self.reconsider_joint_components:
        joint_scores, joint_svals, joint_loadings, self.joint_rank = \
            reconsider_joint_components(blocks,
                                        self.sv_threshold_,
                                        joint_scores,
                                        joint_svals,
                                        joint_loadings,
                                        self.joint_rank)

    # TODO: include center?
    # TODO: comp_names, var_names
    # The common joint space has now been estimated
    self.common = PCA.from_precomputed(
        scores=joint_scores[:, 0:self.joint_rank],
        svals=joint_svals[0:self.joint_rank],
        loadings=joint_loadings[:, 0:self.joint_rank],
        obs_names=obs_names)

    self.common.set_comp_names(base='common',
                               zero_index=self.zero_index_names)

    #######################################
    # step 3: compute final decomposition #
    #######################################
    # this step computes the block specific estimates

    block_specific = {bn: {} for bn in block_names}
    for bn in block_names:
        X = blocks[bn]

        ########################################
        # step 3.1: block specific joint space #
        ########################################
        # project X onto the joint space then compute SVD
        if self.joint_rank != 0:
            if issparse(X):  # lazy evaluation for sparse matrices
                J = col_proj(X, joint_scores)
                U, D, V = svd_wrapper(J, self.joint_rank)
                J = None  # kill J matrix to save memory
            else:
                J = np.array(
                    np.dot(joint_scores, np.dot(joint_scores.T, X)))
                U, D, V = svd_wrapper(J, self.joint_rank)
                if not self.store_full:
                    J = None  # kill J matrix to save memory
        else:
            U, D, V = None, None, None
            if self.store_full:
                J = np.zeros(shape=blocks[bn].shape)
            else:
                J = None

        block_specific[bn]['joint'] = {'full': J,
                                       'scores': U,
                                       'svals': D,
                                       'loadings': V,
                                       'rank': self.joint_rank}

        #############################################
        # step 3.2: block specific individual space #
        #############################################
        # project X onto the orthogonal complement of the joint space,
        # estimate the individual rank, then compute SVD

        if issparse(X):  # lazy evaluation for sparse matrices
            # Unpack the rank into `rank`, the name stored in the results
            # below. It used to be bound to a different name, so sparse
            # blocks hit a NameError or reused a previous block's rank.
            U, D, V, rank = indiv_space_for_sparse(
                X, joint_scores, self.joint_rank,
                self.init_signal_ranks[bn], self.sv_threshold_[bn])
            I = None

        else:
            # project X columns onto orthogonal complement of joint_scores
            if self.joint_rank == 0:
                X_orthog = X
            else:
                X_orthog = X - np.dot(joint_scores,
                                      np.dot(joint_scores.T, X))

            # estimate individual rank using sv threshold, then compute SVD
            if self.indiv_ranks[bn] is None:
                max_rank = min(X.shape) - self.joint_rank  # saves computation
                U, D, V = svd_wrapper(X_orthog, max_rank)
                rank = sum(D > self.sv_threshold_[bn])

                if rank == 0:
                    U, D, V = None, None, None
                else:
                    U = U[:, 0:rank]
                    D = D[0:rank]
                    V = V[:, 0:rank]

                self.indiv_ranks[bn] = rank

            else:  # indiv_rank has been provided by the user
                rank = self.indiv_ranks[bn]
                if rank == 0:
                    U, D, V = None, None, None
                else:
                    U, D, V = svd_wrapper(X_orthog, rank)

            if self.store_full:
                if rank == 0:
                    I = np.zeros(shape=blocks[bn].shape)
                else:
                    I = np.array(np.dot(U, np.dot(np.diag(D), V.T)))
            else:
                I = None  # Kill I matrix to save memory

        block_specific[bn]['individual'] = {'full': I,
                                            'scores': U,
                                            'svals': D,
                                            'loadings': V,
                                            'rank': rank}

        ###################################
        # step 3.3: estimate noise matrix #
        ###################################

        if self.store_full and not issparse(X):
            E = X - (J + I)
        else:
            E = None
        block_specific[bn]['noise'] = E

    # save block specific estimates
    self.blocks = {}
    for bn, estimates in block_specific.items():
        self.blocks[bn] = \
            BlockSpecificResults(joint=estimates['joint'],
                                 individual=estimates['individual'],
                                 noise=estimates['noise'],
                                 CNS=joint_scores,
                                 block_name=bn,
                                 obs_names=obs_names,
                                 var_names=var_names[bn],
                                 m=self.centers_[bn],
                                 shape=blocks[bn].shape,
                                 zero_index_names=self.zero_index_names,
                                 init_signal_svd=init_signal_svd[bn],
                                 X=blocks[bn])

    return self
from jive.AJIVE import AJIVE
from jive.PCA import PCA
from jive.ajive_fig2 import generate_data_ajive_fig2
from jive.viz.block_visualization import data_block_heatmaps, jive_full_estimate_heatmaps
import matplotlib.pyplot as plt

# Example script: draws the two example data blocks, their scree plots and
# the full AJIVE estimates, saving each figure under figures/.

X, Y = generate_data_ajive_fig2()

# heatmaps of the raw data blocks
plt.figure(figsize=[6.5, 3])
data_block_heatmaps({'x': X, 'y': Y})
plt.savefig('figures/data_heatmaps.png', bbox_inches='tight')
plt.close()

# determine initial signal ranks by inspecting scree plots
plt.figure(figsize=[8.4, 3])
for panel, block in enumerate((X, Y), start=1):
    plt.subplot(1, 2, panel)
    PCA().fit(block).plot_scree()
plt.savefig('figures/scree_plots.png', bbox_inches='tight')
plt.close()

# fit AJIVE with the chosen initial signal ranks
ajive = AJIVE(init_signal_ranks={'x': 2, 'y': 3})
ajive.fit(blocks={'x': X, 'y': Y})

# heatmaps of the full block estimates next to the data
plt.figure(figsize=[6.5, 12])
jive_full_estimate_heatmaps(ajive.get_full_block_estimates(),
                            blocks={'x': X, 'y': Y})
plt.savefig('figures/jive_estimate_heatmaps.png', bbox_inches='tight')