Ejemplo n.º 1
0
 def test_reconstruction(self):
     """
     A PCA fit with default arguments should give back the original data
     matrix (up to np.allclose tolerance) from predict_reconstruction().
     """
     fitted = PCA().fit(self.X)
     rebuilt = fitted.predict_reconstruction()
     matches = np.allclose(self.X, rebuilt)
     self.assertTrue(matches)
Ejemplo n.º 2
0
    def __init__(self,
                 joint,
                 individual,
                 noise,
                 obs_names=None,
                 var_names=None,
                 block_name=None,
                 m=None,
                 shape=None):
        """
        Store the joint, individual and noise estimates for one data block.

        Parameters
        ----------
        joint, individual: dict
            Each is read at the keys 'rank', 'scores', 'loadings', 'svals'
            and 'full'. The first four are handed to PCA.from_precomputed;
            'full' (a matrix or None) is stored on the resulting object as
            ``full_``.

        noise: array-like, None
            Noise matrix estimate; stored as ``noise_``.

        obs_names, var_names: optional
            Row and column labels used for the PCA objects and data frames.

        block_name: optional
            Stored as ``block_name``.

        m, shape: optional
            Forwarded unchanged to PCA.from_precomputed.
        """

        def _labelled(mat):
            # Label a full matrix with the obs/var names; None stays None.
            if mat is None:
                return None
            return pd.DataFrame(mat, index=obs_names, columns=var_names)

        # keyword arguments common to both PCA.from_precomputed calls
        shared = dict(obs_names=obs_names,
                      var_names=var_names,
                      m=m,
                      shape=shape)

        # joint component
        self.joint = PCA.from_precomputed(n_components=joint['rank'],
                                          scores=joint['scores'],
                                          loadings=joint['loadings'],
                                          svals=joint['svals'],
                                          **shared)
        if joint['rank'] != 0:
            joint_names = [
                'joint_comp_{}'.format(k) for k in range(self.joint.rank)
            ]
            self.joint.set_comp_names(joint_names)
        self.joint.full_ = _labelled(joint['full'])

        # individual component
        self.individual = PCA.from_precomputed(
            n_components=individual['rank'],
            scores=individual['scores'],
            loadings=individual['loadings'],
            svals=individual['svals'],
            **shared)
        if individual['rank'] != 0:
            indiv_names = [
                'indiv_comp_{}'.format(k)
                for k in range(self.individual.rank)
            ]
            self.individual.set_comp_names(indiv_names)
        self.individual.full_ = _labelled(individual['full'])

        # noise estimate and bookkeeping
        self.noise_ = _labelled(noise)
        self.block_name = block_name
Ejemplo n.º 3
0
    def test_frob_norm(self):
        """
        The frob_norm_ attribute should equal the Frobenius norm of the
        centered data for both the full PCA (n_components=None) and a
        partial PCA (n_components=3).
        """
        expected = np.linalg.norm(self.X_cent, ord='fro')

        # TODO: the partial case (n_components=3) is failing, it could be a
        # numerical issue.
        for n_comp in (None, 3):
            fitted = PCA(n_components=n_comp).fit(self.X)
            self.assertTrue(np.allclose(fitted.frob_norm_, expected))
Ejemplo n.º 4
0
    def test_centering(self):
        """
        The fitted PCA's center should equal the column means of X. With
        center=False no center is stored and predict_scores reduces to a
        plain projection onto the loadings.
        """
        fitted_center = self.pca.m_
        column_means = self.X.mean(axis=0)
        self.assertTrue(np.allclose(fitted_center, column_means))

        # no centering
        uncentered = PCA(n_components=4, center=False).fit(self.X)
        self.assertTrue(uncentered.m_ is None)

        # new observations with the same number of variables as X
        new_obs = np.random.normal(size=(20, self.X.shape[1]))
        loadings = uncentered.loadings_.values
        predicted = uncentered.predict_scores(new_obs)
        projected = np.dot(new_obs, loadings)
        self.assertTrue(np.allclose(predicted, projected))
Ejemplo n.º 5
0
def svals(data, joint):
    """
    Count singular values of a default PCA fit on ``data`` using a
    drop-size heuristic, and return the larger of that count and ``joint``.

    Walking the singular values in the order PCA stores them, a value is
    counted when its drop from the preceding value is smaller than 0.2
    times the preceding drop. Both the "preceding value" and the
    "preceding drop" start at 0.
    """
    counted = 0
    prev_value = 0
    prev_drop = 0

    singular_values = PCA().fit(data).svals_
    for value in singular_values:
        drop = prev_value - value
        if drop < 0.2 * prev_drop:
            counted += 1
        prev_drop, prev_value = drop, value

    return max(counted, joint)
Ejemplo n.º 6
0
    def setUp(self):
        """
        Build a random, labelled 100 x 20 data frame, its column-centered
        version, and a 10 component PCA fit, and store them (along with the
        sizes and labels) on the test case.
        """
        self.n, self.d, self.n_components = 100, 20, 10

        # row / column labels
        self.obs_names = list(map('sample_{}'.format, range(self.n)))
        self.var_names = list(map('var_{}'.format, range(self.d)))

        # random data and its column-centered counterpart
        raw = np.random.normal(size=(self.n, self.d))
        self.X = pd.DataFrame(raw,
                              index=self.obs_names,
                              columns=self.var_names)
        self.X_cent = self.X - self.X.mean(axis=0)

        # fitted model shared by the tests
        self.pca = PCA(n_components=self.n_components).fit(self.X)
Ejemplo n.º 7
0
    def __init__(
            self,
            joint,
            individual,
            noise,
            CNS,
            obs_names=None,
            var_names=None,
            block_name=None,
            m=None,
            shape=None,
            zero_index_names=True,
            init_signal_svd=None,
            X=None):
        """
        Store the joint, individual and noise estimates for one data block
        and compute the block's common loadings.

        Parameters
        ----------
        joint, individual: dict
            Each is read at the keys 'rank', 'scores', 'loadings', 'svals'
            and 'full'. The first four are handed to PCA.from_precomputed;
            'full' (a matrix or None) is stored on the resulting object as
            ``full_``.

        noise: array-like, None
            Noise matrix estimate; stored as ``noise_``.

        CNS: array-like, 2-dimensional
            Matrix whose columns are mapped back to the variables of this
            block (presumably the common normalized scores -- confirm
            against the caller). One common loading vector is produced per
            column.

        obs_names, var_names: optional
            Row and column labels used for the PCA objects and data frames.

        block_name: optional
            If not None, it prefixes the component base names
            ('{block_name}_joint', '{block_name}_indiv',
            '{block_name}_common').

        m, shape: optional
            Forwarded unchanged to PCA.from_precomputed.

        zero_index_names: bool
            Forwarded as ``zero_index`` to the component naming helpers.

        init_signal_svd: dict
            Read at the keys 'scores', 'svals' and 'loadings'. Although the
            default is None, a dict is required: the common loadings are
            always computed from it.

        X: array-like
            Design matrix for the per-column linear regressions of CNS.
            Although the default is None, it is passed straight to
            LinearRegression().fit.

        Attributes set
        --------------
        joint, individual, noise_, block_name, common_loadings_,
        common_loadings_reg_X
        """

        self.joint = PCA.from_precomputed(n_components=joint['rank'],
                                          scores=joint['scores'],
                                          loadings=joint['loadings'],
                                          svals=joint['svals'],
                                          obs_names=obs_names,
                                          var_names=var_names,
                                          m=m,
                                          shape=shape)

        if joint['rank'] != 0:
            base = 'joint'
            if block_name is not None:
                base = '{}_{}'.format(block_name, base)
            self.joint.set_comp_names(base=base, zero_index=zero_index_names)

        if joint['full'] is not None:
            self.joint.full_ = pd.DataFrame(joint['full'],
                                            index=obs_names,
                                            columns=var_names)
        else:
            self.joint.full_ = None

        self.individual = PCA.from_precomputed(n_components=individual['rank'],
                                               scores=individual['scores'],
                                               loadings=individual['loadings'],
                                               svals=individual['svals'],
                                               obs_names=obs_names,
                                               var_names=var_names,
                                               m=m,
                                               shape=shape)
        if individual['rank'] != 0:
            base = 'indiv'
            if block_name is not None:
                base = '{}_{}'.format(block_name, base)
            self.individual.set_comp_names(base=base,
                                           zero_index=zero_index_names)

        if individual['full'] is not None:
            self.individual.full_ = pd.DataFrame(individual['full'],
                                                 index=obs_names,
                                                 columns=var_names)
        else:
            self.individual.full_ = None

        if noise is not None:
            self.noise_ = pd.DataFrame(noise,
                                       index=obs_names,
                                       columns=var_names)
        else:
            self.noise_ = None

        self.block_name = block_name

        # compute common normalized loadings
        # U, D, V = self.joint.get_UDV()

        # V (U D^-1)^T CNS, using the initial signal SVD of the block
        U, D, V = init_signal_svd['scores'], init_signal_svd['svals'], \
            init_signal_svd['loadings']
        common_loadings = V.dot(np.multiply(U, 1.0 / D).T.dot(CNS))
        # common_loadings = V.dot(np.multiply(U, D).T.dot(CNS))
        # col_norms = np.linalg.norm(common_loadings, axis=0)
        # common_loadings *= (1.0 / col_norms)

        # Prefix the base name with the block name only when one was given,
        # matching the joint / individual naming above.
        base = 'common'
        if block_name is not None:
            base = '{}_{}'.format(block_name, base)
        comp_names = get_comp_names(base=base,
                                    num=CNS.shape[1],
                                    zero_index=zero_index_names)
        self.common_loadings_ = pd.DataFrame(common_loadings,
                                             index=var_names,
                                             columns=comp_names)

        # regression on X: one linear model per column of CNS, the fitted
        # coefficient vectors become the columns of the result
        common_loadings_reg_X = []
        for j in range(CNS.shape[1]):
            lm = LinearRegression().fit(X, CNS[:, j])
            common_loadings_reg_X.append(lm.coef_)
        common_loadings_reg_X = np.array(common_loadings_reg_X).T
        self.common_loadings_reg_X = pd.DataFrame(common_loadings_reg_X,
                                                  index=var_names,
                                                  columns=comp_names)
Ejemplo n.º 8
0
    def fit(self, blocks, precomp_init_svd=None):
        """
        Fits the AJIVE decomposition.

        Parameters
        ----------
        blocks: list, dict
            The data matrices. If dict, will name blocks by keys, otherwise
            blocks are named by 0, 1, ...K. Data matrices must have observations
            on the rows and have the same number of observations i.e. the
            kth data matrix is shape (n_samples, n_features[k]).

        precomp_init_svd: {list, dict, None}, optional
            Precomputed initial SVD. Must have one entry for each data block.
            Each non-None entry is read at the keys 'scores', 'svals' and
            'loadings'; see output of jive.utils.svd_wrapper for formatting
            details.

        Returns
        -------
        self
            The fitted object. The common joint space is stored in
            ``self.common`` and the per-block results in ``self.blocks``.
        """
        blocks, self.init_signal_ranks, self.indiv_ranks, precomp_init_svd,\
            self.center, obs_names, var_names, self.shapes_ = \
                arg_checker(blocks,
                            self.init_signal_ranks,
                            self.joint_rank,
                            self.indiv_ranks,
                            precomp_init_svd,
                            self.center)

        block_names = list(blocks.keys())
        num_obs = list(blocks.values())[0].shape[0]

        # center blocks
        self.centers_ = {}
        for bn in block_names:
            blocks[bn], self.centers_[bn] = centering(blocks[bn],
                                                      method=self.center[bn])

        ################################################################
        # step 1: initial signal space extraction by SVD on each block #
        ################################################################

        init_signal_svd = {}
        self.sv_threshold_ = {}
        for bn in block_names:

            # compute rank init_signal_ranks[bn] + 1 SVD of the data block
            if precomp_init_svd[bn] is None:
                # signal rank + 1 to get individual rank sv threshold
                U, D, V = svd_wrapper(blocks[bn],
                                      self.init_signal_ranks[bn] + 1)
            else:
                U = precomp_init_svd[bn]['scores']
                D = precomp_init_svd[bn]['svals']
                V = precomp_init_svd[bn]['loadings']

            # The SV threshold is halfway between the init_signal_ranks[bn]th
            # and init_signal_ranks[bn] + 1 st singular value. Recall that
            # python is zero indexed.
            self.sv_threshold_[bn] = (D[self.init_signal_ranks[bn] - 1] \
                                      + D[self.init_signal_ranks[bn]]) / 2

            # keep only the leading init_signal_ranks[bn] components
            init_signal_svd[bn] = {
                'scores': U[:, 0:self.init_signal_ranks[bn]],
                'svals': D[0:self.init_signal_ranks[bn]],
                'loadings': V[:, 0:self.init_signal_ranks[bn]]
            }

        ##################################
        # step 2: joint space estimation #
        ##################################
        # this step estimates the joint rank and computes the common
        # joint space basis

        # SVD of joint signal matrix
        joint_scores_matrix = np.bmat(
            [init_signal_svd[bn]['scores'] for bn in block_names])
        joint_scores, joint_svals, joint_loadings = svd_wrapper(
            joint_scores_matrix)
        self.all_joint_svals_ = deepcopy(joint_svals)

        # estimate joint rank using wedin bound and random direction if a
        # joint rank estimate has not already been provided
        # TODO: maybe make this into an object or function
        if self.joint_rank is None:

            # if the random sv samples are not already provided compute them
            if self.random_sv_samples_ is None:
                self.random_sv_samples_ = \
                    sample_randdir(num_obs,
                                   signal_ranks=list(self.init_signal_ranks.values()),
                                   R=self.n_randdir_samples,
                                   n_jobs=self.n_jobs)

            # if the wedin samples are not already provided compute them
            if self.wedin_samples_ is None:
                self.wedin_samples_ = {}
                for bn in block_names:
                    self.wedin_samples_[bn] = \
                        get_wedin_samples(X=blocks[bn],
                                          U=init_signal_svd[bn]['scores'],
                                          D=init_signal_svd[bn]['svals'],
                                          V=init_signal_svd[bn]['loadings'],
                                          rank=self.init_signal_ranks[bn],
                                          R=self.n_wedin_samples,
                                          n_jobs=self.n_jobs)

            # number of blocks minus the per-sample sum over blocks of the
            # squared wedin samples
            self.wedin_sv_samples_ = len(blocks) - \
                np.array([sum(self.wedin_samples_[bn][i] ** 2 for bn in block_names)
                          for i in range(self.n_wedin_samples)])

            # given the wedin and random bound samples, compute the joint rank
            # SV cutoff
            self.wedin_cutoff_ = np.percentile(self.wedin_sv_samples_,
                                               self.wedin_percentile)
            self.rand_cutoff_ = np.percentile(self.random_sv_samples_,
                                              self.randdir_percentile)
            self.svalsq_cutoff_ = max(self.wedin_cutoff_, self.rand_cutoff_)
            self.joint_rank_wedin_est_ = sum(
                joint_svals**2 > self.svalsq_cutoff_)
            self.joint_rank = deepcopy(self.joint_rank_wedin_est_)

        # check identifiability constraint and possibly remove some
        # joint components
        if self.reconsider_joint_components:
            joint_scores, joint_svals, joint_loadings, self.joint_rank = \
                reconsider_joint_components(blocks, self.sv_threshold_,
                                            joint_scores, joint_svals, joint_loadings,
                                            self.joint_rank)

        # TODO: include center?
        # TODO: comp_names, var_names
        # The common joint space has now been estimated
        self.common = PCA.from_precomputed(
            scores=joint_scores[:, 0:self.joint_rank],
            svals=joint_svals[0:self.joint_rank],
            loadings=joint_loadings[:, 0:self.joint_rank],
            obs_names=obs_names)

        self.common.set_comp_names(base='common',
                                   zero_index=self.zero_index_names)

        #######################################
        # step 3: compute final decomposition #
        #######################################
        # this step computes the block specific estimates

        block_specific = {bn: {} for bn in block_names}
        for bn in block_names:
            X = blocks[bn]

            ########################################
            # step 3.1: block specific joint space #
            ########################################
            # project X onto the joint space then compute SVD
            if self.joint_rank != 0:
                if issparse(X):  # lazy evaluation for sparse matrices
                    J = col_proj(X, joint_scores)
                    U, D, V = svd_wrapper(J, self.joint_rank)
                    J = None  # kill J matrix to save memory

                else:
                    J = np.array(
                        np.dot(joint_scores, np.dot(joint_scores.T, X)))
                    U, D, V = svd_wrapper(J, self.joint_rank)
                    if not self.store_full:
                        J = None  # kill J matrix to save memory

            else:
                U, D, V = None, None, None
                if self.store_full:
                    J = np.zeros(shape=blocks[bn].shape)
                else:
                    J = None

            block_specific[bn]['joint'] = {
                'full': J,
                'scores': U,
                'svals': D,
                'loadings': V,
                'rank': self.joint_rank
            }

            #############################################
            # step 3.2: block specific individual space #
            #############################################
            # project X onto the orthogonal complement of the joint space,
            # estimate the individual rank, then compute SVD

            if issparse(X):  # lazy evaluation for sparse matrices
                # Bind the returned rank to `rank`: the results dict below
                # reads `rank`, so any other name would either raise
                # NameError or reuse the rank of the previous block.
                U, D, V, rank = indiv_space_for_sparse(
                    X, joint_scores, self.joint_rank,
                    self.init_signal_ranks[bn], self.sv_threshold_[bn])
                I = None

            else:

                # project X columns onto orthogonal complement of joint_scores
                if self.joint_rank == 0:
                    X_orthog = X
                else:
                    X_orthog = X - np.dot(joint_scores,
                                          np.dot(joint_scores.T, X))

                # estimate individual rank using sv threshold, then compute SVD
                if self.indiv_ranks[bn] is None:
                    max_rank = min(
                        X.shape) - self.joint_rank  # saves computation
                    U, D, V = svd_wrapper(X_orthog, max_rank)
                    rank = sum(D > self.sv_threshold_[bn])

                    if rank == 0:
                        U, D, V = None, None, None
                    else:
                        U = U[:, 0:rank]
                        D = D[0:rank]
                        V = V[:, 0:rank]

                    self.indiv_ranks[bn] = rank

                else:  # indiv_rank has been provided by the user
                    rank = self.indiv_ranks[bn]
                    if rank == 0:
                        U, D, V = None, None, None
                    else:
                        U, D, V = svd_wrapper(X_orthog, rank)

                if self.store_full:
                    if rank == 0:
                        I = np.zeros(shape=blocks[bn].shape)
                    else:
                        I = np.array(np.dot(U, np.dot(np.diag(D), V.T)))
                else:
                    I = None  # Kill I matrix to save memory

            block_specific[bn]['individual'] = {
                'full': I,
                'scores': U,
                'svals': D,
                'loadings': V,
                'rank': rank
            }

            ###################################
            # step 3.3: estimate noise matrix #
            ###################################

            # the noise matrix is only formed for dense blocks when the
            # full matrices are being stored
            if self.store_full and not issparse(X):
                E = X - (J + I)
            else:
                E = None
            block_specific[bn]['noise'] = E

        # save block specific estimates
        self.blocks = {}
        for bn in block_specific.keys():
            self.blocks[bn] = \
                BlockSpecificResults(joint=block_specific[bn]['joint'],
                                     individual=block_specific[bn]['individual'],
                                     noise=block_specific[bn]['noise'],
                                     CNS=joint_scores,
                                     block_name=bn,
                                     obs_names=obs_names,
                                     var_names=var_names[bn],
                                     m=self.centers_[bn],
                                     shape=blocks[bn].shape,
                                     zero_index_names=self.zero_index_names,
                                     init_signal_svd=init_signal_svd[bn],
                                     X=blocks[bn])

        return self
Ejemplo n.º 9
0
from jive.AJIVE import AJIVE
from jive.PCA import PCA
from jive.ajive_fig2 import generate_data_ajive_fig2
from jive.viz.block_visualization import data_block_heatmaps, jive_full_estimate_heatmaps
import matplotlib.pyplot as plt

# Generate the two example data blocks and save a heatmap of each.
X, Y = generate_data_ajive_fig2()
plt.figure(figsize=[6.5, 3])
data_block_heatmaps({'x': X, 'y': Y})
plt.savefig('figures/data_heatmaps.png', bbox_inches='tight')
plt.close()

# determine initial signal ranks by inspecting scree plots
# (one panel per block, side by side)
plt.figure(figsize=[8.4, 3])
for panel, block in enumerate((X, Y), start=1):
    plt.subplot(1, 2, panel)
    PCA().fit(block).plot_scree()
plt.savefig('figures/scree_plots.png', bbox_inches='tight')
plt.close()

# Fit AJIVE with the chosen initial signal ranks.
ajive = AJIVE(init_signal_ranks={'x': 2, 'y': 3})
ajive.fit(blocks={'x': X, 'y': Y})

# Save heatmaps of the full block estimates next to the data blocks.
plt.figure(figsize=[6.5, 12])
full_estimates = ajive.get_full_block_estimates()
jive_full_estimate_heatmaps(full_estimates, blocks={'x': X, 'y': Y})
plt.savefig('figures/jive_estimate_heatmaps.png', bbox_inches='tight')