Beispiel #1
0
def indiv_space_for_sparse(X, joint_scores, joint_rank, signal_rank,
                           sv_threshold):
    """
    Estimates the individual rank of X and returns the matching truncated SVD.

    X is first passed through col_proj_orthog together with joint_scores.
    A low rank SVD of the result is computed and the individual rank is taken
    to be the number of singular values above sv_threshold. If every computed
    singular value is above the threshold, the SVD rank was too small to pin
    the individual rank down, so the SVD is recomputed at a larger rank
    (growing by signal_rank each time, capped at the maximal rank) for at
    most three attempts.

    Parameters
    ----------
    X: the data block; only X.shape is read directly here.

    joint_scores: passed to col_proj_orthog along with X.

    joint_rank: int
        Subtracted from min(X.shape) to get the largest SVD rank considered.

    signal_rank: int
        Sets the initial SVD rank (1.2 * signal_rank, truncated to an int)
        and the amount the rank grows by on each retry.

    sv_threshold: float
        Singular values strictly larger than this count towards the
        individual rank.

    Returns
    -------
    U, D, V, indiv_rank
        The output of svd_wrapper truncated to the first indiv_rank
        components, followed by the estimated individual rank.

    Warns
    -----
    Issues a warning if the rank estimate still equals the SVD rank after the
    last retry and the maximal rank has not been reached.
    """
    # TODO: this could use lots of optimizing
    X_orthog = col_proj_orthog(X, joint_scores)

    # no need to ever look beyond this rank (saves computation)
    max_rank = min(X.shape) - joint_rank

    # initial guess; the factor 1.2 is somewhat arbitrary
    current_rank = min(int(1.2 * signal_rank), max_rank)
    U, D, V = svd_wrapper(X_orthog, current_rank)
    indiv_rank = sum(D > sv_threshold)

    # some computed singular value fell below the threshold: rank is settled
    if indiv_rank != current_rank:
        return (U[:, :indiv_rank], D[:indiv_rank], V[:, :indiv_rank],
                indiv_rank)

    # every singular value passed the threshold, so the SVD rank is too low
    for _ in range(3):

        # raise the guess at an upper bound for the individual rank
        current_rank = min(current_rank + signal_rank, max_rank)

        # TODO: possibly use svds_additional to speed up calculation
        U, D, V = svd_wrapper(X_orthog, current_rank)
        indiv_rank = sum(D > sv_threshold)

        # stop once the estimate drops below the SVD rank or there is no
        # larger rank left to try
        if indiv_rank < current_rank or current_rank == max_rank:
            break
    else:
        # the loop ran out of attempts without settling the rank
        warnings.warn('individual rank estimate probably too low')

    return U[:, :indiv_rank], D[:indiv_rank], V[:, :indiv_rank], indiv_rank
Beispiel #2
0
def _get_rand_sample(num_obs, signal_ranks):
    """
    Draws one sample of the squared largest singular value of a random
    joint matrix.

    For each entry r of signal_ranks, a (num_obs, r) standard normal matrix
    is sampled and replaced by the Q factor of its QR decomposition. The
    resulting bases are placed side by side and the leading singular value
    of that matrix is computed with svd_wrapper.

    Parameters
    ----------
    num_obs: int
        Number of rows of each sampled matrix.

    signal_ranks: list of ints
        Number of columns of each sampled matrix, one entry per block.

    Returns
    -------
    The square of the largest singular value.
    """
    def _random_basis(n_cols):
        # Q factor of a Gaussian matrix: a random orthonormal basis
        gaussian = np.random.normal(size=(num_obs, n_cols))
        return np.linalg.qr(gaussian)[0]

    bases = [_random_basis(r) for r in signal_ranks]

    # largest singular value of the random joint matrix
    joint_matrix = np.bmat(bases)
    _, svs, __ = svd_wrapper(joint_matrix, rank=1)

    return svs.item() ** 2
Beispiel #3
0
    def fit(self, X):
        """
        Computes the PCA decomposition of X.

        Parameters
        ----------
        X: {array-like, sparse matrix}, shape (n_samples, n_features)
            Fit PCA with data matrix X. If X is a pd.DataFrame, the observation
            and feature names will be extracted from its index/columns.
            Note X can be either dense or sparse.

        Returns
        -------
        self
            The fitted object. Sets shape_, m_, frob_norm_, var_expl_prop_,
            var_expl_cum_, scores_, svals_ and loadings_, and may overwrite
            n_components with the value returned by the argument checker.
        """
        self.shape_, obs_names, var_names, self.n_components = \
            _arg_checker(X, self.n_components)

        # possibly mean center X
        X, self.m_ = centering(X, self.center)

        # compute SVD
        U, D, V = svd_wrapper(X, self.n_components)

        # compute variance explained
        if self.n_components == min(X.shape):
            # all singular values are available, so the Frobenius norm can be
            # read off them instead of being computed from X
            self.frob_norm_ = np.sqrt(sum(D**2))
        else:
            self.frob_norm_ = _safe_frob_norm(X)
        self.var_expl_prop_ = D**2 / self.frob_norm_**2
        self.var_expl_cum_ = np.cumsum(self.var_expl_prop_)

        # store the decomposition before anything reads self.scores_; the
        # n_components fallback below previously ran first and hit an
        # attribute that did not exist yet
        self.scores_, self.svals_, self.loadings_ = \
            svd2pd(U, D, V, obs_names=obs_names, var_names=var_names)

        if self.n_components is None:
            self.n_components = self.scores_.shape[1]

        return self
Beispiel #4
0
    def fit(self, blocks, precomp_init_svd=None):
        """
        Fits the AJIVE decomposition.

        Parameters
        ----------
        blocks: list, dict
            The data matrices. If dict, will name blocks by keys, otherwise
            blocks are named by 0, 1, ...K. Data matrices must have observations
            on the rows and have the same number of observations i.e. the
            kth data matrix is shape (n_samples, n_features[k]).

        precomp_init_svd: {list, dict, None}, optional
            Precomputed initial SVD. Must have one entry for each data block.
            The SVD should be a 3 tuple (scores, svals, loadings), see output
            of jive.utils.svd_wrapper for formatting details.
            NOTE(review): after arg_checker, each non-None entry is read here
            through the keys 'scores', 'svals' and 'loadings' -- presumably
            arg_checker normalizes the input into that form; confirm.

        Returns
        -------
        self
            The fitted object. The common joint space is stored in
            self.common and the per-block results in self.blocks.
        """
        blocks, self.init_signal_ranks, self.indiv_ranks, precomp_init_svd,\
            self.center, obs_names, var_names, self.shapes_ = \
                arg_checker(blocks,
                            self.init_signal_ranks,
                            self.joint_rank,
                            self.indiv_ranks,
                            precomp_init_svd,
                            self.center)

        block_names = list(blocks.keys())
        num_obs = list(blocks.values())[0].shape[0]

        # center blocks
        self.centers_ = {}
        for bn in block_names:
            blocks[bn], self.centers_[bn] = centering(blocks[bn],
                                                      method=self.center[bn])

        ################################################################
        # step 1: initial signal space extraction by SVD on each block #
        ################################################################

        init_signal_svd = {}
        self.sv_threshold_ = {}
        for bn in block_names:

            # compute rank init_signal_ranks[bn] + 1 SVD of the data block
            if precomp_init_svd[bn] is None:
                # signal rank + 1 to get individual rank sv threshold
                U, D, V = svd_wrapper(blocks[bn],
                                      self.init_signal_ranks[bn] + 1)
            else:
                U = precomp_init_svd[bn]['scores']
                D = precomp_init_svd[bn]['svals']
                V = precomp_init_svd[bn]['loadings']

            # The SV threshold is halfway between the init_signal_ranks[bn]th
            # and init_signal_ranks[bn] + 1 st singular value. Recall that
            # python is zero indexed.
            self.sv_threshold_[bn] = (D[self.init_signal_ranks[bn] - 1]
                                      + D[self.init_signal_ranks[bn]]) / 2

            # keep only the leading init_signal_ranks[bn] components
            init_signal_svd[bn] = {
                'scores': U[:, 0:self.init_signal_ranks[bn]],
                'svals': D[0:self.init_signal_ranks[bn]],
                'loadings': V[:, 0:self.init_signal_ranks[bn]]
            }

        ##################################
        # step 2: joint space estimation #
        ##################################
        # this step estimates the joint rank and computes the common
        # joint space basis

        # SVD of joint signal matrix
        joint_scores_matrix = np.bmat(
            [init_signal_svd[bn]['scores'] for bn in block_names])
        joint_scores, joint_svals, joint_loadings = svd_wrapper(
            joint_scores_matrix)
        self.all_joint_svals_ = deepcopy(joint_svals)

        # estimate joint rank using wedin bound and random direction if a
        # joint rank estimate has not already been provided
        # TODO: maybe make this into an object or function
        if self.joint_rank is None:

            # if the random sv samples are not already provided compute them
            if self.random_sv_samples_ is None:
                self.random_sv_samples_ = \
                    sample_randdir(num_obs,
                                   signal_ranks=list(self.init_signal_ranks.values()),
                                   R=self.n_randdir_samples,
                                   n_jobs=self.n_jobs)

            # if the wedin samples are not already provided compute them
            if self.wedin_samples_ is None:
                self.wedin_samples_ = {}
                for bn in block_names:
                    self.wedin_samples_[bn] = \
                        get_wedin_samples(X=blocks[bn],
                                          U=init_signal_svd[bn]['scores'],
                                          D=init_signal_svd[bn]['svals'],
                                          V=init_signal_svd[bn]['loadings'],
                                          rank=self.init_signal_ranks[bn],
                                          R=self.n_wedin_samples,
                                          n_jobs=self.n_jobs)

            # combine the per-block wedin samples: number of blocks minus the
            # sum over blocks of the squared i-th sample
            self.wedin_sv_samples_ = len(blocks) - \
                np.array([sum(self.wedin_samples_[bn][i] ** 2 for bn in block_names)
                          for i in range(self.n_wedin_samples)])

            # given the wedin and random bound samples, compute the joint rank
            # SV cutoff
            self.wedin_cutoff_ = np.percentile(self.wedin_sv_samples_,
                                               self.wedin_percentile)
            self.rand_cutoff_ = np.percentile(self.random_sv_samples_,
                                              self.randdir_percentile)
            self.svalsq_cutoff_ = max(self.wedin_cutoff_, self.rand_cutoff_)
            self.joint_rank_wedin_est_ = sum(
                joint_svals**2 > self.svalsq_cutoff_)
            self.joint_rank = deepcopy(self.joint_rank_wedin_est_)

        # check identifiability constraint and possibly remove some
        # joint components
        if self.reconsider_joint_components:
            joint_scores, joint_svals, joint_loadings, self.joint_rank = \
                reconsider_joint_components(blocks, self.sv_threshold_,
                                            joint_scores, joint_svals, joint_loadings,
                                            self.joint_rank)

        # TODO: include center?
        # TODO: comp_names, var_names
        # The common joint space has now been estimated
        self.common = PCA.from_precomputed(
            scores=joint_scores[:, 0:self.joint_rank],
            svals=joint_svals[0:self.joint_rank],
            loadings=joint_loadings[:, 0:self.joint_rank],
            obs_names=obs_names)

        self.common.set_comp_names(base='common',
                                   zero_index=self.zero_index_names)

        #######################################
        # step 3: compute final decomposition #
        #######################################
        # this step computes the block specific estimates

        block_specific = {bn: {} for bn in block_names}
        for bn in block_names:
            X = blocks[bn]

            ########################################
            # step 3.1: block specific joint space #
            ########################################
            # project X onto the joint space then compute SVD
            if self.joint_rank != 0:
                if issparse(X):  # lazy evaluation for sparse matrices
                    J = col_proj(X, joint_scores)
                    U, D, V = svd_wrapper(J, self.joint_rank)
                    J = None  # kill J matrix to save memory

                else:
                    J = np.array(
                        np.dot(joint_scores, np.dot(joint_scores.T, X)))
                    U, D, V = svd_wrapper(J, self.joint_rank)
                    if not self.store_full:
                        J = None  # kill J matrix to save memory

            else:
                U, D, V = None, None, None
                if self.store_full:
                    J = np.zeros(shape=blocks[bn].shape)
                else:
                    J = None

            block_specific[bn]['joint'] = {
                'full': J,
                'scores': U,
                'svals': D,
                'loadings': V,
                'rank': self.joint_rank
            }

            #############################################
            # step 3.2: block specific individual space #
            #############################################
            # project X onto the orthogonal complement of the joint space,
            # estimate the individual rank, then compute SVD

            if issparse(X):  # lazy evaluation for sparse matrices
                # Bind the estimated rank to `rank`, the name stored in the
                # results below. It was previously bound to a different name,
                # leaving `rank` unbound (or stale from an earlier dense
                # block) on this path.
                U, D, V, rank = indiv_space_for_sparse(
                    X, joint_scores, self.joint_rank,
                    self.init_signal_ranks[bn], self.sv_threshold_[bn])
                I = None

            else:

                # project X columns onto orthogonal complement of joint_scores
                if self.joint_rank == 0:
                    X_orthog = X
                else:
                    X_orthog = X - np.dot(joint_scores,
                                          np.dot(joint_scores.T, X))

                # estimate individual rank using sv threshold, then compute SVD
                if self.indiv_ranks[bn] is None:
                    max_rank = min(
                        X.shape) - self.joint_rank  # saves computation
                    U, D, V = svd_wrapper(X_orthog, max_rank)
                    rank = sum(D > self.sv_threshold_[bn])

                    if rank == 0:
                        U, D, V = None, None, None
                    else:
                        U = U[:, 0:rank]
                        D = D[0:rank]
                        V = V[:, 0:rank]

                    # remember the estimate
                    self.indiv_ranks[bn] = rank

                else:  # indiv_rank has been provided by the user
                    rank = self.indiv_ranks[bn]
                    if rank == 0:
                        U, D, V = None, None, None
                    else:
                        U, D, V = svd_wrapper(X_orthog, rank)

                if self.store_full:
                    if rank == 0:
                        I = np.zeros(shape=blocks[bn].shape)
                    else:
                        I = np.array(np.dot(U, np.dot(np.diag(D), V.T)))
                else:
                    I = None  # Kill I matrix to save memory

            block_specific[bn]['individual'] = {
                'full': I,
                'scores': U,
                'svals': D,
                'loadings': V,
                'rank': rank
            }

            ###################################
            # step 3.3: estimate noise matrix #
            ###################################

            # the full J and I matrices are only both available for dense
            # blocks when store_full is set
            if self.store_full and not issparse(X):
                E = X - (J + I)
            else:
                E = None
            block_specific[bn]['noise'] = E

        # save block specific estimates
        self.blocks = {}
        for bn in block_specific.keys():
            self.blocks[bn] = \
                BlockSpecificResults(joint=block_specific[bn]['joint'],
                                     individual=block_specific[bn]['individual'],
                                     noise=block_specific[bn]['noise'],
                                     CNS=joint_scores,
                                     block_name=bn,
                                     obs_names=obs_names,
                                     var_names=var_names[bn],
                                     m=self.centers_[bn],
                                     shape=blocks[bn].shape,
                                     zero_index_names=self.zero_index_names,
                                     init_signal_svd=init_signal_svd[bn],
                                     X=blocks[bn])

        return self