Example #1
    def fit(self, X, Y):
        # copy since this will contain the centered data
        check_consistent_length(X, Y)
        X = check_array(X, dtype=np.float64, copy=self.copy)
        Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        if self.n_components > max(Y.shape[1], X.shape[1]):
            raise ValueError("Invalid number of components n_components=%d"
                             " with X of shape %s and Y of shape %s."
                             % (self.n_components, str(X.shape), str(Y.shape)))

        # Scale (in place)
        X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
            _center_scale_xy(X, Y, self.scale))
        # svd(X'Y)
        C = np.dot(X.T, Y)

        # The arpack svds solver only works if the number of extracted
        # components is smaller than rank(X) - 1. Hence, if we want to extract
        # all the components (C.shape[1]), we have to use another one. Else,
        # let's use arpack to compute only the interesting components.
        if self.n_components >= np.min(C.shape):
            U, s, V = linalg.svd(C, full_matrices=False)
        else:
            U, s, V = arpack.svds(C, k=self.n_components)
        # Deterministic output
        U, V = svd_flip(U, V)
        V = V.T
        self.x_scores_ = np.dot(X, U)
        self.y_scores_ = np.dot(Y, V)
        self.x_weights_ = U
        self.y_weights_ = V
        return self
Example #2
def fast_svd(X, n_components, random_state=None):
    """ Automatically switch between randomized and lapack SVD (heuristic
        of scikit-learn).

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data to decompose

    n_components : integer
        The order of the dimensionality of the truncated SVD

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.

    Returns
    -------

    U : array, shape (n_samples, n_components)
        The first matrix of the truncated svd

    S : array, shape (n_components)
        The singular values of the truncated svd

    V : array, shape (n_components, n_features)
        The last matrix of the truncated svd

    """
    random_state = check_random_state(random_state)
    # Small problem, just call full PCA
    if max(X.shape) <= 500:
        svd_solver = 'full'
    elif n_components >= 1 and n_components < .8 * min(X.shape):
        svd_solver = 'randomized'
    # This is also the case of n_components in (0,1)
    else:
        svd_solver = 'full'

    # Call different fits for either full or truncated SVD
    if svd_solver == 'full':
        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)
        # The "copy" are there to free the reference on the non reduced
        # data, and hence clear memory early
        U = U[:, :n_components].copy()
        S = S[:n_components]
        V = V[:n_components].copy()
    else:
        if LooseVersion(sklearn.__version__) >= LooseVersion('0.17'):
            n_iter = 'auto'
        else:
            n_iter = 3
        U, S, V = randomized_svd(X, n_components=n_components,
                                 n_iter=n_iter,
                                 flip_sign=True,
                                 random_state=random_state)
    return U, S, V
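A minimal usage sketch for fast_svd, not part of the original snippet, assuming the module-level imports it relies on (numpy, scipy.linalg, sklearn's randomized_svd, svd_flip and check_random_state) are in place; the data and shapes below are illustrative only:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(1000, 50)                  # large enough to take the randomized branch
U, S, V = fast_svd(X, n_components=5, random_state=0)
X_hat = (U * S) @ V                      # rank-5 approximation of X
print(U.shape, S.shape, V.shape)         # (1000, 5) (5,) (5, 50)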
Example #3
def pca_with_sparse(X, npcs, solver='arpack', mu=None, random_state=None):
    random_state = check_random_state(random_state)
    np.random.set_state(random_state.get_state())
    random_init = np.random.rand(np.min(X.shape))
    X = check_array(X, accept_sparse=['csr', 'csc'])

    if mu is None:
        mu = X.mean(0).A.flatten()[None, :]
    mdot = mu.dot
    mmat = mdot
    mhdot = mu.T.dot
    mhmat = mu.T.dot
    Xdot = X.dot
    Xmat = Xdot
    XHdot = X.T.conj().dot
    XHmat = XHdot
    ones = np.ones(X.shape[0])[None, :].dot

    def matvec(x):
        return Xdot(x) - mdot(x)

    def matmat(x):
        return Xmat(x) - mmat(x)

    def rmatvec(x):
        return XHdot(x) - mhdot(ones(x))

    def rmatmat(x):
        return XHmat(x) - mhmat(ones(x))

    XL = sparse.linalg.LinearOperator(
        matvec=matvec,
        dtype=X.dtype,
        matmat=matmat,
        shape=X.shape,
        rmatvec=rmatvec,
        rmatmat=rmatmat,
    )

    u, s, v = sparse.linalg.svds(XL, solver=solver, k=npcs, v0=random_init)
    u, v = svd_flip(u, v)
    idx = np.argsort(-s)
    v = v[idx, :]

    X_pca = (u * s)[:, idx]
    ev = s[idx]**2 / (X.shape[0] - 1)

    total_var = _get_mean_var(X)[1].sum()
    ev_ratio = ev / total_var

    output = {
        'X_pca': X_pca,
        'variance': ev,
        'variance_ratio': ev_ratio,
        'components': v,
    }
    return output
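A minimal usage sketch for pca_with_sparse, not from the original source; it assumes the snippet's own imports (numpy, scipy.sparse, sklearn's check_array/check_random_state/svd_flip) plus the project helper _get_mean_var, which is expected to return per-feature means and variances:

import numpy as np
from scipy import sparse

X = sparse.random(500, 200, density=0.1, format='csr', random_state=0)
res = pca_with_sparse(X, npcs=10, random_state=0)
print(res['X_pca'].shape)              # (500, 10)
print(res['variance_ratio'].sum())     # fraction of total variance captured by 10 PCs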
Example #4
def fast_svd(X, n_components, random_state=None):
    """ Automatically switch between randomized and lapack SVD (heuristic
        of scikit-learn).

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data to decompose

    n_components : integer
        The order of the dimensionality of the truncated SVD

    random_state : int or RandomState, optional
        Pseudo-random number generator state used for random sampling.

    Returns
    -------

    U : array, shape (n_samples, n_components)
        The first matrix of the truncated svd

    S : array, shape (n_components)
        The singular values of the truncated svd

    V : array, shape (n_components, n_features)
        The last matrix of the truncated svd

    """
    random_state = check_random_state(random_state)
    # Small problem, just call full PCA
    if max(X.shape) <= 500:
        svd_solver = 'full'
    elif n_components >= 1 and n_components < .8 * min(X.shape):
        svd_solver = 'randomized'
    # This is also the case of n_components in (0,1)
    else:
        svd_solver = 'full'

    # Call different fits for either full or truncated SVD
    if svd_solver == 'full':
        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)
        # The "copy" are there to free the reference on the non reduced
        # data, and hence clear memory early
        U = U[:, :n_components].copy()
        S = S[:n_components]
        V = V[:n_components].copy()
    else:
        n_iter = 'auto'

        U, S, V = randomized_svd(X,
                                 n_components=n_components,
                                 n_iter=n_iter,
                                 flip_sign=True,
                                 random_state=random_state)
    return U, S, V
Example #5
    def _fit_full(self, X, n_components):
        """Fit the model by computing full SVD on X"""
        n_samples, n_features = X.shape

        if n_components == 'mle':
            if n_samples < n_features:
                raise ValueError("n_components='mle' is only supported "
                                 "if n_samples >= n_features")
        elif not 0 <= n_components <= n_features:
            raise ValueError("n_components=%r must be between 0 and "
                             "n_features=%r with svd_solver='full'"
                             % (n_components, n_features))

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)

        components_ = V

        # Get variance explained by singular values
        explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = S.copy()  # Store the singular values.

        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, V
Example #6
def svd_wrapper(X, rank=None):
    """
    Computes the (possibly partial) SVD of a matrix. Handles the case where
    X is either dense or sparse.

    Parameters
    ----------
    X: array-like,  shape (N, D)

    rank: int, None
        rank of the desired SVD.
        If None, will compute the largest min(X.shape) singular value/vectors.

    Output
    ------
    U, D, V

    U: array-like, shape (N, rank)
        Orthonormal matrix of left singular vectors.

    D: list, shape (rank, )
        Singular values in non-increasing order (e.g. D[0] is the largest).

    V: array-like, shape (D, rank)
        Orthonormal matrix of right singular vectors

    """
    # TODO: give user option to compute randomized SVD

    if rank is None:
        rank = min(X.shape)

    rank = int(rank)
    assert 1 <= rank and rank <= min(X.shape)

    if rank <= min(X.shape) - 1:
        scipy_svds = svds(X, rank)
        U, D, V = fix_scipy_svds(scipy_svds)

    else:
        assert not issparse(X)

        U, D, V = full_svd(X, full_matrices=False)
        V = V.T

        if rank:
            U = U[:, :rank]
            D = D[:rank]
            V = V[:, :rank]

    # enforce deterministic output
    U, V = svd_flip(U, V.T)
    V = V.T

    return U, D, V
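A minimal usage sketch, assuming the imports the function relies on (scipy.sparse.linalg.svds as svds, scipy.linalg.svd as full_svd, scipy.sparse.issparse, sklearn's svd_flip) and the project helper fix_scipy_svds (presumably reordering scipy's ascending svds output) are defined in the module:

import numpy as np
from scipy import sparse

X = sparse.random(100, 40, density=0.2, format='csr', random_state=0)
U, D, V = svd_wrapper(X, rank=5)
print(U.shape, len(D), V.shape)        # (100, 5) 5 (40, 5)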
Example #7
def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
                   flip_sign=True, random_state=0):
    """Compute the randomized PCA decomposition of a given matrix.

    This method differs from the scikit-learn implementation in that it supports
    and handles sparse matrices well.

    """
    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See sklearn #5299
        n_iter = 7 if n_components < .1 * min(A.shape) else 4

    n_samples, n_features = A.shape

    c = np.atleast_2d(A.mean(axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
        R, s, V = svd(QA.T, full_matrices=False)
        U = Q.dot(R)

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
        U, s, R = svd(QA, full_matrices=False)
        V = R.dot(Q.T)

    if flip_sign:
        U, V = svd_flip(U, V)

    return U[:, :n_components], s[:n_components], V[:n_components, :]
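A minimal usage sketch, not from the original source, assuming numpy, scipy.sparse and the imports the function uses (safe_sparse_dot, scipy.linalg's lu/qr/svd, sklearn's svd_flip) are available; the code calls random_state.normal, so a RandomState instance is passed here:

import numpy as np
from scipy import sparse

A = sparse.random(300, 1000, density=0.05, format='csr', random_state=0)
rng = np.random.RandomState(0)
U, s, V = randomized_pca(A, n_components=10, random_state=rng)
scores = U * s                         # PCA scores of the implicitly centered data
print(scores.shape, V.shape)           # (300, 10) (10, 1000)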
Example #8
def D_tau_old(M, tau=None, l=5):

    if not tau:
        tau = 5 * np.sum(M.shape) / 2
    #r is rank(M)
    r = 0
    sk = r + 1
    agl = 'arpack'
    #agl = 'lobpcg'

    (U, S, VT) = svds(M, k=min(sk, min(M.shape) - 1), solver=agl)
    S = S[::-1]
    U, VT = svd_flip(U[:, ::-1], VT[::-1])

    while np.min(S) >= tau:
        sk = sk + l
        (U, S, VT) = svds(M, k=min(sk, min(M.shape) - 1), solver=agl)
        S = S[::-1]
        U, VT = svd_flip(U[:, ::-1], VT[::-1])
        print("min S:")
        print(np.min(S))
        print("sk:")
        print(sk)
        print("tau in D_tau:")
        print(tau)
    shrink_S = np.maximum(S - tau, 0)
    r = np.count_nonzero(shrink_S)
    diag_shrink_S = np.diag(shrink_S)
    res = np.linalg.multi_dot([U, diag_shrink_S, VT])
    '''
    s_thresh = np.maximum(S - tau, 0)
    rank = (s_thresh > 0).sum()
    s_thresh = s_thresh[:rank]
    U_thresh = U[:, :rank]
    VT_thresh = VT[:rank, :]
    S_thresh = np.diag(s_thresh)
    #res = np.dot(U_thresh, np.dot(S_thresh, VT_thresh))
    del U
    del VT
    res = np.linalg.multi_dot([U_thresh, S_thresh, VT_thresh])
    '''
    return res
Example #9
    def fit_transform(self, X, y=None):
        """ Fit LSI model to X and perform dimensionality reduction on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        Returns
        -------

        X_new : array, shape (n_samples, n_components)
            Reduced version of X. This will always be a dense array.
        """
        X = as_float_array(X, copy=False)
        random_state = check_random_state(self.random_state)

        # If sparse and not csr or csc, convert to csr
        if sp.issparse(X) and X.getformat() not in ["csr", "csc"]:
            X = X.tocsr()

        if self.algorithm == "arpack":
            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            Sigma = Sigma[::-1]
            U, VT = svd_flip(U[:, ::-1], VT[::-1])

        elif self.algorithm == "randomized":
            k = self.n_components
            n_features = X.shape[1]
            if k >= n_features:
                raise ValueError("n_components must be < n_features;"
                                 " got %d >= %d" % (k, n_features))
            U, Sigma, VT = randomized_svd(X,
                                          self.n_components,
                                          n_iter=self.n_iter,
                                          random_state=random_state)
        else:
            raise ValueError("unknown algorithm %r" % self.algorithm)

        self.components_ = VT
        self.Sigma = Sigma[:self.n_components]

        # Calculate explained variance & explained variance ratio
        X_transformed = np.dot(U, np.diag(Sigma))
        self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
        if sp.issparse(X):
            _, full_var = mean_variance_axis(X, axis=0)
            full_var = full_var.sum()
        else:
            full_var = np.var(X, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var
        return X_transformed
Example #10
def _my_svd(M, k, algorithm):
    if algorithm == 'randomized':
        (U, S, V) = randomized_svd(M,
                                   n_components=min(k, M.shape[1] - 1),
                                   n_oversamples=20)
    elif algorithm == 'arpack':
        (U, S, V) = svds(M, k=min(k, min(M.shape) - 1))
        S = S[::-1]
        U, V = svd_flip(U[:, ::-1], V[::-1])
    else:
        raise ValueError("unknown algorithm")
    return (U, S, V)
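A minimal usage sketch, assuming numpy, scipy.sparse.linalg.svds and sklearn's randomized_svd/svd_flip are imported in the module:

import numpy as np

M = np.random.RandomState(0).randn(30, 20)
U, S, V = _my_svd(M, k=5, algorithm='arpack')
# each returned singular triplet satisfies M @ v_i ~= s_i * u_i
print(np.allclose(M @ V.T, U * S, atol=1e-8))   # True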
Example #11
def eigh_wrapper(A, B=None, rank=None, eval_descending=True):
    """
    Symmetric eigenvector or generalized eigenvector problem.

    A v = lambda v

    or

    A v = lambda B v

    where A (and B) are symmetric (hermitian).

    Parameters
    ----------
    A: array-like, shape (n x n)

    B: None, array-like, shape (n x n)

    rank: None, int
        Number of eigenvalue/eigenvector pairs to compute. If None, all are
        computed.

    eval_descending: bool
        Whether to compute the largest or smallest eigenvalues.
        If True, will compute the largest rank eigenvalues and
        eigenvalues are returned in descending order. Otherwise,
        computes smallest eigenvalues and returns them in ascending order.

    Output
    ------
    evals, evecs

    """

    if rank is not None:
        n_max_evals = A.shape[0]

        if eval_descending:
            eigvals_idxs = (n_max_evals - rank, n_max_evals - 1)
        else:
            eigvals_idxs = (0, rank - 1)
    else:
        eigvals_idxs = None

    evals, evecs = eigh(a=A, b=B, eigvals=eigvals_idxs)

    if eval_descending:
        ev_reordering = np.argsort(-evals)
        evals = evals[ev_reordering]
        evecs = evecs[:, ev_reordering]

    evecs = svd_flip(evecs, evecs.T)[0]

    return evals, evecs
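A minimal usage sketch, assuming numpy, scipy.linalg.eigh (imported as eigh) and sklearn's svd_flip are available in the module:

import numpy as np

rng = np.random.RandomState(0)
M = rng.randn(20, 20)
A = M @ M.T                                    # symmetric positive semi-definite matrix
evals, evecs = eigh_wrapper(A, rank=3)
print(evals)                                   # three largest eigenvalues, in descending order
print(np.allclose(A @ evecs, evecs * evals))   # True: A v = lambda v for each column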
Example #12
def test_svd_flip_1d():
    # Make sure svd_flip_1d is equivalent to svd_flip
    u = np.array([1, -4, 2])
    v = np.array([1, 2, 3])

    u_expected, v_expected = svd_flip(u.reshape(-1, 1), v.reshape(1, -1))
    _svd_flip_1d(u, v)  # inplace

    assert_allclose(u, u_expected.ravel())
    assert_allclose(u, [-1, 4, -2])

    assert_allclose(v, v_expected.ravel())
    assert_allclose(v, [-1, -2, -3])
Example #13
    def _decompose_full(self, mat):

        if self.n_components != "mle":
            if not 0 <= self.n_components <= self.n_samples_:
                raise ValueError("n_components=%r must be between 0 and "
                                 "n_samples=%r with "
                                 "svd_solver='%s'" % (
                                     self.n_components,
                                     self.n_samples_,
                                     self.svd_solver,
                                 ))
            elif self.n_components >= 1:
                if not isinstance(self.n_components, numbers.Integral):
                    raise ValueError(
                        "n_components=%r must be of type int "
                        "when greater than or equal to 1, "
                        "was of type=%r" %
                        (self.n_components, type(self.n_components)))

        U, S, Vt = linalg.svd(mat, full_matrices=False)
        U[:, S < self.tol] = 0.0
        Vt[S < self.tol] = 0.0
        S[S < self.tol] = 0.0

        # flip eigenvectors' sign to enforce deterministic output
        U, Vt = svd_flip(U, Vt)

        # Get variance explained by singular values
        explained_variance_ = (S**2) / (self.n_samples_ - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var

        # Postprocess the number of components required
        if self.n_components == "mle":
            self.n_components = _infer_dimension(explained_variance_,
                                                 self.n_samples_)
        elif 0 < self.n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            # side='right' ensures that number of features selected
            # their variance is always greater than self.n_components float
            # passed. More discussion in issue: #15669
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            self.n_components = (np.searchsorted(
                ratio_cumsum, self.n_components, side="right") + 1)
        self.n_components = self.n_components
        return (
            U[:, :self.n_components],
            S[:self.n_components],
            Vt[:self.n_components],
        )
Example #14
    def _decompose_truncated(self, mat):

        if not 1 <= self.n_components <= self.n_samples_:
            raise ValueError("n_components=%r must be between 1 and "
                             "n_samples=%r with "
                             "svd_solver='%s'" % (
                                 self.n_components,
                                 self.n_samples_,
                                 self.svd_solver,
                             ))
        elif not isinstance(self.n_components, numbers.Integral):
            raise ValueError(
                "n_components=%r must be of type int "
                "when greater than or equal to 1, was of type=%r" %
                (self.n_components, type(self.n_components)))
        elif self.svd_solver == "arpack" and self.n_components == self.n_samples_:
            raise ValueError("n_components=%r must be strictly less than "
                             "n_samples=%r with "
                             "svd_solver='%s'" % (
                                 self.n_components,
                                 self.n_samples_,
                                 self.svd_solver,
                             ))

        random_state = check_random_state(self.random_state)

        if self._fit_svd_solver == "arpack":
            v0 = _init_arpack_v0(min(mat.shape), random_state)
            U, S, Vt = svds(mat, k=self.n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, Vt = svd_flip(U[:, ::-1], Vt[::-1])

        # We have already eliminated all other solvers, so this must be "randomized"
        else:
            # sign flipping is done inside
            U, S, Vt = randomized_svd(
                mat,
                n_components=self.n_components,
                n_iter=self.iterated_power,
                flip_sign=True,
                random_state=random_state,
            )

        U[:, S < self.tol] = 0.0
        Vt[S < self.tol] = 0.0
        S[S < self.tol] = 0.0

        return U, S, Vt
Example #15
def test_svd_flip():
    """Check that svd_flip works in both situations, and reconstructs input."""
    rs = np.random.RandomState(1999)
    n_samples = 20
    n_features = 10
    X = rs.randn(n_samples, n_features)

    # Check matrix reconstruction
    U, S, V = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, V, u_based_decision=False)
    assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, V = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, V, u_based_decision=True)
    assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
    assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
    assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
Example #16
def pca_fit_full_daal(X, n_components):

    fit_result, eigenvalues, eigenvectors, S = pca_fit_daal(X, min(X.shape), 'svdDense')
    U = pca_transform_daal(fit_result, X, min(X.shape), X.shape[0],
                           eigenvalues, eigenvectors,
                           whiten=True, scale_eigenvalues=True)
    V = fit_result.eigenvectors

    U, V = svd_flip(U, V)

    eigenvalues = fit_result.eigenvalues[:n_components].copy()
    eigenvectors = fit_result.eigenvectors[:n_components].copy()

    return fit_result, eigenvalues, eigenvectors, U, S, V
Example #17
def test_svd_flip():
    # Check that svd_flip works in both situations, and reconstructs input.
    rs = np.random.RandomState(1999)
    n_samples = 20
    n_features = 10
    X = rs.randn(n_samples, n_features)

    # Check matrix reconstruction
    U, S, V = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, V, u_based_decision=False)
    assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, V = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, V, u_based_decision=True)
    assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
    assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
    assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
Example #18
def pod(X, rank=6):
    # --> Compute the rank-k truncated SVD of X.
    U, Σ, Vh = svds(X, k=rank)

    # --> ARPACK does not abide by SVD convention.
    idx = np.argsort(-Σ)
    Σ = Σ[idx]

    # --> Sign correction to ensure deterministic output from SVD.
    U, Vh = svd_flip(U[:, idx], Vh[idx])

    # --> Low-dimensional PCA state vector.
    a = np.diag(Σ) @ Vh

    return U, a, Σ**2
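A minimal usage sketch, assuming numpy, scipy.sparse.linalg.svds (as svds) and sklearn's svd_flip are imported at module level; the snapshot matrix below is illustrative:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(64, 200)           # e.g. 64 spatial points x 200 snapshots
U, a, energy = pod(X, rank=6)
print(U.shape, a.shape)          # (64, 6) POD modes, (6, 200) low-dimensional state vectors
print(energy)                    # squared singular values of the retained modes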
Example #19
def customDomainSVD_accel(D, f, e, U, W, Bi, n, m, svd_power, t):
    Q = range_finder_domain_accel(D, f, e, U, W, Bi, n, m, svd_power, t)
    Bt = np.zeros((n, Q.shape[1]), dtype=np.float32)

    # B.T = dot(E.T, Q)
    halko.matMulTrans_SVD_domain_accel(D, f, U, W, Q, Bt, Bi, n, m, t)

    # SVD on thin matrix
    Uhat, s, V = linalg.svd(Bt.T, full_matrices=False)
    del Bt
    U = np.dot(Q, Uhat)

    # Correct sign
    U, V = svd_flip(U, V)
    return U[:, :e], s[:e], V[:e, :]
Example #20
    def _sparpack_wrapper(self,
                          k=None,
                          ncv=None,
                          tol=0,
                          v0=None,
                          maxiter=None):
        """Wrapper for scipy.sparse.linalg.svds

        Apply Singular Value Decomposition to the embedding matrix of shape 
        (`M`, `N`) using the `scipy.sparse.linalg.svds`_ algorithm. 

        Parameters
        ----------


        See Also
        --------
        
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html

        """
        # Matrix to be decomposed

        x = csc_matrix(self._embedseries())

        # Default k value is full svd

        if k is None:
            k = min(x.shape) - 1

        u, s, v = sparpack(x,
                           k=k,
                           ncv=ncv,
                           tol=tol,
                           which='LM',
                           v0=v0,
                           maxiter=maxiter,
                           return_singular_vectors=True)

        # with this implementation the singular values/vectors need to be
        # flipped to match the lapack ordering, and sign ambiguities have to
        # be resolved with svd_flip to force deterministic output

        u, v = svd_flip(u[:, ::-1], v[::-1, :])

        self.svd = [np.matrix(u), s[::-1], np.matrix(v)]

        return self.svd
Example #21
def D_svt(M, sk):

    agl = 'arpack'
    #agl = 'lobpcg'

    (U, S, VT) = svds(M, k=min(sk, min(M.shape) - 1), solver=agl)
    S = S[::-1]
    U, VT = svd_flip(U[:, ::-1], VT[::-1])

    #shrink_S = np.maximum(S - tau, 0)
    diag_shrink_S = np.diag(S)
    res = np.linalg.multi_dot([U, diag_shrink_S, VT])
    print('D_svt S shape')
    print(diag_shrink_S.shape)

    return res
Example #22
def svd_wrapper(Y, k, method='svds'):
    if method == 'svds':
        Ut, St, Vt = svds(Y, k)
        idx = np.argsort(St)[::-1]
        St = St[idx]  # have issue with sorting zero singular values
        Ut, Vt = svd_flip(Ut[:, idx], Vt[idx])
    elif method == 'random':
        Ut, St, Vt = randomized_svd(Y, k)
    else:
        Ut, St, Vt = np.linalg.svd(Y, full_matrices=False)
        # now truncate it to k
        Ut = Ut[:, :k]
        St = np.diag(St[:k])
        Vt = Vt[:k, :]

    return Ut, St, Vt
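A minimal usage sketch, assuming numpy, scipy.sparse.linalg.svds and sklearn's randomized_svd/svd_flip are imported in the module. Note that the 'svds' and 'random' branches return St as a 1-d array of singular values, while the full-SVD fallback returns np.diag(St[:k]); callers have to handle both forms.

import numpy as np

Y = np.random.RandomState(0).randn(50, 30)
Ut, St, Vt = svd_wrapper(Y, k=5, method='svds')
print(Ut.shape, St.shape, Vt.shape)    # (50, 5) (5,) (5, 30)
Y5 = Ut @ np.diag(St) @ Vt             # best rank-5 approximation of Y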
Example #23
    def fit(self):
        """Fit the model by computing full SVD on m.

        SVD factors the matrix m as u * np.diag(s) * v, where u and v are
        unitary and s is a 1-d array of m's singular values.  Note that the SVD
        is commonly written as a = U S V.H, and the v returned by this function
        is V.H (the Hermitian transpose).  Therefore, we denote V.H as vt, and
        back into the actual v, denoted just v.

        The decomposition uses np.linalg.svd with full_matrices=False, so for
        m with shape (M, N), then the shape of:
         - u is (M, K)
         - v is (K, N)
        where K = min(M, N)

        Inertia is the percentage of explained variance.

        Returns
        -------
        self, to enable method chaining
        """

        self.n_samples, self.n_features = self.ms.shape
        self.u, self.s, self.vt = np.linalg.svd(self.ms, full_matrices=False)
        self.v = self.vt.T

        # sklearn's implementation is to guarantee that the left and right
        # singular vectors (U and V) are always the same, by imposing
        # that the largest coefficient of U in absolute value is positive
        # This implementation uses u_based_decision=False rather than the
        # default True to flip that logic and ensure the resulting
        # components and loadings have high positive coefficients
        self.u, self.vt = svd_flip(
            self.u, self.vt, u_based_decision=self.u_based_decision
        )
        self.v = self.vt.T

        # Keep only eigenvalues with value > threshold
        # *keep* is number of components retained
        self.eigenvalues = self.s ** 2 / self.n_samples
        self.keep = np.count_nonzero(self.eigenvalues > self.threshold)

        self.inertia = (self.eigenvalues / self.eigenvalues.sum())[: self.keep]
        self.cumulative_inertia = self.inertia.cumsum()[: self.keep]
        self.eigenvalues = self.eigenvalues[: self.keep]

        return self
Example #24
    def fit(self, X, Y):
        """Fit model to data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of predictors.

        Y : array-like, shape = [n_samples, n_targets]
            Target vectors, where n_samples is the number of samples and
            n_targets is the number of response variables.
        """
        # copy since this will contain the centered data
        check_consistent_length(X, Y)
        X = check_array(X, dtype=np.float64, copy=self.copy)
        Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        if self.n_components > max(Y.shape[1], X.shape[1]):
            raise ValueError("Invalid number of components n_components=%d"
                             " with X of shape %s and Y of shape %s." %
                             (self.n_components, str(X.shape), str(Y.shape)))

        # Scale (in place)
        X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
            _center_scale_xy(X, Y, self.scale))
        # svd(X'Y)
        C = np.dot(X.T, Y)

        # The arpack svds solver only works if the number of extracted
        # components is smaller than rank(X) - 1. Hence, if we want to extract
        # all the components (C.shape[1]), we have to use another one. Else,
        # let's use arpack to compute only the interesting components.
        if self.n_components >= np.min(C.shape):
            U, s, V = svd(C, full_matrices=False)
        else:
            U, s, V = svds(C, k=self.n_components)
        # Deterministic output
        U, V = svd_flip(U, V)
        V = V.T
        self.x_scores_ = np.dot(X, U)
        self.y_scores_ = np.dot(Y, V)
        self.x_weights_ = U
        self.y_weights_ = V
        return self
Example #25
 def partial_fit_transform(self, X):
     shp = X.shape
     if X.ndim == 4:
         X = np.reshape(X, (shp[0], -1))
     elif X.ndim == 5:
         X = np.reshape(X, (shp[0] * shp[1], -1))
     whr = np.where(np.any(X != self.pad_value, axis=1))[0]
     if len(whr) > 0:
         if self.n_samples_seen < self.n_samples_train:
             self.lock.acquire()
             try:
                 # Update stats - they are 0 if this is the first step
                 col_mean, col_var, n_total_samples = \
                     _incremental_mean_and_var(
                         X[whr], last_mean=self.mean, last_variance=self.var, last_sample_count=np.repeat(self.n_samples_seen, X[whr].shape[1]))
                 n_total_samples = n_total_samples[0]
                 if self.n_samples_seen == 0:
                     X[whr] = X[whr] - col_mean
                     _X = X[whr]
                 else:
                     col_batch_mean = np.mean(X[whr], axis=0)
                     X[whr] = X[whr] - col_batch_mean
                     # Build matrix of combined previous basis and new data
                     mean_correction = np.sqrt(
                         (self.n_samples_seen * X[whr].shape[0]) /
                         n_total_samples) * (self.mean - col_batch_mean)
                     _X = np.vstack((self.singular_values.reshape(
                         (-1, 1)) * self.components, X[whr],
                                     mean_correction))
                 U, S, V = np.linalg.svd(_X, full_matrices=False)
                 U, V = svd_flip(U, V, u_based_decision=False)
                 explained_variance = S**2 / (n_total_samples - 1)
                 self.n_samples_seen = n_total_samples
                 self.components = V[:self.max_components]
                 self.singular_values = S[:self.max_components]
                 self.mean = col_mean
                 self.var = col_var
                 self.explained_variance = explained_variance[:self.max_components]
             finally:
                 self.lock.release()
         else:
             X[whr] = X[whr] - self.mean
         X[whr] = np.dot((np.dot(X[whr], self.components.T) /
                          np.sqrt(self.explained_variance + self.epsilon)),
                         self.components)
     return np.reshape(X, shp)
Example #26
def sym_eigen_to_mat_singular(V, s, method='mult'):
    """returns V, sigmas"""
    if method == 'mult':
        V, s = V, np.sqrt(s)
    elif method == 'block':
        rank = np.argwhere(s > 1e-15).size

        assert rank % 2 == 0  # rank should be even, else we have a
        # non-convergence issue
        ind_vo = 0
        Vo = np.empty((A_n, rank // 2))
        for i in range(0, s.size - 1, 2):
            j = i
            if i == 0:
                if np.abs(np.inner(V[A_m:, 1], V[A_m:, 1]) -
                          1) < np.abs(np.inner(V[A_m:, 0], V[A_m:, 0]) - 1):
                    j = 1
            else:
                viol_ni = np.abs(np.inner(V[A_m:, i], V[A_m:, i]) - 1)
                viol_nj = np.abs(np.inner(V[A_m:, i + 1], V[A_m:, i + 1]) - 1)
                if viol_ni < viol_nj / 10:
                    pass
                elif viol_ni > viol_nj * 10:
                    j = i + 1
                else:
                    ipi = 0
                    ip1 = 0
                    for k in range(i // 2 - 1):
                        ipi += np.abs(np.inner(V[A_m:, i], Vo[:, k]))
                        ip1 += np.abs(np.inner(V[A_m:, i + 1], Vo[:, k]))
                    if ip1 < ipi:
                        j = i + 1
            if s[j] < 1e-15:  # no more eigenvalues
                continue
            Vo[:, ind_vo] = V[A_m:, j]
            ind_vo += 1

        s = s[0:rank:2]
        V, s = Vo, s
    else:
        raise Exception('method {} not recognized'.format(method))
    k = V.shape[1]
    _, V = svd_flip(np.eye(k), V.T, u_based_decision=False)
    V = V.T
    return V, s
Example #27
    def Solve(self, K):

        # SELECT THE BEST METHOD TO CALCULATE THE EIGENVALUES
        if self.eigen_solver == 'auto':
            if K.shape[0] > 200 and self.n_components < 10:
                eigen_solver = 'arpack'
            else:
                eigen_solver = 'dense'
        else:
            eigen_solver = self.eigen_solver

        #GET EIGENVALUES AND EIGENVECTORS OF THE CENTERED KERNEL
        if eigen_solver == 'dense':
            self.lambdas_, self.vectors_ = linalg.eigh(
                K, eigvals=(K.shape[0] - self.n_components, K.shape[0] - 1))
        elif eigen_solver == 'arpack':
            random_state = check_random_state(self.random_state)
            # initialize with [-1,1] as in ARPACK
            v0 = random_state.uniform(-1, 1, K.shape[0])
            self.lambdas_, self.vectors_ = eigsh(K,
                                                 self.n_components,
                                                 which="LA",
                                                 tol=self.tol,
                                                 maxiter=self.max_iter,
                                                 v0=v0)

        # make sure that the eigenvalues are ok and fix numerical issues
        self.lambdas_ = _check_psd_eigenvalues(self.lambdas_,
                                               enable_warnings=False)

        # flip eigenvectors' sign to enforce deterministic output
        self.vectors_, _ = svd_flip(self.vectors_,
                                    np.empty_like(self.vectors_).T)

        # sort eigenvectors in descending order
        indices = self.lambdas_.argsort()[::-1]
        self.lambdas_ = self.lambdas_[indices]
        self.vectors_ = self.vectors_[:, indices]

        # remove eigenvectors with a zero eigenvalue (null space) if required
        if self.remove_zero_eig:
            self.vectors_ = self.vectors_[:, self.lambdas_ > 0]
            self.lambdas_ = self.lambdas_[self.lambdas_ > 0]

        return K
Example #28
def customSVD(D, f, e, F, p, Bi, n, m, svd_power, t):
    Q = range_finder(D, f, e, F, p, Bi, n, m, svd_power, t)
    Bt = np.zeros((n, Q.shape[1]), dtype=np.float32)

    # B.T = dot(E.T, Q)
    if F is None:
        halko.matMulTrans_Freq(D, f, Q, Bt, Bi, n, m, t)
    else:
        halko.matMulTrans_Guide(D, f, F, p, Q, Bt, Bi, n, m, t)

    # SVD on thin matrix
    Uhat, s, V = linalg.svd(Bt.T, full_matrices=False)
    del Bt
    U = np.dot(Q, Uhat)

    # Correct sign
    U, V = svd_flip(U, V)
    return U[:, :e], s[:e], V[:e, :]
Example #29
    def _fit_transform(self, K):
        """Fit the model using kernel K."""
        # center kernel
        K = self._centerer.fit_transform(K)

        self.lambdas_, self.alphas_ = linalg.eigh(
            K, eigvals=(K.shape[0] - self.n_components, K.shape[0] - 1))

        # make sure that the eigenvalues are ok and fix numerical issues
        self.lambdas_ = _check_psd_eigenvalues(self.lambdas_,
                                               enable_warnings=False)

        # flip eigenvectors' sign to enforce deterministic output
        self.alphas_, _ = svd_flip(self.alphas_, np.empty_like(self.alphas_).T)

        # sort eigenvectors in descending order
        indices = self.lambdas_.argsort()[::-1]
        self.lambdas_ = self.lambdas_[indices]
        self.alphas_ = self.alphas_[:, indices]

        # remove eigenvectors with a zero eigenvalue (null space) if required
        self.alphas_ = self.alphas_[:, self.lambdas_ > 0]
        self.lambdas_ = self.lambdas_[self.lambdas_ > 0]

        # Maintenance note on Eigenvectors normalization
        # ----------------------------------------------
        # there is a link between
        # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)'
        # if v is an eigenvector of K
        #     then Phi(X)v  is an eigenvector of Phi(X)Phi(X)'
        # if u is an eigenvector of Phi(X)Phi(X)'
        #     then Phi(X)'u is an eigenvector of Phi(X)'Phi(X)
        #
        # At this stage our self.alphas_ (the v) have norm 1, we need to scale
        # them so that eigenvectors in kernel feature space (the u) have norm=1
        # instead
        #
        # We COULD scale them here:
        #       self.alphas_ = self.alphas_ / np.sqrt(self.lambdas_)
        #
        # But choose to perform that LATER when needed, in `fit()` and in
        # `transform()`.

        return K
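The maintenance note above can be checked numerically. A small illustrative sketch with a linear kernel, so that Phi(X) = X; none of the names below belong to the class:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(6, 3)
K = X.T @ X                                    # plays the role of Phi(X)'Phi(X) in the note
w, V = np.linalg.eigh(K)
v = V[:, -1]                                   # unit-norm eigenvector of K
u = X @ v                                      # then X v is an eigenvector of X X'
print(np.allclose((X @ X.T) @ u, w[-1] * u))   # True
print(np.isclose(np.linalg.norm(u), np.sqrt(w[-1])))  # True: ||X v|| = sqrt(lambda), hence the rescaling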
Example #30
    def _svd_train(self, data_carray):
        """Linear PCA training routine, used also by KernelPCA."""

        # Computing SVD reduction
        from numpy import linalg
        from sklearn.utils.extmath import svd_flip
        u, s, v = linalg.svd(data_carray.atleast_2d().tondarray(),
                             full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        u, v = svd_flip(u, v)

        eigenvec = CArray(u)
        eigenval = CArray(s)
        components = CArray(v)

        # Now we sort the eigenvalues/eigenvectors
        idx = (-eigenval).argsort(axis=None)
        eigenval = CArray(eigenval[idx])
        eigenvec = CArray(eigenvec[:, idx]).atleast_2d()
        components = CArray(components[idx, :]).atleast_2d()
        # percentage of variance explained by each component
        explained_variance = (eigenval**2) / (data_carray.shape[0] - 1)
        explained_variance_ratio = explained_variance / explained_variance.sum()

        if 0 < self.n_components < 1.0:
            # number of components for which the cumulated explained variance
            # percentage is superior to the desired threshold
            ratio_cumsum = explained_variance_ratio.cumsum()
            self.n_components = CArray(
                ratio_cumsum < self.n_components).sum() + 1

        # Consider only n_components
        self._eigenval = CArray(eigenval[:self.n_components])
        self._eigenvec = CArray(eigenvec[:, :self.n_components])
        self._components = CArray(components[:self.n_components, :])

        # storing explained variance of n_components only
        self._explained_variance = explained_variance[:self.n_components]
        self._explained_variance_ratio = explained_variance_ratio[:self.n_components]

        return self
Example #31
def PCA_values(data, centered=True):
    n_samples, n_features = data.shape
    #By default, the data are centered
    if centered:
        data_centered = data - mean(data, axis=0)
    else:
        data_centered = data

    #apply the Singular Value Decomposition
    U, S, V = linalg.svd(data_centered, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    #components
    components_ = V

    #variance explained by PCs
    explained_variance_ratio_ = varianceExplained(S, n_samples)

    return(components_, explained_variance_ratio_)
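A minimal usage sketch, assuming the module-level imports the function relies on (numpy's mean, a linalg module providing svd, sklearn's svd_flip) and the project helper varianceExplained are defined:

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(100, 4)
components_, explained_variance_ratio_ = PCA_values(data)
print(components_.shape)             # (4, 4): one principal axis per row
print(explained_variance_ratio_)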
Example #32
    def _pca(self, Y, k):
        """PCA using a fast truncated svd implementation in scipy

        Arguments
        ---------
        Y : np.array
            p x n normalized genotype matrix
        k : int
            rank used for truncated svd

        Returns
        -------
        A tuple with elements ...

        L : np.array
            loadings matrix from running PCA
            on the original matrix
        F : np.array
            factor matrix from running PCA
            on the original dataset
        Sigma : np.array
            diagonal matrix of singular values
        """
        # compute truncated svd of data matrix

        V, lamb, VT = svds(Y.T @ Y, k)
        sigma = np.sqrt(lamb[::-1])
        sigma_inv = 1. / sigma

        Sigma = np.diag(sigma)
        Sigma_inv = np.diag(sigma_inv)

        # flip signs of right eigenvectors
        V, VT = svd_flip(V[:, ::-1], VT[::-1])

        F = (Y @ V @ Sigma_inv)

        # project on to factors
        L = (F.T @ Y).T

        return ((L, F, Sigma))
Example #33
 def fit(self, X):
     """X is a 2-d numpy array"""
     # Actually, the 'eig' method and the svd method
     # are the same thing
     mean_X = np.mean(X, axis=0)
     X_n = X - mean_X
     if self.use_svd == 'eig':
         sigma = np.dot(X_n.T, X_n)
         #sigma = np.cov(X)
         eigvalue, eigvector = np.linalg.eig(sigma)
         eigvalue = np.real(eigvalue)
         eigvector = np.real(eigvector)
         # eigenvectors are the columns of eigvector; keep the top n_components
         idx = np.argsort(eigvalue)[::-1][:self.n_components]
         self.w = eigvector[:, idx].T
     elif self.use_svd == 'svd':
         # Don't forget the 'full_matrices'=False
         # as it's for reduced SVD
         U, S, V = np.linalg.svd(X_n, full_matrices=False)
         # flip eigenvectors' sign
         #to enforce deterministic output(from:sklearn/pca.py)
         U, V = svd_flip(U, V)
         self.w = V[:self.n_components]
Example #34
    def test_normalized_gives_correct_result(self, prepare_table):
        """Make sure that normalization through widget gives correct result."""
        # Randomly set some values to zero
        random_state = check_random_state(42)
        mask = random_state.beta(1, 2, size=self.iris.X.shape) > 0.5
        self.iris.X[mask] = 0

        data = prepare_table(self.iris)

        # Enable normalization and run data through widget
        self.widget.controls.normalize.setChecked(True)
        self.send_signal(self.widget.Inputs.data, data)
        self.wait_until_stop_blocking()
        widget_result = self.get_output(self.widget.Outputs.transformed_data)

        # Compute the correct embedding
        x = self.iris.X
        x = (x - x.mean(0)) / x.std(0)
        U, S, Va = np.linalg.svd(x)
        U, S, Va = U[:, :2], S[:2], Va[:2]
        U, Va = svd_flip(U, Va)
        pca_embedding = U * S

        np.testing.assert_almost_equal(widget_result.X, pca_embedding)
Example #35
  def partial_fit(self, X, y=None, check_input=True):
    """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    self: object
        Returns the instance itself.
    """
    # ====== check the samples and caches ====== #
    if isinstance(X, Data):
      X = X[:]
    if check_input:
      X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    # check number of components
    if self.n_components is None:
      self.n_components_ = n_features
    elif not 1 <= self.n_components <= n_features:
      raise ValueError("n_components=%r invalid for n_features=%d, need "
                       "more rows than columns for IncrementalPCA "
                       "processing" % (self.n_components, n_features))
    else:
      self.n_components_ = self.n_components
    # check the cache
    if n_samples < n_features or self._nb_cached_samples > 0:
      self._cache_batches.append(X)
      self._nb_cached_samples += n_samples
      # not enough samples yet
      if self._nb_cached_samples < n_features:
        return
      else: # group mini batch into big batch
        X = np.concatenate(self._cache_batches, axis=0)
        self._cache_batches = []
        self._nb_cached_samples = 0
    n_samples = X.shape[0]
    # ====== fit the model ====== #
    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
      raise ValueError("Number of input features has changed from %i "
                       "to %i between calls to partial_fit! Try "
                       "setting n_components to a fixed value." %
                       (self.components_.shape[0], self.n_components_))
    # Update stats - they are 0 if this is the first step
    col_mean, col_var, n_total_samples = \
        _incremental_mean_and_var(X, last_mean=self.mean_,
                                  last_variance=self.var_,
                                  last_sample_count=self.n_samples_seen_)
    total_var = np.sum(col_var * n_total_samples)
    if total_var == 0: # if variance == 0, make no sense to continue
      return self
    # Whitening
    if self.n_samples_seen_ == 0:
      # If it is the first step, simply whiten X
      X -= col_mean
    else:
      col_batch_mean = np.mean(X, axis=0)
      X -= col_batch_mean
      # Build matrix of combined previous basis and new data
      mean_correction = \
          np.sqrt((self.n_samples_seen_ * n_samples) /
                  n_total_samples) * (self.mean_ - col_batch_mean)
      X = np.vstack((self.singular_values_.reshape((-1, 1)) *
                    self.components_, X, mean_correction))

    U, S, V = linalg.svd(X, full_matrices=False)
    U, V = svd_flip(U, V, u_based_decision=False)
    explained_variance = S ** 2 / n_total_samples
    explained_variance_ratio = S ** 2 / total_var

    self.n_samples_seen_ = n_total_samples
    self.components_ = V[:self.n_components_]
    self.singular_values_ = S[:self.n_components_]
    self.mean_ = col_mean
    self.var_ = col_var
    self.explained_variance_ = explained_variance[:self.n_components_]
    self.explained_variance_ratio_ = \
        explained_variance_ratio[:self.n_components_]
    if self.n_components_ < n_features:
      self.noise_variance_ = \
          explained_variance[self.n_components_:].mean()
    else:
      self.noise_variance_ = 0.
    return self
Example #36
    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized) on X"""
        n_samples, n_features = X.shape

        if isinstance(n_components, six.string_types):
            raise ValueError(
                "n_components=%r cannot be a string with svd_solver='%s'" %
                (n_components, svd_solver)
            )
        elif not 1 <= n_components <= min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be between 1 and min(n_samples, "
                "n_features)=%r with svd_solver='%s'" % (
                    n_components, min(n_samples, n_features), svd_solver
                )
            )
        elif not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError(
                "n_components=%r must be of type int when greater than or "
                "equal to 1, was of type=%r" % (n_components, type(n_components))
            )
        elif svd_solver == "arpack" and n_components == min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be strictly less than min(n_samples, "
                "n_features)=%r with svd_solver='%s'" % (
                    n_components, min(n_samples, n_features), svd_solver
                )
            )

        random_state = check_random_state(self.random_state)

        self.mean_ = X.mean(axis=0)
        total_var = ut.var(X, axis=0, ddof=1)

        if svd_solver == "arpack":
            # Center data
            X -= self.mean_
            # random init solution, as ARPACK does it internally
            v0 = random_state.uniform(-1, 1, size=min(X.shape))
            U, S, V = sp.linalg.svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, V = svd_flip(U[:, ::-1], V[::-1])

        elif svd_solver == "randomized":
            # sign flipping is done inside
            U, S, V = randomized_pca(
                X,
                n_components=n_components,
                n_iter=self.iterated_power,
                flip_sign=True,
                random_state=random_state,
            )

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = V
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S ** 2) / (n_samples - 1)
        self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
        self.singular_values_ = S.copy()  # Store the singular values.

        if self.n_components_ < min(n_features, n_samples):
            self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum())
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0

        return U, S, V
Example #37
    def partial_fit(self, X, y=None):
        """Incremental fit with X. All of X is processed as a single batch.
        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        Returns
        -------
        self: object
            Returns the instance itself.
        """
        #X = check_array(X, copy=self.copy, dtype=np.float)  # --- ADJUSTED
        X = np.asarray(X)                                    # --- ADJUSTED
        n_samples, n_features = X.shape
        if not hasattr(self, 'components_'):
            self.components_ = None

        if self.n_components is None:
            self.n_components_ = n_features
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0]
                                               != self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." % (
                                 self.components_.shape[0], self.n_components_))

        if self.components_ is None:
            # This is the first pass through partial_fit
            self.n_samples_seen_ = 0
            col_var = X.var(axis=0)
            col_mean = X.mean(axis=0)
            X -= col_mean
            U, S, V = linalg.svd(X, full_matrices=False)
            U, V = svd_flip(U, V, u_based_decision=False)
            explained_variance = S ** 2 / n_samples
            explained_variance_ratio = S ** 2 / np.sum(col_var *
                                                       n_samples)
        else:
            col_batch_mean = X.mean(axis=0)
            col_mean, col_var, n_total_samples = _batch_mean_variance_update(
                X, self.mean_, self.var_, self.n_samples_seen_)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = np.sqrt((self.n_samples_seen_ * n_samples) /
                                      n_total_samples) * (self.mean_ -
                                                          col_batch_mean)
            X_combined = np.vstack((self.singular_values_.reshape((-1, 1)) *
                                    self.components_, X,
                                    mean_correction))
            U, S, V = linalg.svd(X_combined, full_matrices=False)
            U, V = svd_flip(U, V, u_based_decision=False)
            explained_variance = S ** 2 / n_total_samples
            explained_variance_ratio = S ** 2 / np.sum(col_var *
                                                       n_total_samples)
        self.n_samples_seen_ += n_samples
        self.components_ = V[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        self.explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        # if self.n_components_ < n_features:          # --- ADJUSTED
        #     self.noise_variance_ = \
        #         explained_variance[self.n_components_:].mean()
        # else:
        #     self.noise_variance_ = 0.

        return self
Example #38
    def fit(self, X, Y):
        """Fit model to data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of predictors.

        Y : array-like of response, shape = [n_samples, n_targets]
            Target vectors, where n_samples is the number of samples and
            n_targets is the number of response variables.
        """

        # copy since this will contain the residuals (deflated) matrices
        check_consistent_length(X, Y)
        X = check_array(X, dtype=np.float64, copy=self.copy)
        Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        n = X.shape[0]
        p = X.shape[1]
        q = Y.shape[1]

        if self.n_components < 1 or self.n_components > p:
            raise ValueError('Invalid number of components: %d' %
                             self.n_components)
        if self.algorithm not in ("svd", "nipals"):
            raise ValueError("Got algorithm %s when only 'svd' "
                             "and 'nipals' are known" % self.algorithm)
        if self.algorithm == "svd" and self.mode == "B":
            raise ValueError('Incompatible configuration: mode B is not '
                             'implemented with svd algorithm')
        if self.deflation_mode not in ["canonical", "regression"]:
            raise ValueError('The deflation mode is unknown')
        # Scale (in place)
        X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
            _center_scale_xy(X, Y, self.scale))
        # Residuals (deflated) matrices
        Xk = X
        Yk = Y
        # Results matrices
        self.x_scores_ = np.zeros((n, self.n_components))
        self.y_scores_ = np.zeros((n, self.n_components))
        self.x_weights_ = np.zeros((p, self.n_components))
        self.y_weights_ = np.zeros((q, self.n_components))
        self.x_loadings_ = np.zeros((p, self.n_components))
        self.y_loadings_ = np.zeros((q, self.n_components))
        self.n_iter_ = []

        # NIPALS algo: outer loop, over components
        for k in range(self.n_components):
            if np.all(np.dot(Yk.T, Yk) < np.finfo(np.double).eps):
                # Yk constant
                warnings.warn('Y residual constant at iteration %s' % k)
                break
            # 1) weights estimation (inner loop)
            # -----------------------------------
            if self.algorithm == "nipals":
                x_weights, y_weights, n_iter_ = \
                    _nipals_twoblocks_inner_loop(
                        X=Xk, Y=Yk, mode=self.mode, max_iter=self.max_iter,
                        tol=self.tol, norm_y_weights=self.norm_y_weights)
                self.n_iter_.append(n_iter_)
            elif self.algorithm == "svd":
                x_weights, y_weights = _svd_cross_product(X=Xk, Y=Yk)
            # Forces sign stability of x_weights and y_weights
            # Sign indeterminacy issue from svd if algorithm == "svd"
            # and from platform dependent computation if algorithm == 'nipals'
            x_weights, y_weights = svd_flip(x_weights, y_weights.T)
            y_weights = y_weights.T
            # compute scores
            x_scores = np.dot(Xk, x_weights)
            if self.norm_y_weights:
                y_ss = 1
            else:
                y_ss = np.dot(y_weights.T, y_weights)
            y_scores = np.dot(Yk, y_weights) / y_ss
            # test for null variance
            if np.dot(x_scores.T, x_scores) < np.finfo(np.double).eps:
                warnings.warn('X scores are null at iteration %s' % k)
                break
            # 2) Deflation (in place)
            # ----------------------
            # Possible memory footprint reduction may be done here: in order to
            # avoid the allocation of a data chunk for the rank-one
            # approximations matrix which is then subtracted from Xk, we suggest
            # to perform a column-wise deflation.
            #
            # - regress Xk's on x_score
            x_loadings = np.dot(Xk.T, x_scores) / np.dot(x_scores.T, x_scores)
            # - subtract rank-one approximations to obtain remainder matrix
            Xk -= np.dot(x_scores, x_loadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk's on y_score, then subtract rank-one approx.
                y_loadings = (np.dot(Yk.T, y_scores)
                              / np.dot(y_scores.T, y_scores))
                Yk -= np.dot(y_scores, y_loadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk's on x_score, then subtract rank-one approx.
                y_loadings = (np.dot(Yk.T, x_scores)
                              / np.dot(x_scores.T, x_scores))
                Yk -= np.dot(x_scores, y_loadings.T)
            # 3) Store weights, scores and loadings # Notation:
            self.x_scores_[:, k] = x_scores.ravel()  # T
            self.y_scores_[:, k] = y_scores.ravel()  # U
            self.x_weights_[:, k] = x_weights.ravel()  # W
            self.y_weights_[:, k] = y_weights.ravel()  # C
            self.x_loadings_[:, k] = x_loadings.ravel()  # P
            self.y_loadings_[:, k] = y_loadings.ravel()  # Q
        # Such that: X = TP' + Err and Y = UQ' + Err

        # 4) rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        self.x_rotations_ = np.dot(
            self.x_weights_,
            linalg.pinv2(np.dot(self.x_loadings_.T, self.x_weights_),
                         **pinv2_args))
        if Y.shape[1] > 1:
            self.y_rotations_ = np.dot(
                self.y_weights_,
                linalg.pinv2(np.dot(self.y_loadings_.T, self.y_weights_),
                             **pinv2_args))
        else:
            self.y_rotations_ = np.ones(1)

        if True or self.deflation_mode == "regression":
            # FIXME what's with the if?
            # Estimate regression coefficient
            # Regress Y on T
            # Y = TQ' + Err,
            # Then express in function of X
            # Y = X W(P'W)^-1Q' + Err = XB + Err
            # => B = W*Q' (p x q)
            self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T)
            self.coef_ = (1. / self.x_std_.reshape((p, 1)) * self.coef_ *
                          self.y_std_)
        return self