Example #1
def whiten_data(X, centre=True, method="PCA", epsilon=1e-10):
    """Centre and whiten the data X.

    A whitening transformation is used to decorrelate
    the variables, such that the new covariance matrix
    of the whitened data is the identity matrix.

    If X is a random vector with non-singular covariance
    matrix C, and W is a whitening matrix satisfying
    W^T W = C^-1, then the transformation Y = W X yields
    a whitened random vector Y with identity covariance.
    In ZCA whitening, W = C^-1/2, while in PCA whitening,
    W = D^-1/2 E^T, where E is the matrix of eigenvectors
    of C and D the diagonal matrix of its eigenvalues.
    More details can be found in [Kessy2015]_.

    Parameters
    ----------
    X : numpy array, shape (m, n)
        The input data.
    centre : bool, default True
        If True, centre the data along the features axis.
        If False, do not centre the data.
    method : {"PCA", "ZCA"}
        How to whiten the data. The default is PCA whitening.
    epsilon : float
        Small floating-point value to avoid divide-by-zero errors.

    Returns
    -------
    Y : numpy array, shape (m, n)
        The centred and whitened data.
    W : numpy array, shape (n, n)
        The whitening matrix.

    References
    ----------
    .. [Kessy2015] A. Kessy, A. Lewin, and K. Strimmer, "Optimal
        Whitening and Decorrelation", arXiv:1512.00809, (2015),
        https://arxiv.org/pdf/1512.00809.pdf

    """
    # Work on a copy so that the input data is not modified in place
    Y = X.copy()

    # Centre the variables
    if centre:
        Y -= Y.mean(axis=0)

    # Calculate the whitening matrix
    R = (Y.T @ Y) / Y.shape[0]
    U, S, _ = svd_solve(R, svd_solver="full")
    S = np.sqrt(S + epsilon)[:, np.newaxis]

    if method == "PCA":
        # PCA whitening was the default in HyperSpy < 1.6.0,
        # we keep it as the default here.
        W = U.T / S

    elif method == "ZCA":
        W = U @ (U.T / S)

    else:
        raise ValueError(f"method must be one of ['PCA', 'zca'], got {method}")

    # Whiten the data
    Y = Y @ W.T

    return Y, W
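A minimal usage sketch for the function above (assuming `whiten_data` and its
`svd_solve` helper are in scope and that `numpy` is imported as `np`; the
synthetic data below is purely illustrative):

import numpy as np

rng = np.random.default_rng(0)

# Mix three variables with a fixed, well-conditioned matrix to
# introduce correlations between the columns.
A = np.array([[2.0, 0.5, 0.0],
              [0.5, 1.0, 0.3],
              [0.0, 0.3, 1.5]])
X = rng.normal(size=(1000, 3)) @ A

Y, W = whiten_data(X, centre=True, method="PCA")

# The sample covariance of the whitened data should be close to
# the identity matrix.
C = (Y.T @ Y) / Y.shape[0]
print(np.round(C, 3))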
Example #2
def orpca(
    X,
    rank,
    store_error=False,
    project=False,
    batch_size=None,
    lambda1=0.1,
    lambda2=1.0,
    method="BCD",
    init="qr",
    training_samples=10,
    subspace_learning_rate=1.0,
    subspace_momentum=0.5,
    **kwargs,
):
    """Perform online, robust PCA on the data X.

    This is a wrapper function for the ORPCA class.

    Parameters
    ----------
    X : {numpy array, iterator}
        [n_features x n_samples] matrix of observations
        or an iterator that yields samples, each with n_features elements.
    rank : int
        The rank of the representation (number of components/factors)
    store_error : bool, default False
        If True, stores the sparse error matrix.
    project : bool, default False
        If True, project the data X onto the learnt model.
    batch_size : {None, int}, default None
        If not None, learn the data in batches, each of batch_size samples
        or less.
    lambda1 : float
        Nuclear norm regularization parameter.
    lambda2 : float
        Sparse error regularization parameter.
    method : {'CF', 'BCD', 'SGD', 'MomentumSGD'}, default 'BCD'
        * 'CF'  - Closed-form solver
        * 'BCD' - Block-coordinate descent
        * 'SGD' - Stochastic gradient descent
        * 'MomentumSGD' - Stochastic gradient descent with momentum
    init : {'qr', 'rand', np.ndarray}, default 'qr'
        * 'qr'   - QR-based initialization
        * 'rand' - Random initialization
        * np.ndarray of shape [n_features x rank]
    training_samples : int
        Specifies the number of training samples to use in
        the 'qr' initialization.
    subspace_learning_rate : float
        Learning rate for the 'SGD' and 'MomentumSGD' methods.
        Should be a float > 0.0
    subspace_momentum : float
        Momentum parameter for 'MomentumSGD' method, should be
        a float between 0 and 1.

    Returns
    -------
    numpy arrays
        * If project is True, returns the low-rank factors and loadings only
        * Otherwise, returns the low-rank and sparse error matrices, as well
          as the results of a singular value decomposition (SVD) applied to
          the low-rank matrix.

    """
    if kwargs.get("learning_rate", False):
        warnings.warn(
            "The argument `learning_rate` has been deprecated and may "
            "be removed in future. Please use `subspace_learning_rate` instead.",
            VisibleDeprecationWarning,
        )
        subspace_learning_rate = kwargs["learning_rate"]
    if kwargs.get("momentum", False):
        warnings.warn(
            "The argument `momentum` has been deprecated and may "
            "be removed in future. Please use `subspace_momentum` instead.",
            VisibleDeprecationWarning,
        )
        subspace_momentum = kwargs["momentum"]

    # The ORPCA class learns sample-by-sample, so transpose the
    # [n_features x n_samples] input to [n_samples x n_features]
    X = X.T

    _orpca = ORPCA(
        rank,
        store_error=store_error,
        lambda1=lambda1,
        lambda2=lambda2,
        method=method,
        init=init,
        training_samples=training_samples,
        subspace_learning_rate=subspace_learning_rate,
        subspace_momentum=subspace_momentum,
    )
    _orpca.fit(X, batch_size=batch_size)

    if project:
        L = _orpca.L
        R = _orpca.project(X)
    else:
        L, R = _orpca.finish()

    if store_error:
        Xhat = L @ R
        Ehat = np.array(_orpca.E).T

        # Do final SVD
        U, S, Vh = svd_solve(Xhat, output_dimension=rank)
        V = Vh.T

        # Chop small singular values which
        # likely arise from numerical noise
        # in the SVD.
        S[rank:] = 0.0

        return Xhat, Ehat, U, S, V
    else:
        return L, R
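A minimal usage sketch for `orpca` (assuming the function above and its
dependencies, in particular the `ORPCA` class and `svd_solve`, are in scope;
the synthetic data is purely illustrative):

import numpy as np

rng = np.random.default_rng(0)
n_features, n_samples, true_rank = 64, 500, 4

# Low-rank signal plus a small fraction of large sparse corruptions.
low_rank = rng.normal(size=(n_features, true_rank)) @ \
    rng.normal(size=(true_rank, n_samples))
sparse = np.zeros((n_features, n_samples))
mask = rng.random(size=sparse.shape) < 0.02
sparse[mask] = 10.0 * rng.normal(size=mask.sum())
X = low_rank + sparse

# Learn the subspace with block-coordinate descent and keep the
# sparse error estimate so the full decomposition is returned.
Xhat, Ehat, U, S, V = orpca(X, rank=true_rank, store_error=True, method="BCD")
print(Xhat.shape, Ehat.shape, S[:true_rank])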
Example #3
def mlpca(X,
          varX,
          output_dimension,
          svd_solver="auto",
          tol=1e-10,
          max_iter=50000,
          **kwargs):
    """Performs maximum likelihood PCA with missing data and/or heteroskedastic noise.

    Standard PCA based on a singular value decomposition (SVD) approach assumes
    that the data is corrupted with Gaussian, homoskedastic noise, i.e. noise of
    constant variance. For many applications, this assumption does not hold. For
    example, count data from EDS-TEM experiments is corrupted by Poisson noise,
    where the noise variance depends on the underlying pixel value. Rather than
    scaling or transforming the data to approximately "normalize" the noise,
    MLPCA instead uses estimates of the data variance to perform the
    decomposition.

    This function is a transcription of a MATLAB code obtained from [Andrews1997]_.

    Read more in the :ref:`User Guide <mva.mlpca>`.

    Parameters
    ----------
    X : numpy array, shape (m, n)
        Matrix of observations.
    varX : numpy array
        Matrix of variances associated with X
        (zeros for missing measurements).
    output_dimension : int
        The model dimensionality.
    svd_solver : {"auto", "full", "arpack", "randomized"}, default "auto"
        If auto:
            The solver is selected by a default policy based on `data.shape` and
            `output_dimension`: if the input data is larger than 500x500 and the
            number of components to extract is lower than 80% of the smallest
            dimension of the data, then the more efficient "randomized"
            method is enabled. Otherwise the exact full SVD is computed and
            optionally truncated afterwards.
        If full:
            run exact SVD, calling the standard LAPACK solver via
            :py:func:`scipy.linalg.svd`, and select the components by postprocessing
        If arpack:
            use truncated SVD, calling ARPACK solver via
            :py:func:`scipy.sparse.linalg.svds`. It requires strictly
            `0 < output_dimension < min(data.shape)`
        If randomized:
            use truncated SVD, calling :py:func:`sklearn.utils.extmath.randomized_svd`
            to estimate a limited number of components
    tol : float
        Tolerance of the stopping condition.
    max_iter : int
        Maximum number of iterations before exiting without convergence.

    Returns
    -------
    U, S, V : numpy arrays
        The pseudo-SVD parameters.
    s_obj : float
        Value of the objective function.

    References
    ----------
    .. [Andrews1997] Darren T. Andrews and Peter D. Wentzell, "Applications
        of maximum likelihood principal component analysis: incomplete
        data sets and calibration transfer", Analytica Chimica Acta 350,
        no. 3 (September 19, 1997): 341-352.

    """
    m, n = X.shape

    with np.errstate(divide="ignore"):
        # The variance should only be zero for missing measurements;
        # guard against the resulting division by zero and give those
        # entries an inverse variance of 1.0.
        inv_v = 1.0 / varX
        inv_v[~np.isfinite(inv_v)] = 1.0

    _logger.info("Performing maximum likelihood principal components analysis")

    # Generate initial estimates
    _logger.info("Generating initial estimates")
    U, _, _ = svd_solve(np.cov(X), svd_solver=svd_solver, **kwargs)
    U = U[:, :output_dimension]
    s_old = 0.0

    # Placeholders
    F = np.empty((m, n))
    M = np.zeros((m, n))
    Uq = np.zeros((output_dimension, m))

    # Loop for alternating least squares
    _logger.info("Optimization iteration loop")
    for itr in range(max_iter):  # pragma: no branch
        s_obj = 0.0

        for i in range(n):
            Uq = U.T * inv_v[:, i]
            F = np.linalg.inv(Uq @ U)
            M[:, i] = np.linalg.multi_dot([U, F, Uq, X[:, i].T])
            dx = X[:, i] - M[:, i]
            s_obj += (dx * inv_v[:, i]) @ dx.T

        # Every second iteration, check the stop criterion
        if itr > 0 and itr % 2 == 0:
            stop_criterion = np.abs(s_old - s_obj) / s_obj
            _logger.info(
                f"Iteration: {itr // 2}, convergence: {stop_criterion}")

            if stop_criterion < tol:
                break

        # Transpose for next iteration
        s_old = s_obj
        _, _, V = svd_solve(M, svd_solver=svd_solver, **kwargs)

        X = X.T
        inv_v = inv_v.T
        F = F.T
        M = M.T

        m, n = X.shape
        U = V[:output_dimension].T

    U, S, V = svd_solve(M, svd_solver=svd_solver, **kwargs)
    V = V.T

    return U, S, V, s_obj
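A minimal usage sketch for `mlpca` (assuming the function above and its
`svd_solve` dependency are in scope; the Poisson-noise example below is
purely illustrative):

import numpy as np

rng = np.random.default_rng(0)

# Non-negative, rank-3 "signal" corrupted by Poisson noise, so the
# noise variance varies with the underlying value (heteroskedastic).
truth = rng.uniform(1, 10, size=(200, 3)) @ rng.uniform(1, 10, size=(3, 50))
X = rng.poisson(truth).astype(float)

# For Poisson-distributed counts, the counts themselves provide an
# estimate of the noise variance; zeros would flag missing values
# and are handled inside mlpca.
varX = X.copy()

U, S, V, s_obj = mlpca(X, varX, output_dimension=3)
print(U.shape, S.shape, V.shape, s_obj)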
Example #4
def rpca_godec(X,
               rank,
               lambda1=None,
               power=0,
               tol=1e-3,
               maxiter=1000,
               **kwargs):
    """Perform Robust PCA with missing or corrupted data, using the GoDec algorithm.

    Decomposes a matrix Y = X + E, where X is low-rank and E
    is a sparse error matrix. This algorithm is based on the
    Matlab code from [Zhou2011]_. See code here:
    https://sites.google.com/site/godecomposition/matrix/artifact-1

    Read more in the :ref:`User Guide <mva.rpca>`.

    Parameters
    ----------
    X : numpy array, shape (n_features, n_samples)
        The matrix of observations.
    rank : int
        The model dimensionality.
    lambda1 : None or float
        Regularization parameter.
        If None, set to 1 / sqrt(n_features)
    power : int
        The number of power iterations used in the initialization
    tol : float
        Convergence tolerance
    maxiter : int
        Maximum number of iterations

    Returns
    -------
    Xhat : numpy array, shape (n_features, n_samples)
        The low-rank matrix
    Ehat : numpy array, shape (n_features, n_samples)
        The sparse error matrix
    U, S, V : numpy arrays
        The results of an SVD on Xhat

    References
    ----------
    .. [Zhou2011] Tianyi Zhou and Dacheng Tao, "GoDec: Randomized Low-rank &
        Sparse Matrix Decomposition in Noisy Case", ICML-11, (2011), pp. 33-40.

    """
    # Get shape
    m, n = X.shape

    # Operate on transposed matrix for speed
    transpose = False
    if m < n:
        transpose = True
        X = X.T

    # Get shape
    m, n = X.shape

    # Set the default regularization parameter if not given
    if lambda1 is None:
        _logger.info(
            "Threshold 'lambda1' is set to default: 1 / sqrt(n_features)")
        lambda1 = 1.0 / np.sqrt(m)

    # Initialize L and E
    L = X
    E = np.zeros(L.shape)

    for itr in range(int(maxiter)):
        # Initialization with bilateral random projections
        Y2 = np.random.randn(n, rank)
        for _ in range(power + 1):
            Y2 = L.T @ (L @ Y2)

        Q, _ = scipy.linalg.qr(Y2, mode="economic")

        # Estimate the new low-rank and sparse matrices
        Lnew = (L @ Q) @ Q.T
        A = L - Lnew + E
        L = Lnew
        E = _soft_thresh(A, lambda1)
        A -= E
        L += A

        # Check convergence
        eps = np.linalg.norm(A)
        if eps < tol:
            _logger.info(f"Converged to {eps} in {itr} iterations")
            break

    # Transpose back
    if transpose:
        L = L.T
        E = E.T

    # Low-rank and sparse error estimates
    Xhat = L
    Ehat = E

    # Do final SVD
    U, S, Vh = svd_solve(Xhat, output_dimension=rank, **kwargs)
    V = Vh.T

    # Chop small singular values which
    # likely arise from numerical noise
    # in the SVD.
    S[rank:] = 0.0

    return Xhat, Ehat, U, S, V
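A minimal usage sketch for `rpca_godec` (assuming the function above and its
dependencies, in particular `svd_solve` and `_soft_thresh`, are in scope; the
synthetic data is purely illustrative):

import numpy as np

rng = np.random.default_rng(0)
n_features, n_samples, true_rank = 100, 400, 5

# Rank-5 signal with roughly 5% of the entries hit by large outliers.
L_true = rng.normal(size=(n_features, true_rank)) @ \
    rng.normal(size=(true_rank, n_samples))
E_true = np.zeros((n_features, n_samples))
idx = rng.random(size=E_true.shape) < 0.05
E_true[idx] = 20.0 * rng.normal(size=idx.sum())
X = L_true + E_true

Xhat, Ehat, U, S, V = rpca_godec(X, rank=true_rank, power=1, tol=1e-3)

# Xhat approximates the low-rank part and Ehat the sparse corruption;
# together they should account for most of X.
residual = np.linalg.norm(X - (Xhat + Ehat)) / np.linalg.norm(X)
print(residual)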