def whiten_data(X, centre=True, method="PCA", epsilon=1e-10):
    """Centre and whiten the data X.

    A whitening transformation is used to decorrelate the variables,
    such that the new covariance matrix of the whitened data is the
    identity matrix.

    If X is a random vector with non-singular covariance matrix C,
    and W is a whitening matrix satisfying W^T W = C^-1, then the
    transformation Y = W X will yield a whitened random vector Y
    with unit diagonal covariance. In ZCA whitening, the matrix
    W = C^-1/2, while in PCA whitening, W = S^-1/2 U^T, where U and
    S are the eigenvectors and eigenvalues of C. More details can
    be found in [Kessy2015]_.

    Parameters
    ----------
    X : numpy array, shape (m, n)
        The input data.
    centre : bool, default True
        If True, centre the data along the features axis.
        If False, do not centre the data.
    method : {"PCA", "ZCA"}
        How to whiten the data. The default is PCA whitening.
    epsilon : float
        Small floating-point value to avoid divide-by-zero errors.

    Returns
    -------
    Y : numpy array, shape (m, n)
        The centred and whitened data.
    W : numpy array, shape (n, n)
        The whitening matrix.

    References
    ----------
    .. [Kessy2015] A. Kessy, A. Lewin, and K. Strimmer, "Optimal Whitening
        and Decorrelation", arXiv:1512.00809, (2015),
        https://arxiv.org/pdf/1512.00809.pdf

    """
    # Centre the variables. The subtraction creates a new array,
    # so the caller's data is not modified in-place.
    Y = X - X.mean(axis=0) if centre else X

    # Calculate the whitening matrix
    R = (Y.T @ Y) / Y.shape[0]
    U, S, _ = svd_solve(R, svd_solver="full")

    S = np.sqrt(S + epsilon)[:, np.newaxis]

    if method == "PCA":
        # PCA whitening was the default in HyperSpy < 1.6.0,
        # so we keep it as the default here.
        W = U.T / S
    elif method == "ZCA":
        W = U @ (U.T / S)
    else:
        raise ValueError(f"method must be one of ['PCA', 'ZCA'], got {method}")

    # Whiten the data
    Y = Y @ W.T

    return Y, W
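
# A minimal usage sketch for `whiten_data` (illustrative only; the
# `_demo_whiten_data` helper is hypothetical and not part of the original
# module). Whitening correlated Gaussian data should yield a sample
# covariance close to the identity matrix.
def _demo_whiten_data():  # pragma: no cover
    rng = np.random.default_rng(0)
    # 1000 samples of 4 features, mixed to introduce correlations
    X = rng.standard_normal((1000, 4)) @ rng.standard_normal((4, 4))
    Y, W = whiten_data(X, centre=True, method="PCA")
    # The sample covariance of the whitened data should be ~identity
    C = (Y.T @ Y) / Y.shape[0]
    assert np.allclose(C, np.eye(4), atol=1e-4)
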
def orpca(
    X,
    rank,
    store_error=False,
    project=False,
    batch_size=None,
    lambda1=0.1,
    lambda2=1.0,
    method="BCD",
    init="qr",
    training_samples=10,
    subspace_learning_rate=1.0,
    subspace_momentum=0.5,
    **kwargs,
):
    """Perform online, robust PCA on the data X.

    This is a wrapper function for the ORPCA class.

    Parameters
    ----------
    X : {numpy array, iterator}
        [n_features x n_samples] matrix of observations
        or an iterator that yields samples, each with n_features elements.
    rank : int
        The rank of the representation (number of components/factors).
    store_error : bool, default False
        If True, store the sparse error matrix.
    project : bool, default False
        If True, project the data X onto the learnt model.
    batch_size : {None, int}, default None
        If not None, learn the data in batches, each of batch_size
        samples or less.
    lambda1 : float
        Nuclear norm regularization parameter.
    lambda2 : float
        Sparse error regularization parameter.
    method : {'CF', 'BCD', 'SGD', 'MomentumSGD'}, default 'BCD'
        * 'CF'  - Closed-form solver
        * 'BCD' - Block-coordinate descent
        * 'SGD' - Stochastic gradient descent
        * 'MomentumSGD' - Stochastic gradient descent with momentum
    init : {'qr', 'rand', np.ndarray}, default 'qr'
        * 'qr'   - QR-based initialization
        * 'rand' - Random initialization
        * np.ndarray of shape [n_features x rank]
    training_samples : int
        Specifies the number of training samples to use in
        the 'qr' initialization.
    subspace_learning_rate : float
        Learning rate for the 'SGD' and 'MomentumSGD' methods.
        Should be a float > 0.0.
    subspace_momentum : float
        Momentum parameter for the 'MomentumSGD' method.
        Should be a float between 0 and 1.

    Returns
    -------
    numpy arrays
        * If project is True, returns the low-rank factors and loadings only
        * Otherwise, returns the low-rank and sparse error matrices, as well
          as the results of a singular value decomposition (SVD) applied to
          the low-rank matrix.

    """
    if kwargs.get("learning_rate", False):
        warnings.warn(
            "The argument `learning_rate` has been deprecated and may "
            "be removed in future. Please use `subspace_learning_rate` instead.",
            VisibleDeprecationWarning,
        )
        subspace_learning_rate = kwargs["learning_rate"]
    if kwargs.get("momentum", False):
        warnings.warn(
            "The argument `momentum` has been deprecated and may "
            "be removed in future. Please use `subspace_momentum` instead.",
            VisibleDeprecationWarning,
        )
        subspace_momentum = kwargs["momentum"]

    X = X.T

    _orpca = ORPCA(
        rank,
        store_error=store_error,
        lambda1=lambda1,
        lambda2=lambda2,
        method=method,
        init=init,
        training_samples=training_samples,
        subspace_learning_rate=subspace_learning_rate,
        subspace_momentum=subspace_momentum,
    )
    _orpca.fit(X, batch_size=batch_size)

    if project:
        L = _orpca.L
        R = _orpca.project(X)
    else:
        L, R = _orpca.finish()

    if store_error:
        Xhat = L @ R
        Ehat = np.array(_orpca.E).T

        # Do final SVD
        U, S, Vh = svd_solve(Xhat, output_dimension=rank)
        V = Vh.T

        # Chop small singular values which likely
        # arise from numerical noise in the SVD.
        S[rank:] = 0.0

        return Xhat, Ehat, U, S, V
    else:
        return L, R
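
# A minimal usage sketch for `orpca` (illustrative only; the `_demo_orpca`
# helper is hypothetical and not part of the original module). Note that X
# is passed as an [n_features x n_samples] matrix, and that the wrapper
# relies on the ORPCA class defined elsewhere in this module.
def _demo_orpca():  # pragma: no cover
    rng = np.random.default_rng(0)
    n_features, n_samples, rank = 64, 256, 4
    # Low-rank observations corrupted by ~1% large sparse outliers
    X = rng.standard_normal((n_features, rank)) @ rng.standard_normal(
        (rank, n_samples)
    )
    X[rng.random(X.shape) < 0.01] += 10.0
    Xhat, Ehat, U, S, V = orpca(X, rank=rank, store_error=True, method="BCD")
    # Xhat is the low-rank estimate; Ehat collects the sparse errors
    return Xhat, Ehat
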
def mlpca(
    X, varX, output_dimension, svd_solver="auto", tol=1e-10, max_iter=50000, **kwargs
):
    """Perform maximum likelihood PCA with missing data and/or heteroskedastic noise.

    Standard PCA based on a singular value decomposition (SVD) approach assumes
    that the data is corrupted with Gaussian, or homoskedastic noise. For many
    applications, this assumption does not hold. For example, count data from
    EDS-TEM experiments is corrupted by Poisson noise, where the noise variance
    depends on the underlying pixel value. Rather than scaling or transforming
    the data to approximately "normalize" the noise, MLPCA instead uses estimates
    of the data variance to perform the decomposition.

    This function is a transcription of a MATLAB code obtained from [Andrews1997]_.

    Read more in the :ref:`User Guide <mva.mlpca>`.

    Parameters
    ----------
    X : numpy array, shape (m, n)
        Matrix of observations.
    varX : numpy array
        Matrix of variances associated with X
        (zeros for missing measurements).
    output_dimension : int
        The model dimensionality.
    svd_solver : {"auto", "full", "arpack", "randomized"}, default "auto"
        If auto:
            The solver is selected by a default policy based on `data.shape` and
            `output_dimension`: if the input data is larger than 500x500 and the
            number of components to extract is lower than 80% of the smallest
            dimension of the data, then the more efficient "randomized"
            method is enabled. Otherwise the exact full SVD is computed and
            optionally truncated afterwards.
        If full:
            run exact SVD, calling the standard LAPACK solver via
            :py:func:`scipy.linalg.svd`, and select the components by
            postprocessing.
        If arpack:
            use truncated SVD, calling ARPACK solver via
            :py:func:`scipy.sparse.linalg.svds`. It requires strictly
            `0 < output_dimension < min(data.shape)`.
        If randomized:
            use truncated SVD, calling
            :py:func:`sklearn.utils.extmath.randomized_svd`
            to estimate a limited number of components.
    tol : float
        Tolerance of the stopping condition.
    max_iter : int
        Maximum number of iterations before exiting without convergence.

    Returns
    -------
    U, S, V : numpy arrays
        The pseudo-SVD parameters.
    s_obj : float
        Value of the objective function.

    References
    ----------
    .. [Andrews1997] Darren T. Andrews and Peter D. Wentzell, "Applications
        of maximum likelihood principal component analysis: incomplete data
        sets and calibration transfer", Analytica Chimica Acta 350, no. 3
        (September 19, 1997): 341-352.

    """
    m, n = X.shape

    with np.errstate(divide="ignore"):
        # We shouldn't really have zero variance anywhere,
        # except for missing data, but handle it here.
        inv_v = 1.0 / varX
        inv_v[~np.isfinite(inv_v)] = 1.0

    _logger.info("Performing maximum likelihood principal components analysis")

    # Generate initial estimates
    _logger.info("Generating initial estimates")
    U, _, _ = svd_solve(np.cov(X), svd_solver=svd_solver, **kwargs)
    U = U[:, :output_dimension]
    s_old = 0.0

    # Placeholders
    F = np.empty((m, n))
    M = np.zeros((m, n))
    Uq = np.zeros((output_dimension, m))

    # Loop for alternating least squares
    _logger.info("Optimization iteration loop")
    for itr in range(max_iter):  # pragma: no branch
        s_obj = 0.0

        for i in range(n):
            Uq = U.T * inv_v[:, i]
            F = np.linalg.inv(Uq @ U)
            M[:, i] = np.linalg.multi_dot([U, F, Uq, X[:, i].T])
            dx = X[:, i] - M[:, i]
            s_obj += (dx * inv_v[:, i]) @ dx.T

        # Every second iteration, check the stop criterion
        if itr > 0 and itr % 2 == 0:
            stop_criterion = np.abs(s_old - s_obj) / s_obj
            _logger.info(f"Iteration: {itr // 2}, convergence: {stop_criterion}")

            if stop_criterion < tol:
                break

        # Transpose for next iteration
        s_old = s_obj
        _, _, V = svd_solve(M, svd_solver=svd_solver, **kwargs)

        X = X.T
        inv_v = inv_v.T
        F = F.T
        M = M.T
        m, n = X.shape
        U = V[:output_dimension].T

    U, S, V = svd_solve(M, svd_solver=svd_solver, **kwargs)
    V = V.T

    return U, S, V, s_obj
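
# A minimal usage sketch for `mlpca` (illustrative only; the `_demo_mlpca`
# helper is hypothetical and not part of the original module). For
# Poisson-distributed count data, Var(X) ~ E[X], so the observed counts
# themselves are a reasonable variance estimate.
def _demo_mlpca():  # pragma: no cover
    rng = np.random.default_rng(0)
    # Strictly positive rank-3 "truth", then Poisson-corrupted counts
    truth = rng.uniform(1.0, 10.0, (100, 3)) @ rng.uniform(1.0, 10.0, (3, 50))
    X = rng.poisson(truth).astype(float)
    varX = X.copy()  # Poisson variance proxy; zeros mark missing data
    U, S, V, s_obj = mlpca(X, varX, output_dimension=3)
    # Rank-3 reconstruction from the pseudo-SVD factors
    Xhat = U @ np.diag(S) @ V.T
    return Xhat, s_obj
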
def rpca_godec(X, rank, lambda1=None, power=0, tol=1e-3, maxiter=1000, **kwargs):
    """Perform Robust PCA with missing or corrupted data, using the GoDec algorithm.

    Decomposes the matrix of observations X as X = Xhat + Ehat, where Xhat
    is low-rank and Ehat is a sparse error matrix. This algorithm is based
    on the Matlab code from [Zhou2011]_. See code here:
    https://sites.google.com/site/godecomposition/matrix/artifact-1

    Read more in the :ref:`User Guide <mva.rpca>`.

    Parameters
    ----------
    X : numpy array, shape (n_features, n_samples)
        The matrix of observations.
    rank : int
        The model dimensionality.
    lambda1 : None or float
        Regularization parameter.
        If None, set to 1 / sqrt(n_features).
    power : int
        The number of power iterations used in the initialization.
    tol : float
        Convergence tolerance.
    maxiter : int
        Maximum number of iterations.

    Returns
    -------
    Xhat : numpy array, shape (n_features, n_samples)
        The low-rank matrix.
    Ehat : numpy array, shape (n_features, n_samples)
        The sparse error matrix.
    U, S, V : numpy arrays
        The results of an SVD on Xhat.

    References
    ----------
    .. [Zhou2011] Tianyi Zhou and Dacheng Tao, "GoDec: Randomized Low-rank
        & Sparse Matrix Decomposition in Noisy Case", ICML-11, (2011),
        pp. 33-40.

    """
    # Get shape
    m, n = X.shape

    # Operate on the transposed matrix for speed
    transpose = False
    if m < n:
        transpose = True
        X = X.T

    # Get shape
    m, n = X.shape

    # Set the default regularization threshold if not given
    if lambda1 is None:
        _logger.info("Threshold 'lambda1' is set to default: 1 / sqrt(n_features)")
        lambda1 = 1.0 / np.sqrt(m)

    # Initialize L and E
    L = X
    E = np.zeros(L.shape)

    for itr in range(int(maxiter)):
        # Initialization with bilateral random projections
        Y2 = np.random.randn(n, rank)
        for _ in range(power + 1):
            Y2 = L.T @ (L @ Y2)
        Q, _ = scipy.linalg.qr(Y2, mode="economic")

        # Estimate the new low-rank and sparse matrices
        Lnew = (L @ Q) @ Q.T
        A = L - Lnew + E
        L = Lnew
        E = _soft_thresh(A, lambda1)
        A -= E
        L += A

        # Check convergence
        eps = np.linalg.norm(A)
        if eps < tol:
            _logger.info(f"Converged to {eps} in {itr} iterations")
            break

    # Transpose back
    if transpose:
        L = L.T
        E = E.T

    # Assign the outputs
    Xhat = L
    Ehat = E

    # Do final SVD
    U, S, Vh = svd_solve(Xhat, output_dimension=rank, **kwargs)
    V = Vh.T

    # Chop small singular values which likely
    # arise from numerical noise in the SVD.
    S[rank:] = 0.0

    return Xhat, Ehat, U, S, V
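
# A minimal usage sketch for `rpca_godec` (illustrative only; the
# `_demo_rpca_godec` helper is hypothetical and not part of the original
# module, and the `power` and `tol` settings are plausible guesses rather
# than recommended values). GoDec splits the observations into a low-rank
# part and a sparse error part.
def _demo_rpca_godec():  # pragma: no cover
    rng = np.random.default_rng(0)
    n_features, n_samples, rank = 128, 64, 4
    # Low-rank observations corrupted by ~1% large sparse outliers
    X = rng.standard_normal((n_features, rank)) @ rng.standard_normal(
        (rank, n_samples)
    )
    X[rng.random(X.shape) < 0.01] += 25.0
    Xhat, Ehat, U, S, V = rpca_godec(X, rank=rank, power=2, tol=1e-4)
    # Xhat + Ehat approximately reconstructs the input X
    return Xhat, Ehat
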