def test_infer_dim_1():
    """Spectrum log-likelihood should peak at (or very near) rank 1.

    The data is one latent factor spread across 5 features plus small
    isotropic noise, so Minka's dimensionality assessment of the PCA
    spectrum should favour an intrinsic dimension of 1.
    """
    n_samples, n_features = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n_samples, n_features) * .1
         + rng.randn(n_samples, 1) * np.array([3, 4, 5, 1, 2])
         + np.array([1, 0, 7, 4, 6]))
    pca = PCA(n_components=n_features, svd_solver='full')
    pca.fit(X)
    spectrum = pca.explained_variance_
    log_likelihood = np.array(
        [_assess_dimension_(spectrum, rank, n_samples, n_features)
         for rank in range(n_features)])
    # Rank 1 must be within a small tolerance of the best-scoring rank.
    assert_greater(log_likelihood[1], log_likelihood.max() - .01 * n_samples)
def test_infer_dim_1():
    """Check that the spectrum likelihood identifies dimension 1.

    Builds a rank-one signal plus low noise; the assessed
    log-likelihood over candidate ranks should make rank 1 (nearly)
    optimal.
    """
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    # NOTE: the two rng.randn calls must stay in this order so the
    # generated dataset is identical to the seeded reference.
    noise = rng.randn(n, p) * .1
    signal = rng.randn(n, 1) * np.array([3, 4, 5, 1, 2])
    offset = np.array([1, 0, 7, 4, 6])
    X = noise + signal + offset
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    scores = []
    for k in range(p):
        scores.append(_assess_dimension_(spect, k, n, p))
    ll = np.array(scores)
    assert_greater(ll[1], ll.max() - .01 * n)
def test_infer_dim_1():
    """Assessed log-likelihood should peak near rank 1.

    X = small isotropic noise + one shared latent factor + a constant
    offset, i.e. effectively rank-one data; rank 1 should score within
    a small tolerance of the maximum over all candidate ranks.
    """
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * 0.1
         + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2])
         + np.array([1, 0, 7, 4, 6]))
    pca = PCA(n_components=p)
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    assert_greater(ll[1], ll.max() - 0.01 * n)
def test_infer_dim_1():
    """Dask-backed variant: rank 1 should (nearly) maximise the score.

    Same rank-one-plus-noise construction as the NumPy version, but the
    data is wrapped in a dask array and fitted with the dask PCA before
    assessing the spectrum.
    """
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * 0.1
         + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2])
         + np.array([1, 0, 7, 4, 6]))
    # Single chunk covering the whole array.
    X = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    assert ll[1] > ll.max() - 0.01 * n
def get_space(self):
    """Return a low-rank basis of the (scaled) deviation matrix.

    Computes a truncated SVD of ``self.cov_mat_sqrt`` (scaled by
    1/sqrt(max(1, rank - 1))) and returns the product ``s * Vt`` for the
    retained components as a ``torch.FloatTensor``.

    When ``self.pca_rank == 'mle'`` the number of retained components is
    selected by maximising Minka's PCA marginal log-likelihood with a
    BIC-style degrees-of-freedom correction; the chosen rank is stored
    back into ``self.pca_rank``.

    NOTE(review): the 'mle' branch overwrites ``self.pca_rank`` with an
    integer, so a second call takes the non-MLE branch — presumably
    intentional caching of the selected rank, but worth confirming.
    """
    cov_mat_sqrt_np = self.cov_mat_sqrt.clone().cpu().numpy()

    # Scale so that D D' approximates the (sample) covariance.
    cov_mat_sqrt_np /= (max(1, self.rank.item() - 1)) ** 0.5

    if self.pca_rank == 'mle':
        pca_rank = self.rank.item()
    else:
        pca_rank = self.pca_rank

    # Clamp to a valid component count: at least 1, at most the number
    # of stored deviation vectors.
    pca_rank = max(1, min(pca_rank, self.rank.item()))

    # Fix: the original also constructed and fitted a TruncatedSVD here
    # whose result was never used — randomized_svd below already yields
    # the singular values/vectors we need, so the duplicate (and
    # expensive) decomposition is removed.
    _, s, Vt = randomized_svd(cov_mat_sqrt_np, n_components=pca_rank,
                              n_iter=5)

    # Post-selection fitting: pick the rank maximising the corrected
    # marginal log-likelihood of the eigenvalue spectrum.
    if self.pca_rank == 'mle':
        eigs = s ** 2.0
        ll = np.zeros(len(eigs))
        correction = np.zeros(len(eigs))

        for rank in range(len(eigs)):
            # Degrees-of-freedom count for a rank-`rank` model; used in
            # the BIC-style penalty 0.5 * m * log(n).
            m = cov_mat_sqrt_np.shape[1] * rank - rank * (rank + 1) / 2.
            correction[rank] = 0.5 * m * np.log(cov_mat_sqrt_np.shape[0])
            ll[rank] = _assess_dimension_(
                spectrum=eigs,
                rank=rank,
                n_features=min(cov_mat_sqrt_np.shape),
                n_samples=max(cov_mat_sqrt_np.shape))

        self.ll = ll
        self.corrected_ll = ll - correction
        self.pca_rank = np.nanargmax(self.corrected_ll)
        print('PCA Rank is: ', self.pca_rank)

        return torch.FloatTensor(
            s[:self.pca_rank, None] * Vt[:self.pca_rank, :])
    else:
        return torch.FloatTensor(s[:, None] * Vt)