def test_fast_svd_infinite_rank():
    """Check fast_svd on a matrix with slowly decaying singular values"""
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # no dominant low-rank component here (tail_strength=1.0): the singular
    # values decrease regularly but slowly, so the data has no finite rank
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference: leading k singular values from the slow exact solver
    _, exact_values, _ = linalg.svd(X, full_matrices=False)
    leading_exact = exact_values[:k]

    # approximate solver with the power iterations switched off (q=0)
    _, plain_values, _ = fast_svd(X, k, q=0)

    # without power iterations the estimate is expected to be visibly off
    worst_gap = np.abs(leading_exact - plain_values).max()
    assert worst_gap > 0.1

    # approximate solver with 5 power iterations
    _, powered_values, _ = fast_svd(X, k, q=5)

    # with power iterations the leading k values agree to 3 decimals
    assert_almost_equal(leading_exact, powered_values, decimal=3)
def test_fast_svd_transpose_consistency():
    """Check that the `transpose` option of fast_svd has limited impact"""
    n_samples, n_features = 100, 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # one approximate decomposition per transpose mode, same seed every time;
    # order matters below: index 0 is False, 1 is True, 2 is 'auto'
    approximations = [
        fast_svd(X, k, q=3, transpose=mode, random_state=0)
        for mode in (False, True, 'auto')
    ]

    # exact decomposition used as the reference
    U_exact, s_exact, V_exact = linalg.svd(X, full_matrices=False)

    # every mode must recover the leading k singular values
    for _, s_approx, _ in approximations:
        assert_almost_equal(s_approx, s_exact[:k], decimal=3)

    # the explicit modes (False / True) must also match the product of the
    # truncated exact factors
    for U_approx, _, V_approx in approximations[:2]:
        assert_almost_equal(np.dot(U_approx, V_approx),
                            np.dot(U_exact[:, :k], V_exact[:k, :]),
                            decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(approximations[1][1], approximations[2][1])
def test_fast_svd_low_rank():
    """Check that fast_svd agrees with linalg.svd on a noise-free low-rank matrix"""
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # strongly structured signal: approximate effective rank `rank` and no
    # noisy tail (tail_strength=0.0)
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # slow exact decomposition used as the reference
    U_exact, s_exact, V_exact = linalg.svd(X, full_matrices=False)

    # fast approximate decomposition truncated to k components
    U_fast, s_fast, V_fast = fast_svd(X, k)
    assert_equal(U_fast.shape, (n_samples, k))
    assert_equal(s_fast.shape, (k,))
    assert_equal(V_fast.shape, (k, n_features))

    # the k leading singular values of both methods must coincide
    assert_almost_equal(s_exact[:k], s_fast)

    # compare the factors through their product, which does not depend on
    # the sign chosen for the individual singular vectors
    exact_product = np.dot(U_exact[:, :k], V_exact[:k, :])
    assert_almost_equal(exact_product, np.dot(U_fast, V_fast))

    # same check on the CSR sparse representation of X, restricted to the
    # first `rank` singular values
    X = sparse.csr_matrix(X)
    U_fast, s_fast, V_fast = fast_svd(X, k)
    assert_almost_equal(s_exact[:rank], s_fast[:rank])
def compute_bench(samples_range, features_range, q=3, rank=50):
    """Time scipy's svd against fast_svd over a grid of matrix sizes.

    For every (n_samples, n_features) pair a matrix is generated with
    make_low_rank_matrix and three decompositions are timed: the exact svd,
    fast_svd with q=0 and fast_svd with the requested q.

    Returns a mapping from a benchmark label to the list of wall-clock
    durations in seconds, one entry per grid point in iteration order.
    """
    timings = defaultdict(list)
    total_runs = len(samples_range) * len(features_range)
    banner = '===================='

    run = 0
    for n_samples in samples_range:
        for n_features in features_range:
            run += 1
            print(banner)
            print('Iteration %03d of %03d' % (run, total_runs))
            print(banner)
            X = make_low_rank_matrix(n_samples, n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            # collect garbage before each measurement so a pending
            # collection is not charged to the timed call
            gc.collect()
            print("benching scipy svd: ")
            started = time()
            svd(X, full_matrices=False)
            timings['scipy svd'].append(time() - started)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            started = time()
            fast_svd(X, rank, q=0)
            timings['scikit-learn fast_svd (q=0)'].append(time() - started)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d " % q)
            started = time()
            fast_svd(X, rank, q=q)
            timings['scikit-learn fast_svd (q=%d)' % q].append(
                time() - started)

    return timings
def test_fast_svd_low_rank_with_noise():
    """Check that fast_svd can handle noisy matrices"""
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # structured signal of approximate rank `rank` plus an important noisy
    # component (tail_strength=0.5)
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference: leading k singular values from the slow exact solver
    _, exact_values, _ = linalg.svd(X, full_matrices=False)
    leading_exact = exact_values[:k]

    # approximate solver with the power iterations switched off
    _, plain_values, _ = fast_svd(X, k, n_iterations=0)

    # without power iterations the noise is expected to spoil the estimate
    worst_gap = np.abs(leading_exact - plain_values).max()
    assert worst_gap > 0.05

    # approximate solver with 5 power iterations
    _, powered_values, _ = fast_svd(X, k, n_iterations=5)

    # the power iterations get rid of the noise: agreement to 3 decimals
    assert_almost_equal(leading_exact, powered_values, decimal=3)
def test_fast_svd_infinite_rank():
    """Check fast_svd on a matrix with slowly decaying singular values"""
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # no dominant low-rank component here (tail_strength=1.0): the singular
    # values decrease regularly but slowly, so the data has no finite rank
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference: leading k singular values from the slow exact solver
    _, exact_values, _ = linalg.svd(X, full_matrices=False)
    leading_exact = exact_values[:k]

    # approximate solver with the power iterations switched off
    _, plain_values, _ = fast_svd(X, k, n_iterations=0)

    # without power iterations the estimate is expected to be visibly off
    worst_gap = np.abs(leading_exact - plain_values).max()
    assert worst_gap > 0.1

    # approximate solver with 5 power iterations
    _, powered_values, _ = fast_svd(X, k, n_iterations=5)

    # with power iterations the leading k values agree to 3 decimals
    assert_almost_equal(leading_exact, powered_values, decimal=3)
def test_fast_svd_transpose_consistency():
    """Check that the `transpose` option of fast_svd has limited impact"""
    n_samples, n_features = 100, 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # one approximate decomposition per transpose mode, same seed every time;
    # order matters below: index 0 is False, 1 is True, 2 is 'auto'
    approximations = [
        fast_svd(X, k, n_iterations=3, transpose=mode, random_state=0)
        for mode in (False, True, 'auto')
    ]

    # exact decomposition used as the reference
    U_exact, s_exact, V_exact = linalg.svd(X, full_matrices=False)

    # every mode must recover the leading k singular values
    for _, s_approx, _ in approximations:
        assert_almost_equal(s_approx, s_exact[:k], decimal=3)

    # the explicit modes (False / True) must also match the product of the
    # truncated exact factors
    for U_approx, _, V_approx in approximations[:2]:
        assert_almost_equal(np.dot(U_approx, V_approx),
                            np.dot(U_exact[:, :k], V_exact[:k, :]),
                            decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(approximations[1][1], approximations[2][1])
def test_fast_svd_low_rank_with_noise():
    """Check that fast_svd can handle noisy matrices"""
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # structured signal of approximate rank `rank` plus an important noisy
    # component (tail_strength=0.5)
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference: leading k singular values from the slow exact solver
    _, exact_values, _ = linalg.svd(X, full_matrices=False)
    leading_exact = exact_values[:k]

    # approximate solver with the power iterations switched off (q=0)
    _, plain_values, _ = fast_svd(X, k, q=0)

    # without power iterations the noise is expected to spoil the estimate
    worst_gap = np.abs(leading_exact - plain_values).max()
    assert worst_gap > 0.05

    # approximate solver with 5 power iterations
    _, powered_values, _ = fast_svd(X, k, q=5)

    # the power iterations get rid of the noise: agreement to 3 decimals
    assert_almost_equal(leading_exact, powered_values, decimal=3)
def matrixsvd(self):
    """Decompose ``self.projection_matrix`` with the backend named by ``self.svd``.

    Backend selection (compared by value):

    * ``'scipy'``     -> ``ssl.svds``
    * ``'sparsesvd'`` -> ``sparsesvd``
    * ``'fast'``      -> ``fast_svd``
    * anything else   -> ``np.linalg.svd`` on the densified matrix

    The three truncated backends are asked for
    ``int(n_rows * self.precision) // 100`` components, where ``n_rows`` is
    the number of rows of the projection matrix.

    Returns
    -------
    (UT, S, VT) : tuple
        For every backend except ``'sparsesvd'``, ``UT`` is the transposed
        first factor and all three arrays have been passed through
        ``np.nan_to_num``. The ``'sparsesvd'`` result is returned exactly as
        that library produced it.
    """
    svd_matrix = self.projection_matrix.tocsc()
    backend = self.svd

    # Strings must be compared with ``==``: ``is`` only worked by accident of
    # interning and silently selects the dense fallback otherwise.
    if backend in ('scipy', 'sparsesvd', 'fast'):
        # Floor division keeps the component count an integer under both
        # Python 2 and Python 3 division semantics.
        n_components = int(svd_matrix.shape[0] * self.precision) // 100

    if backend == 'sparsesvd':
        UT, S, VT = sparsesvd(svd_matrix, n_components)
        return UT, S, VT

    if backend == 'scipy':
        Utemp, Stemp, VTtemp = ssl.svds(svd_matrix, k=n_components)
    elif backend == 'fast':
        Utemp, Stemp, VTtemp = fast_svd(svd_matrix, n_components)
    else:
        Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())

    # Shared post-processing: transpose the first factor and replace any
    # NaN / inf produced by the solver with finite numbers.
    UT = np.nan_to_num(Utemp.transpose())
    S = np.nan_to_num(Stemp)
    VT = np.nan_to_num(VTtemp)
    return UT, S, VT
def matrixsvd(self):
    """Decompose ``self.projection_matrix`` with the backend named by ``self.svd``.

    Backend selection (compared by value):

    * ``'scipy'``     -> ``ssl.svds``
    * ``'sparsesvd'`` -> ``sparsesvd``
    * ``'fast'``      -> ``fast_svd``
    * anything else   -> ``np.linalg.svd`` on the densified matrix

    The three truncated backends are asked for
    ``int(n_rows * self.precision) // 100`` components, where ``n_rows`` is
    the number of rows of the projection matrix.

    Returns
    -------
    (UT, S, VT) : tuple
        For every backend except ``'sparsesvd'``, ``UT`` is the transposed
        first factor and all three arrays have been passed through
        ``np.nan_to_num``. The ``'sparsesvd'`` result is returned exactly as
        that library produced it.
    """
    svd_matrix = self.projection_matrix.tocsc()
    backend = self.svd

    # Strings must be compared with ``==``: ``is`` only worked by accident of
    # interning and silently selects the dense fallback otherwise.
    if backend in ('scipy', 'sparsesvd', 'fast'):
        # Floor division keeps the component count an integer under both
        # Python 2 and Python 3 division semantics.
        n_components = int(svd_matrix.shape[0] * self.precision) // 100

    if backend == 'sparsesvd':
        UT, S, VT = sparsesvd(svd_matrix, n_components)
        return UT, S, VT

    if backend == 'scipy':
        Utemp, Stemp, VTtemp = ssl.svds(svd_matrix, k=n_components)
    elif backend == 'fast':
        Utemp, Stemp, VTtemp = fast_svd(svd_matrix, n_components)
    else:
        Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())

    # Shared post-processing: transpose the first factor and replace any
    # NaN / inf produced by the solver with finite numbers.
    UT = np.nan_to_num(Utemp.transpose())
    S = np.nan_to_num(Stemp)
    VT = np.nan_to_num(VTtemp)
    return UT, S, VT
def scree_plots(t, ndim=None):
    """
    Performs scree tests for each dimension of the input tensor

    Parameters
    ----------
    t : numpy array
        Higher-order tensor with size (I1 , I2 , ... , IN) along N dimensions.
    ndim : optional, list with N integer elements
        Number of components to calculate along each dimension. If not
        defined, empty, or of a length different from the number of
        dimensions of `t`, the maximum size along each dimension is used.
        Entries larger than the corresponding dimension are clamped to it.
        The list passed by the caller is never modified.

    Returns
    -------
    scree : list of N numpy arrays
        One array with size ndim[i] for each dimension. Each array holds the
        real part of the square root of the values returned by fast_svd for
        the Gram matrix (unfolding times its transpose) of that dimension.
    """
    shape = t.shape
    total_dim = len(shape)

    # Resolve the per-dimension component counts into a fresh list so that
    # neither the default argument nor the caller's list is ever mutated.
    if ndim is None or len(ndim) != total_dim:
        # missing or inconsistent request: fall back to the full size
        n_components = list(shape)
    else:
        # never ask for more components than the dimension provides
        n_components = [min(wanted, size)
                        for wanted, size in zip(ndim, shape)]

    scree = []
    for axis in range(total_dim):
        t_unfold = unfold_axis(t, axis)
        gram = np.matmul(t_unfold, np.transpose(t_unfold))
        [_, e, _] = fast_svd(gram, n_components[axis], n_iter=15)
        # square root of the Gram-matrix values, keeping only the real part
        e = np.real(np.sqrt(e))
        scree.append(e)

    return scree
def matrixsvd(self):
    """Decompose the projection matrix and score truncated reconstructions.

    The decomposition backend is chosen by the value of ``self.svd``:

    * ``'scipy'``     -> ``ssl.svds``
    * ``'sparsesvd'`` -> ``sparsesvd``
    * ``'fast'``      -> ``fast_svd``
    * anything else   -> ``np.linalg.svd`` on the densified matrix

    The truncated backends are asked for
    ``int(n_rows * self.precision) // 100`` components, where ``n_rows`` is
    the number of rows of ``self.projection_matrix``.

    Then, for k = 1, step, 2*step, ... up to the number of rows of ``U``,
    the product ``main_matrix * U[:k].T * diag(S[:k]) * VT[:k] *
    transpose_matrix`` is built with ``spmatrixmul`` and compared with
    ``self.truth_matrix`` through ``self.fnorm``.

    Returns
    -------
    svd_dict : dict
        Maps each evaluated k to ``self.fnorm(result - truth_matrix)``.
    result_list : list
        The product matrix for each evaluated k, in evaluation order.
    U, S, VT :
        The factors used for the truncations. ``U`` is stored transposed
        (one component per row); for every backend except ``'sparsesvd'``
        the factors have been passed through ``np.nan_to_num``.

    Raises
    ------
    ValueError
        If ``self.step`` is smaller than 1 (the scan over k could never
        terminate).
    """
    step = self.step
    if step < 1:
        raise ValueError("step must be >= 1, got %r" % (step,))

    svd_matrix = self.projection_matrix.tocsc()
    backend = self.svd

    # Strings must be compared with ``==``: ``is`` only worked by accident of
    # interning and silently selects the dense fallback otherwise.
    if backend in ('scipy', 'sparsesvd', 'fast'):
        # Floor division keeps the component count an integer under both
        # Python 2 and Python 3 division semantics.
        n_components = int(svd_matrix.shape[0] * self.precision) // 100

    if backend == 'sparsesvd':
        U, S, VT = sparsesvd(svd_matrix, n_components)
    else:
        if backend == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(svd_matrix, k=n_components)
        elif backend == 'fast':
            Utemp, Stemp, VTtemp = fast_svd(svd_matrix, n_components)
        else:
            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
        # transpose the first factor and replace NaN / inf by finite numbers
        U = np.nan_to_num(Utemp.transpose())
        S = np.nan_to_num(Stemp)
        VT = np.nan_to_num(VTtemp)

    svd_dict = {}
    result_list = []
    transpose_csr = self.transpose_matrix.tocsr()
    rank = U.shape[0]

    k = 1
    while k <= rank:
        matrix_u = ss.csr_matrix(U[:k].T)
        matrix_s = ss.csr_matrix(np.diag(S[:k]))
        matrix_vt = ss.csr_matrix(VT[:k])

        left = spmatrixmul(self.main_matrix, matrix_u)
        right = spmatrixmul(spmatrixmul(matrix_s, matrix_vt), transpose_csr)
        matrix_result = spmatrixmul(left, right)
        # drop the intermediates before the next large allocation
        del left, right

        result_list.append(matrix_result)
        fresult = self.fnorm(matrix_result - self.truth_matrix)
        svd_dict[k] = fresult
        # same text as the Python 2 statement
        # ``print 'k = ', k, 'fresult = ', fresult``
        print(' '.join(('k = ', str(k), 'fresult = ', str(fresult))))

        # Visit k = 1, step, 2*step, ...  With step == 1 the first jump must
        # still advance (the old ``k += step - 1`` left k stuck at 1).
        if k == 1 and step > 1:
            k = step
        else:
            k += step

    return svd_dict, result_list, U, S, VT
def svd_HO(data, rank, max_iter=10):
    """
    Decompose a high order tensor with given ranks using the HOOI method

    Parameters
    ----------
    data : numpy array
        Higher-order noisy tensor with size (I1 , I2 , ... , IN) along N
        dimensions. N > 1.
    rank : numpy array
        Integer array (R1, R2, R3, ... , Rn) giving the number of
        eigenvectors kept for each dimension.
    max_iter : integer
        Number of iterations of the higher-order orthogonal iteration (HOOI)
        algorithm.

    Returns
    -------
    X : numpy array
        Denoised tensor with low rank and the same size as the input data.
    U : list of numpy array
        One orthogonal matrix per dimension, of size Ik x Rk.
    S : numpy array
        Core tensor of the low rank tensor X, of size (R1, R2, ... , Rn).

    When `rank` is inconsistent with `data` a message is printed and
    ``(data, [], [])`` is returned instead.
    """
    svd_iter = 10  # iterations handed to every fast_svd call
    data_shape = np.shape(data)
    n_ranks = len(rank)

    # one rank per tensor dimension is required
    if len(data_shape) != n_ranks:
        print("The rank should be the same size as the data shape")
        return data, [], []

    # no rank may exceed the product of all the other ranks
    for k in range(n_ranks):
        others = 1
        for i in range(n_ranks):
            if i == k:
                continue
            others = others * rank[i]
        if rank[k] > others:
            print("The rank does not satisfy requirment of HOOI.")
            return data, [], []

    dimensions = len(data_shape)
    # axes sorted from the shortest to the longest
    ordered_indexes = np.argsort(data_shape)

    # --- initialisation: one truncated SVD per axis, shortest axis first,
    # each one computed on the tensor already projected on previous factors
    U = [None] * dimensions
    X = data
    for k in ordered_indexes:
        unfolded = unfold_axis(X, k)
        factor, _, _ = fast_svd(unfolded, rank[k], n_iter=svd_iter)
        U[k] = factor
        X = ttm(X, np.transpose(factor), k)

    # --- HOOI sweeps: refresh each factor from the data projected on the
    # transposes of all the other factors
    iter_count = 0
    while iter_count < max_iter:
        for k in range(dimensions):
            Y = data
            for j in range(dimensions):
                if j != k:
                    Y = ttm(Y, np.transpose(U[j]), j)
            factor, _, _ = fast_svd(unfold_axis(Y, k), rank[k],
                                    n_iter=svd_iter)
            U[k] = factor
        iter_count += 1

    # --- core tensor: data projected on every transposed factor
    X = data
    for k in ordered_indexes:
        X = ttm(X, np.transpose(U[k]), k)
    S = X

    # --- denoised tensor: core tensor multiplied by every factor
    for k in range(dimensions):
        X = ttm(X, U[k], k)

    return X, U, S
def matrixsvd(self):
    """Decompose the projection matrix and score truncated reconstructions.

    The decomposition backend is chosen by the value of ``self.svd``:

    * ``'scipy'``     -> ``ssl.svds``
    * ``'sparsesvd'`` -> ``sparsesvd``
    * ``'fast'``      -> ``fast_svd``
    * anything else   -> ``np.linalg.svd`` on the densified matrix

    The truncated backends are asked for
    ``int(n_rows * self.precision) // 100`` components, where ``n_rows`` is
    the number of rows of ``self.projection_matrix``.

    Then, for k = 1, step, 2*step, ... up to the number of rows of ``U``,
    the product ``main_matrix * U[:k].T * diag(S[:k]) * VT[:k] *
    transpose_matrix`` is built with ``spmatrixmul`` and compared with
    ``self.truth_matrix`` through ``self.fnorm``.

    Returns
    -------
    svd_dict : dict
        Maps each evaluated k to ``self.fnorm(result - truth_matrix)``.
    result_list : list
        The product matrix for each evaluated k, in evaluation order.
    U, S, VT :
        The factors used for the truncations. ``U`` is stored transposed
        (one component per row); for every backend except ``'sparsesvd'``
        the factors have been passed through ``np.nan_to_num``.

    Raises
    ------
    ValueError
        If ``self.step`` is smaller than 1 (the scan over k could never
        terminate).
    """
    step = self.step
    if step < 1:
        raise ValueError("step must be >= 1, got %r" % (step,))

    svd_matrix = self.projection_matrix.tocsc()
    backend = self.svd

    # Strings must be compared with ``==``: ``is`` only worked by accident of
    # interning and silently selects the dense fallback otherwise.
    if backend in ('scipy', 'sparsesvd', 'fast'):
        # Floor division keeps the component count an integer under both
        # Python 2 and Python 3 division semantics.
        n_components = int(svd_matrix.shape[0] * self.precision) // 100

    if backend == 'sparsesvd':
        U, S, VT = sparsesvd(svd_matrix, n_components)
    else:
        if backend == 'scipy':
            Utemp, Stemp, VTtemp = ssl.svds(svd_matrix, k=n_components)
        elif backend == 'fast':
            Utemp, Stemp, VTtemp = fast_svd(svd_matrix, n_components)
        else:
            Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
        # transpose the first factor and replace NaN / inf by finite numbers
        U = np.nan_to_num(Utemp.transpose())
        S = np.nan_to_num(Stemp)
        VT = np.nan_to_num(VTtemp)

    svd_dict = {}
    result_list = []
    transpose_csr = self.transpose_matrix.tocsr()
    rank = U.shape[0]

    k = 1
    while k <= rank:
        matrix_u = ss.csr_matrix(U[:k].T)
        matrix_s = ss.csr_matrix(np.diag(S[:k]))
        matrix_vt = ss.csr_matrix(VT[:k])

        left = spmatrixmul(self.main_matrix, matrix_u)
        right = spmatrixmul(spmatrixmul(matrix_s, matrix_vt), transpose_csr)
        matrix_result = spmatrixmul(left, right)
        # drop the intermediates before the next large allocation
        del left, right

        result_list.append(matrix_result)
        fresult = self.fnorm(matrix_result - self.truth_matrix)
        svd_dict[k] = fresult
        # same text as the Python 2 statement
        # ``print 'k = ', k, 'fresult = ', fresult``
        print(' '.join(('k = ', str(k), 'fresult = ', str(fresult))))

        # Visit k = 1, step, 2*step, ...  With step == 1 the first jump must
        # still advance (the old ``k += step - 1`` left k stuck at 1).
        if k == 1 and step > 1:
            k = step
        else:
            k += step

    return svd_dict, result_list, U, S, VT
X[i, j] = 1.0 del links print "Converting to CSR representation" X = X.tocsr() print "CSR conversion done" return X, redirects, index_map # stop after 5M links to make it possible to work in RAM X, redirects, index_map = get_adjacency_matrix( redirects_filename, page_links_filename, limit=5000000) names = dict((i, name) for name, i in index_map.iteritems()) print "Computing the principal singular vectors using fast_svd" t0 = time() U, s, V = fast_svd(X, 5, q=3) print "done in %0.3fs" % (time() - t0) # print the names of the wikipedia related strongest compenents of the the # principal singular vector which should be similar to the highest eigenvector print "Top wikipedia pages according to principal singular vectors" pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]]) pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]]) def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10): """Power iteration computation of the principal eigenvector This method is also known as Google PageRank and the implementation is based on the one from the NetworkX project (BSD licensed too) with copyrights by: