def compute_bench(data_gen, samples_range, features_range, q=3):
    """Time ``svd`` against ``fast_svd`` over a grid of matrix shapes.

    For every ``(n_samples, n_features)`` pair a matrix is built with
    ``make_data`` and three calls are timed: ``svd(X, full_matrices=False)``,
    ``fast_svd(X, rank, q=0)`` and ``fast_svd(X, rank, q=q)``. ``rank`` is
    one more than a tenth of the smaller matrix dimension.

    Parameters
    ----------
    data_gen : object
        NOTE(review): currently unused -- the matrix is produced by the
        module-level ``make_data``. Confirm whether this argument was meant
        to be the generator before wiring it in.
    samples_range : sequence of int
        Row counts to benchmark (``len`` is taken of it).
    features_range : sequence of int
        Column counts to benchmark (``len`` is taken of it).
    q : int, optional
        Value forwarded as ``q`` to the last timed ``fast_svd`` call.

    Returns
    -------
    results : defaultdict of list
        Maps each benchmark label to the elapsed times (as measured with
        ``time()``), one entry per grid cell in iteration order.
    """
    it = 0
    results = defaultdict(list)

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            # print(...) with a single argument behaves the same on
            # Python 2 and Python 3.
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_data(n_samples, n_features)
            # Fix: the smaller dimension must take n_features into account
            # (the original compared n_samples with itself), and floor
            # division keeps the rank an integer on every Python version.
            rank = min(n_samples, n_features) // 10 + 1

            # Collect garbage before each timing so that earlier
            # allocations do not leak into the measurement.
            gc.collect()
            print("benching scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d " % q)
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
def test_fast_svd_low_rank():
    """fast_svd must agree with linalg.svd on a noise-free low rank matrix."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 5, 10

    # highly structured input: approximate rank `effective_rank`, no noisy tail
    dense = low_rank_fat_tail(n_rows, n_cols, effective_rank=effective_rank,
                              tail_strength=0.0, seed=0)
    assert_equal(dense.shape, shape)

    # reference decomposition (slow, exact)
    exact_u, exact_s, exact_v = linalg.svd(dense, full_matrices=False)

    # approximate decomposition under test
    approx_u, approx_s, approx_v = fast_svd(dense, n_components)
    assert_equal(approx_u.shape, (n_rows, n_components))
    assert_equal(approx_s.shape, (n_components,))
    assert_equal(approx_v.shape, (n_components, n_cols))

    # leading singular values have to match
    assert_almost_equal(exact_s[:n_components], approx_s)

    # compare the product of the singular vectors, which is insensitive to
    # the sign ambiguity of the individual vectors
    exact_product = np.dot(exact_u[:, :n_components],
                           exact_v[:n_components, :])
    approx_product = np.dot(approx_u, approx_v)
    assert_almost_equal(exact_product, approx_product)

    # same matrix stored as CSR: the singular values up to the effective
    # rank must still match
    compressed = sparse.csr_matrix(dense)
    _, sparse_s, _ = fast_svd(compressed, n_components)
    assert_almost_equal(exact_s[:effective_rank], sparse_s[:effective_rank])
def test_fast_svd_low_rank_with_noise():
    """fast_svd needs the iterated power method to cope with a noisy matrix."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 5, 10

    # structured signal of approximate rank `effective_rank` plus an
    # important noisy component
    noisy = low_rank_fat_tail(n_rows, n_cols, effective_rank=effective_rank,
                              tail_strength=0.5, seed=0)
    assert_equal(noisy.shape, shape)

    # reference singular values from the slow exact method
    _, exact_s, _ = linalg.svd(noisy, full_matrices=False)
    leading = exact_s[:n_components]

    # q=0 disables the iterated power method: the estimate is expected to
    # be visibly off
    _, plain_s, _ = fast_svd(noisy, n_components, q=0)
    worst_gap = np.abs(leading - plain_s).max()
    assert worst_gap > 0.05

    # with the iterated power method the noise is filtered out
    _, powered_s, _ = fast_svd(noisy, n_components, q=5)
    assert_almost_equal(leading, powered_s, decimal=3)
def fast_pseudoinverse(matrix, precision):
    """Build a sparse pseudo-inverse of ``matrix`` from ``slue.fast_svd``.

    The decomposition is always run on the orientation of ``matrix`` that
    has no more rows than columns (the matrix is transposed first when it
    has more rows than columns), and the three factors are then multiplied
    back in the order that yields the pseudo-inverse of the original
    orientation.

    Parameters
    ----------
    matrix : 2-D matrix
        Anything exposing ``shape`` and ``transpose()`` that
        ``slue.fast_svd`` accepts.
    precision : number
        Percentage of the smaller dimension; ``int(precision * dim / 100)``
        is passed as the second argument of ``slue.fast_svd``.

    Returns
    -------
    The product of the three factors, converted with ``.tocsr()``.
    """
    is_tall = matrix.shape[0] > matrix.shape[1]
    target = matrix.transpose() if is_tall else matrix

    n_kept = int((precision * target.shape[0]) / 100)
    u, s, vt = slue.fast_svd(target, n_kept)

    # nan_to_num scrubs NaN/inf entries before the sparse conversion
    left_t = ss.csr_matrix(np.nan_to_num(u.transpose()))
    inv_sigma = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
    right = ss.csr_matrix(np.nan_to_num(vt))

    if is_tall:
        # factors describe matrix.T, so the outer factors swap roles
        first, last = left_t.transpose(), right
    else:
        first, last = right.transpose(), left_t

    product = spmatrixmul(spmatrixmul(first, inv_sigma), last)
    return product.tocsr()
def test_fast_svd_infinite_rank():
    """Check fast_svd on a matrix with slowly decaying singular values."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 5, 10

    # tail_strength=1.0: no dominant low rank component, only regularly
    # but slowly decreasing singular values
    fat_tailed = low_rank_fat_tail(n_rows, n_cols,
                                   effective_rank=effective_rank,
                                   tail_strength=1.0, seed=0)
    assert_equal(fat_tailed.shape, shape)

    # reference singular values from the slow exact method
    _, exact_s, _ = linalg.svd(fat_tailed, full_matrices=False)
    leading = exact_s[:n_components]

    # without the iterated power method (q=0) the estimate must be off
    _, plain_s, _ = fast_svd(fat_tailed, n_components, q=0)
    worst_gap = np.abs(leading - plain_s).max()
    assert worst_gap > 0.1

    # the iterated power method still recovers most of the structure at
    # the requested rank
    _, powered_s, _ = fast_svd(fat_tailed, n_components, q=5)
    assert_almost_equal(leading, powered_s, decimal=3)
def test_fast_svd_infinite_rank():
    """Check fast_svd on a matrix with slowly decaying singular values."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 5, 10

    # tail_strength=1.0: no dominant low rank component, only regularly
    # but slowly decreasing singular values
    fat_tailed = low_rank_fat_tail(n_rows, n_cols,
                                   effective_rank=effective_rank,
                                   tail_strength=1.0, seed=0)
    assert_equal(fat_tailed.shape, shape)

    # reference singular values from the slow exact method
    _, exact_s, _ = linalg.svd(fat_tailed, full_matrices=False)
    leading = exact_s[:n_components]

    # without the iterated power method (q=0) the estimate must be off
    _, plain_s, _ = fast_svd(fat_tailed, n_components, q=0)
    worst_gap = np.abs(leading - plain_s).max()
    assert worst_gap > 0.1

    # the iterated power method still recovers most of the structure at
    # the requested rank
    _, powered_s, _ = fast_svd(fat_tailed, n_components, q=5)
    assert_almost_equal(leading, powered_s, decimal=3)
def test_fast_svd_transpose_consistency():
    """The `transpose` option of fast_svd must barely affect the result."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 4, 10

    X = low_rank_fat_tail(n_rows, n_cols, effective_rank=effective_rank,
                          tail_strength=0.5, seed=0)
    assert_equal(X.shape, shape)

    # same seed for every variant so that only `transpose` differs
    u_plain, s_plain, v_plain = fast_svd(
        X, n_components, q=3, transpose=False, rng=0)
    u_flipped, s_flipped, v_flipped = fast_svd(
        X, n_components, q=3, transpose=True, rng=0)
    _, s_auto, _ = fast_svd(X, n_components, q=3, transpose='auto', rng=0)

    # exact reference
    exact_u, exact_s, exact_v = linalg.svd(X, full_matrices=False)

    for approx_s in (s_plain, s_flipped, s_auto):
        assert_almost_equal(approx_s, exact_s[:n_components], decimal=3)

    for approx_u, approx_v in ((u_plain, v_plain), (u_flipped, v_flipped)):
        assert_almost_equal(
            np.dot(approx_u, approx_v),
            np.dot(exact_u[:, :n_components], exact_v[:n_components, :]),
            decimal=2)

    # for this shape 'auto' is expected to behave like transpose=True
    assert_almost_equal(s_flipped, s_auto)
def test_fast_svd_low_rank_with_noise():
    """fast_svd needs the iterated power method to cope with a noisy matrix."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 5, 10

    # structured signal of approximate rank `effective_rank` plus an
    # important noisy component
    noisy = low_rank_fat_tail(n_rows, n_cols, effective_rank=effective_rank,
                              tail_strength=0.5, seed=0)
    assert_equal(noisy.shape, shape)

    # reference singular values from the slow exact method
    _, exact_s, _ = linalg.svd(noisy, full_matrices=False)
    leading = exact_s[:n_components]

    # q=0 disables the iterated power method: the estimate is expected to
    # be visibly off
    _, plain_s, _ = fast_svd(noisy, n_components, q=0)
    worst_gap = np.abs(leading - plain_s).max()
    assert worst_gap > 0.05

    # with the iterated power method the noise is filtered out
    _, powered_s, _ = fast_svd(noisy, n_components, q=5)
    assert_almost_equal(leading, powered_s, decimal=3)
def compute_bench(samples_range, features_range, q=3, rank=50):
    """Benchmark ``svd`` against ``fast_svd`` on generated matrices.

    One matrix is generated with ``low_rank_fat_tail`` for each
    ``(n_samples, n_features)`` pair, then ``svd(X, full_matrices=False)``,
    ``fast_svd(X, rank, q=0)`` and ``fast_svd(X, rank, q=q)`` are timed on it.

    Parameters
    ----------
    samples_range, features_range : sequences of int
        Row and column counts to combine (``len`` is taken of both).
    q : int, optional
        ``q`` value used for the last timed ``fast_svd`` call.
    rank : int, optional
        Passed as ``effective_rank`` to the generator and as the second
        argument of ``fast_svd``.

    Returns
    -------
    timings : defaultdict of list
        Benchmark label -> elapsed times, one per grid cell.
    """
    timings = defaultdict(list)
    total = len(samples_range) * len(features_range)
    separator = '===================='

    done = 0
    for n_samples in samples_range:
        for n_features in features_range:
            done += 1
            print(separator)
            print('Iteration %03d of %03d' % (done, total))
            print(separator)

            X = low_rank_fat_tail(n_samples, n_features,
                                  effective_rank=rank, tail_strength=0.2)

            # garbage is collected before every measurement
            gc.collect()
            print("benching scipy svd: ")
            started = time()
            svd(X, full_matrices=False)
            timings['scipy svd'].append(time() - started)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            started = time()
            fast_svd(X, rank, q=0)
            timings['scikit-learn fast_svd (q=0)'].append(time() - started)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d " % q)
            started = time()
            fast_svd(X, rank, q=q)
            timings['scikit-learn fast_svd (q=%d)' % q].append(
                time() - started)

    return timings
def test_fast_svd_low_rank():
    """fast_svd must agree with linalg.svd on a noise-free low rank matrix."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 5, 10

    # highly structured input: approximate rank `effective_rank`, no noisy tail
    dense = low_rank_fat_tail(n_rows, n_cols, effective_rank=effective_rank,
                              tail_strength=0.0, seed=0)
    assert_equal(dense.shape, shape)

    # reference decomposition (slow, exact)
    exact_u, exact_s, exact_v = linalg.svd(dense, full_matrices=False)

    # approximate decomposition under test
    approx_u, approx_s, approx_v = fast_svd(dense, n_components)
    assert_equal(approx_u.shape, (n_rows, n_components))
    assert_equal(approx_s.shape, (n_components,))
    assert_equal(approx_v.shape, (n_components, n_cols))

    # leading singular values have to match
    assert_almost_equal(exact_s[:n_components], approx_s)

    # compare the product of the singular vectors, which is insensitive to
    # the sign ambiguity of the individual vectors
    exact_product = np.dot(exact_u[:, :n_components],
                           exact_v[:n_components, :])
    approx_product = np.dot(approx_u, approx_v)
    assert_almost_equal(exact_product, approx_product)

    # same matrix stored as CSR: the singular values up to the effective
    # rank must still match
    compressed = sparse.csr_matrix(dense)
    _, sparse_s, _ = fast_svd(compressed, n_components)
    assert_almost_equal(exact_s[:effective_rank], sparse_s[:effective_rank])
def test_fast_svd_transpose_consistency():
    """The `transpose` option of fast_svd must barely affect the result."""
    shape = n_rows, n_cols = 100, 500
    effective_rank, n_components = 4, 10

    X = low_rank_fat_tail(n_rows, n_cols, effective_rank=effective_rank,
                          tail_strength=0.5, seed=0)
    assert_equal(X.shape, shape)

    # same seed for every variant so that only `transpose` differs
    u_plain, s_plain, v_plain = fast_svd(
        X, n_components, q=3, transpose=False, rng=0)
    u_flipped, s_flipped, v_flipped = fast_svd(
        X, n_components, q=3, transpose=True, rng=0)
    _, s_auto, _ = fast_svd(X, n_components, q=3, transpose='auto', rng=0)

    # exact reference
    exact_u, exact_s, exact_v = linalg.svd(X, full_matrices=False)

    for approx_s in (s_plain, s_flipped, s_auto):
        assert_almost_equal(approx_s, exact_s[:n_components], decimal=3)

    for approx_u, approx_v in ((u_plain, v_plain), (u_flipped, v_flipped)):
        assert_almost_equal(
            np.dot(approx_u, approx_v),
            np.dot(exact_u[:, :n_components], exact_v[:n_components, :]),
            decimal=2)

    # for this shape 'auto' is expected to behave like transpose=True
    assert_almost_equal(s_flipped, s_auto)
def fast_pseudoinverse(matrix, precision):
    """Build a sparse pseudo-inverse of ``matrix`` from ``slue.fast_svd``.

    The decomposition is always run on the orientation of ``matrix`` that
    has no more rows than columns (the matrix is transposed first when it
    has more rows than columns), and the three factors are then multiplied
    back in the order that yields the pseudo-inverse of the original
    orientation.

    Parameters
    ----------
    matrix : 2-D matrix
        Anything exposing ``shape`` and ``transpose()`` that
        ``slue.fast_svd`` accepts.
    precision : number
        Percentage of the smaller dimension; ``int(precision * dim / 100)``
        is passed as the second argument of ``slue.fast_svd``.

    Returns
    -------
    The product of the three factors, converted with ``.tocsr()``.
    """
    is_tall = matrix.shape[0] > matrix.shape[1]
    target = matrix.transpose() if is_tall else matrix

    n_kept = int((precision * target.shape[0]) / 100)
    u, s, vt = slue.fast_svd(target, n_kept)

    # nan_to_num scrubs NaN/inf entries before the sparse conversion
    left_t = ss.csr_matrix(np.nan_to_num(u.transpose()))
    inv_sigma = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
    right = ss.csr_matrix(np.nan_to_num(vt))

    if is_tall:
        # factors describe matrix.T, so the outer factors swap roles
        first, last = left_t.transpose(), right
    else:
        first, last = right.transpose(), left_t

    product = spmatrixmul(spmatrixmul(first, inv_sigma), last)
    return product.tocsr()
def compute_bench(samples_range, features_range, q=3, rank=50):
    """Benchmark ``svd`` against ``fast_svd`` on generated matrices.

    One matrix is generated with ``low_rank_fat_tail`` for each
    ``(n_samples, n_features)`` pair, then ``svd(X, full_matrices=False)``,
    ``fast_svd(X, rank, q=0)`` and ``fast_svd(X, rank, q=q)`` are timed on it.

    Parameters
    ----------
    samples_range, features_range : sequences of int
        Row and column counts to combine (``len`` is taken of both).
    q : int, optional
        ``q`` value used for the last timed ``fast_svd`` call.
    rank : int, optional
        Passed as ``effective_rank`` to the generator and as the second
        argument of ``fast_svd``.

    Returns
    -------
    timings : defaultdict of list
        Benchmark label -> elapsed times, one per grid cell.
    """
    timings = defaultdict(list)
    total = len(samples_range) * len(features_range)
    separator = '===================='

    done = 0
    for n_samples in samples_range:
        for n_features in features_range:
            done += 1
            print(separator)
            print('Iteration %03d of %03d' % (done, total))
            print(separator)

            X = low_rank_fat_tail(n_samples, n_features,
                                  effective_rank=rank, tail_strength=0.2)

            # garbage is collected before every measurement
            gc.collect()
            print("benching scipy svd: ")
            started = time()
            svd(X, full_matrices=False)
            timings['scipy svd'].append(time() - started)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            started = time()
            fast_svd(X, rank, q=0)
            timings['scikit-learn fast_svd (q=0)'].append(time() - started)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d " % q)
            started = time()
            fast_svd(X, rank, q=q)
            timings['scikit-learn fast_svd (q=%d)' % q].append(
                time() - started)

    return timings
def pca(data, fast = False, output_dimension = None):
    """Compute principal components of ``data`` through an SVD.

    Parameters
    ----------
    data : 2-D array
        Input matrix handed unchanged to the SVD routine.
    fast : bool
        When True (and the module-level ``sklearn`` flag is True) the
        decomposition is done by ``fast_svd`` with ``q = 3``; otherwise
        ``scipy.linalg.svd`` with ``full_matrices = False`` is used.
    output_dimension : int or None
        Second argument of ``fast_svd``; ``messages.warning_exit`` is
        called when it is missing on the fast path.

    Returns
    -------
    v : array
        Transpose of the third factor returned by the SVD routine.
    V : array
        Squared singular values.
    """
    print("Performing PCA with a SVD based algorithm")
    # unpacking kept on purpose: it fails for input that is not 2-D
    n_rows, n_cols = data.shape

    if not (fast is True and sklearn is True):
        _, singular_values, components = scipy.linalg.svd(
            data, full_matrices = False)
    else:
        if output_dimension is None:
            messages.warning_exit('When using fast_svd it is necessary to '
                                  'define the output_dimension')
        _, singular_values, components = fast_svd(
            data, output_dimension, q = 3)

    return components.T, singular_values ** 2
def svd(X):
    """Run ``fast_svd`` on ``X`` with the enclosing ``p`` and ``q=3``."""
    decomposition = fast_svd(X, p, q=3)
    return decomposition
def svd(X):
    """Run ``fast_svd`` on ``X`` with the enclosing ``p`` and ``q = 3``."""
    decomposition = fast_svd(X, p, q = 3)
    return decomposition
def svd_pca(data, fast = False, output_dimension = None, centre = None,
            auto_transpose = True):
    """Perform PCA through a singular value decomposition.

    Parameters
    ----------
    data : numpy array
        2-D input array. When ``centre`` is given the mean is subtracted
        IN PLACE, i.e. the caller's array is modified.
    fast : bool
        Whether to use the randomized ``fast_svd`` estimation (only
        effective when the module-level ``sklearn`` flag is True) instead
        of ``scipy.linalg.svd``.
    output_dimension : int
        Number of components requested from ``fast_svd``; required on the
        fast path.
    centre : None | 'variables' | 'trials'
        None applies no centring. 'variables' subtracts the mean taken
        along axis 1, 'trials' the mean taken along axis 0.
    auto_transpose : bool
        If True, the data is transposed before the decomposition whenever
        it has fewer rows than columns.

    Returns
    -------
    factors : numpy array
    loadings : numpy array
    explained_variance : numpy array
        Squared singular values divided by the original number of rows.
    mean : numpy array or None (if centre is None)

    Raises
    ------
    AttributeError
        If ``centre`` is not one of the accepted values.
    """
    n_rows, n_cols = data.shape

    if centre is None:
        mean = None
    elif centre == 'variables':
        mean = data.mean(1)[:, np.newaxis]
    elif centre == 'trials':
        mean = data.mean(0)[np.newaxis, :]
    else:
        raise AttributeError(
            'centre must be one of: None, variables, trials')
    if mean is not None:
        data -= mean

    if auto_transpose is True:
        if n_rows >= n_cols:
            # nothing to gain: remember that no transposition took place
            auto_transpose = False
        else:
            print("Auto transposing the data")
            data = data.T

    if not (fast is True and sklearn is True):
        U, S, V = scipy.linalg.svd(data, full_matrices = False)
    else:
        if output_dimension is None:
            messages.warning_exit('When using fast_svd it is necessary to '
                                  'define the output_dimension')
        U, S, V = fast_svd(data, output_dimension)

    explained_variance = S ** 2 / n_rows
    scaled_left = U * S
    right = V.T
    # the roles of the two outputs swap unless the flag is exactly False
    if auto_transpose is False:
        factors, loadings = right, scaled_left
    else:
        factors, loadings = scaled_left, right
    return factors, loadings, explained_variance, mean
X[i, j] = 1.0 del links print "Converting to CSR representation" X = X.tocsr() print "CSR conversion done" return X, redirects, index_map # stop after 5M links to make it possible to work in RAM X, redirects, index_map = get_adjacency_matrix( redirects_filename, page_links_filename, limit=5000000) names = dict((i, name) for name, i in index_map.iteritems()) print "Computing the principal singular vectors using fast_svd" t0 = time() U, s, V = fast_svd(X, 5, q=3) print "done in %0.3fs" % (time() - t0) # print the names of the wikipedia related strongest compenents of the the # principal singular vector which should be similar to the highest eigenvector print "Top wikipedia pages according to principal singular vectors" pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]]) pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]]) def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10): """Power iteration computation of the principal eigenvector This method is also known as Google PageRank and the implementation is based on the one from the NetworkX project (BSD licensed too) with copyrights by:
def svd(X):
    """Run ``fast_svd`` on ``X`` with the enclosing ``p``."""
    decomposition = fast_svd(X, p)
    return decomposition
del links print "Converting to CSR representation" X = X.tocsr() print "CSR conversion done" return X, redirects, index_map # stop after 5M links to make it possible to work in RAM X, redirects, index_map = get_adjacency_matrix(redirects_filename, page_links_filename, limit=5000000) names = dict((i, name) for name, i in index_map.iteritems()) print "Computing the principal singular vectors using fast_svd" t0 = time() U, s, V = fast_svd(X, 5, q=3) print "done in %0.3fs" % (time() - t0) # print the names of the wikipedia related strongest compenents of the the # principal singular vector which should be similar to the highest eigenvector print "Top wikipedia pages according to principal singular vectors" pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]]) pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]]) def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10): """Power iteration computation of the principal eigenvector This method is also known as Google PageRank and the implementation is based on the one from the NetworkX project (BSD licensed too) with copyrights by: