def testSvd(self):
    """
    Check RandomisedSVD.svd on a random sparse matrix for increasing ranks.

    For every rank k in [10, 20, 30, 40] the test asserts that:

    * the returned U and V have orthonormal columns (U.T U and V.T V are
      approximately the k x k identity),
    * the reconstruction error of (U*s) V.T does not increase compared to
      the previous, smaller rank,
    * the reconstruction built from the exact SVD restricted to the 2k
      largest singular values is at least as accurate as the randomised
      rank-k reconstruction.
    """
    n = 100
    A = scipy.sparse.rand(n, n, 0.1)
    # One dense copy serves every norm below and the exact reference SVD.
    denseA = A.toarray()

    ks = [10, 20, 30, 40]
    q = 2

    # The exact decomposition does not depend on k, so it is computed once
    # instead of once per loop iteration.
    # NOTE(review): assumes Util.indSvd does not modify its inputs - confirm.
    exactU, exactS, exactV = numpy.linalg.svd(denseA)
    sortedInds = numpy.flipud(numpy.argsort(exactS))

    # Error of the rank-0 approximation: the norm of A itself.
    lastError = numpy.linalg.norm(denseA)

    for k in ks:
        U, s, V = RandomisedSVD.svd(A, k, q)

        nptst.assert_array_almost_equal(U.T.dot(U), numpy.eye(k))
        nptst.assert_array_almost_equal(V.T.dot(V), numpy.eye(k))
        A2 = (U*s).dot(V.T)

        error = numpy.linalg.norm(denseA - A2)
        self.assertTrue(error <= lastError)
        lastError = error

        # Compare versus exact svd restricted to the 2k largest singular values
        Uk, sk, Vk = Util.indSvd(exactU, exactS, exactV, sortedInds[0:k*2])
        Ak = (Uk*sk).dot(Vk.T)

        error2 = numpy.linalg.norm(denseA - Ak)
        self.assertTrue(error2 <= error)
def profileSvd2(self):
    """
    Run RandomisedSVD.svd with k = 500 on the "contacts_train" matrix.

    The matrix is read with scipy.io.mmread from the "erasm/contacts/"
    folder below PathDefaults.getDataDir() and converted to an int8 CSC
    matrix before the decomposition. The second value returned by the
    decomposition (s) is printed, followed by a completion message.
    """
    rank = 500
    contactsPath = PathDefaults.getDataDir() + "erasm/contacts/" + "contacts_train"

    contacts = scipy.sparse.csc_matrix(scipy.io.mmread(contactsPath), dtype=numpy.int8)
    _, sigma, _ = RandomisedSVD.svd(contacts, rank)

    print(sigma)
    print("All done")
def profileSvd3(self):
    """
    Run RandomisedSVD.svd with k = 50 on the first item of the Netflix
    training iterator.

    The item is wrapped with GeneralLinearOperator.asLinearOperator before
    being decomposed. The second value returned by the decomposition (s) is
    printed, followed by a completion message.
    """
    rank = 50
    trainIterator = NetflixDataset().getTrainIteratorFunc()
    # NOTE(review): .next() is the Python 2 iterator protocol; confirm the
    # iterator type before switching to the builtin next(trainIterator).
    firstX = trainIterator.next()

    # LinOperatorUtils.parallelSparseOp(firstX) was the alternative operator
    # left commented out by the original author.
    linearOp = GeneralLinearOperator.asLinearOperator(firstX)
    _, sigma, _ = RandomisedSVD.svd(linearOp, rank)

    print(sigma)
    print("All done")
#Nystrom method print("Running Nystrom") for j, nystromN in enumerate(nystromNs): omega2, Q2 = Nystrom.eigpsd(L, nystromN) inds = numpy.flipud(numpy.argsort(omega2)) omega2, Q2 = omega2[inds], Q2[:, inds] omega2k, Q2k = omega2[0:k], Q2[:, 0:k] # errors[i, j] += computeBound(L, omega, Q, omega2k, Q2k, k) errors[i, j] += computeSinTheta(Qkbot, Q2k) #Randomised SVD method print("Running Random SVD") for j, r in enumerate(randSVDVecs): Q4, omega4, R4 = RandomisedSVD.svd(L, r) inds = numpy.flipud(numpy.argsort(omega4)) omega4, Q4 = omega4[inds], Q4[:, inds] omega4k, Q4k = omega4[0:k], Q4[:, 0:k] # errors[i, j+len(nystromNs)] += computeBound(L, omega, Q, omega4k, Q4k, k) errors[i, j+len(nystromNs)] += computeSinTheta(Qkbot, Q4k) #Incremental updates print("Running Eigen-update") for j, l in enumerate(IASCL): omega3, Q3 = eigenUpdate(lastL, L, lastOmegas[j], lastQs[j], l) inds = numpy.flipud(numpy.argsort(omega3)) omega3, Q3 = omega3[inds], Q3[:, inds] omega3k, Q3k = omega3[0:k], Q3[:, 0:k]
def clusterFromIterator(self, graphListIterator, verbose=False):
    """
    Find a set of clusters for each graph given by the iterator.

    For every weight matrix the method computes an eigen-decomposition
    (omega, Q) with the algorithm named by self.alg ("IASC", "nystrom",
    "exact", "efficientNystrom" or "randomisedSvd"), keeps the columns of Q
    belonging to the self.k1 largest entries of omega, normalises and
    whitens them, and clusters the rows with k-means. The first graph is
    clustered with vq.kmeans started from self.k1 centroids; each later
    graph is seeded with the centroids obtained from the previous
    clustering, padded with random rows when fewer than self.k1 are found.

    The difference between a weight matrix and the previous one should be
    positive.

    :param graphListIterator: iterable of weight matrices, one per graph.

    :param verbose: if True, timings and bounds are returned as well.

    :return: the list of cluster assignments (one entry per graph). When
        verbose is True a tuple (clustersList, times, boundList) is returned
        instead, where times is an array with one row per graph holding the
        decomposition time and the k-means time, and boundList holds one
        [i, bounds[0], bounds[1], bounds2[0], bounds2[1]] entry for each
        approximate IASC update performed with self.computeBound set.

    :raises ValueError: if self.alg is not one of the supported algorithms.
    """
    clustersList = []
    decompositionTimeList = []
    kMeansTimeList = []
    boundList = []

    for i, subW in enumerate(graphListIterator):
        if __debug__:
            Parameter.checkSymmetric(subW)

        if self.logStep and i % self.logStep == 0:
            logging.debug("Graph index: " + str(i))
        logging.debug("Clustering graph of size " + str(subW.shape))
        if self.alg != "efficientNystrom":
            ABBA = GraphUtils.shiftLaplacian(subW)

        # --- Eigen value decomposition ---
        startTime = time.time()
        if self.alg == "IASC":
            if i % self.T != 0:
                # Approximate update from the previous graph's decomposition.
                omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                if self.computeBound:
                    inds = numpy.flipud(numpy.argsort(omega))
                    Q = Q[:, inds]
                    omega = omega[inds]
                    bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)

                    # Now use accurate values of norm of R and delta
                    rank = Util.rank(ABBA.todense())
                    gamma, _ = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                    boundList.append([i, bounds[0], bounds[1], bounds2[0], bounds2[1]])
            else:
                # Every self.T-th graph is decomposed exactly.
                logging.debug("Computing exact eigenvectors")
                self.storeInformation(subW, ABBA)

                if self.computeBound:
                    rank = Util.rank(ABBA.todense())
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    inds = numpy.flipud(numpy.argsort(omega))
                    # Everything after the self.k2 largest eigenvalues; used
                    # by the bound computations on later approximate updates.
                    omegaKbot = omega[inds[self.k2:]]
                    QKbot = Q[:, inds[self.k2:]]
                    AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                else:
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1),
                        which="LM", ncv=min(10*self.k2, ABBA.shape[0]))
        elif self.alg == "nystrom":
            omega, Q = Nystrom.eigpsd(ABBA, self.k3)
        elif self.alg == "exact":
            omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1),
                which="LM", ncv=min(15*self.k1, ABBA.shape[0]))
        elif self.alg == "efficientNystrom":
            omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
        elif self.alg == "randomisedSvd":
            Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
        else:
            raise ValueError("Invalid Algorithm: " + str(self.alg))

        decompositionTimeList.append(time.time()-startTime)

        if self.alg == "IASC":
            self.storeInformation(subW, ABBA)

        # --- Kmeans ---
        startTime = time.time()
        inds = numpy.flipud(numpy.argsort(omega))

        standardiser = Standardiser()
        # For some very strange reason we get an overflow when computing the
        # norm of the rows of Q even though its elements are bounded by 1.
        try:
            V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
        except FloatingPointError as e:
            logging.warning("FloatingPointError: " + str(e))
            # Fall back to the un-normalised rows so that V always belongs to
            # the current graph; previously V was left unbound on the first
            # graph and stale (from the previous graph) afterwards.
            V = Q[:, inds[0:self.k1]].real
        V = VqUtils.whiten(V)
        if i == 0:
            centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
        else:
            # Seed k-means from the previous clustering of the vertices.
            centroids = self.findCentroids(V, clusters[:subW.shape[0]])
            if centroids.shape[0] < self.k1:
                nb_missing_centroids = self.k1 - centroids.shape[0]
                random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids), :]
                centroids = numpy.vstack((centroids, random_centroids))
            centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
        clusters, distortion = vq.vq(V, centroids)
        kMeansTimeList.append(time.time()-startTime)

        clustersList.append(clusters)

        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())

    if verbose:
        return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, boundList
    else:
        return clustersList
times[i, 0] = time.time() - startTime errors[i, 0] = numpy.linalg.norm(X - (U2*s2).dot(V2.T)) #Now RSVD + update if i == 0: startTime = time.time() U3, s3, V3 = sppy.linalg.core.rsvd(X, k,q=q) times[i, 1] = time.time() - startTime lastX = X else: E = X - lastX E.eliminate_zeros() print(X.nnz, E.nnz) startTime = time.time() U3, s3, V3 = RandomisedSVD.updateSvd(X, U3, s3, V3, E, k, p) times[i, 1] = time.time() - startTime lastX = X errors[i, 1] = numpy.linalg.norm(X - (U3*s3).dot(V3.T)) #Accurate method startTime = time.time() U4, s4, V4 = SparseUtils.svdPropack(X, k) times[i, 2] = time.time() - startTime errors[i, 2] = numpy.linalg.norm(X - (U4*s4).dot(V4.T)) #Final method - just use the same SVD if i == 0: