def testSvd(self):
     """
     Check RandomisedSVD.svd on a random sparse matrix for increasing rank k.

     For every k in ks the test verifies that:

     * the returned U and V have orthonormal columns,
     * the reconstruction error does not grow as k increases (the starting
       reference is the norm of A itself),
     * the reconstruction error is no smaller than the error of an exact
       SVD truncated to the 2*k largest singular values.
     """
     n = 100
     A = scipy.sparse.rand(n, n, 0.1)
     # Densify once; every norm and the exact SVD below work on this array
     denseA = numpy.array(A.todense())

     ks = [10, 20, 30, 40]
     q = 2

     # The exact decomposition and the descending order of its singular
     # values do not depend on k, so compute them once outside the loop
     exactU, exactS, exactV = numpy.linalg.svd(denseA)
     descInds = numpy.flipud(numpy.argsort(exactS))

     lastError = numpy.linalg.norm(denseA)

     for k in ks:
         U, s, V = RandomisedSVD.svd(A, k, q)

         nptst.assert_array_almost_equal(U.T.dot(U), numpy.eye(k))
         nptst.assert_array_almost_equal(V.T.dot(V), numpy.eye(k))
         A2 = (U*s).dot(V.T)

         error = numpy.linalg.norm(denseA - A2)
         # assertLessEqual reports both values when the check fails
         self.assertLessEqual(error, lastError)
         lastError = error

         #Compare versus exact svd
         Uk, sk, Vk = Util.indSvd(exactU, exactS, exactV, descInds[0:k*2])
         Ak = (Uk*sk).dot(Vk.T)

         error2 = numpy.linalg.norm(denseA - Ak)
         self.assertLessEqual(error2, error)
 def profileSvd2(self):
     """
     Profile RandomisedSVD.svd with k=500 on the "contacts_train" matrix.

     The matrix is read from the "erasm/contacts/" folder under the data
     directory, converted to an int8 CSC matrix, decomposed, and the s
     output of the decomposition is printed.
     """
     trainFilename = PathDefaults.getDataDir() + "erasm/contacts/" + "contacts_train"

     # Load the file and convert it to CSC with int8 entries in one step
     trainX = scipy.sparse.csc_matrix(scipy.io.mmread(trainFilename), dtype=numpy.int8)

     # Only the middle output is printed; the outer two are discarded
     _, singularValues, _ = RandomisedSVD.svd(trainX, 500)

     print(singularValues)
     print("All done")
Exemple #3
0
    def profileSvd2(self):
        """
        Profile RandomisedSVD.svd with k=500 on the "contacts_train" matrix.

        The matrix is read from the "erasm/contacts/" folder under the data
        directory, converted to an int8 CSC matrix, decomposed, and the s
        output of the decomposition is printed.
        """
        contactsDir = PathDefaults.getDataDir() + "erasm/contacts/"
        rawMatrix = scipy.io.mmread(contactsDir + "contacts_train")

        # Store as CSC with int8 entries before decomposing
        trainX = scipy.sparse.csc_matrix(rawMatrix, dtype=numpy.int8)

        rank = 500
        decomposition = RandomisedSVD.svd(trainX, rank)
        # Unpack to keep the original's three-output contract explicit
        _, s, _ = decomposition

        print(s)
        print("All done")
 def profileSvd3(self):
     """
     Profile RandomisedSVD.svd with k=50 on one NetflixDataset training item.

     The next item of the training iterator is wrapped with
     GeneralLinearOperator.asLinearOperator, decomposed, and the s output
     of the decomposition is printed.
     """
     trainIterator = NetflixDataset().getTrainIteratorFunc()
     # NOTE(review): .next() is the Python 2 iterator spelling; confirm the
     # returned object still exposes it before running under Python 3
     X = trainIterator.next()

     # An alternative wrapper, LinOperatorUtils.parallelSparseOp(X), was
     # previously tried here and is left disabled
     linearOp = GeneralLinearOperator.asLinearOperator(X)

     _, s, _ = RandomisedSVD.svd(linearOp, 50)

     print(s)
     print("All done")
Exemple #5
0
    def profileSvd3(self):
        """
        Profile RandomisedSVD.svd with k=50 on one NetflixDataset training item.

        The next item of the training iterator is wrapped with
        GeneralLinearOperator.asLinearOperator, decomposed, and the s output
        of the decomposition is printed.
        """
        netflix = NetflixDataset()
        trainIterator = netflix.getTrainIteratorFunc()
        # NOTE(review): .next() is the Python 2 iterator spelling; confirm the
        # returned object still exposes it before running under Python 3
        X = trainIterator.next()

        # LinOperatorUtils.parallelSparseOp(X) is a disabled alternative wrapper
        operator = GeneralLinearOperator.asLinearOperator(X)

        rank = 50
        outputs = RandomisedSVD.svd(operator, rank)
        _, s, _ = outputs

        print(s)
        print("All done")
Exemple #6
0
            #Nystrom method 
            print("Running Nystrom")
            for j, nystromN in enumerate(nystromNs):  
                omega2, Q2 = Nystrom.eigpsd(L, nystromN)
                inds = numpy.flipud(numpy.argsort(omega2))
                omega2, Q2 = omega2[inds], Q2[:, inds]
                omega2k, Q2k = omega2[0:k], Q2[:, 0:k]
                
#                errors[i, j] += computeBound(L, omega, Q, omega2k, Q2k, k)
                errors[i, j] += computeSinTheta(Qkbot, Q2k)
            

            #Randomised SVD method 
            print("Running Random SVD")
            for j, r in enumerate(randSVDVecs):  
                Q4, omega4, R4 = RandomisedSVD.svd(L, r)
                inds = numpy.flipud(numpy.argsort(omega4))
                omega4, Q4 = omega4[inds], Q4[:, inds]
                omega4k, Q4k = omega4[0:k], Q4[:, 0:k]
                
#                errors[i, j+len(nystromNs)] += computeBound(L, omega, Q, omega4k, Q4k, k)
                errors[i, j+len(nystromNs)] += computeSinTheta(Qkbot, Q4k)
            
            
            #Incremental updates 
            print("Running Eigen-update")
            for j, l in enumerate(IASCL):  
                omega3, Q3 = eigenUpdate(lastL, L, lastOmegas[j], lastQs[j], l)
                inds = numpy.flipud(numpy.argsort(omega3)) 
                omega3, Q3 = omega3[inds], Q3[:, inds]
                omega3k, Q3k = omega3[0:k], Q3[:, 0:k]
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for each graph given by the iterator.

        For every weight matrix an eigen-decomposition is computed with the
        method selected by self.alg ("IASC", "nystrom", "exact",
        "efficientNystrom" or "randomisedSvd"); the self.k1 eigenvectors with
        the largest eigenvalues are row-normalised, whitened and clustered
        with k-means. For every graph after the first, k-means is seeded with
        centroids derived from the previous graph's cluster assignment.

        The difference between a weight matrix and the previous one should be
        positive.

        :param graphListIterator: an iterable of weight matrices, one per
            graph. Each is checked for symmetry when __debug__ is set.

        :param verbose: if True, timings and bounds are returned in addition
            to the clusters.

        :return: the list of cluster assignments, one entry per graph. If
            verbose is True a tuple (clustersList, times, boundList) is
            returned instead, where times is an array with one row per graph
            holding the decomposition time and the k-means time, and
            boundList holds the rows appended on the approximate "IASC" steps
            when self.computeBound is set.

        :raises ValueError: if self.alg is not one of the known algorithms.
        """
        clustersList = []
        decompositionTimeList = []
        kMeansTimeList = []
        boundList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            # efficientNystrom works on the weight matrix directly
            if self.alg != "efficientNystrom":
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg == "IASC":
                if i % self.T != 0:
                    # Approximate step: update the previous omega, Q. These
                    # always exist because i == 0 takes the exact branch.
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        # omegaKbot and AKbot come from the last exact step
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)

                        #Now use accurate values of norm of R and delta
                        rank = Util.rank(ABBA.todense())
                        gamma, _ = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                        bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                        boundList.append([i, bounds[0], bounds[1], bounds2[0], bounds2[1]])
                else:
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound:
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        # Keep everything past the top k2 eigenpairs for the
                        # bounds computed on the following approximate steps
                        bottomInds = inds[self.k2:]
                        omegaKbot = omega[bottomInds]
                        QKbot = Q[:, bottomInds]
                        AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                    else:
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv=min(10*self.k2, ABBA.shape[0]))

            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact":
                omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv=min(15*self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd":
                Q, omega, _ = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            decompositionTimeList.append(time.time()-startTime)

            if self.alg == "IASC":
                self.storeInformation(subW, ABBA)

            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))
            topVectors = Q[:, inds[0:self.k1]].real

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            try:
                V = standardiser.normaliseArray(topVectors.T).T
            except FloatingPointError as e:
                logging.warning("FloatingPointError: " + str(e))
                # Without a fallback V would be unbound on the first graph and
                # stale (from the previous graph) afterwards, so continue with
                # the eigenvectors as they are.
                # NOTE(review): assumes normaliseArray has not modified its
                # argument in place before raising - confirm
                V = topVectors
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, _ = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
            else:
                # Seed k-means from the clusters found for the previous graph
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    # Top up with randomly chosen rows of V to reach k1 centroids
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids),:]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, _ = vq.kmeans(V, centroids) #iter can only be 1
            clusters, _ = vq.vq(V, centroids)
            kMeansTimeList.append(time.time()-startTime)

            clustersList.append(clusters)

            # Dump the local variables once memory use passes 10**9
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, boundList
        else:
            return clustersList
Exemple #8
0
    times[i, 0] = time.time() - startTime 
    
    errors[i, 0] = numpy.linalg.norm(X - (U2*s2).dot(V2.T)) 

    #Now RSVD + update 
    if i == 0: 
        startTime = time.time()
        U3, s3, V3 = sppy.linalg.core.rsvd(X, k,q=q)
        times[i, 1] = time.time() - startTime 
        lastX = X 
    else: 
        E = X - lastX
        E.eliminate_zeros()
        print(X.nnz, E.nnz)
        startTime = time.time()
        U3, s3, V3 = RandomisedSVD.updateSvd(X, U3, s3, V3, E, k, p)
        times[i, 1] = time.time() - startTime 
        
        lastX = X  
        
    errors[i, 1] = numpy.linalg.norm(X - (U3*s3).dot(V3.T)) 
    
    #Accurate method 
    startTime = time.time()
    U4, s4, V4 = SparseUtils.svdPropack(X, k)    
    times[i, 2] = time.time() - startTime 
    
    errors[i, 2] = numpy.linalg.norm(X - (U4*s4).dot(V4.T)) 
    
    #Final method - just use the same SVD
    if i == 0: