    def __init__(self):
        numpy.random.seed(21)        
        
        #Create a low rank matrix  
#        n = 100000 
#        m = 100000 
#        self.r = 50 
#        nKnown = 10**4
        # netflix-like
#        n = 480000
#        m = 18000 
#        self.r = 200 
#        nKnown = 10**8
        # close from netflix
#        n = 480000
#        m = 18000 
#        self.r = 200 
#        nKnown = 10**6
        # focusing on scalar-product (this last block is the configuration actually used)
        n = 480000
        m = 18000
        self.r = 50
        nKnown = 10**5
        
        self.X = SparseUtils.generateSparseLowRank((n, m), self.r, nKnown)
        print(self.X.nnz)

    def benchmark(self):
        numMatrices = 20
        matrixList = []

        print("Generating matrices")

        for i in range(numMatrices):
            print("Iteration: " + str(i))
            m = numpy.random.randint(5000, 20000)
            n = numpy.random.randint(5000, 20000)
            density = numpy.random.rand() * 0.1
            X = scipy.sparse.rand(m, n, density)

            r = numpy.random.randint(10, 50)
            U, s, V = SparseUtils.generateLowRank((m, n), r)

            print(m, n, density, r)
            matrixList.append((X, U, s, V))

        # number of singular values/vectors to compute in each SVD update
        k = 10

        times = []
        print("Starting timings for ARPACK")
        start = time.time()

        for i, matrices in enumerate(matrixList):
            print("Iteration: " + str(i))
            X, U, s, V = matrices
            SVDUpdate.addSparseArpack(U, s, V, X, k)

        times.append(time.time() - start)

        #Compare against PROPACK
        print("Starting timings for PROPACK")
        start = time.time()

        for i, matrices in enumerate(matrixList):
            print("Iteration: " + str(i))
            X, U, s, V = matrices
            SparseUtils.svdSparseLowRank(X, U, s, V, k)

        times.append(time.time() - start)
        print(times)
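
# A minimal driver sketch, not part of the original snippet: SVDUpdateProfile is a
# hypothetical placeholder name for whichever class the __init__ and benchmark
# methods above belong to.
if __name__ == "__main__":
    profile = SVDUpdateProfile()
    profile.benchmark()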

    def __init__(self):
        numpy.random.seed(21)

        # Create a low rank matrix
        n = 100000
        m = 100000
        self.r = 200
        k = 10**6

        self.X = SparseUtils.generateSparseLowRank((n, m), self.r, k)

    def __init__(self):
        numpy.random.seed(21)

        # Create a low rank matrix
        n = 500000
        m = 500000
        self.r = 200
        k = 10**6

        print("Generating low rank")
        self.X = SparseUtils.generateSparseLowRank((n, m), self.r, k)
        print("Generating csarray")
        self.X = csarray.fromScipySparse(self.X, storageType="rowMajor")
        print("Done")
Example #5
#Test whether we can easily get the SVD of a set of low-rank matrices that share 
#a fixed sparsity structure 

import numpy 
import scipy.sparse 
from exp.util.SparseUtils import SparseUtils 

numpy.set_printoptions(suppress=True, precision=3, linewidth=150)

shape = (15, 20)
r = 10
k = 50
X, U, s, V = SparseUtils.generateSparseLowRank(shape, r, k, verbose=True)

X = numpy.array(X.todense())

Y = numpy.zeros(X.shape)
Y[X.nonzero()] = 1

print(Y)

U2, s2, V2 = numpy.linalg.svd(Y)
print(s2)

X2 = numpy.zeros(X.shape)
for i in range(r): 
    X2 += s[i]*numpy.diag(U[:,i]).dot(Y).dot(numpy.diag(V[:, i]))
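
# Added check, not in the original snippet: on the observed entries the masked
# reconstruction X2 should coincide with X, since diag(U[:, i]).dot(Y).dot(diag(V[:, i]))
# is the entrywise product of the mask Y with the rank-one term outer(U[:, i], V[:, i]).
print(numpy.linalg.norm(X - X2))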

Example #6
"""
import sys
import logging
import scipy.sparse
import numpy
from sparsesvd import sparsesvd
from exp.util.SparseUtils import SparseUtils

numpy.random.seed(21)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.set_printoptions(precision=3, suppress=True, linewidth=100)

m = 10
n = 10
r = 1
U0, s0, V0 = SparseUtils.generateLowRank((m, n), r)

numInds = 10
inds = numpy.unique(numpy.random.randint(0, m * n, numInds))
A = SparseUtils.reconstructLowRank(U0, s0, V0, inds)
#print(A.todense())

t0 = s0 + numpy.random.rand(s0.shape[0]) * 0.1
B = SparseUtils.reconstructLowRank(U0, t0, V0, inds)
#print(B.todense())

k = 9
U, s, V = sparsesvd(A, k)
U2, s2, V2 = sparsesvd(B, k)

print(A.todense())
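
# Added comparison, not in the original snippet: print the singular values that
# sparsesvd returns for A and for the perturbed matrix B.
print(s)
print(s2)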
Example #7
#generator = SyntheticDataset1(startM=5000, endM=10000, startN=1000, endN=1500, pnz=0.10, noise=0.01)
#generator = FlixsterDataset()
generator = MovieLensDataset()
iterator = CenterMatrixIterator(generator.getTrainIteratorFunc())

k = 50


for i in range(1): 
    X = iterator.next()
    
    if i==0: 
        lastX = scipy.sparse.csc_matrix(X.shape)
    
    print("About to compute SVD") 
    U, s, V = SparseUtils.svdPropack(X, k) 
    print("Computed SVD") 
    
    plt.figure(0)
    plt.plot(numpy.arange(s.shape[0]), s) 
    """
    deltaX = X - lastX 
    deltaX.eliminate_zeros()
    deltaX.prune()
    print(X.nnz-lastX.nnz)
    U, s, V = SparseUtils.svdPropack(deltaX, k) 
    
    plt.figure(1)
    plt.plot(numpy.arange(s.shape[0]), s) 
    lastX = X
    """
Example #9
        lastX = X 
    else: 
        E = X - lastX
        E.eliminate_zeros()
        print(X.nnz, E.nnz)
        startTime = time.time()
        U3, s3, V3 = RandomisedSVD.updateSvd(X, U3, s3, V3, E, k, p)
        times[i, 1] = time.time() - startTime 
        
        lastX = X  
        
    errors[i, 1] = numpy.linalg.norm(X - (U3*s3).dot(V3.T)) 
    
    #Accurate method 
    startTime = time.time()
    U4, s4, V4 = SparseUtils.svdPropack(X, k)    
    times[i, 2] = time.time() - startTime 
    
    errors[i, 2] = numpy.linalg.norm(X - (U4*s4).dot(V4.T)) 
    
    #Final method - just use the same SVD
    if i == 0: 
        startTime = time.time()
        U5, s5, V5 = SparseUtils.svdPropack(X, k)    
        times[i, 3] = time.time() - startTime 
    
    errors[i, 3] = numpy.linalg.norm(X - (U5*s5).dot(V5.T)) 
    
    
cumtimes = numpy.cumsum(times, 0)
print(cumtimes)
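
# Added, not in the original fragment: also report the reconstruction errors of the
# three methods, to set against the cumulative times printed above.
print(errors)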
Example #10
X = vectoriser.fit_transform(documentList)

print(vectoriser.get_feature_names()) 

corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

id2WordDict = dict(zip(range(len(vectoriser.get_feature_names())), vectoriser.get_feature_names()))   

k = 10
logging.getLogger('gensim').setLevel(logging.INFO)
lda = LdaModel(corpus, num_topics=k, id2word=id2WordDict, chunksize=1000, distributed=False) 
index = gensim.similarities.docsim.SparseMatrixSimilarity(lda[corpus], num_features=k)          

newX = vectoriser.transform(["graph"])
newX = [(i, newX[0, i]) for i in newX.nonzero()[1]]
result = lda[newX]             
similarities = index[result]
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
print(similarities)

#Compute Hellinger distance 
result = [i[1] for i in result]
newX = scipy.sparse.csc_matrix(result)
distances = SparseUtils.hellingerDistances(index.index, newX)
print(1 - distances)

#Try cosine metric 
X = Standardiser().normaliseArray(numpy.array(index.index.todense()).T).T
newX = numpy.array(newX.todense())
similarities = X.dot(newX.T).flatten()
print(similarities)
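
# Added, not in the original snippet: rank documents by cosine similarity in the same
# way the LDA-based similarities were sorted above.
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
print(similarities)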