def __getitem__(self, bow, scaled=False):
    """
    Return the latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.

    Note that by default this function returns the **non-scaled** latent
    representation u^T * x; set `scaled` to True to have the result scaled
    by the inverse singular values, s^-1 * u^T * x.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    vec = matutils.sparse2full(bow, self.numTerms).astype(self.projection.u.dtype)
    vec.shape = (self.numTerms, 1)
    assert self.projection.u.flags.f_contiguous
    dgemv = matutils.blas('gemv', self.projection.u)
    topicDist = dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x
    nnz = topicDist.nonzero()[0]
    return zip(nnz, topicDist[nnz])
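# Illustrative sketch (not part of the original module): the fold-in math used
# by __getitem__ above, reduced to plain numpy. The names `u`, `s` and `x` are
# hypothetical stand-ins for projection.u, projection.s and the dense document
# vector.
def _fold_in_sketch():
    import numpy
    u = numpy.linalg.qr(numpy.random.rand(6, 2))[0]  # orthonormal basis: 6 terms x 2 topics
    s = numpy.array([3.0, 1.5])  # singular values
    x = numpy.random.rand(6)  # dense document vector
    topic_dist = numpy.dot(u.T, x)  # u^T * x, the default (non-scaled) representation
    scaled_dist = topic_dist / s  # s^-1 * u^T * x, returned when scaled=True
    return topic_dist, scaled_dist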
def getSimilarities(self, doc):
    """
    Return the similarity of sparse vector `doc` to all documents in the corpus.

    `doc` may be either a bag-of-words iterable (a standard corpus document),
    a numpy array, or a `scipy.sparse` matrix.
    """
    if scipy.sparse.issparse(doc):
        vec = doc.toarray().flatten()
    elif isinstance(doc, numpy.ndarray):
        vec = doc
    else:
        vec = matutils.sparse2full(doc, self.numFeatures)
    vec = numpy.asfortranarray(vec, dtype=self.corpus.dtype).reshape(self.numFeatures, 1)

    # compute cosine similarity against every other document in the collection
    gemv = matutils.blas('gemv', self.corpus)
    allSims = gemv(1.0, self.corpus, vec)  # N x T * T x 1 = N x 1
    allSims = list(allSims.flat)  # convert to a plain python list
    assert len(allSims) == self.corpus.shape[0]  # make sure no document got lost!
    return allSims
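# Illustrative sketch (not part of the original class): why a single
# matrix-vector product yields cosine similarities. Assuming both the corpus
# rows and the query are unit-length (the normalization below makes them so),
# the dot product equals the cosine of the angle between them; all names here
# are made up.
def _similarity_sketch():
    import numpy
    corpus = numpy.random.rand(4, 6)  # 4 documents x 6 features
    corpus /= numpy.sqrt((corpus ** 2).sum(axis=1))[:, numpy.newaxis]  # unit-length rows
    query = numpy.random.rand(6)
    query /= numpy.sqrt(numpy.dot(query, query))  # unit-length query
    all_sims = numpy.dot(corpus, query)  # N x T * T x 1 = N cosine similarities
    return list(all_sims)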
def stochasticSvd(corpus, rank, num_terms, chunks=20000, extra_dims=None,
                  power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the
    streamed input corpus `corpus` [3]_.

    This may actually return less than the requested number of top `rank`
    factors, in case the input itself is of lower rank. The `extra_dims`
    (oversampling) and especially `power_iters` (power iterations) parameters
    affect the accuracy of the decomposition.

    This algorithm makes `2 + power_iters` passes over the data. In case you
    can only afford a single pass over the input corpus, set `onepass=True` in
    :class:`LsiModel` and avoid using this function directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path
       is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # must ignore near-zero eigenvalues (probably numerical error); the
    # associated eigenvectors are typically unstable/garbage
    eps = max(float(eps), 1e-9)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o
        y = y.astype(dtype)  # TODO unlike numpy, scipy actually makes a copy even when dtype=y.dtype... marginally inefficient
        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            y = corpus.T * y
            y = corpus * y
    else:
        chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
        num_docs = 0
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
            # construct the chunk as a sparse matrix, to minimize memory overhead;
            # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
            chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms,
                                        dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunks  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr,  # y = y + chunk * o
                                    chunk.indices, chunk.data, o.ravel(), y.ravel())
            del chunk, o

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = y.copy()
            y[:] = 0.0
            chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
            for chunk_no, (key, group) in enumerate(chunker):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunks, num_docs))
                chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms,
                                            dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                y += tmp
            del yold

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    y = [y]
    q, r = matutils.qr_destroy(y)  # orthonormalize the range
    del y
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = numpy.asfortranarray(q[:, :samples].T)  # discard bogus columns, in case Y was rank-deficient
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A.
        # again, construct X incrementally, in chunks of `chunks` documents from the
        # streaming input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(samples, samples), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunks, num_docs))
            chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del chunk, b

        # now we're ready to compute the decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = numpy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s)  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus

    logger.info("computing the final decomposition")
    keep = clipSpectrum(s ** 2, rank, discard=eps)
    u = numpy.asfortranarray(u[:, :keep])
    s = s[:keep]
    gemm = matutils.blas('gemm', u)
    u = gemm(1.0, qt, u, trans_a=True)
    return u, s
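# Illustrative sketch (not part of the original module): the same two-phase
# randomized SVD of Halko, Martinsson & Tropp as stochasticSvd above, for a
# dense matrix that fits in RAM, with all the streaming/chunking machinery
# stripped away. Plain numpy; the parameter names are made up.
def _randomized_svd_sketch(a, rank, oversample=10, power_iters=2):
    import numpy
    n = a.shape[1]
    o = numpy.random.normal(0.0, 1.0, (n, rank + oversample))  # random gaussian test matrix
    y = numpy.dot(a, o)  # action matrix Y = A * O
    for _ in xrange(power_iters):  # power iterations sharpen the spectrum: Y = (A * A^T)^q * A * O
        y = numpy.dot(a, numpy.dot(a.T, y))
    q = numpy.linalg.qr(y)[0]  # orthonormalize the range
    b = numpy.dot(q.T, a)  # small projected matrix B = Q^T * A
    u_b, s, _ = numpy.linalg.svd(b, full_matrices=False)
    u = numpy.dot(q, u_b)  # rotate the left vectors back into the term space
    return u[:, :rank], s[:rank]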
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    The content of `other` is destroyed in the process, so pass this function
    a copy of `other` if you need it further.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => the result of the merge is the other projection, whatever it is
        self.u = other.u.copy('F')
        self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError("vector space mismatch: update is using %s features, expected %s" %
                         (other.m, self.m))
    logger.info("merging projections: %s + %s" % (str(self.u.shape), str(other.u.shape)))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with ORGQR.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in scipy? And is it fast(er)?

    # find the component of u2 orthogonal to u1
    # IMPORTANT: keep the matrices in a memory order suitable for matrix products;
    # failing to do so gives 8x lower performance :(
    self.u = numpy.asfortranarray(self.u)  # does nothing if the input is already a fortran-order array
    other.u = numpy.asfortranarray(other.u)
    gemm = matutils.blas('gemm', self.u)
    logger.debug("constructing orthogonal component")
    c = gemm(1.0, self.u, other.u, trans_a=True)
    gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

    other.u = [other.u]  # do some reference magic and call qr_destroy, to save RAM
    q, r = matutils.qr_destroy(other.u)  # q, r = QR(component)
    assert not other.u

    # find the rotation that diagonalizes r
    k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                    [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s]])
    logger.debug("computing SVD of %s dense matrix" % str(k.shape))
    try:
        # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge".
        # for these early versions of numpy, catch the error and try to compute
        # the SVD again, but over k*k^T instead.
        # see http://www.mail-archive.com/[email protected]/msg07224.html and
        # bug ticket http://projects.scipy.org/numpy/ticket/706
        u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)  # TODO *ugly overkill*!! only need the first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
    except numpy.linalg.LinAlgError:
        logging.error("SVD(A) failed; trying SVD(A * A^T)")
        u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices=False)  # if this fails too, give up with an exception
        s_k = numpy.sqrt(s_k)  # go back from eigenvalues to singular values

    k = clipSpectrum(s_k ** 2, self.k)
    u1_k, u2_k, s_k = u_k[:n1, :k].copy('F'), u_k[n1:, :k].copy('F'), s_k[:k]

    # update & rotate the current basis: U = [U, U'] * [U1_k, U2_k]
    logger.debug("updating orthonormal basis U")
    self.u = gemm(1.0, self.u, u1_k)  # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
    gemm(1.0, q, u2_k, beta=1.0, c=self.u, overwrite_c=True)
    self.s = s_k
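# Illustrative sketch (not part of the original class): a plain-numpy check of
# the merge identity used above. With C = U1^T * U2 and Q, R = QR(U2 - U1 * C),
# it holds that [U1*S1, U2*S2] = [U1, Q] * K for the block matrix K built
# below, so the SVD of the small K is enough to rotate the joint basis. All
# names here are made up.
def _merge_sketch():
    import numpy
    m, n1, n2 = 30, 5, 4
    u1, s1, _ = numpy.linalg.svd(numpy.random.rand(m, n1), full_matrices=False)
    u2, s2, _ = numpy.linalg.svd(numpy.random.rand(m, n2), full_matrices=False)
    c = numpy.dot(u1.T, u2)  # overlap between the two bases
    q, r = numpy.linalg.qr(u2 - numpy.dot(u1, c))  # component of u2 orthogonal to u1
    k = numpy.vstack([numpy.hstack([numpy.diag(s1), c * s2]),  # broadcasting scales columns by s2
                      numpy.hstack([numpy.zeros((n2, n1)), r * s2])])
    u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)
    u = numpy.dot(numpy.hstack([u1, q]), u_k)  # merged orthonormal basis
    target = numpy.hstack([u1 * s1, u2 * s2])  # what we wanted to decompose
    assert numpy.allclose(numpy.linalg.svd(target, compute_uv=False), s_k)
    return u, s_k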
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    The content of `other` is destroyed in the process, so pass this function
    a copy of `other` if you need it further.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => the result of the merge is the other projection, whatever it is
        if other.s is None:
            # other.u contains a direct document chunk, not an svd => perform the svd
            docs = other.u
            assert scipy.sparse.issparse(docs)
            if self.m * self.k < 10000:
                # SVDLIBC gives spurious results for small matrices; run full
                # LAPACK on them instead
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs.todense(), full_matrices=False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, self.k + 30)  # ask for a few extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s ** 2, self.k)
            self.u = u[:, :k].copy('F')
            self.s = s[:k]
        else:
            self.u = other.u.copy('F')
            self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError("vector space mismatch: update has %s features, expected %s" %
                         (other.m, self.m))
    logger.info("merging projections: %s + %s" % (str(self.u.shape), str(other.u.shape)))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
    if other.s is None:
        other.u = other.u.todense()
        other.s = 1.0  # broadcasting will promote this to eye(n2) where needed
    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with ORGQR.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in scipy? And is it fast(er)?

    # find the component of u2 orthogonal to u1
    # IMPORTANT: keep the matrices in a memory order suitable for matrix products;
    # failing to do so gives 8x lower performance :(
    self.u = numpy.asfortranarray(self.u)  # does nothing if the input is already a fortran-order array
    other.u = numpy.asfortranarray(other.u)
    gemm = matutils.blas('gemm', self.u)
    logger.debug("constructing orthogonal component")
    c = gemm(1.0, self.u, other.u, trans_a=True)
    gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(other.u.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
    qr, tau, work, info = geqrf(other.u, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(other.u, lwork=work[0], overwrite_a=True)
    del other.u
    assert info >= 0
    r = triu(qr[:n2, :n2])
    if m < n2:  # rare case, #features < #topics
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous

    # find the rotation that diagonalizes r
    k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                    [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s]])
    logger.debug("computing SVD of %s dense matrix" % str(k.shape))
    try:
        # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge".
        # for these early versions of numpy, catch the error and try to compute
        # the SVD again, but over k*k^T instead.
        # see http://www.mail-archive.com/[email protected]/msg07224.html and
        # bug ticket http://projects.scipy.org/numpy/ticket/706
        u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)  # TODO *ugly overkill*!! only need the first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
    except numpy.linalg.LinAlgError:
        logging.error("SVD(A) failed; trying SVD(A * A^T)")
        u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices=False)  # if this fails too, give up
        s_k = numpy.sqrt(s_k)  # go back from eigenvalues to singular values

    k = clipSpectrum(s_k ** 2, self.k)
    u_k, s_k = u_k[:, :k], s_k[:k]

    # update & rotate the current basis U
    logger.debug("updating orthonormal basis U")
    self.u = gemm(1.0, self.u, u_k[:n1])  # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
    gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u, overwrite_c=True)  # u = [u, u'] * u_k
    self.s = s_k
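# Illustrative note (not part of the original class): the geqrf/orgqr calls in
# merge above are the hand-unrolled LAPACK form of a thin ("economic") QR
# decomposition; on recent scipy the same result comes from a single call, as
# sketched here on a made-up matrix.
def _economic_qr_sketch():
    import numpy
    import scipy.linalg
    a = numpy.asfortranarray(numpy.random.rand(8, 3))
    q, r = scipy.linalg.qr(a, mode='economic')  # q: 8x3 orthonormal, r: 3x3 upper triangular
    assert numpy.allclose(numpy.dot(q, r), a)
    return q, r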