Exemple #1
0
 def __getitem__(self, bow, scaled=False):
     """
     Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

     This is done by folding the input document into the latent topic space.

     By default (`scaled=False`) the result is the plain projection `u^T * x`,
     which is **not** scaled by the singular values. Set `scaled` to True to
     multiply every coordinate by the inverse of its singular value, i.e. to
     return `s^-1 * u^T * x`.

     If `bow` turns out to be a whole corpus rather than a single document, the
     result of `self._apply(bow)` is returned instead.

     Only topics whose value is non-zero appear in the returned list.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as a result
     is_corpus, bow = utils.isCorpus(bow)
     if is_corpus:
         return self._apply(bow)

     assert self.projection.u is not None, "decomposition not initialized yet"
     # densify the document into a (numTerms, 1) column with the same dtype as `u`
     vec = matutils.sparse2full(bow, self.numTerms).astype(self.projection.u.dtype)
     vec.shape = (self.numTerms, 1)
     assert self.projection.u.flags.f_contiguous
     dgemv = matutils.blas('gemv', self.projection.u)
     topicDist = dgemv(1.0, self.projection.u, vec, trans=True) # u^T * x
     if scaled:
         topicDist = (1.0 / self.projection.s) * topicDist # s^-1 * u^T * x

     nnz = topicDist.nonzero()[0]
     # materialize explicitly: on Python 3 `zip` is a lazy iterator, not a list
     return list(zip(nnz, topicDist[nnz]))
Exemple #2
0
    def __getitem__(self, bow, scaled=False):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding the input document into the latent topic space.

        By default (`scaled=False`) the result is the plain projection `u^T * x`,
        which is **not** scaled by the singular values. Set `scaled` to True to
        multiply every coordinate by the inverse of its singular value, i.e. to
        return `s^-1 * u^T * x`.

        If `bow` turns out to be a whole corpus rather than a single document, the
        result of `self._apply(bow)` is returned instead.

        Only topics whose value is non-zero appear in the returned list.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        assert self.projection.u is not None, "decomposition not initialized yet"
        # densify the document into a (numTerms, 1) column with the same dtype as `u`
        vec = matutils.sparse2full(bow, self.numTerms).astype(
            self.projection.u.dtype)
        vec.shape = (self.numTerms, 1)
        assert self.projection.u.flags.f_contiguous
        dgemv = matutils.blas('gemv', self.projection.u)
        topicDist = dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
        if scaled:
            topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x

        nnz = topicDist.nonzero()[0]
        # materialize explicitly: on Python 3 `zip` is a lazy iterator, not a list
        return list(zip(nnz, topicDist[nnz]))
 def getSimilarities(self, doc):
     """
     Score the vector `doc` against every document stored in `self.corpus`.

     `doc` can be a `scipy.sparse` matrix, a numpy array, or anything else, in
     which case it is treated as a bag-of-words document and densified with
     `matutils.sparse2full`.

     The result is a plain python list holding one score per row of `self.corpus`.
     """
     # bring the query into dense form, whatever shape it arrived in
     if scipy.sparse.issparse(doc):
         dense = doc.toarray().flatten()
     else:
         if isinstance(doc, numpy.ndarray):
             dense = doc
         else:
             dense = matutils.sparse2full(doc, self.numFeatures)
     # fortran-ordered T x 1 column in the dtype of the index, ready for BLAS
     column = numpy.asfortranarray(dense, dtype = self.corpus.dtype)
     column = column.reshape(self.numFeatures, 1)

     # a single matrix-vector product scores the query against the whole collection.
     # NOTE(review): these are dot products; they are cosine similarities only if
     # both the index rows and the query are unit length -- confirm with the caller.
     multiply = matutils.blas('gemv', self.corpus)
     product = multiply(1.0, self.corpus, column) # N x T * T x 1 = N x 1
     scores = list(product.flat) # convert to plain python list
     assert self.corpus.shape[0] == len(scores) # make sure no document got lost!
     return scores
Exemple #4
0
    def getSimilarities(self, doc):
        """
        Score the vector `doc` against every document stored in `self.corpus`.

        `doc` can be a `scipy.sparse` matrix, a numpy array, or anything else, in
        which case it is treated as a bag-of-words document and densified with
        `matutils.sparse2full`.

        The result is a plain python list holding one score per row of `self.corpus`.
        """
        # bring the query into dense form, whatever shape it arrived in
        if scipy.sparse.issparse(doc):
            dense = doc.toarray().flatten()
        else:
            if isinstance(doc, numpy.ndarray):
                dense = doc
            else:
                dense = matutils.sparse2full(doc, self.numFeatures)
        # fortran-ordered T x 1 column in the dtype of the index, ready for BLAS
        column = numpy.asfortranarray(dense, dtype=self.corpus.dtype)
        column = column.reshape(self.numFeatures, 1)

        # a single matrix-vector product scores the query against the whole collection.
        # NOTE(review): these are dot products; they are cosine similarities only if
        # both the index rows and the query are unit length -- confirm with the caller.
        multiply = matutils.blas('gemv', self.corpus)
        product = multiply(1.0, self.corpus, column)  # N x T * T x 1 = N x 1
        scores = list(product.flat)  # convert to plain python list
        assert self.corpus.shape[0] == len(scores)  # make sure no document got lost!
        return scores
Exemple #5
0
def stochasticSvd(corpus,
                  rank,
                  num_terms,
                  chunks=20000,
                  extra_dims=None,
                  power_iters=0,
                  dtype=numpy.float64,
                  eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed
    input corpus `corpus` [3]_.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.

    This algorithm uses `2+power_iters` passes over the data. In case you can only
    afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel`
    and avoid using this algorithm directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    Parameters:

    * `corpus` -- either an iterable of documents (it is iterated over several
      times, so it must be re-iterable), or a `scipy.sparse` matrix with `num_terms`
      rows and one column per document.
    * `rank` -- number of requested factors (converted with `int`).
    * `num_terms` -- number of features, i.e. rows of the input (converted with `int`).
    * `chunks` -- number of documents processed at a time when `corpus` is streamed.
    * `extra_dims` -- number of oversampling columns added to `rank`; with None,
      `max(10, 2 * rank)` samples are used in total.
    * `power_iters` -- number of power iterations.
    * `dtype` -- dtype of the dense work matrices.
    * `eps` -- threshold handed to `clipSpectrum` as `discard`; values below 1e-9
      are raised to 1e-9.

    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        # use more samples than requested factors, to improve accuracy
        samples = max(10, 2 * rank)
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" %
                (samples - rank, power_iters))

    num_terms = int(num_terms)

    # must ignore near-zero eigenvalues (probably numerical error); the
    # associated eigenvectors are typically unstable/garbage
    eps = max(float(eps), 1e-9)

    def chunk_key(numbered_doc):
        # `numbered_doc` is a (docno, doc) pair produced by enumerate(corpus);
        # every run of `chunks` consecutive documents shares one key. Floor division
        # is spelled out so the grouping is the same on Python 2 and Python 3, and
        # no tuple-unpacking lambda (a syntax error on Python 3) is needed.
        return numbered_doc[0] // chunks

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        # in-core path: the whole corpus is a single sparse matrix
        # NOTE(review): the index arrays are handed to `csc_matvecs`, so the matrix
        # is presumably expected in CSC format -- confirm against callers
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (
            m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
            y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr,
                                corpus.indices, corpus.data, o.ravel(),
                                y.ravel())  # y = corpus * o
        del o
        # TODO unlike numpy, scipy actually makes a copy even when
        # dtype=y.dtype...marginally inefficient
        y = y.astype(dtype)
        logger.debug("running %i power iterations" % power_iters)
        for power_iter in range(power_iters):
            y = corpus.T * y
            y = corpus * y
    else:
        # streamed path: accumulate Y chunk by chunk
        chunker = itertools.groupby(enumerate(corpus), key=chunk_key)
        num_docs = 0
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
            chunk = matutils.corpus2csc(
                (doc for _, doc in group), num_terms=num_terms,
                dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunks  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
                dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(
                num_terms,
                n,
                samples,
                chunk.indptr,  # y = y + chunk * o
                chunk.indices,
                chunk.data,
                o.ravel(),
                y.ravel())
            del chunk, o

        for power_iter in range(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            # each iteration replaces y with (A * A.T) * y, summed over the chunks of A
            yold = y.copy()
            y[:] = 0.0
            chunker = itertools.groupby(enumerate(corpus), key=chunk_key)
            for chunk_no, (key, group) in enumerate(chunker):
                logger.info('PROGRESS: at document #%i/%i' %
                            (chunk_no * chunks, num_docs))
                chunk = matutils.corpus2csc(
                    (doc for _, doc in group),
                    num_terms=num_terms,
                    dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                y += tmp
            del yold

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    # wrap y in a list before handing it to qr_destroy
    # NOTE(review): presumably so that qr_destroy can drop the last reference and
    # reuse the memory -- confirm in matutils
    y = [y]
    q, r = matutils.qr_destroy(y)  # orthonormalize the range
    del y
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    # discard bogus columns, in case Y was rank-deficient
    qt = numpy.asfortranarray(q[:, :samples].T)
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunks` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(samples, samples), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix" %
                    str(x.shape))
        chunker = itertools.groupby(enumerate(corpus), key=chunk_key)
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i/%i' %
                        (chunk_no * chunks, num_docs))
            chunk = matutils.corpus2csc((doc for _, doc in group),
                                        num_terms=num_terms,
                                        dtype=dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            x += numpy.dot(b, b.T)
            del chunk, b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" %
                    str(x.shape))
        # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        u, s, vt = numpy.linalg.svd(x)
        # sqrt to go back from singular values of X to singular values of B =
        # singular values of the corpus
        s = numpy.sqrt(s)

    logger.info("computing the final decomposition")
    keep = clipSpectrum(s**2, rank, discard=eps)
    u = numpy.asfortranarray(u[:, :keep])
    s = s[:keep]
    # map the small factors back to the original term space: U = Q * u = qt.T * u
    gemm = matutils.blas('gemm', u)
    u = gemm(1.0, qt, u, trans_a=True)
    return u, s
Exemple #6
0
    def merge(self, other, decay=1.0):
        """
        Merge this Projection with another.

        The content of `other` is destroyed in the process, so pass this function a
        copy of `other` if you need it further.

        `decay` multiplies the singular values of this projection (`self.s`) before
        the two projections are combined; the default of 1.0 leaves them unchanged.

        Nothing is returned; `self.u` and `self.s` are replaced by the merged
        factors. If `other` is empty, nothing happens; if this projection is
        empty, it becomes a copy of `other`.

        Raises ValueError if the two projections disagree on the number of
        features (`m`).
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            self.u = other.u.copy('F')
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update is using %s features, expected %s"
                % (other.m, self.m))
        logger.info("merging projections: %s + %s" %
                    (str(self.u.shape), str(other.u.shape)))
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operation we ever need is basis^T*basis ond basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # find component of u2 orthogonal to u1
        # IMPORTANT: keep matrices in memory suitable order for matrix products; failing to do so gives 8x lower performance :(
        self.u = numpy.asfortranarray(
            self.u)  # does nothing if input already fortran-order array
        other.u = numpy.asfortranarray(other.u)
        gemm = matutils.blas('gemm', self.u)
        logger.debug("constructing orthogonal component")
        c = gemm(1.0, self.u, other.u, trans_a=True)  # c = u1^T * u2
        # other.u = u2 - u1 * c, computed in place (this is where `other` gets clobbered)
        gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

        other.u = [
            other.u
        ]  # do some reference magic and call qr_destroy, to save RAM
        q, r = matutils.qr_destroy(other.u)  # q, r = QR(component)
        assert not other.u

        # find the rotation that diagonalizes r
        # k is the block matrix [[diag(decay * s1), c * s2], [pad(...), r * s2]]
        # NOTE(review): the `matutils.pad` call presumably yields an all-zero
        # (min(m, n2), n1) block -- confirm in matutils
        k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                        [
                            matutils.pad(
                                numpy.matrix([]).reshape(0, 0), min(m, n2),
                                n1), r * other.s
                        ]])
        logger.debug("computing SVD of %s dense matrix" % str(k.shape))
        try:
            # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
            # for these early versions of numpy, catch the error and try to compute
            # SVD again, but over k*k^T.
            # see http://www.mail-archive.com/numpy-discussion@scipy.org/msg07224.html and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            # TODO *ugly overkill*!! only need first self.k SVD factors... but there
            # is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
            u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)
        except numpy.linalg.LinAlgError:
            # report through the module logger, like every other message in this method
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            u_k, s_k, _ = numpy.linalg.svd(
                numpy.dot(k, k.T), full_matrices=False
            )  # if this fails too, give up with an exception
            s_k = numpy.sqrt(
                s_k)  # go back from eigen values to singular values

        # from here on `k` is the number of factors to keep, no longer the matrix
        k = clipSpectrum(s_k**2, self.k)
        # split the rotation into the rows acting on u1 and the rows acting on q
        u1_k, u2_k, s_k = u_k[:n1, :k].copy('F'), u_k[n1:, :k].copy(
            'F'), s_k[:k]

        # update & rotate current basis U = [U, U']*[U1_k, U2_k]
        logger.debug("updating orthonormal basis U")
        # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
        self.u = gemm(1.0, self.u, u1_k)
        gemm(1.0, q, u2_k, beta=1.0, c=self.u, overwrite_c=True)
        self.s = s_k
Exemple #7
0
    def merge(self, other, decay=1.0):
        """
        Merge this Projection with another.

        The content of `other` is destroyed in the process, so pass this function a
        copy of `other` if you need it further.

        `other.s` being None signals that `other.u` is not a decomposition but a
        raw chunk of documents (a sparse matrix); it is then decomposed or
        densified here as needed.

        `decay` multiplies the singular values of this projection (`self.s`) before
        the two projections are combined; the default of 1.0 leaves them unchanged.

        Nothing is returned; `self.u` and `self.s` are replaced by the merged
        factors. If `other` is empty, nothing happens.

        Raises ValueError if the two projections disagree on the number of
        features (`m`), and ImportError if the `sparsesvd` module is needed but
        not installed.
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            if other.s is None:
                # other.u contains a direct document chunk, not svd => perform svd
                docs = other.u
                assert scipy.sparse.issparse(docs)
                if self.m * self.k < 10000:
                    # SVDLIBC gives spurious results for small matrices.. run full
                    # LAPACK on them instead
                    logger.info("computing dense SVD of %s matrix" %
                                str(docs.shape))
                    u, s, vt = numpy.linalg.svd(docs.todense(),
                                                full_matrices=False)
                else:
                    try:
                        import sparsesvd
                    except ImportError:
                        raise ImportError(
                            "for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`"
                        )
                    logger.info("computing sparse SVD of %s matrix" %
                                str(docs.shape))
                    # ask for a few extra factors, because for some reason SVDLIBC
                    # sometimes returns fewer factors than requested
                    ut, s, vt = sparsesvd.sparsesvd(docs, self.k + 30)
                    u = ut.T
                    del ut
                del vt
                k = clipSpectrum(s**2, self.k)
                self.u = u[:, :k].copy('F')
                self.s = s[:k]
            else:
                self.u = other.u.copy('F')
                self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update has %s features, expected %s" %
                (other.m, self.m))
        logger.info("merging projections: %s + %s" %
                    (str(self.u.shape), str(other.u.shape)))
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        if other.s is None:
            # raw document chunk: use the documents themselves as the basis to merge
            other.u = other.u.todense()
            other.s = 1.0  # broadcasting will promote this to eye(n2) where needed
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operation we ever need is basis^T*basis ond basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # find component of u2 orthogonal to u1
        # IMPORTANT: keep matrices in memory suitable order for matrix products; failing to do so gives 8x lower performance :(
        self.u = numpy.asfortranarray(
            self.u)  # does nothing if input already fortran-order array
        other.u = numpy.asfortranarray(other.u)
        gemm = matutils.blas('gemm', self.u)
        logger.debug("constructing orthogonal component")
        c = gemm(1.0, self.u, other.u, trans_a=True)  # c = u1^T * u2
        # other.u = u2 - u1 * c, computed in place (this is where `other` gets clobbered)
        gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

        # perform q, r = QR(component); code hacked out of scipy.linalg.qr
        logger.debug("computing QR of %s dense matrix" % str(other.u.shape))
        geqrf, = get_lapack_funcs(('geqrf', ), (other.u, ))
        # the first call (lwork=-1) only asks LAPACK for the workspace size, which
        # comes back in work[0]; the second call does the actual factorization
        qr, tau, work, info = geqrf(other.u, lwork=-1, overwrite_a=True)
        qr, tau, work, info = geqrf(other.u, lwork=work[0], overwrite_a=True)
        del other.u
        assert info >= 0
        r = triu(qr[:n2, :n2])
        if m < n2:  # rare case, #features < #topics
            qr = qr[:, :m]  # retains fortran order
        # build the explicit q from the reflectors, same two-call pattern as above
        gorgqr, = get_lapack_funcs(('orgqr', ), (qr, ))
        q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
        q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
        assert info >= 0, "qr failed"
        assert q.flags.f_contiguous

        # find the rotation that diagonalizes r
        # k is the block matrix [[diag(decay * s1), c * s2], [pad(...), r * s2]]
        # NOTE(review): the `matutils.pad` call presumably yields an all-zero
        # (min(m, n2), n1) block -- confirm in matutils
        k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                        [
                            matutils.pad(
                                numpy.matrix([]).reshape(0, 0), min(m, n2),
                                n1), r * other.s
                        ]])
        logger.debug("computing SVD of %s dense matrix" % str(k.shape))
        try:
            # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
            # for these early versions of numpy, catch the error and try to compute
            # SVD again, but over k*k^T.
            # see http://www.mail-archive.com/numpy-discussion@scipy.org/msg07224.html and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            # TODO *ugly overkill*!! only need first self.k SVD factors... but there
            # is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
            u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)
        except numpy.linalg.LinAlgError:
            # report through the module logger, like every other message in this method
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            u_k, s_k, _ = numpy.linalg.svd(
                numpy.dot(k, k.T),
                full_matrices=False)  # if this fails too, give up
            s_k = numpy.sqrt(s_k)  # go back from eigen values to singular values

        # from here on `k` is the number of factors to keep, no longer the matrix
        k = clipSpectrum(s_k**2, self.k)
        u_k, s_k = u_k[:, :k], s_k[:k]

        # update & rotate current basis U
        logger.debug("updating orthonormal basis U")
        # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
        self.u = gemm(1.0, self.u, u_k[:n1])
        gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u,
             overwrite_c=True)  # u = [u,u']*u_k
        self.s = s_k
 def merge(self, other, decay = 1.0):
     """
     Merge this Projection with another.

     The content of `other` is destroyed in the process, so pass this function a
     copy of `other` if you need it further.

     `other.s` being None signals that `other.u` is not a decomposition but a
     raw chunk of documents (a sparse matrix); it is then decomposed or
     densified here as needed.

     `decay` multiplies the singular values of this projection (`self.s`) before
     the two projections are combined; the default of 1.0 leaves them unchanged.

     Nothing is returned; `self.u` and `self.s` are replaced by the merged
     factors. If `other` is empty, nothing happens.

     Raises ValueError if the two projections disagree on the number of
     features (`m`), and ImportError if the `sparsesvd` module is needed but
     not installed.
     """
     if other.u is None:
         # the other projection is empty => do nothing
         return
     if self.u is None:
         # we are empty => result of merge is the other projection, whatever it is
         if other.s is None:
             # other.u contains a direct document chunk, not svd => perform svd
             docs = other.u
             assert scipy.sparse.issparse(docs)
             if self.m * self.k < 10000:
                 # SVDLIBC gives spurious results for small matrices.. run full
                 # LAPACK on them instead
                 logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                 u, s, vt = numpy.linalg.svd(docs.todense(), full_matrices = False)
             else:
                 try:
                     import sparsesvd
                 except ImportError:
                     raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                 logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                 # ask for a few extra factors, because for some reason SVDLIBC
                 # sometimes returns fewer factors than requested
                 ut, s, vt = sparsesvd.sparsesvd(docs, self.k + 30)
                 u = ut.T
                 del ut
             del vt
             k = clipSpectrum(s ** 2, self.k)
             self.u = u[:, :k].copy('F')
             self.s = s[:k]
         else:
             self.u = other.u.copy('F')
             self.s = other.s.copy()
         return
     if self.m != other.m:
         raise ValueError("vector space mismatch: update has %s features, expected %s" %
                          (other.m, self.m))
     logger.info("merging projections: %s + %s" % (str(self.u.shape), str(other.u.shape)))
     m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
     if other.s is None:
         # raw document chunk: use the documents themselves as the basis to merge
         other.u = other.u.todense()
         other.s = 1.0 # broadcasting will promote this to eye(n2) where needed
     # TODO Maybe keep the bases as elementary reflectors, without
     # forming explicit matrices with ORGQR.
     # The only operation we ever need is basis^T*basis ond basis*component.
     # But how to do that in scipy? And is it fast(er)?

     # find component of u2 orthogonal to u1
     # IMPORTANT: keep matrices in memory suitable order for matrix products; failing to do so gives 8x lower performance :(
     self.u = numpy.asfortranarray(self.u) # does nothing if input already fortran-order array
     other.u = numpy.asfortranarray(other.u)
     gemm = matutils.blas('gemm', self.u)
     logger.debug("constructing orthogonal component")
     c = gemm(1.0, self.u, other.u, trans_a = True) # c = u1^T * u2
     # other.u = u2 - u1 * c, computed in place (this is where `other` gets clobbered)
     gemm(-1.0, self.u, c, beta = 1.0, c = other.u, overwrite_c = True)

     # perform q, r = QR(component); code hacked out of scipy.linalg.qr
     logger.debug("computing QR of %s dense matrix" % str(other.u.shape))
     geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
     # the first call (lwork=-1) only asks LAPACK for the workspace size, which
     # comes back in work[0]; the second call does the actual factorization
     qr, tau, work, info = geqrf(other.u, lwork = -1, overwrite_a = True)
     qr, tau, work, info = geqrf(other.u, lwork = work[0], overwrite_a = True)
     del other.u
     assert info >= 0
     r = triu(qr[:n2, :n2])
     if m < n2: # rare case, #features < #topics
         qr = qr[:, :m] # retains fortran order
     # build the explicit q from the reflectors, same two-call pattern as above
     gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
     q, work, info = gorgqr(qr, tau, lwork = -1, overwrite_a = True)
     q, work, info = gorgqr(qr, tau, lwork = work[0], overwrite_a = True)
     assert info >= 0, "qr failed"
     assert q.flags.f_contiguous

     # find the rotation that diagonalizes r
     # k is the block matrix [[diag(decay * s1), c * s2], [pad(...), r * s2]]
     # NOTE(review): the `matutils.pad` call presumably yields an all-zero
     # (min(m, n2), n1) block -- confirm in matutils
     k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s], [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s]])
     logger.debug("computing SVD of %s dense matrix" % str(k.shape))
     try:
         # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
         # for these early versions of numpy, catch the error and try to compute
         # SVD again, but over k*k^T.
         # see http://www.mail-archive.com/numpy-discussion@scipy.org/msg07224.html and
         # bug ticket http://projects.scipy.org/numpy/ticket/706
         # TODO *ugly overkill*!! only need first self.k SVD factors... but there
         # is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
         u_k, s_k, _ = numpy.linalg.svd(k, full_matrices = False)
     except numpy.linalg.LinAlgError:
         # report through the module logger, like every other message in this method
         logger.error("SVD(A) failed; trying SVD(A * A^T)")
         u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices = False) # if this fails too, give up
         s_k = numpy.sqrt(s_k) # go back from eigen values to singular values

     # from here on `k` is the number of factors to keep, no longer the matrix
     k = clipSpectrum(s_k ** 2, self.k)
     u_k, s_k = u_k[:, :k], s_k[:k]

     # update & rotate current basis U
     logger.debug("updating orthonormal basis U")
     # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
     self.u = gemm(1.0, self.u, u_k[:n1])
     gemm(1.0, q, u_k[n1:], beta = 1.0, c = self.u, overwrite_c = True) # u = [u,u']*u_k
     self.s = s_k
Exemple #9
0
def stochasticSvd(corpus, rank, num_terms, chunks=20000, extra_dims=None,
                  power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed 
    input corpus `corpus` [3]_.
    
    This may actually return less than the requested number of top `rank` factors, 
    in case the input is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.
    
    This algorithm uses `2+power_iters` passes over the data. In case you can only 
    afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel` 
    and avoid using this algorithm directly.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    Parameters:

    * `corpus`: either an iterable of documents that can be iterated over repeatedly
      (it is re-read on every pass), or a scipy.sparse matrix with `num_terms` rows.
    * `rank`: requested number of factors.
    * `num_terms`: number of features, i.e. rows of the input matrix.
    * `chunks`: number of documents processed at once when `corpus` is streamed.
    * `extra_dims`: number of oversampling columns added on top of `rank`; when None,
      `max(10, 2 * rank)` random samples are used in total.
    * `power_iters`: number of power iterations.
    * `dtype`: dtype of the dense work matrices.
    * `eps`: passed as the `discard` threshold to `clipSpectrum`; clamped to be 
      at least 1e-9.

    The returned `U` has `num_terms` rows and one column per kept factor; `S` holds 
    the matching singular values.
    
    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole 
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations", samples - rank, power_iters)
    
    num_terms = int(num_terms)
    
    eps = max(float(eps), 1e-9) # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage

    def doc_chunks():
        """
        Yield (chunk_no, documents) pairs, splitting the streamed corpus into runs of
        `chunks` consecutive documents. Each `documents` iterator must be consumed 
        before advancing to the next pair (itertools.groupby restriction).
        """
        # floor division keeps the grouping key integral no matter which division
        # semantics are in effect; with true division every document would end up
        # in a group of its own
        grouped = itertools.groupby(enumerate(corpus), key=lambda item: item[0] // chunks)
        for chunk_no, (_key, group) in enumerate(grouped):
            yield chunk_no, (doc for _docno, doc in group)
    
    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one 
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix", str(y.shape))
    
    if scipy.sparse.issparse(corpus):
        # NOTE(review): the raw indptr/indices/data arrays are handed to csc_matvecs,
        # so `corpus` is presumably expected in CSC format -- confirm against callers
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, 
                                corpus.data, o.ravel(), y.ravel()) # y = corpus * o
        del o
        # y was allocated with `dtype` above; only convert (= copy the whole matrix)
        # if the dtypes really differ
        if y.dtype != numpy.dtype(dtype):
            y = y.astype(dtype)
        logger.debug("running %i power iterations", power_iters)
        for power_iter in range(power_iters):
            # two separate statements, so that the previous y can be freed before
            # the second product is computed
            y = corpus.T * y
            y = corpus * y
    else:
        num_docs = 0
        for chunk_no, docs in doc_chunks():
            logger.info('PROGRESS: at document #%i', chunk_no * chunks)
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
            chunk = matutils.corpus2csc(docs, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunks # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
            sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr, # y = y + chunk * o
                                    chunk.indices, chunk.data, o.ravel(), y.ravel())
            del chunk, o
        
        for power_iter in range(power_iters):
            logger.info("running power iteration #%i", power_iter + 1)
            yold = y.copy()
            y[:] = 0.0 # y = A * A.T * yold is accumulated chunk by chunk below
            for chunk_no, docs in doc_chunks():
                logger.info('PROGRESS: at document #%i/%i', chunk_no * chunks, num_docs)
                chunk = matutils.corpus2csc(docs, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                y += tmp
            del yold
    
    logger.info("orthonormalizing %s action matrix", str(y.shape))
    y = [y] # wrap in a list, so that qr_destroy can take over the only reference to the matrix
    q, r = matutils.qr_destroy(y) # orthonormalize the range
    del y
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = numpy.asfortranarray(q[:, :samples].T) # discard bogus columns, in case Y was rank-deficient
    del q
    
    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix", str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunks` documents from the streaming 
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(samples, samples), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix", str(x.shape))
        for chunk_no, docs in doc_chunks():
            logger.info('PROGRESS: at document #%i/%i', chunk_no * chunks, num_docs)
            chunk = matutils.corpus2csc(docs, num_terms=num_terms, dtype=dtype)
            b = qt * chunk # dense * sparse matrix multiply
            x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del chunk, b
    
        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix", str(x.shape))
        u, s, vt = numpy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
        
    logger.info("computing the final decomposition")
    keep = clipSpectrum(s**2, rank, discard=eps)
    u = numpy.asfortranarray(u[:, :keep])
    s = s[:keep]
    gemm = matutils.blas('gemm', u)
    u = gemm(1.0, qt, u, trans_a=True) # rotate back from the reduced space: U = Q * u
    return u, s
Exemple #10
0
 def merge(self, other, decay=1.0):
     """
     Merge this Projection with another. 
     
     The content of `other` is destroyed in the process, so pass this function a 
     copy of `other` if you need it further.

     `other` is another projection, with `u`, `s` and `m` attributes. `decay` is
     the factor by which the singular values of this projection are multiplied 
     before the two bases are combined.

     If `other` is empty (`other.u` is None), nothing happens. If this projection
     is empty, it takes over copies of `other.u` and `other.s` as they are.
     Otherwise `self.u` and `self.s` are replaced by the merged decomposition, 
     clipped via `clipSpectrum` with `self.k`.

     Raise ValueError if the two projections differ in their number of features `m`.
     """
     if other.u is None:
         # the other projection is empty => do nothing
         return
     if self.u is None:
         # we are empty => result of merge is the other projection, whatever it is
         self.u = other.u.copy('F')
         self.s = other.s.copy()
         return
     if self.m != other.m:
         raise ValueError("vector space mismatch: update is using %s features, expected %s" %
                          (other.m, self.m))
     logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
     m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
     # TODO Maybe keep the bases as elementary reflectors, without 
     # forming explicit matrices with ORGQR.
     # The only operation we ever need is basis^T*basis and basis*component.
     # But how to do that in scipy? And is it fast(er)?
     
     # find component of u2 orthogonal to u1
     # IMPORTANT: keep matrices in memory suitable order for matrix products; failing to do so gives 8x lower performance :(
     self.u = numpy.asfortranarray(self.u) # does nothing if input already fortran-order array
     other.u = numpy.asfortranarray(other.u)
     gemm = matutils.blas('gemm', self.u)
     logger.debug("constructing orthogonal component")
     c = gemm(1.0, self.u, other.u, trans_a = True) # c = u1^T * u2
     gemm(-1.0, self.u, c, beta = 1.0, c = other.u, overwrite_c = True) # u2 = u2 - u1 * c, in place
     
     other.u = [other.u] # do some reference magic and call qr_destroy, to save RAM
     q, r = matutils.qr_destroy(other.u) # q, r = QR(component)
     assert not other.u
     
     # find the rotation that diagonalizes r
     lower_left = matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1)
     k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                     [lower_left, r * other.s]])
     logger.debug("computing SVD of %s dense matrix", str(k.shape))
     try:
         # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
         # for these early versions of numpy, catch the error and try to compute
         # SVD again, but over k*k^T.
         # see http://www.mail-archive.com/[email protected]/msg07224.html and
         # bug ticket http://projects.scipy.org/numpy/ticket/706
         u_k, s_k, _ = numpy.linalg.svd(k, full_matrices = False) # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
     except numpy.linalg.LinAlgError:
         # report through the module logger like every other message here; the
         # module-level logging.error() goes to the root logger instead and
         # implicitly configures it when it has no handlers
         logger.error("SVD(A) failed; trying SVD(A * A^T)")
         u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices = False) # if this fails too, give up with an exception
         s_k = numpy.sqrt(s_k) # go back from eigen values to singular values
     
     k = clipSpectrum(s_k ** 2, self.k) # from here on, k is the number of factors to keep
     u1_k, u2_k, s_k = u_k[:n1, :k].copy('F'), u_k[n1:, :k].copy('F'), s_k[:k]
     
     # update & rotate current basis U = [U, U']*[U1_k, U2_k]
     logger.debug("updating orthonormal basis U")
     self.u = gemm(1.0, self.u, u1_k) # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
     gemm(1.0, q, u2_k, beta = 1.0, c = self.u, overwrite_c = True)
     self.s = s_k