Example #1
    def test_simple_lists_of_tuples(self):
        # test lists of (int, float) 2-tuples

        # one document, one word
        potentialCorpus = [[(0, 4.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # one document, several words
        potentialCorpus = [[(0, 4.), (1, 2.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # several documents, one word
        potentialCorpus = [[(0, 4.)], [(1, 2.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)
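
gensim's `utils.isCorpus` implementation is not shown on this page, but the tests pin down its contract: it returns a (bool, corpus) 2-tuple, where the second element is the (possibly re-wound) input. A minimal sketch consistent with these tests; the helper below is hypothetical, not gensim's actual code, and unlike the real utility it does not rewind one-shot iterators:

def is_corpus_sketch(obj):
    """Return (is_corpus, obj). `obj` counts as a corpus if it is an iterable of
    documents, each document being a sequence of (int, float) 2-tuples."""
    try:
        if obj is None or isinstance(obj, str):
            return False, obj
        first_doc = next(iter(obj))              # peek at the first document
        if len(first_doc) == 0:                  # an empty document still qualifies
            return True, obj
        term_id, weight = first_doc[0]           # each entry must unpack as a 2-tuple
        ok = isinstance(term_id, int) and isinstance(weight, (int, float))
        return ok, obj
    except (TypeError, ValueError, StopIteration):
        return False, obj

# used the same way as in the tests above:
# is_corpus, corpus = is_corpus_sketch([[(0, 4.), (1, 2.)]])   # -> (True, ...)
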
Example #2
    def test_simple_lists_of_tuples(self):
        # test lists of (int, float) 2-tuples

        # one document, one word
        potentialCorpus = [[(0, 4.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # one document, several words
        potentialCorpus = [[(0, 4.), (1, 2.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # several documents, one word
        potentialCorpus = [[(0, 4.)], [(1, 2.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]]
        result = utils.isCorpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)
Example #3
 def __getitem__(self, bow, scaled=False):
     """
     Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return the non-scaled embedding, set `scaled` to True.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as a result
     is_corpus, bow = utils.isCorpus(bow)
     if is_corpus:
         return self._apply(bow)
     
     assert self.projection.u is not None, "decomposition not initialized yet"
     vec = matutils.sparse2full(bow, self.numTerms).astype(self.projection.u.dtype)
     vec.shape = (self.numTerms, 1)
     assert self.projection.u.flags.f_contiguous
     dgemv = matutils.blas('gemv', self.projection.u)
     topicDist = dgemv(1.0, self.projection.u, vec, trans=True) # u^T * x
     if scaled:
         topicDist = (1.0 / self.projection.s) * topicDist # s^-1 * u^T * x
     
     nnz = topicDist.nonzero()[0]
     return zip(nnz, topicDist[nnz])
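
The fold-in above is a single BLAS gemv call, u^T * x, optionally rescaled by 1/s. A plain numpy sketch of the same two quantities, with toy shapes and random data standing in for `projection.u` and `projection.s`:

import numpy

# toy decomposition: 5 terms, 2 latent topics (random data, arbitrary shapes)
u = numpy.linalg.qr(numpy.random.rand(5, 2))[0]    # orthonormal columns, like projection.u
s = numpy.array([3.0, 1.5])                        # singular values, like projection.s
x = numpy.array([0.0, 4.0, 0.0, 2.0, 0.0])         # dense document vector (sparse2full output)

topic_dist = u.T.dot(x)                  # u^T * x -- what dgemv(..., trans=True) computes
topic_dist_unscaled = topic_dist / s     # s^-1 * u^T * x -- what the scaled=True branch returns

nnz = topic_dist.nonzero()[0]
print(list(zip(nnz, topic_dist[nnz])))   # (topic_id, topic_value) pairs, as __getitem__ returns
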
Example #4
    def __getitem__(self, bow, scaled=False):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
        
        This is done by folding input document into the latent topic space. 
        
        Note that this function returns the latent space representation **scaled by the
        singular values**. To return the non-scaled embedding, set `scaled` to True.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        assert self.projection.u is not None, "decomposition not initialized yet"
        vec = matutils.sparse2full(bow, self.numTerms).astype(
            self.projection.u.dtype)
        vec.shape = (self.numTerms, 1)
        assert self.projection.u.flags.f_contiguous
        dgemv = matutils.blas('gemv', self.projection.u)
        topicDist = dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
        if scaled:
            topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x

        nnz = topicDist.nonzero()[0]
        return zip(nnz, topicDist[nnz])
Example #5
    def __getitem__(self, bow, scaled=False):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
        
        This is done by folding input document into the latent topic space. 
        
        Note that this function returns the latent space representation **scaled by the
        singular values**. To return the non-scaled embedding, set `scaled` to True.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        if utils.isCorpus(bow):
            return self._apply(bow)

        assert self.projection.u is not None, "decomposition not initialized yet"
        vec = numpy.asfortranarray(matutils.sparse2full(bow, self.numTerms),
                                   dtype=self.projection.u.dtype)
        vec.shape = (self.numTerms, 1)
        topicDist = scipy.linalg.fblas.dgemv(1.0,
                                             self.projection.u,
                                             vec,
                                             trans=True)  # u^T * x
        if scaled:
            topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x
        return [(topicId, float(topicValue))
                for topicId, topicValue in enumerate(topicDist)
                if numpy.isfinite(topicValue)
                and not numpy.allclose(topicValue, 0.0)]
    def __init__(self, m, k, docs = None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a 
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
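
For small inputs the branch above runs a full LAPACK SVD and keeps only the first k factors (clipSpectrum, not shown here, additionally drops negligible singular values). A numpy-only sketch of that dense branch with toy sizes:

import numpy

m, k = 6, 2                                  # number of terms, requested factors (toy sizes)
docs = numpy.random.rand(m, 4)               # dense term-document matrix with 4 documents

# full LAPACK SVD, then keep only the first k factors, as the small-matrix branch does
u, s, vt = numpy.linalg.svd(docs, full_matrices=False)
u_k, s_k = u[:, :k], s[:k]                   # corresponds to self.u, self.s
print(u_k.shape, s_k.shape)                  # (6, 2) (2,)
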
    def __init__(self, m, k, docs = None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a 
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
 def __getitem__(self, bow):
     """
     Return representation with the ids transformed.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as a result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     return [(self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new]
 def __getitem__(self, bow):
     """
     Return representation with the ids transformed.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as a result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     return [(self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new]
 def __getitem__(self, bow):
     """
     Return RP representation of the input vector and/or corpus.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     vec = matutils.sparse2full(bow, self.numTerms).reshape(self.numTerms, 1)
     topicDist = (self.projection * vec) / numpy.sqrt(self.numTopics) # (k, d) * (d, 1) = (k, 1)
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist.flat)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
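
The random-projection step is a dense (k, d) matrix times a (d, 1) vector, scaled by 1/sqrt(k). A self-contained sketch; the Gaussian matrix below is just an illustrative stand-in for `self.projection`:

import numpy

num_terms, num_topics = 8, 3                                       # toy sizes (d, k)
projection = numpy.random.normal(size=(num_topics, num_terms))     # stand-in for self.projection

bow = [(1, 2.0), (5, 1.0)]                    # sparse bag-of-words document
vec = numpy.zeros((num_terms, 1))             # equivalent of matutils.sparse2full(...).reshape(d, 1)
for term_id, weight in bow:
    vec[term_id, 0] = weight

topic_dist = projection.dot(vec) / numpy.sqrt(num_topics)   # (k, d) * (d, 1) = (k, 1)
print([(i, float(v)) for i, v in enumerate(topic_dist.flat) if not numpy.allclose(v, 0.0)])
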
 def __getitem__(self, bow):
     """
     Return RP representation of the input vector and/or corpus.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     is_corpus, bow = utils.isCorpus(bow)
     if is_corpus:
         return self._apply(bow)
     
     vec = matutils.sparse2full(bow, self.numTerms).reshape(self.numTerms, 1) / numpy.sqrt(self.numTopics)
     vec = numpy.asfortranarray(vec, dtype = numpy.float32)
     topicDist = scipy.linalg.fblas.sgemv(1.0, self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist.flat)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
Example #12
 def test_invalid_formats(self):
     # test invalid formats
     # these are not a corpus, because they do not consist of 2-tuples
     # of the form (int, float).
     potentials = list()
     potentials.append(["human"])
     potentials.append("human")
     potentials.append(["human", "star"])
     potentials.append([1, 2, 3, 4, 5, 5])
     potentials.append([[(0, 'string')]])
     for noCorpus in potentials:
         result = utils.isCorpus(noCorpus)
         expected = (False, noCorpus)
         self.assertEqual(expected, result)
Example #13
    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.numTerms).reshape(self.numTerms, 1) / numpy.sqrt(self.numTopics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topicDist = scipy.linalg.fblas.sgemv(1.0, self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist.flat)
                if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
Example #14
 def test_invalid_formats(self):
     # test invalid formats
     # these are not a corpus, because they do not consist of 2-tuples
     # of the form (int, float).
     potentials = list()
     potentials.append(["human"])
     potentials.append("human")
     potentials.append(["human", "star"])
     potentials.append([1, 2, 3, 4, 5, 5])
     potentials.append([[(0, 'string')]])
     for noCorpus in potentials:
         result = utils.isCorpus(noCorpus)
         expected = (False, noCorpus)
         self.assertEqual(expected, result)
Example #15
    def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in bow if term_id in self.entr]
        if self.normalize:
            vector = matutils.unitVec(vector)
        return vector
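
Per term, the weight is log(tf + 1) multiplied by a precomputed global entropy weight from `self.entr`; unknown terms are simply dropped. A small worked sketch with a made-up entropy table:

import math

entr = {0: 0.7, 3: 0.2}              # hypothetical per-term global entropy weights (self.entr)
bow = [(0, 2), (3, 1), (7, 5)]       # term 7 is unknown and gets dropped (zero weight)

vector = [(term_id, math.log(tf + 1) * entr[term_id])
          for term_id, tf in bow if term_id in entr]
print(vector)                        # [(0, 0.7 * log 3), (3, 0.2 * log 2)]
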
 def __getitem__(self, bow):
     """
     Return tf-idf representation of the input vector and/or corpus.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
     # as the strict application of the IDF formula would suggest)
     vector = [(termId, tf * self.idfs.get(termId, 0.0)) 
               for termId, tf in bow if self.idfs.get(termId, 0.0) != 0.0]
     if self.normalize:
         vector = matutils.unitVec(vector)
     return vector
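
Here the weight is tf times an idf value looked up in `self.idfs`, and terms whose idf is zero (or missing) are dropped. A sketch of the lookup; the log2(N/df) idf formula below is an assumption, since this snippet does not show how `self.idfs` is computed:

import math

num_docs = 100
doc_freqs = {0: 10, 2: 100, 5: 1}                        # hypothetical document frequencies
idfs = {t: math.log(num_docs / df, 2) for t, df in doc_freqs.items()}

bow = [(0, 3), (2, 7), (5, 1)]
vector = [(term_id, tf * idfs.get(term_id, 0.0))
          for term_id, tf in bow if idfs.get(term_id, 0.0) != 0.0]
print(vector)                                            # term 2 drops out: df == N gives idf == 0
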
Example #17
    def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in bow if term_id in self.entr]
        if self.normalize:
            vector = matutils.unitVec(vector)
        return vector
    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        if utils.isCorpus(bow):
            return self._apply(bow)

        vec = matutils.sparse2full(bow,
                                   self.numTerms).reshape(self.numTerms, 1)
        topicDist = (self.projection * vec) / numpy.sqrt(
            self.numTopics)  # (k, d) * (d, 1) = (k, 1)
        return [(topicId, float(topicValue))
                for topicId, topicValue in enumerate(topicDist.flat)
                if numpy.isfinite(topicValue)
                and not numpy.allclose(topicValue, 0.0)]
Example #19
    def __getitem__(self, bow, eps=0.01):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `eps`).
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, corpus = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        theta = numpy.exp(dirichlet_expectation(gamma[0]))
        topicDist = theta / theta.sum() # normalize to proper distribution
        return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
                if topicValue >= eps] # ignore document's topics that have prob < eps
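
`dirichlet_expectation(gamma)` is gensim's helper for E[log theta | gamma], which for a Dirichlet presumably amounts to psi(gamma_k) - psi(sum(gamma)); exponentiating and renormalizing turns it into the topic distribution that gets thresholded by `eps`. A sketch of that step with made-up gamma values:

import numpy
from scipy.special import psi   # digamma function

gamma = numpy.array([0.1, 5.0, 0.1, 3.0])       # hypothetical variational parameters for one document
log_theta = psi(gamma) - psi(gamma.sum())       # E[log theta | gamma] for a Dirichlet
theta = numpy.exp(log_theta)
topic_dist = theta / theta.sum()                # normalize to a proper distribution

eps = 0.01
print([(topic_id, p) for topic_id, p in enumerate(topic_dist) if p >= eps])
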
 def __getitem__(self, bow):
     """
     Return topic distribution for the given document, as a list of 
     (topic_id, topic_value) 2-tuples.
     
     Ignore topics with very low probability (below 0.001).
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     likelihood, phi, gamma = self.inference(bow)
     gamma -= self.alpha # subtract topic prior, to get the expected number of words for each topic
     sumGamma = gamma.sum()
     if numpy.allclose(sumGamma, 0): # if there were no topics found, return nothing (eg for empty documents)
         return []
     topicDist = gamma / sumGamma # convert to proper distribution
     return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
             if topicValue >= 0.001] # ignore topics with prob < 0.001
Example #21
 def __getitem__(self, bow):
     """
     Return topic distribution for the given document, as a list of 
     (topic_id, topic_value) 2-tuples.
     
     Ignore topics with very low probability (below 0.001).
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     likelihood, phi, gamma = self.inference(bow)
     gamma -= self.alpha # subtract topic prior, to get the expected number of words for each topic
     sumGamma = gamma.sum()
     if numpy.allclose(sumGamma, 0): # if there were no topics found, return nothing (eg for empty documents)
         return []
     topicDist = gamma / sumGamma # convert to proper distribution
     return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
             if topicValue >= 0.001] # ignore topics with prob < 0.001
Example #22
    def __getitem__(self, bow, eps=0.01):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `eps`).
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, corpus = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        theta = numpy.exp(dirichlet_expectation(gamma[0]))
        topicDist = theta / theta.sum()  # normalize to proper distribution
        return [(topicId, topicValue)
                for topicId, topicValue in enumerate(topicDist)
                if topicValue >= eps
                ]  # ignore document's topics that have prob < eps
Example #23
 def __getitem__(self, bow, eps=0.01):
     """
     Return topic distribution for the given document `bow`, as a list of 
     (topic_id, topic_probability) 2-tuples.
     
     Ignore topics with very low probability (below `eps`).
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     is_corpus, corpus = utils.isCorpus(bow)
     if is_corpus:
         return self._apply(corpus)
     
     gamma, _ = self.inference([bow])
     gamma = gamma[0] # inference was over a chunk of size 1
     if numpy.allclose(gamma, self.alpha): # if there were no topics found, return nothing (eg for empty documents)
         return []
     topicDist = gamma / gamma.sum() # convert to proper distribution
     return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
             if topicValue >= eps] # ignore document's topics that have prob < eps
Example #24
 def __getitem__(self, bow, scaled = True):
     """
     Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return non-scaled embedding, set `scaled` to False.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     vec = matutils.doc2vec(bow, self.numTerms)
     vec.shape = (self.numTerms, 1)
     topicDist = self.projection * vec
     if not scaled:
         topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
 def __getitem__(self, bow, scaled = True):
     """
     Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return non-scaled embedding, set `scaled` to False.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     vec = matutils.sparse2full(bow, self.numTerms)
     vec.shape = (self.numTerms, 1)
     topicDist = self.projection * vec
     if not scaled:
         topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
Example #26
    def __init__(self, m, k, docs=None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a 
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if docs.shape[1] <= max(k, 100):
                # for sufficiently small chunk size, compute svd(now, a) instead of svd(now, svd(a)).
                # this improves accuracy and is also faster for small chunks, because
                # we need to do one less svd.
                # on larger chunks this doesn't work because we run out of memory (chunks=1000
                # would already raise MemoryException on my machine)
                self.u = docs
                self.s = None
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError(
                        "for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`"
                    )
                logger.info("computing sparse SVD of %s matrix" %
                            str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(
                    docs, k + 30
                )  # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clipSpectrum(s, self.k)
                self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
 def __getitem__(self, bow, scaled = False):
     """
     Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return the non-scaled embedding, set `scaled` to True.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     assert self.projection.u is not None, "decomposition not initialized yet"
     vec = numpy.asfortranarray(matutils.sparse2full(bow, self.numTerms), dtype = self.projection.u.dtype)
     vec.shape = (self.numTerms, 1)
     topicDist = scipy.linalg.fblas.dgemv(1.0, self.projection.u, vec, trans = True) # u^T * x
     if scaled:
         topicDist = (1.0 / self.projection.s) * topicDist # s^-1 * u^T * x
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
    def __init__(self, m, k, docs=None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a 
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if docs.shape[1] <= max(k, 100):
                # for sufficiently small chunk size, compute svd(now, a) instead of svd(now, svd(a)).
                # this improves accuracy and is also faster for small chunks, because
                # we need to do one less svd.
                # on larger chunks this doesn't work because we run out of memory (chunks=1000
                # would already raise MemoryException on my machine)
                self.u = docs
                self.s = None
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError(
                        "for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`"
                    )
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(
                    docs, k + 30
                )  # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clipSpectrum(s, self.k)
                self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
Example #29
 def test_None(self):
     # test None
     result = utils.isCorpus(None)
     expected = (False, None)
     self.assertEqual(expected, result)
Example #30
 def test_int_tuples(self):
     potentialCorpus = [[(0, 4)]]
     result = utils.isCorpus(potentialCorpus)
     expected = (True, potentialCorpus)
     self.assertEqual(expected, result)
Example #31
 def test_None(self):
     # test None
     result = utils.isCorpus(None)
     expected = (False, None)
     self.assertEqual(expected, result)
    def addDocuments(self, corpus, chunks = None, decay = None):
        """
        Update singular value decomposition factors to take into account a new 
        corpus of documents.
        
        Training proceeds in chunks of `chunks` documents at a time. If the 
        distributed mode is on, each chunk is sent to a different worker/computer.
        Size of `chunks` is a tradeoff between increased speed (bigger `chunks`) vs. 
        lower memory footprint (smaller `chunks`). Default is processing 10,000 documents
        at a time.

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the 
        input document stream, by giving less emphasis to old observations. This allows
        SVD to gradually "forget" old observations and give more preference to 
        new ones. The decay is applied once after every `chunks` documents.
        """
        logger.info("updating SVD with new documents")
        
        # get computation parameters; if not specified, use the ones from constructor
        if chunks is None:
            chunks = self.chunks
        if decay is None:
            decay = self.decay
        
        if utils.isCorpus(corpus):
            # do the actual work -- perform iterative singular value decomposition.
            chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
            doc_no = 0
            for chunk_no, (key, group) in enumerate(chunker):
                # construct the job as a sparse matrix, to minimize memory overhead
                # definitely avoid materializing it as a dense matrix!
                job = matutils.corpus2csc(self.numTerms, (doc for _, doc in group))
                doc_no += job.shape[1]
                if self.dispatcher:
                    # distributed version: add this job to the job queue, so workers can work on it
                    logger.debug("creating job #%i" % chunk_no)
                    self.dispatcher.putjob(job) # put job into queue; this will eventually block, because the queue has a small finite size
                    del job
                    logger.info("dispatched documents up to #%s" % doc_no)
                else:
                    # serial version, there is only one "worker" (myself) => process the job directly
                    update = Projection(self.numTerms, self.numTopics, job)
                    del job
                    self.projection.merge(update, decay = decay)
                    del update
                    logger.info("processed documents up to #%s" % doc_no)
                    self.printDebug(5)
            
            if self.dispatcher:
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                import time
                while self.dispatcher.jobsdone() <= chunk_no:
                    time.sleep(0.5) # check every half a second
                logger.info("all jobs finished, downloading final projection")
                del self.projection
                self.projection = self.dispatcher.getstate()
                logger.info("decomposition complete")
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert isinstance(corpus, scipy.sparse.csc_matrix)
            update = Projection(self.numTerms, self.numTopics, corpus)
            self.projection.merge(update, decay = decay)
            logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
            self.printDebug(5)
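
The chunking is done with `itertools.groupby` over an enumerated stream, using integer division of the document index by `chunks` as the group key (the `/` above is Python 2 integer division). A minimal sketch of the same grouping idea, written with `//` so it also behaves under Python 3:

import itertools

docs = ["doc%d" % i for i in range(7)]           # stand-in for a streamed corpus
chunks = 3

chunker = itertools.groupby(enumerate(docs), key=lambda val: val[0] // chunks)
for chunk_no, (key, group) in enumerate(chunker):
    job = [doc for _, doc in group]              # one chunk of at most `chunks` documents
    print(chunk_no, job)
# 0 ['doc0', 'doc1', 'doc2']
# 1 ['doc3', 'doc4', 'doc5']
# 2 ['doc6']
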
Example #33
 def test_int_tuples(self):
     potentialCorpus = [[(0, 4)]]
     result = utils.isCorpus(potentialCorpus)
     expected = (True, potentialCorpus)
     self.assertEqual(expected, result)
Example #34
    def addDocuments(self, corpus, chunks=None, decay=None):
        """
        Update singular value decomposition factors to take into account a new 
        corpus of documents.
        
        Training proceeds in chunks of `chunks` documents at a time. If the 
        distributed mode is on, each chunk is sent to a different worker/computer.
        Size of `chunks` is a tradeoff between increased speed (bigger `chunks`) vs. 
        lower memory footprint (smaller `chunks`).

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the 
        input document stream, by giving less emphasis to old observations. This allows
        SVD to gradually "forget" old observations and give more preference to 
        new ones. The decay is applied once after every `chunks` documents.
        """
        logger.info("updating SVD with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunks is None:
            chunks = self.chunks
        if decay is None:
            decay = self.decay

        if utils.isCorpus(corpus):
            # do the actual work -- perform iterative singular value decomposition.
            chunker = itertools.groupby(enumerate(corpus),
                                        key=lambda val: val[0] / chunks)
            doc_no = 0
            for chunk_no, (key, group) in enumerate(chunker):
                # construct the job as a sparse matrix, to minimize memory overhead
                # definitely avoid materializing it as a dense matrix!
                job = matutils.corpus2csc(self.numTerms,
                                          (doc for _, doc in group))
                doc_no += job.shape[1]
                if self.dispatcher:
                    # distributed version: add this job to the job queue, so workers can work on it
                    logger.debug("creating job #%i" % chunk_no)
                    self.dispatcher.putjob(
                        job
                    )  # put job into queue; this will eventually block, because the queue has a small finite size
                    del job
                    logger.info("dispatched documents up to #%s" % doc_no)
                else:
                    # serial version, there is only one "worker" (myself) => process the job directly
                    update = Projection(self.numTerms, self.numTopics, job)
                    del job
                    self.projection.merge(update, decay=decay)
                    del update
                    logger.info("processed documents up to #%s" % doc_no)
                    #self.printDebug(5)
                    self.printTopics(
                        5
                    )  # TODO see if printDebug works and remove one of these..

            if self.dispatcher:
                logger.info(
                    "reached the end of input; now waiting for all remaining jobs to finish"
                )
                import time
                while self.dispatcher.jobsdone() <= chunk_no:
                    time.sleep(0.5)  # check every half a second
                logger.info("all jobs finished, downloading final projection")
                del self.projection
                self.projection = self.dispatcher.getstate()
                logger.info("decomposition complete")
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert isinstance(corpus, scipy.sparse.csc_matrix)
            update = Projection(self.numTerms, self.numTopics, corpus)
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents" %
                        (corpus.shape[1]))
            self.printTopics(5)