def test_simple_lists_of_tuples(self):
    """
    Plain lists of documents, each a list of (id, weight) 2-tuples, must be
    reported as a corpus and handed back unchanged.
    """
    candidates = [
        # one document, one word
        [[(0, 4.)]],
        # one document, several words
        [[(0, 4.), (1, 2.)]],
        [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]],
        # several documents, one word each
        [[(0, 4.)], [(1, 2.)]],
        [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]],
    ]
    for potentialCorpus in candidates:
        outcome = utils.isCorpus(potentialCorpus)
        self.assertEqual((True, potentialCorpus), outcome)
def __getitem__(self, bow, scaled=False):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space
    (`u^T * x`). With the default `scaled=False` that product is returned as is;
    with `scaled=True` every topic value is additionally multiplied by the
    inverse of the corresponding entry of `self.projection.s`
    (`s^-1 * u^T * x`).

    If `bow` turns out to be a whole corpus, the result of `self._apply` on
    that corpus is returned instead.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    # densify the document and match the dtype of `u`
    vec = matutils.sparse2full(bow, self.numTerms).astype(self.projection.u.dtype)
    vec.shape = (self.numTerms, 1)
    assert self.projection.u.flags.f_contiguous
    gemv = matutils.blas('gemv', self.projection.u)
    topicDist = gemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x

    # keep only the non-zero topics
    nnz = topicDist.nonzero()[0]
    # materialize the pairs: a bare zip() is a one-shot iterator on Python 3,
    # while the documented return value is a list
    return list(zip(nnz, topicDist[nnz]))
def __getitem__(self, bow, scaled=False):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    The input document is folded into the latent topic space (`u^T * x`).
    By default (`scaled=False`) the folded vector is returned untouched; pass
    `scaled=True` to also multiply it by the inverse of `self.projection.s`
    (`s^-1 * u^T * x`).

    When `bow` is actually a corpus, `self._apply` is called on it and its
    result is returned.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    u = self.projection.u
    # dense column vector of the document, in the same dtype as `u`
    vec = matutils.sparse2full(bow, self.numTerms).astype(u.dtype)
    vec.shape = (self.numTerms, 1)
    assert u.flags.f_contiguous
    gemv = matutils.blas('gemv', u)
    topicDist = gemv(1.0, u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x

    nnz = topicDist.nonzero()[0]  # indices of topics with a non-zero value
    # build a real list: zip() alone yields a lazy, single-use iterator on
    # Python 3, which contradicts the documented list return value
    return list(zip(nnz, topicDist[nnz]))
def __getitem__(self, bow, scaled=False):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space
    (`u^T * x`). With the default `scaled=False` the folded vector is used as
    is; with `scaled=True` it is additionally multiplied by the inverse of
    `self.projection.s` (`s^-1 * u^T * x`).

    Topics whose value is not finite or is (numerically) zero are left out.
    If `bow` is in fact a corpus, the result of `self._apply` is returned.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    vec = numpy.asfortranarray(matutils.sparse2full(bow, self.numTerms), dtype=self.projection.u.dtype)
    vec.shape = (self.numTerms, 1)
    topicDist = scipy.linalg.fblas.dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
def __init__(self, m, k, docs=None):
    """
    Store (U, S) projection itself. This is the class taking care of 'core math';
    interfacing with corpora, training etc is done through class LsiModel.

    `docs` is either a sparse matrix or a corpus which, when converted to a
    sparse matrix, must fit comfortably into main memory. When `docs` is None,
    both `self.u` and `self.s` are left as None.
    """
    self.m, self.k = m, k
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition
        # in core, algorithm 1
        # utils.isCorpus answers with an (is_corpus, corpus) pair; unpack it so
        # that a sparse matrix is not mistaken for a corpus
        is_corpus, docs = utils.isCorpus(docs)
        if is_corpus:
            docs = matutils.corpus2csc(m, docs)
        if m * k < 10000:
            # SVDLIBC gives spurious results for small matrices.. run full
            # LAPACK svd on them instead
            docs = docs.todense()
            logger.info("computing dense SVD of %s matrix" % str(docs.shape))
            u, s, vt = numpy.linalg.svd(docs, full_matrices=False)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
            # ask for extra factors, because for some reason SVDLIBC sometimes
            # returns fewer factors than requested
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
            u = ut.T
            del ut
        del vt  # the right singular vectors are not stored
        k = clipSpectrum(s, self.k)
        self.u, self.s = u[:, :k], s[:k]
    else:
        self.u, self.s = None, None
def __getitem__(self, bow):
    """
    Return representation with the ids transformed.

    Every (old_id, weight) pair of `bow` whose id is a key of `self.old2new`
    is emitted as (new_id, weight); pairs with other ids are dropped. If `bow`
    is in fact a corpus, the result of `self._apply` is returned instead.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    return [(self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new]
def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.

    A single document is returned as a list of (topic_id, topic_value)
    2-tuples; values that are not finite or are (numerically) zero are left
    out. If `bow` is in fact a corpus, the result of `self._apply` is returned.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.numTerms).reshape(self.numTerms, 1)
    # (k, d) * (d, 1) = (k, 1) -- assuming `projection` is a
    # (numTopics, numTerms) matrix; `vec` is a column, not a row
    topicDist = (self.projection * vec) / numpy.sqrt(self.numTopics)
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist.flat)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.

    For a single document the result is a list of (topic_id, topic_value)
    2-tuples without the non-finite and the (numerically) zero entries; for a
    corpus it is whatever `self._apply` returns.
    """
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        # the input is a whole corpus => hand back a transformed corpus
        return self._apply(bow)

    dense = matutils.sparse2full(bow, self.numTerms)
    column = dense.reshape(self.numTerms, 1) / numpy.sqrt(self.numTopics)
    column = numpy.asfortranarray(column, dtype=numpy.float32)
    # single precision matrix-vector product; `column` is (numTerms, 1)
    projected = scipy.linalg.fblas.sgemv(1.0, self.projection, column)

    result = []
    for topicId, topicValue in enumerate(projected.flat):
        if not numpy.isfinite(topicValue) or numpy.allclose(topicValue, 0.0):
            continue
        result.append((topicId, float(topicValue)))
    return result
def test_invalid_formats(self):
    """
    Inputs that do not consist of 2-tuples of the form (int, float) are not
    corpora: isCorpus must answer False and return the input itself.
    """
    potentials = [
        ["human"],
        "human",
        ["human", "star"],
        [1, 2, 3, 4, 5, 5],
        [[(0, 'string')]],
    ]
    for noCorpus in potentials:
        outcome = utils.isCorpus(noCorpus)
        self.assertEqual((False, noCorpus), outcome)
def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.

    A corpus is passed on to `self._apply`. A single document yields a list
    of (topic_id, topic_value) 2-tuples, with non-finite and (numerically)
    zero values filtered out.
    """
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        # the input is a whole corpus => hand back a transformed corpus
        return self._apply(bow)

    scale = numpy.sqrt(self.numTopics)
    column = matutils.sparse2full(bow, self.numTerms).reshape(self.numTerms, 1) / scale
    column = numpy.asfortranarray(column, dtype=numpy.float32)
    # (k, d) * (d, 1) = (k, 1)
    projected = scipy.linalg.fblas.sgemv(1.0, self.projection, column)

    result = []
    for topicId, topicValue in enumerate(projected.flat):
        if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0):
            result.append((topicId, float(topicValue)))
    return result
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.

    Each (term_id, tf) pair becomes (term_id, log(tf + 1) * self.entr[term_id]);
    terms that are not in `self.entr` are skipped. The vector is passed through
    `matutils.unitVec` when `self.normalize` is set.
    """
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        # the input is a whole corpus => hand back a transformed corpus
        return self._apply(bow)

    weighted = []
    for term_id, tf in bow:
        if term_id not in self.entr:
            # unknown (new) terms do not contribute anything
            continue
        weighted.append((term_id, math.log(tf + 1) * self.entr.get(term_id)))

    return matutils.unitVec(weighted) if self.normalize else weighted
def __getitem__(self, bow):
    """
    Return tf-idf representation of the input vector and/or corpus.

    Each (term_id, tf) pair becomes (term_id, tf * idf). Terms whose idf is
    missing from `self.idfs` or equals zero are dropped. The vector is passed
    through `matutils.unitVec` when `self.normalize` is set. If `bow` is in
    fact a corpus, the result of `self._apply` is returned instead.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as would the strict application of the IDF formula suggest)
    vector = []
    for termId, tf in bow:
        idf = self.idfs.get(termId, 0.0)  # looked up once per term
        if idf != 0.0:
            vector.append((termId, tf * idf))
    if self.normalize:
        vector = matutils.unitVec(vector)
    return vector
def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.

    For a single document the result is a list of (topic_id, topic_value)
    2-tuples; values that are not finite or are (numerically) zero are left
    out. If `bow` is in fact a corpus, the result of `self._apply` is returned.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.numTerms).reshape(self.numTerms, 1)
    # (k, d) * (d, 1) = (k, 1) -- assuming `projection` is a
    # (numTopics, numTerms) matrix; `vec` is a column, not a row
    topicDist = (self.projection * vec) / numpy.sqrt(self.numTopics)
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist.flat)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
def __getitem__(self, bow, eps=0.01):
    """
    Return topic distribution for the given document `bow`, as a list of
    (topic_id, topic_probability) 2-tuples.

    Topics with probability below `eps` are not reported. A corpus is passed
    on to `self._apply` instead.
    """
    is_corpus, corpus = utils.isCorpus(bow)
    if is_corpus:
        # the input is a whole corpus => hand back a transformed corpus
        return self._apply(corpus)

    # inference runs over a chunk holding just this one document
    gammas, _ = self.inference([bow])
    theta = numpy.exp(dirichlet_expectation(gammas[0]))
    probabilities = theta / theta.sum()  # normalize to proper distribution

    kept = []
    for topicId, topicValue in enumerate(probabilities):
        if topicValue >= eps:  # ignore document's topics that have prob < eps
            kept.append((topicId, topicValue))
    return kept
def __getitem__(self, bow):
    """
    Return topic distribution for the given document, as a list of
    (topic_id, topic_value) 2-tuples.

    Ignore topics with very low probability (below 0.001). An empty list is
    returned when the inferred gamma sums to (numerically) zero once the topic
    prior `self.alpha` has been subtracted, eg for empty documents. If `bow`
    is in fact a corpus, the result of `self._apply` is returned instead.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    # only gamma is needed from the inference result
    _, _, gamma = self.inference(bow)
    gamma -= self.alpha  # subtract topic prior, to get the expected number of words for each topic
    sumGamma = gamma.sum()
    if numpy.allclose(sumGamma, 0):
        # if there were no topics found, return nothing (eg for empty documents)
        return []
    topicDist = gamma / sumGamma  # convert to proper distribution
    return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
            if topicValue >= 0.001]  # ignore topics with prob < 0.001
def __getitem__(self, bow, eps=0.01):
    """
    Return topic distribution for the given document `bow`, as a list of
    (topic_id, topic_probability) 2-tuples.

    Only topics whose probability is at least `eps` are reported. When `bow`
    is a corpus, the result of `self._apply` is returned instead.
    """
    is_corpus, corpus = utils.isCorpus(bow)
    if is_corpus:
        # the input is a whole corpus => hand back a transformed corpus
        return self._apply(corpus)

    # the document is wrapped in a list, ie a chunk of size one
    inferred = self.inference([bow])
    gammas = inferred[0]
    theta = numpy.exp(dirichlet_expectation(gammas[0]))
    total = theta.sum()
    probabilities = theta / total  # normalize to proper distribution

    # ignore document's topics that have prob < eps
    return [
        (topicId, topicValue)
        for topicId, topicValue in enumerate(probabilities)
        if not topicValue < eps
    ]
def __getitem__(self, bow, eps=0.01):
    """
    Return topic distribution for the given document `bow`, as a list of
    (topic_id, topic_probability) 2-tuples.

    Topics with probability below `eps` are not reported, and an empty list is
    returned when the inferred gamma is (numerically) equal to `self.alpha`.
    A corpus is passed on to `self._apply` instead.
    """
    is_corpus, corpus = utils.isCorpus(bow)
    if is_corpus:
        # the input is a whole corpus => hand back a transformed corpus
        return self._apply(corpus)

    gammas, _ = self.inference([bow])
    doc_gamma = gammas[0]  # inference was over a chunk of size 1
    if numpy.allclose(doc_gamma, self.alpha):
        # if there were no topics found, return nothing (eg for empty documents)
        return []

    proportions = doc_gamma / doc_gamma.sum()  # convert to proper distribution
    kept = []
    for topicId, topicValue in enumerate(proportions):
        if topicValue >= eps:  # ignore document's topics that have prob < eps
            kept.append((topicId, topicValue))
    return kept
def __getitem__(self, bow, scaled=True):
    """
    Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space
    (`self.projection * vec`). By default the product is returned as computed;
    with `scaled=False` it is additionally multiplied by a term built from
    `1.0 / self.s`.

    Values that are not finite or are (numerically) zero are left out. If
    `bow` is in fact a corpus, the result of `self._apply` is returned instead.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.doc2vec(bow, self.numTerms)
    vec.shape = (self.numTerms, 1)
    topicDist = self.projection * vec
    if not scaled:
        topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
def __getitem__(self, bow, scaled=True):
    """
    Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space
    (`self.projection * vec`). By default the product is returned as computed;
    with `scaled=False` it is additionally multiplied by a term built from
    `1.0 / self.s`.

    Values that are not finite or are (numerically) zero are left out. If
    `bow` is in fact a corpus, the result of `self._apply` is returned instead.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.numTerms)
    vec.shape = (self.numTerms, 1)
    topicDist = self.projection * vec
    if not scaled:
        topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
def __init__(self, m, k, docs=None):
    """
    Store (U, S) projection itself. This is the class taking care of 'core math';
    interfacing with corpora, training etc is done through class LsiModel.

    `docs` is either a sparse matrix or a corpus which, when converted to a
    sparse matrix, must fit comfortably into main memory. When `docs` is None,
    both `self.u` and `self.s` are left as None.
    """
    self.m, self.k = m, k
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition
        # in core, algorithm 1
        # utils.isCorpus answers with an (is_corpus, corpus) pair; unpack it so
        # that a sparse matrix is not mistaken for a corpus
        is_corpus, docs = utils.isCorpus(docs)
        if is_corpus:
            docs = matutils.corpus2csc(m, docs)
        if docs.shape[1] <= max(k, 100):
            # for sufficiently small chunk size, compute svd(now, a) instead of svd(now, svd(a)).
            # this improves accuracy and is also faster for small chunks, because
            # we need to do one less svd.
            # on larger chunks this doesn't work because we run out of memory (chunks=1000
            # would already raise MemoryException on my machine)
            self.u = docs
            self.s = None
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError(
                    "for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`"
                )
            logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
            # ask for extra factors, because for some reason SVDLIBC sometimes
            # returns fewer factors than requested
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
            u = ut.T
            del ut, vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
    else:
        self.u, self.s = None, None
def __getitem__(self, bow, scaled=False):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space
    (`u^T * x`). With the default `scaled=False` the folded vector is used as
    is; with `scaled=True` it is additionally multiplied by the inverse of
    `self.projection.s` (`s^-1 * u^T * x`).

    Topics whose value is not finite or is (numerically) zero are left out.
    If `bow` is in fact a corpus, the result of `self._apply` is returned.
    """
    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true, so unpack it before branching
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    vec = numpy.asfortranarray(matutils.sparse2full(bow, self.numTerms), dtype=self.projection.u.dtype)
    vec.shape = (self.numTerms, 1)
    topicDist = scipy.linalg.fblas.dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
def test_None(self):
    """None is not a corpus; isCorpus must answer (False, None)."""
    outcome = utils.isCorpus(None)
    self.assertEqual((False, None), outcome)
def test_int_tuples(self):
    """A document whose weight is an int rather than a float still counts as a corpus."""
    potentialCorpus = [[(0, 4)]]
    outcome = utils.isCorpus(potentialCorpus)
    self.assertEqual((True, potentialCorpus), outcome)
def addDocuments(self, corpus, chunks=None, decay=None):
    """
    Update singular value decomposition factors to take into account a new
    corpus of documents.

    Training proceeds in chunks of `chunks` documents at a time. If the
    distributed mode is on, each chunk is sent to a different worker/computer.
    Size of `chunks` is a tradeoff between increased speed (bigger `chunks`) vs.
    lower memory footprint (smaller `chunks`). When `chunks` is None,
    `self.chunks` is used.

    Setting `decay` < 1.0 causes re-orientation towards new data trends in the
    input document stream, by giving less emphasis to old observations. This allows
    SVD to gradually "forget" old observations and give more preference to
    new ones. The decay is applied once after every `chunks` documents. When
    `decay` is None, `self.decay` is used.

    `corpus` may also be a single, already assembled job in the form of a
    `scipy.sparse.csc_matrix`; this is only accepted in serial mode.
    """
    logger.info("updating SVD with new documents")

    # get computation parameters; if not specified, use the ones from constructor
    if chunks is None:
        chunks = self.chunks
    if decay is None:
        decay = self.decay

    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true and the sparse-job branch below could never run
    is_corpus, corpus = utils.isCorpus(corpus)
    if is_corpus:
        # do the actual work -- perform iterative singular value decomposition.
        # floor division keeps `chunks` consecutive documents under one key;
        # true division would give every document a key (and chunk) of its own
        chunker = itertools.groupby(enumerate(corpus), key=lambda val: val[0] // chunks)
        doc_no = 0
        for chunk_no, (key, group) in enumerate(chunker):
            # construct the job as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense matrix!
            job = matutils.corpus2csc(self.numTerms, (doc for _, doc in group))
            doc_no += job.shape[1]
            if self.dispatcher:
                # distributed version: add this job to the job queue, so workers can work on it
                logger.debug("creating job #%i" % chunk_no)
                # put job into queue; this will eventually block, because the
                # queue has a small finite size
                self.dispatcher.putjob(job)
                del job
                logger.info("dispatched documents up to #%s" % doc_no)
            else:
                # serial version, there is only one "worker" (myself) => process the job directly
                update = Projection(self.numTerms, self.numTopics, job)
                del job
                self.projection.merge(update, decay=decay)
                del update
                logger.info("processed documents up to #%s" % doc_no)
                self.printDebug(5)

        if self.dispatcher:
            logger.info("reached the end of input; now waiting for all remaining jobs to finish")
            import time
            while self.dispatcher.jobsdone() <= chunk_no:
                time.sleep(0.5)  # check every half a second
            logger.info("all jobs finished, downloading final projection")
            del self.projection
            self.projection = self.dispatcher.getstate()
            logger.info("decomposition complete")
    else:
        assert not self.dispatcher, "must be in serial mode to receive jobs"
        assert isinstance(corpus, scipy.sparse.csc_matrix)
        update = Projection(self.numTerms, self.numTopics, corpus)
        self.projection.merge(update, decay=decay)
        logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
        self.printDebug(5)
def addDocuments(self, corpus, chunks=None, decay=None):
    """
    Update singular value decomposition factors to take into account a new
    corpus of documents.

    Training proceeds in chunks of `chunks` documents at a time. If the
    distributed mode is on, each chunk is sent to a different worker/computer.
    Size of `chunks` is a tradeoff between increased speed (bigger `chunks`) vs.
    lower memory footprint (smaller `chunks`). When `chunks` is None,
    `self.chunks` is used.

    Setting `decay` < 1.0 causes re-orientation towards new data trends in the
    input document stream, by giving less emphasis to old observations. This allows
    SVD to gradually "forget" old observations and give more preference to
    new ones. The decay is applied once after every `chunks` documents. When
    `decay` is None, `self.decay` is used.

    `corpus` may also be a single, already assembled job in the form of a
    `scipy.sparse.csc_matrix`; this is only accepted in serial mode.
    """
    logger.info("updating SVD with new documents")

    # get computation parameters; if not specified, use the ones from constructor
    if chunks is None:
        chunks = self.chunks
    if decay is None:
        decay = self.decay

    # utils.isCorpus answers with an (is_corpus, corpus) pair; testing the pair
    # itself would always be true and the sparse-job branch below could never run
    is_corpus, corpus = utils.isCorpus(corpus)
    if is_corpus:
        # do the actual work -- perform iterative singular value decomposition.
        # floor division keeps `chunks` consecutive documents under one key;
        # true division would give every document a key (and chunk) of its own
        chunker = itertools.groupby(enumerate(corpus), key=lambda val: val[0] // chunks)
        doc_no = 0
        for chunk_no, (key, group) in enumerate(chunker):
            # construct the job as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense matrix!
            job = matutils.corpus2csc(self.numTerms, (doc for _, doc in group))
            doc_no += job.shape[1]
            if self.dispatcher:
                # distributed version: add this job to the job queue, so workers can work on it
                logger.debug("creating job #%i" % chunk_no)
                # put job into queue; this will eventually block, because the
                # queue has a small finite size
                self.dispatcher.putjob(job)
                del job
                logger.info("dispatched documents up to #%s" % doc_no)
            else:
                # serial version, there is only one "worker" (myself) => process the job directly
                update = Projection(self.numTerms, self.numTopics, job)
                del job
                self.projection.merge(update, decay=decay)
                del update
                logger.info("processed documents up to #%s" % doc_no)
                #self.printDebug(5)
                self.printTopics(5)  # TODO see if printDebug works and remove one of these..

        if self.dispatcher:
            logger.info("reached the end of input; now waiting for all remaining jobs to finish")
            import time
            while self.dispatcher.jobsdone() <= chunk_no:
                time.sleep(0.5)  # check every half a second
            logger.info("all jobs finished, downloading final projection")
            del self.projection
            self.projection = self.dispatcher.getstate()
            logger.info("decomposition complete")
    else:
        assert not self.dispatcher, "must be in serial mode to receive jobs"
        assert isinstance(corpus, scipy.sparse.csc_matrix)
        update = Projection(self.numTerms, self.numTopics, corpus)
        self.projection.merge(update, decay=decay)
        logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
        self.printTopics(5)