def __getitem__(self, query): """Get similarities of the given document or corpus against this index. Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally. Notes ----- Passing an entire corpus as `query` can be more efficient than passing its documents one after another, because it will issue queries in batches internally. Parameters ---------- query : {list of (int, number), iterable of list of (int, number)} Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents. Returns ------- {`scipy.sparse.csr.csr_matrix`, list of (int, float)} Similarities given document or corpus and objects corpus, depends on `query`. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [ matutils.full2sparse_clipped(v, self.num_best) for v in result ] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
def __getitem__(self, query): """Get access to similarities of document/corpus `query` to all documents in the corpus. Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities` Notes ----- Passing corpus to `query` (instead of document) can be more efficient, because will processed in batching-way. Parameters ---------- query : {list of (int, int), iterable of list of (int, int)} Document or corpus in BoW format. Returns ------- {`scipy.sparse.csr.csr_matrix`, list of (int, float)} Similarities given document or corpus and objects corpus, depends on `query`. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [ matutils.full2sparse_clipped(v, self.num_best) for v in result ] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
def __getitem__(self, query): """Get similarities of the given document or corpus against this index. Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally. Notes ----- Passing an entire corpus as `query` can be more efficient than passing its documents one after another, because it will issue queries in batches internally. Parameters ---------- query : {list of (int, number), iterable of list of (int, number)} Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents. Returns ------- {`scipy.sparse.csr.csr_matrix`, list of (int, float)} Similarities given document or corpus and objects corpus, depends on `query`. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [matutils.full2sparse_clipped(v, self.num_best) for v in result] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
def __getitem__(self, query): """Get access to similarities of document/corpus `query` to all documents in the corpus. Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities` Notes ----- Passing corpus to `query` (instead of document) can be more efficient, because will processed in batching-way. Parameters ---------- query : {list of (int, int), iterable of list of (int, int)} Document or corpus in BoW format. Returns ------- {`scipy.sparse.csr.csr_matrix`, list of (int, float)} Similarities given document or corpus and objects corpus, depends on `query`. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [matutils.full2sparse_clipped(v, self.num_best) for v in result] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
import numpy as np
from scipy.sparse import issparse

from gensim import matutils


def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), \
        'Term frequencies and dictionary have different shape {} != {}'.format(
            term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If it's an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists

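# --- Usage sketch (illustrative, not part of the original source). ---
# Calling _extract_data with an LdaModel trained on the small corpus from the
# sketch above; doc_topic_dists=None lets the function run inference itself.
# This variant returns only the document-topic distributions.
from gensim.models import LdaModel

lda = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2)
doc_topic_dists = _extract_data(lda, bow_corpus, dictionary)
# doc_topic_dists has shape (num_documents, num_topics), with rows summing to 1.
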
def __getitem__(self, query): """Get similarities of document `query` to all documents in the corpus. **or** If `query` is a corpus (iterable of documents), return a matrix of similarities of all query documents vs. all corpus document. Using this type of batch query is more efficient than computing the similarities one document after another. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if matutils.ismatrix(query): import warnings # warnings.warn("non-gensim input must already come normalized") else: if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [ matutils.full2sparse_clipped(v, self.num_best) for v in result ] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
def __getitem__(self, query): """Get similarities of document `query` to all documents in the corpus. **or** If `query` is a corpus (iterable of documents), return a matrix of similarities of all query documents vs. all corpus document. Using this type of batch query is more efficient than computing the similarities one document after another. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if matutils.ismatrix(query): import warnings # noqa:F401 # warnings.warn("non-gensim input must already come normalized") else: if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [matutils.full2sparse_clipped(v, self.num_best) for v in result] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
from gensim import models


def extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If it's an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    coherence_model = models.CoherenceModel(
        model=topic_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'num_topics': num_topics,
    }

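# --- Usage sketch (illustrative, not part of the original source). ---
# Calling the dict-returning variant with the same hypothetical LDA setup from
# the earlier sketches; the returned dict bundles the distributions and corpus
# statistics together.
data = extract_data(lda, bow_corpus, dictionary)
# data['topic_term_dists'].shape == (num_topics, num_terms)
# data['doc_topic_dists'].shape == (num_documents, num_topics)
# data['doc_lengths'] holds the token count of each document.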