def test_simple_lists_of_tuples(self):
    # test list words

    # one document, one word
    potentialCorpus = [[(0, 4.)]]
    result = utils.is_corpus(potentialCorpus)
    expected = (True, potentialCorpus)
    self.assertEqual(expected, result)

    # one document, several words
    potentialCorpus = [[(0, 4.), (1, 2.)]]
    result = utils.is_corpus(potentialCorpus)
    expected = (True, potentialCorpus)
    self.assertEqual(expected, result)

    potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]]
    result = utils.is_corpus(potentialCorpus)
    expected = (True, potentialCorpus)
    self.assertEqual(expected, result)

    # several documents, one word
    potentialCorpus = [[(0, 4.)], [(1, 2.)]]
    result = utils.is_corpus(potentialCorpus)
    expected = (True, potentialCorpus)
    self.assertEqual(expected, result)

    potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]]
    result = utils.is_corpus(potentialCorpus)
    expected = (True, potentialCorpus)
    self.assertEqual(expected, result)
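A short, hedged sketch of `utils.is_corpus` itself, complementing the list-of-tuples tests above. For an iterator input the function has to peek at the first document, so it returns a reconstituted iterator that must be used in place of the original; the variable names and toy documents below are illustrative only.

from gensim import utils

docs = [[(0, 1.0), (2, 3.0)], [(1, 0.5)]]
print(utils.is_corpus(docs))            # (True, [[(0, 1.0), (2, 3.0)], [(1, 0.5)]])

gen = iter(docs)
is_corpus, gen = utils.is_corpus(gen)   # the first document is consumed while peeking,
print(is_corpus, list(gen))             # so always keep working with the returned object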
def __getitem__(self, bow, eps=1e-12):
    """
    Return the ESA representation of the input vector and/or corpus.

    `bow` should already be weighted, e.g. with TF-IDF.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # use the similarity index to calculate the similarity with each vector of the corpus
    vector = self.similarity_index[bow]

    # cosine similarity is in [-1, 1]; shift and scale to make it [0, 1]
    vector += 1
    vector /= 2

    # normalize
    vector = matutils.unitvec(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [
        (concept_id, weight)
        for concept_id, weight in enumerate(vector)
        if abs(weight) > eps
    ]
    return vector
def __getitem__(self, bow, eps=1e-12):
    """
    Return the ESA representation of the input vector and/or corpus.

    `bow` should already be weighted, e.g. with TF-IDF.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # use the corpus as an interpreter matrix: multiply the feature vector of the input
    # with the corpus matrix to get the weight of each concept
    vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

    # normalize
    vector = matutils.unitvec(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [
        (concept_id, weight)
        for concept_id, weight in enumerate(vector)
        if abs(weight) > eps
    ]
    return vector
def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
    """
    Return the topic distribution for the given document `bow`, as a list of
    (topic_id, topic_probability) 2-tuples. Topics with a very low probability
    (below `minimum_probability`) are ignored.

    If `per_word_topics` is True, also return a list of topics for each word, sorted
    in descending order of the most likely topics for that word, as well as a list of
    word_ids with each word's corresponding topics' phi values, multiplied by the
    feature length (i.e. word count).
    """
    if minimum_probability is None:
        minimum_probability = self.minimum_probability
    minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

    if minimum_phi_value is None:
        minimum_phi_value = self.minimum_probability
    minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

    # if the input vector is a corpus, return a transformed corpus
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        kwargs = dict(
            per_word_topics=per_word_topics,
            minimum_probability=minimum_probability,
            minimum_phi_value=minimum_phi_value
        )
        return self._apply(corpus, **kwargs)

    gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
    topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

    document_topics = [
        (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
        if topicvalue >= minimum_probability
    ]

    if not per_word_topics:
        return document_topics

    word_topic = []  # contains each word and its corresponding topics
    word_phi = []  # contains each word and its phi values
    for word_type, weight in bow:
        phi_values = []  # contains (phi_value, topic) pairings, to be sorted later
        phi_topic = []  # contains (topic, phi_value) pairs, returned 'raw' to the user
        for topic_id in range(0, self.num_topics):
            if phis[topic_id][word_type] >= minimum_phi_value:
                # append phi values for each topic for that word;
                # these phi values are scaled by feature length
                phi_values.append((phis[topic_id][word_type], topic_id))
                phi_topic.append((topic_id, phis[topic_id][word_type]))

        # list with {word_id => [(topic_0, phi_value), (topic_1, phi_value), ...]}
        word_phi.append((word_type, phi_topic))

        # sort the topics based on the most likely topic for the word;
        # produces a list like {word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]}
        sorted_phi_values = sorted(phi_values, reverse=True)
        topics_sorted = [x[1] for x in sorted_phi_values]
        word_topic.append((word_type, topics_sorted))

    return document_topics, word_topic, word_phi  # returns a 3-tuple when per_word_topics is True
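A minimal usage sketch for get_document_topics, assuming gensim's LdaModel; the toy texts and variable names are invented for illustration, not taken from the surrounding code.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "computer", "interaction"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10, random_state=0)

# single document -> list of (topic_id, probability)
print(lda.get_document_topics(corpus[0], minimum_probability=0.05))

# per_word_topics=True -> the 3-tuple described in the docstring
doc_topics, word_topics, word_phis = lda.get_document_topics(corpus[0], per_word_topics=True)

# whole corpus -> lazily transformed corpus (goes through the is_corpus/_apply branch)
for doc in lda.get_document_topics(corpus):
    print(doc)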
def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
    is_corpus, current_representation = utils.is_corpus(current_representation)
    if is_corpus:
        for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
            ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
            assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
            chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

            ln.debug("Chunk converted to csc, running through layer..")
            chunk_trans = layer.__getitem__(chunk_as_csc)

            ln.debug("Serializing hidden representation..")
            fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
            np.save(fname, chunk_trans)
            ln.debug("Finished serializing chunk. Processed %s documents so far." % (chunk_no * chunksize + len(chunk)))
    else:
        ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
        ln.debug("Type of current_representation is %s" % type(current_representation))
        for chunk_no, chunk in enumerate(current_representation):
            ln.debug("converting chunk (%s documents)..." % chunksize)
            chunk_trans = layer.__getitem__(chunk)
            ln.debug("Serializing hidden representation..")
            fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
            np.save(fname, chunk_trans)
            ln.debug("finished serializing chunk.")

    ln.info("Finished serializing all chunks.")
def __getitem__(self, bow, chunksize=10000):
    # ln.debug("getitem: %s" % chunksize)
    is_corpus, bow = utils.is_corpus(bow)
    if not is_corpus:
        bow = [bow]

    ln.info("Computing hidden representation for %s documents..." % len(bow))

    if not chunksize:  # todo I think could be removed altogether
        chunksize = 1

    def transformed_corpus():
        for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
            ln.debug("Converting chunk %s to csc format.." % chunk_no)
            chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
            ln.debug("Computing hidden representation for chunk.. ")
            hidden = self._get_hidden_representations(chunk)
            ln.info(
                "Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                (chunk_no, chunk_no * chunksize + len(doc_chunk))
            )
            for column in hidden.T:
                yield matutils.dense2vec(column.T)
            ln.debug("Done yielding chunk %s" % chunk_no)

        ln.info("Finished computing representations for all chunks.")

    if not is_corpus:
        res = list(transformed_corpus()).pop()
        return res
    else:
        return transformed_corpus()
def __getitem__(self, bow, eps=1e-12):
    """
    Return the tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [
        (termid, self.wlocal(tf) * self.idfs.get(termid))
        for termid, tf in bow
        if self.idfs.get(termid, 0.0) != 0.0
    ]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
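A short, hedged example of how this tf-idf transformation is typically driven, assuming gensim's TfidfModel; the toy texts and variable names are assumptions, not part of the original.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["cat", "sat", "mat"], ["cat", "ate", "fish"], ["dog", "sat", "mat"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus)
print(tfidf[corpus[0]])        # single document -> sparse list of (term_id, weight)
print(list(tfidf[corpus]))     # whole corpus -> handled by the is_corpus branch / _apply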
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return the latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to a gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
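For context, a minimal sketch of calling the model both ways (single document vs. corpus), assuming gensim's LsiModel; the toy corpus and names are illustrative only.

from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["human", "computer", "interaction"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
print(lsi[corpus[0]])          # single document -> [(topic_id, value), ...]
for vec in lsi[corpus]:        # corpus -> chunked transformation via _apply
    print(vec)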
def get_similarities(self, query):
    """
    Return the similarity of the sparse vector `query` to all documents in the corpus,
    as a numpy array.

    If `query` is a collection of documents, return a 2D array of the similarities of
    each document in `query` to all documents in the corpus (=batch query, faster than
    processing each document in turn).

    **Do not use this function directly; use the self[query] syntax instead.**
    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.T  # convert documents=rows to documents=columns
        elif isinstance(query, numpy.ndarray):
            if query.ndim == 1:
                query.shape = (1, len(query))
            query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
        else:
            # default case: query is a single vector, in sparse gensim format
            query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

    # compute cosine similarity against every other document in the collection
    result = self.index * query.tocsc()  # N x T * T x C = N x C
    if result.shape[1] == 1:
        # for queries of one document, return a 1d array
        result = result.toarray().flatten()
    else:
        # otherwise, return a 2d matrix (#queries x #index)
        result = result.toarray().T
    return result
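As the docstring says, this is driven through the index's `self[query]` syntax. A hedged sketch assuming gensim's SparseMatrixSimilarity; the toy corpus and names are assumptions.

from gensim.corpora import Dictionary
from gensim.similarities import SparseMatrixSimilarity

texts = [["cat", "sat", "mat"], ["dog", "sat", "mat"], ["cat", "ate", "fish"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
print(index[corpus[0]])        # single query -> 1-d array of similarities
print(index[corpus])           # batch query -> 2-d array, one row per query document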
def __getitem__(self, bow, iterations=100):
    """Get the vector for a document (or corpus).

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document (or corpus) in BoW format.
    iterations : int, optional
        Number of iterations that will be used for inferring.

    Returns
    -------
    list of (int, float)
        LDA vector for the document as a sequence of (topic_id, topic_probability) **OR**
    list of list of (int, float)
        LDA vectors for the corpus in the same format.

    """
    is_corpus, corpus = utils.is_corpus(bow)
    if not is_corpus:
        # query is a single document => make a corpus out of it
        bow = [bow]

    self.convert_input(bow, infer=True)
    cmd = \
        self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
        '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    cmd = cmd % (
        self.fcorpusmallet() + '.infer', self.finferencer(),
        self.fdoctopics() + '.infer', iterations, self.topic_threshold
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    check_output(args=cmd, shell=True)
    result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    return result if is_corpus else result[0]
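A hedged sketch of driving this wrapper, assuming a working local MALLET installation (the path below is a placeholder, and the toy corpus/dictionary names are invented; in older gensim releases the class lives under gensim.models.wrappers).

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

texts = [["human", "computer", "interaction"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

mallet_path = "/path/to/mallet"   # hypothetical location of the MALLET launcher script
lda_mallet = LdaMallet(mallet_path, corpus=corpus, id2word=dictionary, num_topics=2)

print(lda_mallet[corpus[0]])      # single document -> [(topic_id, probability), ...]
print(list(lda_mallet[corpus]))   # corpus -> list of such vectors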
def __getitem__(self, query):
    """Get similarities of the document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. This batch query is more
    efficient than computing the similarities one document after another.
    """
    self.close_shard()  # no-op if no documents added to index since last query

    results = []
    for shard in self.shards:
        shard.num_best = self.num_best
        shard.normalize = self.normalize
        results.append(shard[query])

    if self.num_best is None:
        return numpy.hstack(results)

    # only the top-n most similar documents were requested; merge the partial results from all shards
    is_corpus, results = utils.is_corpus(results)
    if is_corpus:
        # query = single document?
        result = sorted(sum(results, []), key=lambda item: -item[1])[:self.num_best]
    else:
        result = []
        for parts in itertools.izip(*results):
            merged = sorted(sum(parts, []), key=lambda item: -item[1])[:self.num_best]
            result.append(merged)
    return result
def top_topics(self, corpus, num_words=20):
    """
    Calculate the UMass topic coherence for each topic. Algorithm from
    **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence
    in Topic Models, EMNLP 2011.**
    """
    is_corpus, corpus = utils.is_corpus(corpus)
    if not is_corpus:
        logger.warning("LdaModel.top_topics() called with an empty corpus")
        return

    topics = []
    str_topics = []
    for topic in self.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn=num_words, reverse=True)
        topics.append(bestn)
        beststr = [(topic[id], self.id2word[id]) for id in bestn]
        str_topics.append(beststr)

    # top_ids are limited to every topic's top words; should not exceed the vocabulary size.
    top_ids = set(chain.from_iterable(topics))

    # create a document occurrence sparse matrix for each word
    doc_word_list = {}
    for id in top_ids:
        id_list = set()
        for n, document in enumerate(corpus):
            if id in frozenset(x[0] for x in document):
                id_list.add(n)
        doc_word_list[id] = id_list

    coherence_scores = []
    for t, top_words in enumerate(topics):
        # Calculate each coherence score C(t, top_words)
        coherence = 0.0
        # Sum over top words m=2..M
        for m_index, m in enumerate(top_words[1:], start=1):
            # m_docs is v_m^(t)
            m_docs = doc_word_list[m]

            # Sum over top words l=1..m-1,
            # i.e. all words ranked higher than the current word m
            for l in top_words[:m_index]:
                # l_docs is v_l^(t)
                l_docs = doc_word_list[l]

                # make sure this word appears in some documents.
                if len(l_docs) > 0:
                    # co_doc_frequency is D(v_m^(t), v_l^(t))
                    co_doc_frequency = len(m_docs.intersection(l_docs))

                    # add to the coherence sum for these two words m, l
                    coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

        coherence_scores.append((str_topics[t], coherence))

    top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
    return top_topics
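A hedged usage sketch matching the signature shown above (num_words; newer gensim releases rename this parameter to topn). The toy texts and variable names are assumptions for illustration.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [
    ["human", "computer", "interaction"],
    ["graph", "trees", "minors"],
    ["human", "system", "trees"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10, random_state=0)
for topic_words, umass_score in lda.top_topics(corpus, num_words=5):
    # each entry is ([(probability, word), ...], coherence), sorted best-first
    print(round(umass_score, 3), [word for _, word in topic_words])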
def get_similarities(self, query):
    """
    Return the similarity of the sparse vector `query` to all documents in the corpus,
    as a numpy array.

    If `query` is a collection of documents, return a 2D array of the similarities of
    each document in `query` to all documents in the corpus (=batch query, faster than
    processing each document in turn).

    **Do not use this function directly; use the self[query] syntax instead.**
    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = numpy.asarray(
            [matutils.sparse2full(vec, self.num_features) for vec in query],
            dtype=self.index.dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.toarray()  # convert sparse to dense
        elif isinstance(query, numpy.ndarray):
            pass
        else:
            # default case: query is a single vector in sparse gensim format
            query = matutils.sparse2full(query, self.num_features)
        query = numpy.asarray(query, dtype=self.index.dtype)

    # do a little transposition dance to stop numpy from making a copy of
    # self.index internally in numpy.dot (very slow).
    result = numpy.dot(self.index, query.T).T  # return #queries x #index
    return result  # XXX: removed casting the result from array to list; does anyone care?
def get_similarities(self, query):
    """Get the similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document or collection of documents.

    Returns
    -------
    :class:`numpy.ndarray`
        Similarity matrix.

    """
    if not self.corpus:
        return numpy.array([])

    is_corpus, query = utils.is_corpus(query)
    if not is_corpus and isinstance(query, numpy.ndarray):
        query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
    result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)

    if scipy.sparse.issparse(result):
        return numpy.asarray(result.todense())
    if numpy.isscalar(result):
        return numpy.array(result)
    return numpy.asarray(result)[0]
def __getitem__(self, bow):
    """Get the log-entropy representation of the input vector and/or corpus.

    Parameters
    ----------
    bow : list of (int, int)
        Document in BoW format.

    Returns
    -------
    list of (int, float)
        Log-entropy vector for the passed `bow`.

    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [
        (term_id, math.log(tf + 1) * self.entr.get(term_id))
        for term_id, tf in bow
        if term_id in self.entr
    ]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
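A brief, hedged example of this transformation in use, assuming gensim's LogEntropyModel; toy corpus and names are illustrative only.

from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

texts = [["cat", "sat", "mat"], ["dog", "sat", "mat"], ["cat", "ate", "fish"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

log_ent = LogEntropyModel(corpus)
print(log_ent[corpus[0]])      # single document -> [(term_id, log-entropy weight), ...]
print(list(log_ent[corpus]))   # corpus -> transformed lazily via _apply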
def __getitem__(self, bow, eps=0.01):
    """Convert a document or corpus in BoW format to LDA vectors in BoW format.

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document or corpus in BoW format.
    eps : float
        Threshold value (all topics with a probability < `eps` will be ignored).

    Returns
    -------
    list of (int, float)
        LDA vector for the document **OR**
    list of list of (int, float)
        LDA vectors for the corpus.

    """
    is_corpus, dummy_corpus = utils.is_corpus(bow)
    if not is_corpus:
        bow = [bow]

    predictions = self._predict(bow)[0]

    topics = []
    for row in predictions:
        row_topics = []
        for topic_id, val in enumerate(row):
            if val > eps:
                row_topics.append((topic_id, val))
        topics.append(row_topics)

    return topics if is_corpus else topics[0]
def _getbow(self, doc):
    # if doc is an iterable, apply to all
    is_corpus, doc = utils.is_corpus(doc)
    if is_corpus:
        return SimpleCorpus(self._apply(doc))

    return self.dict.doc2bow(doc, allow_update=True)
def __getitem__(self, doc):
    # if doc is an iterable, apply to all
    is_corpus, doc = utils.is_corpus(doc)
    if is_corpus:
        return self._apply(doc)

    # return the transformed doc according to the function
    return self.funct(doc, *self.fargs, **self.fkwargs)
def __getitem__(self, bow, eps=0.01):
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(corpus)

    gamma = self.inference([bow])[0]
    topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else []
    return [
        (topicid, topicvalue)
        for topicid, topicvalue in enumerate(topic_dist)
        if topicvalue >= eps
    ]
def __getitem__(self, bow, eps=0.01):
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(corpus)

    gamma, _ = self.inference([bow])
    topic_dist = gamma[0] / sum(gamma[0])  # normalize to a proper distribution
    return [topicvalue for topicid, topicvalue in enumerate(topic_dist)]
def __getitem__(self, bow, eps=1e-12):
    """Get the tf-idf representation of an input vector and/or corpus.

    Parameters
    ----------
    bow : {list of (int, int), iterable of iterable of (int, int)}
        Input document in the `sparse Gensim bag-of-words format
        <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
        or a streamed corpus of such documents.
    eps : float
        Threshold value; all positions with a tf-idf value less than `eps` will be removed.

    Returns
    -------
    vector : list of (int, float)
        TfIdf vector, if `bow` is a single document
    :class:`~gensim.interfaces.TransformedCorpus`
        TfIdf corpus, if `bow` is a corpus.

    """
    self.eps = eps
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    termid_array, tf_array = [], []
    for termid, tf in bow:
        termid_array.append(termid)
        tf_array.append(tf)

    tf_array = self.wlocal(np.array(tf_array))

    vector = [
        (termid, tf * self.idfs.get(termid))
        for termid, tf in zip(termid_array, tf_array)
        if abs(self.idfs.get(termid, 0.0)) > self.eps
    ]

    if self.normalize is True:
        self.normalize = matutils.unitvec
    elif self.normalize is False:
        self.normalize = utils.identity

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.pivot is None:
        norm_vector = self.normalize(vector)
        norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
    else:
        _, old_norm = self.normalize(vector, return_norm=True)
        pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
        norm_vector = [
            (termid, weight / float(pivoted_norm))
            for termid, weight in vector
            if abs(weight / float(pivoted_norm)) > self.eps
        ]
    return norm_vector
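A hedged sketch contrasting the two normalization branches above, assuming a gensim version whose TfidfModel accepts the pivot and slope keyword arguments; the toy corpus and names are assumptions.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["cat", "sat", "mat"], ["dog", "sat", "mat", "mat"], ["cat", "ate", "fish"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# default cosine (unit-length) normalization: takes the `self.pivot is None` branch
tfidf = TfidfModel(corpus)
print(tfidf[corpus[1]])

# pivoted document-length normalization: takes the else-branch with pivoted_norm
tfidf_pivoted = TfidfModel(corpus, pivot=10, slope=0.5)
print(tfidf_pivoted[corpus[1]])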
def test_getitem_dense2gensim(self):
    corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                           dim=self.dim, sparse_serialization=False,
                           gensim=True)

    item = corpus[3]
    self.assertTrue(isinstance(item, list))
    self.assertTrue(isinstance(item[0], tuple))

    dslice = corpus[2:6]
    self.assertTrue(next(dslice) == corpus[2])
    dslice = list(dslice)
    self.assertTrue(isinstance(dslice, list))
    self.assertTrue(isinstance(dslice[0], list))
    self.assertTrue(isinstance(dslice[0][0], tuple))

    iscorp, _ = is_corpus(dslice)
    self.assertTrue(iscorp, "Is the object returned by slice notation "
                            "a gensim corpus?")

    ilist = corpus[[2, 3, 4, 5]]
    self.assertTrue(next(ilist) == corpus[2])
    ilist = list(ilist)
    self.assertTrue(isinstance(ilist, list))
    self.assertTrue(isinstance(ilist[0], list))
    self.assertTrue(isinstance(ilist[0][0], tuple))

    # From generators to lists
    self.assertEqual(len(ilist), len(dslice))
    for i in xrange(len(ilist)):
        self.assertEqual(len(ilist[i]), len(dslice[i]),
                         "Row %d: dims %d/%d" % (i, len(ilist[i]), len(dslice[i])))
        for j in xrange(len(ilist[i])):
            self.assertEqual(ilist[i][j], dslice[i][j],
                             "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                 i, j, str(ilist[i][j]), i, j, str(dslice[i][j])))

    iscorp, _ = is_corpus(ilist)
    self.assertTrue(iscorp, "Is the object returned by list notation "
                            "a gensim corpus?")
def __getitem__(self, item):
    iscorpus, _ = is_corpus(item)
    if iscorpus or isinstance(item, DatasetABC):
        return self._apply(item)
    else:
        raise ValueError('Cannot apply flatten_composite to individual documents.')
def __getitem__(self, doc):
    # if doc is an iterable, apply to all
    is_corpus, doc = utils.is_corpus(doc)
    if is_corpus:
        return self._apply(doc)

    self.counter.update(doc)
    return doc
def __getitem__(self, doc):
    # if doc is an iterable, apply to all
    is_corpus, doc = utils.is_corpus(doc)
    if is_corpus:
        return self._apply(doc)

    # applying the transformation, return doc as a bag-of-bitokens list
    allow_update = False if len(self.bidict) > 0 else True
    return self.bidict.doc2bob(doc, allow_update)
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return the latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.

    If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    # # convert input to dense, then do dense * dense multiplication
    # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
    # # but consumes more memory
    # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
    # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

    # # use np's advanced indexing to simulate sparse * dense
    # # ± same speed again
    # u = self.projection.u[:, :self.num_topics]
    # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
    # for vecno, vec in enumerate(bow):
    #     indices, data = zip(*vec) if vec else ([], [])
    #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

    if not is_corpus:
        # convert back from a matrix into a 1d vec
        topic_dist = topic_dist.reshape(-1)

    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a np array to a gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
def __getitem__(self, bow):
    """
    Return the representation with the ids transformed.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
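A small, hedged illustration of this id-remapping transformation, assuming gensim's VocabTransform; the mapping and document below are toy values.

from gensim.models import VocabTransform

old2new = {0: 0, 3: 1, 7: 2}           # old term id -> new compact term id (toy mapping)
vt = VocabTransform(old2new)

doc = [(0, 2.0), (3, 1.0), (5, 4.0)]   # id 5 is not in old2new and gets dropped
print(vt[doc])                          # -> [(0, 2.0), (1, 1.0)]
print(list(vt[[doc, doc]]))             # a corpus goes through the is_corpus/_apply branch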
def __getitem__(self, item):
    iscorpus, _ = is_corpus(item)
    if iscorpus:
        return self._apply(item)
    else:
        # raise ValueError('Cannot apply serializer to individual documents.')
        # Will this work?
        return self.serialized_data[item]
def __getitem__(self, vec, eps=1e-12):
    is_corpus, vec = utils.is_corpus(vec)
    if is_corpus:
        return self._apply(vec)

    if self.L1:
        score = sum(v for _, v in vec) / len(vec) if vec else 0
    else:
        score = sum(v * v for _, v in vec) / len(vec) if vec else 0
    return score
def __getitem__(self, bow, iterations=100):
    is_corpus, corpus = utils.is_corpus(bow)
    if not is_corpus:
        bow = [bow]

    self.convert_input(bow, infer=True)
    cmd = self.mallet_path + \
        " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
    cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations)
    logger.info("inferring with MALLET LDA: %s" % cmd)
    call(cmd, shell=True)
    return list(read_doctopics(self.fdoctopics() + '.infer'))