def _get_topics(self):
    """Internal helper function to return topics from a trained topic model."""
    topics = []
    # FIXME: meant to work for LdaModel, LdaVowpalWabbit right now. Make it work for others.
    if isinstance(self.model, LdaModel):
        for topic in self.model.state.get_lambda():
            bestn = argsort(topic, topn=10, reverse=True)
            topics.append(bestn)
    elif isinstance(self.model, LdaVowpalWabbit):
        for topic in self.model._get_topics():
            bestn = argsort(topic, topn=10, reverse=True)
            topics.append(bestn)
    return topics

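# Added illustration (not from any of the snippets in this collection): every
# function here ranks a score vector with gensim's `matutils.argsort`, which
# returns the indices of the `topn` largest entries when `reverse=True`.
# A minimal, runnable demo of that primitive:
import numpy as np
from gensim import matutils

scores = np.array([0.1, 0.7, 0.3, 0.9])
print(matutils.argsort(scores, topn=2, reverse=True))  # -> [3 1], best score first
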
def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
    """
    Print the `num_words` most probable words for `num_topics` number of topics.
    Set `num_topics=-1` to print all topics.

    Set `formatted=True` to return the topics as a list of strings, or `False`
    as lists of (weight, word) pairs.

    """
    if num_topics < 0 or num_topics >= self.num_topics:
        num_topics = self.num_topics
        chosen_topics = range(num_topics)
    else:
        num_topics = min(num_topics, self.num_topics)
        # add a little random jitter, to randomize results around the same alpha
        sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))
        sorted_topics = list(matutils.argsort(sort_alpha))
        chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
    shown = []
    for i in chosen_topics:
        if formatted:
            topic = self.print_topic(i, topn=num_words)
        else:
            topic = self.show_topic(i, topn=num_words)
        shown.append((i, topic))
        if log:
            logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic)
    return shown

def top_topics_as_word_lists(model, dictionary, topn=20):
    """Get `topn` topics as lists of words.

    Parameters
    ----------
    model : :class:`~gensim.models.basemodel.BaseTopicModel`
        Pre-trained topic model.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
        Gensim dictionary mapping word ids to words.
    topn : int, optional
        Number of top words to be extracted from each topic.

    Returns
    -------
    list of list of str
        Top topics in list-of-list-of-words format.

    """
    if not dictionary.id2token:
        dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

    str_topics = []
    for topic in model.get_topics():
        bestn = matutils.argsort(topic, topn=topn, reverse=True)
        beststr = [dictionary.id2token[_id] for _id in bestn]
        str_topics.append(beststr)
    return str_topics

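# Hypothetical usage sketch for `top_topics_as_word_lists` above; the toy corpus
# and underscore-prefixed names are illustrative, not from the original source.
# Assumes gensim's standard Dictionary/LdaModel API.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

_texts = [["human", "computer", "interface"],
          ["graph", "minors", "trees"],
          ["graph", "computer", "user"]]
_dictionary = Dictionary(_texts)
_corpus = [_dictionary.doc2bow(t) for t in _texts]
_lda = LdaModel(corpus=_corpus, id2word=_dictionary, num_topics=2, random_state=0)
# each inner list holds the topn most probable words of one topic
print(top_topics_as_word_lists(_lda, _dictionary, topn=3))
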
def show_topic(self, topicno, topn=10):
    """Get the words that define a topic along with their contribution.

    This is actually the left singular vector of the specified topic.

    The most important words in defining the topic (greatest absolute value) are included
    in the output, along with their contribution to the topic.

    Parameters
    ----------
    topicno : int
        The topic's id number.
    topn : int
        Number of words to be included in the result.

    Returns
    -------
    list of (str, float)
        Topic representation as (word, contribution) pairs.

    """
    # size of the projection matrix can actually be smaller than `self.num_topics`,
    # if there were not enough factors (real rank of input matrix smaller than
    # `self.num_topics`). in that case, return an empty string
    if topicno >= len(self.projection.u.T):
        return ''
    c = np.asarray(self.projection.u.T[topicno, :]).flatten()
    norm = np.sqrt(np.sum(np.dot(c, c)))
    most = matutils.argsort(np.abs(c), topn, reverse=True)
    return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]

def _get_topics_from_model(model, topn):
    """Internal helper function to return topics from a trained topic model.

    Parameters
    ----------
    model : :class:`~gensim.models.basemodel.BaseTopicModel`
        Pre-trained topic model.
    topn : int
        Number of top words to extract from each topic.

    Returns
    -------
    list of :class:`numpy.ndarray`
        Topics matrix.

    """
    try:
        return [
            matutils.argsort(topic, topn=topn, reverse=True)
            for topic in model.get_topics()
        ]
    except AttributeError:
        raise ValueError(
            "This topic model is not currently supported. Supported topic models"
            " should implement the `get_topics` method.")

def optimal_ordering(self):
    """Performs ordering on the topics."""
    idx = matutils.argsort(self.m_lambda_sum, reverse=True)
    self.m_varphi_ss = self.m_varphi_ss[idx]
    self.m_lambda = self.m_lambda[idx, :]
    self.m_lambda_sum = self.m_lambda_sum[idx]
    self.m_Elogbeta = self.m_Elogbeta[idx, :]

def get_topic_terms(self, topicid, topn=10, normalize=None):
    """Get the representation for a single topic. Words are the integer IDs, in contrast to
    :meth:`~gensim.models.nmf.Nmf.show_topic` which represents words by the actual strings.

    Parameters
    ----------
    topicid : int
        The ID of the topic to be returned.
    topn : int, optional
        Number of the most significant words that are associated with the topic.
    normalize : bool or None, optional
        Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

    Returns
    -------
    list of (int, float)
        Word ID - probability pairs for the most relevant words generated by the topic.

    """
    topic = self._W[:, topicid]

    if normalize is None:
        normalize = self.normalize
    if normalize:
        topic /= topic.sum()

    bestn = matutils.argsort(topic, topn, reverse=True)
    return [(idx, topic[idx]) for idx in bestn]

def top_topics(self, corpus, num_words=20):
    """
    Calculate the UMass topic coherence for each topic. Algorithm from
    **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic
    Coherence in Topic Models, EMNLP 2011.**
    """
    is_corpus, corpus = utils.is_corpus(corpus)
    if not is_corpus:
        logger.warning("LdaModel.top_topics() called with an empty corpus")
        return

    topics = []
    str_topics = []
    for topic in self.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn=num_words, reverse=True)
        topics.append(bestn)
        beststr = [(topic[id], self.id2word[id]) for id in bestn]
        str_topics.append(beststr)

    # top_ids are limited to every topic's top words. should not exceed the
    # vocabulary size.
    top_ids = set(chain.from_iterable(topics))

    # create a document occurrence sparse matrix for each word
    doc_word_list = {}
    for id in top_ids:
        id_list = set()
        for n, document in enumerate(corpus):
            if id in frozenset(x[0] for x in document):
                id_list.add(n)
        doc_word_list[id] = id_list

    coherence_scores = []
    for t, top_words in enumerate(topics):
        # Calculate each coherence score C(t, top_words)
        coherence = 0.0
        # Sum over top words m=2..M; m_index is the rank of word m within the topic
        # (the original sliced `top_words[:m - 1]` using the word id `m` as an index,
        # which is a bug; slice by rank instead)
        for m_index, m in enumerate(top_words[1:], start=1):
            # m_docs is v_m^(t)
            m_docs = doc_word_list[m]
            # Sum over top words l=1..m-1,
            # i.e., all words ranked higher than the current word m
            for l in top_words[:m_index]:
                # l_docs is v_l^(t)
                l_docs = doc_word_list[l]
                # make sure this word appears in some documents.
                if len(l_docs) > 0:
                    # co_doc_frequency is D(v_m^(t), v_l^(t))
                    co_doc_frequency = len(m_docs.intersection(l_docs))
                    # add to the coherence sum for these two words m, l
                    coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))
        coherence_scores.append((str_topics[t], coherence))

    top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
    return top_topics

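# Worked toy example (added, not from the original source) of one UMass term as
# summed above: with higher-ranked word l in docs {0, 1} and current word m in
# docs {0, 2}, D(v_m, v_l) = 1, so the contribution is
# log((D(v_m, v_l) + 1) / D(v_l)) = log(2 / 2) = 0.
import numpy as np

_l_docs = {0, 1}
_m_docs = {0, 2}
_co_doc_frequency = len(_m_docs.intersection(_l_docs))  # 1
print(np.log((_co_doc_frequency + 1.0) / len(_l_docs)))  # 0.0
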
def show_topic(self, topicid, topn=10, num_words=None):
    """Get the `topn` most probable words for the given `topicid`.

    Parameters
    ----------
    topicid : int
        Id of topic.
    topn : int, optional
        Number of most probable words to return.
    num_words : int, optional
        DEPRECATED PARAMETER, use `topn` instead.

    Returns
    -------
    list of (str, float)
        Sequence of probable words, as a list of `(word, word_probability)` for `topicid` topic.

    """
    if num_words is not None:  # deprecated num_words is used
        warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
        topn = num_words
    if self.word_topics is None:
        logger.warning("Run train or load_word_topics before showing topics.")
    topic = self.word_topics[topicid]
    topic = topic / topic.sum()  # normalize to probability dist
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
    return beststr

def print_topics(self, ldamodel, topn=10):
    Lambda = ldamodel.state.get_lambda()
    Phi = Lambda / Lambda.sum(axis=1)[:, np.newaxis]    # p(w|k)
    Phi2 = Lambda / Lambda.sum(axis=0)[np.newaxis, :]   # p(k|w)
    entropy = np.zeros(Phi2.shape[1])
    topics = ""
    # compute E_w = sum_k p(k|w) * log p(k|w), i.e. the negated entropy H_w = -E_w
    for w in range(Phi2.shape[1]):
        for k in range(Phi2.shape[0]):
            entropy[w] += Phi2[k, w] * np.log2(Phi2[k, w] + 1e-100)
    print(entropy)
    # compute p(w|k) * e^(-H_w): down-weight words spread across many topics
    for k in range(Phi.shape[0]):
        for w in range(Phi.shape[1]):
            Phi[k, w] = Phi[k, w] / pow(math.e, (-1) * entropy[w])
    for k in range(Phi.shape[0]):
        bestn = matutils.argsort(Phi[k], topn, reverse=True)
        topic_terms = [(id, Phi[k, id]) for id in bestn]
        lda_words = [(ldamodel.id2word[id], value) for id, value in topic_terms]
        topics += ' + '.join(['%.3f*%s' % (v, k) for k, v in lda_words]) + "\n"
    return topics

def show_topic(self, topicid, time, topn=50, num_words=None):
    """Get the `topn` most probable words for the given `topicid`.

    Parameters
    ----------
    topicid : int
        Id of topic.
    time : int
        Timestamp.
    topn : int, optional
        Number of most probable words to return.
    num_words : int, optional
        DEPRECATED PARAMETER, use `topn` instead.

    Returns
    -------
    list of (float, str)
        Sequence of probable words, as a list of `(word_probability, word)`.

    """
    if num_words is not None:  # deprecated num_words is used
        warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
        topn = num_words

    topics = self.lambda_[:, :, time]
    topic = topics[topicid]
    # likelihood to probability
    topic = np.exp(topic)
    # normalize to probability dist
    topic = topic / topic.sum()
    # sort according to prob
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
    return beststr

def most_similar_to_vec(vector, model, topn, list_words):
    dists = np.dot(model.syn0norm, vector)
    best = matutils.argsort(dists, topn=topn + len(list_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best if model.index2word[sim] not in list_words]
    return result[:topn]

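# Hypothetical usage of `most_similar_to_vec` above with a stub "model" whose
# rows are unit vectors; `types.SimpleNamespace` stands in for a real word2vec
# model (illustration only, not from the original source).
import types
import numpy as np

_stub = types.SimpleNamespace(
    syn0norm=np.eye(3),                 # three orthogonal unit word vectors
    index2word=["cat", "dog", "fish"],
)
_query = np.array([0.9, 0.1, 0.0])
# "cat" scores highest but sits in list_words, so it is filtered out
print(most_similar_to_vec(_query, _stub, topn=2, list_words=["cat"]))
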
def show_topic(self, topicid, num_words=10):
    if self.word_topics is None:
        # `logger.warn` is a deprecated alias; use `logger.warning`
        logger.warning("Run train or load_word_topics before showing topics.")
    topic = self.word_topics[topicid]
    topic = topic / topic.sum()  # normalize to probability dist
    bestn = matutils.argsort(topic, num_words, reverse=True)
    beststr = [(topic[id], self.id2word[id]) for id in bestn]
    return beststr

def most_similar(self, words={}, topn=10, restrict_vocab=None):
    """
    Find the top-N most similar words.

    words : a dict where the words are the keys and the weights are the values.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words and the vectors for each word in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.

    If topn is False, most_similar returns the vector of similarity scores.

    `restrict_vocab` is an optional integer which limits the range of vectors which
    are searched for most-similar values. For example, restrict_vocab=10000 would only check
    the first 10000 word vectors in the vocabulary order. (This may be
    meaningful if you've sorted the vocabulary by descending frequency.)

    Example::

      >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
      [('queen', 0.50882536), ...]

    """
    self.init_sims()

    # if isinstance(positive, string_types) and not negative:
    #     # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
    #     positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    # positive = [
    #     (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
    #     for word in positive
    # ]
    # negative = [
    #     (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
    #     for word in negative
    # ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in words.items():
        if isinstance(word, ndarray):
            mean.append(weight * word)
        elif word in self.vocab:
            mean.append(weight * self.syn0norm[self.vocab[word].index])
            all_words.add(self.vocab[word].index)
        else:
            # the original `Warning(...)` built an exception without raising or logging it; log instead
            logger.warning("word '%s' not in vocabulary", word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
    dists = dot(limited, mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]

def _get_topics(self):
    """Internal helper function to return topics from a trained topic model."""
    topics = []
    if isinstance(self.model, LdaModel):
        for topic in self.model.state.get_lambda():
            bestn = argsort(topic, topn=self.topn, reverse=True)
            topics.append(bestn)
    elif isinstance(self.model, LdaVowpalWabbit):
        for topic in self.model._get_topics():
            bestn = argsort(topic, topn=self.topn, reverse=True)
            topics.append(bestn)
    elif isinstance(self.model, LdaMallet):
        for topic in self.model.word_topics:
            bestn = argsort(topic, topn=self.topn, reverse=True)
            topics.append(bestn)
    else:
        raise ValueError(
            "This topic model is not currently supported. Supported topic models"
            " are LdaModel, LdaVowpalWabbit and LdaMallet.")
    return topics

def top_topics_as_word_lists(model, dictionary, topn=20):
    if not dictionary.id2token:
        dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

    str_topics = []
    for topic in model.get_topics():
        bestn = matutils.argsort(topic, topn=topn, reverse=True)
        beststr = [dictionary.id2token[_id] for _id in bestn]
        str_topics.append(beststr)
    return str_topics

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
    """
    For `num_topics` number of topics, return `num_words` most significant words
    (10 words per topic, by default).

    The topics are returned as a list -- a list of strings if `formatted` is
    True, or a list of `(word, probability)` 2-tuples if False.

    If `log` is True, also output this result to log.

    Unlike LSA, there is no natural ordering between the topics in LDA.
    The returned `num_topics <= self.num_topics` subset of all topics is therefore
    arbitrary and may change between two LDA training runs.

    """
    if num_topics < 0 or num_topics >= self.num_topics:
        num_topics = self.num_topics
        chosen_topics = range(num_topics)
    else:
        num_topics = min(num_topics, self.num_topics)

        # add a little random jitter, to randomize results around the same alpha
        sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))

        sorted_topics = list(matutils.argsort(sort_alpha))
        chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]

    shown = []

    topic = self.state.get_lambda()
    for i in chosen_topics:
        topic_ = topic[i]
        topic_ = topic_ / topic_.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic_, num_words, reverse=True)

        topic_ = [(self.id2word[id], topic_[id]) for id in bestn]
        if formatted:
            topic_ = ' + '.join(['%.3f*"%s"' % (v, k) for k, v in topic_])

        shown.append((i, topic_))
        if log:
            logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic_)

    return shown

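# Isolated demo (added) of the alpha-jitter topic selection used by show_topics
# above: topics are argsorted by alpha plus tiny noise, then the lowest- and
# highest-alpha halves are chosen. Names here are illustrative.
import numpy as np
from gensim import matutils

_alpha = np.array([0.3, 0.1, 0.5, 0.1])
_sort_alpha = _alpha + 0.0001 * np.random.RandomState(0).rand(len(_alpha))
_order = list(matutils.argsort(_sort_alpha))  # ascending by jittered alpha
_num_topics = 2
print(_order[:_num_topics // 2] + _order[-_num_topics // 2:])
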
def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None, indexer=None):
    """
    Find the top-N most similar docvecs known from training. Positive docs contribute
    positively towards the similarity, negative docs negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given docs. Docs may be specified as vectors, integer indexes
    of trained docvecs, or if the documents were originally presented with string tags,
    by the corresponding tags.

    The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous
    range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
    there was chosen to be significant, such as more popular tag IDs in lower indexes.)

    """
    self.init_sims()
    clip_end = clip_end or len(self.doctag_syn0norm)

    if isinstance(positive, string_types + integer_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
    positive = [
        (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
        else doc for doc in positive
    ]
    negative = [
        (doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
        else doc for doc in negative
    ]

    # compute the weighted average of all docs
    all_docs, mean = set(), []
    for doc, weight in positive + negative:
        if isinstance(doc, ndarray):
            mean.append(weight * doc)
        elif doc in self.doctags or doc < self.count:
            mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
            all_docs.add(self._int_index(doc))
        else:
            raise KeyError("doc '%s' not in trained set" % doc)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
    # ignore (don't return) docs from the input
    result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs]
    return result[:topn]

def most_similar_cosmul(self, positive=[], negative=[], topn=10):
    """
    Find the top-N most similar words, using the multiplicative combination objective
    proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
    positively towards the similarity, negative words negatively, but with less
    susceptibility to one large distance dominating the calculation.

    In the common analogy-solving case, of two positive and one negative examples,
    this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

    Additional positive or negative examples contribute to the numerator or denominator,
    respectively -- a potentially sensible but untested extension of the method. (With
    a single positive example, rankings will be the same as in the default most_similar.)

    Example::

      >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
      [(u'iraq', 0.8488819003105164), ...]

    .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

    """
    self.init_sims()

    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
        positive = [positive]

    all_words = set([
        self.vocab[word].index for word in positive + negative
        if not isinstance(word, ndarray) and word in self.vocab
    ])

    positive = [
        self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
        for word in positive
    ]
    negative = [
        self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
        for word in negative
    ]

    if not positive:
        raise ValueError("cannot compute similarity with no input")

    # equation (4) of Levy & Goldberg "Linguistic Regularities...",
    # with distances shifted to [0,1] per footnote (7)
    pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
    neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
    dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)

    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]

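# Minimal numpy sketch (added) of the 3CosMul objective computed above, with
# cosine similarities shifted to [0, 1]; the toy unit vectors are illustrative.
import numpy as np

_vocab = np.array([[1.0, 0.0], [0.0, 1.0], [0.7071, 0.7071]])
_positive = [_vocab[0], _vocab[2]]
_negative = [_vocab[1]]
_pos = np.prod([(1 + _vocab @ t) / 2 for t in _positive], axis=0)
_neg = np.prod([(1 + _vocab @ t) / 2 for t in _negative], axis=0)
_scores = _pos / (_neg + 0.000001)
print(_scores.argmax())  # -> 0: close to both positives, far from the negative
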
def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
               coherence='u_mass', topn=20, processes=-1):
    """Get the topics sorted by coherence.

    Parameters
    ----------
    corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
        Training corpus. Can be either iterable of documents, which are lists of `(word_id, word_count)`,
        or a sparse csc matrix of BOWs for each document.
    texts : list of list of str, optional
        Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
        probability estimator.
    dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Dictionary mapping of id word to create corpus.
        If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
    window_size : int, optional
        Is the size of the window to be used for coherence measures using boolean sliding window as their
        probability estimator. For 'u_mass' this doesn't matter.
        If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
    coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
        Coherence measure to be used.
        Fastest method - 'u_mass', 'c_uci' also known as `c_pmi`.
        For 'u_mass' corpus should be provided, if texts is provided, it will be converted to corpus
        using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided (`corpus` isn't needed).
    topn : int, optional
        Integer corresponding to the number of top words to be extracted from each topic.
    processes : int, optional
        Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
        num_cpus - 1.

    Returns
    -------
    list of (list of (int, str), float)
        Each element in the list is a pair of a topic representation and its coherence score. Topic representations
        are distributions of words, represented as a list of pairs of word IDs and their probabilities.

    """
    cm = CoherenceModel(
        model=self, corpus=corpus, texts=texts, dictionary=dictionary,
        window_size=window_size, coherence=coherence, topn=topn,
        processes=processes
    )
    coherence_scores = cm.get_coherence_per_topic()

    str_topics = []
    for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
        bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
        beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # membership, token
        str_topics.append(beststr)  # list of topn (float membership, token) tuples

    scored_topics = zip(str_topics, coherence_scores)
    return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)

def _get_topics_from_model(model, topn):
    """Internal helper function to return topics from a trained topic model."""
    try:
        return [
            matutils.argsort(topic, topn=topn, reverse=True)
            for topic in model.get_topics()
        ]
    except AttributeError:
        raise ValueError(
            "This topic model is not currently supported. Supported topic models"
            " should implement the `get_topics` method.")

def testAccumulatorCachingWithModelSetting(self):
    kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
    cm1 = CoherenceModel(topics=self.topics1, **kwargs)
    cm1.estimate_probabilities()
    self.assertIsNotNone(cm1._accumulator)

    cm1.model = self.ldamodel
    topics = []
    for topic in self.ldamodel.state.get_lambda():
        bestn = argsort(topic, topn=cm1.topn, reverse=True)
        topics.append(bestn)

    self.assertTrue(np.array_equal(topics, cm1.topics))
    self.assertIsNone(cm1._accumulator)

def show_topic(self, topicid, topn=10, num_words=None):
    if num_words is not None:  # deprecated num_words is used
        warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
        topn = num_words
    if self.word_topics is None:
        logger.warning("Run train or load_word_topics before showing topics.")
    topic = self.word_topics[topicid]
    topic = topic / topic.sum()  # normalize to probability dist
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
    return beststr

def get_topic_terms(self, topicid, topn=10):
    """
    Return a list of `(word_id, probability)` 2-tuples for the most probable
    words in topic `topicid`.

    Only return 2-tuples for the topn most probable words (ignore the rest).

    """
    topic = self.state.get_lambda()[topicid]
    topic = topic / topic.sum()  # normalize to probability distribution
    bestn = matutils.argsort(topic, topn, reverse=True)
    return [(id, topic[id]) for id in bestn]

def most_similar(self, sWord, iTopN=10, fMinDist=-1.0):
    npaWord_unit = self.getUnitVector(sWord)
    if npaWord_unit is None:
        return None

    npaCosineSimilarities = np.dot(self.npaWordEmbeddings_units, npaWord_unit)
    npaBestIndices = \
        matutils.argsort(npaCosineSimilarities, topn=iTopN + 1, reverse=True)

    # npaBestIndices[1:] - ignore the first one (which is sWord itself)
    return [(self.oVocab.index2word(x), npaCosineSimilarities[x])
            for x in npaBestIndices[1:] if npaCosineSimilarities[x] > fMinDist]

def most_similar_simple(self, sWord, iTopN=10):
    npaWordEmbedding = self[sWord]
    if npaWordEmbedding is None:
        return None

    npaSimilarities = np.dot(self.npaWordEmbeddings, npaWordEmbedding)
    npaBestIndices = \
        matutils.argsort(npaSimilarities, topn=iTopN + 1, reverse=True)

    # npaBestIndices[1:] - ignore the first one (which is sWord itself)
    return [(self.oVocab.index2word(x), npaSimilarities[x])
            for x in npaBestIndices[1:]]

def print_topic(self, topic, time=0, top_terms=20):
    """
    Topic is the topic number.
    Time is for a particular time_slice.
    top_terms is the number of terms to display.
    """
    topic = self.topic_chains[topic].e_log_prob
    topic = numpy.transpose(topic)
    topic = numpy.exp(topic[time])
    topic = topic / topic.sum()
    bestn = matutils.argsort(topic, top_terms, reverse=True)
    beststr = [(self.id2word[id_], round(topic[id_], 3)) for id_ in bestn]
    return beststr

def show_topic(self, topicid, topn=10, num_words=None):
    if num_words is not None:  # deprecated num_words is used
        logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
        logger.warning("Please use topn instead.")
        topn = num_words
    if self.word_topics is None:
        logger.warning("Run train or load_word_topics before showing topics.")
    topic = self.word_topics[topicid]
    topic = topic / topic.sum()  # normalize to probability dist
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
    return beststr

def show_topic(self, topicid, topn=10):
    """
    Return a list of `(word_probability, word)` 2-tuples for the most probable
    words in topic `topicid`.

    Only return 2-tuples for the topn most probable words (ignore the rest).

    """
    topic = self.state.get_lambda()[topicid]
    topic = topic / topic.sum()  # normalize to probability distribution
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(topic[id], self.id2word[id]) for id in bestn]
    return beststr

def reject_words_1(A, B, model=model):
    """Takes two **lists of words** and returns most_similar results for the words
    in A, while rejecting words with meanings closer to B. Seems to work better
    than just passing B as negative words.
    """
    in_words = A + B
    basic_word = [model[each] for each in A]
    reject_word = [model[each] for each in B]
    basic_mean = matutils.unitvec(array(basic_word).mean(axis=0)).astype(REAL)
    reject_mean = matutils.unitvec(array(reject_word).mean(axis=0)).astype(REAL)
    r = reject(basic_mean, reject_mean)
    dists = np.linalg.linalg.dot(model.syn0norm, r)
    best = matutils.argsort(dists, topn=500, reverse=True)
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best if model.index2word[sim] not in in_words]
    return result

def mostSimilarSent(self, sent, query, allDoc, topn):
    words2 = query.split()
    try:
        words2.remove(u'\ufeff')  # drop a leading BOM token, if present
    except ValueError:
        pass
    v2 = numpy.array([self[word] for word in words2], dtype=object)
    mean = matutils.unitvec(array(v2).mean(axis=0))
    print("starting search dist")
    dists = dot(allDoc[0:None], mean)
    best = matutils.argsort(dists, topn, reverse=True)
    print("done!")
    result = []
    for index in best:
        result.append(sent[index])
    return result

def temporal_change(ldaseq, whichtopic=0, wordtime=0, npick=5):
    # for topic `whichtopic`, pick the top `npick` words at time `wordtime`
    # and see their frequency evolution
    # TODO: pick random 5 words (top words at different times)
    topicP = ldaseq.topic_chains[whichtopic].e_log_prob
    topicP = np.transpose(topicP)
    wordids = matutils.argsort(topicP[wordtime], npick, reverse=True)
    wfreqs = np.empty((len(time_slice), npick))
    for kt in range(len(time_slice)):
        topic = np.exp(topicP[kt])
        topic = topic / sum(topic)
        wfreqs[kt] = np.array([topic[id_] for id_ in wordids])
    plt.plot(wfreqs, '-+')
    plt.yticks(wfreqs[0], [ldaseq.id2word[id_] for id_ in wordids])
    plt.xticks(np.arange(len(time_slice)))
    plt.title('topic %d' % (whichtopic + 1))
    plt.show()

def aysn_file_flush(dists_all_temp, prei_temp):
    sim_num = dists_all_temp.shape[1]
    line_msgs = ''
    for j in range(sim_num):
        real_index = prei_temp + j
        dists = dists_all_temp[:, j]
        uquery = index2word[real_index]
        best = matutils.argsort(dists, 100, reverse=True)
        bestwords = [
            index2word[simindex] + '(' + str(dists[simindex]) + ')'
            for simindex in best
            if simindex != real_index and dists[simindex] >= 0.5
        ]
        if len(bestwords) == 0:
            continue
        line_msg = uquery + '\t' + ' '.join(bestwords)
        line_msgs += line_msg + '\n'
    mutex.acquire()
    simoutpathfile.write(line_msgs.encode('utf-8'))
    simoutpathfile.flush()
    mutex.release()

def paper2vec_recommend(context):
    """Make recommendations based on the Paper2vec vectors."""
    # if not hasattr(papervecs, 'syn0'):
    #     raise RuntimeError("Parameters required for predicting the output words not found.")
    topn = 500
    context_words_list = context.split()
    sleep(0.3)
    # REMEMBER: here, papervecs.wv.vocab contains not words, but docids
    # Use the doc2vec wv
    word_vocabs = [model.wv.vocab[w] for w in context_words_list if w in model.wv.vocab]
    word2_indices = [word.index for word in word_vocabs]
    l1 = np.sum(model.wv.syn0[word2_indices], axis=0)
    if word2_indices:
        l1 /= len(word2_indices)
    prob_values = np.exp(np.dot(l1, papervecs.syn0.T))
    # prob_values = np.exp(np.dot(l1, model.docvecs.doctag_syn0.T))
    prob_values = np.nan_to_num(prob_values)
    prob_values /= sum(prob_values)
    # some of the vectors in papervecs stand for docs, some just for words
    # (where are these ids coming from?)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    return [papervecs.index2entity[index1] for index1 in top_indices]

def show_topic(self, topicno, topn=10):
    """
    Return a specified topic (=left singular vector), 0 <= `topicno` < `self.num_topics`,
    as a string.

    Return only the `topn` words which contribute the most to the direction
    of the topic (both negative and positive).

    >>> lsimodel.show_topic(10, topn=5)
    [("category", -0.340), ("$M$", 0.298), ("algebra", 0.183), ("functor", -0.174), ("operator", -0.168)]

    """
    # size of the projection matrix can actually be smaller than `self.num_topics`,
    # if there were not enough factors (real rank of input matrix smaller than
    # `self.num_topics`). in that case, return an empty string
    if topicno >= len(self.projection.u.T):
        return ''
    c = np.asarray(self.projection.u.T[topicno, :]).flatten()
    norm = np.sqrt(np.sum(np.dot(c, c)))
    most = matutils.argsort(np.abs(c), topn, reverse=True)
    return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]

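# Runnable sketch (added) of show_topic on a small gensim LsiModel; the toy
# corpus and underscore-prefixed names are illustrative only.
from gensim.corpora import Dictionary
from gensim.models import LsiModel

_texts = [["human", "interface", "computer"],
          ["graph", "trees", "minors"],
          ["graph", "minors", "survey"]]
_dct = Dictionary(_texts)
_corpus = [_dct.doc2bow(t) for t in _texts]
_lsi = LsiModel(_corpus, id2word=_dct, num_topics=2)
print(_lsi.show_topic(0, topn=3))  # [(word, signed contribution), ...]
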
def get_topic_terms(self, topic_id, topn=10, readable=True):
    # TODO move this and similar methods to parent class
    """
    Args:
        topic_id: id of the topic to query.
        topn: number of top terms to return.
        readable: if False returns term_id, if True returns the actual word.

    Returns:
        A list of tuples (term, prob) of the topn terms in topic_id,
        formatted according to `readable`.
    """
    topic_term_probs = self.phi[topic_id]
    bestn = matutils.argsort(topic_term_probs, topn, reverse=True)
    if readable:
        return [(self.id2word[idx], topic_term_probs[idx]) for idx in bestn]
    else:
        return [(idx, topic_term_probs[idx]) for idx in bestn]

def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None):
    if dictionary is None:
        raise ValueError('no dictionary!')
    if topic_data is not None:
        topics = topic_data
    elif topic_file is not None:
        topics = np.loadtxt('%s' % topic_file)
    else:
        raise ValueError('no topic data!')

    # sort topics
    topics_sums = np.sum(topics, axis=1)
    idx = matutils.argsort(topics_sums, reverse=True)
    self.data = topics[idx]

    self.dictionary = dictionary

    if style is None:
        style = self.STYLE_GENSIM
    self.style = style

def show_topics(self, num_topics: int = 10, num_words: int = 10,
                log: bool = False) -> list[tuple[int, list[tuple[str, float]]]]:
    """Get the `num_words` most probable words for `num_topics` number of topics.

    Parameters
    ----------
    num_topics : int, optional
        Number of topics to return, set `-1` to get all topics.
    num_words : int, optional
        Number of words.
    log : bool, optional
        If True - write topic with logging too, used for debug purposes.

    Returns
    -------
    list of (int, list of (str, float))
        Topics as a list of `(topic id, topic words)` pairs.

    """
    if num_topics < 0 or num_topics >= self.num_topics:
        num_topics = self.num_topics
        chosen_topics = range(num_topics)
    else:
        num_topics = min(num_topics, self.num_topics)
        # add a little random jitter, to randomize results around the same alpha
        sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))
        sorted_topics = list(matutils.argsort(sort_alpha))
        chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
    shown = []
    for i in chosen_topics:
        topic = self.show_topic(i, topn=num_words)
        shown.append((i, topic))
        if log:
            logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic)
    return shown

def compute_dt_dist(docs, labels, tags, model, max_len, batch_size, pad_id, idxvocab, output_file):
    # generate batches
    num_batches = int(math.ceil(float(len(docs)) / batch_size))
    dt_dist = []
    t = []
    combined = []
    docid = 0
    for i in range(num_batches):
        x, _, _, t, s = get_batch_doc(docs, labels, tags, i, max_len, cf.tag_len, batch_size, pad_id)
        attention, mean_topic = sess.run([model.attention, model.mean_topic], {model.doc: x, model.tag: t})
        dt_dist.extend(attention[:s])

        if debug:
            for si in range(s):
                d = x[si]
                print("\n\nDoc", docid, "=", " ".join([idxvocab[item] for item in d if (item != pad_id)]))
                sorted_dist = matutils.argsort(attention[si], reverse=True)
                for ti in sorted_dist:
                    print("Topic", ti, "=", attention[si][ti])
                docid += 1

    np.save(open(output_file, "wb"), dt_dist)  # np.save needs a binary-mode file handle

def show_topic(self, topicid, time, topn=50, num_words=None):
    """
    Return `num_words` most probable words for the given `topicid`, as a list of
    `(word_probability, word)` 2-tuples.

    """
    if num_words is not None:  # deprecated num_words is used
        warnings.warn(
            "The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead."
        )
        topn = num_words

    topics = self.lambda_[:, :, time]
    topic = topics[topicid]
    # likelihood to probability
    topic = np.exp(topic)
    # normalize to probability dist
    topic = topic / topic.sum()
    # sort according to prob
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
    return beststr

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
    """
    For `num_topics` number of topics, return `num_words` most significant words
    (10 words per topic, by default).

    The topics are returned as a list -- a list of strings if `formatted` is
    True, or a list of (probability, word) 2-tuples if False.

    If `log` is True, also output this result to log.

    Unlike LSA, there is no natural ordering between the topics in LDA.
    The returned `num_topics <= self.num_topics` subset of all topics is therefore
    arbitrary and may change between two LDA training runs.

    """
    if num_topics < 0 or num_topics >= self.num_topics:
        num_topics = self.num_topics
        chosen_topics = range(num_topics)
    else:
        num_topics = min(num_topics, self.num_topics)
        # add a little random jitter, to randomize results around the same alpha
        sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))
        sorted_topics = list(matutils.argsort(sort_alpha))
        chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
    shown = []
    for i in chosen_topics:
        if formatted:
            topic = self.print_topic(i, topn=num_words)
        else:
            topic = self.show_topic(i, topn=num_words)
        shown.append(topic)
        if log:
            logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i], topic))
    return shown

def get_nodes_link_to(model, node, topn):
    """
    Re-use implementation of gensim.

    :param model: gensim.models.Word2Vec
    :param node: node whose likely out-links are predicted
    :param topn: number of nodes to return
    :return: list of (node, probability) pairs
    """
    word_vocabs = [model.wv.vocab[node]]
    word2_indices = [word.index for word in word_vocabs]
    l1 = np.sum(model.trainables.syn1neg[word2_indices], axis=0)
    if word2_indices and model.cbow_mean:
        l1 /= len(word2_indices)
    prob_values = np.exp(np.dot(l1, model.wv.vectors.T))
    prob_values /= sum(prob_values)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    return [(model.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]

def predict_output_word(model, context_words_list, topn=10, do_sorting=True,
                        possible_mutations=None, vocab_indices=None):
    """Modified function from method of Word2Vec class from gensim library."""
    word_vocabs = [model.wv.vocab[w] for w in context_words_list if w in model.wv.vocab]
    word2_indices = [word.index for word in word_vocabs]

    l1 = np.sum(model.wv.vectors[word2_indices], axis=0)
    if word2_indices and model.cbow_mean:
        l1 /= len(word2_indices)

    # propagate hidden -> output and take softmax to get probabilities
    prob_values = np.exp(np.dot(l1, model.trainables.syn1neg.T))
    prob_values /= sum(prob_values)
    if do_sorting:
        top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    else:
        top_indices = list(range(len(prob_values)))[:topn]
    if not possible_mutations and not vocab_indices:
        # returning the most probable output words with their probabilities
        return [(model.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]
    elif vocab_indices:
        return [prob_values[index1] for index1 in vocab_indices]
    else:
        return [
            prob_values[index1] for index1 in top_indices
            if model.wv.index2word[index1] in possible_mutations
        ]

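# Toy numpy illustration (added) of the hidden -> output softmax step performed
# in predict_output_word above: average the context vectors, project onto the
# output-layer weights, exponentiate and normalize. Values are illustrative.
import numpy as np

_context_vecs = np.array([[1.0, 0.0], [0.5, 0.5]])         # input vectors of the context words
_syn1neg = np.array([[1.0, 0.0], [0.0, 1.0], [0.6, 0.8]])  # output-layer weights, one row per word
_l1 = _context_vecs.sum(axis=0) / len(_context_vecs)       # CBOW mean, as in the function
_prob = np.exp(_syn1neg @ _l1)
_prob /= _prob.sum()
print(_prob.argmax(), _prob)  # most probable output word index and the full distribution
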
def show_topic(self, topicid, time, topn=50, num_words=None):
    """
    Return `num_words` most probable words for the given `topicid`, as a list of
    `(word_probability, word)` 2-tuples.

    """
    if num_words is not None:  # deprecated num_words is used
        logger.warning(
            "The parameter num_words for show_topic() would be deprecated in the updated version."
        )
        logger.warning("Please use topn instead.")
        topn = num_words

    topics = self.lambda_[:, :, time]
    topic = topics[topicid]
    # likelihood to probability
    topic = np.exp(topic)
    # normalize to probability dist
    topic = topic / topic.sum()
    # sort according to prob
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(topic[id], self.id2word[id]) for id in bestn]
    return beststr

def hd2v_recommend(context):
    """Recommend based on the hyperdoc2vec model using IN and OUT vectors."""
    topn = 500
    context_words_list = context.split()
    word_vocabs = [hd2vmodel.wv.vocab[w] for w in context_words_list if w in hd2vmodel.wv.vocab]
    word2_indices = [word.index for word in word_vocabs]
    sleep(0.2)
    # Get the sum of the IN word vectors
    l1 = np.sum(hd2vmodel.wv.syn0[word2_indices], axis=0)
    # And the sum of the OUT word vectors
    l2 = np.sum(hd2vmodel.syn1neg[word2_indices], axis=0)
    if word2_indices:
        l2 /= len(word2_indices)
        l1 /= len(word2_indices)
    # Following hd2v code, e^(sumwvIN.docvecIN + sumwvOUT.docvecOUT)
    prob_values = exp(dot(l1, hd2vmodel.docvecs.doctag_syn1neg.T) +
                      dot(l2, hd2vmodel.docvecs.doctag_syn0.T))
    prob_values = nan_to_num(prob_values)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    return [hd2vmodel.docvecs.offset2doctag[index1] for index1 in top_indices]

def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
    Find the most similar sentences: dot product of the query vector with the matrix.
    :param vec_ques: query sentence vector
    :param matrix_org_norm: row-normalized matrix of candidate sentence vectors
    :param matrix_org_index: mapping from matrix rows to sentence indices
    :param top_vec: number of results to return
    :return: list of [index, score] pairs
    """
    # normalize the query vector: scale a vector to unit length. The only exception
    # is the zero vector, which is returned back unchanged.
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # dot product, i.e. the query against every sentence in the standard question bank
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # sort by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # collect the index and score of the most similar standard questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score

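# Toy runnable version (added) of the unit-vector + dot-product ranking used in
# calculate_text_similar above; the matrix rows are illustrative unit vectors.
import numpy as np
from gensim import matutils

_matrix_org_norm = np.array([[1.0, 0.0], [0.6, 0.8], [0.0, 1.0]])  # normalized sentence vectors
_vec_ques = matutils.unitvec(np.array([0.9, 0.1]))
_sims = _matrix_org_norm @ _vec_ques
print(matutils.argsort(_sims, topn=2, reverse=True))  # indices of the 2 closest sentences
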
def show_topic(self, topicid, time, topn=50, num_words=None):
    """Get the `topn` most probable words for the given `topicid`.

    Parameters
    ----------
    topicid : int
        Id of topic.
    time : int
        Timestamp.
    topn : int, optional
        Number of most probable words to return.
    num_words : int, optional
        DEPRECATED PARAMETER, use `topn` instead.

    Returns
    -------
    list of (float, str)
        Sequence of probable words, as a list of `(word_probability, word)`.

    """
    if num_words is not None:  # deprecated num_words is used
        warnings.warn(
            "The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead."
        )
        topn = num_words

    topics = self.lambda_[:, :, time]
    topic = topics[topicid]
    # likelihood to probability
    topic = np.exp(topic)
    # normalize to probability dist
    topic = topic / topic.sum()
    # sort according to prob
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
    return beststr

def top_topics(self, corpus, num_words=20):
    """
    Calculate the UMass topic coherence for each topic. Algorithm from
    **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic
    Coherence in Topic Models, EMNLP 2011.**
    """
    is_corpus, corpus = utils.is_corpus(corpus)
    if not is_corpus:
        logger.warning("LdaModel.top_topics() called with an empty corpus")
        return

    topics = []
    str_topics = []
    for topic in self.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn=num_words, reverse=True)
        topics.append(bestn)
        beststr = [(topic[id], self.id2word[id]) for id in bestn]
        str_topics.append(beststr)

    # top_ids are limited to every topic's top words. should not exceed the
    # vocabulary size.
    top_ids = set(chain.from_iterable(topics))

    # create a document occurrence sparse matrix for each word
    doc_word_list = {}
    for id in top_ids:
        id_list = set()
        for n, document in enumerate(corpus):
            if id in frozenset(x[0] for x in document):
                id_list.add(n)
        doc_word_list[id] = id_list

    coherence_scores = []
    for t, top_words in enumerate(topics):
        # Calculate each coherence score C(t, top_words)
        coherence = 0.0
        # Sum of top words m=2..M
        for m in top_words[1:]:
            # m_docs is v_m^(t)
            m_docs = doc_word_list[m]
            m_index = int(numpy.where(top_words == m)[0])
            # Sum of top words l=1..m-1,
            # i.e., all words ranked higher than the current word m
            # (slice up to m_index; the original `[:m_index - 1]` dropped one word)
            for l in top_words[:m_index]:
                # l_docs is v_l^(t)
                l_docs = doc_word_list[l]
                # make sure this word appears in some documents.
                if len(l_docs) > 0:
                    # co_doc_frequency is D(v_m^(t), v_l^(t))
                    co_doc_frequency = len(m_docs.intersection(l_docs))
                    # add to the coherence sum for these two words m, l
                    coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))
        coherence_scores.append((str_topics[t], coherence))

    top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
    return top_topics

def label_rank(self, X):
    scores = self.scores(X)
    return scores, argsort(scores, reverse=True)

def most_similar(
    self,
    positive: [int, ndarray] = None,
    negative: [int, ndarray] = None,
    indexable: [IndexedList, IndexedLineDocument] = None,
    topn: int = 10,
    restrict_size: [int, Tuple[int, int]] = None,
) -> List[Tuple[int, float]]:
    """Find the top-N most similar sentences.

    Positive sentences contribute positively towards the similarity, negative
    sentences negatively. This method computes cosine similarity between a simple
    mean of the projection weight vectors of the given sentences and the vectors
    for each sentence in the model.

    Parameters
    ----------
    positive : list of int, optional
        List of indices that contribute positively.
    negative : list of int, optional
        List of indices that contribute negatively.
    indexable : list, IndexedList, IndexedLineDocument
        Provides an indexable object from where the most similar sentences are read.
    topn : int or None, optional
        Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
        then similarities for all sentences are returned.
    restrict_size : int or Tuple(int, int), optional
        Optional integer which limits the range of vectors which are searched for most-similar values.
        For example, restrict_size=10000 would only check the first 10000 sentence vectors.
        restrict_size=(500, 1000) would search the sentence vectors with indices between
        500 and 1000.

    Returns
    -------
    list of (int, float) or list of (str, int, float)
        A sequence of (index, similarity) is returned.
        When an indexable is provided, returns (str, index, similarity).
        When `topn` is None, then similarities for all words are returned as a
        one-dimensional numpy array with the size of the vocabulary.

    """
    if indexable is not None and not hasattr(indexable, "__getitem__"):
        raise RuntimeError("Indexable must provide __getitem__")
    if positive is None:
        positive = []
    if negative is None:
        negative = []

    self.init_sims()

    if isinstance(positive, (int, integer)) and not negative:
        positive = [positive]
    if isinstance(positive, (ndarray)) and not negative:
        if len(positive.shape) == 1:
            positive = [positive]

    positive = [
        (sent, 1.0) if isinstance(sent, (int, integer, ndarray)) else sent
        for sent in positive
    ]
    negative = [
        (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent
        for sent in negative
    ]

    all_sents, mean = set(), []
    for sent, weight in positive + negative:
        if isinstance(sent, ndarray):
            mean.append(weight * sent)
        else:
            mean.append(weight * self.get_vector(index=sent, use_norm=True))
            if sent in self:
                all_sents.add(sent)

    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    if isinstance(restrict_size, (int, integer)):
        lo, hi = 0, restrict_size
    elif isinstance(restrict_size, Tuple):
        lo, hi = restrict_size
    else:
        lo, hi = 0, None

    limited = self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi]
    dists = dot(limited, mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_sents), reverse=True)

    best_off = best + lo

    if indexable is not None:
        result = [
            (indexable[off_idx], off_idx, float(dists[idx]))
            for off_idx, idx in zip(best_off, best)
            if off_idx not in all_sents
        ]
    else:
        result = [
            (off_idx, float(dists[idx]))
            for off_idx, idx in zip(best_off, best)
            if off_idx not in all_sents
        ]
    return result[:topn]

def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None):
    """
    Find the top-N most similar docvecs known from training. Positive docs contribute
    positively towards the similarity, negative docs negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given docs. Docs may be specified as vectors, integer indexes
    of trained docvecs, or if the documents were originally presented with string tags,
    by the corresponding tags.

    The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous
    range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
    there was chosen to be significant, such as more popular tag IDs in lower indexes.)

    """
    self.init_sims()
    clip_end = clip_end or len(self.doctag_syn0norm)

    if isinstance(positive, string_types + integer_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
    positive = [
        (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
        else doc for doc in positive
    ]
    negative = [
        (doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
        else doc for doc in negative
    ]

    # compute the weighted average of all docs
    all_docs, mean = set(), []
    for doc, weight in positive + negative:
        if isinstance(doc, ndarray):
            mean.append(weight * doc)
        elif doc in self.doctags or doc < self.count:
            mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
            all_docs.add(self._int_index(doc))
        else:
            raise KeyError("doc '%s' not in trained set" % doc)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
    # ignore (don't return) docs from the input
    result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs]
    return result[:topn]

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True, normalize=None):
    """Get the topics sorted by sparsity.

    Parameters
    ----------
    num_topics : int, optional
        Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in NMF.
        The returned topics subset of all topics is therefore arbitrary and may change between two NMF
        training runs.
    num_words : int, optional
        Number of words to be presented for each topic. These will be the most relevant words
        (assigned the highest probability for each topic).
    log : bool, optional
        Whether the result is also logged, besides being returned.
    formatted : bool, optional
        Whether the topic representations should be formatted as strings. If False, they are returned as
        2-tuples of (word, probability).
    normalize : bool or None, optional
        Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

    Returns
    -------
    list of {str, tuple of (str, float)}
        A list of topics, each represented either as a string (when `formatted` == True) or
        word-probability pairs.

    """
    if normalize is None:
        normalize = self.normalize

    # Compute fraction of zero elements in each column
    sparsity = np.zeros(self._W.shape[1])
    for row in self._W:
        sparsity += (row == 0)
    sparsity /= self._W.shape[0]

    if num_topics < 0 or num_topics >= self.num_topics:
        num_topics = self.num_topics
        chosen_topics = range(num_topics)
    else:
        num_topics = min(num_topics, self.num_topics)

        sorted_topics = list(matutils.argsort(sparsity))
        chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]

    shown = []
    topics = self.get_topics(normalize=normalize)
    for i in chosen_topics:
        topic = topics[i]
        bestn = matutils.argsort(topic, num_words, reverse=True).ravel()
        topic = [(self.id2word[id], topic[id]) for id in bestn]
        if formatted:
            topic = " + ".join(['%.3f*"%s"' % (v, k) for k, v in topic])

        shown.append((i, topic))
        if log:
            logger.info("topic #%i (%.3f): %s", i, sparsity[i], topic)

    return shown

def get_labels(topic_num):
    valdoc2vec = 0.0
    valword2vec = 0.0
    cnt = 0
    store_indices = []
    print("Processing Topic number " + str(topic_num))
    for item in topic_list[topic_num]:
        try:
            # the word2vec value of topic word from doc2vec trained model
            tempdoc2vec = model1.syn0norm[model1.vocab[item].index]
        except Exception:
            pass
        else:
            meandoc2vec = matutils.unitvec(tempdoc2vec).astype(REAL)  # getting the unit vector
            # the dot product of all labels in doc2vec with the unit vector of the topic word
            distsdoc2vec = dot(model1.docvecs.doctag_syn0norm, meandoc2vec)
            valdoc2vec = valdoc2vec + distsdoc2vec

        try:
            # the word2vec value of topic word from word2vec trained model
            tempword2vec = model2.syn0norm[model2.vocab[item].index]
        except Exception:
            pass
        else:
            meanword2vec = matutils.unitvec(tempword2vec).astype(REAL)  # unit vector
            # the dot product of all possible labels in word2vec vocab with the unit vector of the topic word
            distsword2vec = dot(model3, meanword2vec)
            # This next section of code checks if the topic word is also a potential label
            # in the trained word2vec model. If that is the case, it is important that the
            # dot product of the label with that topic word is not taken into account.
            # Hence we make that zero and further down the code also exclude it in taking
            # the average of that label over all topic words.
            if (model2.vocab[item].index) in w_indices:
                i_val = w_indices.index(model2.vocab[item].index)
                store_indices.append(i_val)
                distsword2vec[i_val] = 0.0
            valword2vec = valword2vec + distsword2vec

    # give the average vector over all topic words
    avgdoc2vec = valdoc2vec / float(len(topic_list[topic_num]))
    # average of word2vec vector over all topic words
    avgword2vec = valword2vec / float(len(topic_list[topic_num]))

    # argsort and get top 100 doc2vec label indices
    bestdoc2vec = matutils.argsort(avgdoc2vec, topn=100, reverse=True)
    resultdoc2vec = []
    # get the doc2vec labels from indices
    for elem in bestdoc2vec:
        ind = d_indices[elem]
        temp = model1.docvecs.index_to_doctag(ind)
        resultdoc2vec.append((temp, float(avgdoc2vec[elem])))

    # this modifies the average word2vec vector for cases in which the word2vec label was the same as the topic word
    for element in store_indices:
        avgword2vec[element] = (avgword2vec[element] * len(topic_list[topic_num])) / (float(len(topic_list[topic_num]) - 1))

    # argsort and get top 100 word2vec label indices
    bestword2vec = matutils.argsort(avgword2vec, topn=100, reverse=True)
    # get the word2vec labels from indices
    resultword2vec = []
    for element in bestword2vec:
        ind = w_indices[element]
        temp = model2.index2word[ind]
        resultword2vec.append((temp, float(avgword2vec[element])))

    # get the combined set of both doc2vec labels and word2vec labels
    comb_labels = list(set([i[0] for i in resultdoc2vec] + [i[0] for i in resultword2vec]))
    newlist_doc2vec = []
    newlist_word2vec = []
    # get indices from combined labels
    for elem in comb_labels:
        try:
            newlist_doc2vec.append(d_indices.index(model1.docvecs.doctags[elem].offset))
            temp = get_word(elem)
            newlist_word2vec.append(w_indices.index(model2.vocab[temp].index))
        except Exception:
            pass
    newlist_doc2vec = list(set(newlist_doc2vec))
    newlist_word2vec = list(set(newlist_word2vec))

    # finally again get the labels from indices. We searched for the score from both the doc2vec and word2vec models
    resultlist_doc2vecnew = [
        (model1.docvecs.index_to_doctag(d_indices[elem]), float(avgdoc2vec[elem]))
        for elem in newlist_doc2vec
    ]
    resultlist_word2vecnew = [
        (model2.index2word[w_indices[elem]], float(avgword2vec[elem]))
        for elem in newlist_word2vec
    ]

    # finally get the combined score with the label. The label used will be of doc2vec, not of word2vec.
    new_score = []
    for item in resultlist_word2vecnew:
        k, v = item
        for elem in resultlist_doc2vecnew:
            k2, v2 = elem
            k3 = get_word(k2)
            if k == k3:
                v3 = v + v2
                new_score.append((k2, v3))
    new_score = sorted(new_score, key=lambda x: x[1], reverse=True)
    return new_score[:(int(args.num_cand_labels))]

def most_similar_cosmul(self, positive=None, negative=None, topn=10):
    """
    Find the top-N most similar words, using the multiplicative combination objective
    proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
    positively towards the similarity, negative words negatively, but with less
    susceptibility to one large distance dominating the calculation.

    In the common analogy-solving case, of two positive and one negative examples,
    this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

    Additional positive or negative examples contribute to the numerator or denominator,
    respectively -- a potentially sensible but untested extension of the method. (With
    a single positive example, rankings will be the same as in the default most_similar.)

    Example::

      >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
      [(u'iraq', 0.8488819003105164), ...]

    .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

    """
    if positive is None:
        positive = []
    if negative is None:
        negative = []

    self.init_sims()

    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
        positive = [positive]

    all_words = {
        self.vocab[word].index for word in positive + negative
        if not isinstance(word, ndarray) and word in self.vocab
    }

    positive = [
        self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
        for word in positive
    ]
    negative = [
        self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
        for word in negative
    ]

    if not positive:
        raise ValueError("cannot compute similarity with no input")

    # equation (4) of Levy & Goldberg "Linguistic Regularities...",
    # with distances shifted to [0,1] per footnote (7)
    pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
    neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
    dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)

    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]

def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
    """
    Find the top-N most similar words. Positive words contribute positively towards the
    similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words and the vectors for each word in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.

    If topn is False, most_similar returns the vector of similarity scores.

    `restrict_vocab` is an optional integer which limits the range of vectors which
    are searched for most-similar values. For example, restrict_vocab=10000 would only check
    the first 10000 word vectors in the vocabulary order. (This may be
    meaningful if you've sorted the vocabulary by descending frequency.)

    Example::

      >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
      [('queen', 0.50882536), ...]

    """
    if positive is None:
        positive = []
    if negative is None:
        negative = []

    self.init_sims()

    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * self.word_vec(word, use_norm=True))
            if word in self.vocab:
                all_words.add(self.vocab[word].index)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
    dists = dot(limited, mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]

def best_items(topic, dictionary, f, n):
    """Return ids of the `n` highest-scoring items in `topic` that pass the filter `f(dictionary, id)`."""
    data = [(i, score) for i, score in enumerate(topic) if score > 0 and f(dictionary, i)]
    if not data:
        return []
    indices, scores = zip(*data)
    return [indices[i] for i in matutils.argsort(scores, n, reverse=True)]
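# Hypothetical usage of best_items, assuming the function above is in scope.
# The filter keeps only terms that occur in at least `min_df` documents,
# using the document frequencies a gensim Dictionary tracks in `dfs`:

from gensim import matutils
from gensim.corpora import Dictionary

texts = [["cat", "dog"], ["dog", "bird"], ["dog", "bird"]]
dictionary = Dictionary(texts)  # dictionary.dfs holds per-term document frequencies

def frequent_enough(dictionary, word_id, min_df=2):
    return dictionary.dfs.get(word_id, 0) >= min_df

topic_scores = [0.6, 0.5, 0.4]  # toy per-term scores, aligned with dictionary ids
top_ids = best_items(topic_scores, dictionary, frequent_enough, n=2)
print([dictionary[i] for i in top_ids])  # 'cat' is filtered out despite its high score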
def new_accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
    """
    Compute accuracy of the model. `questions` is a filename where lines are 4-tuples
    of words (or multi-word phrases) separated by " | ", split into sections by
    ": SECTION NAME" lines. See questions-words.txt in
    https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
    for an example of the underlying analogy format.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
    words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
    In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
    case normalization is performed.

    Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
    evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
    and question words. In case of multiple case variants of a single word, the vector for the first
    occurrence (also the most frequent if vocabulary is sorted) is taken.

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    logger.info("using new accuracy")
    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
    oov_counter, idx_cnt, is_vn_counter = 0, 0, 0
    sections, section = [], None
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                self.log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            # count the number of analogies to check
            idx_cnt += 1
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                if case_insensitive:
                    a, b, c, expected = [word.upper() for word in line.split(" | ")]
                else:
                    a, b, c, expected = [word for word in line.split(" | ")]
            except ValueError:
                logger.info("skipping invalid line #%i in %s: %s", line_no, questions, line.strip())
                continue

            if " " in a or " " in b or " " in c or " " in expected:
                # in case of Vietnamese, an analogy item can be a multi-word phrase;
                # such questions bypass the restricted-vocabulary filter below
                is_vn_counter += 1
            elif a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                oov_counter += 1
                continue

            original_vocab = self.vocab
            self.vocab = ok_vocab
            ignore = {a, b, c}  # input words to be ignored
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
            self.vocab = original_vocab
            for index in matutils.argsort(sims, reverse=True):
                predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
                if predicted in ok_vocab and predicted not in ignore:
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                    break
            if predicted == expected:
                section['correct'].append((a, b, c, expected))
            else:
                section['incorrect'].append((a, b, c, expected))
    if section:
        # store the last section, too
        sections.append(section)
        self.log_accuracy(section)

    total = {
        'OOV/Total/VNCompound_Words': [oov_counter, idx_cnt, is_vn_counter],
        'section': 'total',
        'correct': sum((s['correct'] for s in sections), []),
        'incorrect': sum((s['incorrect'] for s in sections), []),
    }
    self.log_accuracy(total)
    sections.append(total)
    return sections
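# Note that new_accuracy splits analogy lines on " | " rather than on
# whitespace, so each of the four items may itself be a multi-word phrase
# (the Vietnamese-compound case counted above). A minimal hypothetical
# questions file in this format:

sample = u"""\
: capital-common-countries
hà nội | việt nam | paris | pháp
baghdad | iraq | london | england
"""
with open("questions-vn.txt", "w", encoding="utf8") as fout:
    fout.write(sample)
# model.new_accuracy("questions-vn.txt") would then evaluate both questions,
# counting the first under VNCompound_Words; phrase questions can only be
# answered if the model's vocabulary actually contains those phrase tokens.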
def show_topic(self, topicid, topn=10):
    topic = self.wordtopics[topicid]
    topic = topic / topic.sum()  # normalize to probability distribution
    bestn = matutils.argsort(topic, topn, reverse=True)
    beststr = [(topic[wordid], self.id2word[wordid]) for wordid in bestn]
    return beststr
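# A tiny self-contained check of the method above, using a hypothetical
# stand-in class that carries the `wordtopics` matrix and `id2word` mapping
# the method expects (matutils is assumed imported, as elsewhere in this file):

import numpy as np
from gensim import matutils

class TinyTopicModel:
    wordtopics = np.array([[4.0, 1.0, 5.0]])  # 1 topic x 3 terms, toy counts
    id2word = {0: "cat", 1: "dog", 2: "fish"}
    show_topic = show_topic  # reuse the function defined above as a method

print(TinyTopicModel().show_topic(0, topn=2))  # 'fish' and 'cat', the two most probable terms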
def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
               coherence='u_mass', topn=20, processes=-1):
    """Get the topics sorted by coherence.

    Parameters
    ----------
    corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
        Training corpus. Can be either iterable of documents, which are lists of `(word_id, word_count)`,
        or a sparse csc matrix of BOWs for each document.
    texts : list of list of str, optional
        Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
        probability estimator.
    dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Dictionary mapping of id to word, used to create the corpus. If `model.id2word` is present,
        this is not needed. If both are provided, the passed `dictionary` will be used.
    window_size : int, optional
        Size of the window to be used for coherence measures using boolean sliding window as their
        probability estimator. For 'u_mass' this doesn't matter.
        If None, the default window sizes are used: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
    coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
        Coherence measure to be used. The fastest method is 'u_mass'; 'c_uci' is also known as `c_pmi`.
        For 'u_mass' a corpus should be provided; if texts is provided, it will be converted to a corpus
        using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided
        (`corpus` isn't needed).
    topn : int, optional
        Integer corresponding to the number of top words to be extracted from each topic.
    processes : int, optional
        Number of processes to use for probability estimation phase; any value less than 1 will be
        interpreted as num_cpus - 1.

    Returns
    -------
    list of (list of (float, str), float)
        Each element in the list is a pair of a topic representation and its coherence score.
        A topic representation is the list of the topic's `topn` most probable words,
        as (probability, word) pairs.

    """
    cm = CoherenceModel(
        model=self, corpus=corpus, texts=texts, dictionary=dictionary,
        window_size=window_size, coherence=coherence, topn=topn, processes=processes)
    coherence_scores = cm.get_coherence_per_topic()

    str_topics = []
    for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
        bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
        beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # (membership, token)
        str_topics.append(beststr)  # list of topn (float membership, token) tuples

    scored_topics = zip(str_topics, coherence_scores)
    return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
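# Hypothetical end-to-end usage of top_topics on a toy corpus; 'u_mass'
# coherence needs only the bag-of-words corpus, so no raw texts are passed:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [
    ["human", "interface", "computer"],
    ["graph", "trees", "computer"],
    ["graph", "minors", "trees"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5, random_state=0)

for topic, coherence in lda.top_topics(corpus, coherence='u_mass', topn=3):
    print("%.4f" % coherence, [word for _, word in topic])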
def get_topic_extraction_detail(self, message, id):
    tf_vectorizer = CountVectorizer(max_df=1, min_df=1, vocabulary=self.glda_tf_feature_names)
    message = re.sub('\n', ' ', message)
    docs = self.message_corpus(message)

    print('Building BiGrams from the message...')
    bigram = Phrases(docs, min_count=2, threshold=2, delimiter=b' ')
    logger.propagate = False
    bigram_phraser = Phraser(bigram)
    texts = [bigram_phraser[line] for line in docs]
    bg_message = ' '.join(texts[0])
    tf = tf_vectorizer.fit_transform([bg_message])

    print('Extracting topics...')
    doc_topic = self.glda.transform(tf)
    document_topics = [
        (topicid, topicvalue) for topicid, topicvalue in enumerate(doc_topic[0])
        if topicvalue >= 0.01
    ]
    document_topics = sorted(document_topics, key=lambda score: score[1], reverse=True)
    doc_distribution = np.array([tup[0] for tup in document_topics])

    print('Extracting terms per topic...')
    count_vec = np.asarray(tf.sum(axis=0)).ravel()
    zipped = list(zip(self.glda_tf_feature_names, count_vec))
    x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))
    # keep only as many terms as actually occur in the message; Y holds the
    # matching counts, kept alongside the terms for inspection
    Y = np.array(y[:tf.indices.shape[0]])
    X = np.array(x[:tf.indices.shape[0]])

    self.config_dict = dict(self.config.items('TOPIC_LABEL'))
    # the config value is a Python-literal list of topic names; literal_eval is
    # safer than eval here (assumes `import ast` at module level)
    list_topic_names = ast.literal_eval(self.config_dict['list_topic_names'])
    doc_term_topic = {"topics": []}

    # collect terms and score ranks for each topic assigned to the document
    for i in doc_distribution[-40:][::-1]:
        print('Extracting terms for topic: ' + repr(i))
        topic_ = self.glda.topic_word_[i]
        topic_ = topic_ / topic_.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic_, topic_.shape[0], reverse=True)
        topic_ = sorted([(self.glda_tf_feature_names[id], topic_[id]) for id in bestn], reverse=True)

        ss = dict((k, 1.0) for k in X)
        test = dict(topic_)
        d = {term: test[term] for term in test if term in ss}
        # build the term payload directly as dicts rather than formatting and
        # re-parsing JSON strings
        topic_terms = {
            "terms": [{"term": term, "score": round(float(d[term]) * 100, 6)} for term in X]
        }
        topic_terms["terms"] = sorted(topic_terms["terms"], key=lambda k: k['score'], reverse=True)
        topic = {
            "score": [score for k, score in document_topics if i == k][0],
            "terms": topic_terms["terms"],
            "topic": list_topic_names[i]
        }
        doc_term_topic["topics"].append(topic)

    print('Sorting topics by score...')
    doc_term_topic["topics"] = sorted(doc_term_topic["topics"], key=lambda k: k['score'], reverse=True)
    terms_not_in_topics = json.dumps(list(set(texts[0]).difference(X)))
    doc_term_topic["terms_not_in_topics"] = terms_not_in_topics
    print('Done!')
    return doc_term_topic
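# Building the per-term payload as plain dicts and serializing once with
# json.dumps (rather than formatting JSON strings by hand and re-parsing
# them with json.loads) avoids breakage when a term contains a quote or
# backslash. A minimal standalone sketch with hypothetical values:

import json

term_scores = {"network": 0.42, "model": 0.31, "data": 0.27}  # toy probabilities
payload = {
    "terms": sorted(
        ({"term": term, "score": round(score * 100, 6)} for term, score in term_scores.items()),
        key=lambda entry: entry["score"],
        reverse=True,
    )
}
print(json.dumps(payload, indent=2))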