def for_topics(cls, topics_as_topn_terms, **kwargs):
    """Build one CoherenceModel with probabilities estimated over every given topic.

    All terms from all models are flattened (via ``utils.flatten``) into a
    single "super topic"; a ``CoherenceModel`` is constructed on that one
    topic and ``estimate_probabilities()`` is called on it, after which its
    ``topn`` attribute is set to the effective top-N described below.

    Note that ``cls`` is not used: the instance is always created as
    ``CoherenceModel``.

    Args:
        topics_as_topn_terms (list of lists): One entry per model; each entry
            is that model's list of topics, and each topic is a list of its
            top-N words.
        **kwargs: Forwarded to ``CoherenceModel``. An optional ``topn`` entry
            is removed first and capped at the length of the longest topic.

    Returns:
        The ``CoherenceModel`` instance after probability estimation.

    Raises:
        ValueError: If ``topics_as_topn_terms`` is empty, or if any model's
            topic listing is empty.
    """
    if not topics_as_topn_terms:
        raise ValueError("len(topics) must be > 0.")
    for model_topics in topics_as_topn_terms:
        if len(model_topics) == 0:
            raise ValueError("found empty topic listing in `topics`")

    # Length of the longest topic across all models (0 if no topics are seen).
    longest_topic = max(
        [0] + [
            len(topic)
            for model_topics in topics_as_topn_terms
            for topic in model_topics
        ]
    )

    # A caller-supplied topn may shrink the window but never exceed the longest topic.
    requested_topn = kwargs.pop('topn', longest_topic)
    effective_topn = min(requested_topn, longest_topic)

    super_topic = utils.flatten(topics_as_topn_terms)
    model_count = len(topics_as_topn_terms)
    term_count = len(super_topic)
    logging.info(
        "Number of relevant terms for all %d models: %d",
        model_count, term_count)

    # Estimate over the full super topic first, then narrow topn on the result.
    coherence_model = CoherenceModel(topics=[super_topic], topn=term_count, **kwargs)
    coherence_model.estimate_probabilities()
    coherence_model.topn = effective_topn
    return coherence_model
def not_in_vocab(self, words):
    """Return the set of distinct words that are missing from ``self.model.vocab``.

    ``words`` is first flattened with ``utils.flatten`` and de-duplicated, then
    each distinct word is checked for membership in ``self.model.vocab``.
    """
    missing = set()
    for term in set(utils.flatten(words)):
        if term not in self.model.vocab:
            missing.add(term)
    return missing
def test_flatten_not_nested(self):
    """``utils.flatten`` on an already-flat list yields the same elements in order."""
    flat_input = [1, 2, 3, 4, 5, 6]
    flattened = utils.flatten(flat_input)
    self.assertEqual(flattened, [1, 2, 3, 4, 5, 6])
def test_flatten_nested(self):
    """``utils.flatten`` collapses multiple nesting levels into one flat list."""
    deeply_nested = [[[1, 2, 3], [4, 5]], 6]
    flattened = utils.flatten(deeply_nested)
    self.assertEqual(flattened, [1, 2, 3, 4, 5, 6])