def evaluate(self, topic_candidates=None, nbtopterms=None):
    """
    Evaluate topic coherence. This method is provided for convenience and is
    largely redundant: coherence scores should preferably be calculated via
    evaluate_topics.py, which provides more features and metrics.
    """
    self.logg('evaluating topic candidates')

    # reference scores per topic for the top topic terms
    if nbtopterms is None:
        nbtopterms = self.nb_top_terms
    if topic_candidates is None:
        topic_candidates = self.topic_candidates
    topic_candidates = topic_candidates.loc[:, 'term0':f'term{nbtopterms - 1}']
    topics_list = topic_candidates.values.tolist()

    self.logg('> u_mass')
    t0 = time()
    cm_umass = CoherenceModel(
        topics=topics_list, corpus=self.corpus, dictionary=self.dict_from_corpus,
        coherence='u_mass', topn=nbtopterms, processes=self.processes
    )
    umass_scores = cm_umass.get_coherence_per_topic(with_std=False, with_support=False)
    t1 = int(time() - t0)
    self.logg("    done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))

    self.logg('> c_v')
    t0 = time()
    cm_cv = CoherenceModel(
        topics=topics_list, texts=self.texts, dictionary=self.dict_from_corpus,
        coherence='c_v', topn=nbtopterms, processes=self.processes
    )
    cv_scores = cm_cv.get_coherence_per_topic()
    t1 = int(time() - t0)
    self.logg("    done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))

    # segmentation for c_uci and c_npmi was changed from s_one_set to s_one_one (the default)
    self.logg('> c_uci')
    t0 = time()
    cm_cuci = CoherenceModel(
        topics=topics_list, texts=self.texts, dictionary=self.dict_from_corpus,
        coherence='c_uci', topn=nbtopterms, processes=self.processes
    )
    cuci_scores = cm_cuci.get_coherence_per_topic()
    t1 = int(time() - t0)
    self.logg("    done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))

    self.logg('> c_npmi')
    t0 = time()
    # reuse the probability estimates already calculated for c_uci
    cm_cuci.coherence = 'c_npmi'
    cnpmi_scores1 = cm_cuci.get_coherence_per_topic()
    t1 = int(time() - t0)
    self.logg("    done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))

    scores = {
        'u_mass_eval': umass_scores,
        'c_v_eval': cv_scores,
        'c_uci_eval': cuci_scores,
        'c_npmi_eval': cnpmi_scores1,
    }
    scores = pd.DataFrame(scores)
    scores.index = topic_candidates.index.copy()
    self.eval_scores = scores
    return scores
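# A minimal standalone sketch of the per-metric scoring that evaluate() performs,
# kept in comments so it does not run at class-definition time. The toy texts,
# dictionary, and topics below are hypothetical and only illustrate the gensim
# CoherenceModel call, not this class's actual data.
#
#   from gensim.corpora import Dictionary
#   from gensim.models import CoherenceModel
#
#   texts = [['cat', 'dog', 'mouse'], ['dog', 'bone'], ['cat', 'mouse', 'cheese']]
#   dictionary = Dictionary(texts)
#   corpus = [dictionary.doc2bow(text) for text in texts]
#   topics = [['cat', 'mouse'], ['dog', 'bone']]
#
#   # u_mass works on the bow corpus; c_v, c_uci, and c_npmi need the raw texts
#   cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary,
#                       coherence='u_mass', topn=2)
#   print(cm.get_coherence_per_topic())  # one u_mass score per topic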
def _rerank_coherence_per_metric(self, metric, coherence_model=None):
    """
    Trigger the reranking for a given metric.

    Uses a fast heuristic that reranks in O(n), with n being the number of
    candidate terms. A coherence metric is applied to each set of topic terms
    with exactly one term left out. The resulting coherence score indicates
    whether that term strengthens or weakens the coherence of the topic. Terms
    whose absence resulted in higher scores are removed from the set.

    :param metric: coherence metric to apply ('u_mass', 'c_v', 'c_uci' or 'c_npmi').
    :param coherence_model: optional CoherenceModel to reuse; its precalculated
        probability estimates are kept and only the coherence measure is changed.
    :return: DataFrame of the reranked top-term token ids per topic.
    """
    if self.shifted_topics is None:
        self.shifted_topics = self._shift_topics()

    t0 = time()
    self.logg(
        f'Calculating topic candidates using {metric} coherence measure '
        f'on {self.nb_candidate_terms} candidate terms '
        f'for {self.nb_topics} topics')

    # calculate the scores for all shifted topics
    kwargs = dict(
        topics=self.shifted_topics, dictionary=self.dict_from_corpus,
        coherence=metric, topn=self.nb_candidate_terms - 1, processes=self.processes
    )
    if metric == 'u_mass':
        kwargs['corpus'] = self.corpus
    else:
        kwargs['texts'] = self.texts

    if coherence_model is None:
        cm = CoherenceModel(**kwargs)
    else:
        cm = coherence_model
        cm.coherence = metric
    scores1d = cm.get_coherence_per_topic()
    scores2d = np.reshape(scores1d, (self.nb_candidate_terms, -1)).T
    # the highest values indicate the terms whose absence improves the topic coherence most
    sorted_scores = np.argsort(scores2d, axis=1)
    # thus we keep the first nb_top_terms (default: 10) indices
    top_scores = sorted_scores[:, :self.nb_top_terms]
    # and sort them back for convenience
    top_scores = np.sort(top_scores, axis=1)
    # replace indices with token ids
    tpx_ids = [
        self.topic_ids.values[i, top_scores[i]] for i in range(self.nb_topics)
    ]
    tpx_ids = (
        pd.DataFrame
        .from_records(tpx_ids,
                      columns=self.topic_terms.columns[:self.nb_top_terms],
                      index=self.topic_ids.index)
        .assign(metric=metric)
        .set_index('metric', append=True)
    )

    t1 = int(time() - t0)
    self._statistics_[metric] = dict()
    self._statistics_[metric]['runtime'] = t1
    self.logg("    done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))
    return tpx_ids
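# A hedged, self-contained sketch of the leave-one-out heuristic used above,
# kept in comments so it does not run at class-definition time. `_shift_topics`
# is not shown in this section, so the helper below is an assumption about its
# behaviour: variants are ordered term-major (all topics with term 0 left out
# first), which is what the reshape to (nb_candidate_terms, -1) followed by a
# transpose implies.
#
#   import numpy as np
#
#   def shift_topics(topics):
#       """For each topic of n terms, emit n variants, each missing one term."""
#       n = len(topics[0])
#       return [topic[:i] + topic[i + 1:]             # leave term i out
#               for i in range(n) for topic in topics]
#
#   topics = [['cat', 'mouse', 'xyz'], ['dog', 'bone', 'qrs']]   # toy candidates
#   shifted = shift_topics(topics)                               # 6 variants of 2 terms each
#   scores1d = np.array([0.1, 0.2, 0.1, 0.3, 0.9, 0.2])          # fake coherence scores
#   scores2d = np.reshape(scores1d, (3, -1)).T                   # one row per topic
#   keep = np.sort(np.argsort(scores2d, axis=1)[:, :2], axis=1)  # keep the 2 strongest terms
#   print(keep)  # a low leave-one-out score means removing the term hurt coherence: keep it
#
# Here topic 0 keeps 'cat' and 'mouse', since leaving 'xyz' out yielded the
# highest score, i.e. 'xyz' weakens the topic and is dropped.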