def main():
    """Train a DTM model, then score and report its topics for each time step.

    Steps, in order:

    1. Fit ``DtmModel`` on ``gensim_data.corpus`` using all but the last entry
       of ``train_set.time_slices``.
    2. Log one sample topic (time 0, 10 words).
    3. For each ``t`` in ``range(T)``: take that step's ``config.z_dim`` topics,
       convert them with ``get_topic_np`` / ``get_tw_list``, compute per-topic
       coherences (``get_cohs``) and perplexity on ``gensim_data.test``
       (``ppl``), append the results to the module-level accumulators
       (``TWmatrix``, ``TWlist``, ``COHs``, ``PPLs``, ``avg_COHs``) and write
       the topics to ``config.topic_file``.
    4. Plot perplexity and average coherence over time into
       ``config.output_path``.

    Relies on names defined outside this function (``config``, ``dtm_path``,
    ``gensim_data``, ``train_set``, ``T`` and the accumulator lists).
    """
    trace('---train topics---', config.log_file)
    model = DtmModel(
        dtm_path,
        corpus=gensim_data.corpus,
        id2word=gensim_data.dictionary,
        time_slices=train_set.time_slices[:-1],
        num_topics=config.z_dim,
        lda_sequence_min_iter=50,
        lda_sequence_max_iter=config.epochs,
    )
    trace('---model trained---', config.log_file)

    sample_topic = model.dtm_coherence(time=0, num_words=10)
    # Send the sample through trace() like every other status message.
    # print(msg, config.log_file) would merely echo the log-file path to stdout
    # after the text instead of using it as the destination.
    trace('sample topic is like: {}'.format(' '.join(sample_topic[0])), config.log_file)

    # One flat list covering every time step; it is sliced per step below.
    tw_nps = model.show_topics(
        num_topics=config.z_dim,
        times=-1,
        num_words=train_set.vocab_size(),
        formatted=False,
    )

    for t in range(T):
        # Topics belonging to time step t: a consecutive run of z_dim entries.
        tw_np = tw_nps[t * config.z_dim:(t + 1) * config.z_dim]
        tw_np = get_topic_np(tw_np, config.z_dim, gensim_data.dictionary.token2id)
        tw_tensor = torch.from_numpy(tw_np)
        tw_list_t = get_tw_list(tw_tensor, gensim_data.dictionary)

        # Coherence per topic and perplexity on the held-out data.
        cohs_t = get_cohs(tw_list_t)
        p = ppl(gensim_data.test, tw_tensor)

        TWmatrix.append(tw_np)
        TWlist.append(tw_list_t)
        COHs.append(cohs_t)
        PPLs.append(p)
        avg_COHs.append(sum(cohs_t) / len(cohs_t))

        seg = '---------- topics in time {}/{} ----------'.format(t + 1, T)
        display_topics(tw_list=tw_list_t, cohs=cohs_t, head='topics', seg=seg, file=config.topic_file)

    trace('topic result(coherence) written.', file=config.log_file)

    p_file = os.path.join(config.output_path, 'ppl.jpg')
    draw_ppl(PPLs, title='perplexities over time', file=p_file)

    # draw_ppl is reused for the coherence curve; only the title and file differ.
    a_file = os.path.join(config.output_path, 'avg_coh.jpg')
    draw_ppl(avg_COHs, title='avg coherence over time', file=a_file)
def getCoherenceScores(nTopics):
    """Fit a DTM model with ``nTopics`` topics and score each time slice.

    The model is trained on the module-level ``corpus`` / ``dictionary`` /
    ``timeSlice`` and saved to ``./Models/model{nTopics}Topics``. For each
    entry of ``timeSlice`` the topics returned by ``dtm_coherence`` are
    evaluated with a ``u_mass`` ``CoherenceModel``.

    Returns:
        A list with one coherence value per time slice, in slice order.
    """
    model = DtmModel(
        path_to_dtm_binary,
        corpus=corpus,
        num_topics=nTopics,
        id2word=dictionary,
        time_slices=timeSlice,
    )
    model.save(f'./Models/model{nTopics}Topics')

    # Stage 1: word representation of the topics at every time slice.
    topics_per_slice = []
    for slice_index in range(len(timeSlice)):
        topics_per_slice.append(model.dtm_coherence(time=slice_index))

    # Stage 2: one coherence evaluator per slice.
    evaluators = [
        CoherenceModel(topics=slice_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        for slice_topics in topics_per_slice
    ]

    # Stage 3: compute the scores.
    return [evaluator.get_coherence() for evaluator in evaluators]
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    """Run Dynamic Topic Modeling on a column of tokenized documents.

    Args:
        table: Input DataFrame.
        input_col: Name of the column holding the tokenized documents.
        topic_name: Name of the output column that receives the 1-based index
            of the dominant topic of each document. Must not already exist in
            ``table``.
        num_topic: Number of topics.
        num_topic_word: Number of words listed per topic in ``topic_table``.
        max_iter: Passed to the model as ``lda_sequence_max_iter``.
        time_slice: Number of documents in each time period; must sum to the
            number of rows. ``None`` means a single period with every document.
        coherence: ``'u_mass'`` selects u_mass; any other value selects c_v.
        vis_time: Index of the time period rendered in the visualization.
        seed: Optional value forwarded to the model as ``rng_seed``.

    Returns:
        dict with keys ``'out_table'`` (copy of ``table`` plus the
        ``topic_name`` and ``'topic_distribution'`` columns), ``'topic_table'``
        (one row per time period, one column per topic) and ``'model'``.

    Raises:
        BrighticsFunctionException: if ``topic_name`` is already a column.
        Whatever ``raise_runtime_error`` raises: if ``time_slice`` does not sum
        to the number of documents or ``vis_time`` is out of range.
    """
    import subprocess

    # Pick the bundled DTM executable for this platform.
    running_os = platform.system()
    arch_bits = '64' if platform.machine().endswith('64') else '32'
    if running_os == 'Linux':
        dtm_filename = 'dtm-linux' + arch_bits
    elif running_os == 'Windows':
        dtm_filename = 'dtm-win{}.exe'.format(arch_bits)
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)

    if running_os != 'Windows':
        # Argument list instead of a shell string, so an install path with
        # spaces or shell metacharacters is handled correctly. Kept best-effort
        # (as the os.system call was): a failure here is not fatal on its own.
        try:
            subprocess.run(['chmod', '+x', dtm_path], check=False)
        except OSError:
            pass

    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)

    # ---- Input validation: all of it happens before the expensive training ----
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    if topic_name in table.columns:
        # Fail fast: previously this was only detected after the model had
        # already been trained.
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])

    # ---- Train ----
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus, "id2word": dictionary, "time_slices": time_slice, "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter, "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    # ---- Topic table: one row per time period, one column per topic ----
    num_period = len(time_slice)
    topic_time = [[dtm_model.show_topic(topicid=topic_id, time=period, topn=num_topic_word)
                   for topic_id in range(num_topic)]
                  for period in range(num_period)]
    # Each entry of a topic is rendered as "<second element>: <first element>".
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in period_topics]
                  for period_topics in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    # ---- Per-document topic assignment ----
    prop_arr = dtm_model.gamma_
    out_table = table.copy(deep=True)
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]  # 1-based topic index
    out_table['topic_distribution'] = prop_arr.tolist()

    # ---- Coherence per time period ----
    coherence_topic_arr = [dtm_model.dtm_coherence(period) for period in range(num_period)]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus,
                                  coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence()
                   for item in coherence_topic_arr]

    # ---- Visualization of the requested time period ----
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    # ---- Report and model object ----
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}