コード例 #1
0
def main():
    """Train a dynamic topic model and evaluate its topics per time slice.

    The function fits a ``DtmModel`` on ``gensim_data.corpus``, logs one
    sample topic, then for each of the ``T`` time steps converts the topic-word
    output into a tensor, computes coherence scores and perplexity, records
    them, and writes the topics to ``config.topic_file``.  Finally it draws the
    perplexity and average-coherence curves into ``config.output_path``.

    The names ``T``, ``TWmatrix``, ``TWlist``, ``COHs``, ``PPLs``,
    ``avg_COHs``, ``config``, ``dtm_path``, ``gensim_data`` and ``train_set``
    are not defined in this function; they are expected to exist at module
    level.
    """
    trace('---train topics---', config.log_file)
    # NOTE(review): the last entry of train_set.time_slices is dropped here --
    # presumably that slice is held out for testing; confirm against the caller.
    model = DtmModel(dtm_path,
                     corpus=gensim_data.corpus,
                     id2word=gensim_data.dictionary,
                     time_slices=train_set.time_slices[:-1],
                     num_topics=config.z_dim,
                     lda_sequence_min_iter=50,
                     lda_sequence_max_iter=config.epochs)
    trace('---model trained---', config.log_file)

    # Log the first topic of the first time slice as a quick sanity check.
    # This goes through trace() like every other message in this function;
    # passing the log file as a second positional argument to print() would
    # only echo the file path to stdout instead of logging the message.
    sample_topic = model.dtm_coherence(time=0, num_words=10)
    trace('sample topic is like: {}'.format(' '.join(sample_topic[0])),
          config.log_file)

    # Full-vocabulary topic-word output; it is sliced below as a flat list
    # holding config.z_dim consecutive topics per time step.
    # NOTE(review): times=-1 presumably requests every time slice -- confirm
    # against the gensim DtmModel.show_topics documentation.
    tw_nps = model.show_topics(num_topics=config.z_dim,
                               times=-1,
                               num_words=train_set.vocab_size(),
                               formatted=False)

    for t in range(T):
        # topics in time t
        tw_np = tw_nps[t * config.z_dim:(t + 1) * config.z_dim]

        tw_np = get_topic_np(tw_np, config.z_dim,
                             gensim_data.dictionary.token2id)
        tw_tensor = torch.from_numpy(tw_np)
        tw_list_t = get_tw_list(tw_tensor, gensim_data.dictionary)

        # Per-topic coherence scores and test-set perplexity for this step.
        cohs_t = get_cohs(tw_list_t)
        p = ppl(gensim_data.test, tw_tensor)

        TWmatrix.append(tw_np)
        TWlist.append(tw_list_t)
        COHs.append(cohs_t)
        PPLs.append(p)

        avg_COHs.append((sum(cohs_t) / len(cohs_t)))

        seg = '---------- topics in time {}/{} ----------'.format(t + 1, T)
        display_topics(tw_list=tw_list_t,
                       cohs=cohs_t,
                       head='topics',
                       seg=seg,
                       file=config.topic_file)
        trace('topic result(coherence) written.', file=config.log_file)

    # draw_ppl is reused for both curves; only the title and target differ.
    p_file = os.path.join(config.output_path, 'ppl.jpg')
    draw_ppl(PPLs, title='perplexities over time', file=p_file)
    a_file = os.path.join(config.output_path, 'avg_coh.jpg')
    draw_ppl(avg_COHs, title='avg coherence over time', file=a_file)
コード例 #2
0
def getCoherenceScores(nTopics):
    """Train and save a DTM with ``nTopics`` topics; return its coherences.

    The model is fitted on the module-level ``corpus`` / ``dictionary`` /
    ``timeSlice`` and saved under ``./Models/model{nTopics}Topics``.  One
    ``u_mass`` coherence value is returned per entry of ``timeSlice``, in
    time order.
    """
    dtm = DtmModel(
        path_to_dtm_binary,
        corpus=corpus,
        num_topics=nTopics,
        id2word=dictionary,
        time_slices=timeSlice,
    )
    dtm.save(f'./Models/model{nTopics}Topics')

    # Phase 1: collect the topic word lists of every time period.
    topics_by_period = []
    for period in range(len(timeSlice)):
        topics_by_period.append(dtm.dtm_coherence(time=period))

    # Phase 2: build one coherence scorer per period.
    scorers = []
    for period in range(len(timeSlice)):
        scorers.append(
            CoherenceModel(topics=topics_by_period[period],
                           corpus=corpus,
                           dictionary=dictionary,
                           coherence='u_mass'))

    # Phase 3: evaluate the scorers in time order.
    scores = []
    for period in range(len(timeSlice)):
        scores.append(scorers[period].get_coherence())
    return scores
コード例 #3
0
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    """Fit a dynamic topic model on a column of tokenized documents.

    Args:
        table: Input table; ``table[input_col]`` holds the tokenized documents.
        input_col: Name of the column with the tokenized documents.
        topic_name: Name of the output column receiving the 1-based index of
            the dominant topic of each document. Must not already exist.
        num_topic: Number of topics to fit.
        num_topic_word: Number of words reported per topic and time slice.
        max_iter: Passed to the model as ``lda_sequence_max_iter``.
        time_slice: Number of documents in each consecutive time slice. When
            ``None`` all documents form a single slice.
        coherence: ``'u_mass'`` selects the u_mass measure; any other value
            selects ``'c_v'``.
        vis_time: Index of the time slice used for the visualization.
        seed: Optional value passed to the model as ``rng_seed``.

    Returns:
        A dict with ``'out_table'`` (deep copy of ``table`` plus the topic
        column and a ``'topic_distribution'`` column), ``'topic_table'`` (one
        row per time slice, one column per topic) and ``'model'`` (parameters,
        fitted model, coherences, corpus and the rendered report).

    Raises:
        BrighticsFunctionException: If ``topic_name`` is already a column of
            ``table``.
        Invalid ``time_slice`` / ``vis_time`` values are reported through
        ``raise_runtime_error`` (defined elsewhere in the project).
    """
    import stat  # local import: only needed for the executable-bit handling below

    # Select the bundled DTM binary matching the running platform.
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        dtm_filename = 'dtm-linux64' if is_os_64bit else 'dtm-linux32'
    elif running_os == 'Windows':
        dtm_filename = 'dtm-win64.exe' if is_os_64bit else 'dtm-win32.exe'
    else:  # every other platform is treated as Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)

    # Make sure the binary is executable. os.chmod is used instead of a shell
    # "chmod +x <path>" string so that install paths containing spaces or
    # shell metacharacters are handled correctly.
    if running_os != 'Windows' and not os.access(dtm_path, os.X_OK):
        try:
            current_mode = os.stat(dtm_path).st_mode
            os.chmod(dtm_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
        except OSError:
            # Best effort, like the shell command it replaces: if the mode
            # cannot be changed, starting the binary below reports the problem.
            pass

    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)

    # Validate all inputs before the (expensive) model training starts.
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])

    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    # Topic table: one row per time slice, one column per topic, each cell a
    # list of "<second item>: <first item>" strings built from the pairs that
    # show_topic returns (presumably (probability, word) -- confirm with gensim).
    num_period = len(time_slice)
    topic_time = [[dtm_model.show_topic(topicid=topic_id, time=period, topn=num_topic_word)
                   for topic_id in range(num_topic)]
                  for period in range(num_period)]
    topic_time = [[["{}: {}".format(pair[1], pair[0]) for pair in topic] for topic in period_topics]
                  for period_topics in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    # Per-document topic proportions; the dominant topic is reported 1-based.
    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    # One coherence value per time slice.
    coherence_topic_arr = [dtm_model.dtm_coherence(period) for period in range(num_period)]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

    # Visualization of the selected time slice, rendered to HTML.
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}