        if not model_path.exists():
            model_path.mkdir(exist_ok=True, parents=True)
        print((num_topics, passes), end=' ', flush=True)
        lda = LdaMulticore(corpus=train_corpus,
                           num_topics=num_topics,
                           id2word=id2word,
                           passes=passes,
                           eval_every=None,
                           workers=72,
                           random_state=42)
        # log_perplexity returns a per-word likelihood bound; 2**(-bound) converts it to held-out perplexity
        test_perplexity = 2**(-lda.log_perplexity(test_corpus))
        lda.update(corpus=test_corpus)
        lda.save((model_path / 'lda').resolve().as_posix())

        topic_coherence = lda.top_topics(corpus=corpus,
                                         coherence='u_mass',
                                         topn=20)
        coherence.append([c[1] for c in topic_coherence])

        perplexity.append([
            vocab_size, test_vocab, min_df, max_df, binary, num_topics, passes,
            test_perplexity
        ])

    elapsed = time() - start
    print(
        f'\nDone: {i / n:.2%} | Duration: {format_time(elapsed)} | To Go: {format_time(elapsed / i * (n - i))}\n'
    )
    perplexity = pd.DataFrame(perplexity,
                              columns=cols).sort_values('perplexity')
    print(perplexity)
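
The fragment above is the body of a hyperparameter grid-search loop; names such as `train_corpus`, `test_corpus`, `model_path`, `cols`, `i`, `n`, and `format_time` come from driver code that the excerpt does not show. A minimal sketch of how such a driver might be set up (the grid values and the `format_time` helper here are assumptions, not part of the original):

from itertools import product
from time import time

import pandas as pd

def format_time(seconds):
    # Hypothetical helper; the original's implementation is not shown.
    return f'{seconds / 60:.1f} min'

cols = ['vocab_size', 'test_vocab', 'min_df', 'max_df', 'binary', 'num_topics', 'passes', 'perplexity']
perplexity, coherence = [], []
param_grid = list(product([5, 10, 20, 50], [1, 3]))  # candidate (num_topics, passes) pairs
n, start = len(param_grid), time()
for i, (num_topics, passes) in enumerate(param_grid, 1):
    ...  # loop body as shown in the fragment above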
Example No. 2
# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(corpus=corpus,
                     id2word=id2word,
                     chunksize=chunksize,
                     alpha=0.05,
                     eta=0.01,
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every,
                     workers=4)

top_topics = model.top_topics(corpus)  # topn=20 is the default number of words scored per topic

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)
print(top_topics)
numpy.save(os.path.join(out_path, "topics.npy"), top_topics)
model.save(os.path.join(out_path, "lda_model"))

# Predict a topic for a document
important_words = docs[2]
print(important_words)
print(len(important_words))
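
# The snippet stops after selecting a document. A minimal sketch of the prediction step itself, assuming `docs[2]`
# is a list of tokens and `dictionary` is the gensim Dictionary that produced `corpus`:
bow = dictionary.doc2bow(important_words)
doc_topics = model.get_document_topics(bow, minimum_probability=0.05)
# (topic_id, probability) pairs, most probable topic first
print(sorted(doc_topics, key=lambda pair: -pair[1]))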
Example No. 3
temp = dictionary[0]  # This is only to "load" the dictionary so that id2token is populated.
id2word = dictionary.id2token

lda_model = LdaMulticore(
    corpus=tfidf_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    workers=4  # Use all four cores
)

top_topics = lda_model.top_topics(tfidf_corpus)
pprint(top_topics)
# -

# Gensim calculates the [intrinsic coherence score](http://qpleple.com/topic-coherence-to-evaluate-topic-models/) for
# each topic. Averaging across all of the topics in the model gives an overall coherence score. Coherence measures
# the strength of the association between the words in a topic cluster and is intended as an objective way to
# evaluate the quality of the topic clusters. Higher scores are better (a CoherenceModel cross-check follows the
# cell below).

# +
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
# -
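
# The `u_mass` values above come straight from `top_topics`. As a cross-check, a minimal sketch of the same intrinsic
# score computed with gensim's CoherenceModel, assuming `lda_model`, `tfidf_corpus`, and `dictionary` are the objects
# defined above:

# +
from gensim.models import CoherenceModel

cm = CoherenceModel(model=lda_model, corpus=tfidf_corpus, dictionary=dictionary, coherence='u_mass')
print('CoherenceModel u_mass coherence: %.4f.' % cm.get_coherence())
# -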

# References: