Example #1
# (Tail of a truncated statement: the cut-off lines above write each topic's
# words and weights to `output`.)
                               for f, w in words))
        output.write("\n\n\n")

# We first identify the most discussed topic, i.e., the one with the
# highest total weight

topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(1)
max_topic = weight.argmax()

# Get the top 64 words for this topic
# Without the argument, show_topic would return only 10 words
words = model.show_topic(max_topic, 64)

# This function will actually check for the presence of pytagcloud and is otherwise a no-op
create_cloud('cloud_blei_lda.png', words)
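# For reference, a minimal sketch of what such a helper could look like (an
# illustration, not the original create_cloud definition). It assumes the
# pytagcloud API (make_tags / create_tag_image) and that `words` is a list of
# (word, weight) pairs as returned by show_topic; the maxsize, size and
# fontname defaults are assumptions.
def create_cloud_sketch(oname, words, maxsize=120, fontname='Lobster'):
    try:
        from pytagcloud import create_tag_image, make_tags
    except ImportError:
        return  # silently skip cloud generation when pytagcloud is missing
    # gensim weights are floats between 0 and 1, while pytagcloud expects
    # integer counts, so scale and round them
    counts = [(w, int(v * 10000)) for w, v in words]
    tags = make_tags(counts, maxsize=maxsize)
    create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)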

num_topics_used = [len(model[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('Figure_04_01.png')

# Now, repeat the same exercise using alpha=1.0
# You can edit the constant below to play around with this parameter
ALPHA = 1.0

model1 = models.ldamodel.LdaModel(corpus,
                                  num_topics=NUM_TOPICS,
                                  id2word=corpus.id2word,
                                  alpha=ALPHA)
Example #2
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
topics.shape
# (100, 2246)  -- 100 topics x 2246 documents

weight = topics.sum(1)
weight.shape
# (100,)

max_topic = weight.argmax()
# 30
np.sum(topics[30] > 0.1)
# 960 of the 2246 documents have a weight of 0.1 or higher for topic 30

# get the top 64 words for this topic
words = model.show_topic(max_topic, 64)

### Create a word cloud using pytagcloud
create_cloud('lda_gensim_tagcloud.png', words)

### Plot number of topics, number of posts
num_topics_used = [len(model[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('topics_vs_docs1.png')

# change alpha and plot again
# bigger alpha => more topics per document
# DEFAULT gensim ALPHA = 1 / len(corpus)
ALPHA = 1.0
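# A sketch of the step this comment points at (assuming the corpus, dictionary,
# NUM_TOPICS and num_topics_used from the run above are still in scope; the
# output filename is an assumption): retrain with the larger alpha and plot
# both distributions for comparison.
model1 = models.ldamodel.LdaModel(corpus,
                                  num_topics=NUM_TOPICS,
                                  id2word=dictionary,
                                  alpha=ALPHA)
num_topics_used1 = [len(model1[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist([num_topics_used, num_topics_used1], np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
ax.legend(['default alpha', 'alpha=1.0'], loc='upper right')
fig.tight_layout()
fig.savefig('topics_vs_docs2.png')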
Example #3
# load corpus
corpus = corpora.MmCorpus('../data/aaj.mm')

# Build the topic model
model = models.ldamodel.LdaModel(corpus,
                                 num_topics=NUM_TOPICS,
                                 id2word=dictionary,
                                 alpha=None)

# We first identify the most discussed topic, i.e., the one with the
# highest total weight
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(1)
max_topic = weight.argmax()

# Get the top 64 words for this topic
# Without the argument, show_topic would return only 10 words
words = model.show_topic(max_topic, 64)

# Generate the word cloud
create_cloud('../data/cloud_lda.png', words)

# Plot the distribution of the number of topics per document
num_topics_used = [len(model[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('../data/Figure_04_01.png')
Example #4
topics = np.load('topics.npy', mmap_mode='r')

# Compute the number of topics mentioned in each document
lens = (topics > 0).sum(axis=1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
print('Percentage of articles mentioning at most 10 topics: {0:.1%}'.format(
    np.mean(lens <= 10)))

# weights will be the total weight of each topic
weights = topics.sum(0)

# Find the most heavily used topic and draw it as a word cloud
words = model.show_topic(weights.argmax(), 64)

# Use the ``maxsize`` parameter to make the cloud look nice.
create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo')

fraction_mention = np.mean(topics[:, weights.argmax()] > 0)
print("The most mentioned topics is mentioned in {:.1%} of documents.".format(
    fraction_mention))
total_weight = np.mean(topics[:, weights.argmax()])
print(
    "It represents {:.1%} of the total number of words.".format(total_weight))
print()
print()
print()

# Find the least heavily used topic and draw it as a word cloud
words = model.show_topic(weights.argmin(), 64)
create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo')
fraction_mention = np.mean(topics[:, weights.argmin()] > 0)
print("The least mentioned topic is mentioned in {:.1%} of the documents.".format(fraction_mention))

# Load the precomputed model
model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl')

topics = np.load('topics.npy', mmap_mode='r')

# Compute the number of topics mentioned in each document
lens = (topics > 0).sum(1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
print('Percentage of articles mentioning at most 10 topics: {0:.1%}'.format(np.mean(lens <= 10)))

# Weights will be the total weight of each topic
weights = topics.sum(0)

# Retrieve the most heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmax(), 64)

# The parameter ``maxsize`` often needs some manual tuning to make it look nice.
create_cloud('Wikipedia_most.png', words, maxsize=410, fontname='Neucha')
print(words)
print()
print()
print()

# Retrieve the **least** heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmin(), 64)
create_cloud('Wikipedia_least.png', words, maxsize=180, fontname='Neucha')
print(words)
print()
print()
print()
topics = np.load('topics.npy', mmap_mode='r')

# Compute the number of topics mentioned in each document
lens = (topics > 0).sum(axis=1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
print('Percentage of articles mentioning at most 10 topics: {0:.1%}'.format(np.mean(lens <= 10)))

# Weights will be the total weight of each topic
weights = topics.sum(0)

# Retrieve the most heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmax(), 64)

# The parameter ``maxsize`` often needs some manual tuning to make it look nice.
create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo')

fraction_mention = np.mean(topics[:, weights.argmax()] > 0)
print("The most mentioned topic is mentioned in {:.1%} of the documents.".format(fraction_mention))
total_weight = np.mean(topics[:, weights.argmax()])
print("It represents {:.1%} of the total number of words.".format(total_weight))
print()
print()
print()

# Retrieve the **least** heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmin(), 64)
create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo')
fraction_mention = np.mean(topics[:, weights.argmin()] > 0)
print("The least mentioned topic is mentioned in {:.1%} of the documents.".format(fraction_mention))
total_weight = np.mean(topics[:, weights.argmin()])
print("It represents {:.1%} of the total number of words.".format(total_weight))

# First, we need to sum up the weights across all the documents
weight = np.zeros(model.num_topics)
for doc in corpus:
    for col, val in model[doc]:
        weight[col] += val
        # As a reasonable alternative, we could have used the log of val:
        # weight[col] += np.log(val)
max_topic = weight.argmax()

# Get the top 64 words for this topic
# Without the argument, show_topic would return only 10 words
words = model.show_topic(max_topic, 64)

# This function will actually check for the presence of pytagcloud and is otherwise a no-op
create_cloud('cloud_blei_lda.png', words)

num_topics_used = [len(model[doc]) for doc in corpus]
plt.hist(num_topics_used, np.arange(42))
plt.ylabel('Nr of documents')
plt.xlabel('Nr of topics')
plt.savefig('../1400OS_04_01+.png')
plt.clf()


# Now, repeat the same exercise using alpha=1:

model1 = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=1.)
num_topics_used1 = [len(model1[doc]) for doc in corpus]
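
# A possible follow-up, sketched here as an illustration (the filename is an
# assumption): plot both topics-per-document distributions in one histogram,
# in the same plt style as above, to see the effect of the larger alpha.
plt.hist([num_topics_used, num_topics_used1], np.arange(42))
plt.xlabel('Nr of topics')
plt.ylabel('Nr of documents')
plt.legend(['default alpha', 'alpha=1.0'], loc='upper right')
plt.savefig('topics_used_comparison.png')
plt.clf()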