Beispiel #1
0
'''
37. 頻度上位10語
出現頻度が高い10語とその出現頻度をグラフ(例えば棒グラフなど)で表示せよ.
'''
from collections import defaultdict
from matplotlib import pyplot as plt
from knock30 import get_morpheme_list

# Knock 37: show the 10 most frequent words and their counts as a bar chart.

# Occurrence count per surface form; unseen keys start at 0.
word_freq = defaultdict(int)

for sentence in get_morpheme_list('neko.txt.mecab'):
    for morpheme in sentence:
        word_freq[morpheme['surface']] += 1

# Every (surface, count) pair, ordered from most to least frequent.
d = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

# Slice instead of indexing positions 0..9 so that an input with fewer than
# 10 distinct surface forms yields a shorter chart rather than an IndexError.
top_ten = d[:10]
words = [surface for surface, _ in top_ten]
count = [freq for _, freq in top_ten]

# NOTE(review): the axis labels are non-ASCII, so a font that can render them
# is selected here; 'AppleGothic' is presumably only available on macOS —
# confirm before running on other platforms.
plt.rcParams['font.family'] = 'AppleGothic'
plt.bar(words, count)
plt.xlabel("出現頻度が高い10語")
plt.ylabel("出現頻度")
plt.grid(True)
plt.show()
Beispiel #2
0
'''
31. 動詞
動詞の表層形をすべて抽出せよ.
'''
from knock30 import get_morpheme_list

# Knock 31: collect the surface form of every morpheme whose part of speech
# is tagged as a verb, in the order the morphemes are yielded.
verbs = [
    token['surface']
    for tokens in get_morpheme_list("neko.txt.mecab")
    for token in tokens
    if token['pos'] == '動詞'
]

# Only a short sample is printed.
print(verbs[:10])
Beispiel #3
0
'''
34. 「AのB」
2つの名詞が「の」で連結されている名詞句を抽出せよ.
'''
from knock30 import get_morpheme_list

# Knock 34: extract noun phrases made of two nouns joined by the surface
# form 'の', by sliding a three-morpheme window over each sentence.
noun_phrases = []
for tokens in get_morpheme_list("neko.txt.mecab"):
    for start in range(len(tokens) - 2):
        left = tokens[start]
        middle = tokens[start + 1]
        right = tokens[start + 2]
        # Skip any window that is not noun + 'の' + noun.
        if (left['pos'] != '名詞'
                or middle['surface'] != 'の'
                or right['pos'] != '名詞'):
            continue
        phrase = left['surface'] + middle['surface'] + right['surface']
        noun_phrases.append(phrase)

# Only a short sample is printed.
print(noun_phrases[:10])