'''
37. Top-10 most frequent words

Display the 10 most frequent words and their frequencies in a graph
(for example, a bar chart).
'''
from collections import Counter

from matplotlib import pyplot as plt

from knock30 import get_morpheme_list

# Number of top-ranked words to plot.
TOP_N = 10

# Count every surface form across all sentences of the parsed corpus.
word_freq = Counter(
    morpheme['surface']
    for sentence in get_morpheme_list('neko.txt.mecab')
    for morpheme in sentence
)

# most_common() yields at most TOP_N (word, count) pairs, highest count first
# and ties in first-seen order, so a corpus with fewer than TOP_N distinct
# words is plotted as-is instead of raising IndexError.
top = word_freq.most_common(TOP_N)
words = [word for word, _ in top]
count = [freq for _, freq in top]

# The axis labels below are Japanese, so the selected font must provide those
# glyphs.
# NOTE(review): 'AppleGothic' is presumably only installed on macOS - confirm,
# and substitute a Japanese-capable font on other platforms.
plt.rcParams['font.family'] = 'AppleGothic'
plt.bar(words, count)
plt.xlabel("出現頻度が高い10語")
plt.ylabel("出現頻度")
plt.grid(True)
plt.show()
'''
31. Verbs

Extract the surface forms of all verbs.
'''
from knock30 import get_morpheme_list

# Flatten the per-sentence morpheme sequences, keeping the surface form of
# every morpheme whose part of speech is '動詞' (verb).
verbs = [
    token['surface']
    for sent in get_morpheme_list("neko.txt.mecab")
    for token in sent
    if token['pos'] == '動詞'
]

# Only the first ten hits are printed as a sample.
print(verbs[:10])
'''
34. "A no B"

Extract noun phrases in which two nouns are joined by the particle 「の」.
'''
from knock30 import get_morpheme_list

noun_phrases = []
for sent in get_morpheme_list("neko.txt.mecab"):
    # Slide a three-morpheme window over the sentence; sentences shorter than
    # three morphemes produce no windows at all.
    for head, particle, tail in zip(sent, sent[1:], sent[2:]):
        # Skip any window that is not noun + 'の' + noun.
        if (head['pos'] != '名詞'
                or particle['surface'] != 'の'
                or tail['pos'] != '名詞'):
            continue
        phrase = head['surface'] + particle['surface'] + tail['surface']
        noun_phrases.append(phrase)

# Only the first ten phrases are printed as a sample.
print(noun_phrases[:10])