from knock30 import load_mecab

# Gather the distinct 'base' values of every morpheme whose 'pos' field is
# '動詞' (verb). Each item yielded by load_mecab() is iterated as a sequence
# of morpheme mappings.
verb = {
    token['base']
    for sentence in load_mecab()
    for token in sentence
    if token['pos'] == '動詞'
}
print(verb)
# -*- coding: utf-8 -*-
from knock30 import load_mecab

path = 'neko.txt.mecab'

# Surface forms of all morphemes tagged '動詞' (verb), in the order they are
# encountered; duplicates are kept.
ans = [
    token['surface']
    for tokens in load_mecab(path)
    for token in tokens
    if token['pos'] == '動詞'
]
print(ans)
# -*- coding: utf-8 -*-
from knock30 import load_mecab
from knock36 import morpheme_count
import matplotlib.pyplot as plt


def log_log_graph(rank):
    """Scatter-plot count against 1-based position on log-log axes.

    Args:
        rank: Iterable of ``(morpheme, count)`` pairs. The position of each
            pair in iteration order (starting at 1) is used as its x value
            and its count as the y value.
    """
    y = [count for _, count in rank]
    x = list(range(1, len(y) + 1))
    plt.scatter(x, y, s=10, marker='.')
    plt.yscale('log')
    plt.xscale('log')
    plt.show()


if __name__ == '__main__':
    path = 'neko.txt.mecab'
    frequencies = morpheme_count(load_mecab(path))
    # Descending by count; the sort is stable, so entries with equal counts
    # keep their original relative order.
    rank = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    log_log_graph(rank)
# -*- coding: utf-8 -*-
from knock30 import load_mecab
from collections import defaultdict


def morpheme_count(sentences):
    """Count how often each morpheme surface form occurs.

    Args:
        sentences: Iterable of sentences, each an iterable of morpheme
            mappings that provide a ``'surface'`` key.

    Returns:
        A ``defaultdict`` mapping each surface form to its number of
        occurrences; looking up a missing key yields 0.
    """
    count = defaultdict(int)
    surfaces = (token['surface'] for tokens in sentences for token in tokens)
    for surface in surfaces:
        count[surface] += 1
    return count


if __name__ == '__main__':
    path = 'neko.txt.mecab'
    totals = morpheme_count(load_mecab(path))
    # Most frequent first; the stable sort keeps ties in insertion order.
    by_frequency = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    for morpheme, count in by_frequency:
        print(f'{morpheme} {count}')
def word_counter(num):
    """Return the ``num`` most frequent morpheme surface forms.

    Args:
        num: Forwarded to ``Counter.most_common``.

    Returns:
        List of ``(surface, count)`` pairs as produced by
        ``Counter.most_common(num)``.
    """
    surfaces = (
        token['surface']
        for sentence in load_mecab()
        for token in sentence
    )
    return Counter(surfaces).most_common(num)
# -*- coding: utf-8 -*-
from knock30 import load_mecab
from knock36 import morpheme_count
import matplotlib.pyplot as plt


def histogram(data):
    """Display a histogram of the count values held in ``data``.

    Args:
        data: Mapping from morpheme to count; only the values are plotted.
    """
    counts = list(data.values())
    # 20 bins spanning the range 1 to 50.
    plt.hist(counts, bins=20, range=(1, 50))
    plt.show()


if __name__ == '__main__':
    path = 'neko.txt.mecab'
    frequencies = morpheme_count(load_mecab(path))
    histogram(frequencies)