# Script entry point: read an HTML sample, clean it, tokenize it (Janome),
# then normalize/lemmatize each token.
# Relies on project-local modules TextCleaner, Tokenizer, TextNormalizer,
# which are imported elsewhere in this file (not visible in this chunk).
if __name__ == "__main__":
    # Read the raw HTML source. NOTE(review): file handle is opened/closed
    # manually; a `with` block would be safer, left unchanged here.
    fileobj = open("source/sample.html", "r", encoding="utf_8")
    text = fileobj.read()
    fileobj.close()

    # Cleaning to make morphological analysis easier
    # (strip header, HTML/JS tags, URLs, code blocks, then general text cleanup).
    tcleaner = TextCleaner.TextCleaner()
    text = tcleaner.remove_header(text)
    text = tcleaner.clean_html_and_js_tags(text)
    text = tcleaner.clean_url(text)
    text = tcleaner.clean_code(text)
    text = tcleaner.clean_text(text)
    # Presumably writes/prints the cleaned text for inspection — confirm in TextCleaner.
    tcleaner.output(text)

    # Tokenize the cleaned text with Janome ("wakati" = word segmentation).
    tokenizer = Tokenizer.JanomeTokenizer()
    words = tokenizer.wakati(text)
    # Alternative kept for reference: keep only nouns ('名詞' = noun POS tag).
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    tokenizer.output(words)

    # Alternative backend kept for reference: MeCab instead of Janome.
    #MeCab
    #tokenizer = Tokenizer.MeCabTokenizer()
    #words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    #tokenizer.output(words)

    # Normalize each token, then lemmatize as a verb (pos='v').
    tnormalizer = TextNormalizer.TextNormalizer()
    nwords = []
    for w in words:
        nw = tnormalizer.normalize(w)
        # NOTE(review): `nwords` is never appended to in the visible portion of
        # this chunk — the loop body presumably continues past this view
        # (e.g. with `nwords.append(nw)`); confirm against the full file.
        nw = tnormalizer.lemmatize_term(nw, pos='v')