def create_gensim_dictionary(data_path, no_below=2, no_above=0.1):
    """Tokenize the files directly under ``data_path`` and build a gensim dictionary.

    Each file is read line by line. Spaces, newlines, ``#`` and ``@`` are
    removed from every line, the remainder is analysed with Jumanpp, and the
    surface form (``midasi``) of each morpheme longer than one character is
    collected.

    Args:
        data_path: Directory containing the documents. Only files located
            directly in this directory are read; subdirectories are ignored.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.

    Returns:
        A tuple ``(docs, docs_title, dictionary)`` where ``docs`` maps each
        file name to its list of words, ``docs_title`` maps each file name to
        the file's first line (unmodified, so it keeps any trailing newline;
        an empty string for an empty file), and ``dictionary`` is the
        filtered ``gensim.corpora.Dictionary`` built from ``docs``.
    """
    print("# morphological analysis")

    # Documents are keyed by bare file name and opened relative to data_path,
    # so only the top-level listing is meaningful; take the first os.walk
    # entry and fall back to "no files" when the directory cannot be walked.
    _, _, files = next(os.walk(data_path), (data_path, [], []))

    # Reuse one analyzer instead of constructing a new Jumanpp for every line.
    analyzer = Jumanpp()
    # Single-pass removal of the characters the analysis should not see.
    strip_table = str.maketrans("", "", " \n#@")

    docs = {}
    docs_title = {}
    for docname in files:
        # NOTE(review): the file is opened with the platform default
        # encoding, as before - confirm the corpus encoding matches.
        with open(os.path.join(data_path, docname), "r") as f:
            lines = f.readlines()

        # An empty file has no first line; record an empty title rather than
        # raising IndexError.
        docs_title[docname] = lines[0] if lines else ""

        words = []
        for text in lines:
            cleaned = text.translate(strip_table)
            if not cleaned:
                continue
            result = analyzer.analysis(cleaned)
            words.extend(
                mrph.midasi
                for mrph in result.mrph_list()
                if len(mrph.midasi) > 1
            )
        docs[docname] = words

    dictionary = gensim.corpora.Dictionary(docs.values())
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    return docs, docs_title, dictionary
def split_into_words(text):
    """Convert an article into a list of words.

    Runs Jumanpp over ``text`` and returns the ``midasi`` of every morpheme
    in the order given by ``mrph_list()``.
    """
    analysed = Jumanpp().analysis(text)
    words = []
    for morpheme in analysed.mrph_list():
        words.append(morpheme.midasi)
    return words
# NOTE(review): this redefines split_into_words, which is already defined
# just above with the same body; consider keeping only one definition.
def split_into_words(text):
    """Return the ``midasi`` of each morpheme Jumanpp finds in ``text``."""
    tokens = []
    parsed = Jumanpp().analysis(text)
    for item in parsed.mrph_list():
        tokens.append(item.midasi)
    return tokens
def morphological_analysis(doc):
    """Analyse ``doc`` with Jumanpp and return its morphemes' ``midasi`` values.

    The result is a list in the order produced by ``mrph_list()``.
    """
    analysis_result = Jumanpp().analysis(doc)
    surface_forms = []
    for morpheme in analysis_result.mrph_list():
        surface_forms.append(morpheme.midasi)
    return surface_forms