def bow(model_name):
    """Build a bag-of-words Dictionary from the prepared documents.

    The resulting gensim Dictionary is saved to model/<model_name>,
    relative to the directory this script is launched from.
    """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="doc")
    docs = bow_read_docs(folder_name="tmp_file")
    print("train model.")
    dictionary = Dictionary(docs)
    print("save model.")
    os.chdir("..")
    dictionary.save("model/%s" % model_name)
def tfidf(model_name):
    """Train a tf-idf model over the prepared documents.

    Builds a Dictionary, converts each document to its bag-of-words
    representation, fits a TfidfModel on that corpus, and saves the
    model to model/<model_name>.
    """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="doc")
    docs = bow_read_docs(folder_name="tmp_file")
    vocab = Dictionary(docs)
    # Optionally drop words that appear in fewer than 20 documents
    # or in 30% or more of all documents:
    # vocab.filter_extremes(no_below=20, no_above=0.3)
    corpus = [vocab.doc2bow(doc) for doc in docs]
    print("train model.")
    tfidf_model = TfidfModel(corpus)
    print("save model.")
    os.chdir("..")
    tfidf_model.save("model/%s" % model_name)
def d2v(model_name, iter_count):
    """Train a doc2vec model for iter_count epochs and save it.

    The trained model is written to model/<model_name>.
    """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="doc")
    tagged_docs = list(d2v_read_docs(folder_name="tmp_file", tokens_only=False))
    print("train model.")
    # Per the gensim documentation, fixing the seed only yields
    # reproducible runs when workers=1 (a single training thread).
    model = doc2vec.Doc2Vec(min_count=1, seed=1, workers=1, iter=iter_count)
    model.build_vocab(tagged_docs)
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.iter)
    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)
def w2v(model_name, iter_count):
    """Train a word2vec model for iter_count epochs and save it.

    Reads one whitespace-tokenized sentence per line from data/tmp.txt
    and writes the trained model to model/<model_name>.
    """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="word")
    line_sentences = word2vec.LineSentence("tmp.txt")
    print("train model.")
    # Per the gensim documentation, fixing the seed only yields
    # reproducible runs when workers=1 (a single training thread).
    model = word2vec.Word2Vec(
        line_sentences, min_count=1, seed=1, workers=1, iter=iter_count
    )
    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)
def ft(model_name, iter_count):
    """Train a fastText model for iter_count epochs and save it.

    Reads one whitespace-tokenized sentence per line from data/tmp.txt
    and writes the trained model to model/<model_name>.
    """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="word")
    corpus_file = "tmp.txt"
    # Use a context manager so the corpus file is always closed
    # (the original opened it and never closed the handle).
    with open(corpus_file, "r", encoding="utf-8") as f:
        text = f.read()
    sentences = [s.split(" ") for s in text.split("\n")]
    print("train model.")
    # Per the gensim documentation, fixing the seed only yields
    # reproducible runs when workers=1 (a single training thread).
    model = fasttext.FastText(min_count=1, seed=1, workers=1, iter=iter_count)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)