Example #1
0
def bow(model_name):
    """
    Build a bag-of-words Dictionary from the prepared documents and save it.

    model_name: file name the trained Dictionary is saved under (in model/).
    """
    print("prepare data.")
    # Data preparation runs inside the data/ directory.
    os.chdir("data")
    set_data(mode="doc")
    docs = bow_read_docs(folder_name="tmp_file")

    print("train model.")
    dictionary = Dictionary(docs)

    print("save model.")
    # Return to the project root before writing into model/.
    os.chdir("..")
    dictionary.save("model/%s" % model_name)
Example #2
0
def tfidf(model_name):
    """
    Train a tf-idf model over the prepared documents and save it.

    model_name: file name the trained TfidfModel is saved under (in model/).
    """
    print("prepare data.")
    # Data preparation runs inside the data/ directory.
    os.chdir("data")
    set_data(mode="doc")
    docs = bow_read_docs(folder_name="tmp_file")

    dictionary = Dictionary(docs)
    # Optional pruning (disabled): drop words appearing in fewer than 20
    # documents or in 30% or more of all documents.
    # dictionary.filter_extremes(no_below=20, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    print("train model.")
    tfidf_model = TfidfModel(corpus)

    print("save model.")
    # Return to the project root before writing into model/.
    os.chdir("..")
    tfidf_model.save("model/%s" % model_name)
Example #3
0
def d2v(model_name, iter_count):
    """
    Train a Doc2Vec model on the prepared documents and save it.

    model_name: file name the trained model is saved under (in model/).
    iter_count: number of training epochs passed to Doc2Vec.
    """
    print("prepare data.")
    # Data preparation runs inside the data/ directory.
    os.chdir("data")
    set_data(mode="doc")
    tagged_docs = list(d2v_read_docs(folder_name="tmp_file", tokens_only=False))

    print("train model.")
    # Per the gensim docs, fixing the seed only yields reproducible
    # results when workers=1.
    d2v_model = doc2vec.Doc2Vec(min_count=1, seed=1, workers=1, iter=iter_count)
    d2v_model.build_vocab(tagged_docs)
    d2v_model.train(tagged_docs,
                    total_examples=d2v_model.corpus_count,
                    epochs=d2v_model.iter)

    print("save model.")
    # Return to the project root before writing into model/.
    os.chdir("..")
    d2v_model.save("model/%s" % model_name)
Example #4
0
def w2v(model_name, iter_count):
    """
    Train a Word2Vec model on the prepared corpus and save it.

    model_name: file name the trained model is saved under (in model/).
    iter_count: number of training epochs passed to Word2Vec.
    """
    print("prepare data.")
    # Data preparation runs inside the data/ directory.
    os.chdir("data")
    set_data(mode="word")
    corpus_path = "tmp.txt"
    line_sentences = word2vec.LineSentence(corpus_path)

    print("train model.")
    # Per the gensim docs, fixing the seed only yields reproducible
    # results when workers=1.
    w2v_model = word2vec.Word2Vec(line_sentences,
                                  min_count=1,
                                  seed=1,
                                  workers=1,
                                  iter=iter_count)

    print("save model.")
    # Return to the project root before writing into model/.
    os.chdir("..")
    w2v_model.save("model/%s" % model_name)
Example #5
0
def ft(model_name, iter_count):
    """
    Train a FastText model on the prepared corpus and save it.

    model_name: file name the trained model is saved under (in model/).
    iter_count: number of training epochs passed to FastText.
    """
    print("prepare data.")
    # Data preparation runs inside the data/ directory.
    os.chdir("data")
    set_data(mode="word")
    corpus_file = "tmp.txt"
    # Use a context manager so the corpus file is always closed; the
    # original opened it and never called close(), leaking the handle.
    with open(corpus_file, "r", encoding="utf-8") as f:
        text = f.read()
    sentences = [s.split(" ") for s in text.split("\n")]

    print("train model.")
    # Per the gensim docs, fixing the seed only yields reproducible
    # results when workers=1.
    model = fasttext.FastText(min_count=1, seed=1, workers=1, iter=iter_count)
    model.build_vocab(sentences)
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=model.iter)

    print("save model.")
    # Return to the project root before writing into model/.
    os.chdir("..")
    model.save("model/%s" % model_name)