import multiprocessing

from gensim.models.doc2vec import Doc2Vec

# bangumi_dao and TscTaggedDocument are project-local helpers, assumed to be
# imported elsewhere in this module.
def model_training():
    bangumis = bangumi_dao.find_all_bangumis()
    tsc_docs = TscTaggedDocument(bangumis)
    # DBOW doc2vec (dm=0) that also trains word vectors (dbow_words=1).
    # size/iter are the gensim 3.x parameter names (vector_size/epochs in 4.x).
    model = Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=1,
                    iter=10, workers=multiprocessing.cpu_count())
    print('Building vocabulary......')
    model.build_vocab(tsc_docs)
    print('Training doc2vec model......')
    model.train(tsc_docs, total_examples=model.corpus_count, epochs=model.iter)
    print('Vocabulary size:', len(model.wv.vocab))
    model.save("intro_doc2vec_200.model")
Example #2
import multiprocessing

from gensim.models.doc2vec import Doc2Vec

# bangumi_dao, episode_dao, danmaku_dao, preprocess_util and TscTaggedDocument
# are project-local helpers, assumed to be imported elsewhere in this module.
def model_training():
    bangumis = bangumi_dao.find_all_bangumis()
    bangumi_dict = dict()
    for bangumi in bangumis:
        print('Collecting bangumi ' + str(bangumi.season_id))
        words = []
        episodes = episode_dao.find_episodes_by_bangumi(bangumi.season_id)
        for episode in episodes:
            danmakus = danmaku_dao.find_danmakus_by_episode(episode.episode_id)
            for danmaku in danmakus:
                # Segment each danmaku (bullet comment) into words and pool
                # them into a single document per bangumi.
                words.extend(preprocess_util.word_segment(danmaku.content))
        bangumi_dict[str(bangumi.season_id)] = words
    tsc_docs = TscTaggedDocument(bangumi_dict)
    # DBOW doc2vec (dm=0) with word-vector training (dbow_words=1); min_count=10
    # drops rare tokens. size/iter are the gensim 3.x parameter names.
    model = Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=10,
                    iter=10, workers=multiprocessing.cpu_count())
    print('Building vocabulary......')
    model.build_vocab(tsc_docs)
    print('Training doc2vec model......')
    model.train(tsc_docs, total_examples=model.corpus_count, epochs=model.iter)
    print('Vocabulary size:', len(model.wv.vocab))
    model.save("content_doc2vec_200.model")
def get_bangumis():
    # Collect the distinct season ids of every bangumi in the database.
    bangumis = bangumi_dao.find_all_bangumis()
    return set(bangumi.season_id for bangumi in bangumis)
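# Hypothetical downstream use: embed an unseen danmaku stream with
# infer_vector() and find the closest bangumi. preprocess_util.word_segment is
# the project-local segmenter used during training; the query text below is
# made up for illustration.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("content_doc2vec_200.model")
new_words = preprocess_util.word_segment(u'前方高能 泪目 名场面')
vec = model.infer_vector(new_words)
print(model.docvecs.most_similar([vec], topn=3))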