def model_training(): bangumis = bangumi_dao.find_all_bangumis() tsc_docs = TscTaggedDocument(bangumis) model = Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=1, iter=10, workers=multiprocessing.cpu_count()) print 'Building vocabulary......' model.build_vocab(tsc_docs) print 'Training doc2vec model......' model.train(tsc_docs, total_examples=model.corpus_count, epochs=model.iter) print 'Vocabulary size:', len(model.wv.vocab) model.save("intro_doc2vec_200.model")
def model_training(): bangumis = bangumi_dao.find_all_bangumis() bangumi_dict = dict() for bangumi in bangumis: print 'Collecting bangumi ' + str(bangumi.season_id) words = [] episodes = episode_dao.find_episodes_by_bangumi(bangumi.season_id) for episode in episodes: danmakus = danmaku_dao.find_danmakus_by_episode(episode.episode_id) for danmku in danmakus: words.extend(preprocess_util.word_segment(danmku.content)) bangumi_dict[str(bangumi.season_id)] = words tsc_docs = TscTaggedDocument(bangumi_dict) model = Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=10, iter=10, workers=multiprocessing.cpu_count()) print 'Building vocabulary......' model.build_vocab(tsc_docs) print 'Training doc2vec model......' model.train(tsc_docs, total_examples=model.corpus_count, epochs=model.iter) print 'Vocabulary size:', len(model.wv.vocab) model.save("content_doc2vec_200.model")
def get_bangumis():
    """Return the set of season ids of all bangumis known to the DAO.

    Returns:
        set: distinct ``season_id`` values from ``bangumi_dao.find_all_bangumis()``.
    """
    # Set comprehension replaces the manual loop-and-add construction.
    return {bangumi.season_id for bangumi in bangumi_dao.find_all_bangumis()}