def train_planner():
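    # Train a Word2Vec model on the tab-separated keyword plans and save it for the planner.
    # Assumes module-level names from the surrounding project: os, gensim `models`,
    # save_dir, plan_data_path, _plan_model_path, check_uptodate(), gen_train_data().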
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    word_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin.readlines():
            word_lists.append(line.strip().split('\t'))
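    # gensim < 4.0 API: the `size` argument was renamed to `vector_size` in gensim 4.x.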
    model = models.Word2Vec(word_lists, size=512, min_count=5)
    model.save(_plan_model_path)
def train_planner():
    """利用gensim,将提取的关键词向量化"""
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    keywords_list = []  # format: [['keyword1', 'keyword2', 'keyword3', 'keyword4'], ...]
    with open(plan_data_path, 'r') as infile:
        for line in infile.readlines():
            keywords_list.append(line.strip().split('\t'))
    # Train keyword embeddings with word2vec
    model = models.Word2Vec(keywords_list, size=512, window=4, min_count=1)
    model.save(_plan_model_path)
def train_planner():
    # TODO: try other keyword-expansion models.
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    word_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin.readlines():
            word_lists.append(line.strip().split('\t'))

    # model = models.FastText(word_lists, size = 512, min_count = 5)
    model = models.Word2Vec(word_lists, size=300, min_count=3)
    # print(model.wv.vocab)
    model.save(_plan_model_path)
def main(_):
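    # Entry point: run caption inference on the test set when --eval is given,
    # otherwise preprocess (or reload) the data and train the captioning model.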
    print("\nParameters: ")
    for k, v in sorted(FLAGS.__flags.items()):
        print("{} = {}".format(k, v))

    if not os.path.exists("./prepro/"):
        os.makedirs("./prepro/")

    if FLAGS.eval:
        print("Evaluation...")
        feats, test_id = data_utils.load_test_data(FLAGS.test_id,
                                                   FLAGS.test_dir)
        vocab_processor = VocabularyProcessor.restore(FLAGS.vocab)

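        # Let TensorFlow fall back to CPU when no GPU kernel exists and grow GPU memory on demand.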
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            model = load_model(sess, FLAGS.checkpoint_file, vocab_processor)
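            # Decode captions greedily; the commented beam_search call is the alternative decoder.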
            sentences = greedy_inference(sess, model, feats, vocab_processor)
            # sentences = beam_search(sess, model, feats, vocab_processor)
            ans = []
            for idx, sentence in enumerate(sentences):
                ans.append({"caption": sentence, "id": test_id[idx]})
            with open(FLAGS.output, 'w') as fout:
                json.dump(ans, fout)

    else:
        if FLAGS.prepro:
            print("Start preprocessing data...")
            vocab_processor, train_dict = data_utils.load_text_data(
                train_lab=FLAGS.train_lab,
                prepro_train_p=FLAGS.prepro_train,
                vocab_path=FLAGS.vocab)
            print("Vocabulary size: {}".format(
                len(vocab_processor._reverse_mapping)))

            print("Start dumping word2vec matrix...")
            w2v_W = data_utils.build_w2v_matrix(vocab_processor,
                                                FLAGS.w2v_data,
                                                FLAGS.vector_file,
                                                FLAGS.embedding_dim)

        else:
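            # Reload previously preprocessed data instead of rebuilding it.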
            train_dict = cPickle.load(open(FLAGS.prepro_train, 'rb'))
            vocab_processor = VocabularyProcessor.restore(FLAGS.vocab)
            w2v_W = cPickle.load(open(FLAGS.w2v_data, 'rb'))

        print("Start generating training data...")
        feats, encoder_in_idx, decoder_in = data_utils.gen_train_data(
            FLAGS.train_dir, FLAGS.train_lab, train_dict)
        print("Start generating validation data...")
        v_encoder_in, truth_captions = data_utils.load_valid(
            FLAGS.valid_dir, FLAGS.valid_lab)

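        # Optional extra task inputs, only loaded when a task directory is supplied.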
        t_encoder_in = None
        files = None
        if FLAGS.task_dir is not None:
            t_encoder_in, files = data_utils.load_task(FLAGS.task_dir)

        print('feats size: {}, training size: {}'.format(
            len(feats), len(encoder_in_idx)))
        print(encoder_in_idx.shape, decoder_in.shape)
        print(v_encoder_in.shape, len(truth_captions))

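        # Bundle features, training pairs, validation data, and optional task inputs for the model.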
        data = Data(feats, encoder_in_idx, decoder_in, v_encoder_in,
                    truth_captions, t_encoder_in, files)

        model = CapGenModel(data, w2v_W, vocab_processor)

        model.build_model()

        model.train()