Example #1
def validate(model,
             model_type,
             dataloader,
             device,
             w2i,
             i2w,
             data_mode,
             max_length=15,
             beam_size=3):
    gts_dict = {}
    hyps_dict = {}
    bad_count = 0
    for i, (image, texts) in tqdm(enumerate(dataloader),
                                  total=len(dataloader)):
        hyps = beam_search(model, model_type, image, w2i, i2w, device,
                           max_length, beam_size, data_mode)
        if len(hyps) == 0:
            bad_count += 1
            continue
        hyp = hyps[0][1:]
        if hyp[-1] == w2i['<END>']:
            hyp = hyp[:-1]
        hyp = ' '.join([i2w[word] for word in hyp.tolist()])
        gts_dict[i] = texts[0]
        # Temporary
        hyps_dict[i] = [hyp]
    if not gts_dict:
        print('Bad validation')
        return {}
    print('Bad hypothesis count: ', bad_count)
    return evaluation.compute_scores(gts_dict, hyps_dict)
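The hypothesis post-processing in validate() (drop the leading start token, drop a trailing <END>, then map ids back to words) can be seen in isolation. A minimal standalone sketch with a toy vocabulary; every name here is invented for the demo:

import torch

w2i = {'<START>': 0, '<END>': 1, 'a': 2, 'cat': 3}
i2w = {i: w for w, i in w2i.items()}

hyp = torch.tensor([0, 2, 3, 1])   # <START> a cat <END>
hyp = hyp[1:]                      # drop the start token (hyps[0][1:] above)
if hyp[-1] == w2i['<END>']:
    hyp = hyp[:-1]                 # drop the trailing <END>
print(' '.join(i2w[word] for word in hyp.tolist()))  # -> "a cat"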
Example #2
def predict_captions(model, dataloader, text_field):
    import itertools

    model.eval()
    gen = {}
    gts = {}
    with tqdm(desc="Evaluation", unit="it", total=len(dataloader)) as pbar:
        for it, (images, caps_gt) in enumerate(iter(dataloader)):
            images = images.to(device)
            with torch.no_grad():
                out, _ = model.beam_search(images,
                                           20,
                                           text_field.vocab.stoi["<eos>"],
                                           5,
                                           out_size=1)
            caps_gen = text_field.decode(out, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = " ".join([k for k, g in itertools.groupby(gen_i)])
                gen["%d_%d" % (it, i)] = [
                    gen_i.strip(),
                ]
                gts["%d_%d" % (it, i)] = gts_i
            pbar.update()

    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen)

    return scores
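The ' '.join(k for k, g in itertools.groupby(gen_i)) idiom that recurs in these examples collapses consecutive repeated tokens, which beam search occasionally emits. A self-contained illustration:

import itertools

gen_i = ['a', 'man', 'man', 'riding', 'a', 'a', 'horse']
print(' '.join(k for k, g in itertools.groupby(gen_i)))
# -> "a man riding a horse"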
Example #3
def evaluate_metrics(model, dataloader, text_field):
    import itertools
    model.eval()
    gen = {}
    gts = {}
    # NOTE: `e` is the epoch counter defined in the enclosing training script
    with tqdm(desc='Epoch %d - evaluation' % e,
              unit='it',
              total=len(dataloader)) as pbar:
        for it, (images, caps_gt) in enumerate(iter(dataloader)):
            images = images.to(device)
            with torch.no_grad():
                out, _ = model.beam_search(images,
                                           20,
                                           text_field.vocab.stoi['<eos>'],
                                           5,
                                           out_size=1)
            caps_gen = text_field.decode(out, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                gen['%d_%d' % (it, i)] = [
                    gen_i,
                ]
                gts['%d_%d' % (it, i)] = gts_i
            pbar.update()

    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen)
    return scores
Example #4
def evaluate_metrics(model, dataloader, word_to_id, idxs_word):
    import itertools
    model.eval()
    save_out = {}
    gen = {}
    gts = {}
    with tqdm(desc=' - evaluation', unit='it', total=len(dataloader)) as pbar:
        for it, (feats, caps_gt, vid_id) in enumerate(iter(dataloader)):
            feats = feats.to(device)
            with torch.no_grad():
                out, _ = model.beam_search(feats,
                                           26,
                                           word_to_id['<EOS>'],
                                           5,
                                           out_size=1)
            caps_gen = decode(out, idxs_word, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                d = {vid_id[i]: gen_i}
                save_out.update(d)
                gen['%d_%d' % (it, i)] = [
                    gen_i,
                ]
                gts['%d_%d' % (it, i)] = gts_i
            pbar.update()
    # fi = open("/home/leiyu/workspace/en_captions_%s_test.json" % args.exp_name,'w')
    # json.dump(save_out,fi)
    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen)
    return scores
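The decode() helper called above is not part of the excerpt. A plausible minimal version, inferred from the call site (an assumption, not the project's actual code):

def decode(out, idxs_word, join_words=True, eos_id=None):
    # Map a (batch, seq_len) tensor of token ids back to words,
    # optionally stopping at an end-of-sequence id.
    captions = []
    for seq in out.tolist():
        words = []
        for idx in seq:
            if eos_id is not None and idx == eos_id:
                break
            words.append(idxs_word[idx])
        captions.append(' '.join(words) if join_words else words)
    return captions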
Example #5
def predict_captions(model, dataloader, text_field, cider, args):
    import itertools
    tokenizer_pool = multiprocessing.Pool()
    res = {}
    model.eval()
    gen = {}
    gts = {}
    with tqdm(desc='Evaluation', unit='it', total=len(dataloader)) as pbar:
        for it, ((detections, boxes, grids, masks),
                 caps_gt) in enumerate(iter(dataloader)):
            detections = detections.to(device)
            boxes = boxes.to(device)
            grids = grids.to(device)
            masks = masks.to(device)
            with torch.no_grad():
                out, _ = model.beam_search(detections,
                                           20,
                                           text_field.vocab.stoi['<eos>'],
                                           args.beam_size,
                                           out_size=1,
                                           **{
                                               'boxes': boxes,
                                               'grids': grids,
                                               'masks': masks
                                           })
            caps_gen = text_field.decode(out, join_words=False)
            caps_gen1 = text_field.decode(out)
            caps_gt1 = list(caps_gt)  # identical to the original chain(*([c] * 1 ...)) expansion

            caps_gen1, caps_gt1 = tokenizer_pool.map(
                evaluation.PTBTokenizer.tokenize, [caps_gen1, caps_gt1])
            reward = cider.compute_score(caps_gt1,
                                         caps_gen1)[1].astype(np.float32)
            # reward = reward.mean().item()

            # caps_gt1/caps_gen1 are dicts after tokenization, so zip yields their keys
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt1, caps_gen1)):
                res[len(res)] = {
                    'gt': caps_gt1[gts_i],
                    'gen': caps_gen1[gen_i],
                    'cider': reward[i].item(),
                }

            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                gen['%d_%d' % (it, i)] = [
                    gen_i.strip(),
                ]
                gts['%d_%d' % (it, i)] = gts_i
            pbar.update()

    tokenizer_pool.close()
    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen, spice=args.spice)
    if not args.only_test:
        json.dump(res, open(args.dump_json, 'w'))
    return scores
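predict_captions() expects an args object carrying at least the fields used above. A minimal stand-in for experimenting with the function; the real project presumably builds this with argparse:

from types import SimpleNamespace

args = SimpleNamespace(beam_size=5,
                       spice=False,          # forwarded to evaluation.compute_scores
                       only_test=True,       # True skips dumping per-sample results
                       dump_json='eval_results.json')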
Example #6
def predict_captions(model, dataloader, text_field, emotion_encoder=None):
    import itertools
    if emotion_encoder is not None:
        emotion_encoder.eval()
    model.eval()
    gen = {}
    gts = {}
    with tqdm(desc='Evaluation', unit='it', total=len(dataloader)) as pbar:

        for it, (images, caps_emo_pair) in enumerate(iter(dataloader)):
            images = images.to(device)
            caps_gt, emotions = caps_emo_pair
            if emotion_encoder is not None:
                emotions = torch.stack(
                    [torch.mode(emotion).values for emotion in emotions])
                emotions = F.one_hot(emotions, num_classes=9)
                emotions = emotions.type(torch.FloatTensor)
                emotions = emotions.to(device)
                enc_emotions = emotion_encoder(emotions)
                enc_emotions = enc_emotions.unsqueeze(1).repeat(
                    1, images.shape[1], 1)
                images = torch.cat([images, enc_emotions], dim=-1)

            with torch.no_grad():
                out, _ = model.beam_search(images,
                                           20,
                                           text_field.vocab.stoi['<eos>'],
                                           5,
                                           out_size=1)
            caps_gen = text_field.decode(out, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                gen['%d_%d' % (it, i)] = [
                    gen_i.strip(),
                ]
                gts['%d_%d' % (it, i)] = gts_i
            pbar.update()

    store_dict = {'gen': gen, 'gts': gts}
    with open('test_results.pickle', 'wb') as f:
        pickle.dump(store_dict, f)

    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen)

    return scores
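The emotion branch takes a majority vote over each caption's annotator labels with torch.mode, then one-hot encodes the result. The same computation on dummy data:

import torch
import torch.nn.functional as F

emotions = [torch.tensor([2, 2, 5]), torch.tensor([7, 1, 1])]
majority = torch.stack([torch.mode(e).values for e in emotions])  # tensor([2, 1])
one_hot = F.one_hot(majority, num_classes=9).float()              # shape (2, 9)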
Example #7
def evaluate_metrics(model, dataloader, text_field, emotion_encoder=None):
    import itertools
    model.eval()
    if emotion_encoder is not None:
        emotion_encoder.eval()
    gen = {}
    gts = {}
    # NOTE: `e` is the epoch counter defined in the enclosing training script
    with tqdm(desc='Epoch %d - evaluation' % e,
              unit='it',
              total=len(dataloader)) as pbar:
        for it, (images, caps_emos) in enumerate(iter(dataloader)):
            images = images.to(device)
            caps_gt, emotions = caps_emos
            if emotion_encoder is not None:
                emotions = torch.stack([
                    torch.mode(emotion).values for emotion in emotions
                ])  # pick the most frequent emotion
                emotions = F.one_hot(emotions, num_classes=9)
                emotions = emotions.type(torch.FloatTensor)
                emotions = emotions.to(device)
                enc_emotions = emotion_encoder(emotions)
                enc_emotions = enc_emotions.unsqueeze(1).repeat(
                    1, images.shape[1], 1)
                images = torch.cat([images, enc_emotions], dim=-1)

            with torch.no_grad():
                out, _ = model.beam_search(images,
                                           20,
                                           text_field.vocab.stoi['<eos>'],
                                           5,
                                           out_size=1)
            caps_gen = text_field.decode(out, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                gen['%d_%d' % (it, i)] = [
                    gen_i,
                ]
                gts['%d_%d' % (it, i)] = gts_i
            pbar.update()

    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen)
    return scores
Example #8
def evaluate_metrics(model,
                     dataloader,
                     text_field,
                     max_len=20,
                     beam_size=5,
                     eval_beam_size=1):
    import itertools
    model.eval()
    gen = {}
    gts = {}
    # NOTE: `e` is the epoch counter defined in the enclosing training script
    with tqdm(desc='Epoch %d - evaluation' % e,
              unit='it',
              total=len(dataloader)) as pbar:
        for it, (images, caps_gt) in enumerate(iter(dataloader)):
            images = images.to(device)
            with torch.no_grad():
                out, _ = model.beam_search(images,
                                           max_len,
                                           text_field.vocab.stoi['<end>'],
                                           beam_size,
                                           out_size=1)
            caps_gen = text_field.decode(out, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                gen['%d_%d' % (it, i)] = [
                    gen_i,
                ]
                gts['%d_%d' % (it, i)] = gts_i
            pbar.update()

    gts = my_tokenize(gts)  #evaluation.PTBTokenizer.tokenize(gts)
    gen = my_tokenize(gen)  #evaluation.PTBTokenizer.tokenize(gen)
    gts = {k: list(map(rm_caption_special_tokens, v)) for k, v in gts.items()}
    gen = {k: list(map(rm_caption_special_tokens, v)) for k, v in gen.items()}
    scores, _ = evaluation.compute_scores(gts, gen)
    return scores
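my_tokenize and rm_caption_special_tokens are project helpers not shown in the excerpt. Plausible minimal versions, inferred from how they are called (assumptions, not the project's actual implementations):

def my_tokenize(caps):
    # Stand-in for evaluation.PTBTokenizer.tokenize: lowercase each caption,
    # keeping the {key: [caption, ...]} layout expected by compute_scores.
    return {k: [c.lower() for c in v] for k, v in caps.items()}

def rm_caption_special_tokens(cap, specials=('<start>', '<end>', '<pad>', '<unk>')):
    # Drop special tokens from a single caption string.
    return ' '.join(w for w in cap.split() if w not in specials)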
Example #9
def test():
    ## arguments
    train_path = sys.argv[1]
    test_path = sys.argv[2]
    predict_path = sys.argv[3]
    model_name = sys.argv[4]
    char_embed_path = sys.argv[5]
    word_embed_path = sys.argv[6]
    pos_embed_path = sys.argv[7]
    dict_path = sys.argv[8]

    train_rate = 0.9
    max_char_ctx_len = 1160
    max_word_ctx_len = 680

    char_ctx_len = 1160
    char_qus_len = 240

    word_ctx_len = 400
    word_qus_len = 40

    word_char_len = 5

    char_embed_size = 128
    word_embed_size = 128
    pos_embed_size = 32
    hidden_size = 64
    model_size = 64

    max_epochs = 50
    batch_size = 8

    lr = 0.001
    drop_rate = 0.5
    recur_drop_rate = 0.0
    patience = 20

    ## load data
    print("load data")
    st = time.time()
    train_raw_data = data_utils.load_json_data(train_path)
    test_raw_data = data_utils.load_json_data(test_path)
    #    # load pos data
    #    train_gen_pos_data = data_utils.load_json_data(train_pos_path)
    #    test_gen_pos_data = data_utils.load_json_data(test_pos_path)
    # load embedding
    char_embedding = word2vec.Word2Vec.load(char_embed_path)
    word_embedding = word2vec.Word2Vec.load(word_embed_path)
    pos_embedding = word2vec.Word2Vec.load(pos_embed_path)
    et = time.time()
    print("cost time:", et - st)

    ## process data
    print("process data")
    st = time.time()
    train_data = data_utils.make_train_data(
        train_raw_data
    )  # data format: (id, context, question, answer_start, answer_end)
    test_data = data_utils.make_test_data(
        test_raw_data)  # data format: (id, context, question)
    train_context = [data[1] for data in train_data]
    train_question = [data[2] for data in train_data]
    train_char_answer_start = [data[3] for data in train_data]
    train_char_answer_end = [data[4] for data in train_data]
    #    train_context_poss = [data['context'] for data in train_gen_pos_data['data']]
    #    train_question_poss = [data['question'] for data in train_gen_pos_data['data']]
    test_id = [data[0] for data in test_data]
    test_context = [data[1] for data in test_data]
    test_question = [data[2] for data in test_data]
    #    test_context_poss = [data['context'] for data in test_gen_pos_data['data']]
    #    test_question_poss = [data['question'] for data in test_gen_pos_data['data']]
    del train_data
    del test_data
    et = time.time()
    print("cost time:", et - st)

    ## load vocabulary
    print("load vocabulary")
    st = time.time()
    char_vocab = data_utils.load_json_data('model_%s_char_vocab.json' %
                                           model_name)
    word_vocab = data_utils.load_json_data('model_%s_word_vocab.json' %
                                           model_name)
    pos_vocab = data_utils.load_json_data('model_%s_pos_vocab.json' %
                                          model_name)
    #    poss = train_context_poss + train_question_poss + test_context_poss + test_question_poss
    #    pos_vocab, rev_pos_vocab = data_utils.build_vocabulary_with_embedding(poss, pos_embedding)
    char_vocab_size = len(char_vocab)
    word_vocab_size = len(word_vocab)
    pos_vocab_size = len(pos_vocab)
    et = time.time()
    print("char vocab size:", char_vocab_size)
    print("word vocab size:", word_vocab_size)
    print("pos vocab size:", pos_vocab_size)
    print("cost time:", et - st)

    ## tokenize data
    print("tokenize data")
    st = time.time()
    train_context_chars = data_utils.tokenize_to_chars(train_context)
    train_question_chars = data_utils.tokenize_to_chars(train_question)
    test_context_chars = data_utils.tokenize_to_chars(test_context)
    test_question_chars = data_utils.tokenize_to_chars(test_question)
    train_context_words = data_utils.tokenize_to_words(train_context,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_question_words = data_utils.tokenize_to_words(train_question,
                                                        init_dict=True,
                                                        dict_path=dict_path)
    test_context_words = data_utils.tokenize_to_words(test_context,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_question_words = data_utils.tokenize_to_words(test_question,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_context_poss = data_utils.tokenize_to_poss(train_context,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    train_question_poss = data_utils.tokenize_to_poss(train_question,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_context_poss = data_utils.tokenize_to_poss(test_context,
                                                    init_dict=True,
                                                    dict_path=dict_path)
    test_question_poss = data_utils.tokenize_to_poss(test_question,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    et = time.time()
    print("cost time:", et - st)

    ## select data
    # select the data whose sequence lengths satisfy the length constraints
    print("select data")
    st = time.time()
    select_indices = data_utils.select_data_by_lengths(train_context_words,
                                                       train_question_words,
                                                       word_ctx_len,
                                                       word_qus_len)
    train_context_chars = [train_context_chars[i] for i in select_indices]
    train_context_words = [train_context_words[i] for i in select_indices]
    train_context_poss = [train_context_poss[i] for i in select_indices]
    train_question_chars = [train_question_chars[i] for i in select_indices]
    train_question_words = [train_question_words[i] for i in select_indices]
    train_question_poss = [train_question_poss[i] for i in select_indices]
    train_char_answer_start = [
        train_char_answer_start[i] for i in select_indices
    ]
    train_char_answer_end = [train_char_answer_end[i] for i in select_indices]
    et = time.time()
    print("cost time:", et - st)

    ## set answer
    # this must be done after tokenizing the sentences into words
    print("set answer")
    st = time.time()
    train_word_answer_start, train_word_answer_end = data_utils.set_word_answer(
        train_context_words, train_char_answer_start, train_char_answer_end,
        word_ctx_len)
    train_answer_start, train_answer_end = train_word_answer_start, train_word_answer_end
    et = time.time()
    print("cost time:", et - st)

    ## pad data
    print("pad data")
    st = time.time()
    # clip words to chars
    # this must be done after building the vocabulary (which adds PAD)
    train_context_clip_chars = data_utils.clip_words_to_chars(
        train_context_words, word_char_len)
    train_question_clip_chars = data_utils.clip_words_to_chars(
        train_question_words, word_char_len)
    test_context_clip_chars = data_utils.clip_words_to_chars(
        test_context_words, word_char_len)
    test_question_clip_chars = data_utils.clip_words_to_chars(
        test_question_words, word_char_len)
    #    print("Debug: tarin_context_clip_chars[0]:")
    #    print(train_context_clip_chars[0])
    #    print("Debug: train_question_clip_chars[0]:")
    #    print(train_question_clip_chars[0])

    # padding
    train_context_pad_chars = data_utils.pad_sequences(
        train_context_clip_chars, word_ctx_len * word_char_len)
    train_question_pad_chars = data_utils.pad_sequences(
        train_question_clip_chars, word_qus_len * word_char_len)
    train_context_pad_words = data_utils.pad_sequences(train_context_words,
                                                       word_ctx_len)
    train_question_pad_words = data_utils.pad_sequences(
        train_question_words, word_qus_len)
    train_context_pad_poss = data_utils.pad_sequences(train_context_poss,
                                                      word_ctx_len)
    train_question_pad_poss = data_utils.pad_sequences(train_question_poss,
                                                       word_qus_len)
    test_context_pad_chars = data_utils.pad_sequences(
        test_context_clip_chars, word_ctx_len * word_char_len)
    test_question_pad_chars = data_utils.pad_sequences(
        test_question_clip_chars, word_qus_len * word_char_len)
    test_context_pad_words = data_utils.pad_sequences(test_context_words,
                                                      word_ctx_len)
    test_question_pad_words = data_utils.pad_sequences(test_question_words,
                                                       word_qus_len)
    test_context_pad_poss = data_utils.pad_sequences(test_context_poss,
                                                     word_ctx_len)
    test_question_pad_poss = data_utils.pad_sequences(test_question_poss,
                                                      word_qus_len)
    et = time.time()
    print("cost time:", et - st)
    ## make arrays
    print("make arrays")
    st = time.time()
    # map vocab to index
    #    print("Debug: train_context_pad_words[0]:")
    #    print(train_context_pad_words[0])
    #    print("Debug: train_question_pad_words[0]:")
    #    print(train_question_pad_words[0])
    train_context_char_indices = data_utils.map_vocabulary_index(
        train_context_pad_chars, char_vocab)
    train_question_char_indices = data_utils.map_vocabulary_index(
        train_question_pad_chars, char_vocab)
    train_context_word_indices = data_utils.map_vocabulary_index(
        train_context_pad_words, word_vocab)
    train_question_word_indices = data_utils.map_vocabulary_index(
        train_question_pad_words, word_vocab)
    train_context_pos_indices = data_utils.map_vocabulary_index(
        train_context_pad_poss, pos_vocab)
    train_question_pos_indices = data_utils.map_vocabulary_index(
        train_question_pad_poss, pos_vocab)
    test_context_char_indices = data_utils.map_vocabulary_index(
        test_context_pad_chars, char_vocab)
    test_question_char_indices = data_utils.map_vocabulary_index(
        test_question_pad_chars, char_vocab)
    test_context_word_indices = data_utils.map_vocabulary_index(
        test_context_pad_words, word_vocab)
    test_question_word_indices = data_utils.map_vocabulary_index(
        test_question_pad_words, word_vocab)
    test_context_pos_indices = data_utils.map_vocabulary_index(
        test_context_pad_poss, pos_vocab)
    test_question_pos_indices = data_utils.map_vocabulary_index(
        test_question_pad_poss, pos_vocab)
    # make one-hot label
    train_answer_start_onehot = data_utils.one_hot_encoding(
        train_answer_start, word_ctx_len)
    train_answer_end_onehot = data_utils.one_hot_encoding(
        train_answer_end, word_ctx_len)
    # to array
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start, Y2: answer_end
    train_X1 = np.array(train_context_char_indices, dtype=np.int32)
    train_X2 = np.array(train_context_word_indices, dtype=np.int32)
    train_X3 = np.array(train_context_pos_indices, dtype=np.int32)
    train_X4 = np.array(train_question_char_indices, dtype=np.int32)
    train_X5 = np.array(train_question_word_indices, dtype=np.int32)
    train_X6 = np.array(train_question_pos_indices, dtype=np.int32)
    train_Y1 = np.array(train_answer_start_onehot, dtype=np.int32)
    train_Y2 = np.array(train_answer_end_onehot, dtype=np.int32)
    train_word_ans1 = np.array(train_answer_start, dtype=np.int32)
    train_word_ans2 = np.array(train_answer_end, dtype=np.int32)
    train_ans1 = np.array(train_char_answer_start, dtype=np.int32)
    train_ans2 = np.array(train_char_answer_end, dtype=np.int32)
    test_X1 = np.array(test_context_char_indices, dtype=np.int32)
    test_X2 = np.array(test_context_word_indices, dtype=np.int32)
    test_X3 = np.array(test_context_pos_indices, dtype=np.int32)
    test_X4 = np.array(test_question_char_indices, dtype=np.int32)
    test_X5 = np.array(test_question_word_indices, dtype=np.int32)
    test_X6 = np.array(test_question_pos_indices, dtype=np.int32)
    # make embedding weight matrix
    word_embed_matrix = data_utils.make_embedding_matrix(
        word_embedding, word_vocab, word_embed_size)
    char_embed_matrix = data_utils.make_embedding_matrix(
        char_embedding, char_vocab, char_embed_size)
    pos_embed_matrix = data_utils.make_embedding_matrix(
        pos_embedding, pos_vocab, pos_embed_size)

    # delete data for releasing memory
    del train_context, train_question, test_context, test_question
    del train_context_chars, train_question_chars, test_context_chars, test_question_chars
    #    del train_context_words, train_question_words, test_context_words, test_question_words
    del train_context_clip_chars, train_question_clip_chars, test_context_clip_chars, test_question_clip_chars
    del train_context_char_indices, train_question_char_indices, test_context_char_indices, test_question_char_indices
    del train_context_word_indices, train_question_word_indices, test_context_word_indices, test_question_word_indices
    del train_context_pos_indices, train_question_pos_indices, test_context_pos_indices, test_question_pos_indices
    del train_word_answer_start, train_word_answer_end, train_char_answer_start, train_char_answer_end
    del train_answer_start_onehot, train_answer_end_onehot
    et = time.time()
    print("train shape:", train_X1.shape, train_X2.shape, train_X3.shape,
          train_X4.shape, train_X5.shape, train_X6.shape, train_Y1.shape,
          train_Y2.shape)
    print("test shape:", test_X1.shape, test_X2.shape, test_X3.shape,
          test_X4.shape, test_X5.shape, test_X6.shape)
    print("cost time:", et - st)

    ## XXX build model
    print("build model")
    st = time.time()
    # input layers
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start; Y2: answer_end
    var_x1_input = Input(shape=(word_ctx_len * word_char_len, ),
                         dtype=np.int32)
    var_x2_input = Input(shape=(word_ctx_len, ), dtype=np.int32)
    var_x3_input = Input(shape=(word_ctx_len, ), dtype=np.int32)
    var_x4_input = Input(shape=(word_qus_len * word_char_len, ),
                         dtype=np.int32)
    var_x5_input = Input(shape=(word_qus_len, ), dtype=np.int32)
    var_x6_input = Input(shape=(word_qus_len, ), dtype=np.int32)

    # embedding layers
    var_x1_embed = Embedding(
        input_dim=char_vocab_size,
        output_dim=char_embed_size,
        weights=[char_embed_matrix],
        input_length=word_ctx_len * word_char_len,
        trainable=False
    )(var_x1_input)  # shape: (None, ctx_length * word_length, char_embed_size)
    var_x2_embed = Embedding(
        input_dim=word_vocab_size,
        output_dim=word_embed_size,
        weights=[word_embed_matrix],
        input_length=word_ctx_len,
        trainable=False)(
            var_x2_input)  # shape: (None, ctx_length, word_embed_size)
    var_x3_embed = Embedding(
        input_dim=pos_vocab_size,
        output_dim=pos_embed_size,
        weights=[pos_embed_matrix],
        input_length=word_ctx_len,
        trainable=False)(
            var_x3_input)  # shape: (None, ctx_length, pos_embed_size)
    var_x4_embed = Embedding(
        input_dim=char_vocab_size,
        output_dim=char_embed_size,
        weights=[char_embed_matrix],
        input_length=word_qus_len * word_char_len,
        trainable=False
    )(var_x4_input)  # shape: (None, qus_length * word_length, char_embed_size)
    var_x5_embed = Embedding(
        input_dim=word_vocab_size,
        output_dim=word_embed_size,
        weights=[word_embed_matrix],
        input_length=word_qus_len,
        trainable=False)(
            var_x5_input)  # shape: (None, qus_length, word_embed_size)
    var_x6_embed = Embedding(
        input_dim=pos_vocab_size,
        output_dim=pos_embed_size,
        weights=[pos_embed_matrix],
        input_length=word_qus_len,
        trainable=False)(
            var_x6_input)  # shape: (None, qus_length, pos_embed_size)

    var_x1_embed = Reshape([word_ctx_len, word_char_len * char_embed_size])(
        var_x1_embed
    )  # shape: (None, ctx_length, word_length * char_embed_size)
    var_x4_embed = Reshape([word_qus_len, word_char_len * char_embed_size])(
        var_x4_embed
    )  # shape: (None, qus_length, word_length * char_embed_size)
    var_char_embed_layer = Dense(units=word_embed_size)
    var_x1_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_ctx_len, word_char_len * char_embed_size))(
            var_x1_embed)  # shape: (None, ctx_length, word_embed_size)
    var_x1_embed = Activation('relu')(var_x1_embed)
    #    var_x1_embed = Dropout(rate=drop_rate)(var_x1_embed)
    var_x4_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_qus_len, word_char_len * char_embed_size))(
            var_x4_embed)  # shape: (None, qus_length, word_embed_size)
    var_x4_embed = Activation('relu')(var_x4_embed)
    #    var_x4_embed = Dropout(rate=drop_rate)(var_x4_embed)

    #XXX concatenate word embedding and pos embedding directly
    var_ctx_embed = concatenate(
        [var_x1_embed, var_x2_embed, var_x3_embed], axis=2
    )  # shape: (None, ctx_length, word_embed_size * 2 + pos_embed_size)
    var_qus_embed = concatenate(
        [var_x4_embed, var_x5_embed, var_x6_embed], axis=2
    )  # shape: (None, qus_length, word_embed_size * 2 + pos_embed_size)
    var_ctx_embed = Dropout(rate=drop_rate)(var_ctx_embed)
    var_qus_embed = Dropout(rate=drop_rate)(var_qus_embed)

    var_ctx_lstm = Bidirectional(
        LSTM(units=hidden_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_ctx_embed)  # shape: (None, ctx_length, hidden_size * 2)
    var_qus_lstm = Bidirectional(
        LSTM(units=hidden_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_qus_embed)  # shape: (None, qus_length, hidden_size * 2)
    # dropout ?
    #    var_ctx_lstm = Dropout(rate=drop_rate)(var_ctx_lstm)
    #    var_qus_lstm = Dropout(rate=drop_rate)(var_qus_lstm)

    # attention layers
    var_ctx_flatten = Flatten()(
        var_ctx_lstm)  # shape: (None, ctx_length * hidden_size * 2)
    var_qus_flatten = Flatten()(
        var_qus_lstm)  # shape: (None, qus_length * hidden_size * 2)
    var_ctx_repeat = RepeatVector(word_qus_len)(
        var_ctx_flatten
    )  # shape: (None, qus_length, ctx_length * hidden_size * 2)
    var_qus_repeat = RepeatVector(word_ctx_len)(
        var_qus_flatten
    )  # shape: (None, ctx_length, qus_length * hidden_size * 2)
    var_ctx_repeat = Reshape([word_qus_len, word_ctx_len, hidden_size * 2])(
        var_ctx_repeat
    )  # shape: (None, qus_length, ctx_length, hidden_size * 2)
    var_qus_repeat = Reshape([word_ctx_len, word_qus_len, hidden_size * 2])(
        var_qus_repeat
    )  # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_ctx_repeat = Permute(
        [2, 1, 3])(var_ctx_repeat
                   )  # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_mul_repeat = multiply([
        var_ctx_repeat, var_qus_repeat
    ])  # shape: (None, ctx_length, qus_length, hidden_size * 2)

    var_sim_repeat = concatenate(
        [var_ctx_repeat, var_qus_repeat, var_mul_repeat],
        axis=3)  # shape: (None, ctx_length, qus_length, hidden_size * 6)
    var_sim_sequence = Reshape([word_ctx_len * word_qus_len, hidden_size * 6])(
        var_sim_repeat
    )  # shape: (None, ctx_length * qus_length, hidden_size * 6)
    # dropout ?
    #    var_sim_sequence = Dropout(rate=drop_rate)(var_sim_sequence)
    var_similarity = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len * word_qus_len, hidden_size * 6))(
            var_sim_sequence)  # shape: (None, ctx_length * qus_length, 1)
    var_similarity = Reshape([word_ctx_len, word_qus_len])(
        var_similarity)  # shape: (None, ctx_length, qus_length)
    var_similarity = Activation('relu')(var_similarity)
    # dropout ?
    #    var_similarity = Dropout(rate=drop_rate)(var_similarity)

    var_c2qatt_weight = TimeDistributed(
        Activation('softmax'), input_shape=(word_ctx_len, word_qus_len))(
            var_similarity)  # shape: (None, ctx_length, qus_length)
    var_c2qatt_ctx = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 1]))(
        [var_c2qatt_weight,
         var_qus_lstm])  # shape: (None, ctx_length, hidden_size * 2)

    var_q2catt_weight = Lambda(lambda x: K.max(x, axis=2))(
        var_similarity)  # shape: (None, ctx_length)
    var_q2catt_weight = RepeatVector(hidden_size * 2)(
        var_q2catt_weight)  # shape: (None, hidden_size * 2, ctx_length)
    var_q2catt_weight = Permute([2, 1])(
        var_q2catt_weight)  # shape: (None, ctx_length, hidden_size * 2)
    var_q2catt_ctx = multiply([var_q2catt_weight, var_ctx_lstm
                               ])  # shape: (None, ctx_length, hidden_size * 2)

    var_c2qctx_attmul = multiply(
        [var_ctx_lstm,
         var_c2qatt_ctx])  # shape: (None, ctx_length, hidden_size * 2)
    var_q2cctx_attmul = multiply(
        [var_ctx_lstm,
         var_q2catt_ctx])  # shape: (None, ctx_length, hidden_size * 2)
    var_attention = concatenate(
        [var_ctx_lstm, var_c2qatt_ctx, var_c2qctx_attmul, var_q2cctx_attmul],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8)
    var_attention = Activation('relu')(var_attention)
    #    # dropout ?
    #    var_attention = Dropout(rate=drop_rate)(var_attention)

    # model layers
    var_model1_lstm = Bidirectional(
        LSTM(units=model_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_attention)  # shape: (None, ctx_length, model_size * 2)
    var_model1_att = concatenate(
        [var_attention, var_model1_lstm],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    # dropout ?
    #    var_model1_att = Dropout(rate=drop_rate)(var_model1_att)

    var_model2_lstm = Bidirectional(
        LSTM(units=model_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_model1_lstm)  # shape: (None, ctx_length, model_size * 2)
    var_model2_att = concatenate(
        [var_attention, var_model2_lstm],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    # dropout ?
    #    var_model2_att = Dropout(rate=drop_rate)(var_model2_att)

    # output layers
    var_pointer1_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(
            var_model1_att)  # shape: (None, ctx_length, 1)
    var_pointer1_weight = Flatten()(
        var_pointer1_weight)  # shape: (None, ctx_length)
    var_pointer1 = Activation('softmax')(
        var_pointer1_weight)  # shape: (None, ctx_length)

    var_pointer2_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(
            var_model2_att)  # shape: (None, ctx_length, 1)
    var_pointer2_weight = Flatten()(
        var_pointer2_weight)  # shape: (None, ctx_length)
    var_pointer2 = Activation('softmax')(
        var_pointer2_weight)  # shape: (None, ctx_length)

    model = Model(inputs=[
        var_x1_input, var_x2_input, var_x3_input, var_x4_input, var_x5_input,
        var_x6_input
    ],
                  outputs=[var_pointer1, var_pointer2])

    adam = Adam(lr=lr)

    #    # Set loss functions ?
    #    def two_pointers_crossentropy(y_true, y_pred):
    #        p1_true, p1_pred = y_true[0], y_pred[0]
    #        p2_true, p2_pred = y_true[:,1], y_pred[1]
    #        p1_loss = categorical_crops
    # XXX use multiple loss
    model.compile(
        optimizer=adam,
        loss=['categorical_crossentropy', 'categorical_crossentropy'],
        loss_weights=[0.5, 0.5],
        metrics=['accuracy'])
    et = time.time()
    print("cost time:", et - st)

    ## evaluate
    print("evaluate")
    st = time.time()
    model = load_model('model_%s.h5' % model_name, custom_objects={'tf': tf})
    # compute predict
    print("predict")
    st = time.time()
    train_Y1_hat, train_Y2_hat = model.predict(
        [train_X1, train_X2, train_X3, train_X4, train_X5, train_X6],
        batch_size=batch_size)
    et = time.time()
    print("cost time:", et - st)
    train_Y1_word_pred, train_Y2_word_pred = model_utils.constraint_predict(
        train_Y1_hat, train_Y2_hat)
    train_Y1_pred, train_Y2_pred = data_utils.set_char_answer(
        train_context_words, train_Y1_word_pred, train_Y2_word_pred)
    train_Y1_pred = np.array(train_Y1_pred, dtype=np.int32)
    train_Y2_pred = np.array(train_Y2_pred, dtype=np.int32)
    # evaluate predict with setting answer (word answer)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred, word_ctx_len)
    print("word-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("word-level train prec rec:", train_prec, train_rec)
    print("word-level train f1:", train_f1)
    # evaluate predict with real answer (char answer)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred, max_char_ctx_len)
    print("char-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("char-level train prec rec:", train_prec, train_rec)
    print("char-level train f1:", train_f1)
    et = time.time()
    print("cost time:", et - st)

    ## test
    print("test")
    st = time.time()
    test_Y1_hat, test_Y2_hat = model.predict(
        [test_X1, test_X2, test_X3, test_X4, test_X5, test_X6],
        batch_size=batch_size)
    # compute predict
    test_Y1_word_pred, test_Y2_word_pred = model_utils.constraint_predict(
        test_Y1_hat, test_Y2_hat)
    test_Y1_pred, test_Y2_pred = data_utils.set_char_answer(
        test_context_words, test_Y1_word_pred, test_Y2_word_pred)
    test_Y1_pred = np.array(test_Y1_pred, dtype=np.int32)
    test_Y2_pred = np.array(test_Y2_pred, dtype=np.int32)
    data_utils.write_predict(predict_path, test_id, test_Y1_pred, test_Y2_pred)
    et = time.time()
    print("cost time:", et - st)
Example #10
            # commented-out alternative: score each caption against the rest
            # of its set and pick the most lexically similar one
            '''
            for i,c in enumerate(c_set):
                caps = {'gen':{}, 'gts':{}}
                caps['gen'][i] = [c]
                caps['gts'][i] = [c_start for c_start in c_set if c != c_start]
                gts = evaluation.PTBTokenizer.tokenize(caps['gts'])
                gen = evaluation.PTBTokenizer.tokenize(caps['gen'])

                score, _ = metric.compute_score(gts, gen)
                lex_sim_scores.append(np.mean(score))
            '''
            #gen_cap = [c_set[np.argmax(lex_sim_scores)]]
            gen_cap = [c_set[np.random.randint(0, len(c_set))]]
            utterances = image_df[image_df['art_paint'] ==
                                  test_paints[img_id]]['utterances'].values[0]

            print('Generated: ', gen_cap)
            print('Ground Truth ', utterances)

            nncaps['gen'][test_paints[img_id]] = gen_cap
            nncaps['gts'][test_paints[img_id]] = utterances

            pbar.update()

    with open('nnbaseline.pickle', 'wb') as f:
        pickle.dump(nncaps, f)

    gts = evaluation.PTBTokenizer.tokenize(nncaps['gts'])
    gen = evaluation.PTBTokenizer.tokenize(nncaps['gen'])
    scores, _ = evaluation.compute_scores(gts, gen)

    print(scores)
Example #11
    model.fit_generator(batch_generator(A,
                                        X,
                                        Y,
                                        'train',
                                        batch_size=BATCH_SIZE),
                        steps_per_epoch=len(A['train']) // BATCH_SIZE,
                        verbose=1)

    val_predictions = model.predict_generator(
        batch_generator(A, X, Y, 'val', batch_size=BATCH_SIZE),
        steps=len(A['val']) // BATCH_SIZE,
        verbose=1)

    val_predicted_labels, val_actual_labels = evaluation.predict_labels(
        val_predictions, val_y, meta['idx2label'])
    val_precision, val_recall, val_f1 = evaluation.compute_scores(
        val_predicted_labels, val_actual_labels)

    print("=== Validation Results ===")
    print("Precision: {:.2f}%".format(val_precision * 100))
    print("Recall: {:.2f}%".format(val_recall * 100))
    print("F1: {:.2f}".format(val_f1 * 100))

    test_predictions = model.predict_generator(batch_generator(A,
                                                               X,
                                                               Y,
                                                               'test',
                                                               batch_size=8),
                                               steps=len(A['test']) // 8,
                                               verbose=1)

    test_predicted_labels, test_actual_labels = evaluation.predict_labels(