Example 1
def language_eval_excoco(predictions, predictions_bleu, sents_label_eval,
                         loader):

    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval,
                                             predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))

    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score

    return lang_stat
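
Note: the Bleu, Meteor, and Rouge scorers from pycocoevalcap all consume two dicts keyed by example id: the references map each id to a list of ground-truth strings, and the hypotheses map each id to a single-element list containing the generated string. A minimal sketch with made-up captions (CiderD in the example above may expect a different input layout, so only the BLEU call is shown):

from pycocoevalcap.bleu.bleu import Bleu

# Hypothetical, already-tokenized captions keyed by example id.
sents_label_eval = {
    '0': ['a man rides a horse', 'a person is riding a horse'],
    '1': ['a dog runs on the beach'],
}
predictions_bleu = {
    '0': ['a man is riding a horse'],
    '1': ['a dog running on the sand'],
}

b_score, _ = Bleu(4).compute_score(sents_label_eval, predictions_bleu)
print(dict(zip(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], b_score)))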
Example 2
def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ',  mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()
Example 3
def test(model_path='models/model-61', video_feat_path=video_feat_path):

    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.7)
    test_videos = test_data['video_path'].values
    test_captions = test_data['Description'].values
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    test_videos_unique = list()
    test_captions_list = list()
    for (video, caption) in zip(test_videos, test_captions):
        if len(test_videos_unique) == 0 or test_videos_unique[-1] != video:
            test_videos_unique.append(video)
            test_captions_list.append([caption])
        else:
            test_captions_list[-1].append(caption)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_embed=dim_embed,
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            encoder_max_sequence_length=encoder_step,
            decoder_max_sentence_length=decoder_step,
            bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    scorer_bleu = Bleu(4)
    GTS = defaultdict(list)
    RES = defaultdict(list)
    counter = 0

    for (video_feat_path, caption) in zip(test_videos_unique, test_captions_list):
        generated_sentence = gen_sentence(
            sess, video_tf, video_mask_tf, caption_tf, video_feat_path, ixtoword)
        print(video_feat_path, generated_sentence)
        #print caption

        GTS[str(counter)] = [{'image_id':str(counter),'cap_id':i,'caption':s} for i, s in enumerate(caption)]
        RES[str(counter)] = [{'image_id':str(counter),'caption':generated_sentence[:-2]+'.'}]

        #GTS[video_feat_path] = caption
        #RES[video_feat_path] = [generated_sentence[:-2] + '.']
        counter += 1
        #ipdb.set_trace()

    tokenizer = PTBTokenizer()
    GTS = tokenizer.tokenize(GTS)
    RES = tokenizer.tokenize(RES)

    score, scores = scorer.compute_score(GTS, RES)
    print "METEOR", score
    score, scores = scorer_bleu.compute_score(GTS, RES)
    print "BLEU", score
Example 4
def bleu():
    scorer = Bleu(n=4)
    # scorer += (hypo[0], ref1)   # hypo[0] = 'word1 word2 word3 ...'
    #                                 # ref = ['word1 word2 word3 ...', 'word1 word2 word3 ...']
    score, scores = scorer.compute_score(gts, res)

    print('bleu = %s' % score)
class CaptionStatsManager(nt.StatsManager):
    
    def __init__(self):
        super(CaptionStatsManager, self).__init__()
        
    def init(self):
        super(CaptionStatsManager, self).init()
        self.tokenized_true = {}
        self.tokenized_pred = {}
        self.scorer = Bleu(4)
        self.running_bleu_scores = [0 for _ in range(4)]
        
    def accumulate(self, loss, x, y, d):
        super(CaptionStatsManager, self).accumulate(loss, x, y, d)        
        self.tokenized_true[0] = []
        self.tokenized_pred[0] = []
        _, pred_cap_lab = torch.max(y, 1)
        true_cap_lab = d
        pred_cap = index_to_cap(pred_cap_lab)
        true_cap = index_to_cap(true_cap_lab)
        self.tokenized_true[0].append(true_cap)
        self.tokenized_pred[0].append(pred_cap)
        bleu_scores, _ = self.scorer.compute_score(self.tokenized_true, self.tokenized_pred)
        self.running_bleu_scores = list(map(add, self.running_bleu_scores, bleu_scores))
        
        
    def summarize(self):
        # this is the average loss when called
        loss = super(CaptionStatsManager, self).summarize()
        
        # this is the average BLEU score (per n-gram order) over updates
        bleu_score = [a / self.number_update for a in self.running_bleu_scores]
        return {'loss' : loss, 'bleu' : bleu_score}
Example 6
class TextCapsBleu4Evaluator:
    def __init__(self):
        # The following script requires Java 1.8.0 and pycocotools installed.
        # The pycocoevalcap can be installed with pip as
        # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
        # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
        # but has no python3 support yet.
        try:
            from pycocoevalcap.bleu.bleu import Bleu
            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        except ModuleNotFoundError:
            print(
                "Please install pycocoevalcap module using "
                "pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
            )
            raise

        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        # Create reference and hypotheses captions.
        gts = {}
        res = {}
        for idx, entry in enumerate(pred_list):
            gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
            res[idx] = [{"caption": entry["pred_answer"]}]

        gts = self.tokenizer.tokenize(gts)
        res = self.tokenizer.tokenize(res)
        score, _ = self.scorer.compute_score(gts, res)

        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu4
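
A hypothetical usage sketch for this evaluator; each entry in pred_list needs a 'gt_answers' list and a 'pred_answer' string (the captions below are invented, and the PTB tokenizer additionally needs a local Java runtime):

evaluator = TextCapsBleu4Evaluator()
pred_list = [
    {"gt_answers": ["a red stop sign", "a stop sign on a pole"],
     "pred_answer": "a red stop sign on a street"},
    {"gt_answers": ["two cats on a sofa", "two cats sleeping on a couch"],
     "pred_answer": "a cat sitting on a couch"},
]
print("BLEU-4:", evaluator.eval_pred_list(pred_list))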
def eval_div_stats(dataset, preds_n, model_id, split):
    tokenizer = PTBTokenizer()

    capsById = {}
    for i, d in enumerate(preds_n):
        d['id'] = i
        capsById[d['image_id']] = capsById.get(d['image_id'], []) + [d]

    n_caps_perimg = len(capsById[list(capsById.keys())[0]])
    print(n_caps_perimg)
    _capsById = capsById  # save the untokenized version
    capsById = tokenizer.tokenize(capsById)

    div_1, adiv_1 = compute_div_n(capsById, 1)
    div_2, adiv_2 = compute_div_n(capsById, 2)

    globdiv_1, _ = compute_global_div_n(capsById, 1)

    print(
        'Diversity Statistics are as follows: \n Div1: %.2f, Div2: %.2f, gDiv1: %d\n'
        % (div_1, div_2, globdiv_1))

    # compute mbleu
    scorer = Bleu(4)
    all_scrs = []
    scrperimg = np.zeros((n_caps_perimg, len(capsById)))

    for i in range(n_caps_perimg):
        tempRefsById = {}
        candsById = {}
        for k in capsById:
            tempRefsById[k] = capsById[k][:i] + capsById[k][i + 1:]
            candsById[k] = [capsById[k][i]]

        score, scores = scorer.compute_score(tempRefsById, candsById)
        all_scrs.append(score)
        scrperimg[i, :] = scores[1]

    all_scrs = np.array(all_scrs)

    out = {}
    out['overall'] = {'Div1': div_1, 'Div2': div_2, 'gDiv1': globdiv_1}
    for k, score in zip(range(4), all_scrs.mean(axis=0).tolist()):
        out['overall'].update({'mBLeu_%d' % (k + 1): score})
    imgToEval = {}
    for i, imgid in enumerate(capsById.keys()):
        imgToEval[imgid] = {'mBleu_2': scrperimg[:, i].mean()}
        imgToEval[imgid]['individuals'] = []
        for j, d in enumerate(_capsById[imgid]):
            imgToEval[imgid]['individuals'].append(preds_n[d['id']])
            imgToEval[imgid]['individuals'][-1]['mBleu_2'] = scrperimg[j, i]
    out['ImgToEval'] = imgToEval

    print(
        'Mean mutual Bleu scores on this set is:\nmBLeu_1, mBLeu_2, mBLeu_3, mBLeu_4'
    )
    print(all_scrs.mean(axis=0))

    return out
Example 8
    def coco_evaluate(self,
                      path1: str,
                      path2: str,
                      kaldi_stream: str,
                      kaldi_scp: str,
                      caption_file: str,
                      max_length: int = None,
                      output: str = "coco_scores.txt"):
        key2pred = self._ensemble(path1, path2, kaldi_stream, kaldi_scp,
                                  max_length)

        caption_df = pd.read_json(caption_file)
        caption_df["key"] = caption_df["filename"].apply(
            lambda x: os.path.splitext(x)[0])
        key2refs = caption_df.groupby(["key"])["caption"].apply(list).to_dict()

        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.spice.spice import Spice

        f = open(output, "w")

        scorer = Bleu(n=4)
        score, scores = scorer.compute_score(key2refs, key2pred)
        for n in range(4):
            f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

        scorer = Rouge()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("ROUGE: {:6.3f}\n".format(score))

        scorer = Cider()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("CIDEr: {:6.3f}\n".format(score))

        scorer = Meteor()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("Meteor: {:6.3f}\n".format(score))

        scorer = Spice()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("Spice: {:6.3f}\n".format(score))

        f.close()
Example 9
class Metrics:
    def __init__(self):
        pass

    def bleu(self, hypo, ref):
        self.bleu_scorer = Bleu(4)
        final_scores = {}
        score, scores = self.bleu_scorer.compute_score(ref, hypo)
        for m, s in zip(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], score):
            final_scores[m] = s
        return final_scores
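
A small usage sketch with made-up captions. Note the argument order: compute_score takes the references first and the hypotheses second, so the method is called as bleu(hypo, ref):

metrics = Metrics()
ref = {"img1": ["a man rides a horse", "a person riding a horse"],
       "img2": ["a bowl of fruit on a table"]}
hypo = {"img1": ["a man is riding a horse"],
        "img2": ["a bowl of fruit sits on a table"]}
print(metrics.bleu(hypo, ref))  # {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...}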
Example 10
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    """Defining Scorers"""
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}

    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]
    """Generation Loop"""
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)

            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            sequences_ref[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in targets[0]
                    if j.item() not in bad_toks
                ])
            ]
            sequences_gen[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in predictions[0][1]
                    if j.item() not in bad_toks
                ])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]
    """Getting Scores"""
    bleu_score, bleu_scores = scorer_bleu.compute_score(
        sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(
        sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(
        sequences_ref, sequences_gen)
    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
Example 11
        def val_score(self, s_start=0, num_batches=2):
            bs = self.imp["BATCH_SIZE"]
            bleu = Bleu()
            eval_store_gen = {}
            eval_store_gt = {}
            num_examples = self.test_data.dec_in.get_num_seqs()
            max_num_batches = num_examples // bs
            for i in range(min(num_batches, max_num_batches)):
                s = s_start + bs * i
                e = s_start + bs * (i + 1)
                gen_txt = self.generate(s=s, allow_unk=False)
                gt_txt = self.test_data.dec_out.get_text(s, e)
                fnames = self.test_data.filenames[s:e]
                for g, f in zip(gen_txt, fnames):
                    if f not in eval_store_gen:
                        eval_store_gen[f] = [" ".join(g)]

                for g, f in zip(gt_txt, fnames):
                    if f not in eval_store_gt:
                        eval_store_gt[f] = []
                    eval_store_gt[f].append(" ".join(g))
            print(bleu.compute_score(eval_store_gt, eval_store_gen)[0])
def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
Example 13
def compute_bleu_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios, dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    from pycocoevalcap.bleu.bleu import Bleu
    scorer = Bleu(4)

    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):

        if keys[i] in hypothesis:
            continue

        # prepare candidate
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [
            " ".join(candidate),
        ]

        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)

    key2score = {key: scores[3][i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]

    return results
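
A minimal sketch of calling this reward function, assuming a toy vocabulary object with an idx2word mapping and indices 0/1 as the start/end tokens (all names and captions here are hypothetical):

import numpy as np

class ToyVocabulary:
    idx2word = {0: "<start>", 1: "<end>", 2: "a", 3: "dog", 4: "barks"}

decode_res = np.array([[0, 2, 3, 4, 1]])            # [B=1, max_length=5]
keys = ("clip_0",)
gts = {"clip_0": ["a dog barks", "a dog is barking"]}

rewards = compute_bleu_score(decode_res, keys, gts,
                             start_idx=0, end_idx=1,
                             vocabulary=ToyVocabulary())
print(rewards)  # per-sample BLEU-4 scores, shape [B]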
Example 14
def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)

    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
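
For context, both JSON files are expected to map each image id to a list of caption strings: the ground-truth file holds every reference caption per id, the result file holds the single generated caption per id. A hypothetical way to produce such files and run the evaluation (Meteor additionally needs Java installed):

import json

gts_dict = {"391895": ["a man riding a motorcycle", "a person on a motorbike"]}
res_dict = {"391895": ["a man rides a motorcycle down a road"]}

with open("result_gts.json", "w") as f:
    json.dump(gts_dict, f)
with open("result_res.json", "w") as f:
    json.dump(res_dict, f)

bleu, meteor, rouge, cider = eval("result_gts.json", "result_res.json")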
class RougeBleuScore(Metric):

    def __init__(self, coco, vocab, n = 4):
        self.coco = coco
        self.vocab = vocab
        self.bleu = Bleu(n)
        self.n = n
        self.rouge = Rouge()

    def evaluate(self, y_pred, y, image_ids):
        if type(y_pred) == list:
            caption_pred_list = caption_list_to_words(y_pred, self.vocab)
        else:
            caption_pred_list = tensor_to_words(y_pred, y, self.vocab)
        captions_pred, captions_gt = extract_captions(image_ids, caption_pred_list, self.coco)
        blockPrint()
        scores = self.bleu.compute_score(captions_gt, captions_pred)[0]
        enablePrint()
        scores.append(self.rouge.compute_score(captions_gt, captions_pred)[0])
        return scores
Example 16
        def bleu_scorer(reference, hypothesis):
            # =================================================
            # Compute scores
            # =================================================
            scorer = Bleu(4)
            method = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
            # print('computing %s score...' % (scorer.method()))

            score, scores = scorer.compute_score(reference, hypothesis)

            bleus = {}
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    # print("%s: %0.3f" % (m, sc))
                    bleus[m] = sc
            else:
                # print("%s: %0.3f" % (method, score))
                bleus[method] = score

            return bleus
def calculate_metric(rnn, meteor=None):
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts: gts[iid] = []
        #gts[iid].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        gts[iid] = [
            ' '.join(rnn.dp.tokens[i][::-1])
            for i in rnn.dp.img_id_to_tokens[iid]
        ]
        if iid in res: continue
        res[iid] = []
        #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        (lp, pos_sen) = decoder_beamsearch(rnn,
                                           rnn.V_valid[idx],
                                           senti=1.0,
                                           beam_size=1)
        pos_sen = pos_sen[:-1]
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    lp_avg /= float(lp_c)
    return lp_avg  # NOTE: this early return makes the scoring code below unreachable

    bleu = Bleu()
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    if meteor is None:
        meteor = Meteor()
    print("Meteor:")
    mscore = meteor.compute_score(gts, res)[0]
    print("Positive:", mscore)
    return mscore
Example 18
def eval_epoch_bleu(model, validation_data, device, vocab, list_of_refs_dev, args):
    ''' Epoch operation in evaluation phase '''

    model.eval()

    total_loss = 0
    n_word_total = 0
    n_word_correct = 0

    hypotheses = {}
    count = 0

    with torch.no_grad():
        for batch in tqdm(
                validation_data, mininterval=2,
                desc='  - (Validation) ', leave=False):

            # prepare data
            image0, image1, image0_attribute, image1_attribute = map(lambda x: x.to(device), batch)

            """[src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions
                                    that should be masked with float('-inf') and False values will be unchanged.
                                    This mask ensures that no information will be taken from position i if
                                    it is masked, and has a separate mask for each sequence in a batch."""

            hyp = beam_search(image0, image1, model, args, vocab, image0_attribute, image1_attribute)

            hyp = hyp.split("<end>")[0].strip()

            hypotheses[count] = [hyp]

            count += 1

        scorer = Bleu(4)

        score, _ = scorer.compute_score(list_of_refs_dev, hypotheses)

    return score
Example 19
class TextCapsBleu4Evaluator:
    def __init__(self):
        # The following script requires Java 1.8.0 and pycocotools installed.
        # The pycocoevalcap can be installed with pip from M4C-Captioner's Github repo
        # but has no python3 support yet.
        from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        from pycocoevalcap.bleu.bleu import Bleu
        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        # Create reference and hypotheses captions.
        gts = {}
        res = {}
        for idx, entry in enumerate(pred_list):
            gts[idx] = [{'caption': a} for a in entry['gt_answers']]
            res[idx] = [{'caption': entry['pred_answer']}]

        gts = self.tokenizer.tokenize(gts)
        res = self.tokenizer.tokenize(res)
        score, _ = self.scorer.compute_score(gts, res)

        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu4
Example 20
def train(args):
    iter_per_epoch = int(math.ceil(conf.num_coco_data * 1.0 / conf.batch_size))

    pkl_path = os.path.join(conf.val_small_data_path, 'val_caption.pkl')
    with open(pkl_path, 'rb') as f:
        caption_data = pickle.load(f)
        pass
    image_id_list = list(caption_data.keys())

    bleu_test = Bleu()
    vocab, reverse_vocab = utils.load_dict(conf.dictionary_path)

    with tf.device('/cpu:0'):
        train_image_batch, train_sequence_batch = get_data.batch_train_data(
            'train', conf.batch_size, conf.shuffer_buffer_size, 6,
            conf.train_data_path)
        val_dataset = get_data.batch_val_data('val', conf.batch_size, 6,
                                              conf.val_small_data_path)
        val_id_batch, val_image_batch = get_data.make_val_iterator(val_dataset)
        pass
    logging.info("The input graph defined!")

    with tf.variable_scope(tf.get_variable_scope()) as scope:
        train_model = ShowAttendTell(first_time=args.first_time,
                                     start_token_index=vocab[conf.start_token],
                                     pad_token_index=vocab[conf.pad_token],
                                     mat_file=conf.vgg_checkpoint,
                                     max_timestep=conf.sentence_length,
                                     train_vgg=conf.train_vgg)
        batch_loss, perplexity, _ = train_model.build_model()
        scope.reuse_variables()
        generated_words = train_model.build_validation()
        pass

    ave_train_loss = tf.Variable(0,
                                 name='ave_train_loss',
                                 dtype=tf.float32,
                                 trainable=False)
    bleu1 = tf.Variable(0, name='bleu1', dtype=tf.float32, trainable=False)
    bleu2 = tf.Variable(0, name='bleu2', dtype=tf.float32, trainable=False)
    bleu3 = tf.Variable(0, name='bleu3', dtype=tf.float32, trainable=False)
    bleu4 = tf.Variable(0, name='bleu4', dtype=tf.float32, trainable=False)

    tf.summary.scalar('ave_train_loss', ave_train_loss)
    tf.summary.scalar('batch_loss', batch_loss)
    tf.summary.scalar('batch_perplexity', perplexity)
    tf.summary.scalar('bleu1', bleu1)
    tf.summary.scalar('bleu2', bleu2)
    tf.summary.scalar('bleu3', bleu3)
    tf.summary.scalar('bleu4', bleu4)

    all_variable = tf.trainable_variables()
    for variable in all_variable:
        tf.summary.histogram(variable.op.name, variable)
        pass

    all_gradient = tf.gradients(batch_loss, all_variable)
    for index, variable in enumerate(all_variable):
        tf.summary.histogram(variable.op.name + "/gradient",
                             all_gradient[index])
        pass

    with open(conf.global_step_file
              ) as fd1:  # for logging the last global step saved
        number = int(fd1.readline().strip())
        pass

    global_step_t = tf.Variable(number, name='global_step', trainable=False)
    learning_rate = tf.train.exponential_decay(conf.learning_rate,
                                               global_step_t,
                                               conf.decay_step,
                                               conf.decay_rate,
                                               staircase=True)

    # optimizer = tf.train.AdamOptimizer(learning_rate=conf.learning_rate)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

    # for updating the moving average and variance in batch norm
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(batch_loss, global_step=global_step_t)
        pass
    logging.info("The optimization operation defined!")

    saver = tf.train.Saver(max_to_keep=80)
    ckpt_filename = os.path.join(conf.ckpt_upper_path, 'model.ckpt')

    with tf.Session() as sess:
        if args.load_ckpt:
            newest_checkpoint = tf.train.latest_checkpoint(
                conf.ckpt_upper_path)
            utils.restore(sess, newest_checkpoint)
            pass

        new_folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        log_whole_path = os.path.join(conf.model_log_path, new_folder_name)
        if not os.path.exists(log_whole_path):
            os.makedirs(log_whole_path)
            pass

        merged_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(log_whole_path)
        summary_writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())

        total_loss = 0.0
        start_time = time.time()
        all_time = 0

        counter = 0
        # b = 30
        # for e in range(1):
        #     for i in range(b):
        for _ in range(conf.epoch):
            for _ in range(iter_per_epoch):
                counter += 1
                logging.info("In iter %d " % (counter))
                image_batch_data, sequence_batch_data = sess.run(
                    [train_image_batch, train_sequence_batch])

                feed_dict = {
                    train_model.input_image: image_batch_data,
                    train_model.input_caption: sequence_batch_data
                }

                batch_loss_value, batch_perplexity_value, _ = sess.run(
                    [batch_loss, perplexity, train_op], feed_dict=feed_dict)

                logging.info("batch loss: %s " % batch_loss_value)
                logging.info("batch perplexity value: %s " %
                             batch_perplexity_value)
                total_loss += batch_loss_value

                if counter % 100 == 0:
                    prediction = {}

                    while True:
                        try:
                            val_id_batch_data, val_image_batch_data = sess.run(
                                [val_id_batch, val_image_batch])
                            pass
                        except tf.errors.OutOfRangeError:
                            with tf.device('/cpu:0'):
                                val_id_batch, val_image_batch = get_data.make_val_iterator(
                                    val_dataset)
                                pass
                            break
                        val_feed_dict = {
                            train_model.input_image: val_image_batch_data
                        }
                        caption = sess.run(generated_words,
                                           feed_dict=val_feed_dict)
                        for index, id in enumerate(val_id_batch_data):
                            sentence = utils.get_sentence(
                                caption[index], reverse_vocab)
                            prediction[int(id)] = [sentence]
                            pass

                    random_id = random.choice(image_id_list)
                    logging.info("Prediction %s " % prediction[random_id][0])
                    logging.info("Label %s " % caption_data[random_id][0])

                    print(len(caption_data.keys()))
                    print(len(prediction.keys()))
                    score, _ = bleu_test.compute_score(caption_data,
                                                       prediction)

                    # print "score ", score
                    logging.info("Bleu1 %f " % (score[0]))
                    logging.info("Bleu2 %f " % (score[1]))
                    logging.info("Bleu3 %f " % (score[2]))
                    logging.info("Bleu4 %f " % (score[3]))

                    sess.run(bleu1.assign(score[0]))
                    sess.run(bleu2.assign(score[1]))
                    sess.run(bleu3.assign(score[2]))
                    sess.run(bleu4.assign(score[3]))

                    pass

                if counter % 50 == 0:
                    sess.run(
                        ave_train_loss.assign(total_loss * 1.0 / (counter)))
                    logging.info("train average loss %f " % (total_loss * 1.0 /
                                                             (counter)))

                    summary = sess.run(merged_summary, feed_dict=feed_dict)
                    summary_writer.add_summary(
                        summary, tf.train.global_step(sess, global_step_t))
                    summary_writer.flush()
                    pass

                if counter % 300 == 0:
                    with open(conf.global_step_file, 'w') as fd:
                        fd.write(str(tf.train.global_step(sess,
                                                          global_step_t)))
                        pass
                    saver.save(sess, ckpt_filename, global_step=global_step_t)

                new_time = time.time()
                time_range = new_time - start_time
                start_time = new_time
                all_time += time_range
                logging.info("batch %d take %f \n" % (counter, time_range))
                pass
            pass
        pass
        logging.info("Average time %f " % (all_time * 1.0 / counter))
        summary_writer.close()
    pass
Example 21
    def end_epoch(self):
        path = Path(Options()["exp.dir"])

        dirname = path.joinpath("generated_sentences")
        # Create directory if it does not exist
        if not os.path.exists(dirname):
            try:
                os.makedirs(dirname)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Dump sentences to the directory
        for field in ["action", "justification"]:
            for key in ["ground_truth", "predicted"]:
                filepath = dirname.joinpath("%s_%s.txt" % (key, field))
                with open(filepath, "w") as f:
                    f.write("\n".join(self.sentences[key][field]))

        # Compute NLP quality scores (bleu, meteor, cider...)
        for field in ["action", "justification"]:
            cider = Cider()
            bleu = Bleu()
            meteor = Meteor()

            # Check if this is not empty
            if len(self.sentences["ground_truth"][field]) > 0:
                ground_truth = {
                    i: [sentence]
                    for i, sentence in enumerate(self.sentences["ground_truth"]
                                                 [field])
                }
                predicted = {
                    i: [sentence]
                    for i, sentence in enumerate(self.sentences["predicted"]
                                                 [field])
                }

                cider_score, _ = cider.compute_score(ground_truth, predicted)
                cider_score = cider_score * 100  # Convert to percentage

                bleus_score, _ = bleu.compute_score(ground_truth, predicted)
                bleu_score = bleus_score[
                    3] * 100  # Take bleu-4 and convert to percentage

                meteor_score, _ = meteor.compute_score(ground_truth, predicted)
                meteor_score = meteor_score * 100  # Convert to percentage
            else:
                # Otherwise all scores are 0
                cider_score, bleu_score, meteor_score = 0, 0, 0

            Logger().log_value('%s_epoch.cider_%s' % (self.mode, field),
                               cider_score,
                               should_print=True)
            Logger().log_value('%s_epoch.bleucoco_%s' % (self.mode, field),
                               bleu_score,
                               should_print=True)
            Logger().log_value('%s_epoch.meteorcoco_%s' % (self.mode, field),
                               meteor_score,
                               should_print=True)

        # Reset sentences
        self.sentences = {
            "ground_truth": {
                "action": [],
                "justification": []
            },
            "predicted": {
                "action": [],
                "justification": []
            }
        }
        return
Example 22
def coco_caption_metrics_hier(predicts_list,
                              sentences_list,
                              image_id_list,
                              config,
                              batch_size=26,
                              is_training=True):
    with open(config.vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    gts = {}
    res = {}
    for i in range(len(predicts_list)):
        for j in range(0, batch_size):
            sent_pre, sent_gt = [], []
            for k in range(config.max_sentence_num *
                           config.max_sentence_length):
                id_input = int(predicts_list[i][k][j])
                sent_pre.append(id2word[id_input])

                id_gt = sentences_list[i][j][k]
                if id2word[id_gt] not in ('</S>', '<EOS>'):
                    sent_gt.append(id2word[id_gt])

            # sent_pre2 = sent_pre
            sent_pre2 = []
            for n in range(config.max_sentence_num):
                for m in range(config.max_sentence_length):
                    word = sent_pre[n * config.max_sentence_length + m]
                    if word != '</S>':
                        sent_pre2.append(word)
                    else:
                        break

            str_pre, str_gt = ' '.join(sent_pre2), ' '.join(sent_gt)
            image_id = image_id_list[i][j][0]
            gts[str(image_id)] = [str_gt]
            res[str(image_id)] = [str_pre]

    if not is_training:
        with open(config.result_gts_path, 'w') as file:
            json.dump(gts, file)
        with open(config.result_res_path, 'w') as file:
            json.dump(res, file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)
    # #
    # meteor_scorer = Meteor()
    # meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    # return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
    return bleu, round(rouge, 4), round(cider, 4)
Example 23
def bleu(gts, res):
    scorer = Bleu(n=4)

    score, scores = scorer.compute_score(gts, res)

    out_file.write('BLEU(1-4) = %s' % score + '\n')
Example 24
    def get_bleu_score(self):
        bleu = Bleu()
        scores = bleu.compute_score(self.eval_store_gt, self.eval_store_gen)[0]
        return scores
Example 25
def coco_caption_metrics(predictions_list,
                         image_id_list,
                         vocabulary_path='data/vocabulary.json',
                         max_caption_length=25,
                         batch_size=32,
                         is_training=True):
    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i in range(len(predictions_list)):
        for j in range(0, batch_size):
            sen_input, sen_ground_truth = [], []
            for k in range(max_caption_length):
                id_input = int(predictions_list[i][k][j])
                sen_input.append(id2word[id_input])

            sen_pre = []
            for n in range(max_caption_length):
                word = sen_input[n]
                if word != '</S>':
                    sen_pre.append(word)
                else:
                    break

            str_input = ' '.join(sen_pre)
            image_id = image_id_list[i][j][0]

            # print(image_id)
            res[image_id] = [str_input]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        # for key in gts.keys():
        #     str_input = res[key]
        #     str_grundtruth = gts[key]
        #     print(key)
        #     print(str_input)
        #     print(str_grundtruth)
        #     print('*' * 100)

        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)
        # print('result.json get success')

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
Example 26
def main():
    bleu_test = Bleu()
    vocab, reverse_vocab = utils.load_dict(conf.dictionary_path)

    pkl_path = os.path.join(conf.val_data_path, 'val_caption.pkl')
    with open(pkl_path, 'rb') as f:
        caption_data = pickle.load(f)
    image_id_list = list(caption_data.keys())
    image_to_show = set(random.sample(image_id_list, 10))

    with tf.device('/cpu:0'):
        val_dataset = get_data.batch_val_data('val', conf.batch_size, 6, conf.val_data_path)
        val_id_batch, val_image_batch = get_data.make_val_iterator(val_dataset)
        pass
    logging.info("The input graph defined!")

    with tf.variable_scope(tf.get_variable_scope()) as scope:
        train_model = ShowAttendTell(first_time=False, start_token_index=vocab[conf.start_token],
                                     pad_token_index=vocab[conf.pad_token], max_timestep=conf.sentence_length)
        caption_generator = InferenceWrapper(train_model, vocab[conf.start_token], vocab[conf.end_token], beam_size=3)
        caption_generator.build_inference_model()
        pass

    # saver = tf.train.Saver()

    result = {}
    counter = 0
    with tf.Session() as sess:
        newest_checkpoint = tf.train.latest_checkpoint(conf.ckpt_upper_path)
        utils.restore(sess, newest_checkpoint)

        while True:
            counter += 1
            logging.info("Batch %d " % counter)
            try:
                val_id_batch_data, val_image_batch_data = sess.run([val_id_batch, val_image_batch])
                pass
            except tf.errors.OutOfRangeError:
                break
            for index, image_id in enumerate(val_id_batch_data):
                caption = caption_generator.run_inference(sess, val_image_batch_data[index])
                if len(caption) == 0:
                    sentence = ""
                else:
                    sentence = utils.get_sentence(caption[0][0], reverse_vocab)
                    pass
                result[int(image_id)] = [sentence]

                if image_id in image_to_show:
                    scipy.misc.imsave(str(image_id) + ".png", val_image_batch_data[index])
                    logging.info("%d : %s" % (image_id, sentence))
                pass
            pass

        score, _ = bleu_test.compute_score(caption_data, result)

        logging.info("Bleu1 %f " % (score[0]))
        logging.info("Bleu2 %f " % (score[1]))
        logging.info("Bleu3 %f " % (score[2]))
        logging.info("Bleu4 %f " % (score[3]))
        pass
Example 27
    wrd = {i: [sys_strs[i]]}
    rouge, _ = rouge_obj.compute_score(wtd, wrd)

    rouges.append(rouge)

print(np.mean(rouges))

with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]

bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict,
                                                 word_response_dict)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(word_target_dict,
                                                       word_response_dict)
rouge_score, rouge_scores = rouge_obj.compute_score(word_target_dict,
                                                    word_response_dict)
cider_score, cider_scores = cider_obj.compute_score(word_target_dict,
                                                    word_response_dict)

print("ROUGE-L: ", rouge_score)
print("BLEU-1: ", bleu1_score)
print("BLEU-4: ", bleu4_score)
print("METEOR: ", meteor_score)
print("CiDER: ", cider_score)
Example 28
    def evaluate(self,
                 experiment_path: str,
                 feature_file: str,
                 feature_scp: str,
                 caption_file: str,
                 caption_output: str = "eval_output.json",
                 score_output: str = "scores.txt",
                 **kwargs):
        """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""

        dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                          map_location="cpu")
        # Load previous training config
        config = dump["config"]

        vocabulary = torch.load(config["vocab_file"])
        model = self._get_model(config, vocabulary)
        model.load_state_dict(dump["model"])
        # Some scaler (sklearn standardscaler)
        scaler = dump["scaler"]
        zh = config["zh"]
        model = model.to(self.device)

        dataset = SJTUDatasetEval(feature=feature_file,
                                  eval_scp=feature_scp,
                                  transform=scaler.transform)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 shuffle=False,
                                                 collate_fn=collate_fn((1, )),
                                                 batch_size=32,
                                                 num_workers=0)

        caption_df = pd.read_json(caption_file, dtype={"key": str})
        if zh:
            key2refs = caption_df.groupby("key")["tokens"].apply(
                list).to_dict()
        else:
            key2refs = caption_df.groupby("key")["caption"].apply(
                list).to_dict()

        model.eval()

        key2pred = {}

        def _sample(engine, batch):
            with torch.no_grad():
                model.eval()
                keys = batch[0]
                output = self._forward(model, batch, mode="sample", **kwargs)
                seqs = output["seqs"].cpu().numpy()

                for idx, seq in enumerate(seqs):
                    caption = self._convert_idx2sentence(seq, vocabulary, zh)
                    key2pred[keys[idx]] = [
                        caption,
                    ]

        pbar = ProgressBar(persist=False, ascii=True)
        sampler = Engine(_sample)
        pbar.attach(sampler)
        sampler.run(dataloader)

        pred_df = []
        for key, pred in key2pred.items():
            pred_df.append({
                "filename": key + ".wav",
                "caption": "".join(pred[0]) if zh else pred[0],
                "tokens": pred[0] if zh else pred[0].split()
            })
        pred_df = pd.DataFrame(pred_df)
        pred_df.to_json(os.path.join(experiment_path, caption_output))

        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.spice.spice import Spice

        f = open(os.path.join(experiment_path, score_output), "w")

        scorer = Bleu(n=4, zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        for n in range(4):
            f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

        scorer = Rouge(zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("ROUGE: {:6.3f}\n".format(score))

        scorer = Cider(zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("CIDEr: {:6.3f}\n".format(score))

        if not zh:
            scorer = Meteor()
            score, scores = scorer.compute_score(key2refs, key2pred)
            f.write("Meteor: {:6.3f}\n".format(score))

            scorer = Spice()
            score, scores = scorer.compute_score(key2refs, key2pred)
            f.write("Spice: {:6.3f}\n".format(score))

        f.close()
Example 29
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TEST',
        transform=transforms.Compose([normalize])),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=0,
                                         pin_memory=False)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)

        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs,
                                                   encoder_out,
                                                   zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))

            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe

            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1),
                                          (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                # (s) the k largest of all the flattened scores
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)

            # Convert unrolled indices to actual indices of scores
            # The scores were flattened above; prev_word_inds recovers which beam
            # each top score came from (integer division keeps the indices usable)
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]

            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU-1~BLEU4 scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
def run_load_gap_filler(pretrained_filename,
                        do_bleu=False,
                        must_have_anp=False,
                        copy_if_no_anp=False,
                        replace_adj=False,
                        get_human=False,
                        semi_human=False):
    rnn = RNNModel()
    rnn.load_model(pretrained_filename)
    rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST

    if get_human:
        id_to_caps = pickle.load(open("coco_mturk/id_to_caps.pik", "rb"))

    rnn.build_model_core()
    rnn.load_val_dataset()

    rnn.build_sentence_generator()

    rnn.build_perplexity_calculator()
    #print rnn.sample_sentence(rnn.V_valid[0])
    #print decoder_beamsearch2(rnn, rnn.V_valid[0])
    #print decoder_beamsearch(rnn, rnn.V_valid[0])

    #calculate_metric(rnn)
    #sys.exit(0)

    pos_sentence_res = []
    pos_att_res = []

    des_sentence_res = []
    des_att_res = []

    img_files = []
    img_ids = []

    id_to_sentences = {}

    seen_ids = set()
    if 'added_words' in rnn.conf:
        new_words = set([w[0] for w in rnn.conf['added_words']])
    else:
        new_words = set()
    num_ignore = 0
    num_not_ignore = 0
    for idx in range(rnn.V_valid.shape[0]):
        img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]]
        img_id = rnn.Id_valid[idx]
        if img_id not in id_to_sentences: id_to_sentences[img_id] = []
        #id_to_sentences[img_id].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        if replace_adj:
            id_to_sentences[img_id] = [
                ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        elif get_human:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
            np.random.shuffle(id_to_sentences[img_id])
            print(len(id_to_sentences[img_id]))
            human_sen_pos = id_to_sentences[img_id].pop()
            print(len(id_to_sentences[img_id]))
            if not id_to_sentences[img_id]: continue
        else:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        #print id_to_sentences[img_id]
        if img_id in seen_ids: continue
        seen_ids.add(img_id)
        if get_human and not semi_human:
            pos_sen = human_sen_pos.split()[::-1]
            np.random.shuffle(id_to_caps[img_id])
            des_sen = id_to_caps[img_id][0][::-1]
        else:
            lp, pos_sen, pos_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=1.0, beam_size=5)
            lp, des_sen, des_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5)
            pos_sen = pos_sen[:-1]
            des_sen = des_sen[:-1]
            #des_att = des_att[:-1]
            pos_att = pos_att[:-1]
        #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        pos_att = np.array(pos_att)
        pos_att = pos_att.flatten()
        #des_att = np.array(des_att)
        #des_att = des_att.flatten()
        des_att = np.zeros((len(des_sen), ))
        #pos_att = np.zeros((len(pos_sen),))
        if must_have_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                num_ignore += 1
                continue
            num_not_ignore += 1
        if copy_if_no_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                pos_sen = des_sen
        if replace_adj:
            pos_sen = do_replace_adj(pos_sen[::-1])[::-1]
            des_sen = do_replace_adj(des_sen[::-1])[::-1]

        #des_sen, des_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([-1.0], dtype=theano.config.floatX))
        new_pos_sen = []
        for vv, a in zip(pos_sen, pos_att):
            out = vv
            col = ""
            if a > 0.75:
                col = "#FF3300"
            elif a > 0.5:
                col = "#FF5C33"
            elif a > 0.25:
                col = "#FF8566"
            #if a > 0.75:
            #    col = "#33CC33"# "#3366FF"
            #elif a > 0.5:
            #    col = "#70DB70" #"#5C85FF"
            #elif a > 0.25:
            #    col = "#ADEBAD" #"#85A3FF"
            if col:
                out = "<font style='background-color: %s'>%s</font>" % (col,
                                                                        vv)
            new_pos_sen.append(out)
        pos_sen = new_pos_sen
        print(pos_sen)
        print(pos_att)
        print(des_sen)
        print_it = False
        for v in pos_sen:
            if v in new_words:
                print_it = True
        if print_it:
            for x in list(zip(pos_sen, pos_att))[::-1]:
                print(x[0], end=' ')
            print("")
        #for x in zip(pos_sen, pos_att)[::-1]:
        #    print x[0],
        #print ""
        #for x in zip(des_sen, des_att)[::-1]:
        #    print x[0],
        #print "\n"
        pos_att = pos_att[:len(pos_sen)]
        des_att = des_att[:len(des_sen)]
        pos_sentence_res.append(pos_sen[::-1])
        pos_att_res.append(np.exp(pos_att[::-1]))
        des_sentence_res.append(des_sen[::-1])
        des_att_res.append(np.exp(des_att[::-1]))
        img_files.append(img_file)
        img_ids.append(img_id)

    output = {
        'pos_sen': pos_sentence_res,
        'pos_att': pos_att_res,
        'des_sen': des_sentence_res,
        'des_att': des_att_res,
        'img_files': img_files,
        'img_ids': img_ids
    }
    pickle.dump(output,
                open("output_data/sen_att_pos_01.pik", "wb"),
                protocol=2)

    if must_have_anp:
        print("Must have ANP % removed:",
              num_ignore / float(num_not_ignore) * 100.0)

    print("getting Positive perplexity")
    print(rnn.get_val_perplexity())
    print("got perplexity")

    print("getting Descriptive perplexity")
    print(rnn.get_val_perplexity(base=True))
    print("got perplexity")

    gts = {}
    res = {}
    fout = open("eval/output_pos", "w")
    for line, iid in zip(pos_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res: res[iid] = []
        res[iid].append(' '.join(line))
    fout.close()

    res_des = {}
    fout = open("eval/output_des", "w")
    for line, iid in zip(des_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res_des: res_des[iid] = []
        res_des[iid].append(' '.join(line))
    fout.close()

    for i in range(3):
        fout = open("eval/reference%d" % i, "w")
        for cid in img_ids:
            if cid not in gts: gts[cid] = []
            if len(id_to_sentences[cid]) > i:
                gts[cid].append(id_to_sentences[cid][i])
                fout.write(id_to_sentences[cid][i] + "\n")
            else:
                fout.write("\n")
        fout.close()

    bleu = Bleu()
    #for i in gts.keys()[:10]:
    #    print gts[i]
    #    print res_des[i]
    #    print res[i]
    #    print ""
    total_ref_sentences = 0
    for i in list(gts.keys()):
        total_ref_sentences += len(gts[i])
    print("Total ref sentences:", total_ref_sentences)
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    print("Descriptive:", bleu.compute_score(gts, res_des)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    print("Descriptive:", rouge.compute_score(gts, res_des)[0])
    cider = Cider()
    print("Cider:")
    print("Positive:", cider.compute_score(gts, res)[0])
    print("Descriptive:", cider.compute_score(gts, res_des)[0])
    meteor = Meteor()
    print("Meteor:")
    print("Positive:", meteor.compute_score(gts, res)[0])
    print("Descriptive:", meteor.compute_score(gts, res_des)[0])