import json

from pycocoevalcap.spice.spice import Spice


def main(ckpt_path, gts_name='/gts.json', res_name='/res.json'):
    print("eval_spice.py")
    print(ckpt_path)
    with open(ckpt_path + gts_name) as f:
        gts = json.load(f)
    with open(ckpt_path + res_name) as f:
        res = json.load(f)
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    with open(ckpt_path + '/score.json', 'w') as f:
        json.dump(score, f)
    with open(ckpt_path + '/scores.json', 'w') as f:
        json.dump(scores, f)
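
The function above assumes that gts.json and res.json already use the dict-of-lists layout the pycocoevalcap scorers expect: image ids as keys and lists of caption strings as values, with a single candidate caption per id on the res side. A minimal sketch of that layout (the ids and captions here are made up):

gts = {
    "391895": ["a man riding a motorcycle on a dirt road",
               "a person on a motorbike in the countryside"],
    "522418": ["a woman cutting a large cake"],
}
res = {
    "391895": ["a man rides a motorcycle down a dirt road"],
    "522418": ["a woman is cutting a cake"],
}

scorer = Spice()
score, scores = scorer.compute_score(gts, res)  # corpus-level score, per-image results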
Example #2
from pycocoevalcap.spice.spice import Spice


def spice(gts, res):
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    print('spice = %s' % score)
Example #3
from pycocoevalcap.spice.spice import Spice


def spice(gts, res, out_file):
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    out_file.write('SPICE = %s' % score + '\n')
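
A sketch of a call site for this variant, assuming gts and res are already loaded as in the first example (the output file name here is arbitrary):

with open('spice_results.txt', 'w') as out_file:
    spice(gts, res, out_file)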
Example #4
import time
import os

import sys
sys.path.append("coco-caption")
from pycocotools.coco import COCO
from pycocoevalcap.spice.spice import Spice

train_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_train2014.json'
val_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_val2014.json'

coco_train = COCO(train_path)
coco_val = COCO(val_path)

# Score the training split; switch to coco_val to use the validation annotations
coco_use = coco_train

image_ids = coco_use.getImgIds()
gts = {}  # all reference captions for each image
res = {}  # a single "candidate" per image (the first reference)
for img_id in image_ids:
    gts[img_id] = []
    data_temp = coco_use.imgToAnns[img_id]
    for dt in data_temp:
        gts[img_id].append(dt['caption'])
    res[img_id] = []
    res[img_id].append(gts[img_id][0])

scorer = Spice()
score, scores = scorer.compute_score(gts, res)
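
Because res just reuses the first reference caption of every image, the score from this script is a self-agreement sanity check rather than a model evaluation. To score actual model output only the res side changes; a hedged sketch, assuming the generated captions are available in a dict keyed by image id (generated_captions is a hypothetical name):

# Hypothetical: generated_captions maps img_id -> a single model-produced caption string
res = {img_id: [generated_captions[img_id]] for img_id in image_ids}

scorer = Spice()
score, scores = scorer.compute_score(gts, res)
print('SPICE = %.4f' % score)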
Example #5
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU (1-4), METEOR, CIDEr, ROUGE-L, and SPICE scores
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TEST',
        transform=transforms.Compose([normalize])),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=0,
                                         pin_memory=False)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Dicts to store references (true captions) and hypotheses (predictions) for each image,
    # keyed by image index as the pycocoevalcap scorers expect:
    # references = {'0': [ref1a, ref1b, ref1c], '1': [ref2a, ref2b], ...}, hypotheses = {'0': [hyp1], '1': [hyp2], ...}
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(k, attrs_dim)  # one copy per beam (was hard-coded to 3, which only worked for beam_size=3)

        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs,
                                                   encoder_out,
                                                   zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))

            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe

            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1),
                                          (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                # (s) the k highest scores across all beams
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)

            # Convert unrolled indices to actual indices of scores
            # The scores were flattened above; prev_word_inds recovers which beam each top word came from
            prev_word_inds = top_k_words // vocab_size  # (s); floor division keeps integer indices
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]

            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        # Fall back to the surviving (incomplete) beams if nothing reached <end> within the step limit
        if len(complete_seqs_scores) == 0:
            complete_seqs.extend(seqs.tolist())
            complete_seqs_scores.extend(top_k_scores.view(-1))
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start>, <end>, and <pad> tokens
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU, METEOR, CIDEr, ROUGE-L, and SPICE scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
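
A minimal sketch of how evaluate might be driven, assuming the surrounding script has already set up device, encoder, decoder, word_map, rev_word_map, vocab_size, data_folder and data_name (evaluate reads all of these as globals); the beam size of 3 and the print formatting are assumptions, not part of the original:

if __name__ == '__main__':
    beam_size = 3
    bleu, meteor, cider, rouge, spice = evaluate(beam_size)
    # Bleu() returns a list of corpus-level BLEU-1..BLEU-4 scores; the other metrics return single floats
    print('BLEU-1..4: %s' % bleu)
    print('METEOR: %.4f  CIDEr: %.4f  ROUGE-L: %.4f  SPICE: %.4f' %
          (meteor, cider, rouge, spice))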