Example No. 1
    def compute_score(self, gts, res):
        """

        :param gts:
        :param res:
        :return:
        """
        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

            bleu_scorer += (hypo[0], ref)

        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)

        return score, scores
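
This compute_score variant is a method of the pycocoevalcap-style Bleu wrapper class (see Example No. 4 below). A minimal, illustrative sketch of how it is typically called, assuming the captions are already tokenized; the ids and sentences are made up:

gts = {
    184321: ["a train traveling down tracks next to lights",
             "a train on the tracks at night"],
    81922: ["a man riding a wave on a surfboard"],
}
res = {
    184321: ["a train going down the tracks at night"],
    81922: ["a man surfing on a wave"],
}

bleu = Bleu(n=4)                               # wrapper class that owns this compute_score method
score, scores = bleu.compute_score(gts, res)
# score  -> [BLEU-1, BLEU-2, BLEU-3, BLEU-4] corpus-level values
# scores -> per-image score lists, one list per n-gram order
print(score)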
Example No. 2
    def compute_score(self, gts, res):

        assert (gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert (type(hypo) is list)
            #print(len(hypo))
            assert (len(hypo) == 1)
            assert (type(ref) is list)
            assert (len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores
Example No. 3
    def compute_score(self, gts, res):
        """

        :param gts:
        :param res:
        :return:
        """
        assert (gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert (type(hypo) is list)
            assert (len(hypo) == 1)
            assert (type(ref) is list)
            assert (len(ref) > 0)

            bleu_scorer += (hypo[0], ref)

        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)

        return score, scores
Example No. 4
class Bleu:
    def __init__(self, n=4):
        # by default, compute BLEU score up to 4-grams
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}
        self.bleu_scorer = BleuScorer(n=self._n)

    def compute_score(self, gts, res):

        self.bleu_scorer.clear()

        for res_id in res:

            hypo = res_id['caption']
            ref = gts[res_id['image_id']]
            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            self.bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = self.bleu_scorer.compute_score(option='closest', verbose=0)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores

    def method(self):
        return "Bleu"
Example No. 5
    def __init__(self, coco, useBleu=False, useCider=False):
        self.coco = coco
        self.useBleu = useBleu
        self.useCider = useCider
        self.params = {'image_id': coco.getImgIds()}

        imgIds = self.params['image_id']
        gts = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]

        if self.useBleu:
            self.b_scorer = BleuScorer()
        if self.useCider:
            self.c_scorer = CiderScorer()

        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)

        for imgId in imgIds:
            ref = gts[imgId]

            assert (type(ref) is list)
            assert (len(ref) > 0)

            if self.useCider:
                self.c_scorer += (None, ref)

        if self.useCider:
            self.c_scorer.compute_doc_freq()
            assert (len(self.c_scorer.ctest) >= max(
                self.c_scorer.document_frequency.values()))
Example No. 6
    def compute_score_for_consensus2(self, gts, subGts):

        assert(len(gts.keys()) == 1)
        assert(gts.keys() == subGts.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            ref = gts[id]
            subRef = subGts[id]

            sent_len = len(ref)

            # Sanity check.
            #assert(type(hypo) is list)
            #assert(len(hypo) == 1)
            assert(type(subRef) is list)
            assert(type(ref) is list)
            assert(len(ref) > 1)

            for r1 in ref:
                for r2 in subRef:
                    bleu_scorer += (r1, [r2])

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        scores = np.sum(scores, axis = 0)
        scores = scores.reshape((len(ref), len(subRef)))
        scores = np.sum(scores, axis = 1)

        return score, scores
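
A shape-only sketch of the post-processing above, assuming compute_score returns one per-pair score list for each of the four n-gram orders (as in pycocoevalcap); the numbers are made up:

import numpy as np

# With a single image, len(ref) = 2 and len(subRef) = 3, the nested loop above
# fed 2 * 3 = 6 (r1, [r2]) pairs to the scorer.
scores = np.arange(24, dtype=float).reshape(4, 6)  # 4 n-gram orders x 6 pairs (illustrative values)
per_pair = np.sum(scores, axis=0)                  # collapse the n-gram orders -> one value per pair
per_pair = per_pair.reshape((2, 3))                # rows: captions in ref, columns: captions in subRef
consensus = np.sum(per_pair, axis=1)               # one consensus value per caption in ref
print(consensus.shape)                             # (2,)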
Example No. 7
    def compute_score(self, gts, res):

        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            # assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)
            
            # Revised to handle paragraph-level hypotheses: score each sentence in the hypothesis list.
            for hypo_element in hypo:
                bleu_scorer += (hypo_element, ref)
            
            # bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores
Example No. 8
    def compute_score(self, gts, res):

        assert(gts.keys() == res.keys())
        imgIds = gts.keys()


        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]


            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores
Example No. 9
    def compute_score_for_consensus2(self, gts, subGts):

        assert (len(gts.keys()) == 1)
        assert (gts.keys() == subGts.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            ref = gts[id]
            subRef = subGts[id]

            sent_len = len(ref)

            # Sanity check.
            #assert(type(hypo) is list)
            #assert(len(hypo) == 1)
            assert (type(subRef) is list)
            assert (type(ref) is list)
            assert (len(ref) > 1)

            for r1 in ref:
                for r2 in subRef:
                    bleu_scorer += (r1, [r2])

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        scores = np.sum(scores, axis=0)
        scores = scores.reshape((len(ref), len(subRef)))
        scores = np.sum(scores, axis=1)

        return score, scores
Example No. 10
class evalSentence:
    def __init__(self, coco, useBleu=False, useCider=False):
        self.coco = coco
        self.useBleu = useBleu
        self.useCider = useCider
        self.params = {'image_id': coco.getImgIds()}

        imgIds = self.params['image_id']
        gts = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]

        if self.useBleu:
            self.b_scorer = BleuScorer()
        if self.useCider:
            self.c_scorer = CiderScorer()

        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)

        for imgId in imgIds:
            ref = gts[imgId]

            assert (type(ref) is list)
            assert (len(ref) > 0)

            if self.useCider:
                self.c_scorer += (None, ref)

        if self.useCider:
            self.c_scorer.compute_doc_freq()
            assert (len(self.c_scorer.ctest) >= max(
                self.c_scorer.document_frequency.values()))

    def eval_cider(self, test, ref):
        assert (self.useCider)

        c_score = self.c_scorer.compute_cider(test, ref)
        return np.array(c_score)

    def eval_bleu(self, test, ref):
        assert (self.useBleu)

        self.b_scorer.reset_list()
        for ts, rs in zip(test, ref):
            self.b_scorer += (ts, rs)
        b_score, b_scores = self.b_scorer.compute_score()
        return b_scores[3]  # return bleu_4
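
Assuming the custom BleuScorer with reset_list() used above is available and coco is an already-loaded COCO caption annotation object, eval_bleu can be called with parallel lists of hypothesis strings and reference lists; the sentences below are illustrative:

evaluator = evalSentence(coco, useBleu=True)

test = ["a cat sitting on a mat",
        "two people riding horses on a beach"]
ref = [["a cat sits on the mat", "the cat is on a mat"],
       ["two people ride horses along the shore"]]

bleu4 = evaluator.eval_bleu(test, ref)   # b_scores[3]: one BLEU-4 value per test sentence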
Example No. 11
model.load_state_dict(torch.load("latest_model_49.pt"))
model.to(device)

scaler = transforms.Resize((224, 224))  # transforms.Scale was renamed to Resize in torchvision
totensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

orders = torch.cat([
    torch.arange(MAX_SEQ_LEN, dtype=torch.long, device=device).unsqueeze(0)
    for _ in range(BATCHSIZE)
],
                   dim=0)

bleu_scorer = BleuScorer(n=4)
result_json = []
model.eval()
for ids in val_loader:

    ids = ids.squeeze(1)
    captions = [val[val.id == a.item()].iloc[0].captions for a in ids]
    filenames = [val[val.id == a.item()].iloc[0].filename for a in ids]
    filenames = [val_folder_path + filename for filename in filenames]
    # filenames = [test_folder_path + filename for filename in filenames]
    image_features = torch.cat([
        get_image(filename, scaler, totensor, normalize)
        for filename in filenames
    ],
                               axis=0).to(device)
    if len(filenames) == BATCHSIZE:
Example No. 12
def evaluate_stylize(G12, G21, loader, opts, split='test'):
    """
    Evaluates sentence generation from both generators using
    BLEU, CIDEr, METEOR, ROUGE-L, and SPICE.
    """
    depleted = False
    sents_s1_all = []  # GT s1
    sents_s2_hat_all = []  # GT s1 -> s2 hat
    sents_s2_all = []  # GT s2
    sents_s1_hat_all = []  # GT s2 -> s1 hat
    while not depleted:
        # Sents: batch_size x max_length [w1, w2, ..., <eos>, <pad>, <pad>, ...]
        # Masks: batch_size x max_length [ 1,  1, ...,     1,     0,     0, ...]
        if split == 'train':
            sents_s1, masks_s1, sents_s2, masks_s2, depleted = loader.next_batch_train(
            )
        elif split == 'val':
            sents_s1, masks_s1, sents_s2, masks_s2, depleted = loader.next_batch_val(
            )
        elif split == 'test':
            sents_s1, masks_s1, sents_s2, masks_s2, depleted = loader.next_batch_test(
            )

        batch_size = sents_s1.shape[0]
        # Assuming both styles share the same max_length
        max_length = sents_s1.shape[1]
        # source input must not contain a start token
        input_sents_s1 = torch.LongTensor(sents_s1)
        input_sents_s2 = torch.LongTensor(sents_s2)
        if use_cuda:
            input_sents_s1 = input_sents_s1.cuda()
            input_sents_s2 = input_sents_s2.cuda()
        input_sents_s1 = Variable(input_sents_s1)
        input_sents_s2 = Variable(input_sents_s2)
        # encode the input source sentence
        input_sents_s1_encoded, hidden_12 = G12.encode(input_sents_s1)
        input_sents_s2_encoded, hidden_21 = G21.encode(input_sents_s2)
        # generate the predicted target
        # initial input must be the start token
        decoder_input_s2_hat = Variable(
            torch.LongTensor(np.ones((batch_size, 1)) * opts.start_idx_s2))
        decoder_input_s1_hat = Variable(
            torch.LongTensor(np.ones((batch_size, 1)) * opts.start_idx_s1))
        if use_cuda:
            decoder_input_s2_hat = decoder_input_s2_hat.cuda()
            decoder_input_s1_hat = decoder_input_s1_hat.cuda()
        rollouts_s2_hat, _ = G12.decoder_rollout(max_length,
                                                 decoder_input_s2_hat,
                                                 hidden_12,
                                                 input_sents_s1_encoded,
                                                 opts.alpha)
        rollouts_s1_hat, _ = G21.decoder_rollout(max_length,
                                                 decoder_input_s1_hat,
                                                 hidden_21,
                                                 input_sents_s2_encoded,
                                                 opts.alpha)
        sents_s2_hat = rollouts_s2_hat.data.cpu().numpy().astype(int)
        sents_s1_hat = rollouts_s1_hat.data.cpu().numpy().astype(int)

        # computing the string sentences
        sents_s1_all.extend(get_sentence_from_np(sents_s1, loader, src=True))
        sents_s1_hat_all.extend(
            get_sentence_from_np(sents_s1_hat, loader, src=True))
        sents_s2_all.extend(get_sentence_from_np(sents_s2, loader, src=False))
        sents_s2_hat_all.extend(
            get_sentence_from_np(sents_s2_hat, loader, src=False))

    # Compute BLEU scores
    bleu_scorer_G21 = BleuScorer(n=4)
    bleu_scorer_G12 = BleuScorer(n=4)
    for i in range(len(sents_s1_all)):
        bleu_scorer_G21 += (sents_s1_hat_all[i], [sents_s1_all[i]])
        bleu_scorer_G12 += (sents_s2_hat_all[i], [sents_s2_all[i]])
    bleu_G21, _ = bleu_scorer_G21.compute_score(option='closest')
    bleu_G12, _ = bleu_scorer_G12.compute_score(option='closest')

    print(
        'BLEU scores for Style 1 to 2 ===> B1: %.3f  B2: %.3f B3: %.3f B4: %.3f'
        % (bleu_G12[0], bleu_G12[1], bleu_G12[2], bleu_G12[3]))
    print(
        'BLEU scores for Style 2 to 1 ===> B1: %.3f  B2: %.3f B3: %.3f B4: %.3f'
        % (bleu_G21[0], bleu_G21[1], bleu_G21[2], bleu_G21[3]))
Example No. 13
# from https://github.com/mtanti/coco-caption/blob/master/pycocoevalcap/bleu/bleu_scorer.py
from bleu_scorer import BleuScorer

true_sentences = []
pred_sentences = []

f = open("eval/preds.txt", "r")
for pred in f:
    pred = pred.split("[SEP]", 1)[0]
    pred_sentences.append(pred)
    #print(pred)
f.close()

g = open("eval/golds.txt", "r")
for true in g:
    true = true.split("[SEP]", 1)[0]
    true_sentences.append([true])
    #print(true)
g.close()

bleu_scorer = BleuScorer(n=4)  # up to 4 gram
for true, pred in zip(true_sentences, pred_sentences):
    bleu_scorer += (pred, true)

scores, instance_scores = bleu_scorer.compute_score(option='closest',
                                                    verbose=0)
print("BLEU1 score: ", scores[0])
print("BLEU4 score: ", scores[3])
Example No. 14
    def __init__(self, n=4):
        # by default, compute BLEU score up to 4-grams
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}
        self.bleu_scorer = BleuScorer(n=self._n)
Example No. 15
def bleu(output, ref):
    scorer = BleuScorer(n=4)
    scorer += (output.lower(), [ref.lower()])
    score, _ = scorer.compute_score()

    return score
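
A quick illustrative call; because the scorer is built with n=4, compute_score returns a list of four values (BLEU-1 through BLEU-4) for the single sentence pair:

score = bleu("The cat sat on the mat", "A cat is sitting on the mat")
print(score)       # [BLEU-1, BLEU-2, BLEU-3, BLEU-4]; score[3] is the BLEU-4 value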