Example no. 1
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Meteor_scorer
    Meteor_scorer = Meteor()
    global Rouge_scorer
    Rouge_scorer = Rouge()
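Note: the snippet above relies on module-level scorer globals that are defined elsewhere in the file. A minimal sketch of that surrounding setup (import paths follow the pip pycocoevalcap layout; the CiderD class usually lives in a repo-local package, so its path below is an assumption and left commented):

# Sketch of the module-level state init_scorer() relies on (not part of the original snippet).
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
# from ciderD.ciderD import CiderD  # repo-specific location; adjust before uncommenting

CiderD_scorer = None
Bleu_scorer = None
Meteor_scorer = None
Rouge_scorer = None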
Example no. 2
def cal_BLEU(generated, reference, is_corpus=False):
    #print 'in BLEU score calculation'
    # accumulate BLEU-2/3/4 for each generated sentence and average over the set (BLEU-1 is skipped)
    BLEUscore = [0.0, 0.0, 0.0]
    for idx, g in enumerate(generated):
        if is_corpus:
            score, scores = Bleu(4).compute_score(reference, {0: [g]})
        else:
            score, scores = Bleu(4).compute_score({0: [reference[0][idx]]},
                                                  {0: [g]})
        #print g, score
        for i, s in zip([0, 1, 2], score[1:]):
            BLEUscore[i] += s
        #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
    BLEUscore[0] = BLEUscore[0] / len(generated)
    BLEUscore[1] = BLEUscore[1] / len(generated)
    BLEUscore[2] = BLEUscore[2] / len(generated)
    return BLEUscore
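For reference, Bleu(n).compute_score(gts, res) returns a pair: the first element is a list of n corpus-level values (BLEU-1 ... BLEU-n) and the second element holds the matching per-sentence lists, which is why the loop above reads score[1:] for BLEU-2/3/4. A minimal sketch with invented sentences:

# Illustration of the (score, scores) return shape; the sentences are made up.
from pycocoevalcap.bleu.bleu import Bleu

refs = {0: ["a man is playing a guitar"]}   # id -> list of reference strings
hyps = {0: ["a man plays a guitar"]}        # id -> single-element hypothesis list
score, scores = Bleu(4).compute_score(refs, hyps)
print(score)      # [BLEU-1, BLEU-2, BLEU-3, BLEU-4] at corpus level
print(scores[3])  # per-sentence BLEU-4 values, one per id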
Example no. 3
def compute_bleu_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios, dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    from pycocoevalcap.bleu.bleu import Bleu
    scorer = Bleu(4)

    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):

        if keys[i] in hypothesis:
            continue

        # prepare candidate
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [
            " ".join(candidate),
        ]

        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)

    key2score = {key: scores[3][i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]

    return results
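compute_bleu_score assumes a vocabulary object exposing idx2word plus integer start/end token indices; the driver below is a hypothetical stand-in (the Vocab class and all data are invented) that only illustrates the expected call shape:

# Hypothetical usage of compute_bleu_score(); Vocab and the data are invented.
import numpy as np

class Vocab:
    def __init__(self, words):
        self.idx2word = dict(enumerate(words))

vocab = Vocab(["<start>", "<end>", "a", "dog", "barks"])
decode_res = np.array([[0, 2, 3, 4, 1]])               # one decoded index sequence
gts = {"clip_0": ["a dog barks", "a dog is barking"]}  # key -> reference sentences
print(compute_bleu_score(decode_res, ("clip_0",), gts, 0, 1, vocab))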
Example no. 4
    def computeBleuScore(self):
        methods = ["Bleu 1.0", "Bleu 2.0", "Bleu 3.0", "Bleu 4.0"]

        # Compute corpus-level scores and per-video score lists
        scores, bleuList = Bleu(4).compute_score(self.gtc_tokens,
                                                 self.pc_tokens)

        for score, bleu, method in zip(scores, bleuList, methods):
            self.evalResults[method] = score
            self.setVideoEvalResults(bleu, method)
Example no. 5
    def __init__(self, vocab_file='graph2text/data/vocabs.txt'):
        super(Evaluate, self).__init__()
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            # (Rouge(), "ROUGE_L")
        ]
        with open(vocab_file, encoding='utf-8') as f:
            vocab_list = f.readlines()
        self.vocab = [_.strip('\n') for _ in vocab_list]
        self.padding_idx = self.vocab.index('<blank>')
Example no. 6
def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)

    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
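Both JSON files are expected to decode to dicts with matching keys, each key mapping to a list of sentence strings (several references on the ground-truth side, typically one sentence on the result side). A hypothetical pair of inputs:

# Invented example of the structures eval() expects after json.load().
gts_dict = {"img_1": ["a dog runs on the grass", "a dog is running outside"],
            "img_2": ["two people ride bicycles"]}
res_dict = {"img_1": ["a dog running on grass"],
            "img_2": ["people riding bikes"]}
# eval(...) then returns (bleu, meteor, rouge, cider) computed over these pairs.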
Example no. 7
def get_qg_metrics(generated, question, promptQuestion, metrics):

    evaluation = {}

    # computing bleu scores
    for name, score in zip(['bleu{}'.format(i) for i in range(1, 5)],
                           Bleu(4).compute_score(question, generated)[0]):
        if name in metrics:
            evaluation[name] = score

    # computing edit-f1 score
    if 'edit-f1' in metrics:

        def _get_edits(tokens1, tokens2):
            allCommon = []
            while True:
                commons = list(set(tokens1) & set(tokens2))
                if len(commons) == 0:
                    break
                allCommon += commons
                for c in commons:
                    ind1, ind2 = tokens1.index(c), tokens2.index(c)
                    tokens1 = tokens1[:ind1] + tokens1[ind1 + 1:]
                    tokens2 = tokens2[:ind2] + tokens2[ind2 + 1:]
            deleted = ["[DELETED]" + token for token in tokens1]
            added = ["[ADDED]" + token for token in tokens2]
            common = ["[FIXED]" + token for token in allCommon]
            return deleted + added  #+common

        assert len(generated) == len(promptQuestion) == 1
        generated = generated["sent"][0].split(" ")
        promptQuestion = promptQuestion["sent"][0].split(" ")
        prediction = _get_edits(promptQuestion, generated)
        edit_f1 = 0
        for _question in question["sent"]:
            _question = _question.split(" ")
            reference = _get_edits(promptQuestion, _question)
            # now compare the reference edits and predicted edits
            if len(reference) == len(prediction) == 0:
                # rarely, reference has no edits after normalization
                # then, if the prediction also has no edits, it gets full score
                edit_f1 = 1
            elif len(reference) == 0 or len(prediction) == 0:
                # if only one of them has no edits, zero score
                edit_f1 = max(edit_f1, 0)
            else:
                # otherwise, compute F1 score between prediction and reference
                edit_f1 = max(
                    edit_f1,
                    get_f1(prediction, reference,
                           is_equal=lambda x, y: x == y))
        evaluation["edit-f1"] = edit_f1

    assert len(metrics) == len(evaluation)
    return evaluation
Example no. 8
        def bleu_scorer(reference, hypothesis):
            # =================================================
            # Compute scores
            # =================================================
            scorer = Bleu(4)
            method = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
            # print('computing %s score...' % (scorer.method()))

            score, scores = scorer.compute_score(reference, hypothesis)

            bleus = {}
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    # print("%s: %0.3f" % (m, sc))
                    bleus[m] = sc
            else:
                # print("%s: %0.3f" % (method, score))
                bleus[method] = score

            return bleus
Example no. 9
def get_auxiliary_features(contexts, gtresponses, modelresponses, num_examples):
    aux_features = np.zeros((num_examples, 5))
    bleu1 = []
    bleu2 = []
    bleu3 = []
    bleu4 = []
    meteor = []
    rouge = []
    for i in range(num_examples):
        bleu1.append(Bleu(1).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][0])
        bleu2.append(Bleu(2).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][1])
        bleu3.append(Bleu(3).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][2])
        bleu4.append(Bleu(4).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][3])
        rouge.append(Rouge().compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0])
    aux_features[:,0] = bleu1
    aux_features[:,1] = bleu2
    aux_features[:,2] = bleu3
    aux_features[:,3] = bleu4
    aux_features[:,4] = rouge
    return aux_features
Example no. 10
class Metrics:
    def __init__(self):
        pass

    def bleu(self, hypo, ref):
        self.bleu_scorer = Bleu(4)
        final_scores = {}
        score, scores = self.bleu_scorer.compute_score(ref, hypo)
        for m, s in zip(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], score):
            final_scores[m] = s
        return final_scores
Example no. 11
def score(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores
Example no. 12
def init_scorer(cache_tokens):
    global CiderD_scorer
    if CiderD_scorer is None:
        CiderD_scorer = CiderD(df=cache_tokens)
    # CiderD_scorer = CiderD_scorer or CiderD(df=cache_tokens)
    global Bleu_scorer
    if Bleu_scorer is None:
        Bleu_scorer = Bleu(4)
Example no. 13
    def evaluate_tiou(self, tiou):
        # For every prediction, find its matching references with tIoU greater than the passed-in threshold.
        res = {}
        gts = {}
        unique_index = 0
        for vid_id in self.prediction:
            for pred in self.prediction[vid_id]:
                res[unique_index] = [{'caption': pred['sentence']}]
                matches = []
                for gt in self.ground_truths:
                    refs = gt[vid_id]
                    for ref_i, ref_timestamp in enumerate(refs['timestamps']):
                        if self.iou(pred['timestamp'], ref_timestamp) > tiou:
                            matches.append(refs['sentences'][ref_i])
                if len(matches) == 0:
                    gts[unique_index] = [{'caption': 'abc123!@#'}]
                else:
                    gts[unique_index] = [{'caption': v} for v in matches]
                unique_index += 1

        # Set up scorers
        if self.verbose:
            print('| Tokenizing ...')
        # Suppressing tokenizer output
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # Set up scorers
        if self.verbose:
            print('| Setting up scorers ...')
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # Compute scores
        output = {}
        for scorer, method in scorers:
            if self.verbose:
                print('computing %s score...' % scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    output[m] = sc
                    if self.verbose:
                        print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, m,
                                                                     sc)
            else:
                output[method] = score
                if self.verbose:
                    print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method,
                                                                 score)
        return output
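PTBTokenizer.tokenize, as used above, takes a dict whose values are lists of {'caption': ...} entries and returns a dict of plain lower-cased, punctuation-stripped strings (it shells out to the bundled Stanford tokenizer, so a Java runtime is required). A minimal sketch with an invented sentence:

# Illustration of the tokenizer contract assumed above.
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

raw = {0: [{'caption': 'A man, riding a horse.'}]}
tok = PTBTokenizer().tokenize(raw)
print(tok)  # e.g. {0: ['a man riding a horse']}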
Example no. 14
def calculate_metric(rnn, meteor=None):
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts: gts[iid] = []
        #gts[iid].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        gts[iid] = [
            ' '.join(rnn.dp.tokens[i][::-1])
            for i in rnn.dp.img_id_to_tokens[iid]
        ]
        if iid in res: continue
        res[iid] = []
        #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        (lp, pos_sen) = decoder_beamsearch(rnn,
                                           rnn.V_valid[idx],
                                           senti=1.0,
                                           beam_size=1)
        pos_sen = pos_sen[:-1]
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    lp_avg /= float(lp_c)

    bleu = Bleu()
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    if meteor is None:
        meteor = Meteor()
    print("Meteor:")
    mscore = meteor.compute_score(gts, res)[0]
    print("Positive:", mscore)
    return mscore
Example no. 15
        def val_score(self, s_start=0, num_batches=2):
            bs = self.imp["BATCH_SIZE"]
            bleu = Bleu()
            eval_store_gen = {}
            eval_store_gt = {}
            num_examples = self.test_data.dec_in.get_num_seqs()
            max_num_batches = num_examples // bs
            for i in range(min(num_batches, max_num_batches)):
                s = s_start + bs * i
                e = s_start + bs * (i + 1)
                gen_txt = self.generate(s=s, allow_unk=False)
                gt_txt = self.test_data.dec_out.get_text(s, e)
                fnames = self.test_data.filenames[s:e]
                for g, f in zip(gen_txt, fnames):
                    if f not in eval_store_gen:
                        eval_store_gen[f] = [" ".join(g)]

                for g, f in zip(gt_txt, fnames):
                    if f not in eval_store_gt:
                        eval_store_gt[f] = []
                    eval_store_gt[f].append(" ".join(g))
            print(bleu.compute_score(eval_store_gt, eval_store_gen)[0])
Example no. 16
def score(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores
Example no. 17
    def score(self, GT, RES, IDs):
        self.eval = {}
        self.imgToEval = {}
        gts = {}
        res = {}
        for ID in IDs:
            #            print ID
            gts[ID] = GT[ID]
            res[ID] = RES[ID]
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            # (Spice(), "SPICE")
        ]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        sub_category_score = None
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            if method == 'SPICE':
                score, scores, sub_category_score = scorer.compute_score(
                    gts, res)
            else:
                score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, IDs, m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, IDs, method)
                print("%s: %0.3f" % (method, score))

        # for metric, score in self.eval.items():
        #    print '%s: %.3f'%(metric, score)
        return self.eval, sub_category_score
Example no. 18
def eval_epoch_bleu(model, validation_data, device, vocab, list_of_refs_dev, args):
    ''' Epoch operation in evaluation phase '''

    model.eval()

    total_loss = 0
    n_word_total = 0
    n_word_correct = 0

    hypotheses = {}
    count = 0

    with torch.no_grad():
        for batch in tqdm(
                validation_data, mininterval=2,
                desc='  - (Validation) ', leave=False):

            # prepare data
            image0, image1, image0_attribute, image1_attribute = map(lambda x: x.to(device), batch)

            """[src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions
                                    that should be masked with float('-inf') and False values will be unchanged.
                                    This mask ensures that no information will be taken from position i if
                                    it is masked, and has a separate mask for each sequence in a batch."""

            hyp = beam_search(image0, image1, model, args, vocab, image0_attribute, image1_attribute)

            hyp = hyp.split("<end>")[0].strip()

            hypotheses[count] = [hyp]

            count += 1

        scorer = Bleu(4)

        score, _ = scorer.compute_score(list_of_refs_dev, hypotheses)

    return score
Example no. 19
def score(gts, res, ids, log_out):
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        # print 'computing %s score...'%(scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                print("%s: %f" % (m, sc), file=log_out)
        else:
            print("%s: %f" % (method, score), file=log_out)
Example no. 20
    def __init__(self, ground_truth_fname, lang=DEFAULT_LANG):
        self.eval = {}
        self.imgToEval = {}
        self.gts = {}

        data = open(
            ground_truth_fname).readlines() if 0 == lang else codecs.open(
                ground_truth_fname, 'r', 'utf-8').readlines()
        for line in data:
            sent_id, sent = line.strip().split(' ', 1)
            sent = ' '.join(TextTool.tokenize(sent, lang))  #process_sent(sent)
            img_id = os.path.splitext(sent_id.split('#')[0])[0]
            self.gts.setdefault(img_id, []).append(sent)

        logger.info('setting up scorers...')
        if 0 == lang:
            self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3",
                                       "Bleu_4"]), (Meteor(), "METEOR"),
                            (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
        else:
            self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3",
                                       "Bleu_4"]), (Rouge(), "ROUGE_L"),
                            (Cider(), "CIDEr")]
Example no. 21
def score(ref, sample):
    # ref and sample are both dict
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        print('computing %s score with COCO-EVAL...' % scorer.method())
        score, scores = scorer.compute_score(ref, sample)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
Example no. 22
def evaluate_captions_bleu(ref, cand):
    hypo = {}
    refe = {}
    for i, caption in enumerate(cand):
        hypo[i] = [caption]
        refe[i] = ref[i]
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    final_scores = {}
    for scorer, method in scorers:
        _, scores = scorer.compute_score(refe, hypo)
        for m, s in zip(method, scores):
            final_scores[m] = s
            assert len(s) == len(cand)
    return final_scores['Bleu_4']
Example no. 23
    def score(self, GT, RES, IDs):
        self.eval = {}
        self.imgToEval = {}
        gts = {}
        res = {}
        for ID in IDs:
            gts[ID] = GT[ID]
            res[ID] = RES[ID]
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
        with open('all_samples.txt', 'w') as f:
            for i in res.keys():
                print('valid stuff', file=f)
                print('\t'.join(res[i]), file=f)
                print('ground truth', file=f)
                print('\n'.join(gts[i]), file=f)
        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            #            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print('computing %s score...' % scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, IDs, m)
                    print "%s: %0.3f" % (m, sc)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, IDs, method)
                print "%s: %0.3f" % (method, score)

        for metric, score in self.eval.items():
            print('%s: %.3f' % (metric, score))
        return self.eval
Example no. 24
    def score(self, GT, RES, IDs, result_file):
        self.eval = {}
        self.imgToEval = {}
        gts = {}
        res = {}
        for ID in IDs:
            #            print ID
            gts[ID] = GT[ID]
            res[ID] = RES[ID]
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        #         result_file = '/home/anguyen/workspace/paper_src/2018.icra.v2c.source/output/' + net_id + '/prediction/score_result.txt'
        print('RESULT FILE: ', result_file)

        fwriter = open(result_file, 'w')

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print('computing %s score...' % scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, IDs, m)
                    print "%s: %0.3f" % (m, sc)
                    fwriter.write("%s %0.3f\n" % (m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, IDs, method)
                print "%s: %0.3f" % (method, score)
                fwriter.write("%s %0.3f\n" % (method, score))

        #for metric, score in self.eval.items():
        #    print '%s: %.3f'%(metric, score)
        fwriter.close()
        return self.eval
Example no. 25
    def evaluate(self):
        res = {}
        for r in self.rests:
            res[str(r['image_id'])] = [{'caption': r['caption']}]

        gts = {}
        for imgId in self.annos:
            gts[str(imgId)] = [{'caption': c} for c in self.annos[imgId]]

        # =================================================
        # Set up scorers
        # =================================================
        # print('tokenization...')
        tokenizer = self.Tokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
        # =================================================
        # Set up scorers
        # =================================================
        # print('setting up scorers...')
        use_scorers = self.use_scorers
        scorers = []
        if 'Bleu' in use_scorers:
            scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
        if 'METEOR' in use_scorers:
            scorers.append((Meteor(), "METEOR"))
        if 'ROUGE_L' in use_scorers:
            scorers.append((Rouge(), "ROUGE_L"))
        if 'CIDEr' in use_scorers:
            scorers.append((Cider(), "CIDEr"))
        if 'SPICE' in use_scorers:
            scorers.append((Spice(), "SPICE"))

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            # print('computing %s score...'%(scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    # print("%s: %0.1f" % (m, sc*100))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                # print("%s: %0.1f" % (method, score*100))
        self.setEvalImgs()
Example no. 26
def evaluate():
  with open(os.path.join(FLAGS.data_dir, 'feature.test'), 'rb') as f:
    feature = pickle.load(f)
  with open(os.path.join(FLAGS.data_dir, 'caption.test'), 'rb') as f:
    sentence = pickle.load(f)

  scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
  vocab, re_vocab = data_utils.initialize_vocabulary()
  GTS = {}
  RES = {}
  batch_size = 1
  max_meteor = 0

  with tf.Session() as sess:
    model = Seq2Seq(FLAGS.num_units, FLAGS.use_lstm, FLAGS.epsilon, FLAGS.max_computation, FLAGS.encoder_max_sequence_length, FLAGS.decoder_max_sentence_length, FLAGS.feature_size, FLAGS.vocab_size, FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, FLAGS.time_penalty, FLAGS.max_gradient_norm, forward_only=True)
    step = 0
    while True:
      step += FLAGS.steps_per_checkpoint
      ckpt_path = os.path.join(FLAGS.checkpoint_dir,'ckpt-%d'%step)
      if os.path.isfile(ckpt_path+'.meta'):
        model.saver.restore(sess, ckpt_path)
        for vid, _ in feature.items():
          feature_inputs, batch_decoder_inputs, batch_weights = model.get_batch(feature, [(vid, [0])])
          output_logits, remainders, iterations = model.step(sess, feature_inputs, batch_decoder_inputs, batch_weights, forward_only=True)
          outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
          if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
          R = ['%.3f'%remainders[j][0] for j in range(FLAGS.encoder_max_sequence_length)]
          I = ['%d'%iterations[j][0] for j in range(FLAGS.encoder_max_sequence_length)]
          print(' '.join(R))
          print(' '.join(I))
          sen = " ".join([tf.compat.as_str(re_vocab[output]) for output in outputs])
          print ("%s - %s: %s"%(vid, sen, sentence[vid][9]))
          GTS[vid] = sentence[vid]
          RES[vid] = [sen]
        print('STEP: %d'%step)
        for scorer, method in scorers:
          score, scores = scorer.compute_score(GTS, RES)
          if method == "METEOR" and score > max_meteor:
            max_meteor = score
          if isinstance(method, list):
            for k, v in zip(method, score):
              print("%s:\t%f"%(k, v))
          else:
            print("%s:\t%f"%(method, score))
        sys.stdout.flush()
      else:
        break
  print("Max METEOR:\t%f"%max_meteor)
Example no. 27
    def score(self, GT, RES, IDs):
        self.eval = {}
        self.imgToEval = {}
        gts = {}
        res = {}
        for ID in IDs:
            gts[ID] = GT[ID]
            res[ID] = RES[ID]
        print('tokenization...')
        tokenizer = PTBTokenizer()
        '''
        print("gts: ")
        for key in gts:
        	print(key)
        	for value in gts[key]:
        		print(value)
        '''
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, IDs, m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, IDs, method)
                print("%s: %0.3f" % (method, score))

        for metric, score in self.eval.items():
            print('%s: %.3f' % (metric, score))
        return self.eval
Example no. 28
    def evaluate(self):
        imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]

        # =================================================
        # Set up scorers
        # =================================================
        print('tokenization...')

        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print('computing %s score...'%(scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    print("%s: %0.3f"%(m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                print("%s: %0.3f"%(method, score))
        self.setEvalImgs()
Example no. 29
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
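A hypothetical call to the helper above, showing the multi-reference dict format both arguments share (the ids and sentences are invented):

# Invented usage example for score(); each id maps to a list of strings.
ref = {"v1": ["a cat sits on a mat", "a cat is sitting on the mat"],
       "v2": ["a child kicks a ball"]}
hypo = {"v1": ["a cat sitting on a mat"],
        "v2": ["a kid kicks the ball"]}
print(score(ref, hypo))  # {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...}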
Example no. 30
def main(eval_caption_file, output, zh=False, embedding_path=None):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    print(bleu_scores)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    print(cider_score)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)
    print(rouge_score)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)

        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    diverse_score = diversity_score(refs, zh)

    with open(embedding_path, "rb") as f:
        ref_embeddings = pickle.load(f)

    bert_score = embedding_score(ref_embeddings, zh)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
        f.write("SentenceBert: {:6.3f}\n".format(bert_score))
        f.write("Diversity: {:6.3f}\n".format(diverse_score))