Example #1
    def __init__(self,
                 ground_truth_filenames=None,
                 prediction_filename=None,
                 tious=None,
                 max_proposals=1000,
                 prediction_fields=PREDICTION_FIELDS,
                 verbose=False):
        # Check that the gt and submission files exist and load them
        if len(tious) == 0:
            raise IOError('Please input a valid tIoU.')
        if not ground_truth_filenames:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')

        self.verbose = verbose
        self.tious = tious
        self.max_proposals = max_proposals
        self.pred_fields = prediction_fields
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers; if not verbose, we only use the one we're
        # testing on: CIDEr
        if self.verbose:
            self.scorers = [
                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                (Meteor(), "METEOR"),
                (Rouge(), "ROUGE_L"),
                (Cider('corpus'), "CIDEr"),
                (Spice(), "SPICE")
            ]
        else:
            self.scorers = [(Cider('corpus'), "CIDEr")]
Example #2
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    """Defining Scorers"""
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}

    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]
    """Generation Loop"""
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)

            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            sequences_ref[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in targets[0]
                    if j.item() not in bad_toks
                ])
            ]
            sequences_gen[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in predictions[0][1]
                    if j.item() not in bad_toks
                ])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]
    """Getting Scores"""
    bleu_score, bleu_scores = scorer_bleu.compute_score(
        sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(
        sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(
        sequences_ref, sequences_gen)
    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
Example #3
def compute_batch_score(decode_res,
                        key2refs,
                        keys,
                        start_idx,
                        end_idx,
                        vocabulary,
                        scorer):
    """
    Args:
        decode_res: decoding results of model, [N, max_length]
        key2refs: references of all samples, dict(<key> -> [ref_1, ref_2, ..., ref_n]
        keys: keys of this batch, used to match decode results and refs
    Return:
        scores of this batch, [N,]
    """

    import numpy as np

    if scorer is None:
        from pycocoevalcap.cider.cider import Cider
        scorer = Cider()

    hypothesis = {}
    references = {}

    for i in range(len(keys)):

        if keys[i] in hypothesis.keys():
            continue

        # prepare candidate sentence
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            candidate.append(vocabulary.idx2word[w_t])

        hypothesis[keys[i]] = [" ".join(candidate), ]

        # prepare reference sentences
        references[keys[i]] = key2refs[keys[i]]

    score, scores = scorer.compute_score(references, hypothesis)
    key2score = {key: scores[i] for i, key in enumerate(references.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results 
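
For orientation, here is a hypothetical call sketch for compute_batch_score: the tiny vocabulary class, the decoded index matrix, and the keys are invented for illustration, and only the pycocoevalcap Cider API is real.

import numpy as np

class ToyVocab:
    # hypothetical stand-in for the real vocabulary object
    idx2word = {0: "<start>", 1: "<end>", 2: "a", 3: "dog", 4: "barks"}

decode_res = np.array([[0, 2, 3, 4, 1],    # "<start> a dog barks <end>"
                       [0, 2, 3, 1, 1]])   # "<start> a dog <end>"
keys = ["clip_1", "clip_2"]
key2refs = {"clip_1": ["a dog barks loudly"], "clip_2": ["a dog is barking"]}

batch_scores = compute_batch_score(decode_res, key2refs, keys,
                                   start_idx=0, end_idx=1,
                                   vocabulary=ToyVocab(), scorer=None)
print(batch_scores)  # per-sample CIDEr scores, shape [2]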
Example #4
def get_self_critical_reward(model, feat0, feat1, feat_mask, pos_feat,
                             groundtruth, probability_sample, id_word):
    batch_size = feat0.size(0)
    double_batch_size = batch_size * 2
    seq_length = probability_sample.size(1)

    greedy_sample, _ = model.sample(feat0, feat1, feat_mask, pos_feat)
    res = OrderedDict()
    gts = OrderedDict()
    greedy_sample = greedy_sample.cpu().numpy()
    probability_sample = probability_sample.cpu().numpy()

    for i in range(batch_size):
        res[i] = [numbers_to_str(probability_sample[i])]
    for i in range(batch_size, double_batch_size):
        res[i] = [numbers_to_str(greedy_sample[i - batch_size])]

    length = len(groundtruth[0])
    for i in range(batch_size):
        gts[i] = [numbers_to_str(groundtruth[i][j]) for j in range(length)]
    gts = {i: gts[i % batch_size] for i in range(double_batch_size)}
    assert len(gts.keys()) == len(
        res.keys()), 'len of gts.keys is not equal to that of res.keys'
    avg_cider_score, cider_score = Cider().compute_score(gts=gts, res=res)
    cider_score = np.array(cider_score)
    reward = cider_score[:batch_size] - cider_score[batch_size:]
    reward = np.repeat(reward[:, np.newaxis], seq_length, axis=1)
    return reward
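
The reward above is the standard self-critical (SCST) construction: sampled and greedy captions are scored together against the same references, and the greedy CIDEr is subtracted as a baseline. A self-contained toy of just that arithmetic, with invented sentences and a batch size of 2:

import numpy as np
from pycocoevalcap.cider.cider import Cider

batch_size = 2
gts_base = {0: ["a dog runs on the grass"], 1: ["a man rides a bicycle"]}
res = {0: ["a dog is running on grass"],  # sampled caption, item 0
       1: ["a man rides a bike"],         # sampled caption, item 1
       2: ["a dog runs"],                 # greedy caption, item 0 (baseline)
       3: ["a man on a bicycle"]}         # greedy caption, item 1 (baseline)
gts = {i: gts_base[i % batch_size] for i in range(2 * batch_size)}

_, per_caption = Cider().compute_score(gts, res)
per_caption = np.array(per_caption)
reward = per_caption[:batch_size] - per_caption[batch_size:]  # sampled minus greedy
reward = np.repeat(reward[:, np.newaxis], 6, axis=1)          # broadcast over seq_length
print(reward.shape)  # (2, 6)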
Example #5
    def __init__(self, ground_truth_filenames, prediction_filename, verbose=False, all_scorer=False):
        # Check that the gt and submission files exist and load them
        self.verbose = verbose
        self.all_scorer = all_scorer
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: METEOR

        # Meteor is Java-based and can crash a lot.
        try:
            met = Meteor()
        except (AttributeError, FileNotFoundError) as e:
            print(f"Meteor couldn't start due to {e}")
            met = None

        if self.verbose or self.all_scorer:
            self.scorers = [
                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                (met, "METEOR"),
                (Rouge(), "ROUGE_L"),
                (Cider(), "CIDEr")
            ]
        else:
            self.scorers = [(met, "METEOR")]

        # init some attributes
        self.easy_samples = {}
        self.hard_samples = {}
        self.n_ref_vids = set()
        self.scores = {}
Example #6
    def evaluate(self):
        # =================================================
        # Tokenization
        # =================================================
        print("Tokenization")
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(self.ground_truth)
        preds = tokenizer.tokenize(self.prediction)

        # =================================================
        # Setup scorers
        # =================================================
        print("Setting up scorers...")
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            # (Spice(), "SPICE")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print("Computing {} score...".format(scorer.method()))
            score, scores = scorer.compute_score(gts, preds)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    self.eval_res[m] = sc * 100
            else:
                self.eval_res[method] = score * 100
Example #7
def calc_scores(file1, file2):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    ref: 真实的数据,类型为dict,如dict{"id":"[sentences]"}
    hypo: 生成的数据,格式如上。
    需满足:
        assert(type(hypo) is list);
        assert(len(hypo) == 1);
        assert(type(ref) is list);
        assert(len(ref) >= 1);
    
    """

    pred = readfiles(file1)
    test = readfiles(file2)
    # Build index-keyed dicts
    ids = list(range(len(pred)))
    hypo = dict(zip(ids, pred))
    ref = dict(zip(ids, test))

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
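
For reference, a minimal sketch (toy sentences) of the id -> list-of-sentences format the scorers expect once the two files are zipped into dicts; both dicts must share the same ids, and each hypothesis list holds exactly one string. Meteor is skipped here because it needs a working Java setup.

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

ref = {0: ["a man is riding a horse"], 1: ["two dogs play in the snow"]}
hypo = {0: ["a man rides a horse"], 1: ["dogs are playing in snow"]}

for scorer, method in [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                       (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]:
    score, _ = scorer.compute_score(ref, hypo)
    print(method, score)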
Example #8
    def get_scorers(self):
        # from pycoco_scorers_vizseq import BLEUScorerAll
        from pycocoevalcap.bleu.bleu import Bleu

        # from pycocoevalcap.spice.spice import Spice
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        import logging
        import transformers

        transformers.tokenization_utils.logger.setLevel(logging.ERROR)
        transformers.configuration_utils.logger.setLevel(logging.ERROR)
        transformers.modeling_utils.logger.setLevel(logging.ERROR)
        Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
        self.scorer_dict = {
            "bleu":
            Scorer_(Bleu(4, verbose=0), False,
                    ["bleu@1", "bleu@2", "bleu@3", "bleu@4"]),
            "meteor":
            Scorer_(Meteor(), False, ["meteor"]),
            "cider":
            Scorer_(Cider("corpus"), False, ["cider"]),
            "rouge":
            Scorer_(Rouge(), False, ["rouge"]),
            # "spice": Scorer_(Spice(), False, ["spice"]),
            "bert_score":
            Scorer_(BertScoreSimple, True, ["bert_score"]),
        }
        self.tokenizer = PTBTokenizer()
Example #9
def evaluate(gts, res):
    eval = {}

    # =================================================
    # Tokenization
    # =================================================
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                eval[m] = sc
        else:
            eval[method] = score

    return eval
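
As a reminder of the structure PTBTokenizer consumes in the standard pycocoevalcap implementation: each id maps to a list of {"caption": ...} dicts, and tokenize() returns the same ids mapped to lists of cleaned, lower-cased strings. A toy sketch (the tokenizer shells out to the Stanford PTB tokenizer jar, so Java must be available):

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

gts = {"img1": [{"caption": "A man is riding a horse."},
                {"caption": "Someone rides a brown horse."}]}
res = {"img1": [{"caption": "A man rides a horse."}]}

tokenizer = PTBTokenizer()
print(tokenizer.tokenize(gts))  # e.g. {'img1': ['a man is riding a horse', 'someone rides a brown horse']}
print(tokenizer.tokenize(res))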
Example #10
    def get_scorers(self):
        # from pycoco_scorers_vizseq import BLEUScorerAll
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.rouge.rouge import Rouge

        from pycocoevalcap.meteor.meteor import Meteor

        # from pycocoevalcap.spice.spice import Spice

        from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

        Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
        self.scorer_dict = {
            "bleu": Scorer_(
                Bleu(4, verbose=0), False, ["bleu_1", "bleu_2", "bleu_3", "bleu_4"]
            ),
            "meteor": Scorer_(Meteor(), False, ["meteor"]),
            "cider": Scorer_(Cider("corpus"), False, ["cider"]),
            "rouge": Scorer_(Rouge(), False, ["rouge"]),
            # "spice": Scorer_(Spice(), False, ["spice"]),
        }
        self.tokenizer = PTBTokenizer()

        self.coval_all_metrics = [
            ("mentions", evaluator.mentions),
            ("muc", evaluator.muc),
            ("bcub", evaluator.b_cubed),
            ("ceafe", evaluator.ceafe),
            ("lea", evaluator.lea),
            ("lea_soft", evaluator.lea_soft),
        ]
        self.reset_coval_scorer_dict()
Example #11
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    # print('ref')
    # print(ref)
    # print('hypo')
    # print(hypo)
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
Example #12
def language_eval(sample_seqs, groundtruth_seqs):
    assert len(sample_seqs) == len(groundtruth_seqs), 'length of sampled seqs is different from that of groundtruth seqs!'

    references = OrderedDict()
    predictions = OrderedDict()
    for i in range(len(groundtruth_seqs)):
        references[i] = [groundtruth_seqs[i][j] for j in range(len(groundtruth_seqs[i]))]
    for i in range(len(sample_seqs)):
        predictions[i] = [sample_seqs[i]]

    predictions = {i: predictions[i] for i in range(len(sample_seqs))}
    references = {i: references[i] for i in range(len(groundtruth_seqs))}

    avg_bleu_score, bleu_score = Bleu(4).compute_score(references, predictions)
    print('avg_bleu_score == ', avg_bleu_score)
    avg_cider_score, cider_score = Cider().compute_score(references, predictions)
    print('avg_cider_score == ', avg_cider_score)
    avg_meteor_score, meteor_score = Meteor().compute_score(references, predictions)
    print('avg_meteor_score == ', avg_meteor_score)
    avg_rouge_score, rouge_score = Rouge().compute_score(references, predictions)
    print('avg_rouge_score == ', avg_rouge_score)

    # print('BLEU1:{}\nBLEU2:{}\nBLEU3:{}\nBLEU4:{}\nMETEOR:{}\nROUGE:{}CIDEr:{}\n'.format(avg_bleu_score[0],
    #                                                                                      avg_bleu_score[1],
    #                                                                                      avg_bleu_score[2],
    #                                                                                      avg_bleu_score[3],
    #                                                                                      avg_meteor_score,
    #                                                                                      avg_rouge_score,
    #                                                                                      avg_cider_score))
    return {'BLEU': avg_bleu_score, 'CIDEr': avg_cider_score,  'METEOR': avg_meteor_score,   'ROUGE': avg_rouge_score}
Example #13
def CocoScore(ref, hyp, metrics_list=None, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hyp: Dictionary of hypothesis sentences (id, sentence)
    :param metrics_list: List of metrics to evaluate on
    :param language: Language of the sentences (for METEOR)
    :return: dictionary of scores
    """
    if metrics_list is None:
        metrics_list = ['bleu', 'ter', 'meteor', 'rouge_l', 'cider']
    else:
        metrics_list = [metric.lower() for metric in metrics_list]
    scorers = []
    if 'bleu' in metrics_list:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'meteor' in metrics_list:
        scorers.append((Meteor(language), "METEOR"))
    if 'ter' in metrics_list:
        scorers.append((Ter(), "TER"))
    if 'rouge_l' in metrics_list or 'rouge' in metrics_list:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'cider' in metrics_list:
        scorers.append((Cider(), "CIDEr"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hyp)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
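
A hedged usage sketch for CocoScore above, limited to metrics whose scorers ship with pycocoevalcap itself (toy sentences; METEOR and TER are skipped because they rely on extra tooling in this snippet):

ref = {0: ["a cat sits on a mat"], 1: ["a plane flies over the city"]}
hyp = {0: ["a cat is sitting on the mat"], 1: ["an airplane flying above a city"]}
print(CocoScore(ref, hyp, metrics_list=['bleu', 'rouge_l', 'cider']))
# expected keys: Bleu_1 .. Bleu_4, ROUGE_L, CIDEr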
Example #14
def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),  # disabled due to an issue (per the original comment)
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores
Example #15
    def __init__(self):
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            # (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
Example #16
def main(eval_caption_file, output, zh=False):
    import copy
    import pandas as pd

    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)

        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
Example #17
def get_coco_score(gt_list, pred_list, verbose, extra_vars):
    """
    gt_list, dictionary of reference sentences (id, sentence)
    pred_list, dictionary of hypothesis sentences (id, sentence)
    verbose - if greater than 0 the metric measures are printed out
    extra_vars - extra variables, here are:
            extra_vars['language'] - the target language
    score, dictionary of scores

    """

    x_trgs = [x.lower() for x in gt_list]
    hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(pred_list)}
    refs = {idx: [rr] for idx, rr in enumerate(x_trgs)}

    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #(Meteor(language=extra_vars['language']),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hypo)

        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
Example #18
def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances) for each measure
    """

    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))
Example #19
    def __init__(self, ground_truth_filenames=None, prediction_filename=None,
                 verbose=False, all_scorer=False):
        # Check that the gt and submission files exist and load them
        if not ground_truth_filenames:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')

        self.verbose = verbose
        self.all_scorer = all_scorer
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: METEOR
        if self.verbose or self.all_scorer:
            self.scorers = [
                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                (Meteor(),"METEOR"),
                (Rouge(), "ROUGE_L"),
                (Cider(), "CIDEr")
            ]
        else:
            self.scorers = [(Meteor(), "METEOR")]
Example #20
    def get_dcc_scores(self):

        imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]

        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        score_dict = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    score_dict[m] = sc
                    print("%s: %0.3f" % (m, sc))
            else:
                score_dict[method] = score
                print("%s: %0.3f" % (method, score))

        return score_dict
Example #21
    def evaluate(self):
        imgIds = self.params['image_id']
        gts = self.gts
        res = self.res

        # =================================================
        # Tokenization
        # =================================================
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, imgIds, m)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, imgIds, method)
        self.setEvalImgs()
Example #22
    def evaluate(self):
        assert len(self.ground) == len(self.predictions)

        # =================================================
        # Set up scorers
        # =================================================
        #print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            #(Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            #print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(self.ground, self.predictions)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    #print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
Example #23
    def score(self, GT, RES, IDs):
        # edited by rgh
        #self.eval = {}
        self.eval = OrderedDict()
        self.imgToEval = {}
        gts = {}
        res = {}
        for ID in IDs:
            #            print ID
            gts[ID] = GT[ID]
            res[ID] = RES[ID]
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        # edited by rgh
        # scorers = [
        #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #     (Meteor(),"METEOR"),
        #     (Rouge(), "ROUGE_L"),
        #     (Cider(), "CIDEr"),
        #     #(Spice(), "SPICE")
        # ]
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Cider(), "CIDEr"),
            (Rouge(), "ROUGE_L"),
            # (Spice(), "SPICE")
        ]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                # added by rgh
                # for sc, scs, m in zip(score, scores, method):
                #     self.setEval(sc, m)
                #     self.setImgToEvalImgs(scs, IDs, m)
                #     print("%s: %0.3f" % (m, sc))
                self.setEval("%.4f" % score[-1], method[-1])
                self.setImgToEvalImgs(scores[-1], IDs, method[-1])
                print("%s: %0.4f" % (method[-1], score[-1]))
            else:
                self.setEval("%.4f" % score, method)
                self.setImgToEvalImgs(scores, IDs, method)
                print("%s: %0.4f" % (method, score))

        # for metric, score in self.eval.items():
        #    print '%s: %.3f'%(metric, score)
        return self.eval
Example #24
def score(num, DIR):
    print("Testing results on epoch ", num, " in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile] = \
        ['%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, subtype)
         for subtype in subtypes]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    save = pickle.load(open(path, 'rb'))
    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        #print "RES: ",reslst
        #print "ANN: ", val[u'ann']
        #res = [word for word in res if word!=u'<SEND>'][1:]
        #print "RES FIXED: ", res

        if len(res) == 0:
            res = [u'a']  # just so it is not empty; 'a' has a very low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]

        #coco[key] = [{u'caption':' '.join(val[u'ann'][1:-1])}]
        coco[key] = coco_anns.imgToAnns[key]
    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation: ", '\n'.join(
            [coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data: ", ' '.join(save[key][u'res']))
        print("Cleared generation: ", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"),
               (Spice(), "SPICE")]

    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)
        print(score)
Example #25
def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
Example #26
def cider_scores(trues, pred, n=4):
    """
    Compute CIDEr and CIDEr-D for a fixed prediction, with pycocoevalcap
    """
    trues = {i: [r] for i, r in enumerate(trues)}
    preds = {i: [pred] for i in trues}
    ciders, _ = Cider().compute_score(trues, preds)
    return ciders
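
A short, hypothetical usage of cider_scores above: every reference in trues is paired with the same fixed prediction string, and the corpus-level CIDEr is returned.

refs = ["a man is playing a guitar", "someone plays guitar on stage"]
print(cider_scores(refs, "a man plays the guitar"))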
Example #27
    def _train_batch(engine, batch):
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            train_scorer = Cider(zh=zh)
            output = self._forward(model, batch, "train",
                                   key2refs=train_key2refs,
                                   scorer=train_scorer)
            output["loss"].backward()
            optimizer.step()
            return output
Example #28
def init_eval_metric(bleu_n=4):
    global Meteor_scorer
    global Cider_scorer
    global Bleu_scorer
    global Bleu_N
    Meteor_scorer = Meteor_scorer or Meteor()
    Cider_scorer = Cider_scorer or Cider()
    Bleu_scorer = Bleu_scorer or Bleu(bleu_n)
    Bleu_N = bleu_n
Example #29
    def __init__(self, args, task):
        super().__init__(args, task)
        self.task = task

        self.generator = SimpleSequenceGenerator(
            beam=args.scst_beam,
            penalty=args.scst_penalty,
            max_pos=args.max_target_positions,
            eos_index=task.target_dictionary.eos_index)

        # Needed for decoding model output to string
        self.conf_tokenizer = encoders.build_tokenizer(args)
        self.conf_decoder = encoders.build_bpe(args)
        self.captions_dict = task.target_dictionary

        # Tokenizer needed for computing CIDEr scores
        self.tokenizer = PTBTokenizer()
        self.scorer = Cider()
Example #30
def compute_cider_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios, dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    import numpy as np
    from pycocoevalcap.cider.cider import Cider
    scorer = Cider()

    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):

        if keys[i] in hypothesis:
            continue

        # prepare candidate
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [
            " ".join(candidate),
        ]

        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)

    key2score = {key: scores[i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]

    return results