Example #1
def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ',  mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()
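
For reference, all of the scorers above follow the same pycocoevalcap interface: compute_score(gts, res) takes two dicts keyed by the same ids, where each value is a list of sentence strings (hypotheses must be single-element lists), and returns a corpus-level score plus per-id scores; Bleu returns a list of four scores, one per n-gram order. A minimal, self-contained sketch with made-up sentences:

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge

# Both dicts are keyed by the same ids; values are lists of sentences.
refs = {0: ["a man is riding a horse", "someone rides a horse"],
        1: ["a dog runs on the beach"]}
hyps = {0: ["a man rides a horse"],
        1: ["a dog is running on the beach"]}

bleu, _ = Bleu(4).compute_score(refs, hyps)     # [Bleu_1, Bleu_2, Bleu_3, Bleu_4]
rouge_l, _ = Rouge().compute_score(refs, hyps)  # single corpus-level ROUGE_L
print(bleu, rouge_l)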
Example #2
    def __init__(self,
                 ground_truth_filenames=None,
                 prediction_filename=None,
                 tious=None,
                 max_proposals=1000,
                 prediction_fields=PREDICTION_FIELDS,
                 verbose=False):
        # Check that the gt and submission files exist and load them
        if len(tious) == 0:
            raise IOError('Please input a valid tIoU.')
        if not ground_truth_filenames:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')

        self.verbose = verbose
        self.tious = tious
        self.max_proposals = max_proposals
        self.pred_fields = prediction_fields
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: METEOR
        if self.verbose:
            self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3",
                                       "Bleu_4"]), (Meteor(), "METEOR"),
                            (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
        else:
            #self.scorers = [(Meteor(), "METEOR")]
            self.scorers = (Rouge(), "ROUGE_L")
Example #3
    def __init__(self):
        self.gt = {}
        self.gen = {}
        self.count = 0
        self.bleu = Bleu()
        self.rouge = Rouge()
        self.rb = pyrb.Readability(syllable_counter=pyrb.CMUDictCounter())
Example #4
def cal_ROUGE(generated, reference, is_corpus=False):
    # ref and sample are both dict
    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(),"METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr")
    # ]
    # outputs ROUGE-1..4, ROUGE-L, and ROUGE-L as computed by pycocoevalcap

    ROUGEscore = [0.0] * 6
    for idx, g in enumerate(generated):
        score = [0.0] * 6
        if is_corpus:
            for order in range(4):
                score[order] = rouge_n(g.split(),
                                       [x.split() for x in reference[0]],
                                       order + 1, 0.5)
            score[4] = rouge_l(g.split(), [x.split() for x in reference[0]],
                               0.5)
            score[5], _ = Rouge().compute_score(reference, {0: [g]})

        else:
            for order in range(4):
                score[order] = rouge_n(g.split(), [reference[0][idx].split()],
                                       order + 1, 0.5)
            score[4] = rouge_l(g.split(), [reference[0][idx].split()], 0.5)
            score[5], _ = Rouge().compute_score({0: [reference[0][idx]]},
                                                {0: [g]})
            #pdb.set_trace()
        #print g, score
        ROUGEscore = [r + s for r, s in zip(ROUGEscore, score)]
        #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
    ROUGEscore = [r / len(generated) for r in ROUGEscore]
    return ROUGEscore
Example #5
def language_eval_excoco(predictions, predictions_bleu, sents_label_eval,
                         loader):

    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval,
                                             predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))

    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score

    return lang_stat
Example #6
def rouge_scorer(reference, hypothesis):
    # =================================================
    # Compute scores
    # =================================================
    scorer = Rouge()

    average_score, score = scorer.compute_score(reference, hypothesis)

    return average_score, score
Example #7
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    """Defining Scorers"""
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}

    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]
    """Generation Loop"""
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)

            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            sequences_ref[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in targets[0]
                    if j.item() not in bad_toks
                ])
            ]
            sequences_gen[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in predictions[0][1]
                    if j.item() not in bad_toks
                ])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]
    """Getting Scores"""
    bleu_score, bleu_scores = scorer_bleu.compute_score(
        sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(
        sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(
        sequences_ref, sequences_gen)
    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
Example #8
def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #(Meteor(),"METEOR"),#......................................issue
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    final_scores = {}
    #print ("length: ",len(scorers))
    #print ("scorers :",scorers)
    #i = 0
    for scorer, method in scorers:
        #print("scorer :",scorer)
        #print("method : ",method)
        #print (i)
        #i = i+1

        score, scores = scorer.compute_score(ref, hypo)
        #print(type(score))

        if type(score) == list:
            #print("done")
            for m, s in zip(method, score):
                final_scores[m] = s

        else:
            #print("not done")
            final_scores[method] = score
        #print("phase complete")

    return final_scores
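
Assuming ref and hypo follow the same {id: [sentence]} convention shown above, a hypothetical call to this helper looks like the sketch below; METEOR is commented out in this variant, so only the BLEU, ROUGE_L, and CIDEr keys come back.

ref = {0: ["a man is riding a horse"], 1: ["a dog runs on the beach"]}
hypo = {0: ["a man rides a horse"], 1: ["a dog is running on the beach"]}
print(score(ref, hypo))
# e.g. {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ..., 'ROUGE_L': ..., 'CIDEr': ...}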
Example #9
    def evaluate(self):
        imgIds = self.params['image_id']
        gts = self.gts
        res = self.res

        # =================================================
        # Tokenization
        # =================================================
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, imgIds, m)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, imgIds, method)
        self.setEvalImgs()
Example #10
def get_scorers(cider_idx_path):
    return {
        'cider': CiderD(df=cider_idx_path),
        'bleu': Bleu(),
        'rouge': Rouge(),
        'meteor': Meteor()
    }
Example #11
    def score(self, GT, RES, IDs):
        # edited by rgh
        #self.eval = {}
        self.eval = OrderedDict()
        self.imgToEval = {}
        gts = {}
        res = {}
        for ID in IDs:
            #            print ID
            gts[ID] = GT[ID]
            res[ID] = RES[ID]
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        # edited by rgh
        # scorers = [
        #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #     (Meteor(),"METEOR"),
        #     (Rouge(), "ROUGE_L"),
        #     (Cider(), "CIDEr"),
        #     #(Spice(), "SPICE")
        # ]
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Cider(), "CIDEr"),
            (Rouge(), "ROUGE_L"),
            # (Spice(), "SPICE")
        ]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                # added by rgh
                # for sc, scs, m in zip(score, scores, method):
                #     self.setEval(sc, m)
                #     self.setImgToEvalImgs(scs, IDs, m)
                #     print("%s: %0.3f" % (m, sc))
                self.setEval("%.4f" % score[-1], method[-1])
                self.setImgToEvalImgs(scores[-1], IDs, method[-1])
                print("%s: %0.4f" % (method[-1], score[-1]))
            else:
                self.setEval("%.4f" % score, method)
                self.setImgToEvalImgs(scores, IDs, method)
                print("%s: %0.4f" % (method, score))

        # for metric, score in self.eval.items():
        #    print '%s: %.3f'%(metric, score)
        return self.eval
Example #12
def calc_scores(file1, file2):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    ref: 真实的数据,类型为dict,如dict{"id":"[sentences]"}
    hypo: 生成的数据,格式如上。
    需满足:
        assert(type(hypo) is list);
        assert(len(hypo) == 1);
        assert(type(ref) is list);
        assert(len(ref) >= 1);
    
    """

    pred = readfiles(file1)
    test = readfiles(file2)
    # Assemble the id-keyed dicts expected by the scorers.
    ids = list(range(len(pred)))
    hypo = dict(zip(ids, pred))
    ref = dict(zip(ids, test))

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
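
readfiles is not shown in this snippet. For the dict construction above to match the scorer interface, it presumably returns one single-element list of sentences per line of the input file; a purely hypothetical stand-in:

def readfiles(path):
    # One sentence per line; wrap each in a single-element list as the
    # pycocoevalcap scorers expect.
    with open(path, encoding="utf-8") as f:
        return [[line.strip()] for line in f if line.strip()]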
Example #13
    def get_scorers(self):
        # from pycoco_scorers_vizseq import BLEUScorerAll
        from pycocoevalcap.bleu.bleu import Bleu

        # from pycocoevalcap.spice.spice import Spice
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        import logging
        import transformers

        transformers.tokenization_utils.logger.setLevel(logging.ERROR)
        transformers.configuration_utils.logger.setLevel(logging.ERROR)
        transformers.modeling_utils.logger.setLevel(logging.ERROR)
        Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
        self.scorer_dict = {
            "bleu":
            Scorer_(Bleu(4, verbose=0), False,
                    ["bleu@1", "bleu@2", "bleu@3", "bleu@4"]),
            "meteor":
            Scorer_(Meteor(), False, ["meteor"]),
            "cider":
            Scorer_(Cider("corpus"), False, ["cider"]),
            "rouge":
            Scorer_(Rouge(), False, ["rouge"]),
            # "spice": Scorer_(Spice(), False, ["spice"]),
            "bert_score":
            Scorer_(BertScoreSimple, True, ["bert_score"]),
        }
        self.tokenizer = PTBTokenizer()
Example #14
def evaluate(gts, res):
    eval = {}

    # =================================================
    # Tokenization
    # =================================================
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                eval[m] = sc
        else:
            eval[method] = score

    return eval
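
In the coco-caption implementation, PTBTokenizer.tokenize expects each id to map to a list of {'caption': ...} dicts and returns plain tokenized strings, so a hypothetical call to this helper would look like:

gts = {"img1": [{"caption": "A man is riding a horse."},
                {"caption": "Someone rides a horse."}]}
res = {"img1": [{"caption": "A man rides a horse."}]}
print(evaluate(gts, res))  # {'Bleu_1': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}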
Example #15
    def get_scorers(self):
        # from pycoco_scorers_vizseq import BLEUScorerAll
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.rouge.rouge import Rouge

        from pycocoevalcap.meteor.meteor import Meteor

        # from pycocoevalcap.spice.spice import Spice

        from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

        Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
        self.scorer_dict = {
            "bleu": Scorer_(
                Bleu(4, verbose=0), False, ["bleu_1", "bleu_2", "bleu_3", "bleu_4"]
            ),
            "meteor": Scorer_(Meteor(), False, ["meteor"]),
            "cider": Scorer_(Cider("corpus"), False, ["cider"]),
            "rouge": Scorer_(Rouge(), False, ["rouge"]),
            # "spice": Scorer_(Spice(), False, ["spice"]),
        }
        self.tokenizer = PTBTokenizer()

        self.coval_all_metrics = [
            ("mentions", evaluator.mentions),
            ("muc", evaluator.muc),
            ("bcub", evaluator.b_cubed),
            ("ceafe", evaluator.ceafe),
            ("lea", evaluator.lea),
            ("lea_soft", evaluator.lea_soft),
        ]
        self.reset_coval_scorer_dict()
Example #16
    def compute_ms_coco(self):
        """Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)
        :param gts: Dictionary with the image ids and their gold captions,
        :param res: Dictionary with the image ids and their generated captions
        :print: Evaluation score (the mean of the scores of all the instances) for each measure
        """

        # load the csv files, containing the results and gold data.
        self.logger.info("Loading data")
        self._load_data()

        # Preprocess captions
        self.logger.info("Preprocessing captions")
        self.gold_data = self._preprocess_captions(self.gold_data)
        self.result_data = self._preprocess_captions(self.result_data)
        if len(self.gold_data) == len(self.result_data):
            # Set up scorers
            scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                       (Meteor(), "METEOR"), (Rouge(), "ROUGE_L")]

            # Compute score for each metric
            self.logger.info("Computing COCO score.")
            for scorer, method in scorers:
                print("Computing", scorer.method(), "...")
                score, scores = scorer.compute_score(self.gold_data,
                                                     self.result_data)
                if type(method) == list:
                    for sc, m in zip(score, method):
                        print("%s : %0.3f" % (m, sc))
                else:
                    print("%s : %0.3f" % (method, score))
        else:
            self.logger.error(
                "Gold data (len={0}) and result data (len={1}) do not have the same size"
                .format(len(self.gold_data), len(self.result_data)))
Example #17
def evaluate_captions_cider(ref, cand):
    #hypo = []

    #refe = defaultdict()
    #for i, caption in enumerate(cand):
    #    temp = defaultdict()
    #    temp['image_id'] = i
    #    temp['caption'] = [caption]
    #    hypo.append(temp)
    #    refe[i] = ref[i]
    #final_scores = score(refe, hypo)
    # # return final_scores['Bleu_1']
    # #### normal scores ###
    hypo = {}
    final_scores = defaultdict()
    refe = {}
    for i, caption in enumerate(cand):
        hypo[i] = [caption]
        refe[i] = ref[i]
    #score1, scores = Bleu(4).compute_score(refe, hypo)
    #method = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
    #for m, s in zip(method, scores):
    #     final_scores[m] = s
    score1, scores = Rouge().compute_score(refe, hypo)
    final_scores['ROUGE_L'] = scores
    #
    # return 2 * final_scores['CiderD'] + 1 * final_scores['Bleu_4'] + 1*final_scores['ROUGE_L']
    return final_scores['ROUGE_L']
Example #18
    def evaluate(self):
        # =================================================
        # Tokenization
        # =================================================
        print("Tokenization")
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(self.ground_truth)
        preds = tokenizer.tokenize(self.prediction)

        # =================================================
        # Setup scorers
        # =================================================
        print("Setting up scorers...")
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            # (Spice(), "SPICE")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print("Computing {} score...".format(scorer.method()))
            score, scores = scorer.compute_score(gts, preds)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    self.eval_res[m] = sc * 100
            else:
                self.eval_res[method] = score * 100
Example #19
    def evaluate(self):
        assert len(self.ground) == len(self.predictions)

        # =================================================
        # Set up scorers
        # =================================================
        #print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            #(Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            #print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(self.ground, self.predictions)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    #print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
Example #20
    def __init__(self, ground_truth_filenames, prediction_filename, verbose=False, all_scorer=False):
        # Check that the gt and submission files exist and load them
        self.verbose = verbose
        self.all_scorer = all_scorer
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: METEOR

        # Meteor is Java-based and can crash a lot.
        try:
            met = Meteor()
        except (AttributeError, FileNotFoundError) as e:
            print(f"Meteor couldn't start due to {e}")
            met = None

        if self.verbose or self.all_scorer:
            self.scorers = [
                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                (met, "METEOR"),
                (Rouge(), "ROUGE_L"),
                (Cider(), "CIDEr")
            ]
        else:
            self.scorers = [(met, "METEOR")]

        # init some attributes
        self.easy_samples = {}
        self.hard_samples = {}
        self.n_ref_vids = set()
        self.scores = {}
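
Since met may be None when the Java METEOR jar fails to start, whatever later consumes self.scorers presumably has to skip that entry. A minimal sketch of such a guard, written as a hypothetical method that is not part of the original class:

    def compute_all(self, gts, res):
        results = {}
        for scorer, method in self.scorers:
            if scorer is None:  # METEOR failed to start; skip it
                continue
            score, _ = scorer.compute_score(gts, res)
            if isinstance(method, list):
                results.update(dict(zip(method, score)))
            else:
                results[method] = score
        return results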
Example #21
    def __init__(self):
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            # (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
Example #22
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    # print('ref')
    # print(ref)
    # print('hypo')
    # print(hypo)
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
Example #23
    def __init__(self, ground_truth_filenames=None, prediction_filename=None,
                 verbose=False, all_scorer=False):
        # Check that the gt and submission files exist and load them
        if not ground_truth_filenames:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')

        self.verbose = verbose
        self.all_scorer = all_scorer
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: METEOR
        if self.verbose or self.all_scorer:
            self.scorers = [
                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                (Meteor(),"METEOR"),
                (Rouge(), "ROUGE_L"),
                (Cider(), "CIDEr")
            ]
        else:
            self.scorers = [(Meteor(), "METEOR")]
Example #24
def language_eval(sample_seqs, groundtruth_seqs):
    assert len(sample_seqs) == len(groundtruth_seqs), 'length of sampled seqs is different from that of groundtruth seqs!'

    references = OrderedDict()
    predictions = OrderedDict()
    for i in range(len(groundtruth_seqs)):
        references[i] = [groundtruth_seqs[i][j] for j in range(len(groundtruth_seqs[i]))]
    for i in range(len(sample_seqs)):
        predictions[i] = [sample_seqs[i]]

    predictions = {i: predictions[i] for i in range(len(sample_seqs))}
    references = {i: references[i] for i in range(len(groundtruth_seqs))}

    avg_bleu_score, bleu_score = Bleu(4).compute_score(references, predictions)
    print('avg_bleu_score == ', avg_bleu_score)
    avg_cider_score, cider_score = Cider().compute_score(references, predictions)
    print('avg_cider_score == ', avg_cider_score)
    avg_meteor_score, meteor_score = Meteor().compute_score(references, predictions)
    print('avg_meteor_score == ', avg_meteor_score)
    avg_rouge_score, rouge_score = Rouge().compute_score(references, predictions)
    print('avg_rouge_score == ', avg_rouge_score)

    # print('BLEU1:{}\nBLEU2:{}\nBLEU3:{}\nBLEU4:{}\nMETEOR:{}\nROUGE:{}CIDEr:{}\n'.format(avg_bleu_score[0],
    #                                                                                      avg_bleu_score[1],
    #                                                                                      avg_bleu_score[2],
    #                                                                                      avg_bleu_score[3],
    #                                                                                      avg_meteor_score,
    #                                                                                      avg_rouge_score,
    #                                                                                      avg_cider_score))
    return {'BLEU': avg_bleu_score, 'CIDEr': avg_cider_score,  'METEOR': avg_meteor_score,   'ROUGE': avg_rouge_score}
Example #25
def CocoScore(ref, hyp, metrics_list=None, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hyp: Dictionary of hypothesis sentences (id, sentence)
    :param metrics_list: List of metrics to evaluate on
    :param language: Language of the sentences (for METEOR)
    :return: dictionary of scores
    """
    if metrics_list is None:
        metrics_list = ['bleu', 'ter', 'meteor', 'rouge_l', 'cider']
    else:
        metrics_list = [metric.lower() for metric in metrics_list]
    scorers = []
    if 'bleu' in metrics_list:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'meteor' in metrics_list:
        scorers.append((Meteor(language), "METEOR"))
    if 'ter' in metrics_list:
        scorers.append((Ter(), "TER"))
    if 'rouge_l' in metrics_list or 'rouge' in metrics_list:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'cider' in metrics_list:
        scorers.append((Cider(), "CIDEr"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hyp)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
Example #26
def main(eval_caption_file, output, zh=False):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)

        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
Example #27
def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions,
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances) for each measure
    """

    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))
Example #28
    def get_dcc_scores(self):

        imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]

        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        score_dict = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    score_dict[m] = sc
                    print("%s: %0.3f" % (m, sc))
            else:
                score_dict[method] = score
                print("%s: %0.3f" % (method, score))

        return score_dict
Example #29
def get_coco_score(gt_list, pred_list, verbose, extra_vars):
    """
    gt_list, dictionary of reference sentences (id, sentence)
    pred_list, dictionary of hypothesis sentences (id, sentence)
    verbose - if greater than 0 the metric measures are printed out
    extra_vars - extra variables, here are:
            extra_vars['language'] - the target language
    score, dictionary of scores

    """

    x_trgs = [x.lower() for x in gt_list]
    hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(pred_list)}
    refs = {idx: [rr] for idx, rr in enumerate(x_trgs)}

    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #(Meteor(language=extra_vars['language']),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hypo)

        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
Example #30
def score(num, DIR):
    print("Testing results on epoch ", num, " in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile]= \
    ['%s/results/captions_%s_%s_%s.json'%(dataDir,dataType,algName,subtype) for subtype in subtypes]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    save = pickle.load(open(path, 'rb'))
    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        #print "RES: ",reslst
        #print "ANN: ", val[u'ann']
        #res = [word for word in res if word!=u'<SEND>'][1:]
        #print "RES FIXED: ", res

        if len(res) == 0:
            res = [u'a']  #just not to be empty, and it has low low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]

        #coco[key] = [{u'caption':' '.join(val[u'ann'][1:-1])}]
        coco[key] = coco_anns.imgToAnns[key]
    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation: ", '\n'.join(
            [coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data: ", ' '.join(save[key][u'res']))
        print("Cleared generation: ", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"),
               (Spice(), "SPICE")]

    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)