def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ', mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()

def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             tious=None, max_proposals=1000,
             prediction_fields=PREDICTION_FIELDS, verbose=False):
    # Check that the gt and submission files exist and load them
    if not tious:
        raise IOError('Please input a valid tIoU.')
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.tious = tious
    self.max_proposals = max_proposals
    self.pred_fields = prediction_fields
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the single metric we're
    # testing on (ROUGE_L here, since METEOR is commented out).
    if self.verbose:
        self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                        (Meteor(), "METEOR"),
                        (Rouge(), "ROUGE_L"),
                        (Cider(), "CIDEr")]
    else:
        # self.scorers = [(Meteor(), "METEOR")]
        # Wrapped in a list so downstream iteration over (scorer, method)
        # pairs keeps working.
        self.scorers = [(Rouge(), "ROUGE_L")]

def __init__(self):
    self.gt = {}
    self.gen = {}
    self.count = 0
    self.bleu = Bleu()
    self.rouge = Rouge()
    self.rb = pyrb.Readability(syllable_counter=pyrb.CMUDictCounter())

def cal_ROUGE(generated, reference, is_corpus=False):
    # ref and sample are both dicts
    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(), "METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr")
    # ]
    # Outputs ROUGE-1..4, ROUGE-L, and ROUGE-L from pycocoevalcap.
    ROUGEscore = [0.0] * 6
    for idx, g in enumerate(generated):
        score = [0.0] * 6
        if is_corpus:
            for order in range(4):
                score[order] = rouge_n(g.split(), [x.split() for x in reference[0]], order + 1, 0.5)
            score[4] = rouge_l(g.split(), [x.split() for x in reference[0]], 0.5)
            score[5], _ = Rouge().compute_score(reference, {0: [g]})
        else:
            for order in range(4):
                score[order] = rouge_n(g.split(), [reference[0][idx].split()], order + 1, 0.5)
            score[4] = rouge_l(g.split(), [reference[0][idx].split()], 0.5)
            score[5], _ = Rouge().compute_score({0: [reference[0][idx]]}, {0: [g]})
        # pdb.set_trace()
        # print(g, score)
        ROUGEscore = [r + score[i] for i, r in enumerate(ROUGEscore)]
        # BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
    ROUGEscore = [r / len(generated) for r in ROUGEscore]
    return ROUGEscore

def language_eval_excoco(predictions, predictions_bleu, sents_label_eval, loader):
    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval, predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))
    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score
    return lang_stat

def rouge_scorer(reference, hypothesis):
    # =================================================
    # Compute scores
    # =================================================
    scorer = Rouge()
    average_score, score = scorer.compute_score(reference, hypothesis)
    return average_score, score

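# Usage sketch for rouge_scorer above (toy data, not from the original code):
# the pycocoevalcap Rouge expects two dicts keyed by the same ids, each value
# a list of sentence strings, and the hypothesis list must hold exactly one.
reference = {0: ['a man is riding a horse', 'a person rides a horse'],
             1: ['a dog runs on the beach']}
hypothesis = {0: ['a man rides a horse'],
              1: ['a dog is running on the sand']}
avg, per_instance = rouge_scorer(reference, hypothesis)
print('ROUGE_L corpus average:', avg)          # single float
print('ROUGE_L per instance  :', per_instance)  # one score per id
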
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    # Define scorers
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}
    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]

    # Generation loop
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)
            predictions = model.sample_v2(images, topics, beam_size=beam_size)

        sequences_ref[i] = [" ".join([vocabs['word_vocab'](j.item())
                                      for j in targets[0]
                                      if j.item() not in bad_toks])]
        sequences_gen[i] = [" ".join([vocabs['word_vocab'](j.item())
                                      for j in predictions[0][1]
                                      if j.item() not in bad_toks])]
        # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]

    # Get scores
    bleu_score, bleu_scores = scorer_bleu.compute_score(sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(sequences_ref, sequences_gen)

    scores = {'bleu_score': bleu_score,
              'rouge_score': rouge_score,
              'cider_score': cider_score}
    print(scores)
    return scores

def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),  # disabled: METEOR's Java subprocess is an issue here
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if isinstance(score, list):
            # Bleu returns a list of four scores, one per n-gram order.
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

def evaluate(self):
    imgIds = self.params['image_id']
    gts = self.gts
    res = self.res

    # =================================================
    # Tokenize
    # =================================================
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, imgIds, m)
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, imgIds, method)
    self.setEvalImgs()

def get_scorers(cider_idx_path):
    return {
        'cider': CiderD(df=cider_idx_path),
        'bleu': Bleu(),
        'rouge': Rouge(),
        'meteor': Meteor()
    }

def score(self, GT, RES, IDs):
    # edited by rgh
    # self.eval = {}
    self.eval = OrderedDict()
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        # print(ID)
        gts[ID] = GT[ID]
        res[ID] = RES[ID]
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    # edited by rgh
    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(), "METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr"),
    #     # (Spice(), "SPICE")
    # ]
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Cider(), "CIDEr"),
        (Rouge(), "ROUGE_L"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            # added by rgh
            # for sc, scs, m in zip(score, scores, method):
            #     self.setEval(sc, m)
            #     self.setImgToEvalImgs(scs, IDs, m)
            #     print("%s: %0.3f" % (m, sc))
            # Only the highest-order score (Bleu_4) is kept.
            self.setEval("%.4f" % score[-1], method[-1])
            self.setImgToEvalImgs(scores[-1], IDs, method[-1])
            print("%s: %0.4f" % (method[-1], score[-1]))
        else:
            self.setEval("%.4f" % score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print("%s: %0.4f" % (method, score))
    # for metric, score in self.eval.items():
    #     print('%s: %.3f' % (metric, score))
    return self.eval

def calc_scores(file1, file2):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores

    ref: the ground-truth data, a dict such as {"id": "[sentences]"}
    hypo: the generated data, in the same format.
    Each entry must satisfy:
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        assert(len(ref) >= 1)
    """
    pred = readfiles(file1)
    test = readfiles(file2)

    # Combine the two lists into index-keyed dicts.
    i = [i for i in range(len(pred))]
    hypo = dict(zip(i, pred))
    ref = dict(zip(i, test))

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

def get_scorers(self):
    # from pycoco_scorers_vizseq import BLEUScorerAll
    from pycocoevalcap.bleu.bleu import Bleu
    # from pycocoevalcap.spice.spice import Spice
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
    import logging
    import transformers

    # Silence verbose transformers logging.
    transformers.tokenization_utils.logger.setLevel(logging.ERROR)
    transformers.configuration_utils.logger.setLevel(logging.ERROR)
    transformers.modeling_utils.logger.setLevel(logging.ERROR)

    Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
    self.scorer_dict = {
        "bleu": Scorer_(Bleu(4, verbose=0), False,
                        ["bleu@1", "bleu@2", "bleu@3", "bleu@4"]),
        "meteor": Scorer_(Meteor(), False, ["meteor"]),
        "cider": Scorer_(Cider("corpus"), False, ["cider"]),
        "rouge": Scorer_(Rouge(), False, ["rouge"]),
        # "spice": Scorer_(Spice(), False, ["spice"]),
        "bert_score": Scorer_(BertScoreSimple, True, ["bert_score"]),
    }
    self.tokenizer = PTBTokenizer()

def evaluate(gts, res):
    eval = {}

    # =================================================
    # Tokenize
    # =================================================
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                eval[m] = sc
        else:
            eval[method] = score
    return eval

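# Illustrative call to evaluate(gts, res) above (hypothetical data): the
# standard pycocoevalcap PTBTokenizer expects {id: [{'caption': str}, ...]}
# on both sides, and PTBTokenizer/METEOR shell out to Java, so a JRE must be
# available for this sketch to run.
gts = {'img1': [{'caption': 'A man is riding a horse.'},
                {'caption': 'A person rides a brown horse.'}]}
res = {'img1': [{'caption': 'a man rides a horse'}]}
metrics = evaluate(gts, res)
print(metrics['Bleu_4'], metrics['METEOR'], metrics['ROUGE_L'], metrics['CIDEr'])
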
def get_scorers(self):
    # from pycoco_scorers_vizseq import BLEUScorerAll
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.meteor.meteor import Meteor
    # from pycocoevalcap.spice.spice import Spice
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

    Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
    self.scorer_dict = {
        "bleu": Scorer_(Bleu(4, verbose=0), False,
                        ["bleu_1", "bleu_2", "bleu_3", "bleu_4"]),
        "meteor": Scorer_(Meteor(), False, ["meteor"]),
        "cider": Scorer_(Cider("corpus"), False, ["cider"]),
        "rouge": Scorer_(Rouge(), False, ["rouge"]),
        # "spice": Scorer_(Spice(), False, ["spice"]),
    }
    self.tokenizer = PTBTokenizer()

    self.coval_all_metrics = [
        ("mentions", evaluator.mentions),
        ("muc", evaluator.muc),
        ("bcub", evaluator.b_cubed),
        ("ceafe", evaluator.ceafe),
        ("lea", evaluator.lea),
        ("lea_soft", evaluator.lea_soft),
    ]
    self.reset_coval_scorer_dict()

def compute_ms_coco(self):
    """Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances)
            for each measure
    """
    # Load the csv files containing the result and gold data.
    self.logger.info("Loading data")
    self._load_data()

    # Preprocess captions
    self.logger.info("Preprocessing captions")
    self.gold_data = self._preprocess_captions(self.gold_data)
    self.result_data = self._preprocess_captions(self.result_data)

    if len(self.gold_data) == len(self.result_data):
        # Set up scorers
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"),
                   (Rouge(), "ROUGE_L")]

        # Compute the score for each metric
        self.logger.info("Computing COCO scores.")
        for scorer, method in scorers:
            print("Computing", scorer.method(), "...")
            score, scores = scorer.compute_score(self.gold_data, self.result_data)
            if isinstance(method, list):
                for sc, m in zip(score, method):
                    print("%s : %0.3f" % (m, sc))
            else:
                print("%s : %0.3f" % (method, score))
    else:
        self.logger.error(
            "Gold data (len={0}) and result data (len={1}) are not the same size"
            .format(len(self.gold_data), len(self.result_data)))

def evaluate_captions_cider(ref, cand):
    # hypo = []
    # refe = defaultdict()
    # for i, caption in enumerate(cand):
    #     temp = defaultdict()
    #     temp['image_id'] = i
    #     temp['caption'] = [caption]
    #     hypo.append(temp)
    #     refe[i] = ref[i]
    # final_scores = score(refe, hypo)
    # return final_scores['Bleu_1']

    #### normal scores ###
    # Despite the name, this currently returns the per-sentence ROUGE-L scores.
    hypo = {}
    final_scores = defaultdict()
    refe = {}
    for i, caption in enumerate(cand):
        hypo[i] = [caption]
        refe[i] = ref[i]
    # score1, scores = Bleu(4).compute_score(refe, hypo)
    # method = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
    # for m, s in zip(method, scores):
    #     final_scores[m] = s
    score1, scores = Rouge().compute_score(refe, hypo)
    final_scores['ROUGE_L'] = scores
    # return 2 * final_scores['CiderD'] + 1 * final_scores['Bleu_4'] + 1 * final_scores['ROUGE_L']
    return final_scores['ROUGE_L']

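# Hypothetical call to evaluate_captions_cider above: cand is a list of
# generated sentences, and ref is indexable by the same positions, each entry
# holding the reference sentence(s) for that position. The return value is
# the array of per-sentence ROUGE-L scores from pycocoevalcap's Rouge.
cand = ['a boy kicks a ball', 'a woman is cooking']
ref = {0: ['a boy is kicking a ball'], 1: ['a woman cooks food in a kitchen']}
per_sentence_rouge = evaluate_captions_cider(ref, cand)
print(per_sentence_rouge)  # one ROUGE-L value per candidate sentence
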
def evaluate(self):
    # =================================================
    # Tokenization
    # =================================================
    print("Tokenization")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(self.ground_truth)
    preds = tokenizer.tokenize(self.prediction)

    # =================================================
    # Set up scorers
    # =================================================
    print("Setting up scorers...")
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print("Computing {} score...".format(scorer.method()))
        score, scores = scorer.compute_score(gts, preds)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                self.eval_res[m] = sc * 100
        else:
            self.eval_res[method] = score * 100

def evaluate(self):
    assert len(self.ground) == len(self.predictions)

    # =================================================
    # Set up scorers
    # =================================================
    # print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        # print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(self.ground, self.predictions)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                # print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)

def __init__(self, ground_truth_filenames, prediction_filename,
             verbose=False, all_scorer=False):
    # Check that the gt and submission files exist and load them
    self.verbose = verbose
    self.all_scorer = all_scorer
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're
    # testing on: METEOR.
    # METEOR is Java-based and can crash a lot, so fall back gracefully.
    try:
        met = Meteor()
    except (AttributeError, FileNotFoundError) as e:
        print(f"Meteor couldn't start due to {e}")
        met = None
    if self.verbose or self.all_scorer:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (met, "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(met, "METEOR")]

    # Init some attributes
    self.easy_samples = {}
    self.hard_samples = {}
    self.n_ref_vids = set()
    self.scores = {}

def __init__(self):
    self.scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    # print('ref')
    # print(ref)
    # print('hypo')
    # print(hypo)
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

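# Usage sketch for score(ref, hypo) above (toy data, not from the original
# code); METEOR requires a working Java runtime. The ids only need to match
# between the two dicts, and every value is a list of sentence strings.
ref = {'0': ['a cat sits on a mat', 'there is a cat on the mat'],
       '1': ['two dogs play in the park']}
hypo = {'0': ['a cat is sitting on a mat'],
        '1': ['dogs are playing in a park']}
print(score(ref, hypo))
# -> {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...,
#     'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}
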
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             verbose=False, all_scorer=False):
    # Check that the gt and submission files exist and load them
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.all_scorer = all_scorer
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're
    # testing on: METEOR.
    if self.verbose or self.all_scorer:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(Meteor(), "METEOR")]

def language_eval(sample_seqs, groundtruth_seqs):
    assert len(sample_seqs) == len(groundtruth_seqs), \
        'length of sampled seqs is different from that of groundtruth seqs!'

    references = OrderedDict()
    predictions = OrderedDict()
    for i in range(len(groundtruth_seqs)):
        references[i] = [groundtruth_seqs[i][j] for j in range(len(groundtruth_seqs[i]))]
    for i in range(len(sample_seqs)):
        predictions[i] = [sample_seqs[i]]
    predictions = {i: predictions[i] for i in range(len(sample_seqs))}
    references = {i: references[i] for i in range(len(groundtruth_seqs))}

    avg_bleu_score, bleu_score = Bleu(4).compute_score(references, predictions)
    print('avg_bleu_score == ', avg_bleu_score)
    avg_cider_score, cider_score = Cider().compute_score(references, predictions)
    print('avg_cider_score == ', avg_cider_score)
    avg_meteor_score, meteor_score = Meteor().compute_score(references, predictions)
    print('avg_meteor_score == ', avg_meteor_score)
    avg_rouge_score, rouge_score = Rouge().compute_score(references, predictions)
    print('avg_rouge_score == ', avg_rouge_score)

    # print('BLEU1:{}\nBLEU2:{}\nBLEU3:{}\nBLEU4:{}\nMETEOR:{}\nROUGE:{}\nCIDEr:{}'.format(
    #     avg_bleu_score[0], avg_bleu_score[1], avg_bleu_score[2], avg_bleu_score[3],
    #     avg_meteor_score, avg_rouge_score, avg_cider_score))

    return {'BLEU': avg_bleu_score, 'CIDEr': avg_cider_score,
            'METEOR': avg_meteor_score, 'ROUGE': avg_rouge_score}

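# Hypothetical call to language_eval above: sample_seqs is a flat list of
# generated sentences, groundtruth_seqs a list of reference lists (one list
# per sample). METEOR again assumes a Java runtime is available.
sample_seqs = ['a man is playing a guitar', 'a dog jumps over a fence']
groundtruth_seqs = [['a man plays the guitar', 'someone is playing a guitar'],
                    ['a dog is jumping over a fence']]
stats = language_eval(sample_seqs, groundtruth_seqs)
print(stats['BLEU'][3], stats['METEOR'], stats['ROUGE'], stats['CIDEr'])
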
def CocoScore(ref, hyp, metrics_list=None, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hyp: Dictionary of hypothesis sentences (id, sentence)
    :param metrics_list: List of metrics to evaluate on
    :param language: Language of the sentences (for METEOR)
    :return: Dictionary of scores
    """
    if metrics_list is None:
        metrics_list = ['bleu', 'ter', 'meteor', 'rouge_l', 'cider']
    else:
        metrics_list = [metric.lower() for metric in metrics_list]

    scorers = []
    if 'bleu' in metrics_list:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'meteor' in metrics_list:
        scorers.append((Meteor(language), "METEOR"))
    if 'ter' in metrics_list:
        scorers.append((Ter(), "TER"))
    if 'rouge_l' in metrics_list or 'rouge' in metrics_list:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'cider' in metrics_list:
        scorers.append((Cider(), "CIDEr"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hyp)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

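# Hypothetical call to CocoScore above, restricted to metrics backed by the
# standard coco-caption scorers so the Ter() and Meteor(language) variants
# (which come from an extended scorer package) are not needed.
ref = {0: ['the cat sat on the mat'], 1: ['a red car drives down the road']}
hyp = {0: ['a cat is sitting on the mat'], 1: ['a red car is on the road']}
print(CocoScore(ref, hyp, metrics_list=['bleu', 'rouge_l', 'cider']))
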
def main(eval_caption_file, output, zh=False):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)
        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))

def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances)
            for each measure
    """
    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute the score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))

def get_dcc_scores(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = self.coco.imgToAnns[imgId]
        res[imgId] = self.cocoRes.imgToAnns[imgId]

    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
                print("%s: %0.3f" % (m, sc))
        else:
            score_dict[method] = score
            print("%s: %0.3f" % (method, score))
    return score_dict

def get_coco_score(gt_list, pred_list, verbose, extra_vars):
    """
    gt_list, list of reference sentences
    pred_list, list of hypothesis sentences
    verbose - if greater than 0 the metric measures are printed out
    extra_vars - extra variables, here:
        extra_vars['language'] - the target language
    score, dictionary of scores
    """
    x_trgs = [x.lower() for x in gt_list]
    hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(pred_list)}
    refs = {idx: [rr] for idx, rr in enumerate(x_trgs)}

    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(language=extra_vars['language']), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hypo)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

def score(num, DIR):
    print("Testing results on epoch ", num, " in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile] = \
        ['%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, subtype)
         for subtype in subtypes]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    save = pickle.load(open(path, 'rb'))  # pickle needs binary mode

    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        # print("RES: ", reslst)
        # print("ANN: ", val[u'ann'])
        # res = [word for word in res if word != u'<SEND>'][1:]
        # print("RES FIXED: ", res)
        if len(res) == 0:
            res = [u'a']  # just so it is not empty; 'a' has a low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]
        # coco[key] = [{u'caption': ' '.join(val[u'ann'][1:-1])}]
        coco[key] = coco_anns.imgToAnns[key]

    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation: ", '\n'.join([coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data: ", ' '.join(save[key][u'res']))
        print("Cleared generation: ", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr"),
               (Spice(), "SPICE")]
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)