def evaluate(self, gts, res):
    # =================================================
    # Tokenization
    # =================================================
    logging.info('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    # =================================================
    # Set up scorers
    # =================================================
    logging.info('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")
               #(Spice(), "SPICE")
               ]
    # =================================================
    # Compute scores
    # =================================================
    bleu_4_score = 0
    for scorer, method in scorers:
        logging.info('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, gts.keys(), m)
                if m == "Bleu_4":
                    bleu_4_score = sc
                logging.info("%s: %0.3f" % (m, sc))
                print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, gts.keys(), method)
            logging.info("%s: %0.3f" % (method, score))
            print("%s: %0.3f" % (method, score))
    self.setEvalImgs()
    return bleu_4_score
def evaluate(self): imgIds = self.params['image_id'] # imgIds = self.coco.getImgIds() gts = {} res = {} for imgId in imgIds: gts[imgId] = self.coco.imgToAnns[imgId] res[imgId] = self.cocoRes.imgToAnns[imgId] # ================================================= # Set up scorers # ================================================= print 'tokenization...' tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= print 'setting up scorers...' scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print 'computing %s score...' % (scorer.method()) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setImgToEvalImgs(scs, gts.keys(), m) print "%s: %0.3f" % (m, sc) else: self.setEval(score, method) self.setImgToEvalImgs(scores, gts.keys(), method) print "%s: %0.3f" % (method, score) self.setEvalImgs()
def __score(ref, hypo): """ ref, dictionary of reference sentences (id, sentence) hypo, dictionary of hypothesis sentences (id, sentence) score, dictionary of scores """ scorers = [ (Bleu(4), ["Bleu_1"]), (Rouge(), "ROUGE_L"), ] final_scores = {} for scorer, method in scorers: score, scores = scorer.compute_score(ref, hypo) if type(score) == list: for m, s in zip(method, score): final_scores[m] = s else: final_scores[method] = score return final_scores
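# --- Hypothetical usage sketch for __score above (not part of the original module). ---
# It assumes __score is a module-level helper; both arguments map an arbitrary id to a
# list of sentences, and the hypothesis dict holds exactly one candidate per id, as the
# pycocoevalcap scorers expect. The ids and sentences below are invented.
def _example_score_usage():
    example_ref = {0: ["a man is riding a horse", "someone rides a horse"],
                   1: ["two dogs play in the snow"]}
    example_hypo = {0: ["a man rides a horse"],
                    1: ["dogs are playing in the snow"]}
    return __score(example_ref, example_hypo)  # e.g. {'Bleu_1': 0.xx, 'ROUGE_L': 0.xx}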
def main(hyp, ref, max_len):
    # max_len truncates each hypothesis to its first max_len tokens
    # (renamed from 'len' to avoid shadowing the builtin).
    with open(hyp, 'r') as r:
        hypothesis = r.readlines()
        res = {k: [" ".join(v.strip().lower().split()[:max_len])] for k, v in enumerate(hypothesis)}
    with open(ref, 'r') as r:
        references = r.readlines()
        gts = {k: [v.strip().lower()] for k, v in enumerate(references)}

    score_Bleu, scores_Bleu = Bleu(4).compute_score(gts, res)
    print("Bleu_1:", np.mean(scores_Bleu[0]))
    print("Bleu_2:", np.mean(scores_Bleu[1]))
    print("Bleu_3:", np.mean(scores_Bleu[2]))
    print("Bleu_4:", np.mean(scores_Bleu[3]))

    score_Meteor, scores_Meteor = Meteor().compute_score(gts, res)
    print("Meteor:", score_Meteor)

    score_Rouge, scores_Rouge = Rouge().compute_score(gts, res)
    print("ROUGE:", score_Rouge)

    score_Cider, scores_Cider = Cider().compute_score(gts, res)
    print("Cider:", score_Cider)
def evaluate(self):
    output = []
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Rouge(), "ROUGE_L")
    ]
    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print("scorer:", scorer)
        print("method:", method)
        score, scores = scorer.compute_score(self.gts, self.res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                print("%s: %0.5f" % (m, sc))
                output.append(sc)
        else:
            print("%s: %0.5f" % (method, score))
            output.append(score)
    return output
def evaluate(self): imgIds = self.params['image_id'] # imgIds = self.coco.getImgIds() gts = {} res = {} for imgId in imgIds: gts[imgId] = self.coco.imgToAnns[imgId] res[imgId] = self.cocoRes.imgToAnns[imgId] # ================================================= # Set up scorers # ================================================= tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr") ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setImgToEvalImgs(scs, gts.keys(), m) else: self.setEval(score, method) self.setImgToEvalImgs(scores, gts.keys(), method) self.setEvalImgs()
def evaluate(self):
    # imgIds = self.coco.getImgIds()
    # Ground truths come from the label list; the model predictions go into res.
    gts = dict(zip(range(0, len(self.label_list)), self.label_list))
    res = dict(zip(range(0, len(self.predicted_list)), self.predicted_list))
    # =================================================
    # Tokenization
    # =================================================
    print 'tokenization...'
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    # =================================================
    # Set up scorers
    # =================================================
    print 'setting up scorers...'
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.set_textid_to_eval(scs, gts.keys(), m)
                print "%s: %0.3f" % (m, sc)
        else:
            self.setEval(score, method)
            self.set_textid_to_eval(scores, gts.keys(), method)
            print "%s: %0.3f" % (method, score)
    self.set_eval()
def evaluate(self): output = [] scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), # (Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: # print 'computing %s score...'%(scorer.method()) score, scores = scorer.compute_score(self.gts, self.res) if type(method) == list: for sc, scs, m in zip(score, scores, method): print "%s: %0.5f" % (m, sc) output.append(sc) else: print "%s: %0.5f" % (method, score) output.append(score) return output
def evaluate(self):
    # ==================================================
    # Tokenization, remove punctuations
    # ==================================================
    '''
    print "tokenization ..."
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(self.ref)
    res = tokenizer.tokenize(self.res)
    '''
    gts = self.ref
    res = self.res  # tokenization above is disabled, so use the raw results directly
    # ==================================================
    # Set up scorers
    # ==================================================
    print "setting up scorers ..."
    scorers = [
        (Bleu(4), ("Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4")),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    # ==================================================
    # Compute scores
    # ==================================================
    out = {}
    for scorer, method in scorers:
        print "computing %s score ..." % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, tuple):
            for sc, scs, m in zip(score, scores, method):
                out[m] = sc
                print "%s: %0.4f" % (m, sc)
        else:
            print "%s: %0.4f" % (method, score)
            out[method] = score
    return out
def language_eval(sample_seqs, gt_seqs):
    # sample_seqs: list [[x,x],[x,x],...], gt_seqs: list [[list1,list2,...],[list1,list2,...],...]
    import sys
    #sys.path.append("caption-eval")
    sys.path.append("coco-caption/pycocoevalcap")
    from collections import OrderedDict  # needed below (may already be imported at module level)
    from bleu.bleu import Bleu
    from cider.cider import Cider
    from meteor.meteor import Meteor
    from rouge.rouge import Rouge

    assert len(sample_seqs) == len(gt_seqs), "number of eval data is different"

    res = OrderedDict()  # res: {0: [pred], 1: [pred], ...}
    for i in range(len(sample_seqs)):  # for each prediction
        res[i] = [sample_seqs[i]]

    gts = OrderedDict()  # gts: {0: [sent1, sent2, ...], 1: [sent1, sent2, ...], ...}
    for i in range(len(gt_seqs)):
        gts[i] = [gt_seqs[i][j] for j in range(len(gt_seqs[i]))]

    res = {i: res[i] for i in range(len(sample_seqs))}
    gts = {i: gts[i] for i in range(len(gt_seqs))}

    avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res)
    avg_cider_score, cider_scores = Cider().compute_score(gts, res)
    avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res)
    avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res)

    print(" BLEU1:{}\n BLEU2:{}\n BLEU3:{}\n BLEU4:{}\n METEOR:{}\n ROUGE:{}\n CIDEr:{}\n"
          .format(avg_bleu_score[0], avg_bleu_score[1], avg_bleu_score[2], avg_bleu_score[3],
                  avg_meteor_score, avg_rouge_score, avg_cider_score))

    return {'BLEU': avg_bleu_score, 'METEOR': avg_meteor_score,
            'ROUGE': avg_rouge_score, 'CIDEr': avg_cider_score}
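# --- Hypothetical usage sketch for language_eval (not part of the original module). ---
# It assumes each prediction is a single, already-tokenized string and each ground-truth
# entry is a list of reference strings for the same item; the sentences are invented.
def _example_language_eval():
    sample_seqs = ["a man rides a horse",
                   "two dogs play in the snow"]
    gt_seqs = [["a man is riding a horse", "someone rides a horse"],
               ["two dogs are playing in the snow"]]
    return language_eval(sample_seqs, gt_seqs)  # {'BLEU': [...], 'METEOR': ..., 'ROUGE': ..., 'CIDEr': ...}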
def evaluate(self): output = {} scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), # (Cider(), "CIDEr") ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: # print 'computing %s score...'%(scorer.method()) score, scores = scorer.compute_score(self.gts, self.res) if type(method) == list: for sc, scs, m in zip(score, scores, method): #print ("%s: %0.5f"%(m, sc)) output[m] = sc else: #print ("%s: %0.5f"%(method, score)) output[method] = score return output
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    #print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #(Meteor(), "METEOR"),  # disabled for now due to slow speed
        (Rouge(), "ROUGE_L")
        #(Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    # print('final_scores: ', final_scores)
    return final_scores
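# Illustrative note on the branch above (return values are hypothetical): Bleu(4)
# reports its corpus-level scores as a list [Bleu_1, Bleu_2, Bleu_3, Bleu_4], while
# Rouge() reports a single float, which is why score() checks type(score) == list.
#
#   bleu_avg, bleu_per_id = Bleu(4).compute_score(ref, hypo)    # bleu_avg -> [b1, b2, b3, b4]
#   rouge_avg, rouge_per_id = Rouge().compute_score(ref, hypo)  # rouge_avg -> float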
subparsers = parser.add_subparsers(help="choose between different modes of operation")
preprocess = subparsers.add_parser("preprocess", help="preprocessing related commands")
preprocess.add_argument("--op", help="operation", type=str, choices=["make", "vocab"], required=True)
exclusive = preprocess.add_mutually_exclusive_group(required=True)
exclusive.add_argument("--dataset", help="the dataset to process, should be a subdir of iobasedir",
                       action="append", default=[])
exclusive.add_argument("--topic", help="the topic to process, should be a subdir of iobasedir, and contain a processed topic",
                       action="append", default=[])
# preprocess_data = preprocess.add_subparsers(help="preprocessing commands")
# preprocess_dataset = preprocess_data.add_parser("dataset",
#     help="data preprocessing tool. prepare raw dataset for summarization")
# preprocess_dataset.add_argument("")
# postprocess = subparsers.add_parser("postprocess",
#     help="Postprocessing of results. Converting of raw results into pretty pictures and reports")

args = parser.parse_args()

do_preprocess(rouge=Rouge(args.rouge), datasets=args.dataset, topics=args.topic, operation=args.op)
def test_n_score(self): models = PerlScriptUtils._parse_models(self.models_path) systems = PerlScriptUtils._parse_systems(self.systems_path) # no swr, no stem n_scores = {} for system_id, cand_texts in tqdm(systems.items()): if system_id not in n_scores: n_scores[system_id] = {} for topic_id, cand_text in cand_texts.items(): if topic_id not in n_scores[system_id]: n_scores[system_id][topic_id] = {} ref_texts = models[topic_id].values() n_score = self.rouge.n_score(ref_texts, cand_text) n_scores[system_id][topic_id] = n_score n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl.out") for system_id, pyrouge_topics in n_scores.items(): for topic_id, pyrouge_scores in pyrouge_topics.items(): scores_perl = n_scores_perl[system_id][topic_id] for n, scores in scores_perl.items(): for k,v in scores.items(): # ROUGE truncates, while we round. self.assertAlmostEqual(v, pyrouge_scores[n][k], 4, "Results different from original ROUGE.") # swr, no stem self.rouge = Rouge.from_rouge155_args({"s": True}) n_scores = {} for system_id, cand_texts in tqdm(systems.items()): if system_id not in n_scores: n_scores[system_id] = {} for topic_id, cand_text in cand_texts.items(): if topic_id not in n_scores[system_id]: n_scores[system_id][topic_id] = {} ref_texts = models[topic_id].values() n_score = self.rouge.n_score(ref_texts, cand_text) n_scores[system_id][topic_id] = n_score n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl_swr.out") for system_id, pyrouge_topics in n_scores.items(): for topic_id, pyrouge_scores in pyrouge_topics.items(): scores_perl = n_scores_perl[system_id][topic_id] for n, scores in scores_perl.items(): for k,v in scores.items(): # ROUGE truncates, while we round. self.assertAlmostEqual(v, pyrouge_scores[n][k], 4, "Results different from original ROUGE (swr).") # stem, no swr self.rouge = Rouge.from_rouge155_args({"m": True}) n_scores = {} for system_id, cand_texts in tqdm(systems.items()): if system_id not in n_scores: n_scores[system_id] = {} for topic_id, cand_text in cand_texts.items(): if topic_id not in n_scores[system_id]: n_scores[system_id][topic_id] = {} ref_texts = models[topic_id].values() n_score = self.rouge.n_score(ref_texts, cand_text) n_scores[system_id][topic_id] = n_score n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl_stem.out") for system_id, pyrouge_topics in n_scores.items(): for topic_id, pyrouge_scores in pyrouge_topics.items(): scores_perl = n_scores_perl[system_id][topic_id] for n, scores in scores_perl.items(): for k,v in scores.items(): # ROUGE truncates, while we round. self.assertAlmostEqual(v, pyrouge_scores[n][k], 4, "Results different from original ROUGE (stem).") # stem, swr self.rouge = Rouge.from_rouge155_args({"m": True, "s": True}) n_scores = {} for system_id, cand_texts in tqdm(systems.items()): if system_id not in n_scores: n_scores[system_id] = {} for topic_id, cand_text in cand_texts.items(): if topic_id not in n_scores[system_id]: n_scores[system_id][topic_id] = {} ref_texts = models[topic_id].values() n_score = self.rouge.n_score(ref_texts, cand_text) n_scores[system_id][topic_id] = n_score n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl_stem_swr.out") for system_id, pyrouge_topics in n_scores.items(): for topic_id, pyrouge_scores in pyrouge_topics.items(): scores_perl = n_scores_perl[system_id][topic_id] for n, scores in scores_perl.items(): for k,v in scores.items(): # ROUGE truncates, while we round. 
self.assertAlmostEqual(v, pyrouge_scores[n][k], 4, "Results different from original ROUGE (stem, swr).")
    'オレンジ色 の Tシャツ を 着ている 人 が 立って います',
]
# predicted must contain exactly one caption per key
predicted = {}
predicted['262148'] = ['人 が オレンジ色 の シャツ を 着て 立って います']
# keys may be numbers or strings, but they must match between ground truth and predicted!

# compute BLEU
scorer = Bleu(4)
score, scores = scorer.compute_score(ground_truth, predicted)
print(scores)
for i, value in enumerate(scores):
    print(i, np.mean(value))
# not the same: corpus BLEU is not a plain mean of per-sentence scores
# (it combines n-gram precisions with a weighted geometric mean)

# METEOR requires an external thesaurus, so it is skipped here

# compute ROUGE
scorer = Rouge()
score, scores = scorer.compute_score(ground_truth, predicted)
print(score)
print(np.mean(scores))

# compute CIDEr
scorer = Cider()
score, scores = scorer.compute_score(ground_truth, predicted)
print(score)
print(np.mean(scores))
def cross_evaluate(self):
    """
    We evaluate how relevant the generated expression is to the ground-truth
    expressions, and how different it is from the expressions of the other
    objects within the same image.
    The prerequisite is that the dataset is split by image_id and each ann has
    multiple expressions, e.g., our new RefCOCO dataset whose testing objects
    have ~10 expressions each.
    We first compute the score sc_ii = (sent_i, gd_sents_i), then the score
    sc_ij = (sent_i, gd_sents_j); the margin max(0, sc_ii - sc_ij) is taken as
    the final score. Specifically, we choose METEOR and CIDEr for this kind of
    evaluation.
    To do so, we prepare ref_to_gts and ref_to_res. As we want to do cross
    evaluation, our key is a pair_id, i.e., "ref_id1_to_ref_id2", e.g., '123_456':
    input:
    - Gts[123_456] = [456's gd sents]
    - Res[123_456] = [123's predicted sents].
    return:
    - ref_to_eval[123_456] = {method: score}, which measures 123's generation
      against 456's gd sents.
    Note, we also compute the score of 123_123.
    We use "sids" and "cids" to denote source_ref_ids and cross_ref_ids.
    """
    source_ref_ids = [pred['ref_id'] for pred in self.preds]
    Preds = {pred['ref_id']: pred['sent'] for pred in self.preds}

    # construct pair_id, which is [source_ref_id]_[target_ref_id], i.e., 123_456
    Gts = {}
    Res = {}
    for source_ref_id in source_ref_ids:
        image_id = self.refer.Refs[source_ref_id]['image_id']
        cross_refs = self.refer.imgToRefs[image_id]  # including source_ref itself
        for cross_ref in cross_refs:
            pair_id = str(source_ref_id) + '_' + str(cross_ref['ref_id'])
            Res[pair_id] = [Preds[source_ref_id]]
            Gts[pair_id] = [sent['sent'] for sent in cross_ref['sentences']]

    # tokenize
    print 'tokenization...'
    tokenizer = PTBTokenizer()
    Gts = tokenizer.tokenize(Gts)
    Res = tokenizer.tokenize(Res)

    # set up scorers
    print 'setting up scorers...'
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # compute scores
    for scorer, method in scorers:
        print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(Gts, Res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEvals(scs, Gts.keys(), m)
                print "%s: %0.3f" % (m, sc)
        else:
            self.setEvals(scores, Gts.keys(), method)
            print "%s: %0.3f" % (method, score)
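# --- Hypothetical post-processing sketch for the margin described in the docstring above
# (max(0, sc_ii - sc_ij)); it is not part of the original class. It assumes a plain dict
# pair_to_eval mapping 'sid_cid' -> {method: score}, i.e. the kind of structure setEvals
# is expected to build.
def _cross_margins(pair_to_eval, source_ref_id, cross_ref_ids, method='CIDEr'):
    sc_ii = pair_to_eval['%s_%s' % (source_ref_id, source_ref_id)][method]
    margins = []
    for cid in cross_ref_ids:
        if cid == source_ref_id:
            continue
        sc_ij = pair_to_eval['%s_%s' % (source_ref_id, cid)][method]
        margins.append(max(0, sc_ii - sc_ij))
    return margins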
def valid(model, mode='all'): model.eval() with open(DATASET_DEV_CLS3, 'rb') as f: dataset_cls3 = pickle.load(f) with open(DATASET_DEV_CLS18, 'rb') as f: dataset_cls18 = pickle.load(f) dataset_summ_qa = data.ConcatDataset([dataset_cls3, dataset_cls18]) cls3_loader = torch.utils.data.DataLoader(dataset=dataset_cls3, batch_size=VALID_BATCH, shuffle=False, collate_fn=lambda x: x) cls3_iterator = iter(cls3_loader) cls18_loader = torch.utils.data.DataLoader(dataset=dataset_cls18, batch_size=VALID_BATCH, shuffle=False, collate_fn=lambda x: x) cls18_iterator = iter(cls18_loader) rouge_summ = rouge_qa = None acc_cls3 = acc_cls18 = 0 # -------------------------------------------------------------------- if mode in ['all', 'summ', 'qa']: data_val_sum_qa = [] if VALID_NUM > 0: for i in range(VALID_NUM): data_val_sum_qa.append(dataset_summ_qa[i]) else: for i in range(len(dataset_summ_qa)): data_val_sum_qa.append(dataset_summ_qa[i]) if mode in ['all', 'summ']: refs = [' '.join(data['question']) for data in data_val_sum_qa] x = [data['description'] for data in data_val_sum_qa] hyps = beam_search('summ', model, x) hyps = [' '.join(list(sent)) for sent in hyps] rouge = Rouge() try: rouge_summ = rouge.get_scores(hyps, refs, avg=True, ignore_empty=True) print_rouge(rouge_summ) except RuntimeError: print('Failed to compute Rouge!') if mode in ['all', 'qa']: refs = [' '.join(data['answer']) for data in data_val_sum_qa] x = [data['question'] for data in data_val_sum_qa] hyps = beam_search('qa', model, x) hyps = [' '.join(list(sent)) for sent in hyps] rouge = Rouge() try: rouge_qa = rouge.get_scores(hyps, refs, avg=True, ignore_empty=True) print_rouge(rouge_qa) except RuntimeError: print('Failed to compute Rouge!') # cls3 & cls18 def iter_through_cls_dev(iterator, mode): val_correct = 0 val_num = 0 for i in range(math.ceil(VALID_NUM / VALID_BATCH)): mini_batch = next(iterator) question = [data['question'] for data in mini_batch] description = [data['description'] for data in mini_batch] y_gt = torch.tensor([data['category'] for data in mini_batch]).to(device) y_pred = model(source=description, source2=question, target=None, mode=mode) y_pred_labels = torch.argmax(y_pred, dim=1) val_correct += (y_gt == y_pred_labels).sum().item() val_num += len(mini_batch) return val_correct / val_num if mode in ['all', 'cls3']: acc_cls3 = iter_through_cls_dev(cls3_iterator, 'cls3') print('Acc_cls3:', acc_cls3) if mode in ['all', 'cls18']: acc_cls18 = iter_through_cls_dev(cls18_iterator, 'cls18') print('Acc_cls18:', acc_cls18) if is_training: model.train() return rouge_summ, rouge_qa, acc_cls3, acc_cls18
class CaptionEvaluater(object):
    def __init__(self):
        self.bleu_scorer = Bleu(4)
        self.rouge_scorer = Rouge()
        self.cider_scorer = Cider()
        self.truth = None
        remove = string.punctuation + "、。,."
        self.remove_pattern = r"[{}]".format(remove)  # create the pattern

    def remove_punctuation(self, line):
        # I am not sure how unicode works in python, so just in case.
        line = line.replace(u"<unk>", "")
        line = line.replace("<unk>", "")
        line = line.replace(u"。", "")
        line = line.replace('\u3002', "")
        return re.sub(self.remove_pattern, "", line)

    def transform_utf8(self, line):
        # return u' '.join(line).encode('utf-8').strip()
        return line

    def set_ground_truth(self, ground_truth):
        '''
        ground_truth should be a python dictionary whose shape is:
        {"image_identifier": ["a caption", "a similar caption", ...], ...}
        "image_identifier" can be either a string or a number.
        '''
        for img in ground_truth:
            # ground_truth[img] = map(self.transform_utf8, ground_truth[img])
            ground_truth[img] = list(map(self.remove_punctuation, ground_truth[img]))
        self.truth = ground_truth

    def evaluate(self, predicted_captions):
        '''
        predicted_captions should be a python dictionary whose shape is:
        {"image_identifier": ["the predicted caption"], ...}
        "image_identifier" needs to be the same as used in the ground truth.
        Make sure there is only one caption per image, even though it uses a python list.
        '''
        for img in predicted_captions:
            # predicted_captions[img] = map(self.transform_utf8, predicted_captions[img])
            predicted_captions[img] = list(map(self.remove_punctuation, predicted_captions[img]))
        results = {}
        for i, score in enumerate(self.get_bleu(predicted_captions)[0]):
            results["bleu-%d" % i] = score
        results["rouge"] = self.get_rouge(predicted_captions)[0]
        results["cider"] = self.get_cider(predicted_captions)[0]
        return results

    def get_bleu(self, predicted_captions):
        score, scores = self.bleu_scorer.compute_score(self.truth, predicted_captions)
        # score is a python list [bleu-1, bleu-2, bleu-3, bleu-4]
        return score, scores

    def get_rouge(self, predicted_captions):
        score, scores = self.rouge_scorer.compute_score(self.truth, predicted_captions)
        return score, scores

    def get_cider(self, predicted_captions):
        score, scores = self.cider_scorer.compute_score(self.truth, predicted_captions)
        return score, scores
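# --- Hypothetical usage sketch for CaptionEvaluater (not part of the original module). ---
# The identifiers and captions below are invented; the dict shapes follow the docstrings above.
def _example_caption_eval():
    evaluater = CaptionEvaluater()
    evaluater.set_ground_truth({
        "img_001": ["a man rides a horse", "a person is riding a horse"],
        "img_002": ["two dogs play in the snow"],
    })
    predictions = {
        "img_001": ["a man is riding a horse"],
        "img_002": ["dogs playing in the snow"],
    }
    return evaluater.evaluate(predictions)  # {'bleu-0': ..., 'bleu-3': ..., 'rouge': ..., 'cider': ...}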
def evaluate(self): evalRefIds = [ann['ref_id'] for ann in self.Res] refToGts = {} refToGtRanks1 = {} refToGtRanks2 = {} for ref_id in evalRefIds: ref = self.refer.Refs[ref_id] gt_sents = [sent['sent'] for sent in ref['sentences']] refToGts[ref_id] = gt_sents if self.eval_cider_r: gt_ranks1 = self.refer.get_rank1(ref) gt_ranks2 = self.refer.get_rank2(ref) refToGtRanks1[ref_id] = gt_ranks1 refToGtRanks2[ref_id] = gt_ranks2 refToRes = {ann['ref_id']: [ann['sent']] for ann in self.Res} print('tokenization...') tokenizer = PTBTokenizer() self.refToRes = tokenizer.tokenize(refToRes) self.refToGts = tokenizer.tokenize(refToGts) # ================================================= # Set up scorers # ================================================= print('setting up scorers...') scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), ] if self.eval_cider_r: scorers.append((CiderR(), "R_CIDEr")) scorers.append((CiderRa(), "Ra_CIDEr")) # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print('computing %s score...' % (scorer.method())) if method == "R_CIDEr": score, scores = scorer.compute_score(self.refToGts, self.refToRes, refToGtRanks1) elif method == "Ra_CIDEr": score, scores = scorer.compute_score(self.refToGts, self.refToRes, refToGtRanks2) else: score, scores = scorer.compute_score(self.refToGts, self.refToRes) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setRefToEvalRefs(scs, self.refToGts.keys(), m) print("%s: %0.3f" % (m, sc)) else: self.setEval(score, method) self.setRefToEvalRefs(scores, self.refToGts.keys(), method) print("%s: %0.3f" % (method, score)) self.setEvalRefs()
def evaluate(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        res[imgId] = self.cocoRes.imgToAnns[imgId]
        gts[imgId] = self.coco.imgToAnns[imgId]
    # =================================================
    # Tokenization
    # =================================================
    print 'tokenization...'
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize_gt(gts)
    res = tokenizer.tokenize(res)
    # =================================================
    # Set up scorers
    # =================================================
    print 'setting up scorers...'
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, gts.keys(), m)
                print "%s: %0.3f" % (m, sc)
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, gts.keys(), method)
            print "%s: %0.3f" % (method, score)
    self.setEvalImgs()
def evaluate_summ_qa(model, dataset, mode, batch_size=64): assert mode in ('summ', 'qa'), 'Invalid mode!' model.eval() data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x) rouge1_f_sum = rouge2_f_sum = rougeL_f_sum = bleu_sum = 0 examples_rouge = examples_bleu = 0 rouge = Rouge() count = 0 if mode == 'summ': for mini_batch in tqdm(data_loader): count += 1 refs = [' '.join(data['question']) for data in mini_batch] x = [data['description'] for data in mini_batch] hyps_raw = beam_search('summ', model, x) hyps = [' '.join(list(sent)) for sent in hyps_raw] try: rouge_score = rouge.get_scores(hyps, refs, avg=True, ignore_empty=True) rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch) rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch) rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch) examples_rouge += len(mini_batch) except ValueError as e: print(str(e) + ' | continuing...') continue elif mode == 'qa': for mini_batch in tqdm(data_loader): count += 1 refs = [' '.join(data['answer']) for data in mini_batch] x = [data['question'] for data in mini_batch] hyps_raw = beam_search('qa', model, x) hyps = [' '.join(list(sent)) for sent in hyps_raw] try: rouge_score = rouge.get_scores(hyps, refs, avg=True, ignore_empty=True) rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch) rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch) rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch) examples_rouge += len(mini_batch) except ValueError as e: print(str(e) + ' | continuing...') continue # calculate BLEU score refs = [data['answer'] for data in mini_batch] hyps = [list(sent) for sent in hyps_raw] smoothie = SmoothingFunction().method4 for i in range(len(hyps)): try: bleu = sentence_bleu([refs[i]], hyps[i], smoothing_function=smoothie) bleu_sum += bleu examples_bleu += 1 except ZeroDivisionError as e: print(str(e) + ' | continuing...') continue rouge_1_f = rouge1_f_sum / examples_rouge rouge_2_f = rouge2_f_sum / examples_rouge rouge_L_f = rougeL_f_sum / examples_rouge if mode == 'qa': bleu_score = bleu_sum / examples_bleu # with open('output/test_{}.txt'.format(mode), 'w', encoding='utf-8') as f: # f.write('rouge-1 f: ' + str(rouge_1_f) + '\n') # f.write('rouge-2 f: ' + str(rouge_2_f) + '\n') # f.write('rouge-L f: ' + str(rouge_L_f) + '\n') # f.write('\n') # # for i in range((len(candidates)): # f.write('input: ' + inputs[i] + '\n') # f.write('hyp: ' + ''.join(candidates[i]) + '\n') # f.write('ref: ' + targets[i] + '\n\n') if is_training: model.train() print('rouge-1 f: ' + str(rouge_1_f)) print('rouge-2 f: ' + str(rouge_2_f)) print('rouge-L f: ' + str(rouge_L_f)) if mode == 'qa': print('bleu: ', bleu_score)
hdf = hdf.loc[hdf['LP'] == lp] hdf = hdf.loc[hdf['SYSTEM'] == sys] hdf.reset_index(drop=True, inplace=True) cands = [] fc = open(csdir + '/' + cs, "r", encoding='utf-8') while True: line = fc.readline() if not line: break cands.append(line) assert len(cands) == len(refs) rouge = Rouge() scores = rouge.get_scores(cands, refs) R1 = [one['rouge-1']['f'] for one in scores] R2 = [one['rouge-2']['f'] for one in scores] R3 = [one['rouge-l']['f'] for one in scores] outlist.append([ lp, sys, np.mean(R1), np.mean(R2), np.mean(R3), hdf['HUMAN'].item() ]) end = perf_counter() print("LP : {0:10}SYS: {1:30s}time taken: {2:5.3f}".format( lp, sys, end - start)) sz = len(cses) pees = [row[2] for row in outlist[-sz:]]
def setUp(self): self.rouge = Rouge.from_rouge155_args() self.models_path = Path("duc2005_subset/models") self.systems_path = Path("duc2005_subset/peers")
def evaluate(self, gt_path, results_path): #imgIds = self.params['image_id'] # imgIds = self.coco.getImgIds() gts = {} gt_pkl = pickle.load(open(gt_path, 'rb')) for key in gt_pkl.keys(): sample = gt_pkl[key] img_id = int(sample['image_id']) caption = sample['caption'] if img_id in gts: gts[img_id].append({'image_id': img_id, 'caption': caption}) else: gts[img_id] = [{'image_id': img_id, 'caption': caption}] #print(gts) res = {} res_json = json.load(open(results_path,'rb')) for sample in res_json: img_id = int(sample['image_id']) res[img_id] = [sample] #print(res) #print(1/0) #for imgId in imgIds: #gts[imgId] = self.coco.imgToAnns[imgId] #print(gts[imgId]) #res[imgId] = self.cocoRes.imgToAnns[imgId] #print(imgId) #print(res[imgId]) #print(1/0) #print(res) #print(1/0) # ================================================= # Set up scorers # ================================================= print 'tokenization...' tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= print 'setting up scorers...' scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr") #(Spice(), "SPICE") ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print 'computing %s score...'%(scorer.method()) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setImgToEvalImgs(scs, gts.keys(), m) print "%s: %0.3f"%(m, sc) else: self.setEval(score, method) self.setImgToEvalImgs(scores, gts.keys(), method) print "%s: %0.3f"%(method, score) self.setEvalImgs()
    if i % 2 == 1:
        res[int(line.strip('\n').split(':')[0])] = [line.strip('\n').split(':')[2]]
    elif i % 2 == 0:
        gts[int(line.strip('\n').split(':')[0])] = [line.strip('\n').split(':')[2]]

hyps = []
refs = []
bleu_score = 0.0
for k in res:
    assert k in gts
    hyps.append(res[k][0])
    refs.append(gts[k][0])
for hyp, ref in zip(hyps, refs):
    hyp = hyp.strip().split()
    ref = ref.strip().split()
    bleu_score += sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method4)
print("score_Bleu:", bleu_score * 1.0 / len(hyps))

score_Meteor, scores_Meteor = Meteor().compute_score(gts, res)
print("Meteor:", score_Meteor)

score_Rouge, scores_Rouge = Rouge().compute_score(gts, res)
print("ROUGE:", score_Rouge)
# res[imgId] = res_results[imgId] # ================================================= # Set up scorers # ================================================= # print 'tokenization...' # tokenizer = PTBTokenizer() # gts = tokenizer.tokenize(gts) # res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= print 'setting up scorers...' scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), #(Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr") ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print 'computing %s score...' % (scorer.method()) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): #self.setEval(sc, m) #self.setImgToEvalImgs(scs, gts.keys(), m) print "%s: %0.3f" % (m, sc) else: #self.setEval(score, method)
filename = '../data/ASAP_AES/training_set_rel3.tsv'
ref_file = '../data/ASAP_AES/reference_3_aes.tsv'
df = pd.read_csv(filename, delimiter='\t', encoding='ISO-8859-1')
ref1 = pd.read_csv(ref_file, delimiter='\t')
can1 = df.loc[df['essay_set'] == 3]
can1.reset_index(drop=True, inplace=True)
cands = list(can1['essay'])
print("Candidate Sentences: ", len(cands))
# print(cands[0])
ref1.reset_index(drop=True, inplace=True)
ref = ref1['Reference'][0]

rouge = Rouge()
sc = []
for cand, canid, hscore in zip(cands, can1['essay_id'], can1['domain1_score']):
    scores = rouge.get_scores(cand, ref)[0]
    sc.append([canid,
               scores['rouge-1']['f'], scores['rouge-1']['p'], scores['rouge-1']['r'],
               scores['rouge-2']['f'], scores['rouge-2']['p'], scores['rouge-2']['r'],
               scores['rouge-l']['f'], scores['rouge-l']['p'], scores['rouge-l']['r'],
               hscore])
# print(can1.columns)
odf = pd.DataFrame(sc, columns=['cand id', 'R1f', 'R1p', 'R1r',
                                'R2f', 'R2p', 'R2r',
                                'RLf', 'RLp', 'RLr', 'score'])
odf.reset_index(drop=True, inplace=True)
odf.to_csv("outs.tsv", sep="\t", index=False, header=True)
from six.moves import cPickle import torch import torch.nn as nn import opts opt = opts.parse_opt() from bleu.bleu import Bleu from meteor.meteor import Meteor from cider.cider import Cider from rouge.rouge import Rouge Bleu_score = Bleu(4) Meteor_score = Meteor() Cider_score = Cider() Rouge_score = Rouge() with open(opt.train_data_path, 'rb') as f: print('\nload {}'.format(opt.train_data_path)) train_data = cPickle.load(f) with open(opt.val_data_path, 'rb') as f: print('\nload {}'.format(opt.val_data_path)) val_data = cPickle.load(f) with open(opt.test_data_path, 'rb') as f: print('\nload {}'.format(opt.test_data_path)) test_data = cPickle.load(f) with open(opt.token2index_path, 'rb') as f: print('\nload {}'.format(opt.token2index_path))
def compute_metrics_from_files(p_path_to_reference_file, p_path_to_candidate_file, p_max_bleu_order): """Compute BLEU-N and ROUGE-L metrics. IMPORTANT: No-answer reference will be excluded from calculation. Args: p_path_to_reference_file (str): path to reference file. p_path_to_candidate_file (str): path to candidate file. Both files should be in format: {QUERY_ID_JSON_ID: <a_query_id_int>, ANSWERS_JSON_ID: [<list_of_answers_string>]} p_max_bleu_order: the maximum n order in bleu_n calculation. Returns: dict: dictionary of {'bleu_n': <bleu_n score>, 'rouge_l': <rouge_l score>} """ reference_dictionary, reference_no_answer_query_ids = \ load_file(p_path_to_reference_file) candidate_dictionary, candidate_no_answer_query_ids = load_file(p_path_to_candidate_file) query_id_answerable = set(reference_dictionary.keys())-reference_no_answer_query_ids query_id_answerable_candidate = set(candidate_dictionary.keys())-candidate_no_answer_query_ids true_positives = len(query_id_answerable_candidate.intersection(query_id_answerable)) false_negatives = len(query_id_answerable)-true_positives true_negatives = len(candidate_no_answer_query_ids.intersection(reference_no_answer_query_ids)) false_positives = len(reference_no_answer_query_ids)-true_negatives precision = float(true_positives)/(true_positives+false_positives) if (true_positives+false_positives)>0 else 1. recall = float(true_positives)/(true_positives+false_negatives) if (true_positives+false_negatives)>0 else 1. F1 = 2 *((precision*recall)/(precision+recall)) filtered_reference_dictionary = \ {key: value for key, value in reference_dictionary.items() \ if key not in reference_no_answer_query_ids} filtered_candidate_dictionary = \ {key: value for key, value in candidate_dictionary.items() \ if key not in reference_no_answer_query_ids} for query_id, answers in filtered_candidate_dictionary.items(): assert \ len(answers) <= 1, \ 'query_id %d contains more than 1 answer \"%s\" in candidate file' % \ (query_id, str(answers)) reference_query_ids = set(filtered_reference_dictionary.keys()) candidate_query_ids = set(filtered_candidate_dictionary.keys()) common_query_ids = reference_query_ids.intersection(candidate_query_ids) assert (len(common_query_ids) == len(reference_query_ids)) and \ (len(common_query_ids) == len(candidate_query_ids)), \ 'Reference and candidate files must share same query ids' all_scores = {} bleu_scores, _ = \ Bleu(p_max_bleu_order).compute_score(filtered_reference_dictionary, \ filtered_candidate_dictionary) for i, bleu_score in enumerate(bleu_scores): all_scores['bleu_%d' % (i+1)] = bleu_score rouge_score, _ = Rouge().compute_score(filtered_reference_dictionary, \ filtered_candidate_dictionary) all_scores['rouge_l'] = rouge_score all_scores['F1'] = F1 similarity = 0 for key in filtered_reference_dictionary: candidate_answer = nlp(filtered_candidate_dictionary[key][0]) reference_answer = filtered_reference_dictionary[key] answersimilarity = 0 for answer in reference_answer: answersimilarity += candidate_answer.similarity(nlp(answer)) similarity += answersimilarity/len(reference_answer) semantic_similarity = similarity/len(filtered_reference_dictionary) all_scores['Semantic_Similarity'] = semantic_similarity return all_scores
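# --- Hypothetical usage sketch for compute_metrics_from_files (paths and contents are placeholders). ---
# Each input file is expected to contain one JSON object per line in the format described in
# the docstring above, e.g.
#   {"query_id": 1, "answers": ["rome is the capital of italy"]}
# (the exact key names depend on the QUERY_ID_JSON_ID / ANSWERS_JSON_ID constants of this module).
#
#   metrics = compute_metrics_from_files('references.jsonl', 'candidates.jsonl', 4)
#   print(metrics['bleu_4'], metrics['rouge_l'], metrics['F1'], metrics['Semantic_Similarity'])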
def evaluate(self, preds, measure=None): """ measure is a subset of ['bleu', 'meteor', 'rouge', 'cider'] if measure is None, we will apply all the above. """ # story_img_ids -> pred story str stimgids_to_Res = { item['stimgids']: [item['pred_story_str'].encode('ascii', 'ignore').decode('ascii')] for item in preds } # story_img_ids -> gt storie str(s) stimgids_to_stories = {} for story in self.vist_sis.stories: story_img_ids = '_'.join( [str(img_id) for img_id in story['img_ids']]) if story_img_ids in stimgids_to_stories: stimgids_to_stories[story_img_ids] += [story] else: stimgids_to_stories[story_img_ids] = [story] stimgids_to_Gts = {} for stimgids in stimgids_to_Res.keys(): gd_story_strs = [] related_stories = stimgids_to_stories[stimgids] for story in related_stories: gd_sent_ids = self.vist_sis.Stories[story['id']]['sent_ids'] gd_story_str = ' '.join([ self.vist_sis.Sents[sent_id]['text'] for sent_id in gd_sent_ids ]) gd_story_str = gd_story_str.encode('ascii', 'ignore').decode( 'ascii') # ignore some weird token gd_story_strs += [gd_story_str] stimgids_to_Gts[stimgids] = gd_story_strs # tokenize # print 'tokenization ... ' # tokenizer = PTBTokenizer() # self.stimgids_to_Res = tokenizer.tokenize(stimgids_to_Res) # self.stimgids_to_Gts = tokenizer.tokenize(stimgids_to_Gts) self.stimgids_to_Res = stimgids_to_Res self.stimgids_to_Gts = stimgids_to_Gts # ================================================= # Set up scorers # ================================================= print 'setting up scorers...' scorers = [] if not measure: scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] else: if 'bleu' in measure: scorers += [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])] if 'meteor' in measure: scorers += [(Meteor(), "METEOR")] if 'rouge' in measure: scorers += [(Rouge(), "ROUGE_L")] if 'cider' in measure: scorers += [(Cider(), "CIDEr")] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print 'computing %s score ...' % (scorer.method()) score, scores = scorer.compute_score(self.stimgids_to_Gts, self.stimgids_to_Res) if isinstance(method, list): for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setStimgidsToEval(scs, self.stimgids_to_Gts.keys(), m) print '%s: %.3f' % (m, sc) else: self.setEval(score, method) self.setStimgidsToEval(scores, self.stimgids_to_Gts.keys(), method) print '%s: %.3f' % (method, score) self.setEvalStimgids()