def eval_meteor(references, preds, best_match=False):
    if best_match:
        meteor_scores = []
        for refs, pred in zip(references, preds):
            instance_scores = [meteor_score([ref], pred) for ref in refs]
            meteor_scores.append(max(instance_scores))
    else:
        meteor_scores = [meteor_score(inst[0], inst[1]) for inst in zip(references, preds)]
    return round(sum(meteor_scores) / len(meteor_scores), 3), meteor_scores

def calculate_metric(hyp, ref, context, effective_length=1024):
    # ===== Calculate ROUGE =====
    with open('../result/rouge.txt', 'a') as f_result:
        rouge = Rouge()
        print(len(hyp))
        print(len(ref))
        hyp, ref = zip(*[(x, y) for x, y in zip(hyp, ref) if len(x) > 3 and len(y) > 3])
        print(len(hyp))
        hyp = [x[:effective_length] for x in hyp]
        ref = [x[:effective_length] for x in ref]
        scores = rouge.get_scores(hyp, ref, avg=True)
        print("ROUGE", scores)
        import time
        f_result.write(time.asctime() + '\n')
        f_result.write(args.model_dir + '\t' + str(effective_length) + '\n')
        f_result.write(str(scores))
        f_result.write('\n')

        # ===== Calculate METEOR =====
        print("#ref{} #hyp{}".format(len(ref), len(hyp)))
        meteor_sum = 0
        for i in range(min(len(ref), len(hyp))):
            meteor_sum += meteor_score([ref[i]], hyp[i])
        meteor_sum /= min(len(ref), len(hyp))
        print(meteor_sum)

def print_out_bleu_and_meteor_score(predicted_path, expected_path):
    scores = [('BLEU SCORE-1: ', []), ('BLEU SCORE-2: ', []), ('BLEU SCORE-3: ', []),
              ('BLEU SCORE-4: ', []), ('METEOR SCORE: ', [])]
    with open(predicted_path, 'r') as fp_pred, open(expected_path, 'r') as fp_exp:
        for prediction, expected in tzip(fp_pred, fp_exp):
            prediction_list = prediction.split(' ')
            expected_list = expected.split(' ')
            # sentence_bleu and meteor_score take the references first, then the hypothesis.
            scores[0][1].append(sentence_bleu([expected_list], prediction_list, weights=(1, 0, 0, 0)))
            scores[1][1].append(sentence_bleu([expected_list], prediction_list, weights=(0, 1, 0, 0)))
            scores[2][1].append(sentence_bleu([expected_list], prediction_list, weights=(0, 0, 1, 0)))
            scores[3][1].append(sentence_bleu([expected_list], prediction_list, weights=(0, 0, 0, 1)))
            scores[4][1].append(meteor_score([expected], prediction))
    for score in scores:
        print(score[0] + str(sum(score[1]) / len(score[1])))
    return 0

def print_metrics(model, device, dataset, dataloader):
    references, hypotheses = get_references_and_hypotheses(
        model, device, dataset, dataloader)

    # bleu scores
    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu_3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
    bleu_4 = corpus_bleu(references, hypotheses)
    print('BLEU-1 ({})\t'
          'BLEU-2 ({})\t'
          'BLEU-3 ({})\t'
          'BLEU-4 ({})\t'.format(bleu_1, bleu_2, bleu_3, bleu_4))

    # meteor score
    total_m_score = 0.0
    for i in range(len(references)):
        actual = [" ".join(ref) for ref in references[i]]
        total_m_score += meteor_score(actual, " ".join(hypotheses[i]))
    m_score = total_m_score / len(references)
    print('Meteor Score: {}'.format(m_score))

    metrics = {
        'bleu_1': bleu_1,
        'bleu_2': bleu_2,
        'bleu_3': bleu_3,
        'bleu_4': bleu_4,
        'meteor': m_score
    }
    return metrics

def compute_score(self, candidate: str, references: List[str]) -> Tensor:
    score = meteor_score(references, candidate,
                         alpha=self.alpha, beta=self.beta, gamma=self.gamma)
    return torch.scalar_tensor(score)

def getBLUEAndMEteroScores(sumDoc, refDoc, tClean):
    # refDoc = tClean.getSentTokenization(refDoc)
    BLUE = sentence_bleu(refDoc, sumDoc)
    MEtero = meteor_score(refDoc, sumDoc)
    # the return was commented out in the original, leaving the scores unused;
    # restored so the function actually reports what it computes
    return "{}\t{}".format(tClean.toRound(BLUE), tClean.toRound(MEtero))

def calculate_m_score(target, predictions, length):
    score = 0
    for t, p in zip(target, predictions):
        score += meteor_score(t, p)
    return score / length

def _get_sent_meteor(
    hypothesis: List[str],
    references: List[List[str]],
    extra_args: Optional[Dict[str, str]] = None
) -> List[float]:
    joined_references = list(zip(*references))
    return [
        meteor_score(r, h) for r, h in zip(joined_references, hypothesis)
    ]

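# Hedged illustration (not from the original module) of the zip(*references)
# transposition used in _get_sent_meteor, with made-up data: references arrive as
# one list per reference stream, and zip(*references) regroups them so each
# hypothesis is paired with all of its references.
#   refs = [["a cat", "a dog"], ["the cat", "the dog"]]   # 2 streams, 2 sentences
#   list(zip(*refs)) == [("a cat", "the cat"), ("a dog", "the dog")]
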
def calculate_meteor(results):
    meteor_scores = []
    for key in results:
        references = results[key][1]
        hypothesis = results[key][0]
        score = meteor_score([' '.join(reference) for reference in references],
                             ' '.join(hypothesis))
        meteor_scores.append(score)
    return statistics.mean(meteor_scores), statistics.stdev(meteor_scores)

def compute_(self, **kwargs):
    question_decoded = self.dataset.question_tokenizer.decode(
        kwargs["state"].text.numpy()[0], ignored=["<SOS>"], stop_at_end=True)
    ref_questions = kwargs["ref_questions_decoded"]
    score = meteor_score(references=ref_questions, hypothesis=question_decoded)
    self.metric.append(score)

def Metrics(file_loc, increment=4, embedding_dict=None):
    rouge_p = []
    rouge_r = []
    rouge_f = []
    bleu = []
    fp = open(file_loc)
    D = fp.readlines()
    r_pre = 0.0
    r_rec = 0.0
    r_f1 = 0.0
    bert = 0.0
    sent_bleu = 0.0
    meteor_s = 0.0
    cnt_ = 0
    i = 0
    while i < len(D):
        tar = D[i + 2].split()[1:]
        mod = D[i + 1].split()[1:]
        if '<eor>' in tar:
            ind_tar = tar.index('<eor>')
        else:
            ind_tar = -1
        if '<eor>' in mod:
            ind_mod = mod.index('<eor>')
        else:
            ind_mod = -1
        tar_embs = []
        mod_embs = []
        for word in tar[:ind_tar]:
            if word in embedding_dict:
                tar_embs += [embedding_dict[word]]
        tar_embs = np.stack(tar_embs)
        for word in mod[:ind_mod]:
            if word in embedding_dict:
                mod_embs += [embedding_dict[word]]
        mod_embs = np.stack(mod_embs)
        tar_emb = np.sum(tar_embs, axis=0)
        mod_emb = np.sum(mod_embs, axis=0)
        bert -= np.mean((tar_emb - mod_emb) ** 2)
        r_scores = R.get_scores(' '.join(mod[:ind_mod]), ' '.join(tar[:ind_tar]))
        sent_bleu += bleu_met([mod[:ind_mod]], tar[:ind_tar], (0.5, 0.5))
        meteor_s += meteor_score([' '.join(mod[:ind_mod])], ' '.join(tar[:ind_tar]))
        r_pre += r_scores[0]['rouge-l']['p']
        r_rec += r_scores[0]['rouge-l']['r']
        r_f1 += r_scores[0]['rouge-l']['f']
        i += increment
        cnt_ += 1
    return {
        'METEOR': meteor_s / float(cnt_),
        'BLEU': sent_bleu / float(cnt_),
        'F1': r_f1 / float(cnt_),
        'BERT': bert / float(cnt_)
    }

def get_meteor_score(result_list):
    total_meteor = 0
    for line in result_list:
        single_reference = [line[0], line[1]]
        # reference_list.append(single_reference)
        # candidate_list.append(line[2])
        score = meteor_score(single_reference, line[2], wordnet=wordnet)
        total_meteor += score
    print("meteor_score: ", total_meteor / len(result_list))

def get_meteor_score(hypothesis: List[List[str]], reference: List[str]) -> list:
    meteor_score_list = []
    for (hyp, ref) in list(zip(hypothesis, reference)):
        try:
            m_score = meteor_score(hyp, ref)
            meteor_score_list.append(m_score)
        except Exception:
            continue
    return meteor_score_list

def _compute_meteor(reference, predict):
    """Compute the METEOR score.
    Input: reference is a sentence string, e.g. reference = "I have a car";
           meteor_score expects a list of references: [reference1, reference2, reference3].
    Input: predict is the sentence string to score.
    """
    # meteor_score takes the list of references first, then the hypothesis,
    # so the single reference is wrapped in a list as the docstring describes.
    meteor = meteor_score([reference], predict)
    # single_meteor_score compares one reference to one hypothesis:
    # meteor = single_meteor_score(reference, predict)
    return meteor

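# A minimal, hedged usage sketch (not part of the original snippet) showing the
# reference-list convention that _compute_meteor relies on. It assumes NLTK with
# the 'wordnet' corpus downloaded; recent NLTK releases (roughly 3.6.6 and later)
# also expect pre-tokenized input, while older releases accepted plain strings.
from nltk.translate.meteor_score import meteor_score

reference = "I have a car"
predict = "I have a new car"

# Older NLTK: plain strings are accepted.
# score = meteor_score([reference], predict)

# Newer NLTK: tokenize both sides first.
score = meteor_score([reference.split()], predict.split())
print(round(score, 3))
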
def corpus_meteor(list_of_refs, list_of_hypos):
    # the original input format of the Meteor metric is different from the BLEU series;
    # in this function, we change the BLEU format to fit Meteor
    Meteor = 0.0
    for i, ref in enumerate(list_of_refs):
        ref_list_tmp = [' '.join(intlist2strlist(val)) for val in ref]
        hypo_tmp = ' '.join(intlist2strlist(list_of_hypos[i]))
        Meteor += meteor_score(ref_list_tmp, hypo_tmp)
    return Meteor / (len(list_of_hypos))

def compute(self, hypotheses, references):
    try:
        # look up the corpus under its full resource path so an installed copy is found
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    # meteor_score takes the references first; pair each hypothesis with its reference
    # (the original unpacked the zip in the opposite order, swapping the two roles).
    return sum([
        meteor_score([ref], hyp)
        for (hyp, ref) in zip(hypotheses, references)
    ]) / len(references)

def forward(self, hypothesis: List[List[str]], references: List[List[List[str]]]) -> float:
    if len(hypothesis) != len(references):
        raise ValueError(f'Batch size of hypothesis and references are different '
                         f'({len(hypothesis)} != {len(references)}).')
    batch_scores = []
    for hyp, refs in zip(hypothesis, references):
        hyp = ' '.join(hyp)
        refs = [' '.join(ref) for ref in refs]
        score = meteor_score(hypothesis=hyp, references=refs,
                             alpha=self.alpha, beta=self.beta, gamma=self.gamma)
        batch_scores.append(score)
    return torch.mean(torch.as_tensor(batch_scores)).item()

def test_preprocess(self):
    # Using lists instead of strings specifically to demonstrate use of `preprocess`.
    reference = [["this", "is", "a", "test"], ["this", "is", "test"]]
    candidate = ["this", "is", "a", "test"]

    # no `preprocess` argument
    self.assertRaises(TypeError, meteor_score, reference, candidate)

    # with `preprocess` argument
    score = meteor_score(reference, candidate, preprocess=lambda x: " ".join(x))
    assert score == 0.9921875

def corpus_meteor(references, hypotheses):
    """
    The original input format of the Meteor metric is different from the BLEU series.
    In this function, we change the BLEU format to fit Meteor.
    """
    def to_str(values):
        return [str(val) for val in values]

    Meteor = 0.0
    for gt_group, pred in zip(references, hypotheses):
        gt = [' '.join(to_str(val)) for val in gt_group]
        pred = ' '.join(to_str(pred))
        Meteor += meteor_score(gt, pred)
    return Meteor / (len(references))

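# Hedged usage sketch for corpus_meteor above, with made-up data (not from the
# original project). The joined-string call inside corpus_meteor matches the older
# NLTK meteor_score interface that accepted untokenized strings.
#   toy_refs = [[[1, 2, 3], [1, 2, 4]]]   # one sample with two integer-id references
#   toy_hyps = [[1, 2, 3]]                # one hypothesis as integer ids
#   corpus_meteor(toy_refs, toy_hyps)     # average METEOR over the corpus
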
def METEOR(image_names, captions, encoder, decoder, range1=6000, range2=7000):
    scoreList4 = []
    for i in range(range1, range2):
        image_path = image_names[i]
        # five ground-truth captions per image, re-joined into sentence strings
        refs = [' '.join(captions[image_path][k].split()) for k in range(5)]
        result, attention_plot = evaluate(image_path, encoder, decoder)
        result = result[:-1]
        hyp = ' '.join(result)
        # meteor_score takes the references first, then the generated hypothesis
        # (the original passed them the other way round); keep the best score
        # over the five references.
        score = max(meteor_score([ref], hyp) for ref in refs)
        scoreList4.append(score)
    return (sum(scoreList4) * 100) / len(scoreList4)

def sim_meteor(self, hyps, ref):
    """
    :param hyps - a list of hypothesis strings
    :param ref - the reference string
    :return max meteor - best METEOR score over the hypotheses
    :return avg meteor - average METEOR score over the hypotheses
    """
    scores = []
    for hyp in hyps:
        # try:
        scores.append(meteor_score([ref], hyp))
        # except:
        #     scores.append(0.0)
    return np.max(scores), np.mean(scores)

def get_scores(model, loader, word_dict, idx_dict, device, debug):
    model.eval()

    references = []
    hypotheses = []
    for batch_idx, (imgs, captions, all_captions) in tqdm(enumerate(loader)):
        imgs, captions = Variable(imgs).to(device), Variable(captions).to(device)
        # -1 because we assume the model has already generated the start token
        max_timespan = max([len(caption) for caption in captions]) - 1
        preds, alphas = model(imgs, max_timespan)

        for cap_set in all_captions.tolist():
            caps = []
            for caption in cap_set:
                cap = [word_idx for word_idx in caption
                       if word_idx != word_dict['<start>'] and word_idx != word_dict['<pad>']]
                caps.append(cap)
            references.append(caps)

        word_idxs = torch.max(preds, dim=2)[1]
        for idxs in word_idxs.tolist():
            hypotheses.append([idx for idx in idxs
                               if idx != word_dict['<start>'] and idx != word_dict['<pad>']])

        if debug:
            break

    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu_3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
    bleu_4 = corpus_bleu(references, hypotheses)

    score = []
    for i in range(len(references)):
        references_i = []
        for j in references[i]:
            words = [idx_dict[k] for k in j]
            references_i.append(' '.join(words))
        hypo_i = [idx_dict[j] for j in hypotheses[i]]
        score.append(meteor_score.meteor_score(references_i, ' '.join(hypo_i)))

    return (bleu_1, bleu_2, bleu_3, bleu_4, np.mean(score))

def calculate_metrics(predict, reference):
    reference_len = len(reference)
    predict_len = len(predict)

    # ------------------- bleu ----------
    bleu_2 = bleu(predict, reference, 2)
    bleu_4 = bleu(predict, reference, 4)
    # ------------------- nist ----------
    nist_2 = nist(predict, reference, 2)
    nist_4 = nist(predict, reference, 4)
    # ------------------- meteor ----------
    predict = " ".join(predict)
    reference = " ".join(reference)
    meteor_scores = meteor_score([reference], predict)
    return bleu_2, bleu_4, nist_2, nist_4, meteor_scores

def get_metrics(pred, target):
    turns = len(target)
    bleu_2 = 0
    bleu_4 = 0
    meteor = 0
    nist_2 = 0
    nist_4 = 0
    for index in range(turns):
        pred_utt = pred[index]
        target_utt = target[index]
        min_len = min(len(pred_utt), len(target_utt))
        lens = min(min_len, 4)
        if lens == 0:
            continue
        if lens >= 4:
            bleu_4_utt = sentence_bleu([target_utt], pred_utt,
                                       weights=(0.25, 0.25, 0.25, 0.25),
                                       smoothing_function=SmoothingFunction().method1)
            nist_4_utt = sentence_nist([target_utt], pred_utt, 4)
        else:
            bleu_4_utt = 0
            nist_4_utt = 0
        if lens >= 2:
            bleu_2_utt = sentence_bleu([target_utt], pred_utt,
                                       weights=(0.5, 0.5),
                                       smoothing_function=SmoothingFunction().method1)
            nist_2_utt = sentence_nist([target_utt], pred_utt, 2)
        else:
            bleu_2_utt = 0
            nist_2_utt = 0
        bleu_2 += bleu_2_utt
        bleu_4 += bleu_4_utt
        meteor += meteor_score([" ".join(target_utt)], " ".join(pred_utt))
        nist_2 += nist_2_utt
        nist_4 += nist_4_utt
    bleu_2 /= turns
    bleu_4 /= turns
    meteor /= turns
    nist_2 /= turns
    nist_4 /= turns
    return bleu_2, bleu_4, meteor, nist_2, nist_4

def main():
    nltk.data.path.append('/data/chuancen/pip_package/nltk_data')
    print(nltk.__version__)
    file_handler = open('../../result/reference_SR_only.txt', 'r')
    ref = file_handler.readlines()
    file_handler = open('../../result/SR_only.txt', 'r')
    hyp = file_handler.readlines()
    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    meteor_sum = 0
    for i in range(min(len(ref), len(hyp))):
        meteor_sum += meteor_score([ref[i]], hyp[i])
    meteor_sum /= min(len(ref), len(hyp))
    print(meteor_sum)

    tokenizer = GPT2Tokenizer.from_pretrained('/data/chuancen/LIT/models/345M_Alex')

def compute_score(self, gts, res):
    assert (gts.keys() == res.keys())
    imgIds = gts.keys()
    scores = []
    for i in imgIds:
        assert (len(res[i]) == 1)
        score = round(meteor_score(gts[i], res[i][0]), 4)
        scores.append(score)
    # print('{}\n'.format(eval_line))
    # self.meteor_p.stdin.write('{}\n'.format(eval_line))
    # print(self.meteor_p.stdout.readline().strip())
    # for i in range(0, len(imgIds)):
    #     scores.append(float(self.meteor_p.stdout.readline().strip()))
    # score = float(self.meteor_p.stdout.readline().strip())
    # self.lock.release()
    return sum(scores) / len(scores), scores

def _calc_metrics_info(self, generate_corpus, reference_corpus):
    generate_corpus = [self._preprocess(generate_sentence)
                       for generate_sentence in generate_corpus]
    reference_corpus = [self._preprocess(reference_sentence)
                        for reference_sentence in reference_corpus]
    reference_corpus = [[reference_sentence] for reference_sentence in reference_corpus]

    result = {}
    scores = []
    for gen, refs in zip(generate_corpus, reference_corpus):
        score = meteor_score(refs, gen)
        scores.append(score)
    result['meteor'] = scores
    return result

def test(encoder, decoder, dataloader):
    score = 0
    n = 0
    for j, batch in tqdm(enumerate(dataloader)):
        input_tensor, target_tensor = batch
        if torch.cuda.is_available():
            target_tensor = target_tensor.cuda()
            input_tensor = input_tensor.cuda()
        a, b = predict(encoder, decoder, input_tensor, target_tensor)
        try:
            score += meteor_score(a, b)
            n += 1
        except Exception:
            pass
        input_text = []
        for i in range(input_tensor.shape[1]):
            input_text.append(num2code[input_tensor[:, i].view(1).item()])
            if input_text[-1] == 'PAD':
                break
        print(' '.join(input_text[1:-1]))
        print(a)
    score /= n
    print('METEOR: {}'.format(score))

def get_rouge_meteor_from_output(s_pred, s_true, reverse_word_map, order, weights):
    r_score_tot = 0
    k_score_tot = 0
    for batch in range(s_pred.shape[0]):
        sentence = ''
        sentence_true = ['']
        for word in range(s_pred.shape[2]):
            encoded_word = reverse_word_map.get(np.argmax(s_pred[batch, :, word]))
            if encoded_word:
                sentence += ' ' + encoded_word
            true_word = reverse_word_map.get(s_true[batch, word])
            if true_word:
                sentence_true[0] += " " + true_word
        r_score = rouge.get_scores(sentence_true[0], sentence)
        # meteor_score(references, hypothesis); the stray positional argument `4`
        # in the original is not part of NLTK's signature, so it is dropped here.
        k_score = meteor_score(sentence_true, sentence)
        r_score = r_score[0]['rouge-1']['f']
        r_score_tot += r_score / s_pred.shape[0]
        k_score_tot += k_score / s_pred.shape[0]
    return r_score_tot, k_score_tot

def test_meteor(self):
    score = meteor_score(self.reference, self.candidate, preprocess=str.lower)
    assert score == 0.9921875
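
# Hedged sketch (not from the original file) of the fixture these tests appear to
# assume: a tokenized candidate and tokenized references whose exact match with the
# first reference yields the asserted METEOR score of 0.9921875.
#   self.reference = [["this", "is", "a", "test"], ["this", "is", "test"]]
#   self.candidate = ["this", "is", "a", "test"]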