def main():
    references = [[]]
    hypotheses = []

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=['SkipThoughtCS', 'GreedyMatchingScore',
                                       'VectorExtremaCosineSimilarity',
                                       'EmbeddingAverageCosineSimilarity'])

    with open('/home/jcardoso/MIMIC/encodedTestCaptionsF.json') as json_file:
        referenceCaptionsDict = json.load(json_file)

    with open('/home/jcardoso/MIMIC/encodedTrainCaptionsF.json') as json_file:
        KBCaptionsDict = json.load(json_file)

    reference_ids = list(referenceCaptionsDict.keys())
    KB_ids = list(KBCaptionsDict.keys())

    for i in tqdm(range(len(referenceCaptionsDict.keys()))):
        references[0].append(unifyCaption(referenceCaptionsDict[reference_ids[i]]))
        hypotheses.append(get_random_report(KB_ids, KBCaptionsDict))

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    print(metrics_dict)

    with open("RandomRefs.txt", 'w+') as file:
        for reference in references[0]:
            file.write(reference.strip() + '\n')

    with open("RandomPreds.txt", 'w+') as file:
        for hypothesis in hypotheses:
            file.write(hypothesis.strip() + '\n')

    with open("random_TestResults.txt", "w+") as file:
        for metric in metrics_dict:
            file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
def run_metrics(self, output, refer_dataset):
    refer = refer_dataset.refer
    hypothesis = []
    references = []
    mp1 = 0.0
    mp2 = 0.0
    mean_objects = 0.0
    total = 0.0
    for row in output:
        ref_id = int(row['refID'])
        gen_sentence = row['gen_sentence']
        hypothesis.append(row['gen_sentence'])
        references.append([s['sent'] for s in refer.Refs[ref_id]['sentences']])
        total += 1.0
        mean_objects += row['n_objects']
        mp1 += row['p@1']
        mp2 += row['p@2']

    references = list(zip(*references))

    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True,
                      metrics_to_omit=['METEOR'])  # loads the models
    metrics_dict = nlgeval.compute_metrics(references, hypothesis)
    metrics_dict['p@1'] = mp1 / total
    metrics_dict['p@2'] = mp2 / total
    return metrics_dict
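# Short sketch (hypothetical sentences) of the reference layout nlg-eval expects:
# ref_list[k][i] is the k-th reference for hypothesis i, which is why the
# per-sample reference lists above are transposed with zip(). Note that zip()
# silently truncates if samples have unequal numbers of references.
from nlgeval import NLGEval
per_sample_refs = [["a cat on a mat", "a cat sits on a mat"],
                   ["a dog runs", "the dog is running"]]
hyps = ["a cat is on the mat", "a dog is running"]
ref_list = list(zip(*per_sample_refs))  # [("a cat on a mat", "a dog runs"), ...]
print(NLGEval(no_skipthoughts=True, no_glove=True).compute_metrics(ref_list, hyps))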
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    hypothesis = []
    score_total = 0.
    num_word_total = 0
    for batch in vali_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src = thenet.translate(
            batch, vali_raw_data)
        score_total += sum([score[0] for score in pred_scores])
        num_word_total += sum(len(x) for x in batch.tgt[1:])
        hypothesis.extend([' '.join(x[0]) for x in pred_batch])
    ppl = math.exp(-score_total / num_word_total)
    bleu_score = bleu.corpus_bleu(
        hypothesis, references)[0][0]  # [final, n-gram1, n-gram2, ...], [bp, ...]
    nlg_ref = [[x[0] for x in references if x is not None]]
    nlg_eval = NLGEval()
    save_txt('/fl/txtfile/rnn_h1.txt', hypothesis)
    metrics_eval = nlg_eval.compute_metrics(nlg_ref, hypothesis)
    print(metrics_eval)
    print('BLEU: {}'.format(bleu_score))
    # Training/validation perplexity is computed in Statisci() in onmt/Trainer.py;
    # translation-time perplexity is computed in reprot_score in translate.py.
    print('PPL: {}'.format(ppl))
    return torch.FloatTensor([ppl, bleu_score, 0.0])  # the last slot is reserved for rank number
class NLGMetrics(BaseMetric):
    def __init__(self, *args, **kwargs):
        self.nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)

    @staticmethod
    def prepare_sent(tokens: List[str]) -> str:
        return recover_desc(tokens)

    def eval(self, hypos: Iterable[List[List[str]]], references: Iterable[List[str]],
             src_references: Iterable[List[str]], *args, **kwargs) -> dict:
        # List[str]
        first_hypos = [self.prepare_sent(hypo_list[0]) for hypo_list in hypos]
        src_ref_strs = [self.prepare_sent(src_ref) for src_ref in src_references]
        # List[List[str]]
        references_lists = [[self.prepare_sent(ref) for ref in references]]
        # distinct
        metrics_dict = self.nlgeval.compute_metrics(references_lists, first_hypos)
        # relative improvement over the source references
        src_metrics_dict = self.nlgeval.compute_metrics(references_lists, src_ref_strs)
        relative_metrics_dict = OrderedDict({})
        for key in metrics_dict:
            relative_metrics_dict[key] = (
                (metrics_dict[key] - src_metrics_dict[key]) / src_metrics_dict[key])
        return {
            'Bleu_4': metrics_dict['Bleu_4'],
            'METEOR': metrics_dict['METEOR']
        }
def get_evalutation_scores(hypothesis, refrences, testing_mode=False):
    gleu_scores = {"Gleu_1": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=1),
                   "Gleu_2": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=2),
                   "Gleu_3": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=3),
                   "Gleu_4": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=4)}
    if testing_mode:
        # Join token lists into sentence strings for nlg-eval
        for i in range(len(hypothesis)):
            hypothesis[i] = ' '.join(hypothesis[i])
        refs = [[]]
        for i in range(len(refrences)):
            refs[0].append(' '.join(refrences[i][0]))
            if refs[0][-1] == "":
                refs[0][-1] = "no"  # avoid empty reference strings
        refrences = refs
        n = NLGEval()
        scores = n.compute_metrics(ref_list=refrences, hyp_list=hypothesis)
    else:
        scores = {"Bleu_1": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1.0]),
                  "Bleu_2": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 2, 1. / 2]),
                  "Bleu_3": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 3, 1. / 3, 1. / 3]),
                  "Bleu_4": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4])}
    for key, val in gleu_scores.items():
        scores[key] = val
    return scores
def get_all_nlgeval_metrics(complex_sentence, simple_sentence):
    if 'NLGEVAL' not in globals():
        global NLGEVAL
        print('Loading NLGEval models...')
        # Set no_skipthoughts / no_glove to False to enable the embedding-based metrics
        NLGEVAL = NLGEval(no_skipthoughts=True, no_glove=True)
        print('Done.')
    return NLGEVAL.compute_individual_metrics([complex_sentence], simple_sentence)
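# Minimal usage sketch with hypothetical sentences; the first call loads the
# models once, and later calls reuse the cached NLGEVAL instance.
scores = get_all_nlgeval_metrics("the cat which was black sat on the mat",
                                 "the black cat sat on the mat")
print(scores["Bleu_4"], scores["ROUGE_L"])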
def __init__(self):
    self.eval = NLGEval(no_skipthoughts=True, no_glove=True,
                        metrics_to_omit=[
                            # spelling below matches the key used by some nlg-eval versions
                            'EmbeddingAverageCosineSimilairty',
                            'VectorExtremaCosineSimilarity',
                            'GreedyMatchingScore'
                        ])
def __init__(self, hparams, glove_comparer):
    super(Metrics_Calculator, self).__init__()
    # Both spellings of the embedding-average key are listed because some
    # nlg-eval versions use the misspelled name.
    self.nlg_eval = NLGEval(metrics_to_omit=['EmbeddingAverageCosineSimilairty',
                                             'EmbeddingAverageCosineSimilarity',
                                             'GreedyMatchingScore',
                                             'SkipThoughtCS',
                                             'VectorExtremaCosineSimilarity'])
    self.list_dict_track = {"data": []}
    self.hparams = hparams
    self.glove_comparer = glove_comparer
def __init__(self, model, data, lr=0.001, to_record=None, patience=3, metric='Bleu_1'):
    self.model = model
    self.data = data
    self.criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    self.optimizer = optim.Adam(model.parameters(), lr=lr)
    self.stats = Stats(to_record)
    self.evaluator = NLGEval(no_skipthoughts=True, no_glove=True)
    self.earlystopper = EarlyStopper(patience=patience, metric=metric)
def evaluate(hypothesis, references, no_skipthoughts=True, no_glove=True,
             metrics_to_omit=['METEOR']):
    nlgeval = NLGEval(no_skipthoughts=no_skipthoughts, no_glove=no_glove,
                      metrics_to_omit=metrics_to_omit)
    return nlgeval.compute_metrics(references, hypothesis)
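# Minimal sketch with hypothetical sentences. Note the argument order of the
# wrapper above (hypotheses first) and that `references` must already follow
# nlg-eval's layout: references[k][i] is the k-th reference for hypothesis i.
hyps = ["a black cat sat on the mat"]
refs = [["the cat sat on the mat"]]
print(evaluate(hyps, refs))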
def __init__(self, config, metric_names=[
    "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr"
]):
    super().__init__(config, metric_names)
    # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
    from nlgeval import NLGEval
    self.nlg = NLGEval()
def eval_using_nlgeval(ref_list, pred_list, multiple):
    if VERBOSE:
        print('Loading the NLG eval model...')
    nlge = NLGEval(metrics_to_omit=['METEOR', 'CIDEr'],
                   no_skipthoughts=True, no_glove=True)
    # nlge = NLGEval(metrics_to_omit=['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'CIDEr', 'ROUGE_L'],
    #                no_skipthoughts=True, no_glove=True)
    if VERBOSE:
        print('\nComputing Scores...')
    return nlge.compute_metrics(ref_list, pred_list, multiple=multiple)
def com_score(self, ref, pre):
    # for gold, hype in zip(ref, pre):
    #     temp = []
    #     temp.append(gold)
    #     metrics_dict = compute_individual_metrics(temp, hype)
    #     break
    r_list = []
    r_list.append(ref)  # wrap the reference list in nlg-eval's expected outer list
    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(r_list, pre)
    return metrics_dict
def meteor(self):
    """Computes METEOR using the NLGEval library.
    Link: https://github.com/Maluuba/nlg-eval"""
    metrics_to_omit = {
        "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "ROUGE_L", "CIDEr"
    }
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True,
                      metrics_to_omit=metrics_to_omit)
    self.metrics.update(
        nlgeval.compute_metrics([self.target], self.hypothesis))
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()

    nlge = NLGEval()

    res = nlge.evaluate([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.evaluate([ref1[1]] + [ref2[1]], hyp[1])
def calculate_rouge(prediction, ground_truth, tokenizer):
    nlgeval = NLGEval()
    references = []
    hypotheses = []
    for x, y in zip(ground_truth, prediction):
        x = tokenizer.decode(x, skip_special_tokens=True)
        y = tokenizer.decode(y, skip_special_tokens=True)
        references.append([x])
        hypotheses.append(y)
    # nlg-eval expects ref_list[k][i] to be the k-th reference for hypothesis i,
    # so the per-sample reference lists are transposed before scoring.
    metrics_dict = nlgeval.compute_metrics(list(zip(*references)), hypotheses)
    return metrics_dict['ROUGE_L'], references, hypotheses
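# Hypothetical usage sketch, assuming a HuggingFace tokenizer; `ground_truth`
# and `prediction` are lists of token-id sequences produced elsewhere.
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("gpt2")
gt_ids = [tok.encode("the cat sat on the mat")]
pred_ids = [tok.encode("a cat is on the mat")]
rouge_l, refs, hyps = calculate_rouge(pred_ids, gt_ids, tok)
print(rouge_l)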
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted as plain strings (nlg-eval scores sentence strings)
        actual.append(desc_list)
        predicted.append(yhat)
    # calculate all metrics; nlg-eval expects ref_list[k][i] to be the k-th reference
    # for hypothesis i, so the per-image reference lists are transposed (this assumes
    # every image has the same number of reference descriptions)
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(list(zip(*actual)), predicted)
def evaluate(loader, lstmDec, linNet, VocabData):
    # dictionary from index to word
    Index2Word = dict([val, key] for key, val in VocabData['word_dict'].items())

    # if torch.cuda.is_available():
    lstmDec = lstmDec.to(device).eval()
    linNet = linNet.to(device).eval()
    # nn.DataParallel(linNet, device_ids=[0, 1]).to(device)
    nlgeval = NLGEval()

    ld = iter(loader)
    numiters = len(ld)
    qdar = tqdm.tqdm(range(numiters), total=numiters, ascii=True)
    loss_itr_list = []

    def linOut2DecIn(global_hidden, box_feat):
        # box_feat [8, 4, 4096, 3, 3]
        global_hidden = global_hidden.unsqueeze(0)
        encoder_hidden = (global_hidden, torch.zeros_like(global_hidden).to(device))
        B, M, D, H, W = box_feat.size()
        encoder_outputs = box_feat.permute(0, 1, 3, 4, 2).contiguous().view(B, -1, D)
        return encoder_hidden, encoder_outputs

    def lstr(ts, pres=3):
        return str(np.round(ts.data.cpu().numpy(), 3))

    with torch.no_grad():  # evaluate mode
        references = [[]]
        hypothesis = []
        for i in qdar:
            # step 1: load data
            batchdata = next(ld)
            box_feats, box_global_feats, numBoxes, box_captions_gt = makeInp(*batchdata)
            # box_feats: (numImage, numBoxes, 512, 7, 7); box_global_feats: list, numImage [(512, 34, 56)]
            # create batch of references based on indices
            references[0] += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>'])
                              for s in box_captions_gt.data.cpu().numpy()]
            # step 2: data transform by linNet
            box_feat, box_feat_dec, global_hidden = linNet(box_feats, box_global_feats)
            # step 3: decode to captions by lstmDec
            encoder_hidden, encoder_outputs = linOut2DecIn(global_hidden, box_feat_dec)
            decoder_outputs, decoder_hidden, ret_dict = lstmDec(encoder_hidden=encoder_hidden,
                                                                encoder_outputs=encoder_outputs,
                                                                max_len=int(5 * numBoxes))
            # box_feat [8, 4, 4096, 3, 3]
            # step 4: calculate loss
            # Loss 1: Similarity loss
            lengths = torch.LongTensor(ret_dict['length']).to(device)
            decoder_outputs = torch.stack([decoder_outputs[i] for i in range(len(decoder_outputs))], 1)
            # decoder_outputs [8, 15, 10878]
            word_indices = decoder_outputs.argmax(2).data.cpu().numpy()  # batch_size x seq_len
            # create batch of hypotheses based on indices
            hypothesis += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>'])
                           for s in word_indices]
            if i == 10:
                break

    print(nlgeval.compute_metrics(references, hypothesis))
def test_compute_metrics_omit(self):
    n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilarity'])

    # Individual Metrics
    scores = n.compute_individual_metrics(ref=["this is a test", "this is also a test"],
                                          hyp="this is a good test")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(7, len(scores))
def evaluateNLG(gen_dials, ref_dialogues):
    hyp_list, ref_list = [], []
    for fname in gen_dials:
        hyp_list.extend(gen_dials[fname])  # list of sentence strings
        # list of references, each a list of sentence strings
        ref_list.extend([s.strip() for s in ref_dialogues[fname]['sys']])
    ref_lists = [ref_list]  # only put 1 reference
    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(ref_list=ref_lists, hyp_list=hyp_list)
    print(metrics_dict)
    return metrics_dict
def NLGE_evaluation(encoder, decoder, search_method, word2ix, ix2word, input_seqs,
                    target_seqs, templates=None):
    """
    Computes several metrics using the NLG-eval python package
    (https://github.com/Maluuba/nlg-eval).

    :param encoder: Pytorch model that serves as encoder.
    :param decoder: Pytorch model that serves as decoder.
    :param search_method: Pytorch model used for making searches during inference (e.g. GreedySearch).
    :param word2ix: Python dictionary with tokens as keys and indexes as values.
    :param ix2word: Python dictionary with indexes as keys and tokens as values.
    :param input_seqs: List containing the vectorized questions used for testing the model.
    :param target_seqs: List containing the vectorized ground-truth answers used for testing the model.
    """
    nlg_eval = NLGEval(metrics_to_omit=[
        'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'SkipThoughtCS'
    ])
    hypothesis = []
    references = []
    if templates:
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True,
                                     strip_accents='ascii')
        template2vec = vectorizer.fit_transform(templates)
    for input_seq, target_seq in tqdm(zip(input_seqs, target_seqs),
                                      total=input_seqs.shape[0]):
        input_seq, input_length, _, _, _ = prepare_data([input_seq], [target_seq])
        tokens = search_method(input_seq, input_length, 300, word2ix['_BOS_'])
        tokens = tokens.view(1, -1)[0] \
            if search_method.__class__.__name__ == "GreedySearchDecoder" else tokens
        answer = ' '.join([ix2word[token] for token in tokens.cpu().numpy()
                           if token != word2ix['_PAD_']])
        if templates:
            template, score = template_retrieval(answer, templates, template2vec, vectorizer)
            if score > 0.75:
                answer = template
        hypothesis.append(answer)
        references.append(' '.join([ix2word[token] for token in target_seq]))
    return nlg_eval.compute_metrics(ref_list=[references], hyp_list=hypothesis)
class EvaluateNL():
    def __init__(self):
        self.eval = NLGEval(no_skipthoughts=True, no_glove=True,
                            metrics_to_omit=[
                                # spelling below matches the key used by some nlg-eval versions
                                'EmbeddingAverageCosineSimilairty',
                                'VectorExtremaCosineSimilarity',
                                'GreedyMatchingScore'
                            ])

    def compute(self, refs, hyps):
        data = []
        for i, ref in enumerate(refs):
            ref = ref.replace('\n', '')
            hyp = hyps[i].replace('\n', '')
            if not ref:
                continue
            scores = self.eval.compute_individual_metrics(ref=[ref], hyp=hyp)
            scores = sorted(scores.items())
            self._metrics = [s[0] for s in scores]
            # data.append([ref, hyp])
            data.append([
                ref, hyp,
                *[str(float('%0.6f' % (s[1]))).replace('.', ',') for s in scores]
            ])
        return pd.DataFrame(data, columns=['Reference', 'Hypothesis', *self._metrics])
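# Minimal usage sketch with hypothetical strings: compute() returns a DataFrame
# with one row per non-empty reference and metric values formatted with a
# decimal comma.
evaluator = EvaluateNL()
df = evaluator.compute(refs=["the cat sat on the mat\n"],
                       hyps=["a black cat sat on the mat\n"])
print(df.head())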
def get_nlg_eval():
    if not hasattr(get_nlg_eval, "nlg_eval"):
        print("Loading eval data (first time only)")
        get_nlg_eval.nlg_eval = NLGEval(no_glove=True, no_skipthoughts=True,
                                        metrics_to_omit=["CIDEr"])
    return get_nlg_eval.nlg_eval
def __init__(self, model_name: str = "gpt2", range_cand: bool = False,
             make_eval: bool = False, tokenizer_path: str = "default",
             pretrained_path: str = "default") -> None:
    """
    Possible models: mt5-large, mt5-base, mt5-small, gpt2, gpt3

    :param model_name:
    :param range_cand:
    :param make_eval:
    :param tokenizer_path:
    :param pretrained_path:
    """
    self.logger = logging.getLogger(__name__)
    self.tokenizer_path = tokenizer_path
    self.pretrained_path = pretrained_path
    self.make_eval = make_eval
    self.range_cand = range_cand
    self.device = torch.device("cpu")
    if self.range_cand:
        self.smodel = SentenceTransformer("paraphrase-xlm-r-multilingual-v1")
    if self.make_eval:
        self.ngeval = NLGEval(metrics_to_omit=[
            # spelling below matches the key used by some nlg-eval versions
            "EmbeddingAverageCosineSimilairty",
            "CIDEr",
            "METEOR",
            "SkipThoughtCS",
            "VectorExtremaCosineSimilarity",
            "GreedyMatchingScore",
        ])
    self.model_name = model_name
    self._check_model(model_name)
class NLGMetric(Metric):
    def __init__(self, config, metric_names=[
        "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr"
    ]):
        super().__init__(config, metric_names)
        # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
        from nlgeval import NLGEval
        self.nlg = NLGEval()

    def compute_metrics(self, outputs, targets, **kwargs):
        return self.nlg.compute_metrics(hyp_list=outputs, ref_list=targets)

    def print_computed_metrics(self, metrics):
        Bleu_1 = metrics["Bleu_1"]
        Bleu_2 = metrics["Bleu_2"]
        Bleu_3 = metrics["Bleu_3"]
        Bleu_4 = metrics["Bleu_4"]
        METEOR = metrics["METEOR"]
        ROUGE_L = metrics["ROUGE_L"]
        CIDEr = metrics["CIDEr"]
        print(
            "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f}"
            " - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}"
            .format(Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr))
def main(args):
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)
    samples = {}
    with open(args.gen_file) as f:
        for line in tqdm(f):
            hypo, refs = line.rstrip().split('\t')
            metrics_dict = nlgeval.compute_individual_metrics(refs.split('*#'), hypo)
            samples[(hypo, refs)] = metrics_dict['Bleu_4']
    for hypo, refs in sorted(samples.keys(), key=samples.__getitem__)[:args.num_samples]:
        print('BLEU:', samples[(hypo, refs)])
        print('H:', hypo)
        for r in refs.split('*#'):
            print('R:', r)
        print('---')
def get_nlgeval():
    try:
        from nlgeval import NLGEval
    except ModuleNotFoundError:
        print('nlg-eval module not installed. Please install with ',
              'pip install nlg-eval@git+https://github.com/Maluuba/nlg-eval.git')
    print('Loading NLGEval models...')
    return NLGEval(no_skipthoughts=True, no_glove=True)
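# Hypothetical usage sketch: the returned evaluator scores a single hypothesis
# against one or more reference strings.
nlge = get_nlgeval()
scores = nlge.compute_individual_metrics(
    ref=["the model converged after ten epochs"],
    hyp="the model converged in ten epochs")
print(scores)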
def __init__(self):
    """
    Loads metrics without extra (slow) models.
    Calculates BLEU-1, BLEU-2, BLEU-3, BLEU-4, ROUGE-L, METEOR, and CIDEr.
    """
    self.nlgeval_metrics = NLGEval(no_overlap=False, no_glove=True,
                                   no_skipthoughts=True)
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
        hyp = [x.strip() for x in hyp]
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
        ref1 = [x.strip() for x in ref1]
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()
        ref2 = [x.strip() for x in ref2]

    nlge = NLGEval()

    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])

    hyp_list = hyp
    ref_list = [ref1, ref2]
    res = nlge.compute_metrics(ref_list, hyp_list)
def __calculate_scores(result_file, ref_file, block_print=True):
    reference_file = json.load(open(ref_file))
    ref_video_keys = sorted(list(reference_file.keys()))
    ref_text_list = sum(
        [reference_file[item]['sentences'] for item in ref_video_keys], [])

    file_data = json.load(open(result_file))
    hyp_text_list = sum(
        [[i['sentence'].lower() for i in file_data['results'][item]]
         for item in ref_video_keys], [])
    hyp_text_list = [
        '<NONE>' if len(item) == 0 else item for item in hyp_text_list
    ]  # for empty generated results

    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)
    # ref_list holds the reference sentences (wrapped as a single reference set);
    # hyp_list holds the generated sentences.
    result = nlgeval.compute_metrics(ref_list=[ref_text_list], hyp_list=hyp_text_list)
    metrics = {'Average across tIoUs': result}
    return metrics