def main() -> None:
    args = parse_arguments(subtype="evaluate_shallow_metrics")

    # get verbosity
    if args.verbosity == 1:
        logger = logging.getLogger('base')
    else:
        logger = logging.getLogger('root')

    # define json glob
    json_glob = args.json_glob

    # define search space
    files = glob(json_glob)

    for input_file in files:
        # log information
        logger.info("Computing bleu and chrf scores: %s", input_file)

        # load single dictionary and compute surface similarity scores
        with open(input_file, "r") as f:
            store = json.load(f)

        for key in tqdm(store.keys()):
            source_orig_de = store[key]["sentence_original"]["source"]
            source_para_de = store[key]["sentence_paraphrase"]["source"]
            target_orig_en = store[key]["sentence_original"]["target"]
            target_para_en = store[key]["sentence_paraphrase"]["target"]

            chrf_bar_source = (sacrebleu.sentence_chrf(source_orig_de, [source_para_de]).score +
                               sacrebleu.sentence_chrf(source_para_de, [source_orig_de]).score) / 2
            chrf_bar_target = (sacrebleu.sentence_chrf(target_orig_en, [target_para_en]).score +
                               sacrebleu.sentence_chrf(target_para_en, [target_orig_en]).score) / 2
            bleu_bar_source = (sacrebleu.sentence_bleu(source_orig_de, [source_para_de]).score +
                               sacrebleu.sentence_bleu(source_para_de, [source_orig_de]).score) / 2
            bleu_bar_target = (sacrebleu.sentence_bleu(target_orig_en, [target_para_en]).score +
                               sacrebleu.sentence_bleu(target_para_en, [target_orig_en]).score) / 2

            store[key]["chrf_bar_source"] = chrf_bar_source
            store[key]["chrf_bar_target"] = chrf_bar_target
            store[key]["bleu_bar_source"] = bleu_bar_source
            store[key]["bleu_bar_target"] = bleu_bar_target
            store[key]["chrf_bar_mean"] = (chrf_bar_source + chrf_bar_target) / 2
            store[key]["bleu_bar_mean"] = (bleu_bar_source + bleu_bar_target) / 2

        # write back json to disk
        with open(input_file, "w") as f:
            json.dump(store, f, ensure_ascii=False)
def validate(val_loader, encoder, decoder, criterion, tok_tgt):
    '''
    Performs one epoch's validation.
    '''
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    references = list()  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)
    avg_loss = 0

    with torch.no_grad():
        # Batches
        for cnt, (srccap, tgtcap, video, audio, caplen_src, caplen_tgt, srcrefs, tgtrefs) in enumerate(val_loader, 1):
            srccap, tgtcap, caplen_src, caplen_tgt = srccap.cuda(), tgtcap.cuda(), caplen_src.cuda(), caplen_tgt.cuda()
            video, audio = video.cuda(), audio.cuda()

            # Forward prop.
            src_out, init_hidden, vid_out = encoder(srccap, video, audio)  # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            scores, pred_lengths = decoder.inference(srccap, tgtcap, init_hidden, src_out, vid_out, args.MAX_INPUT_LENGTH)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = tgtcap[:, 1:]
            scores_copy = scores.clone()

            # Calculate loss
            loss = criterion(scores[:, 1:].contiguous().view(-1, decoder.vocab_size),
                             targets.contiguous().view(-1))

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][1:pred_lengths[j]])  # remove pads and idx-0
            preds = [tok_tgt.decode_sentence(t) for t in temp_preds]
            hypotheses.extend(preds)

            tgtrefs = [list(map(int, i.split())) for i in tgtrefs]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5]]
            for r in tgtrefs:
                references.extend([tok_tgt.decode_sentence(r)])

            assert len(references) == len(hypotheses)
            avg_loss += loss.item()

    # Calculate metrics
    print('pred example:', hypotheses[0])
    print('ref example:', references[0])
    avg_loss = avg_loss / cnt
    corpbleu = sacrebleu.corpus_bleu(hypotheses, [references])
    print(corpbleu)
    sentbleu = 0
    for i, (r, h) in enumerate(zip(references, hypotheses), 1):
        sentbleu += sacrebleu.sentence_bleu(h, [r]).score
    sentbleu /= i

    return avg_loss, sentbleu, corpbleu.score
def get_similarity_by_sacrebleu(text1, text2):
    # pip install sacrebleu
    references = [text1]
    hypothesis = text2
    score = sacrebleu.sentence_bleu(hypothesis, references).score
    return score
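# Hedged usage sketch for get_similarity_by_sacrebleu (the example strings are made up):
# sacrebleu.sentence_bleu takes a hypothesis string and a list of reference strings
# and returns a BLEUScore whose .score lies on a 0-100 scale.
import sacrebleu

if __name__ == "__main__":
    print(get_similarity_by_sacrebleu("the cat sat on the mat",
                                      "the cat sat on the mat"))    # 100.0 for identical strings
    print(get_similarity_by_sacrebleu("the cat sat on the mat",
                                      "a dog barked at the moon"))  # low score for unrelated strings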
def sentence_level_eval(self):
    self.metrics["sentence_bleu"] = sacrebleu.sentence_bleu(
        self.prediction(), [self.reference()]).score
    self.metrics["latency"] = eval_all_latency(self.delays, self.source_length())
    self.metrics["latency_ca"] = eval_all_latency(self.elapsed, self.source_length())
def compare_lines(hyp_lines, ref_lines, return_sorted=False):
    scores = [
        sacrebleu.sentence_bleu(hyp, [ref])
        for hyp, ref in zip(hyp_lines, ref_lines)
    ]
    result = zip(scores, hyp_lines, ref_lines)
    return sorted(result, reverse=True) if return_sorted else result
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('reference', help='Reference translation.')
    parser.add_argument('nbest', help='N-best list to score.')
    parser.add_argument(
        '-val_indices', help='File containing indices of validation sentences.')
    args = parser.parse_args()

    with open(args.reference, 'r') as f:
        refs = [line.rstrip('\n') for line in f]

    if args.val_indices:
        with open(args.val_indices, 'r') as f:
            val_indices = {int(line.rstrip('\n')) for line in f}
    else:
        val_indices = set()

    with open(args.nbest, 'r') as f:
        for line in f:
            if ' ||| ' not in line:
                continue
            fields = line.rstrip('\n').split(' ||| ')
            sntno = int(fields[0])
            score = sacrebleu.sentence_bleu(fields[1], [refs[sntno]],
                                            smooth_method='floor', smooth_value=.1)
            outfile = sys.stderr if sntno in val_indices else sys.stdout
            print(sntno, fields[1], score.score, sep=' ||| ', file=outfile)
def evaluate_example(self, summary, reference):
    # print("BLEU is intended as a corpus-level metric. Be careful!")
    if isinstance(reference, str):
        reference = [reference]
    score = sacrebleu.sentence_bleu(summary, reference,
                                    smooth_method=self.sent_smooth_method,
                                    smooth_value=self.sent_smooth_value,
                                    use_effective_order=self.sent_use_effective_order)
    score_dict = {"bleu": score.score}
    return score_dict
def compute_bleu(hyp: str, ref: str) -> float:
    """
    :param hyp: hypothesis (system output) sentence
    :param ref: single reference sentence
    :return: sentence-level BLEU score on the 0-100 scale
    """
    return sacrebleu.sentence_bleu(hyp, [ref]).score
def sent_bleu(hypotheses, references) -> List[float]:
    scores = []
    for h, r in zip(hypotheses, references):
        scores.append(
            sacrebleu.sentence_bleu(hypothesis=h,
                                    references=r,
                                    smooth_method="add-k",
                                    smooth_value=1).score)
    return scores
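# Hedged usage sketch for sent_bleu (made-up data): each hypothesis is paired with
# its own list of references, and add-k smoothing with k=1 keeps higher-order n-gram
# precisions non-zero even when there are no exact n-gram matches.
import sacrebleu

if __name__ == "__main__":
    hyps = ["the cat sat on the mat", "hello world"]
    refs = [["the cat sat on the mat"],
            ["hi there world", "hello over there"]]
    print(sent_bleu(hyps, refs))  # first entry is 100.0 (exact match); second is lower (partial, smoothed overlap)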
def eval(test_loader, encoder, decoder, cp_file, tok_tgt, result_path):
    '''
    Testing the model.
    '''
    # the best model is the last model saved in our implementation
    epoch = torch.load(cp_file)['epoch']
    logging.info('Use epoch {0} as the best model for testing'.format(epoch))
    encoder.load_state_dict(torch.load(cp_file)['enc_state_dict'])
    decoder.load_state_dict(torch.load(cp_file)['dec_state_dict'])

    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    ids = list()         # sentence ids
    hypotheses = list()  # hypotheses (predictions)
    references = list()

    with torch.no_grad():
        # Batches
        for cnt, (srccap, tgtcap, video, audio, caplen_src, caplen_tgt, srcrefs, tgtrefs) in enumerate(tqdm(test_loader)):
            srccap, tgtcap, caplen_src, caplen_tgt = srccap.cuda(), tgtcap.cuda(), caplen_src.cuda(), caplen_tgt.cuda()
            video, audio = video.cuda(), audio.cuda()

            # Forward prop.
            src_out, init_hidden, vid_out = encoder(srccap, video, audio)  # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            preds, pred_lengths = decoder.beam_decoding(srccap, init_hidden, src_out, vid_out, args.MAX_INPUT_LENGTH, beam_size=5)

            # Hypotheses
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:pred_lengths[j]])  # remove pads and idx-0
            preds = [tok_tgt.decode_sentence(t) for t in temp_preds]
            hypotheses.extend(preds)  # preds = [[1,2,3], ... ]

            tgtrefs = [list(map(int, i.split())) for i in tgtrefs]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5]]
            for r in tgtrefs:
                references.extend([tok_tgt.decode_sentence(r)])

    # Calculate metrics
    print('pred example:', hypotheses[0])
    print('ref example:', references[0])
    corpbleu = sacrebleu.corpus_bleu(hypotheses, [references])
    print(corpbleu)
    sentbleu = 0
    for i, (r, h) in enumerate(zip(references, hypotheses), 1):
        sentbleu += sacrebleu.sentence_bleu(h, [r]).score
    sentbleu /= i
    print('beam5 bleu: ', corpbleu, sentbleu)

    if not os.path.exists(result_path):
        os.makedirs(result_path)

    # Save results
    df = pd.DataFrame([hypotheses, references]).T
    df.columns = ['baseline NMT prediction', 'references']
    df.to_excel(result_path + 'results.xlsx')
def test_sentence_score():
    refs = ['The dog bit the man.', 'The dog had bit the man.']
    sys = 'The dog bit the man.'
    bleu = sacrebleu.sentence_bleu(sys, refs)
    mover = sentence_score(sys, refs)
    print(bleu.score)
    print(mover)
def ds_score(self):
    ds_avg = 0
    for prompt in self.pred:
        y = self.pred[prompt].keys()
        ds = 0
        for y1, y2 in combinations(y, 2):
            ds += 1 - sacrebleu.sentence_bleu(y1, [y2]).score / 100
        ds = ds / (len(y) * (len(y) - 1))
        ds_avg += ds
    return ds_avg / len(self.pred)
def get_char_bleu(src: str, trg: str) -> float:
    """
    Function to calculate character BLEU score
    :param src: string, source
    :param trg: string, target
    :return: float, BLEU score
    """
    src = " ".join(src)
    trg = " ".join(trg)
    return sacrebleu.sentence_bleu(src, [trg]).score
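# Hedged usage sketch for get_char_bleu (made-up words): joining the characters with
# spaces makes each character a token, so BLEU is computed over character n-grams
# rather than word n-grams.
import sacrebleu

if __name__ == "__main__":
    print(get_char_bleu("kitten", "kitten"))   # 100.0: identical character sequences
    print(get_char_bleu("kitten", "sitting"))  # well below 100: partial character n-gram overlap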
def evaluate_example(self, summary, reference):
    if isinstance(reference, str):
        reference = [reference]
    score = sacrebleu.sentence_bleu(
        summary,
        reference,
        smooth_method=self.sent_smooth_method,
        smooth_value=self.sent_smooth_value,
        use_effective_order=self.sent_use_effective_order,
    )
    score_dict = {"bleu": score.score}
    return score_dict
def get_sent_bleu(hyp, ref):
    # hyp: hypothesis, ref: reference
    # Returns:
    #   bleu_score: object with BLEU-related information about sentence-level
    #   BLEU with exponential smoothing between hyp and ref
    hyp_line = hyp.strip()
    ref_line = ref.strip()
    bleu_score = sacrebleu.sentence_bleu(hyp_line, [ref_line], smooth_method='exp')
    return bleu_score
def single_reference_sentence_bleu(reference: str, variant: str, stem: bool = False):
    def stem_sentence(sentence):
        return " ".join([STEMMER.stem(w) for w in word_tokenize(sentence)])

    if stem:
        variant = stem_sentence(variant)
        reference = stem_sentence(reference)

    return sacrebleu.sentence_bleu(variant, [reference]).score
def sentence_bleu(sentence, reference, detokenizer=None):
    """
    Utility function for calculating sentence BLEU.
    Expects sentence and reference as whitespace-joined token strings.
    Reference may be a list of multiple references.
    """
    if not isinstance(reference, list):
        reference = [reference]
    if detokenizer is not None:
        sentence = detokenizer(sentence.split())
        reference = [detokenizer(r.split()) for r in reference]
    return sacrebleu.sentence_bleu(sentence, reference).score
def main():
    args = parse_args()

    with open(args.ref) as infile, open(args.sys) as wefile:
        outputs = []
        for idx, (inline, weline) in enumerate(zip(infile, wefile), start=1):
            bleu = sacrebleu.sentence_bleu(weline, [inline])
            print(bleu)
            outputs.append((idx, bleu.score, inline.strip(), weline.strip()))

    outputs.sort(key=lambda x: -x[1])

    with open(args.out, 'w') as outfile:
        print('line_no\tbleu\tref\tsys', file=outfile)
        for idx, bleu, inline, weline in outputs:
            print('{}\t{}\t{}\t{}'.format(idx, bleu, inline, weline), file=outfile)
def score_multi_all(
    self,
    summaries_list: List[List[SummaryType]],
    references_list: List[List[ReferenceType]],
    **kwargs,
) -> List[List[MetricsDict]]:
    scores_list = []
    for summaries, references in zip(summaries_list, references_list):
        references = [flatten(reference) for reference in references]
        scores_list.append([])
        for summary in summaries:
            summary = flatten(summary)
            score = sentence_bleu(summary, references, **self.kwargs)
            scores_list[-1].append(MetricsDict({'sent-bleu': score.score}))
    return scores_list
def get_bleu_scores(trg_tensor, pred_tensor, TGT):
    bleus_per_sentence = torch.zeros(trg_tensor.shape[1], requires_grad=False)
    for col in range(trg_tensor.shape[1]):  # each column contains a sentence
        true_sentence = [
            TGT.vocab.itos[i] for i in trg_tensor[:, col]
            if TGT.vocab.itos[i] != settings.BLANK_WORD
        ][1:-1]
        pred_sentence = [
            TGT.vocab.itos[i] for i in pred_tensor[:, col]
            if TGT.vocab.itos[i] != settings.BLANK_WORD
        ]
        # print('Before: ')
        # print(true_sentence)
        # print(pred_sentence)

        # stop pred_sentence after the first EOS_WORD output; also don't want to use BOS chars
        ind_first_eos = 0
        for tok in pred_sentence:
            if tok == settings.EOS_WORD:
                break
            ind_first_eos += 1
        if ind_first_eos != 0:
            pred_sentence = pred_sentence[1:ind_first_eos]  # drop the BOS token and everything from the first EOS onward

        # now undo some of the weird tokenization
        pred_sentence = fix_sentence(pred_sentence, as_str=True)
        true_sentence = fix_sentence(true_sentence, as_str=True)

        # nltk's bleu_score defaults to BLEU-4, so the commented-out call below changed the
        # weights to get 1-gram ("normal") BLEU; sacrebleu's sentence_bleu is used instead
        # score = nltk.translate.bleu_score.sentence_bleu([true_sentence], pred_sentence, weights=(1, 0, 0, 0))
        score = sacrebleu.sentence_bleu(pred_sentence, [true_sentence], smooth_method='exp').score
        # score = score*len(true_sentence)
        # print(true_sentence)
        # print(pred_sentence)
        # print(score)
        bleus_per_sentence[col] = score / 100.0
    return bleus_per_sentence
def calc_bleu_score_document(references, MT):
    """
    Calculates the bleu score using the sacrebleu module.
    In this method, all complete segments in MT form the sys document
    and all Ts sentences form the ref document.
    :param references: a list of references
    :param MT: a list of MT sentences
    :return sacre_blue_score: the bleu score
    """
    merge_mt_sentences = []
    for i in range(len(MT)):
        mt = MT[i][-1][3:-1]
        merge_mt_sentences += mt

    merge_references_sentences = []
    for ref in references:
        l = []
        for sentence in ref:
            l.append(' '.join(sentence[:-1]))
        merge_references_sentences.append(l)

    refs = [' '.join(i) for i in merge_references_sentences]
    sys = ' '.join(merge_mt_sentences[:])

    b_sacre = sacrebleu.sentence_bleu(sys, refs)
    sacre_blue_score = b_sacre.score

    return sacre_blue_score
# Open the test dataset human translation file and detokenize the references
refs = []
with open(target_test) as test:
    for line in test:
        line = line.strip().split()
        line = md.detokenize(line)
        refs.append(line)

print("Reference 1st sentence:", refs[0])

# Open the translation file by the NMT model and detokenize the predictions
preds = []
with open(target_pred) as pred:
    for line in pred:
        line = line.strip().split()
        line = md.detokenize(line)
        preds.append(line)

# Calculate BLEU sentence by sentence and save the result to a file
with open("bleu-" + target_pred + ".txt", "w+") as output:
    for line in zip(refs, preds):
        test = line[0]
        pred = line[1]
        print(test, "\t--->\t", pred)
        bleu = sacrebleu.sentence_bleu(pred, [test], smooth_method='exp')
        print(bleu.score, "\n")
        output.write(str(bleu.score) + "\n")
def sentbleu(y_hat, y):
    return sentence_bleu(y_hat, y, smooth_method="add-n", smooth_value=1.0)
def score_sentence(self, hyp, ref, lang=None):
    return sacrebleu.sentence_bleu(hyp, ref, smooth_value=0.01).score / 100
def get_bleu(self, source, reference, beam=5):
    predicted = self.model.translate(source, beam)
    return sacrebleu.sentence_bleu(predicted, reference).score, predicted
def compute_sacrebleu(references, translation):
    hypo = ' '.join(translation)
    refs = [' '.join(r) for r in references]
    return sacrebleu.sentence_bleu(hypo, refs).score
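# Hedged usage sketch for compute_sacrebleu (made-up tokens): the translation and each
# reference arrive as token lists and are joined with spaces, since sacrebleu expects
# plain strings; with multiple references, an exact match to any one of them yields 100.0.
import sacrebleu

if __name__ == "__main__":
    translation = ["the", "cat", "sat", "on", "the", "mat"]
    references = [["the", "cat", "sat", "on", "the", "mat"],
                  ["a", "cat", "was", "sitting", "on", "the", "mat"]]
    print(compute_sacrebleu(references, translation))  # 100.0: the first reference matches exactly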
def bleu(self):
    """ BLEU of the hypothesis """
    return sacrebleu.sentence_bleu(
        self.new_hyp.replace(' ', '').replace('</s>', '').replace('▁', ' '),
        [self.cur_ref]).score
def calc_bleu_score_sentence_by_time(Ts, MT, time_step):
    """
    Calculates the bleu score using the sacrebleu module with the time-slice strategy.
    :param Ts: a list of T tables
    :param MT: a list of MT sentences
    :param time_step: size of time-step
    :return blue_scores: a list of time-step scores
    :return avg_SacreBleu: the average bleu score over the time-step scores
    """
    tail_number = float(MT[-1][-1][2])

    start = 0
    end = time_step
    mt_sentences = list()
    while start <= float(tail_number):
        l = []
        for i in range(len(MT)):
            estimat_word_times = build_A_Time_Based_quality(MT[i])
            for k, v in estimat_word_times.items():
                if v >= start and v <= end:
                    l.append(k)
        mt_sentences.append(l)
        start += time_step
        end += time_step

    references_sentences = list()
    start = 0
    end = time_step
    while start <= float(tail_number):
        l = []
        for i in range(len(Ts)):
            s = []
            for sentence in Ts[i]:
                for k, v in sentence.items():
                    if v >= start and v <= end:
                        s.append(k)
            l.append(s)
        references_sentences.append(l)
        start += time_step
        end += time_step

    start = 0
    end = time_step
    sacreBLEU_list = []
    blue_scores = []
    for t in range(len(mt_sentences)):
        try:
            sys = ' '.join(mt_sentences[t])
            refs = [' '.join(ref) for ref in references_sentences[t]]
            b_sacre = sacrebleu.sentence_bleu(sys, refs)
            sacre_blue_score = b_sacre.score
            text1 = ('detailed sacreBLEU span-' + format(start, '06') + '-' +
                     format(end, '06') + ' ' + str("{0:.3f}".format(sacre_blue_score)))
            sacreBLEU_list.append(sacre_blue_score)
            start += time_step
            end += time_step
            blue_scores.append(text1)
        except:
            pass

    avg_SacreBleu = "avg sacreBLEU span* " + str(
        "{0:.3f}".format(round((sum(sacreBLEU_list) / len(sacreBLEU_list)), 3)))

    return blue_scores, avg_SacreBleu
def score_sentence(self, hyp, ref, lang=None):
    return sacrebleu.sentence_bleu(hyp, ref).score / 100
def get_Bleu_for_beam(key, Src_tokens, Src_text, Tgt_tokens, Tgt_text, model, plot_path, args):
    import sacrebleu
    from sacrebleu import sentence_bleu
    SMOOTH_VALUE_DEFAULT = 1e-8
    # -----------------------------------
    """
    If you see the best hypothesis having worse WER than the remaining beam,
    tweak the beam hyperparameters Am_wt, len_pen, gamma.
    If you see the best hypothesis having better performance than the others
    in the beam, then improve the model training.
    """
    # -----------------------------------
    #### get the model predictions
    Output_seq = model.predict(Src_tokens, args)
    # Output_seq = model.predict(input, args.LM_model, args.Am_weight, args.beam, args.gamma, args.len_pen)

    ### get the true label if it exists
    True_label = Tgt_text
    # -----------------------------------
    llr = [item.get('score').unsqueeze(0) for item in Output_seq]
    norm_llr = torch.nn.functional.softmax(torch.cat(llr, dim=0), dim=0)

    print("final_ouputs", '====', 'key', 'Text_seq', 'LLR', 'Beam_norm_llr', 'Yseq', 'CER')
    print("True_label", True_label)
    # -----------------------------------
    for ind, seq in enumerate(Output_seq):
        Text_seq = seq['Text_seq']
        if len(Text_seq) > 1:
            Text_seq = Text_seq[0]
            Text_seq_formatted = [x for x in Text_seq.split(' ') if x.strip()]
            Text_seq_formatted = " ".join(Text_seq_formatted)
        else:
            Text_seq_formatted = Text_seq[0]

        Yseq = seq['yseq'].data.numpy()
        Ynorm_llr = norm_llr[ind].data.numpy()
        Yllr = seq['score'].data.data.numpy()
        # ---------------------------------------------
        attention_record = seq.get('alpha_i_list', 'None')
        if torch.is_tensor(attention_record):
            attention_record = attention_record[:, :, 0].transpose(0, 1)
            attention_record = attention_record.data.cpu().numpy()
            if args.plot_decoding_pics:
                pname = str(key) + '_beam_' + str(ind)
                plotting_name = join(plot_path, pname)
                plotting(plotting_name, attention_record)
        # ---------------------------------------------
        if True_label:
            if Text_seq_formatted.strip():
                CER = compute_cer(Text_seq_formatted, True_label, 'doesnot_matter') * 100
            else:
                CER = 100
            # breakpoint()
            hyp_value = Text_seq_formatted
            ref_value = True_label
            Bleu_score = sentence_bleu(hyp_value, [ref_value],
                                       smooth_value=SMOOTH_VALUE_DEFAULT,
                                       smooth_method='exp',
                                       use_effective_order=True)
            Bleu_score = Bleu_score.score
        else:
            CER = None
            Bleu_score = None
        # ---------------------------------------------
        if ind == 0:
            print("nbest_output", '=', key, '=', Text_seq_formatted, '=', True_label, '=', CER, '=', Bleu_score)
        print("final_ouputs", '=', ind, '=', key, '=', Text_seq_formatted, '=', Yllr, '=', Ynorm_llr, '=', Yseq, '=', CER, '=', Bleu_score)