def test_offset(hypothesis, reference, expected_with_offset, expected_without_offset):
    score_without_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference, 0.0).score / 100
    assert abs(expected_without_offset - score_without_offset) < EPSILON

    score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference, 0.1).score / 100
    assert abs(expected_with_offset - score_with_offset) < EPSILON
def test_offset(hypothesis, reference, expected_with_offset, expected_without_offset):
    score_without_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference, 0.0).score / 100
    assert abs(expected_without_offset - score_without_offset) < EPSILON

    # let it use BLEU's internal default of 0.1 by passing `None`
    score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference, None).score / 100
    assert abs(expected_with_offset - score_with_offset) < EPSILON

    # let it use BLEU's internal default of 0.1
    score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference).score / 100
    assert abs(expected_with_offset - score_with_offset) < EPSILON
def eval_val(file_name, model, valid_iter, targ_field, datasets):
    references = [" ".join(example.trg) for example in datasets["val"]]

    hypotheses = []
    alphas = []  # save the last attention scores
    for batch in valid_iter:
        batch = rebatch(PAD_INDEX, batch)
        pred, attention = beam_decode(
            model, batch.src, batch.src_mask, batch.src_lengths,
            sos_index=targ_field.vocab.stoi[SOS_TOKEN],
            eos_index=targ_field.vocab.stoi[EOS_TOKEN])
        hypotheses.append(pred)
        alphas.append(attention)

    hypotheses = [lookup_words(x, TRG.vocab) for x in hypotheses]
    hypotheses = [" ".join(x) for x in hypotheses]

    with open(file_name, "w") as file:
        for line in hypotheses:
            file.write(line + "\n")

    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
    print(bleu)
def get_bleu_metrics(self, dev_samples, model: Model):
    reached_eof = False
    current_index = 0
    all_hypothesis_sentences = []
    all_reference_sentences = []
    while not reached_eof:
        batch_x, batch_y, batch_z, batch_tt, actual_batch_size, reached_eof = read_batch_from_samples(
            dev_samples,
            self.batch_size,
            self.token_per_batch,
            current_index,
            model.config.data_config.input_features,
            model.config.data_config.output_features,
            model.config.data_config.output_translations,
            model.config.data_config.output_translation_features,
            model.config.data_config.input_clear_text,
            model.config.data_config.output_translation_clear_text)
        if actual_batch_size == 0:
            break
        reference = unpad_turn_to_text_and_remove_bpe_of_batch_t(
            batch_tt[0][0],
            model.config.data_config.output_translation_vocabularies[0][0])
        for sentence in reference:
            all_reference_sentences.append(sentence)
        output = model.predict_translation_on_batch(batch_x)
        output = unpad_turn_to_text_and_remove_bpe_of_batch_t(
            output,
            model.config.data_config.output_translation_vocabularies[0][0])
        for sentence in output:
            all_hypothesis_sentences.append(sentence)
        current_index += actual_batch_size
        if reached_eof is True:
            break
    bleu = sacrebleu.raw_corpus_bleu(sys_stream=all_hypothesis_sentences,
                                     ref_streams=[all_reference_sentences])
    return bleu.score
def compute_metrics(hyp_dec_all, ref_dec_all, use_sacrebleu=True, use_torchtext=True, use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100

    return metrics
def bleu_eval(ref_trans, new_trans, raw_trans=True):
    # returns a BLEU score
    # input: lists of strings, must be of the same length!
    if raw_trans:
        return sacrebleu.raw_corpus_bleu(new_trans, [ref_trans]).score
    else:
        return sacrebleu.corpus_bleu(new_trans, [ref_trans]).score
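# Usage sketch (not from the original source): assumes the `bleu_eval` helper
# above is in scope and sacrebleu is installed. The two lists are aligned
# sentence-by-sentence, and the returned score is on the 0-100 scale.
import sacrebleu

refs = ["the cat is on the mat", "hello world"]
hyps = ["the cat sat on the mat", "hello world"]
print(bleu_eval(refs, hyps))                    # raw (untokenized) corpus BLEU
print(bleu_eval(refs, hyps, raw_trans=False))   # tokenized corpus BLEU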
def score(predicted, predicted_imgs, gold, gold_imgs):
    scores = dict()

    # BLEU
    bleu = sacrebleu.raw_corpus_bleu(predicted, [gold]).score
    scores['bleu'] = bleu

    # todo: String edit distance
    # todo: Image difference

    return scores
def calculate_bleu_score(hyps: list, refs: list) -> float:
    """
    Calculates the BLEU score.
    """
    assert len(refs) == len(hyps), \
        "number of hypothesis and reference sentences must be the same"
    bleu = raw_corpus_bleu(hyps, [refs])
    return bleu.score
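# Usage sketch (illustrative, not part of the snippet): assumes
# `calculate_bleu_score` above is in scope and that `raw_corpus_bleu` was
# imported beforehand, e.g. `from sacrebleu import raw_corpus_bleu`.
# The assert enforces that hypotheses and references are aligned one-to-one.
hyps = ["the dog barked at the mailman"]
refs = ["the dog barked at the postman"]
print(calculate_bleu_score(hyps, refs))  # corpus-level BLEU on the 0-100 scale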
def calculate_bleu(predictions, labels):
    """
    Only pass a list of strings.
    """
    # this only uses n_gram = 4
    bleu = sacrebleu.raw_corpus_bleu(predictions, [labels], .01).score
    return bleu
def bleu(hypotheses, references):
    """
    Raw corpus BLEU from sacrebleu (without tokenization).

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: corpus BLEU score (float)
    """
    return sacrebleu.raw_corpus_bleu(sys_stream=hypotheses,
                                     ref_streams=[references]).score
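# Usage sketch (not from the original source): assumes the `bleu` wrapper above
# is in scope. Since tokenization is disabled, hypotheses and references should
# already share a consistent tokenization.
hypotheses = ["the quick brown fox jumps", "good morning"]
references = ["the quick brown fox jumped", "good morning"]
print(bleu(hypotheses, references))  # 0-100 scale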
def raw_corpus_bleu(hypotheses: Iterable[str],
                    references: Iterable[str],
                    offset: Optional[float] = 0.01) -> float:
    """
    Simple wrapper around sacreBLEU's BLEU without tokenization, using floor
    smoothing with the given offset.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :param offset: Smoothing constant.
    :return: BLEU score as float between 0 and 1.
    """
    return sacrebleu.raw_corpus_bleu(hypotheses, [references],
                                     smooth_floor=offset).score / 100.0
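# Usage sketch (illustrative only): assumes the wrapper above is in scope and a
# sacrebleu version that still accepts the `smooth_floor` keyword. Unlike most
# of the other snippets here, this wrapper divides by 100 and returns a value
# between 0 and 1.
hyps = ["a small example sentence"]
refs = ["a small example sentence"]
score = raw_corpus_bleu(hyps, refs)  # default smoothing offset of 0.01
assert 0.0 <= score <= 1.0
print(score)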
def score_results(self, results):
    if self.tgt_dataset is None:
        return []
    ref_stream = (line.replace(self.bpe_symbol, '') for line in self.tgt_dataset)
    sys_stream = (line.replace(self.bpe_symbol, '')
                  for line in results[::len(results) // len(self.dataset)])
    bleu = sacrebleu.raw_corpus_bleu(sys_stream, [ref_stream])
    return ["{:.2f} BLEU".format(bleu.score)]
def evaluate_bleu(predictions, labels):
    bleu_sacre = None  # stays None if scoring fails
    try:
        bleu_sacre = sacrebleu.raw_corpus_bleu(predictions, [labels], .01).score
    except (KeyboardInterrupt, SystemExit):
        raise
    except BaseException as e:
        print("\nWARNING: Could not compute BLEU-score. Error:", str(e))
    return bleu_sacre
def compute_bleu(hypotheses, references, epoch, config, direction, kl=None):
    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references]).score
    scores = '{}/{}/bleu-scores.txt'.format(config["out_dir"], config["session"])
    with open(scores, 'a') as f_score:
        sentence = "Epoch: {}, Bleu {}".format(epoch, bleu)
        if kl is not None:
            sentence += ", KL: {}".format(kl)
        sentence += ", Direction {}\n".format(direction)
        f_score.write(sentence)
    return bleu
def compute_bleu(hypotheses, references, subword_token=None):
    """
    Computes sacrebleu for a single set of references.
    """
    # Remove any subword tokens such as "@@".
    if subword_token is not None:
        references = remove_subword_tokens(references, subword_token)
        hypotheses = remove_subword_tokens(hypotheses, subword_token)

    # Compute the BLEU score.
    return sacrebleu.raw_corpus_bleu(hypotheses, [references]).score
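# Usage sketch (assumption: `remove_subword_tokens` is this project's helper
# for stripping BPE markers; it is not shown in the snippet above). With
# subword_token="@@", BPE-segmented output such as "develop@@ ment" is merged
# back to "development" on both sides before BLEU is computed.
hyps = ["the develop@@ ment was fast"]
refs = ["the development was fast"]
print(compute_bleu(hyps, refs, subword_token="@@"))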
def bleu(itos, translation_output, reference):
    '''
    Args:
        itos: a list mapping indices to strings (e.g. trg.vocab.itos).
        translation_output: 2D tensor of translation output, shape N x B.
        reference: 1D list of reference sentences (words, not indices);
            len(reference) = B.
    '''
    EN_ind2word = np.array(itos)
    detok_translation = detok(translation_output, EN_ind2word)
    bleu_score = sacrebleu.raw_corpus_bleu(detok_translation, [reference], .01).score
    return bleu_score
def compute(self, labels: Sequence[Text], preds: Sequence[Text],
            label_spec: types.TextSegment, pred_spec: types.GeneratedText,
            config: Optional[JsonDict] = None) -> Dict[Text, float]:
    """Compute metric(s) between labels and predictions."""
    del label_spec
    del pred_spec
    del config

    if not labels or not preds:
        return {}

    bleu = sacrebleu.raw_corpus_bleu(preds, [labels])
    return {'corpus_bleu': bleu.score}
def __call__(self, output, target):
    """Computes the BLEU score of a translation task

    Args:
        output (:obj:`torch.Tensor`): Translated output (not tokenized)
        target (:obj:`torch.Tensor`): Target labels

    Returns:
        loss (:obj:`torch.Tensor`): BLEU score
    """
    if self.use_raw:
        bleu_score = sacrebleu.raw_corpus_bleu(output, [target]).score
    else:
        bleu_score = sacrebleu.corpus_bleu(
            output, [target], tokenize="intl", lowercase=True
        ).score

    return torch.tensor([bleu_score])
def evaluate(self, iterator):
    """
    Evaluation loop for the model
    :param iterator: PyTorch DataIterator instance
    :return: average epoch loss and average BLEU score
    """
    # disable training of model layers
    self.model.eval()

    epoch_loss = 0
    accuracy = 0

    # don't update model parameters
    with torch.no_grad():
        for i, batch in tqdm(enumerate(iterator), total=len(iterator),
                             desc='evaluation loop'):
            # get source and target data
            src, src_lengths = batch.src
            trg, trg_lengths = batch.trg

            output, _ = self.model(src, trg, src_lengths, trg_lengths)
            decoded_output = self.model.decoder.decode_mechanism(output)

            # reshape same as train loop
            y_pred = output[:, 1:].contiguous().view(-1, output.size(-1))
            y = trg[:, 1:].contiguous().view(-1)

            # compute loss
            loss = self.criterion(y_pred, y)
            epoch_loss += loss.item()

            # using BLEU score for machine translation tasks
            accuracy += sacrebleu.raw_corpus_bleu(
                sys_stream=self.lookup_words(decoded_output),
                ref_streams=[self.lookup_words(trg)]).score

    # return the average loss and average BLEU
    return epoch_loss / len(iterator), accuracy / len(iterator)
def compute(self, labels: Sequence[Text],
            preds: Sequence[Union[Text, types.ScoredTextCandidates]],
            label_spec: types.TextSegment,
            pred_spec: Union[types.GeneratedText, types.GeneratedTextCandidates],
            config: Optional[JsonDict] = None) -> Dict[Text, float]:
    """Compute metric(s) between labels and predictions."""
    del label_spec
    del config

    if not labels or not preds:
        return {}

    name_suffix = ''
    if isinstance(pred_spec, types.GeneratedTextCandidates):
        preds = [types.GeneratedTextCandidates.top_text(v) for v in preds]
        name_suffix = '@1'

    bleu = sacrebleu.raw_corpus_bleu(preds, [labels], BLEU_SMOOTHING_VAL)
    return {'corpus_bleu' + name_suffix: bleu.score}
def run_test(model, test_data, test_iter):
    preds = []
    for k, batch in enumerate(test_iter):
        model.eval()
        with torch.no_grad():
            batch = rebatch(PAD_INDEX, batch)
            pred, attn = greedy_decode(
                model, batch.src, batch.src_mask, batch.src_lengths,
                max_len=25,
                sos_index=TRG.vocab.stoi[SOS_TOKEN],
                eos_index=TRG.vocab.stoi[EOS_TOKEN])
            preds.append(pred)

    hypotheses = [lookup_words(pred, TRG.vocab) for pred in preds]
    hypotheses = [" ".join(h) for h in hypotheses]
    references = [" ".join(data.trg) for data in test_data]

    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
    print("BLEU score: ", bleu)
    return bleu
        print_loss_total += loss.item() / target_sentences.size()[1]

        if i > 0 and i % print_every == 0:
            step_record.append(i + epoch * total_step)
            print('%s (%d %d%%) %.4f' % (timeSince(start, (i + epoch * total_step) / (total_step * EPOCH)),
                                         i + epoch * total_step,
                                         ((i + epoch * total_step) / (total_step * EPOCH)) * 100,
                                         print_loss_total / print_every))
            train_loss_record.append(
                [i + epoch * total_step, print_loss_total / print_every])
            print_loss_total = 0

        if i > 0 and i % val_every == 0:
            val_loss = dataset_loss(encoder, decoder, val_loader)
            predict_sentences = evaluate(val_loader, encoder, decoder, output_lang)
            val_Bleu = raw_corpus_bleu(predict_sentences, [dev_en]).score
            print('step: {}, valNLLLoss: {}, valBleuScore: {}'.format(
                i + epoch * total_step, val_loss, val_Bleu))
            val_loss_record.append(
                [i + epoch * total_step, val_loss, val_Bleu])
            if epoch > 0 and val_Bleu > max_Bleu:
                torch.save(encoder.state_dict(), res_dataDir_cn +
                           "//zh_tmp_encoder_1210_simple_t2_" + str(EMB_SIZE) + '_' + str(HIDDEN_SIZE) + '_' + str(EPOCH) + ".pth")
                torch.save(decoder.state_dict(), res_dataDir_cn +
                           "//zh_tmp_attn_decoder_1210_simple_t2_" + str(EMB_SIZE) + '_' + str(HIDDEN_SIZE) + '_' + str(EPOCH) + ".pth")
            max_Bleu = max(max_Bleu, val_Bleu)
            print('Cache memo after Eva:{}'.format(torch.cuda.memory_allocated()))

            # Early Termination
            if epoch > 0 and val_loss_record[-1][2] < max_Bleu - 3 and \
                    val_loss_record[-2][2] < max_Bleu - 2 and \
def test_statistics(hypothesis, reference, expected_stat):
    result = sacrebleu.raw_corpus_bleu(hypothesis, reference, .01)
    stat = Statistics(result.counts, result.totals)
    assert stat == expected_stat
def test_effective_order(hypotheses, references, expected_bleu):
    bleu = sacrebleu.raw_corpus_bleu(hypotheses, references, .01).score / 100
    assert abs(bleu - expected_bleu) < EPSILON
train_loss = []
for epoch in range(30):
    total_loss = 0
    for idx, batch_data in enumerate(train_iter):
        input, in_lens, output, out_lens = batch_data
        loss = train_attn(input, output, in_lens, out_lens,
                          encoder, decoder, encoder_optim, decoder_optim, 0.1)
        total_loss += loss
        if idx % 800 == 0:
            print('Training Loss: {}'.format(loss))
    train_loss.append((total_loss / (idx + 1)))

    translated_corp = []
    output_corp = []
    for idx, batch_data in enumerate(valid_iter):
        input, in_lens, output, out_lens = batch_data
        Beam_evaluate(encoder, decoder, batch_data)

    BLEU_score_epo = sacrebleu.raw_corpus_bleu(translated_corp, [output_corp]).score
    print('Validation Score After Epoch: {}'.format(BLEU_score_epo))

    try:
        if BLEU_score_epo > max(Bleu_score):
            torch.save(encoder.state_dict(), 'encoder_vi.pth')
            torch.save(decoder.state_dict(), 'decoder_vi.pth')
    except:
        torch.save(encoder.state_dict(), 'encoder_vi.pth')
        torch.save(decoder.state_dict(), 'decoder_vi.pth')

    Bleu_score.append(BLEU_score_epo)

print(train_loss)
print(Bleu_score)
def evaluate(self,
             sentences_file="eval_sentences.txt",
             distance_file="distances.txt",
             mutual_avg_file="mutual_distances.txt",
             mutual_avg_file_A="mutual_distances_A.txt",
             mutual_avg_file_B="mutual_distances_B.txt",
             top_k_file="top_k.txt",
             sacre_file="sacre_bleu.tsv"):
    #logging.info("\n\nEvaluating...")
    self.netG_AB.module.eval()
    self.netG_BA.module.eval()
    self.netD_AB.module.eval()
    self.netD_BA.module.eval()

    with torch.no_grad():
        self.forward()  # calculate loss functions, get gradients, update network weights
    gc.collect()

    with open(sentences_file, "a") as sentences_file:
        for j in range(len(self.real_A)):
            str1 = " A->B->A : " + self.real_A[j] + " -> " + self.fake_B[j] + " -> " + self.rec_A[j]
            str2 = " B->A->B : " + self.real_B[j] + " -> " + self.fake_A[j] + " -> " + self.rec_B[j]
            #logging.info(str1)
            #logging.info(str2)
            sentences_file.write('%s\n' % str1)    # save the message
            sentences_file.write('%s\n\n' % str2)  # save the message

    distances = sklearn.metrics.pairwise_distances(
        self.fake_A_embeddings.cpu().detach().numpy(),
        self.fake_B_embeddings.cpu().detach().numpy(),
        metric='cosine', n_jobs=-1)
    triang = np.triu(distances)
    np.fill_diagonal(triang, 0)
    mutual_distances = np.sum(triang) / np.count_nonzero(triang)
    with open(mutual_avg_file, "a") as mutual_avg_f:
        mutual_avg_f.write(str(mutual_distances) + '\n')
    with open(distance_file, "a") as distances_file:
        for i in range(len(distances)):
            distances_file.write(str(distances[i][i]) + '\n')

    distances_fake_A = sklearn.metrics.pairwise_distances(
        self.fake_A_embeddings.cpu().detach().numpy(),
        self.fake_A_embeddings.cpu().detach().numpy(),
        metric='cosine', n_jobs=-1)
    triang = np.triu(distances_fake_A)
    np.fill_diagonal(triang, 0)
    mutual_distances = np.sum(triang) / np.count_nonzero(triang)
    with open(mutual_avg_file_A, "a") as mutual_avg_f:
        mutual_avg_f.write(str(mutual_distances) + '\n')

    distances_fake_B = sklearn.metrics.pairwise_distances(
        self.fake_B_embeddings.cpu().detach().numpy(),
        self.fake_B_embeddings.cpu().detach().numpy(),
        metric='cosine', n_jobs=-1)
    triang = np.triu(distances_fake_B)
    np.fill_diagonal(triang, 0)
    mutual_distances = np.sum(triang) / np.count_nonzero(triang)
    with open(mutual_avg_file_B, "a") as mutual_avg_f:
        mutual_avg_f.write(str(mutual_distances) + '\n')

    #with open(distance_file, "a") as distances_file:
    #    distances_file.write(distances + "\n")

    dim = len(distances)
    top_k = np.zeros(dim, dtype=float)
    for i in range(dim):
        lower = 0
        for j in range(len(distances[i])):
            if i != j and distances[i][i] > distances[i][j]:
                lower += 1
        top_k[lower] += 1

    with open(top_k_file, "a") as top_file:
        tot = 0
        for i in range(dim):
            top_k[i] = top_k[i] / dim * 100
            tot += top_k[i]
            top_file.write('Top ' + str(i + 1) + ': ' + str(tot) + '%\n')

    # For each sentence, record how far away the real embedding is (how many
    # others are closer) and build a ranking to see how many have the real
    # embedding in the top 1, top 2, and so on (cumulative).
    # Save the info to a file, once per epoch.
    bleu_fake_A = sacrebleu.raw_corpus_bleu(self.fake_A, [self.real_A]).score
    bleu_rec_A = sacrebleu.raw_corpus_bleu(self.rec_A, [self.real_A]).score
    bleu_fake_B = sacrebleu.raw_corpus_bleu(self.fake_B, [self.real_B]).score
    bleu_rec_B = sacrebleu.raw_corpus_bleu(self.rec_B, [self.real_B]).score
    with open(sacre_file, "a") as sacre_file:
        for i in range(dim):
            sacre_file.write(str(bleu_fake_A) + '\t' + str(bleu_rec_A) + '\t' +
                             str(bleu_fake_B) + '\t' + str(bleu_rec_B) + '\n')

    self.netG_AB.module.train()
    self.netG_BA.module.train()
    self.netD_AB.module.train()
    self.netD_BA.module.train()
    gc.collect()
def test_degenerate_uneven(hypotheses, references):
    with pytest.raises(EOFError, match=r'.*stream.*'):
        sacrebleu.raw_corpus_bleu(hypotheses, references)
from __future__ import unicode_literals, print_function, division
from io import open
import io
import unicodedata
import string
import re
import random

import numpy as np
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from collections import Counter, namedtuple
import pickle
import sacrebleu
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PAD = 0
SOS = 1
EOS = 2
UNK = 3

exclude = set(string.punctuation)

'''
https://stackoverflow.com/questions/4984647/accessing-dict-keys-like-an-attribute
'''
class AttrDict(dict):
def forward(self, rationale_text_label, generate, q_id, question, features,
            spatials, segment_ids, input_mask, image_mask, co_attention_mask,
            num_options, freeze=-1):
    outs = self.vilbert_model(question, features, spatials, segment_ids,
                              input_mask, image_mask, co_attention_mask,
                              num_options=num_options)
    gpt2_inp, pred_ans = outs[7:]
    gpt2_inp = self.embed(gpt2_inp)
    gpt2_inputs = (gpt2_inp, rationale_text_label)
    gpt2_outputs = self.gpt2_model(gpt2_inputs, labels=rationale_text_label)
    gpt2_loss = gpt2_outputs[0]
    to_return = outs[:7] + (gpt2_loss, )

    references = []
    hypotheses = []
    if generate:
        out = sample_sequence(
            model=self.gpt2_model,
            context=gpt2_inp,
            length=30,      # TODO 3 * (self._max_caption_length // 4)
            temperature=1,  # TODO change here
        )
        for i in range(out.size()[0]):
            first_rat = out[i].tolist()  # TODO changed from out[0, len(context_tokens):].tolist()
            text = self.gpt2_tokenizer.decode(
                first_rat,
                clean_up_tokenization_spaces=False,
                skip_special_tokens=True)
            # text = text[: text.find(self.gpt2_tokenizer.stop_token)]
            rationale_text = self.gpt2_tokenizer.decode(
                rationale_text_label[i].tolist(),
                clean_up_tokenization_spaces=False,
                skip_special_tokens=True)
            # rationale_text = rationale_text[: rationale_text.find(self.gpt2_tokenizer)]
            q_id_f = (q_id[i] - 1000000).item()
            pred_ans_f = pred_ans[i].item()
            logger.info(
                "[Img ID: {}] Predicted Ans: {} \t| Gold rationale: {} | Generated rationale: {}"
                .format(q_id_f, pred_ans_f, rationale_text, text))

        for rat_ids, gen_ids in zip(rationale_text_label.tolist(), out.tolist()):
            rat_dec = self.gpt2_tokenizer.decode(
                rat_ids,
                clean_up_tokenization_spaces=False,
                skip_special_tokens=True)
            gen_dec = self.gpt2_tokenizer.decode(
                gen_ids,
                clean_up_tokenization_spaces=False,
                skip_special_tokens=True)
            references.append(rat_dec)
            hypotheses.append(gen_dec)

        bleu_score = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
        to_return = to_return + (bleu_score, )

    return to_return
        dev_perplexities.append(dev_perplexity)

    return dev_perplexities


model = make_model(len(SRC.vocab), len(TRG.vocab),
                   emb_size=256, hidden_size=256,
                   num_layers=1, dropout=0.2)
dev_perplexities = train(model, print_every=100)

hypotheses = ["this is a test"]
references = ["this is a test"]
bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
print(bleu)

hypotheses = ["this is a test"]
references = ["this is a fest"]
bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
print(bleu)

len(valid_data)

references = [" ".join(example.trg) for example in valid_data]
print(len(references))
print(references[0])

references[-2]