def calculate_bleu(data, src_field, trg_field, model, device, max_len=80):
    trgs = []
    pred_trgs = []
    trgs_filter = []
    pred_trgs_filter = []
    target_bleu_list = [
        'b@0', 'b@1', 'b@2', 'b@3', 'b@4',
        'b@5', 'b@6', 'b@7', 'b@8', 'b@9'
    ]
    for datum in data:
        ques = vars(datum)['question']
        ans = vars(datum)['answer_text']
        doc = vars(datum)['article']
        trg = vars(datum)['distractor']
        bleu = vars(datum)['bleu1']
        print('ques = ' + ' '.join(ques))
        print('ans = ' + ' '.join(ans))
        trg = trg[1:]
        print('trg = ' + ' '.join(trg))
        max_bleu = 0
        max_pred_trg = ''
        for target_bleu in target_bleu_list:
            pred_trg, _ = translate_sentence(ans, ques, doc, bleu, src_field,
                                             trg_field, model, device,
                                             target_bleu, max_len)
            print(target_bleu + ' : ' + ' '.join(pred_trg))
            # cut off the <eos> token and the special "b@n" control token
            pred_trg = pred_trg[:-1]
            bleu_filter = bleu_score([pred_trg], [[ans]], max_n=1, weights=[1.0])
            print(bleu_filter)
            # keep the candidate with the highest unigram BLEU against the answer
            if bleu_filter > max_bleu:
                max_pred_trg = pred_trg
                max_bleu = bleu_filter
        pred_trgs.append(max_pred_trg)
        trgs.append([trg])
        print('predicted = ' + ' '.join(max_pred_trg))
        print()
        # keep only predictions whose unigram BLEU against the answer is mid-range
        if 0.2 < max_bleu < 0.6:
            pred_trgs_filter.append(max_pred_trg)
            trgs_filter.append([trg])
    orinum = len(pred_trgs)
    newnum = len(pred_trgs_filter)
    print(f'original number = {orinum}')
    print(f'new number = {newnum}')
    return (bleu_score(pred_trgs, trgs, max_n=1, weights=[1.0]),
            bleu_score(pred_trgs, trgs, max_n=2, weights=[1.0 / 2] * 2),
            bleu_score(pred_trgs, trgs, max_n=3, weights=[1.0 / 3] * 3),
            bleu_score(pred_trgs, trgs, max_n=4, weights=[1.0 / 4] * 4),
            bleu_score(pred_trgs_filter, trgs_filter, max_n=1, weights=[1.0]),
            bleu_score(pred_trgs_filter, trgs_filter, max_n=2, weights=[1.0 / 2] * 2),
            bleu_score(pred_trgs_filter, trgs_filter, max_n=3, weights=[1.0 / 3] * 3),
            bleu_score(pred_trgs_filter, trgs_filter, max_n=4, weights=[1.0 / 4] * 4))
def test_bleu_score(self):
    # Full match
    candidate = [['My', 'full', 'pytorch', 'test']]
    refs = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']]]
    assert bleu_score(candidate, refs) == 1

    # No 4-gram
    candidate = [['My', 'full', 'pytorch']]
    refs = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']]]
    assert bleu_score(candidate, refs) == 0

    # Partial match (scores are floats, so compare to 6 decimal places)
    candidate = [['My', 'full', 'pytorch', 'test']]
    refs = [[['My', 'full', 'pytorch', 'test', '!'], ['Different']]]
    self.assertAlmostEqual(bleu_score(candidate, refs), 0.7788007, places=6)

    # Bigrams and unigrams only
    candidate = [['My', 'pytorch', 'test']]
    refs = [[['My', 'full', 'pytorch', 'test'], ['Different']]]
    self.assertAlmostEqual(bleu_score(candidate, refs, max_n=2, weights=[0.5, 0.5]),
                           0.5066641, places=6)

    # Multi-sentence corpus
    candidate = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence']]
    refs = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']],
            [['No', 'Match']]]
    self.assertAlmostEqual(bleu_score(candidate, refs), 0.8408964, places=6)

    # Empty input
    candidate = [[]]
    refs = [[[]]]
    assert bleu_score(candidate, refs) == 0

    # Long input, compared to the NLTK implementation's scores
    # (nltk version used: 3.4.5)
    candidate = [['Lucille', 'B', 'has', '3', 'sons'],
                 ['She', 'loves', 'all', 'her', 'children', 'equally'],
                 ['No', 'match', 'here', 'at', 'all']]
    refs = [[['I', 'heard', 'Lucille', 'has', 'three', 'sons'],
             ['Rumor', 'has', 'it', 'Lucille', 'has', '3', 'sons', '!']],
            [['I', 'love', 'all', 'my', 'children', 'equally'],
             ['She', 'loves', 'all', 'her', 'children', 'equally']],
            [['I', 'have', 'made', 'a', 'terrible', 'mistake'],
             ['Big', 'mistake']]]
    # The comments below give the code used to get each hardcoded BLEU score
    # nltk.translate.bleu_score.corpus_bleu(refs, candidate)
    self.assertAlmostEqual(bleu_score(candidate, refs), 0.4573199, places=6)
    # nltk.translate.bleu_score.corpus_bleu(refs, candidate, weights=[0.33]*3)
    self.assertAlmostEqual(bleu_score(candidate, refs, 3, weights=[0.33, 0.33, 0.33]),
                           0.4901113, places=6)
    # nltk.translate.bleu_score.corpus_bleu(refs, candidate, weights=[0.5]*2)
    self.assertAlmostEqual(bleu_score(candidate, refs, 2, weights=[0.5, 0.5]),
                           0.5119535, places=6)
    # nltk.translate.bleu_score.corpus_bleu(refs, candidate, weights=[1])
    self.assertAlmostEqual(bleu_score(candidate, refs, 1, weights=[1]),
                           0.5515605, places=6)
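# A worked check of the partial-match constant above, under the standard BLEU
# definition: every candidate n-gram appears in the reference, so all four
# modified precisions are 1 and the score reduces to the brevity penalty
# exp(1 - ref_len / cand_len) with ref_len = 5 and cand_len = 4.
import math
print(math.exp(1 - 5 / 4))  # 0.7788007830714049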
def test(data_loader, model, device, tokenizer, logger):
    model.eval()
    scores = []
    n_samples = len(data_loader)
    with tqdm(total=n_samples) as progress:
        sample = 0
        for videos, sentences in data_loader:
            if videos.shape[0] > MAX_SEQUENCE_LENGTH:
                continue
            decoder_input_ids = tokenizer.encode(sentences)
            decoder_input_ids = torch.tensor(decoder_input_ids)
            if data_loader.batch_size == 1:
                decoder_input_ids.unsqueeze_(0)
                videos.unsqueeze_(0)
            videos = videos.to(device)
            decoder_input_ids = decoder_input_ids.to(device)
            outputs = model(videos, decoder_input_ids)

            # Calculate the BLEU score. torchtext's bleu_score expects tokenized
            # corpora (a list of candidate token lists and a list of reference
            # collections), so split the decoded strings on whitespace.
            # (For more info: https://pytorch.org/text/data_metrics.html)
            output_ids = torch.argmax(outputs[0], -1)
            output_sentences = tokenizer.decode(output_ids.view(-1))
            scores.append(bleu_score([output_sentences.split()], [[sentences.split()]]))
            progress.update()
            sample += 1
    logger.info('Average BLEU score: {}'.format(np.array(scores).mean()))
    return scores
def calculate_bleu_alt(iterator, src_field, trg_field, model, device, max_len=50):
    trgs = []
    pred_trgs = []
    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            trg = batch.trg
            _trgs = []
            for sentence in trg:
                tmp = []
                # Start from the first token, skipping the <start> token
                for i in sentence[1:]:
                    # Targets are padded, so stop appending as soon as a
                    # padding or <eos> token is encountered
                    if i == trg_field.vocab.stoi[trg_field.eos_token] or \
                            i == trg_field.vocab.stoi[trg_field.pad_token]:
                        break
                    tmp.append(trg_field.vocab.itos[i])
                _trgs.append([tmp])
            trgs += _trgs
            pred_trg, _ = translate_sentence_vectorized(src, src_field,
                                                        trg_field, model, device)
            pred_trgs += pred_trg
    return pred_trgs, trgs, bleu_score(pred_trgs, trgs)
def calculate_bleu_score(self, test_dataset: Dataset, max_len=128) -> Tuple[list, list, float]:
    trgs = []
    pred_trgs = []
    with torch.no_grad():
        for batch in tqdm(test_dataset.generate(self.config.batch_sz)):
            src = batch.src
            trg = batch.trg
            _trgs = []
            for sentence in trg:
                tmp = []
                # Skip the <start> token; stop at the first <eos> or padding token
                for i in sentence[1:]:
                    if i == self.config.trg_vocab.eos_idx or \
                            i == self.config.trg_vocab.pad_idx:
                        break
                    tmp.append(self.config.trg_vocab.id_to_piece(i.item()))
                _trgs.append([tmp])
            trgs += _trgs
            pred_trg, _ = self.translate_sentence_vectorized(src, max_len=max_len)
            pred_trgs += pred_trg
    final_bleu_score = bleu_score(pred_trgs, trgs)
    logger.info(f'BLEU score = {final_bleu_score * 100:.2f}')
    return pred_trgs, trgs, final_bleu_score
def calculate_bleu(data, source_field: Field, target_field: Field,
                   model: nn.Module, device: str, max_len=50) -> float:
    targets = []
    predicted_targets = []
    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        predicted_target = translate_sentence(
            sentence=src,
            source_field=source_field,
            target_field=target_field,
            model=model,
            device=device,
            max_len=max_len,
        )
        # cut off the <eos> token
        predicted_target = predicted_target[:-1]
        predicted_targets.append(predicted_target)
        targets.append([trg])
    return bleu_score(predicted_targets, targets)
def show_bleu(data, SRC, TRG, model, device, logging=False, max_len=50):
    trgs = []
    pred_trgs = []
    index = 0
    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        pred_trg, _ = translate_sentence(src, SRC, TRG, model, device,
                                         max_len, logging=False)
        # remove the <eos> token
        pred_trg = pred_trg[:-1]
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        index += 1
        if (index + 1) % 100 == 0 and logging:
            print(f'[{index + 1}/{len(data)}]')
            print(f'pred: {pred_trg}')
            print(f'answer: {trg}')
    bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
    print(f'Total BLEU Score = {bleu * 100:.2f}')
    sys.stdout.flush()
def calculate_bleu(dataset, src_field, trg_field, model, device, max_len=50):
    trgs = []
    pred_trgs = []
    for data in dataset:
        src = vars(data)['QnA']
        trg = vars(data)['Ans_Sen']
        pred_trg = rnn_predict(src, src_field, trg_field, model, device, max_len)
        # cut off the <eos> token
        pred_trg = pred_trg[:-1]
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    return bleu_score(pred_trgs, trgs)
def calculate_bleu_score(model, dataloader, german_word_to_idx,
                         english_idx_to_word, device):
    model.eval()
    predicted_sentences = []
    target_sentences = []
    with torch.no_grad():
        for num, d in tqdm(enumerate(dataloader), total=len(dataloader)):
            german_idx = d['german_idx'].to(device)
            english_idx = d['english_idx'].to(device)
            predicted_english_idx = model(german_idx, english_idx,
                                          teacher_force_ratio=10)
            english_idx = english_idx.detach().cpu().numpy()
            predicted_english_idx = torch.softmax(predicted_english_idx, dim=-1)
            predicted_english_idx = predicted_english_idx.argmax(-1)
            predicted_english_idx = predicted_english_idx.detach().cpu().numpy()
            for i in range(len(predicted_english_idx)):
                target_idx = english_idx[i]
                output = predicted_english_idx[i]
                predicted_sentence = decode(output, english_idx_to_word)
                predicted_sentences.append(predicted_sentence)
                target_sentence = decode(target_idx, english_idx_to_word)
                target_sentences.append([target_sentence])
    return bleu_score(predicted_sentences, target_sentences)
def main():
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-data_pkl', required=True,
                        help='Pickle file with vocabulary.')
    parser.add_argument('-trg_data', default='PSLG-PC12/ENG-ASL_Test.en')
    parser.add_argument('-pred_data', default='predictions.txt',
                        help='Path to output the predictions (each line will '
                             'be the decoded sequence).')
    opt = parser.parse_args()

    data = pickle.load(open(opt.data_pkl, 'rb'))
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']
    fields = [('src', SRC)]

    with open(opt.trg_data, 'r') as f:
        trg_loader = Dataset(examples=[Example.fromlist([x], fields) for x in f],
                             fields={'src': SRC})
    trg_txt = [[x.src] for x in trg_loader]

    with open(opt.pred_data, 'r') as f:
        pred_loader = Dataset(examples=[Example.fromlist([x], fields) for x in f],
                              fields={'src': SRC})
    pred_txt = [x.src for x in pred_loader]

    # bleu_score expects (candidate_corpus, references_corpus):
    # the predictions come first, the references second
    score = bleu_score(pred_txt, trg_txt)
    print('BLEU-4 score is {}'.format(score))
    with open('bleu_score.txt', 'w') as f:
        f.write('BLEU-4 score is {}'.format(score))
def calculate_bleu_score(candidate, reference, lang, max_n=2):
    """
    Input:
    - candidate: numpy array or pytorch tensor of shape (batch, max_seq_len1)
    - reference: numpy array or pytorch tensor of shape (batch, 5, max_seq_len2)
    - lang: Lang class instance that can be used to decode numerical captions
    Output:
    - scores: list containing BLEU scores for each sample, where len(scores) == batch
    """
    if isinstance(candidate, torch.Tensor):
        candidate = candidate.cpu().numpy()
    if isinstance(reference, torch.Tensor):
        reference = reference.cpu().numpy()
    scores = []
    # calculate the BLEU score for every item in the batch
    for cand, ref_list in zip(candidate, reference):
        cand = [lang.decode_caption(cand).split()]
        ref_list = [[lang.decode_caption(ann).split() for ann in ref_list]]
        score = bleu_score(cand, ref_list, max_n=max_n,
                           weights=[1 / max_n for _ in range(max_n)])
        scores.append(score * 100)
    return scores
def compute_metrics(hyp_dec_all, ref_dec_all, use_sacrebleu=True,
                    use_torchtext=True, use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100
    return metrics
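# An illustrative call of compute_metrics above, on toy strings (requires
# `pip install sacrebleu torchtext`; raw_corpus_bleu is assumed to be exposed
# at sacrebleu's top level, as in the 1.x releases this snippet targets).
hyps = ["the cat sat on the mat", "he read the book"]
refs = ["the cat sat on the mat", "he read a book"]
print(compute_metrics(hyps, refs))  # dict of sacrebleu and torchtext scores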
def bleu(data, model, german, english, device, syntax_embedding_size):
    targets = []
    outputs = []
    count = 0
    for example in data:
        count += 1
        src = vars(example)["src"]
        trg = vars(example)["trg"]
        prediction = translate_sentence(model, src, german, english, device,
                                        syntax_embedding_size)
        # prediction = prediction[:-1]  # optionally strip the <eos> token
        src = german.decode(src)
        trg = english.decode(trg)
        print(count, "src   :>>>", src)
        print("target: ", trg)
        print("pred  : ", prediction)
        targets.append([trg.split()])
        outputs.append(prediction.split())
    return bleu_score(outputs, targets)
def calculate_bleu(data, src_field, trg_field, model, device, max_len=50):
    print('*' * 40, ' Calculating BLEU ', '*' * 40)
    trgs = []
    pred_trgs = []
    pred_trgs_out = []
    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        pred_trg, _ = translate_sentence(src, src_field, trg_field, model,
                                         device, max_len)
        # cut off the <eos> token
        pred_trg = pred_trg[:-1]
        pred_trg_out = ' '.join(pred_trg)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        pred_trgs_out.append(pred_trg_out)
    with open(file='./output/test_output.csv', mode='w') as f:
        writer = csv.writer(f, delimiter='\n', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerows([pred_trgs_out])
    return bleu_score(pred_trgs, trgs)
def bleu(data, model, source_lang, target_lang, device, max_length,
         generate_outputs):
    targets = []
    outputs = []
    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]
        prediction = Translate(model, src, source_lang, target_lang, device,
                               max_length)
        # remove the <sos> and <eos> tokens
        prediction = prediction[1:-1]
        targets.append([trg])
        outputs.append(prediction)
    if generate_outputs:
        writer = [' '.join(s) for s in outputs]
        with open("Outputs/testset_translated.txt", 'w', encoding='utf-8') as op:
            for sent in writer[:-1]:
                op.write(sent + '\n')
            op.write(writer[-1])
    return bleu_score(outputs, targets)
def calculate_bleu_score(ground_truth, predictions):
    # tokenize in place: references become lists of reference token lists,
    # predictions become token lists
    for i in range(len(ground_truth)):
        ground_truth[i] = [ground_truth[i].split(" ")]
    for i in range(len(predictions)):
        predictions[i] = predictions[i].split(' ')
    return bleu_score(predictions, ground_truth)
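# A minimal smoke test for calculate_bleu_score above; the sentences are toy
# examples, not project data. Note the helper tokenizes its argument lists in
# place, so pass throwaway copies.
ground_truth = ["the cat sat on the mat"]
predictions = ["the cat sat on a mat"]
print(calculate_bleu_score(ground_truth, predictions))  # corpus-level BLEU-4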
def show_train_info(epoch, start_time, end_time, train_loss, valid_loss,
                    metric='perplexity', **kwargs):
    # kwargs are forwarded to bleu_score:
    # https://pytorch.org/text/data_metrics.html?highlight=bleu#torchtext.data.metrics.bleu_score
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    train_score = math.exp(train_loss) if metric == 'perplexity' else bleu_score(**kwargs)
    valid_score = math.exp(valid_loss) if metric == 'perplexity' else bleu_score(**kwargs)
    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train BLEU: {train_score:7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. BLEU: {valid_score:7.3f}')
def count_bleu(output, trg, TRG):
    # output shape: (seq_len, batch_size, vocab_size)
    # trg shape: (seq_len, batch_size)
    # corpus-level or sentence-level BLEU?
    output = output.permute(1, 0, 2).max(2)[1]
    trg = trg.permute(1, 0)
    candidate_corpus = [itos(idx_list, TRG) for idx_list in output]
    references_corpus = [[itos(idx_list, TRG)] for idx_list in trg]
    return bleu_score(candidate_corpus, references_corpus)
def google_bleu_score(gtrans, ref):
    m1 = []
    m2 = []
    for sent in gtrans:
        m1.append(sent.split())
    for sent in ref:
        m2.append([sent.split()])
    r = bleu_score(m1, m2)
    return r
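# A quick sanity check for google_bleu_score above (illustrative strings only):
# identical five-token corpora share every n-gram and incur no brevity penalty,
# so the corpus BLEU is exactly 1.0.
sents = ["colorless green ideas sleep furiously"]
assert google_bleu_score(sents, sents) == 1.0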
def evaluate(model: Model, dataset: Im2LatexDataset, args: Munch,
             num_batches: int = None, name: str = 'test'):
    """Evaluates the model. Returns the BLEU score on the dataset.

    Args:
        model (torch.nn.Module): the model
        dataset (Im2LatexDataset): test dataset
        args (Munch): arguments
        num_batches (int): How many batches to evaluate on. Defaults to None (all batches).
        name (str, optional): name of the test, e.g. 'val' or 'test', for wandb. Defaults to 'test'.

    Returns:
        Tuple[float, float]: BLEU score and mean normalized edit distance.
    """
    assert len(dataset) > 0
    device = args.device
    log = {}
    bleus, edit_dists = [], []
    # note: this local name shadows the metric function; the scoring call
    # below goes through metrics.bleu_score instead
    bleu_score, edit_distance = 0, 1
    pbar = tqdm(enumerate(iter(dataset)), total=len(dataset))
    for i, (seq, im) in pbar:
        if seq is None or im is None:
            continue
        tgt_seq, tgt_mask = seq['input_ids'].to(device), seq['attention_mask'].bool().to(device)
        encoded = model.encoder(im.to(device))
        # loss = decoder(tgt_seq, mask=tgt_mask, context=encoded)
        dec = model.decoder.generate(
            torch.LongTensor([args.bos_token] * len(encoded))[:, None].to(device),
            args.max_seq_len, eos_token=args.pad_token, context=encoded,
            temperature=args.get('temperature', .2))
        pred = detokenize(dec, dataset.tokenizer)
        truth = detokenize(seq['input_ids'], dataset.tokenizer)
        bleus.append(metrics.bleu_score(pred, [alternatives(x) for x in truth]))
        for predi, truthi in zip(token2str(dec, dataset.tokenizer),
                                 token2str(seq['input_ids'], dataset.tokenizer)):
            ts = post_process(truthi)
            edit_dists.append(distance(post_process(predi), ts) / len(ts))
        pbar.set_description('BLEU: %.3f, ED: %.2e' % (np.mean(bleus), np.mean(edit_dists)))
        if num_batches is not None and i >= num_batches:
            break
    if len(bleus) > 0:
        bleu_score = np.mean(bleus)
        log[name + '/bleu'] = bleu_score
    if len(edit_dists) > 0:
        edit_distance = np.mean(edit_dists)
        log[name + '/edit_distance'] = edit_distance
    if args.wandb:
        # samples
        pred = token2str(dec, dataset.tokenizer)
        truth = token2str(seq['input_ids'], dataset.tokenizer)
        table = wandb.Table(columns=["Truth", "Prediction"])
        for k in range(min([len(pred), args.test_samples])):
            table.add_data(post_process(truth[k]), post_process(pred[k]))
        log[name + '/examples'] = table
        wandb.log(log)
    else:
        print('\n%s\n%s' % (truth, pred))
        print('BLEU: %.2f' % bleu_score)
    return bleu_score, edit_distance
def eval_bleu(self, test_cases: Dict = None, silent: bool = False):
    if test_cases is None:
        test_cases = self.dataset.validation_stuff()
    src_sentences: List[List[int]] = [pair["source_val"] for pair in test_cases]
    candidates = self.__eval_multiple(src_sentences, 300, silent)
    candidates = [[self.dataset.target_value2token(v) for v in s]
                  for s in candidates]
    sentences_refs = [pair["targets_tokens"] for pair in test_cases]
    return bleu_score(candidates, sentences_refs)
def bleu(valid_src_data, valid_trg_data, model, SRC, TRG, device, k, max_strlen):
    pred_sents = []
    for sentence in valid_src_data:
        pred_trg = translate_sentence(sentence, model, SRC, TRG, device, k,
                                      max_strlen)
        pred_sents.append(pred_trg)
    pred_sents = [TRG.preprocess(sent) for sent in pred_sents]
    trg_sents = [[sent.split()] for sent in valid_trg_data]
    return bleu_score(pred_sents, trg_sents)
def estimateBLEU(model, set_name, word2idx_dict, idx2word_dict, cw_idx_list,
                 qw_idx_list, device):
    # The candidates and references do not depend on the n-gram order, so
    # decode them once and score with four different max_n settings.
    candidate_corpus, references_corpus = FindCandidatesAndReferencesForBLEU(
        model, word2idx_dict, idx2word_dict, cw_idx_list, qw_idx_list, device)
    for i in range(1, 5):
        bleu_test = bleu_score(candidate_corpus, references_corpus,
                               max_n=i, weights=[1. / i] * i)
        print("BLEU-" + str(i) + " on " + set_name + " :" + str(bleu_test))
    return
def bleu(data, model, receptors, ligands, device):
    print("=> Calculating Bleu")
    targets = []
    outputs = []
    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]
        prediction = translate_sentence(model, src, receptors, ligands, device)
        prediction = list(map(str, prediction[:-1]))  # remove the <eos> token
        outputs.append(prediction)
        # bleu_score expects a list of reference token lists per candidate
        targets.append([list(map(str, trg))])
    return bleu_score(outputs, targets)
def show_bleu(data, src_field, trg_field, model, device, max_len=50):
    trgs = []
    pred_trgs = []
    index = 0
    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        pred_trg, _ = translate_sentence(src, src_field, trg_field, model,
                                         device, max_len, logging=False)
        # remove the final <eos> token
        pred_trg = pred_trg[:-1]
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        index += 1
        if (index + 1) % 100 == 0:
            print(f"[{index + 1}/{len(data)}]")
            print(f"prediction: {pred_trg}")
            print(f"answer: {trg}")
    bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
    print(f'Total BLEU Score = {bleu * 100:.2f}')

    individual_bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
    individual_bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 1, 0, 0])
    individual_bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 1, 0])
    individual_bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 0, 1])
    print(f'Individual BLEU1 score = {individual_bleu1_score * 100:.2f}')
    print(f'Individual BLEU2 score = {individual_bleu2_score * 100:.2f}')
    print(f'Individual BLEU3 score = {individual_bleu3_score * 100:.2f}')
    print(f'Individual BLEU4 score = {individual_bleu4_score * 100:.2f}')

    cumulative_bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
    cumulative_bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1 / 2, 1 / 2, 0, 0])
    cumulative_bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1 / 3, 1 / 3, 1 / 3, 0])
    cumulative_bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1 / 4, 1 / 4, 1 / 4, 1 / 4])
    print(f'Cumulative BLEU1 score = {cumulative_bleu1_score * 100:.2f}')
    print(f'Cumulative BLEU2 score = {cumulative_bleu2_score * 100:.2f}')
    print(f'Cumulative BLEU3 score = {cumulative_bleu3_score * 100:.2f}')
    print(f'Cumulative BLEU4 score = {cumulative_bleu4_score * 100:.2f}')
def log_progress(epoch_i, start_time, tr_loss, val_loss, translations=None,
                 tb_writer=None):
    metrics = {
        "train": {
            "loss": tr_loss,
            "ppl": math.exp(tr_loss),
        },
        "val": {
            "loss": val_loss,
            "ppl": math.exp(val_loss),
        },
    }

    # Get additional metrics
    if translations:
        src_dec_all, hyp_dec_all, ref_dec_all = translations
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["val"]["bleu"] = m_bleu_score * 100

        # Print translations
        helpers.print_translations(hyp_dec_all, ref_dec_all, src_dec_all, limit=50)

    # Print stuff
    end_time = time.time()
    epoch_hours, epoch_mins, epoch_secs = helpers.epoch_time(start_time, end_time)
    val_bleu = metrics["val"].get("bleu", float("nan"))  # only set when translations are given
    print("------------------------------------------------------------")
    print(f'Epoch: {epoch_i + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t- Train Loss: {metrics["train"]["loss"]:.3f} | Train PPL: {metrics["train"]["ppl"]:.3f}')
    print(f'\t- Val Loss: {metrics["val"]["loss"]:.3f} | Val PPL: {metrics["val"]["ppl"]:.3f} | Val BLEU: {val_bleu:.3f}')
    print("------------------------------------------------------------")

    # Tensorboard
    if tb_writer:
        for split in ["train", "val"]:
            for k, v in metrics[split].items():
                tb_writer.add_scalar(f'{split}_{k.lower()}', v, epoch_i + 1)
                wandb.log({f'{split}_{k.lower()}': v})
    return metrics
def cust_bleu(output_path, target_path):
    spacy_eng = spacy.load("en")  # spaCy 2.x shortcut; use "en_core_web_sm" on spaCy 3.x
    output_data = open(output_path, encoding='utf8').read().split('\n')
    target_data = open(target_path, encoding='utf8').read().split('\n')
    outputs = [s.lower().split(' ') for s in output_data]
    targets = []
    for sent in target_data:
        targets.append([[tok.text.lower() for tok in spacy_eng(sent)]])
    return bleu_score(outputs, targets)
def bleu(data, model, japanese, english, device):
    targets = []
    outputs = []
    for example in data:
        src = vars(example)["jap"]
        trg = vars(example)["eng"]
        prediction = translate_sentence(model, src, japanese, english, device)
        prediction = prediction[:-1]  # remove the <eos> token
        targets.append([trg])
        outputs.append(prediction)
    return bleu_score(outputs, targets)
def test_model(test_loader, model, device):
    reference = []
    pred_list = []
    model.eval()
    for data in test_loader:
        src, tgt, src_len, tgt_len = data
        preds = model(src, None, device)
        preds = preds.max(2)[1].transpose(1, 0)
        ref = tgt.transpose(1, 0)
        ref = id2word(ref)
        reference += [[r] for r in ref]
        pred_list += id2word(preds)
    score = bleu_score(pred_list, reference)
    return score
def count_bleu(output, trg, TRG):
    # output shape: [T, N, E]
    # trg shape: [T, N]
    # using corpus level: masked_select flattens the whole batch, so this
    # scores one long candidate sequence against one long reference sequence
    # rather than a per-sentence corpus
    output = output.permute(1, 0, 2).max(2)[1]
    trg = trg.permute(1, 0)
    mask = trg.ne(TRG.vocab.stoi['<pad>'])
    output = output.masked_select(mask)
    trg = trg.masked_select(mask)
    candidate_corpus = [itos(output, TRG)]
    references_corpus = [[itos(trg, TRG)]]
    return bleu_score(candidate_corpus, references_corpus)