def translate(self, data_iter, step, attn_debug=False):
    """Decode ``data_iter``, write per-example reports, and log corpus metrics.

    Five files are written, all prefixed with ``self.args.result_path`` and
    the step number: ``.output`` (human-readable report with per-example
    BLEU / ROUGE-F scores), ``.gold_test`` (references), ``.pred_test``
    (generated summaries), ``.ex_test.short`` and ``.ex_test.long`` (the two
    extractive outputs).  When ``step`` is not ``-1`` the corpus-level BLEU,
    ROUGE and length statistics are computed from those files and logged.

    Args:
        data_iter: Iterable of batches accepted by ``self.translate_batch``.
        step: Integer step used in file names and log messages; ``-1``
            disables the corpus-level metrics.
        attn_debug: Unused here; kept for interface compatibility.
    """
    self.model.eval()

    prefix = self.args.result_path
    output_path = prefix + '.%d.output' % step
    gold_path = prefix + '.%d.gold_test' % step
    pred_path = prefix + '.%d.pred_test' % step
    ex_single_path = prefix + '.%d.ex_test' % step + ".short"
    ex_context_path = prefix + '.%d.ex_test' % step + ".long"

    # One smoothing callable is enough; it was rebuilt for every score before.
    smoothing = SmoothingFunction().method1

    def strip_markers(text):
        """Trim ``text`` and drop the ``[unused2]`` / ``[unused3]`` tokens."""
        return text.strip().replace("[unused2]", "").replace("[unused3]", "")

    ct = 0
    # The ``with`` guarantees every handle is closed even if decoding or
    # scoring raises part-way through the data.
    with codecs.open(output_path, 'w', 'utf-8') as output_file, \
            codecs.open(gold_path, 'w', 'utf-8') as gold_out_file, \
            codecs.open(pred_path, 'w', 'utf-8') as pred_out_file, \
            codecs.open(ex_single_path, 'w', 'utf-8') as short_ex_out_file, \
            codecs.open(ex_context_path, 'w', 'utf-8') as long_ex_out_file, \
            torch.no_grad():
        rouge = Rouge()

        def write_scores(label, hypothesis, gold_data, gold_list, ending):
            """Write one "<label> bleu & rouge-f" line to the report file.

            BLEU uses the marker-stripped tokens; ROUGE is computed on the
            raw hypothesis string, exactly as each stanza did originally.
            """
            hyp_tokens = strip_markers(hypothesis).split()
            rouge_score = rouge.get_scores(hypothesis, gold_data)
            bleu_score = sentence_bleu([gold_list], hyp_tokens,
                                       smoothing_function=smoothing)
            output_file.write(
                "%s bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f%s" %
                (label, bleu_score, rouge_score[0]["rouge-1"]["f"],
                 rouge_score[0]["rouge-2"]["f"],
                 rouge_score[0]["rouge-l"]["f"], ending))

        for batch in data_iter:
            doc_data, summ_data = self.translate_batch(batch)
            translations = self.from_batch_test(batch, doc_data)

            for idx, translation in enumerate(translations):
                origin_sent, doc_extract, context_doc_extract, \
                    doc_pred, lead = translation
                if ct % 100 == 0:
                    print("Processing %d" % ct)

                output_file.write("ID : %d\n" % ct)
                output_file.write(
                    "ORIGIN : " + origin_sent.replace('<S>', '\n ') + "\n")
                gold_data = summ_data[idx]
                output_file.write("GOLD : " + gold_data + "\n")
                output_file.write("LEAD : " + lead + "\n")
                output_file.write("DOC_EX : " + doc_extract.strip() + "\n")
                output_file.write("DOC_CONT: " + context_doc_extract.strip() + "\n")
                output_file.write("DOC_GEN : " + doc_pred.strip() + "\n")

                gold_list = gold_data.strip().split()
                write_scores("LEAD", lead, gold_data, gold_list, "\n")
                write_scores("DOC_EX", doc_extract, gold_data, gold_list, "\n")
                write_scores("DOC_CONT", context_doc_extract, gold_data,
                             gold_list, "\n")
                # The last score line ends the record with a blank line.
                write_scores("DOC_GEN", doc_pred, gold_data, gold_list, "\n\n")

                short_ex_out_file.write(strip_markers(doc_extract) + '\n')
                long_ex_out_file.write(strip_markers(context_doc_extract) + '\n')
                pred_out_file.write(strip_markers(doc_pred) + '\n')
                gold_out_file.write(gold_data.strip() + '\n')
                ct += 1

            # Flush once per batch so partial results survive an interruption.
            pred_out_file.flush()
            short_ex_out_file.flush()
            long_ex_out_file.flush()
            gold_out_file.flush()
            output_file.flush()

    if step != -1:
        # NOTE(review): test_bleu is called as (gold, hypothesis) here;
        # confirm the expected argument order against its definition.
        ex_short_bleu = test_bleu(gold_path, ex_single_path)
        ex_long_bleu = test_bleu(gold_path, ex_context_path)
        pred_bleu = test_bleu(gold_path, pred_path)

        file_rouge = FilesRouge(hyp_path=ex_single_path, ref_path=gold_path)
        ex_short_rouges = file_rouge.get_scores(avg=True)

        file_rouge = FilesRouge(hyp_path=ex_context_path, ref_path=gold_path)
        ex_long_rouges = file_rouge.get_scores(avg=True)

        file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
        pred_rouges = file_rouge.get_scores(avg=True)

        self.logger.info(
            'Gold Length at step %d: %.2f\n' %
            (step, test_length(gold_path, gold_path, ratio=False)))
        self.logger.info('Short Extraction Length ratio at step %d: %.2f' %
                         (step, test_length(ex_single_path, gold_path)))
        self.logger.info('Short Extraction Bleu at step %d: %.2f' %
                         (step, ex_short_bleu * 100))
        self.logger.info('Short Extraction Rouges at step %d \n%s' %
                         (step, rouge_results_to_str(ex_short_rouges)))
        self.logger.info('Long Extraction Length ratio at step %d: %.2f' %
                         (step, test_length(ex_context_path, gold_path)))
        self.logger.info('Long Extraction Bleu at step %d: %.2f' %
                         (step, ex_long_bleu * 100))
        self.logger.info('Long Extraction Rouges at step %d \n%s' %
                         (step, rouge_results_to_str(ex_long_rouges)))
        self.logger.info('Prediction Length ratio at step %d: %.2f' %
                         (step, test_length(pred_path, gold_path)))
        self.logger.info('Prediction Bleu at step %d: %.2f' %
                         (step, pred_bleu * 100))
        self.logger.info('Prediction Rouges at step %d \n%s' %
                         (step, rouge_results_to_str(pred_rouges)))
def eval(self,
         filesrc,
         filetgt,
         output_file_name,
         batch_size=64,
         max_batches=None,
         device="cpu",
         keep_chance=0.9):
    """Score the model on a test set with smoothed sentence-level BLEU.

    Moves the encoder and decoder to ``device``, iterates the loader built
    by ``test_data_loader`` and writes one BLEU line per sample plus a
    summary (mean / min / max / median) both to stdout and to
    ``output_file_name``.

    Args:
        filesrc: Source-side input forwarded to ``test_data_loader``.
        filetgt: Target-side input forwarded to ``test_data_loader``.
        output_file_name: Path of the report file (opened for writing).
        batch_size: Forwarded to ``test_data_loader``.
        max_batches: Forwarded to ``test_data_loader``.
        device: Device the encoder and decoder are moved to.
        keep_chance: Forwarded to ``test_data_loader``.

    Returns:
        ``(scores, testloader.input_lens)``, or ``None`` when the encoder or
        decoder has not been loaded.
    """
    if self.encoder is None or self.decoder is None:
        print('Model not loaded!')
        return

    self.encoder.to(device)
    self.decoder.to(device)

    with open(output_file_name, 'w') as output_file:
        testloader = test_data_loader(filesrc=filesrc,
                                      filetgt=filetgt,
                                      output_file=output_file,
                                      model=self,
                                      batch_size=batch_size,
                                      max_batches=max_batches,
                                      keep_chance=keep_chance,
                                      device=device)
        self.encoder.eval()
        self.decoder.eval()

        smoothing = SmoothingFunction().method3
        scores = []
        # NOTE(review): the source arrived with its indentation collapsed;
        # the '=' * 30 separator is taken to be per-sample and the '=' * 50
        # banner to follow the loop - confirm against the original layout.
        for i, (batch_candidate, batch_references) in enumerate(testloader,
                                                                 start=1):
            cur_score = sentence_bleu(batch_references,
                                      batch_candidate,
                                      smoothing_function=smoothing)
            scores.append(cur_score)

            sample_line = 'Sample {0:d}, BLEU score: {1:0.4f}'.format(
                i, cur_score)
            print('', file=output_file)
            print(sample_line)
            print('', file=output_file)
            print(sample_line, file=output_file)
            print('', file=output_file)
            print('=' * 30, file=output_file)
            print('', file=output_file)

        print('=' * 50, file=output_file)
        print('', file=output_file)
        print('= ' * 25, file=output_file)
        print('', file=output_file)

        if scores:
            summary = (
                'Average BLEU score: {0:0.4f}, minimum score: {1:0.4f}, maximum score: {2:0.4f}, median score: {3:0.4f}'
                .format(np.mean(scores), min(scores), max(scores),
                        np.median(scores)))
        else:
            # min()/max() raise ValueError on an empty list; report instead.
            summary = 'No samples were scored.'
        print(summary, file=output_file)
        print(summary)

    return scores, testloader.input_lens
def train(encoder_decoder: EncoderDecoder, model_dump_path,
          train_data_loader: DataLoader, model_name,
          val_data_loader: DataLoader, keep_prob, teacher_forcing_schedule,
          lr, max_length, early_stopping, patience, beam_width):
    """Train ``encoder_decoder`` for one epoch per teacher-forcing value.

    Each epoch runs the training loader with an AdamW optimizer and an NLL
    loss (plus the coverage loss returned by the model), periodically logs a
    fixed probe response and batch metrics through ``writer``, then evaluates
    on ``val_data_loader``, saves the whole model and applies the
    early-stopping rule.

    ``writer`` is not defined in this function; presumably a module-level
    TensorBoard SummaryWriter - confirm.

    Args:
        encoder_decoder: Model to train; called with index tensors, lengths
            and ``beam_width`` and expected to return
            ``(log_probs, output_seqs, cov_loss)``.
        model_dump_path: Directory prefix; checkpoints are written to
            ``model_dump_path + model_name + '/'``.
        train_data_loader: Yields 6-tuples ``(tweet_idxs, news_idxs,
            target_idxs, tweet_tokens, news_tokens, hashtag_tokens)``.
        model_name: Used for the checkpoint directory and file names.
        val_data_loader: Passed to ``evaluate`` after every epoch.
        keep_prob: Forwarded to the model.
        teacher_forcing_schedule: Iterable with one teacher-forcing value per
            epoch; its length is the maximum number of epochs.
        lr: Learning rate for AdamW.
        max_length: Sequence length used to flatten the log-probabilities.
        early_stopping: Enables the early-stopping check when truthy.
        patience: Epochs allowed since ``best_epoch`` before stopping.
        beam_width: Forwarded to the model.
    """
    global_step = 0
    # Targets equal to 0 contribute nothing to the loss; 0 is also the value
    # excluded when computing the input lengths below.
    loss_function = torch.nn.NLLLoss(ignore_index=0)
    optimizer = AdamW(
        encoder_decoder.parameters(),
        lr=lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon - default is 1e-8.
    )
    # optimizer = optim.Adam(encoder_decoder.parameters(), lr=lr)
    model_path = model_dump_path + model_name + '/'
    # Book-keeping for early stopping.
    history = {
        'val_loss': [],
        'best_epoch': -1,
        'best_loss': float("inf"),
        'prev_loss': float("inf")
    }
    for epoch, teacher_forcing in enumerate(teacher_forcing_schedule):
        print('epoch %i' % epoch, flush=True)
        for batch_idx, (tweet_idxs, news_idxs, target_idxs, tweet_tokens,
                        news_tokens,
                        hashtag_tokens) in enumerate(tqdm(train_data_loader)):
            # tweet_idxs have dim (batch_size x max_tweet_len)
            # news_idxs have dim (batch_size x max_news_len)
            # hashtag_idxs have dim (batch_size x max_hashtag_len)
            # Lengths are the number of non-zero indices in each row.
            lengths_tweets = (tweet_idxs != 0).long().sum(dim=1)
            lengths_news = (news_idxs != 0).long().sum(dim=1)
            optimizer.zero_grad()
            output_log_probs, output_seqs, cov_loss = encoder_decoder(
                tweet_idxs,
                news_idxs,
                lengths_tweets,
                lengths_news,
                beam_width,
                targets=target_idxs,
                keep_prob=keep_prob,
                teacher_forcing=teacher_forcing)
            batch_size = tweet_idxs.shape[0]
            # assumes output_log_probs flattens to batch_size * max_length
            # rows, matching the flattened targets - TODO confirm
            flattened_outputs = output_log_probs.contiguous().view(
                batch_size * max_length, -1)
            batch_loss = loss_function(
                flattened_outputs,
                target_idxs.contiguous().view(-1)) + cov_loss
            batch_loss.backward()
            optimizer.step()
            batch_outputs = trim_seqs(output_seqs)
            # One single-reference list per example, keeping only indices > 0.
            batch_targets = [[list(seq[seq > 0])]
                             for seq in list(to_np(target_idxs))]
            batch_bleu_score = corpus_bleu(
                batch_targets,
                batch_outputs,
                smoothing_function=SmoothingFunction().method1)
            # Probe the model on a fixed example: every step for the first
            # 10, every 10th step below 100, and every 100th step after that.
            if global_step < 10 or (global_step % 10 == 0 and
                                    global_step < 100) or (global_step % 100
                                                           == 0):
                tweet_string = "do you think brett kavanaugh should be confirmed as a justice on the supreme court"
                news_string = "leading catholic publication turns on brett kavanaugh says his nomination to the supreme court should be withdrawn"
                output_string = encoder_decoder.get_response(
                    tweet_string, news_string)
                writer.add_text('kavanaugh',
                                output_string,
                                global_step=global_step)
                print("Global Step: ", global_step, ' kavanaugh ',
                      output_string)
            if global_step % 100 == 0:
                writer.add_scalar('train_batch_loss', batch_loss, global_step)
                writer.add_scalar('train_batch_bleu_score', batch_bleu_score,
                                  global_step)
                print("Global Step: ", global_step, ' train_batch_loss ',
                      batch_loss)
                print("Global Step: ", global_step,
                      ' train_batch_bleu_score ', batch_bleu_score)
                # for tag, value in encoder_decoder.named_parameters():
                #     tag = tag.replace('.', '/')
                #     writer.add_histogram('weights/' + tag, value, global_step, bins='doane')
                #     writer.add_histogram('grads/' + tag, to_np(value.grad), global_step, bins='doane')
            global_step += 1
        # End-of-epoch validation; gradients are not needed here.
        with torch.no_grad():
            val_loss, val_bleu_score = evaluate(encoder_decoder,
                                                val_data_loader)
            history["val_loss"].append(val_loss)
            writer.add_scalar('val_loss', val_loss, global_step=global_step)
            writer.add_scalar('val_bleu_score',
                              val_bleu_score,
                              global_step=global_step)
            # Both embedding tables are labelled with the same vocabulary and
            # are always written with global_step=0.
            encoder_embeddings = encoder_decoder.encoder.embedding.weight.data
            encoder_vocab = encoder_decoder.lang.tok_to_idx.keys()
            writer.add_embedding(encoder_embeddings,
                                 metadata=encoder_vocab,
                                 global_step=0,
                                 tag='encoder_embeddings')
            decoder_embeddings = encoder_decoder.decoder.embedding.weight.data
            decoder_vocab = encoder_decoder.lang.tok_to_idx.keys()
            writer.add_embedding(decoder_embeddings,
                                 metadata=decoder_vocab,
                                 global_step=0,
                                 tag='decoder_embeddings')
            # Second fixed probe, logged once per epoch.
            tweet_string = "should ask dr ford 1 important question why did you go upstairs to go to the bathroom 99 of 2 story houses have a bathroom downstairs common sense bedrooms are upstairs cbc news fox news real donald trump cnn"
            news_string = "america supreme court brett kavanuagh senate judiciary committee christine blasey ford cnn fox news supreme court bill clinton kavanaugh new york ford donald trump rod rosenstein new york times trump trump maine alaska tara d sonenshine us george washington university elliott school of international affairs christine blasey ford senate judiciary committee fox news ford fox news chris wallace ford wallace bret baier ford brit hume fox news bret baier ford chris wallace david mack andrew napolitano ford rachel mitchell fox news andrew napolitano ford rachel mitchell keith boykin fox kavanaugh ford ford twitter america supreme court brett kavanuagh senate judiciary committee christine blasey ford cnn fox news supreme court bill clinton kavanaugh new york ford donald trump rod rosenstein new york times trump trump maine alaska tara d sonenshine us george washington university elliott school of international affairs fox news sean hannity donald trump fox news fox news cnn msnbc pbs news gallupknight foundation gallupknight foundation gallup knight foundation christine blasey ford senate judiciary committee fox news ford fox news chris wallace ford wallace bret baier ford brit hume fox news bret baier ford chris wallace"
            output_string = encoder_decoder.get_response(
                tweet_string, news_string)
            writer.add_text('christine blasey ford',
                            output_string,
                            global_step=global_step)
            print("Global Step: ", global_step, ' christine blasey ford ',
                  output_string)
        print('val loss: %.5f, val BLEU score: %.5f' %
              (val_loss, val_bleu_score),
              flush=True)
        # Saves the whole model object; the target directory must exist.
        torch.save(encoder_decoder,
                   "%s%s_%i.pt" % (model_path, model_name, epoch))
        print('-' * 100, flush=True)
        # NOTE(review): this branch also fires when the loss only improved on
        # the previous epoch, in which case best_loss is overwritten with a
        # value that may be higher than the earlier best - confirm intended.
        if history['val_loss'][-1] < history['best_loss'] or history[
                'val_loss'][-1] < history['prev_loss']:
            history['best_loss'] = history['val_loss'][-1]
            history['best_epoch'] = epoch
        elif early_stopping and epoch - history['best_epoch'] > patience:
            # early stopping
            print(
                "Early stopping at epoch {0}, best result at epoch {1}".format(
                    epoch, history['best_epoch']))
            break
        history['prev_loss'] = history['val_loss'][-1]
#-*- coding:utf-8 -*- ''' [AI502] Deep Learning Assignment "Attention is all you need" Implementation 20193640 Jungwon Choi ''' import torch import torch.nn as nn import numpy as np import sys from nltk.translate.bleu_score import sentence_bleu from nltk.translate.bleu_score import SmoothingFunction # for short sentence smoothing_func = SmoothingFunction().method4 #=============================================================================== ''' Validate sequence ''' def val(model, val_loader, criterion, dataloader): model.eval() device = next(model.parameters()).device.index losses = [] total_iter = len(val_loader) sum_bleu = 0.0 num_sentence = 0.0 sos_idx = dataloader.sos_idx with torch.no_grad(): for i, batch in enumerate(val_loader):
def calc_bleu(self, reference, hypothesis, weight):
    """Return the method1-smoothed sentence BLEU of ``hypothesis``.

    ``reference`` and ``weight`` are handed to NLTK's ``sentence_bleu`` as
    its references and n-gram weights respectively.
    """
    smoother = SmoothingFunction().method1
    score = nltk.translate.bleu_score.sentence_bleu(
        reference,
        hypothesis,
        weights=weight,
        smoothing_function=smoother,
    )
    return score
import random import logging import numpy as np import tensorflow as tf from nltk.translate.bleu_score import corpus_bleu from nltk.translate.bleu_score import SmoothingFunction import network from utils import * from vocab import Vocabulary, build_unify_vocab from config import load_arguments from dataloader.multi_style_dataloader import MultiStyleDataloader from dataloader.online_dataloader import OnlineDataloader smoothie = SmoothingFunction().method4 logger = logging.getLogger(__name__) def evaluation(sess, args, batches, model, classifier, classifier_vocab, domain_classifer, domain_vocab, output_path, write_dict, save_samples=False,
def processAlignments(data, folder, inputfile, outputType, num, refs=False):
    """Write alignment data as JavaScript arrays and optionally print it.

    Five files named ``<folder>/<basename(inputfile)>`` plus ``.ali.js``,
    ``.src.js``, ``.trg.js``, ``.con.js`` and ``.sc.js`` are created, each
    holding one JavaScript array with an entry per processed sentence.
    Unless ``outputType`` is ``'web'`` or ``'compare'`` the alignment matrix,
    the target words and the confidence scores are also written to stdout.

    Args:
        data: Sequence of ``(src, tgt, rawAli)`` triples; ``rawAli`` must
            expose ``ndim`` (a NumPy array in practice).
        folder: Output directory.
        inputfile: Path whose base name prefixes the output files.
        outputType: ``'color'``, ``'block'`` or ``'block2'`` pick the console
            cell printer; ``'web'`` and ``'compare'`` suppress console output.
        num: 1-based sentence number to process; a value of 0 or less
            processes every sentence.  Exits the program when it exceeds the
            sentence count.
        refs: Optional reference translations (token sequences).  When truthy
            a smoothed BLEU score is appended to each confidence entry.
    """
    with open(folder + "/" + ntpath.basename(inputfile) + '.ali.js',
              'w',
              encoding='utf-8') as out_a_js:
        with open(folder + "/" + ntpath.basename(inputfile) + '.src.js',
                  'w',
                  encoding='utf-8') as out_s_js:
            with open(folder + "/" + ntpath.basename(inputfile) + '.trg.js',
                      'w',
                      encoding='utf-8') as out_t_js:
                with open(folder + "/" + ntpath.basename(inputfile) +
                          '.con.js',
                          'w',
                          encoding='utf-8') as out_c_js:
                    with open(folder + "/" + ntpath.basename(inputfile) +
                              '.sc.js',
                              'w',
                              encoding='utf-8') as out_sc_js:
                        # Open one JavaScript array per output file.
                        out_a_js.write(u'var alignments = [\n')
                        out_s_js.write(u'var sources = [\n')
                        out_t_js.write(u'var targets = [\n')
                        out_c_js.write(u'var confidences = [\n')
                        out_sc_js.write(u'var sentence_confidences = [\n')
                        # Convert the 1-based sentence number to an index;
                        # a negative result means "process everything".
                        num = int(num) - 1
                        if num > -1 and (num < len(data)):
                            data = [data[num]]
                        elif num >= len(data):
                            print(
                                'The selected sentence number is higher than the sentence count!\n'
                            )
                            printHelp()
                            sys.exit()
                        for i in range(0, len(data)):
                            (src, tgt, rawAli) = data[i]
                            # In case the source string is empty
                            if rawAli.ndim == 1:
                                rawAli = np.array([rawAli])
                            # In case both source & target strings are empty,
                            # or of length 1 without eos
                            elif rawAli.ndim == 0:
                                rawAli = np.array([[rawAli]])
                            # Crop to len(src) rows and to as many columns as
                            # there are non-empty target tokens.
                            ali = [
                                l[:len(list(filter(None, tgt)))]
                                for l in rawAli[:len(src)]
                            ]
                            srcTotal = []
                            trgTotal = []
                            tali = np.array(ali).transpose()
                            # Per-row and per-column confidence, each mapped
                            # through e^(-0.05 * x^2).
                            for a in range(0, len(ali)):
                                srcTotal.append(
                                    str(
                                        math.pow(
                                            math.e, -0.05 * math.pow(
                                                (getCP([ali[a]]) +
                                                 getEnt([ali[a]]) +
                                                 getRevEnt([ali[a]])), 2))))
                            for a in range(0, len(tali)):
                                trgTotal.append(
                                    str(
                                        math.pow(
                                            math.e, -0.05 * math.pow(
                                                (getCP([tali[a]]) +
                                                 getEnt([tali[a]]) +
                                                 getRevEnt([tali[a]])), 2))))
                            JoinedSource = " ".join(src)
                            JoinedTarget = " ".join(tgt)
                            # Letters only, with EOS/quot/apos removed; neither
                            # value is read again within this function.
                            StrippedSource = ''.join(
                                c for c in JoinedSource
                                if unicodedata.category(c).startswith(
                                    'L')).replace('EOS', '').replace(
                                        'quot', '').replace('apos', '')
                            StrippedTarget = ''.join(
                                c for c in JoinedTarget
                                if unicodedata.category(c).startswith(
                                    'L')).replace('EOS', '').replace(
                                        'quot', '').replace('apos', '')
                            # Get the confidence metrics
                            CDP = round(getCP(ali), 10)
                            APout = round(getEnt(ali), 10)
                            APin = round(getRevEnt(ali), 10)
                            Total = round(CDP + APout + APin, 10)
                            # Can we calculate BLEU?
                            bleuNumber = -1
                            if (refs):
                                try:
                                    # Imported lazily so NLTK stays optional.
                                    from nltk.translate import bleu
                                    from nltk.translate.bleu_score import SmoothingFunction
                                    sm = SmoothingFunction()
                                    # With a single selected sentence, data
                                    # has one entry, so index refs by num.
                                    refNumber = i if num < 0 else num
                                    deBpeRef = " ".join(
                                        refs[refNumber]).replace('@@ ', '')
                                    deBpeHyp = JoinedTarget.replace(
                                        '@@ ', '').replace('<EOS>',
                                                           '').strip()
                                    bleuNumber = round(
                                        bleu([deBpeRef.split()],
                                             deBpeHyp.split(),
                                             smoothing_function=sm.method3) *
                                        100, 2)
                                    bleuScore = u', ' + repr(bleuNumber)
                                except ImportError:
                                    sys.stdout.write(
                                        'NLTK not found! BLEU will not be calculated\n'
                                    )
                                    # Skip the BLEU attempt for the remaining
                                    # sentences.
                                    refs = False
                                    bleuScore = u''
                            else:
                                bleuScore = u''
                            # NOTE(review): the three replace() calls on '"',
                            # "'" and "&" map each character to itself as
                            # written; presumably they were meant to decode
                            # HTML entities - confirm.
                            jls = JoinedSource.replace('@@ ', '').replace(
                                '<EOS>', '').replace('"', '"').replace(
                                    "'", "'").replace("&", "&").replace(
                                        "@-@", "-").strip()
                            jlt = JoinedTarget.replace('@@ ', '').replace(
                                '<EOS>', '').replace('"', '"').replace(
                                    "'", "'").replace("&", "&").replace(
                                        "@-@", "-").strip()
                            longest = longestCommonSubstring(jls,
                                                             jlt).strip()
                            # Raises ZeroDivisionError when the cleaned
                            # target string is empty.
                            similarity = len(longest) / len(jlt)
                            # Penalize sentences with more than 4 tokens
                            if (len(tgt) > 4) and (similarity > 0.3):
                                # The more similar, the higher penalty
                                # It's worse to have more words with a higher similarity
                                # Let's make it between 0.7 and about 1.5 for veeeery long sentences
                                multiplier = ((0.8 + (len(tgt) * 0.01)) *
                                              (3 - ((1 - similarity) * 5)) *
                                              (0.7 + similarity) *
                                              math.tan(similarity))
                                Total = round(
                                    CDP + APout + APin - multiplier, 10)
                            # e^(-1(x^2))
                            CDP_pr = round(
                                math.pow(math.e, -1 * math.pow(CDP, 2)) * 100,
                                2)
                            # e^(-0.05(x^2))
                            APout_pr = round(
                                math.pow(math.e, -0.05 * math.pow(APout, 2)) *
                                100, 2)
                            APin_pr = round(
                                math.pow(math.e, -0.05 * math.pow(APin, 2)) *
                                100, 2)
                            Total_pr = round(
                                math.pow(math.e, -0.05 * math.pow(Total, 2)) *
                                100, 2)
                            # 1-e^(-0.0001(x^2))
                            Len = round((1 - math.pow(
                                math.e, -0.0001 *
                                math.pow(len(JoinedSource), 2))) * 100, 2)
                            # Tokens become JS string array elements by
                            # turning each space into '", "'.
                            out_s_js.write('["' +
                                           JoinedSource.replace(' ', '", "') +
                                           '"], \n')
                            out_t_js.write('["' +
                                           JoinedTarget.replace(' ', '", "') +
                                           '"], \n')
                            out_c_js.write(u'[' + repr(CDP_pr) + u', ' +
                                           repr(APout_pr) + u', ' +
                                           repr(APin_pr) + u', ' +
                                           repr(Total_pr) + u', ' +
                                           repr(Len) + u', ' +
                                           repr(len(JoinedSource)) + u', ' +
                                           repr(round(similarity * 100, 2)) +
                                           bleuScore + u'], \n')
                            out_sc_js.write(u'[[' + ", ".join(srcTotal) +
                                            u'], ' + u'[' +
                                            ", ".join(trgTotal) + u'], ' +
                                            u'], \n')
                            # Emit one [row, weight, column] triple per cell.
                            word = 0
                            out_a_js.write(u'[')
                            for ali_i in ali:
                                linePartC = 0
                                for ali_j in ali_i:
                                    # Maybe worth playing around with this for transformer (and convolutional) NMT output
                                    # if ali_j < 0.15:
                                    #     ali_j = 0
                                    out_a_js.write(u'[' + repr(word) + u', ' +
                                                   str(np.round(ali_j, 8)) +
                                                   u', ' + repr(linePartC) +
                                                   u'], ')
                                    linePartC += 1
                                    if outputType == 'color':
                                        printColor(ali_j)
                                    elif outputType == 'block':
                                        printBlock(ali_j)
                                    elif outputType == 'block2':
                                        printBlock2(ali_j)
                                # Label the console row with its source token.
                                if outputType != 'web' and outputType != 'compare':
                                    sys.stdout.write(src[word].encode(
                                        'utf-8',
                                        errors='replace').decode('utf-8'))
                                word += 1
                                if outputType != 'web' and outputType != 'compare':
                                    sys.stdout.write('\n')
                            # write target sentences
                            # build 2d array
                            # occupied_to[row] is the first free column of
                            # that console row.
                            occupied_to = []
                            outchars = []
                            outchars.append([])
                            tw = 0
                            for tword in tgt:
                                columns = len(tgt)
                                # Some characters use multiple symbols. Need to decode and then encode...
                                twchars = list(tword)
                                twlen = len(twchars)
                                # Each target word starts two columns after
                                # the previous one.
                                xpos = tw * 2
                                emptyline = 0
                                for el in range(0, len(occupied_to)):
                                    # if occupied, move to a new line!
                                    if occupied_to[el] < xpos:
                                        emptyline = el
                                        if len(outchars) < emptyline + 1:
                                            # add a new row
                                            outchars.append([])
                                        break
                                    if el == len(occupied_to) - 1:
                                        emptyline = el + 1
                                        if len(outchars) < emptyline + 1:
                                            outchars.append([])
                                # Pad the chosen row with spaces up to xpos.
                                for column in range(0, xpos):
                                    if len(outchars[emptyline]) <= column:
                                        outchars[emptyline].append(' ')
                                for charindex in range(0, twlen):
                                    if xpos + charindex == len(
                                            outchars[emptyline]):
                                        outchars[emptyline].append(
                                            twchars[charindex])
                                    else:
                                        # NOTE(review): writes at charindex,
                                        # not xpos + charindex - looks like it
                                        # overwrites the start of the row;
                                        # confirm.
                                        outchars[emptyline][
                                            charindex] = twchars[charindex]
                                if len(occupied_to) <= emptyline:
                                    occupied_to.append(xpos + twlen + 1)
                                else:
                                    occupied_to[emptyline] = xpos + twlen + 1
                                tw += 1
                            # print 2d array
                            if outputType != 'web' and outputType != 'compare':
                                for liline in outchars:
                                    sys.stdout.write(''.join(liline).encode(
                                        'utf-8', errors='replace').decode(
                                            'utf-8') + '\n')
                                # print scores
                                sys.stdout.write(
                                    '\nCoverage Deviation Penalty: \t\t' +
                                    repr(round(CDP, 8)) + ' (' +
                                    repr(CDP_pr) + '%)' + '\n')
                                sys.stdout.write(
                                    'Input Absentmindedness Penalty: \t' +
                                    repr(round(APin, 8)) + ' (' +
                                    repr(APin_pr) + '%)' + '\n')
                                sys.stdout.write(
                                    'Output Absentmindedness Penalty: \t' +
                                    repr(round(APout, 8)) + ' (' +
                                    repr(APout_pr) + '%)' + '\n')
                                sys.stdout.write('Confidence: \t\t\t\t' +
                                                 repr(round(Total, 8)) +
                                                 ' (' + repr(Total_pr) +
                                                 '%)' + '\n')
                                sys.stdout.write(
                                    'Similarity: \t\t\t\t' +
                                    repr(round(similarity * 100, 2)) + '%' +
                                    '\n')
                                if bleuNumber > -1:
                                    sys.stdout.write('BLEU: \t\t\t\t\t' +
                                                     repr(bleuNumber) + '\n')
                            # Close this sentence's alignment entry.
                            word = 0
                            out_a_js.write(u'], \n')
                            if outputType != 'web' and outputType != 'compare':
                                sys.stdout.write('\n')
                        # Close the five JavaScript arrays.
                        out_a_js.write(u'\n]')
                        out_s_js.write(u']')
                        out_t_js.write(u']')
                        out_c_js.write(u']')
                        out_sc_js.write(u']')
def translate(self, data_iter, step, attn_debug=False):
    """Decode ``data_iter``, write per-example reports, and log corpus metrics.

    Three files are written, all prefixed with ``self.args.result_path`` and
    the step number: ``.output`` (human-readable report with per-example
    BLEU / ROUGE-F and extraction P/R/F1), ``.gold_test`` (references) and
    ``.pred_test`` (generated summaries).  When ``step`` is not ``-1`` the
    corpus-level extraction score, BLEU, ROUGE and length statistics are
    logged.

    Args:
        data_iter: Iterable of batches accepted by ``self.translate_batch``.
        step: Integer step used in file names and log messages; ``-1``
            disables the corpus-level metrics.
        attn_debug: Unused here; kept for interface compatibility.
    """
    self.model.eval()

    prefix = self.args.result_path
    output_path = prefix + '.%d.output' % step
    gold_path = prefix + '.%d.gold_test' % step
    pred_path = prefix + '.%d.pred_test' % step

    # One smoothing callable is enough; it was rebuilt per example before.
    smoothing = SmoothingFunction().method1

    ct = 0
    ext_acc_num = 0
    ext_pred_num = 0
    ext_gold_num = 0

    # The ``with`` guarantees every handle is closed even if decoding or
    # scoring raises part-way through the data.
    with codecs.open(output_path, 'w', 'utf-8') as output_file, \
            codecs.open(gold_path, 'w', 'utf-8') as gold_out_file, \
            codecs.open(pred_path, 'w', 'utf-8') as pred_out_file, \
            torch.no_grad():
        rouge = Rouge()
        for batch in data_iter:
            output_data, tgt_data, ext_pred, ext_gold = self.translate_batch(
                batch)
            translations = self.from_batch_test(batch, output_data, tgt_data)

            for idx, translation in enumerate(translations):
                origin_sent, pred_summ, gold_data = translation
                if ct % 100 == 0:
                    print("Processing %d" % ct)

                output_file.write("ID : %d\n" % ct)
                output_file.write("ORIGIN : \n " +
                                  origin_sent.replace('<S>', '\n ') + "\n")
                output_file.write("GOLD : " + gold_data.strip() + "\n")
                output_file.write("DOC_GEN : " + pred_summ.strip() + "\n")

                rouge_score = rouge.get_scores(pred_summ, gold_data)
                bleu_score = sentence_bleu([gold_data.split()],
                                           pred_summ.split(),
                                           smoothing_function=smoothing)
                output_file.write(
                    "DOC_GEN bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f\n" %
                    (bleu_score, rouge_score[0]["rouge-1"]["f"],
                     rouge_score[0]["rouge-2"]["f"],
                     rouge_score[0]["rouge-l"]["f"]))

                # ext f1 calculate: the overlap is the number of entries that
                # disappear when the concatenation is de-duplicated (exact
                # only when neither list repeats an index).
                combined = ext_pred[idx] + ext_gold[idx]
                acc_num = len(combined) - len(set(combined))
                pred_num = len(ext_pred[idx])
                gold_num = len(ext_gold[idx])
                ext_acc_num += acc_num
                ext_pred_num += pred_num
                ext_gold_num += gold_num
                f1, p, r = test_f1(acc_num, pred_num, gold_num)

                output_file.write(
                    "EXT_GOLD: [" +
                    ','.join([str(i) for i in sorted(ext_gold[idx])]) + "]\n")
                output_file.write(
                    "EXT_PRED: [" +
                    ','.join([str(i) for i in sorted(ext_pred[idx])]) + "]\n")
                output_file.write("EXT_SCORE P/R/F1: %.4f/%.4f/%.4f\n\n" %
                                  (p, r, f1))

                pred_out_file.write(pred_summ.strip() + '\n')
                gold_out_file.write(gold_data.strip() + '\n')
                ct += 1

            # Flush once per batch so partial results survive an interruption.
            pred_out_file.flush()
            gold_out_file.flush()
            output_file.flush()

    if step != -1:
        # NOTE(review): test_bleu is called as (hypothesis, gold) here;
        # confirm the expected argument order against its definition.
        pred_bleu = test_bleu(pred_path, gold_path)
        file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
        pred_rouges = file_rouge.get_scores(avg=True)
        f1, p, r = test_f1(ext_acc_num, ext_pred_num, ext_gold_num)

        self.logger.info(
            'Ext Sent Score at step %d: \n>> P/R/F1: %.2f/%.2f/%.2f' %
            (step, p * 100, r * 100, f1 * 100))
        self.logger.info(
            'Gold Length at step %d: %.2f' %
            (step, test_length(gold_path, gold_path, ratio=False)))
        self.logger.info('Prediction Length ratio at step %d: %.2f' %
                         (step, test_length(pred_path, gold_path)))
        self.logger.info('Prediction Bleu at step %d: %.2f' %
                         (step, pred_bleu * 100))
        self.logger.info('Prediction Rouges at step %d: \n%s' %
                         (step, rouge_results_to_str(pred_rouges)))
def compute_bleu(self, predictions):
    """Return the mean sentence BLEU of ``predictions`` against ``self.gt``.

    Both inputs are loaded with ``self.readresult``.  Each candidate caption
    is normalised according to the ``VqaMedEvaluator`` class flags
    (``case_sensitive``, ``remove_stopwords``, ``stemming``), scored against
    the ground-truth caption with the same key, and the scores are summed
    and divided by the number of ground-truth entries.

    Args:
        predictions: Value passed to ``self.readresult`` to load the
            candidate captions.

    Returns:
        The summed BLEU score divided by ``len(gt_pairs)``.

    Raises:
        KeyError: If a candidate key has no ground-truth caption.
        ZeroDivisionError: If the ground truth is empty.
    """
    # Hide warnings
    warnings.filterwarnings('ignore')

    # NLTK
    # Download Punkt tokenizer (for word_tokenize method)
    # Download stopwords (for stopword removal)
    nltk.download('punkt')
    nltk.download('stopwords')

    # English Stopwords
    stops = set(stopwords.words("english"))

    # Stemming
    stemmer = SnowballStemmer("english")

    # Remove punctuation from string
    translator = str.maketrans('', '', string.punctuation)

    # Built once instead of once per caption.
    smoothing = SmoothingFunction().method0

    candidate_pairs = self.readresult(predictions)
    gt_pairs = self.readresult(self.gt)

    # Define max score and current score
    max_score = len(gt_pairs)
    current_score = 0

    for image_key in candidate_pairs:
        # Get candidate and GT caption
        candidate_caption = candidate_pairs[image_key]
        gt_caption = gt_pairs[image_key]

        # Optional - Go to lowercase
        if not VqaMedEvaluator.case_sensitive:
            candidate_caption = candidate_caption.lower()
            gt_caption = gt_caption.lower()

        # Split caption into individual words (remove punctuation)
        candidate_words = nltk.tokenize.word_tokenize(
            candidate_caption.translate(translator))
        gt_words = nltk.tokenize.word_tokenize(
            gt_caption.translate(translator))

        # Optional - Remove stopwords
        if VqaMedEvaluator.remove_stopwords:
            candidate_words = [
                word for word in candidate_words
                if word.lower() not in stops
            ]
            gt_words = [word for word in gt_words if word.lower() not in stops]

        # Optional - Apply stemming
        if VqaMedEvaluator.stemming:
            candidate_words = [stemmer.stem(word) for word in candidate_words]
            gt_words = [stemmer.stem(word) for word in gt_words]

        # Calculate BLEU score for the current caption
        if len(gt_words) == 0 and len(candidate_words) == 0:
            # If both the GT and candidate are empty, assign a score of 1
            bleu_score = 1
        else:
            try:
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    [gt_words],
                    candidate_words,
                    smoothing_function=smoothing)
            except ZeroDivisionError:
                # BLEU is undefined for this pair.  Score it as 0 explicitly:
                # previously the except branch left ``bleu_score`` untouched,
                # so the previous caption's score was added again (or the
                # name was unbound on the first caption).
                bleu_score = 0

        # Increase calculated score
        current_score += bleu_score

    return current_score / max_score
def get_sentence_bleu(self, example, hyp):
    """Return the method3-smoothed sentence BLEU of one hypothesis.

    The reference is the tokenized ``snippet`` stored in the example's
    ``example_dict`` metadata; the candidate is the tokenized
    ``hyp.decanonical_code``.
    """
    reference_tokens = tokenize_for_bleu_eval(
        example.meta['example_dict']['snippet'])
    candidate_tokens = tokenize_for_bleu_eval(hyp.decanonical_code)
    smoother = SmoothingFunction().method3
    return sentence_bleu([reference_tokens],
                         candidate_tokens,
                         smoothing_function=smoother)
def evaluate_dataset(self,
                     dataset,
                     decode_results,
                     fast_mode=False,
                     args=None):
    """Score decoded hypotheses against the reference snippets.

    Reference and hypothesis tokenizations are cached on the example and
    hypothesis objects.  Hypotheses that cannot be converted back to
    decanonical code are pruned, which modifies ``decode_results`` in place.

    Args:
        dataset: A ``Dataset`` (its ``examples`` are used) or a sequence of
            examples.
        decode_results: One list of hypotheses per example, best first.
        fast_mode: When true, only corpus BLEU of the top hypotheses is
            computed and returned.
        args: Optional namespace; when ``args.save_decode_to`` is set, the
            top hypothesis of every example is written to
            ``<save_decode_to>.txt`` (not in ``fast_mode``, where the file is
            only created).

    Returns:
        The corpus BLEU value in ``fast_mode``; otherwise a dict with
        ``corpus_bleu``, ``oracle_corpus_bleu``, ``avg_sent_bleu``,
        ``oracle_avg_sent_bleu``, ``exact_match`` and ``oracle_exact_match``.
    """
    output_plaintext_file = None
    if args and args.save_decode_to:
        output_plaintext_file = open(args.save_decode_to + '.txt',
                                     'w',
                                     encoding='utf-8')

    # try/finally so the output file is closed on every exit path; it used
    # to be left open, including on the fast_mode early return.
    try:
        examples = dataset.examples if isinstance(dataset,
                                                  Dataset) else dataset
        assert len(examples) == len(decode_results)

        # speed up, cache tokenization results
        if not hasattr(examples[0], 'reference_code_tokens'):
            for example in examples:
                setattr(
                    example, 'reference_code_tokens',
                    tokenize_for_bleu_eval(
                        example.meta['example_dict']['snippet']))

        # Probe the first available hypothesis to see whether tokens are
        # already cached.  Indexing decode_results[0][0] directly raised
        # IndexError whenever the first example had no hypotheses.
        probe_hyp = next((hyps[0] for hyps in decode_results if hyps), None)
        if probe_hyp is not None and not hasattr(probe_hyp,
                                                 'decanonical_code_tokens'):
            for i, example in enumerate(examples):
                hyp_list = decode_results[i]
                # here we prune any hypothesis that throws an error when
                # converting back to the decanonical code!
                # This modifies the decode_results in-place!
                # NOTE(review): a hypothesis that already carries
                # ``decanonical_code`` is skipped and therefore also dropped
                # from the filtered list - confirm this is intended.
                filtered_hyp_list = []
                for hyp in hyp_list:
                    if not hasattr(hyp, 'decanonical_code'):
                        try:
                            hyp.decanonical_code = decanonicalize_code(
                                hyp.code, slot_map=example.meta['slot_map'])
                            if hyp.decanonical_code:
                                hyp.decanonical_code_tokens = tokenize_for_bleu_eval(
                                    hyp.decanonical_code)
                                filtered_hyp_list.append(hyp)
                        except Exception:
                            # Deliberate best-effort pruning.  Narrowed from
                            # a bare ``except`` so KeyboardInterrupt and
                            # SystemExit still propagate.
                            pass

                decode_results[i] = filtered_hyp_list

        if fast_mode:
            references = [e.reference_code_tokens for e in examples]
            hypotheses = [
                hyp_list[0].decanonical_code_tokens if hyp_list else []
                for hyp_list in decode_results
            ]

            bleu_tup = compute_bleu([[x] for x in references],
                                    hypotheses,
                                    smooth=False)
            bleu = bleu_tup[0]

            return bleu

        tokenized_ref_snippets = []
        hyp_code_tokens = []
        best_hyp_code_tokens = []
        sm_func = SmoothingFunction().method3
        sent_bleu_scores = []
        oracle_bleu_scores = []
        oracle_exact_match = []
        for example, hyp_list in zip(examples, decode_results):
            tokenized_ref_snippets.append(example.reference_code_tokens)
            example_hyp_bleu_scores = []
            if hyp_list:
                for hyp in hyp_list:
                    hyp.bleu_score = sentence_bleu(
                        [example.reference_code_tokens],
                        hyp.decanonical_code_tokens,
                        smoothing_function=sm_func)
                    hyp.is_correct = self.is_hyp_correct(example, hyp)

                    example_hyp_bleu_scores.append(hyp.bleu_score)

                top_decanonical_code_tokens = hyp_list[
                    0].decanonical_code_tokens
                sent_bleu_score = hyp_list[0].bleu_score

                # Oracle = the hypothesis with the best sentence BLEU.
                best_hyp_idx = np.argmax(example_hyp_bleu_scores)
                oracle_sent_bleu = example_hyp_bleu_scores[best_hyp_idx]
                _best_hyp_code_tokens = hyp_list[
                    best_hyp_idx].decanonical_code_tokens
            else:
                top_decanonical_code_tokens = []
                sent_bleu_score = 0.
                oracle_sent_bleu = 0.
                _best_hyp_code_tokens = []

            # write results to file
            if output_plaintext_file:
                output_plaintext_file.write(
                    " ".join(top_decanonical_code_tokens) + '\n')
            oracle_exact_match.append(
                any(hyp.is_correct for hyp in hyp_list))
            hyp_code_tokens.append(top_decanonical_code_tokens)
            sent_bleu_scores.append(sent_bleu_score)
            oracle_bleu_scores.append(oracle_sent_bleu)
            best_hyp_code_tokens.append(_best_hyp_code_tokens)

        bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets],
                                hyp_code_tokens,
                                smooth=False)
        corpus_bleu = bleu_tup[0]

        bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets],
                                best_hyp_code_tokens,
                                smooth=False)
        oracle_corpus_bleu = bleu_tup[0]

        avg_sent_bleu = np.average(sent_bleu_scores)
        oracle_avg_sent_bleu = np.average(oracle_bleu_scores)
        exact = sum([
            1 if h == r else 0
            for h, r in zip(hyp_code_tokens, tokenized_ref_snippets)
        ]) / float(len(examples))
        oracle_exact_match = np.average(oracle_exact_match)

        return {
            'corpus_bleu': corpus_bleu,
            'oracle_corpus_bleu': oracle_corpus_bleu,
            'avg_sent_bleu': avg_sent_bleu,
            'oracle_avg_sent_bleu': oracle_avg_sent_bleu,
            'exact_match': exact,
            'oracle_exact_match': oracle_exact_match
        }
    finally:
        if output_plaintext_file:
            output_plaintext_file.close()
def train():
    """Fine-tune the translation graph with sampled image captions and
    periodically evaluate corpus BLEU on a validation/test split.

    Command-line options select the translation direction (``--mode``,
    e.g. ``de2en``), the visible GPU, output locations and the evaluation
    task/split.  Every 100 training steps a checkpoint is written to
    ``--save_log``, the validation set is decoded with the inference graph,
    and the resulting BLEU score is appended to ``--save_file``.
    """
    # argparse
    parser = argparse.ArgumentParser(description='manual to this script')
    parser.add_argument('--mode', type=str, default="de2en")
    # The default must already be a string: argparse applies `type` only to
    # string defaults, and os.environ rejects non-str values.
    parser.add_argument('--gpu', type=str, default="0")
    parser.add_argument('--save_dir', type=str, default="result")
    parser.add_argument('--save_file', type=str, default="bleu.txt")
    parser.add_argument('--save_log', type=str, default="logdir")
    parser.add_argument('--task', type=str, default="task2")
    parser.add_argument('--set', type=str, default="val")
    args = parser.parse_args()

    mode = args.mode
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    save_path = args.save_log
    save_file = args.save_file
    save_dir = args.save_dir
    task = args.task
    split = args.set  # renamed locally to avoid shadowing the builtin `set`
    src_lang, tgt_lang = mode[:2], mode[-2:]

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    # prepare
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    # index -> token table of the target language (replaces an eval() lookup)
    translator = {"de": idx2de, "en": idx2en}[tgt_lang]

    # load_graph
    g = Graph(is_training=True, beam_width=1, mode=mode)
    g_val = Graph(is_training=False, beam_width=5, mode=mode)
    print("Graph loaded")

    # Load data
    X, Image_index, Y, Targets = load_rl_data(language=src_lang)
    images = np.load(image_path.format("train"))
    num_batch = int(math.ceil(len(X) / hp.batch_size))
    x_val, Targets_val, idents = load_test_rl_data(
        set=split, task=task, language=src_lang)
    num_batch_val = int(math.ceil(len(x_val) / hp.batch_size_test))

    # prepare ref file: "task2" has five references per sentence, otherwise one
    num_refs = 5 if task == "task2" else 1
    f_ref = [
        open("{}/{}_ref_{}".format(save_dir, mode, n), "w+")
        for n in range(1, num_refs + 1)
    ]
    for target in Targets_val:
        for k, ref_file in enumerate(f_ref):
            if task == "task2":
                ref_file.write(target[k] + "\n")
            else:
                ref_file.write(target + "\n")
    for ref_file in f_ref:
        ref_file.close()
        concat(ref_file.name)

    # Re-read the post-processed reference files and regroup them so that
    # Target_val_split[i] holds every tokenized reference of sentence i.
    val_sum = len(Targets_val)
    temp = []
    for ref_file in f_ref:
        with open(ref_file.name, "r") as h:
            temp.extend(line.strip() for line in h.readlines())
    Target_val_split = [
        [temp[i + j * val_sum].split() for j in range(len(f_ref))]
        for i in range(val_sum)
    ]

    save_num = 1

    # saver
    saver1 = tf.train.Saver(var_list=g.value_list, max_to_keep=100)
    if mode == "de2en":
        saver2 = tf.train.Saver(var_list=g.value_list_en)
    else:
        saver2 = tf.train.Saver(var_list=g.value_list_de)
    saver_val = tf.train.Saver(var_list=g_val.value_list)

    # config
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config, graph=g.graph) as sess:
        ## Restore parameters
        sess.run(tf.global_variables_initializer())
        caption_logdir = getattr(hp, "logdir_cap_{}".format(tgt_lang))
        if mode == "de2en":
            saver1.restore(sess, "{}_pre_y".format(mode) + "/step_6500")
            saver2.restore(sess, caption_logdir + "/model_step_3999")
        elif mode == "en2de":
            saver1.restore(sess, "{}_pre_y".format(mode) + "/step_6000")
            saver2.restore(sess, caption_logdir + "/model_step_9999")
        print("Restored!")

        for epoch in range(hp.num_epochs):
            for i in range(num_batch):
                step = epoch * num_batch + i + 1
                lr = hp.lr
                batch = slice(i * hp.batch_size, (i + 1) * hp.batch_size)
                image = images[Image_index[batch]]
                x = X[batch]
                y = Y[batch]

                if step % 100 == 0:
                    hypo_path = "{}/{}_hypo_{}".format(save_dir, mode, save_num)
                    save_num += 1

                    # save log
                    saver1.save(sess, save_path + "/step_{}".format(step))

                    # Decode the validation set with the checkpoint just
                    # written; `with` closes the file even if decoding fails.
                    with open(hypo_path, "w+") as f_hypo:
                        with tf.Session(graph=g_val.graph) as sess_val:
                            sess_val.run(tf.global_variables_initializer())
                            saver_val.restore(
                                sess_val,
                                tf.train.latest_checkpoint(save_path))
                            for j in range(num_batch_val):
                                # cal_pred
                                x_val_batch = x_val[
                                    j * hp.batch_size_test:
                                    (j + 1) * hp.batch_size_test]
                                feed_dict = {
                                    g_val.x: x_val_batch,
                                    g_val.dropout_rate_tran: 0.0,
                                    g_val.is_inference: True
                                }
                                preds = sess_val.run(g_val.preds, feed_dict)
                                # Drop the first column and right-pad with
                                # a column of zeros.
                                preds = np.concatenate(
                                    (preds[:, 1:],
                                     np.zeros((len(x_val_batch), 1))),
                                    axis=1)
                                for pred in preds:  # sentence-wise
                                    got = " ".join(
                                        translator[idx] for idx in pred
                                    ).split("</S>")[0].strip()
                                    f_hypo.write(got + "\n")

                    concat(hypo_path)
                    with open(hypo_path, "r") as f:
                        hypos = [line.strip().split() for line in f.readlines()]
                    n_bleu = corpus_bleu(
                        Target_val_split, hypos,
                        smoothing_function=SmoothingFunction().method2)
                    with open(save_file, "a+") as h:
                        h.write(str(n_bleu) + "\n")

                # sample from image
                feed_dict_sample = {
                    g.dropout_rate: 0.0,
                    g.lstm_drop_rate: 0.0,
                    g.image: image,
                    g.is_inference: True
                }
                preds = sess.run(g.preds_sample, feed_dict_sample)
                preds = process(preds) * preds
                mix_y, sample_index = mix(preds, y, x_ratio=0.5)

                # train
                feed_dict = {
                    g.x: x,
                    g.image: image,
                    g.dropout_rate: hp.dropout_rate,
                    g.dropout_rate_tran: hp.dropout_rate_tran,
                    g.lstm_drop_rate: hp.lstm_drop_rate,
                    g.index: sample_index,
                    g.y: mix_y,
                    g.is_inference: False,
                    g.lr: lr
                }
                _, loss = sess.run([g.train_op, g.loss], feed_dict)
                print(loss)
def score(data_path: str, encoder_path: str, vocab_path: str,
          captions_file: str, sample_length: int = 30, N=4,
          smoothing='method1', output: str=None):
    """Report leave-one-out BLEU-N for generated and human captions.

    For every utterance with at least two reference captions, each
    reference is held out in turn; both the held-out human caption and the
    system caption are scored against the remaining references.  The best
    score over all turns is kept per utterance, and the means over all
    utterances are written to ``output``.

    Args:
        data_path: feature location handed to ``parsecopyfeats``.
        encoder_path: checkpoint holding ``encodermodel``, ``decodermodel``,
            ``scaler`` and ``config``.
        vocab_path: saved vocabulary exposing ``idx2word``.
        captions_file: JSON file with ``filename`` and ``tokens`` columns.
        sample_length: maximum number of decoded tokens.
        N: BLEU order; uniform weights ``1/N`` are used.
        smoothing: name of the ``SmoothingFunction`` method to apply.
        output: passed to ``stdout_or_file`` to obtain the writer.
    """
    dump = torch.load(encoder_path, map_location=lambda storage, loc: storage)

    # Key the references by the integer stem of the file name.
    reference_df = pd.read_json(captions_file)
    reference_df['filename'] = reference_df['filename'].apply(
        lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    reference_grouped_df = reference_df.groupby(
        ['filename'])['tokens'].apply(list).to_dict()

    encodermodel = dump['encodermodel']
    decodermodel = dump['decodermodel']
    # Some scaler (sklearn standardscaler)
    scaler = dump['scaler']
    # Also load previous training config
    config_parameters = dump['config']

    vocab = torch.load(vocab_path)

    encodermodel = encodermodel.to(DEVICE).eval()
    decodermodel = decodermodel.to(DEVICE).eval()

    smoothing_fun = getattr(SmoothingFunction(), smoothing)
    kaldi_string = parsecopyfeats(
        data_path, **config_parameters['feature_args'])

    bleu_score = []
    human_bleu_score = []
    bleu_weights = [1./N]*N

    with stdout_or_file(output) as writer:
        with torch.no_grad():
            for k, features in kaldi_io.read_mat_ark(kaldi_string):
                k = int(k)
                if k not in reference_grouped_df:
                    continue

                # Leave-one-out needs at least two references; check this
                # before spending a forward pass on the utterance.
                reference_sent = reference_grouped_df[k]
                if len(reference_sent) <= 1:
                    continue

                features = scaler.transform(features)
                # Add single batch dimension
                features = torch.from_numpy(features).to(DEVICE).unsqueeze(0)
                # Generate a caption embedding
                encoded_feature, hiddens = encodermodel(features)
                sampled_ids = decodermodel.sample(
                    encoded_feature, states=hiddens, maxlength=sample_length)
                # (1, max_seq_length) -> (max_seq_length)
                sampled_ids = sampled_ids[0].cpu().numpy()

                # Convert word ids to words, dropping start/end markers
                candidate = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    if word == '<end>':
                        break
                    elif word == '<start>':
                        continue
                    candidate.append(word)

                human_scores = []
                system_scores = []
                for turn, human_cand in enumerate(reference_sent):
                    human_ref = [
                        x for i, x in enumerate(reference_sent) if i != turn
                    ]
                    human_scores.append(
                        sentence_bleu(
                            human_ref, human_cand,
                            smoothing_function=smoothing_fun,
                            weights=bleu_weights))
                    system_scores.append(
                        sentence_bleu(
                            human_ref, candidate,
                            smoothing_function=smoothing_fun,
                            weights=bleu_weights))

                # Keep the best score over the held-out turns
                human_bleu_score.append(max(human_scores))
                bleu_score.append(max(system_scores))

        writer.write("BLEU-{} Scores\n".format(N))
        writer.write("System {:10.3f}\n".format(np.mean(bleu_score)))
        writer.write("Human {:10.3f}\n".format(np.mean(human_bleu_score)))
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction import csv TEXT_FILE = './samples/text_val.12' LABELS_FILE = './samples/labels_val.12' smoothing_foonction = SmoothingFunction() semples = [] semples.append([ 'Input Sentence', 'Input Label', 'Generated Sentence', 'Predicted Label', 'BLEU' ]) with open(TEXT_FILE, 'r') as input_file_text: lines_text = input_file_text.readlines() with open(LABELS_FILE, 'r') as input_file_labels: lines_labels = input_file_labels.readlines() for i in range(0, len(lines_text) - 1, 2): input_sentence = lines_text[i].strip() generated_sentence = lines_text[i + 1].strip() input_label = 1 - int(lines_labels[i]) predicted_label = int(lines_labels[i + 1])
def get_bleu(self):
    """Return the mean sentence-level BLEU of the test file.

    Each line of ``self.test_data`` is tokenized with
    ``nltk.word_tokenize`` and scored against ``self.get_reference()``
    using uniform weights over ``self.gram`` n-gram orders and ``method1``
    smoothing.

    Returns:
        The average BLEU over all lines, or ``0.0`` if the file is empty.
    """
    ngram = self.gram
    reference = self.get_reference()
    weight = tuple(1. / ngram for _ in range(ngram))
    # Build the smoothing function once instead of once per line.
    smoothing = SmoothingFunction().method1

    bleu = []
    with open(self.test_data, encoding='utf-8') as test_data:
        for hypothesis in test_data:
            hypothesis = nltk.word_tokenize(hypothesis)
            bleu.append(nltk.translate.bleu_score.sentence_bleu(
                reference, hypothesis, weight,
                smoothing_function=smoothing))

    if not bleu:
        # Empty test file: avoid ZeroDivisionError.
        return 0.0
    return sum(bleu) / len(bleu)
def main(data_path):
    """Rebuild the Cornell movie-dialogs chatbot transformer, load trained
    weights and print sample predictions plus average BLEU scores.

    The function downloads and preprocesses the corpus, builds a subword
    tokenizer and the tf.data pipelines, defines the transformer
    architecture, loads ``transformer_weights_100.h5`` from ``data_path``
    and finally reports sentence-BLEU averages on the 80/20 train/test
    split of the raw question/answer pairs.

    Args:
        data_path: directory containing ``transformer_weights_100.h5``.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU {}'.format(tpu.cluster_spec().as_dict()['worker']))
    except ValueError:
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:
        strategy = tf.distribute.get_strategy()

    print("REPLICAS: {}".format(strategy.num_replicas_in_sync))

    # Maximum sentence length
    MAX_LENGTH = 40
    # Maximum number of samples to preprocess
    MAX_SAMPLES = 15000
    # For tf.data.Dataset
    BATCH_SIZE = 64 * strategy.num_replicas_in_sync
    BUFFER_SIZE = 20000
    # For Transformer
    NUM_LAYERS = 2
    D_MODEL = 256
    NUM_HEADS = 8
    UNITS = 512
    DROPOUT = 0.1

    print('---')
    print('Loading the data...')
    path_to_zip = tf.keras.utils.get_file(
        'cornell_movie_dialogs.zip',
        origin=
        'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
        extract=True)

    path_to_dataset = os.path.join(
        os.path.dirname(path_to_zip), "cornell movie-dialogs corpus")

    path_to_movie_lines = os.path.join(path_to_dataset, 'movie_lines.txt')
    path_to_movie_conversations = os.path.join(path_to_dataset,
                                               'movie_conversations.txt')

    def preprocess_sentence(sentence):
        """Lower-case, space out punctuation, expand contractions and strip
        every character other than letters and ``? . ! ,``."""
        sentence = sentence.lower().strip()
        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
        sentence = re.sub(r'[" "]+', " ", sentence)
        # removing contractions
        sentence = re.sub(r"i'm", "i am", sentence)
        sentence = re.sub(r"he's", "he is", sentence)
        sentence = re.sub(r"she's", "she is", sentence)
        sentence = re.sub(r"it's", "it is", sentence)
        sentence = re.sub(r"that's", "that is", sentence)
        # NOTE(review): "what's" -> "that is" looks like a typo for
        # "what is".  It is left untouched on purpose: the tokenizer below
        # is rebuilt from this preprocessed text, so changing it could
        # shift the vocabulary the saved weights were trained with.
        sentence = re.sub(r"what's", "that is", sentence)
        sentence = re.sub(r"where's", "where is", sentence)
        sentence = re.sub(r"how's", "how is", sentence)
        sentence = re.sub(r"\'ll", " will", sentence)
        sentence = re.sub(r"\'ve", " have", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"\'d", " would", sentence)
        sentence = re.sub(r"won't", "will not", sentence)
        sentence = re.sub(r"can't", "cannot", sentence)
        sentence = re.sub(r"n't", " not", sentence)
        sentence = re.sub(r"n'", "ng", sentence)
        sentence = re.sub(r"'bout", "about", sentence)
        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
        sentence = sentence.strip()
        return sentence

    print('Preprocessing the data...')

    def load_conversations():
        """Return (inputs, outputs): consecutive utterance pairs, capped at
        MAX_SAMPLES."""
        # dictionary of line id to text
        id2line = {}
        with open(path_to_movie_lines, errors='ignore') as file:
            lines = file.readlines()
        for line in lines:
            parts = line.replace('\n', '').split(' +++$+++ ')
            id2line[parts[0]] = parts[4]

        inputs, outputs = [], []
        with open(path_to_movie_conversations, 'r') as file:
            lines = file.readlines()
        for line in lines:
            parts = line.replace('\n', '').split(' +++$+++ ')
            # get conversation in a list of line ID
            conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
            for i in range(len(conversation) - 1):
                inputs.append(preprocess_sentence(id2line[conversation[i]]))
                outputs.append(preprocess_sentence(id2line[conversation[i + 1]]))
                if len(inputs) >= MAX_SAMPLES:
                    return inputs, outputs
        return inputs, outputs

    questions, answers = load_conversations()

    print('Train-Test split...')
    X_train = questions[:round(0.8*len(questions))]
    y_train = answers[:round(0.8*len(answers))]
    X_test = questions[round(0.8*len(questions)):]
    y_test = answers[round(0.8*len(answers)):]

    print('Tokenizing...')
    # Build tokenizer using tfds for both questions and answers
    tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        questions + answers, target_vocab_size=2**13)

    print('START and END tags appended...')
    # Define start and end token to indicate the start and end of a sentence
    START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

    # Vocabulary size plus start and end token
    VOCAB_SIZE = tokenizer.vocab_size + 2

    # Tokenize, filter and pad sentences
    def tokenize_and_filter(inputs, outputs):
        tokenized_inputs, tokenized_outputs = [], []

        for (sentence1, sentence2) in zip(inputs, outputs):
            # tokenize sentence
            sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
            sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
            # check tokenized sentence max length
            if len(sentence1) <= MAX_LENGTH and len(sentence2) <= MAX_LENGTH:
                tokenized_inputs.append(sentence1)
                tokenized_outputs.append(sentence2)

        # pad tokenized sentences
        tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
            tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
        tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
            tokenized_outputs, maxlen=MAX_LENGTH, padding='post')

        return tokenized_inputs, tokenized_outputs

    questions, answers = tokenize_and_filter(questions, answers)

    print('---')
    print('Vocab size: {}'.format(VOCAB_SIZE))
    print('Number of samples: {}'.format(len(questions)))

    # decoder inputs use the previous target as input
    # remove START_TOKEN from targets
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'inputs': questions[:12000],
            'dec_inputs': answers[:12000, :-1]
        },
        {
            'outputs': answers[:12000, 1:]
        },
    ))

    dataset = dataset.cache()
    dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # VALIDATION DATASET
    # remove START_TOKEN from targets
    val_dataset = tf.data.Dataset.from_tensor_slices((
        {
            'inputs': questions[12000:],
            'dec_inputs': answers[12000:, :-1]
        },
        {
            'outputs': answers[12000:, 1:]
        },
    ))

    val_dataset = val_dataset.cache()
    val_dataset = val_dataset.shuffle(BUFFER_SIZE)
    val_dataset = val_dataset.batch(BATCH_SIZE)
    val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    print(dataset)
    print(val_dataset)

    print('Creating scaled dot product attention...')

    def scaled_dot_product_attention(query, key, value, mask):
        """Calculate the attention weights. """
        matmul_qk = tf.matmul(query, key, transpose_b=True)

        # scale matmul_qk
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)

        # add the mask to zero out padding tokens
        if mask is not None:
            logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k)
        attention_weights = tf.nn.softmax(logits, axis=-1)

        output = tf.matmul(attention_weights, value)

        return output

    print('Creating Multi head attention...')

    class MultiHeadAttention(tf.keras.layers.Layer):
        """Multi-head attention over a dict of query/key/value/mask."""

        def __init__(self, d_model, num_heads, name="multi_head_attention"):
            super(MultiHeadAttention, self).__init__(name=name)
            self.num_heads = num_heads
            self.d_model = d_model

            assert d_model % self.num_heads == 0

            self.depth = d_model // self.num_heads

            self.query_dense = tf.keras.layers.Dense(units=d_model)
            self.key_dense = tf.keras.layers.Dense(units=d_model)
            self.value_dense = tf.keras.layers.Dense(units=d_model)

            self.dense = tf.keras.layers.Dense(units=d_model)

        def get_config(self):
            config = super(MultiHeadAttention, self).get_config()
            config.update({
                'num_heads': self.num_heads,
                'd_model': self.d_model,
            })
            return config

        def split_heads(self, inputs, batch_size):
            inputs = tf.reshape(
                inputs, shape=(batch_size, -1, self.num_heads, self.depth))
            return tf.transpose(inputs, perm=[0, 2, 1, 3])

        def call(self, inputs):
            query, key, value, mask = inputs['query'], inputs['key'], inputs[
                'value'], inputs['mask']
            batch_size = tf.shape(query)[0]

            # linear layers
            query = self.query_dense(query)
            key = self.key_dense(key)
            value = self.value_dense(value)

            # split heads
            query = self.split_heads(query, batch_size)
            key = self.split_heads(key, batch_size)
            value = self.split_heads(value, batch_size)

            # scaled dot-product attention
            scaled_attention = scaled_dot_product_attention(query, key, value, mask)

            scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

            # concatenation of heads
            concat_attention = tf.reshape(scaled_attention,
                                          (batch_size, -1, self.d_model))

            # final linear layer
            outputs = self.dense(concat_attention)

            return outputs

    def create_padding_mask(x):
        # 1.0 wherever the token id is 0 (padding)
        mask = tf.cast(tf.math.equal(x, 0), tf.float32)
        # (batch_size, 1, 1, sequence length)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(x):
        seq_len = tf.shape(x)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        padding_mask = create_padding_mask(x)
        return tf.maximum(look_ahead_mask, padding_mask)

    print('Creating positional Encoding...')

    class PositionalEncoding(tf.keras.layers.Layer):
        """Adds a fixed sinusoidal position signal to its inputs."""

        def __init__(self, position, d_model):
            super(PositionalEncoding, self).__init__()
            # Stored so that get_config() can report them; previously
            # get_config() read attributes that were never assigned.
            self.position = position
            self.d_model = d_model
            self.pos_encoding = self.positional_encoding(position, d_model)

        def get_config(self):
            config = super(PositionalEncoding, self).get_config()
            config.update({
                'position': self.position,
                'd_model': self.d_model,
            })
            return config

        def get_angles(self, position, i, d_model):
            angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
            return position * angles

        def positional_encoding(self, position, d_model):
            angle_rads = self.get_angles(
                position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
                i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
                d_model=d_model)
            # apply sin to even index in the array
            sines = tf.math.sin(angle_rads[:, 0::2])
            # apply cos to odd index in the array
            cosines = tf.math.cos(angle_rads[:, 1::2])

            pos_encoding = tf.concat([sines, cosines], axis=-1)
            pos_encoding = pos_encoding[tf.newaxis, ...]
            return tf.cast(pos_encoding, tf.float32)

        def call(self, inputs):
            return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

    print('Defining Encoder Layer...')

    def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
        inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
        padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

        attention = MultiHeadAttention(
            d_model, num_heads, name="attention")({
                'query': inputs,
                'key': inputs,
                'value': inputs,
                'mask': padding_mask
            })
        attention = tf.keras.layers.Dropout(rate=dropout)(attention)
        attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention)

        outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
        outputs = tf.keras.layers.Dense(units=d_model)(outputs)
        outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
        outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention + outputs)

        return tf.keras.Model(
            inputs=[inputs, padding_mask], outputs=outputs, name=name)

    sample_encoder_layer = encoder_layer(
        units=512,
        d_model=128,
        num_heads=4,
        dropout=0.3,
        name="sample_encoder_layer")

    print('Defining encoder...')

    def encoder(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="encoder"):
        inputs = tf.keras.Input(shape=(None,), name="inputs")
        padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

        embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
        embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
        embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

        outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

        for i in range(int(num_layers)):
            outputs = encoder_layer(
                units=units,
                d_model=d_model,
                num_heads=num_heads,
                dropout=dropout,
                name="encoder_layer_{}".format(i),
            )([outputs, padding_mask])

        return tf.keras.Model(
            inputs=[inputs, padding_mask], outputs=outputs, name=name)

    sample_encoder = encoder(
        vocab_size=8192,
        num_layers=2,
        units=512,
        d_model=128,
        num_heads=4,
        dropout=0.3,
        name="sample_encoder")

    print('Defining decoder layer...')

    def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
        inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
        enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
        look_ahead_mask = tf.keras.Input(
            shape=(1, None, None), name="look_ahead_mask")
        padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

        attention1 = MultiHeadAttention(
            d_model, num_heads, name="attention_1")(inputs={
                'query': inputs,
                'key': inputs,
                'value': inputs,
                'mask': look_ahead_mask
            })
        attention1 = tf.keras.layers.LayerNormalization(
            epsilon=1e-6)(attention1 + inputs)

        attention2 = MultiHeadAttention(
            d_model, num_heads, name="attention_2")(inputs={
                'query': attention1,
                'key': enc_outputs,
                'value': enc_outputs,
                'mask': padding_mask
            })
        attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
        attention2 = tf.keras.layers.LayerNormalization(
            epsilon=1e-6)(attention2 + attention1)

        outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
        outputs = tf.keras.layers.Dense(units=d_model)(outputs)
        outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
        outputs = tf.keras.layers.LayerNormalization(
            epsilon=1e-6)(outputs + attention2)

        return tf.keras.Model(
            inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
            outputs=outputs,
            name=name)

    sample_decoder_layer = decoder_layer(
        units=512,
        d_model=128,
        num_heads=4,
        dropout=0.3,
        name="sample_decoder_layer")

    print('Defining decoder...')

    def decoder(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name='decoder'):
        inputs = tf.keras.Input(shape=(None,), name='inputs')
        enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
        look_ahead_mask = tf.keras.Input(
            shape=(1, None, None), name='look_ahead_mask')
        padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

        embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
        embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
        embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

        outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

        for i in range(int(num_layers)):
            outputs = decoder_layer(
                units=units,
                d_model=d_model,
                num_heads=num_heads,
                dropout=dropout,
                name='decoder_layer_{}'.format(i),
            )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

        return tf.keras.Model(
            inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
            outputs=outputs,
            name=name)

    sample_decoder = decoder(
        vocab_size=8192,
        num_layers=2,
        units=512,
        d_model=128,
        num_heads=4,
        dropout=0.3,
        name="sample_decoder")

    print('Defining Transformer...')

    def transformer(vocab_size,  # VOCAB_SIZE
                    num_layers,  # 2
                    units,  # 512
                    d_model,  # 256
                    num_heads,  # 8
                    dropout,  # 0.1
                    name="transformer"):
        inputs = tf.keras.Input(shape=(None,), name="inputs")
        dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

        enc_padding_mask = tf.keras.layers.Lambda(
            create_padding_mask, output_shape=(1, 1, None),
            name='enc_padding_mask')(inputs)
        # mask the future tokens for decoder inputs at the 1st attention block
        look_ahead_mask = tf.keras.layers.Lambda(
            create_look_ahead_mask,
            output_shape=(1, None, None),
            name='look_ahead_mask')(dec_inputs)
        # mask the encoder outputs for the 2nd attention block
        dec_padding_mask = tf.keras.layers.Lambda(
            create_padding_mask, output_shape=(1, 1, None),
            name='dec_padding_mask')(inputs)

        enc_outputs = encoder(
            vocab_size=vocab_size,
            num_layers=num_layers,
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
        )(inputs=[inputs, enc_padding_mask])

        dec_outputs = decoder(
            vocab_size=vocab_size,
            num_layers=num_layers,
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
        )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

        outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

        return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

    sample_transformer = transformer(
        vocab_size=8192,
        num_layers=4,
        units=512,
        d_model=128,
        num_heads=4,
        dropout=0.3,
        name="sample_transformer")

    def loss_function(y_true, y_pred):
        """Sparse categorical cross-entropy with padding (id 0) masked out."""
        y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')(y_true, y_pred)

        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        loss = tf.multiply(loss, mask)

        return tf.reduce_mean(loss)

    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        """Learning-rate schedule: rsqrt(d_model) * min(rsqrt(step),
        step * warmup_steps**-1.5)."""

        def __init__(self, d_model, warmup_steps=4000):
            super(CustomSchedule, self).__init__()

            self.d_model = d_model
            self.d_model = tf.cast(self.d_model, tf.float32)

            self.warmup_steps = warmup_steps

        def get_config(self):
            return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}

        def __call__(self, step):
            arg1 = tf.math.rsqrt(step)
            arg2 = step * (self.warmup_steps**-1.5)

            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    print('Compiling the Model...')
    # NOTE(review): `optimizer` and `accuracy` are not defined anywhere in
    # this function; they must be provided at module level, otherwise the
    # compile() calls below raise NameError -- confirm.
    # initialize and compile model within strategy scope
    with strategy.scope():
        model = transformer(
            vocab_size=VOCAB_SIZE,
            num_layers=NUM_LAYERS,
            units=UNITS,
            d_model=D_MODEL,
            num_heads=NUM_HEADS,
            dropout=DROPOUT)

        model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

    model.summary()
    print('---')

    # UNCOMMENT TO TRAIN THE MODEL
    # import datetime
    # logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    # tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    # for i in range(10):
    #     model.fit(dataset, epochs=10, validation_data=val_dataset, callbacks = [tensorboard_callback])
    #     model.save_weights('transformer_weights_'+str(i+1)+'.h5')

    print('Loading the model weights...')
    loaded_model = transformer(
        vocab_size=VOCAB_SIZE,
        num_layers=NUM_LAYERS,
        units=UNITS,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        dropout=DROPOUT)

    loaded_model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])
    loaded_model.load_weights(os.path.join(data_path, 'transformer_weights_100.h5'))

    def textPreprocess(input_text):
        """Lower-case, map Polish diacritics to ASCII, drop non-alphanumeric
        characters and collapse runs of 3+ identical characters."""

        def removeAccents(input_text):
            strange='ąćęłńóśżź'
            ascii_replacements='acelnoszz'
            translator=str.maketrans(strange,ascii_replacements)
            return input_text.translate(translator)

        def removeSpecial(input_text):
            special='[^A-Za-z0-9 ]+'
            return re.sub(special, '', input_text)

        def removeTriplicated(input_text):
            return re.compile(r'(.)\1{2,}', re.IGNORECASE).sub(r'\1', input_text)

        return removeTriplicated(removeSpecial(removeAccents(input_text.lower())))

    def evaluate(sentence, model):
        """Greedy-decode a reply for `sentence`; returns the output ids
        (including the leading START token)."""
        # NOTE(review): inference uses textPreprocess while the training
        # data went through preprocess_sentence -- confirm this is intended.
        sentence = textPreprocess(sentence)

        sentence = tf.expand_dims(
            START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)

        output = tf.expand_dims(START_TOKEN, 0)

        for i in range(MAX_LENGTH):
            predictions = model(inputs=[sentence, output], training=False)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]
            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, END_TOKEN[0]):
                break

            # concatenated the predicted_id to the output which is given to
            # the decoder as its input.
            output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0)

    def predict(sentence, model):
        prediction = evaluate(sentence, model)
        # ids >= vocab_size are the START/END markers, not real subwords
        predicted_sentence = tokenizer.decode(
            [i for i in prediction if i < tokenizer.vocab_size])
        return predicted_sentence

    print("---")
    print("FIVE EXAMPLES: TRAIN SENTENCE PREDICTIONS: ")
    print('---')
    for x, y in zip(X_train[:2], y_train[:2]):
        # Decode once and reuse the result instead of running the model twice.
        output = predict(x, loaded_model)
        print('Question : ', str(x))
        print('Actual Response : ', str(y))
        print('Predicted Response : ', output)
        print("---")
    print("---")
    print("FIVE EXAMPLES: TEST SENTENCE PREDICTIONS: ")
    print('---')
    for x, y in zip(X_test[:2], y_test[:2]):
        output = predict(x, loaded_model)
        print('Question :', str(x))
        print('Actual Response : ', str(y))
        print('Predicted Response : ', output)
        print("---")
    # Was a bare `("---")` expression (a no-op); mirrors the train section.
    print("---")

    from nltk.translate.bleu_score import SmoothingFunction
    from nltk.translate.bleu_score import sentence_bleu
    c = SmoothingFunction()

    def mean_sentence_bleu(inputs, targets):
        """Average sentence BLEU of the model's replies against `targets`.

        The n-gram order is capped by the shorter of reference and
        prediction (at most 4), with uniform weights over that order.
        """
        scores = []
        for x, y in zip(inputs, targets):
            ref = y.split(' ')
            pred = predict(x, loaded_model).split(' ')
            order = max(1, min(4, len(ref), len(pred)))
            weights = tuple(1 / order for _ in range(order))
            scores.append(
                sentence_bleu([ref], pred, weights=weights,
                              smoothing_function=c.method2))
        return sum(scores) / float(len(scores))

    train_bleu = mean_sentence_bleu(X_train, y_train)
    print('---')
    print("The Bleu score for Train data is: ", train_bleu)
    print('---')

    test_bleu = mean_sentence_bleu(X_test, y_test)
    print('---')
    print("The Bleu score for Test data is : ", test_bleu)
    print('---')
    print('END!')
def validate(val_loader, encoder, decoder, criterion, vocab, epoch,
             total_step, start_step=1, start_loss=0.0, start_bleu=0.0,
             checkpoint_dir="/home/osboxes/image_captioning/example"):
    """Validate the model for one epoch using the provided parameters.

    Runs steps ``start_step`` through ``total_step`` (inclusive). Each step
    draws one batch, accumulates its loss and its mean sentence-level Bleu-4
    score, and prints a progress line. Every ``PRINT_EVERY`` steps a
    checkpoint is written through ``save_val_checkpoint``.

    Args:
        val_loader: Loader whose ``dataset.get_indices()`` supplies the
            indices for each step and whose batches carry the images at
            index 0 and the captions at index 1.
        encoder: Module mapping images to features.
        decoder: Module mapping ``(features, captions)`` to per-token
            vocabulary scores.
        criterion: Loss applied to the flattened scores and captions.
        vocab: Vocabulary; ``len(vocab)`` is used as the score width and
            it is passed to ``word_list`` to turn ids into words.
        epoch: Epoch number (used in log lines and checkpoint names).
        total_step: Index of the last validation step.
        start_step: Index of the first step (allows resuming).
        start_loss: Loss accumulated before ``start_step`` when resuming.
        start_bleu: Bleu-4 accumulated before ``start_step`` when resuming.
        checkpoint_dir: Directory that receives the periodic checkpoints.

    Returns:
        Tuple ``(total_loss / total_step, total_bleu_4 / total_step)``.
    """
    # Switch to validation mode
    encoder.eval()
    decoder.eval()

    # Initialize smoothing function
    smoothing = SmoothingFunction()

    # Keep track of validation loss and Bleu-4 score
    total_loss = start_loss
    total_bleu_4 = start_bleu

    # Start time for every PRINT_EVERY steps
    start_val_time = time.time()

    # Disable gradient calculation because we are in inference mode
    with torch.no_grad():
        for i_step in range(start_step, total_step + 1):
            # Sample the indices for this step and install a sampler that
            # restricts the loader to them
            indices = val_loader.dataset.get_indices()
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            val_loader.batch_sampler.sampler = new_sampler

            # Obtain a single batch
            batch = next(iter(val_loader))
            images, captions = batch[0], batch[1]

            # Move to GPU if CUDA is available
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()

            # Pass the inputs through the CNN-RNN model
            features = encoder(images)
            outputs = decoder(features, captions)

            # outputs[i, j, k] is the model's score for the k-th vocabulary
            # token at position j of the i-th caption in the batch.
            batch_size = len(outputs)
            batch_bleu_4 = 0.0
            for i in range(batch_size):
                # Highest-scoring token id at every position
                predicted_ids = outputs[i].argmax(dim=-1).tolist()

                # Convert word ids to actual words. The captions may live on
                # the GPU, so bring them back to the CPU before .numpy().
                predicted_word_list = word_list(predicted_ids, vocab)
                caption_word_list = word_list(captions[i].cpu().numpy(),
                                              vocab)

                batch_bleu_4 += sentence_bleu(
                    [caption_word_list],
                    predicted_word_list,
                    smoothing_function=smoothing.method1)
            mean_batch_bleu_4 = batch_bleu_4 / batch_size
            total_bleu_4 += mean_batch_bleu_4

            # Calculate the batch loss
            loss = criterion(outputs.view(-1, len(vocab)), captions.view(-1))
            loss_value = loss.item()
            total_loss += loss_value

            # Get validation statistics
            stats = "Epoch %d, Val step [%d/%d], %ds, Loss: %.4f, Perplexity: %5.4f, Bleu-4: %.4f" \
                % (epoch, i_step, total_step, time.time() - start_val_time,
                   loss_value, np.exp(loss_value), mean_batch_bleu_4)

            # Print validation statistics (on same line)
            print("\r" + stats, end="")
            sys.stdout.flush()

            # Print validation statistics (on different line), save a
            # checkpoint and reset time
            if i_step % PRINT_EVERY == 0:
                print("\r" + stats)
                filename = os.path.join(
                    checkpoint_dir,
                    "val-model-{}{}.pkl".format(epoch, i_step))
                save_val_checkpoint(filename, encoder, decoder, total_loss,
                                    total_bleu_4, epoch, i_step)
                start_val_time = time.time()

    return total_loss / total_step, total_bleu_4 / total_step
def nltk_sentence_bleu(hypothesis, reference, order=4):
    """Return the sentence-level BLEU of ``hypothesis`` against ``reference``.

    Args:
        hypothesis: Tokenized candidate sentence.
        reference: Single tokenized reference sentence; it is wrapped in a
            one-element list before scoring.
        order: Highest n-gram order to score. The n-gram orders
            ``1..order`` are weighted uniformly, so the default of 4 gives
            the usual ``(0.25, 0.25, 0.25, 0.25)`` weights.

    Returns:
        The BLEU score computed by NLTK. No smoothing function is applied.

    Raises:
        ValueError: If ``order`` is smaller than 1.
    """
    if order < 1:
        raise ValueError("order must be a positive integer, got %r" % (order,))
    weights = tuple(1.0 / order for _ in range(order))
    return nltk.translate.bleu([reference], hypothesis, weights=weights)
def calculate_bleu2(reference: str, hypothesis: str) -> float:
    """Score ``hypothesis`` against ``reference`` with smoothed BLEU-2.

    Both strings are split on whitespace; unigram and bigram precision are
    weighted equally and NLTK's ``method2`` smoothing is applied.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    smoother = SmoothingFunction()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        weights=(0.5, 0.5),
        smoothing_function=smoother.method2,
    )
def grade_marker(self, marker_text):
    """Return the smoothed unigram BLEU of ``marker_text``.

    ``self.key_sentences`` is handed to ``sentence_bleu`` as the reference
    collection and ``marker_text`` as the hypothesis; only 1-gram precision
    contributes (weights ``(1, 0, 0, 0)``), smoothed with ``method1``.
    """
    smoother = SmoothingFunction().method1
    unigram_only = (1, 0, 0, 0)
    return sentence_bleu(
        self.key_sentences,
        marker_text,
        weights=unigram_only,
        smoothing_function=smoother,
    )
def bleu(original, translated, n=4):
    """Return the smoothed sentence BLEU of ``translated`` vs ``original``.

    Both inputs go through ``p`` before scoring. The n-gram orders
    ``1..n`` are weighted uniformly and NLTK's ``method7`` smoothing is used.
    """
    per_order_weight = 1 / n
    uniform_weights = [per_order_weight] * n
    references = [p(original)]
    hypothesis = p(translated)
    smoother = SmoothingFunction()
    return sentence_bleu(
        references,
        hypothesis,
        weights=uniform_weights,
        smoothing_function=smoother.method7,
    )
return input_ids, token_type_ids args = Config() ckpt_path = 'ckpt/VEID/model.bin' tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint, do_lower_case=True) tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT) model_config = GPT2Config.from_pretrained(args.model_checkpoint) model = VEID(model_config) ckpt = torch.load(ckpt_path, map_location='cpu') model.load_state_dict(ckpt['model']) smooth = SmoothingFunction() meter = AverageMeter() with open('data/new_small_train.json', 'r', encoding='utf-8') as f: data = json.load(f) total_num = len(data) current_num = 0 case_study = [] for dia in data: #print(dia) case = {} print(current_num, '/', total_num) case['history'] = dia['history']
def evaluation(args):
    """Score the best saved model on the test set and write the results.

    Loads the pickled vocabularies and ``setting.yaml`` from
    ``args.model_path``, rebuilds the model selected by ``setting['net']``,
    loads ``best.model`` and evaluates BLEU on the full test set and on the
    variants built with ``limit_length`` 20, 15 and 10. Hypotheses are
    saved under ``args.model_path``, references under ``./dataset``, and a
    per-sentence report sorted by descending sentence BLEU is written to
    ``test20_gate_disc_tmpl.txt``.

    Args:
        args: Namespace providing ``model_path``, ``gpu`` and ``batch``.

    Raises:
        ValueError: If ``setting['net']`` is not one of the supported
            network names.
    """
    source = pickle_load(os.path.join(args.model_path, 'source.pkl'))
    target = pickle_load(os.path.join(args.model_path, 'target.pkl'))
    target_test = pickle_load(os.path.join(args.model_path, 'target_test.pkl'))
    setting = load_setting(os.path.join(args.model_path, 'setting.yaml'))

    start_id, end_id = setting['start_id'], setting['end_id']
    type_size = setting['type_size']
    player_size = setting['player_size']
    team_size = setting['team_size']
    detail_size = setting['detail_size']
    detail_dim = setting['detail_dim']
    src_embed = setting['src_embed']
    event_size = setting['event_size']
    vocab_size = setting['vocab_size']
    trg_embed = setting['trg_embed']
    hidden = setting['hidden']
    class_weight = None
    mlp_layers = setting['mlp_layers']
    max_length = setting['max_length']
    dropout = setting['dropout']
    loss_weight = None
    disc_loss = setting['disc_loss']
    loss_func = setting['loss_func']
    net = setting['net']
    dataset = setting['dataset']
    numbering = setting['numbering']
    reverse_decode = setting['reverse_decode']

    home_player_tag = target.word_to_id.get(target.home_player_tag)
    away_player_tag = target.word_to_id.get(target.away_player_tag)
    home_team_tag = target.word_to_id.get(target.home_team_tag)
    away_team_tag = target.word_to_id.get(target.away_team_tag)

    # Full test set first, then the length-limited variants.
    length_limits = (None, 20, 15, 10)
    test_path = dataset + '.test'

    def load_test_set(limit):
        fields = {'source': source, 'target': target_test}
        if limit is None:
            return OptaDataset(path=test_path, fields=fields)
        return OptaDataset(path=test_path, fields=fields, limit_length=limit)

    test_sets = [load_test_set(limit) for limit in length_limits]
    test, test20 = test_sets[0], test_sets[1]

    if 'disc' in net:
        content_word_size = len(target.content_word_to_id)
    print('vocab size: {}'.format(vocab_size))

    # The network name is a '-'-joined combination of optional features.
    known_nets = ('plain', 'tmpl', 'gate', 'gate-tmpl', 'disc', 'disc-tmpl',
                  'gate-disc', 'gate-disc-tmpl')
    if net not in known_nets:
        raise ValueError('unknown net {!r}; expected one of {}'.format(
            net, ', '.join(known_nets)))
    features = net.split('-')
    use_gate = 'gate' in features
    use_disc = 'disc' in features
    use_tmpl = 'tmpl' in features

    # Conditional expressions keep the class lookup lazy: only the selected
    # class name is resolved, exactly as in an if/elif chain.
    if use_disc:
        model_cls = (DiscriminativeMLPEncoder2GatedAttentionDecoder
                     if use_gate
                     else DiscriminativeMLPEncoder2AttentionDecoder)
    else:
        model_cls = (MLPEncoder2GatedAttentionDecoder
                     if use_gate
                     else MLPEncoder2AttentionDecoder)

    # Positional arguments shared by every variant, with the
    # variant-specific ones spliced in at the positions the constructors
    # expect them.
    model_args = [type_size, player_size, team_size, detail_size, detail_dim,
                  src_embed, event_size, vocab_size]
    if use_disc:
        model_args.append(content_word_size)
    model_args += [trg_embed, hidden, start_id, end_id, class_weight]
    if use_disc:
        model_args += [loss_weight, disc_loss, loss_func]
    model_args += [mlp_layers, max_length, dropout, IGNORE_LABEL]
    if use_tmpl:
        model_args += [source.id_to_player, home_player_tag, away_player_tag,
                       source.id_to_team, home_team_tag, away_team_tag,
                       target.player_to_id, target.players]
    model = model_cls(*model_args, reverse_decode=reverse_decode)

    if numbering:
        model.player_id = target.player_id
        model.team_id = target.team_id

    # load best model
    if args.gpu is not None:
        model.use_gpu(args.gpu)
    model.id_to_word = target.id_to_word
    model.load_model(os.path.join(args.model_path, 'best.model'))

    batch_size = args.batch
    src_iters = [
        SequentialIterator(test_set.source, batch_size, None, event_size,
                           source.fillvalue, gpu=args.gpu)
        for test_set in test_sets
    ]
    trg_iters = [
        Iterator(test_set.target, batch_size,
                 wrapper=EndTokenIdRemoval(end_id), gpu=None)
        for test_set in test_sets
    ]

    with open('./dataset/player_list.json.new') as f:
        id_to_player = json.load(f)
    with open('./dataset/team_list.json.new') as f:
        id_to_team = json.load(f)

    def convert(ind, no_tag=False):
        """Map a player/team tag token to its name when ``no_tag`` is set."""
        if not no_tag:
            return ind
        if 'player' in ind:
            return id_to_player.get(ind.replace('player', ''), ind)
        if 'team' in ind:
            return id_to_team.get(ind.replace('team', ''), ind)
        return ind

    def detag(tokens):
        """Join ``tokens`` into one string with tags replaced by names."""
        return ' '.join(convert(y, True) for y in tokens)

    bleu_scores = []
    all_hypotheses = []
    for src_iter, trg_iter in zip(src_iters, trg_iters):
        if use_disc:
            score, _, hyps = evaluate_bleu_and_accuracy(
                model, src_iter, trg_iter)
        else:
            score, hyps = evaluate_bleu(model, src_iter, trg_iter)
        bleu_scores.append(score)
        all_hypotheses.append(hyps)

    for limit, score in zip(length_limits, bleu_scores):
        label = '' if limit is None else limit
        print('best score{}: {}'.format(label, score))

    # save hypothesis
    suffixes = ['' if limit is None else '_len{}'.format(limit)
                for limit in length_limits]
    hypotheses_for_save = [[detag(h) for h in hyps]
                           for hyps in all_hypotheses]
    references_for_save = [[detag(r[0]) for r in test_set.target]
                           for test_set in test_sets]
    for suffix, lines in zip(suffixes, hypotheses_for_save):
        TextFile(os.path.join(args.model_path, 'hypo' + suffix), lines).save()
    for suffix, lines in zip(suffixes, references_for_save):
        TextFile(os.path.join('./dataset', 'ref' + suffix), lines).save()

    # generate readable text
    smoothing = SmoothingFunction().method1
    scored = []
    for ref, hyp in zip(test.target.data, all_hypotheses[0]):
        if isinstance(ref, tuple):
            ref = ref[0]
        ref = ' '.join([convert(y) for y in ref]).split()
        try:
            sent_bleu = sentence_bleu([ref], hyp,
                                      smoothing_function=smoothing)
        except Exception:
            # Best effort: a sentence that cannot be scored counts as 0,
            # but interrupts and interpreter exits still propagate.
            sent_bleu = 0
        ref = ' '.join([convert(y, True) for y in ref]).split()
        hyp = ' '.join([convert(y, True) for y in hyp]).split()
        scored.append((' '.join(ref), ' '.join(hyp), sent_bleu))

    # NOTE(review): the inputs come from the limit_length=20 test set while
    # the scored pairs come from the full test set; zip() pairs them by
    # position -- confirm both are aligned.
    inputs = []
    for xs in test20.source.data:
        described = []
        # Only the first five events of each source are shown.
        for x in xs[:5]:
            event = event_type_mapper.get(x[0], x[0])
            player = id_to_player.get(str(x[1]), x[1])
            team = id_to_team.get(str(x[2]), x[2])
            detail = ','.join(
                [qualifier_type_mapper.get(i[-1], i[-1]) for i in x[-1]])
            described.append(
                'event: {} player: {} team: {} detail: {}'.format(
                    event, player, team, detail))
        inputs.append('\n'.join(described))

    rows = [[x, *y] for x, y in zip(inputs, scored)]
    rows = sorted(rows, key=lambda row: -row[-1])
    TextFile(os.path.join(args.model_path, 'test20_gate_disc_tmpl.txt'), [
        'src:\n{}\nref: {}\nhyp: {}\nbleu: {}\n##\n'.format(*row)
        for row in rows
    ]).save()
def compute_bleu(reference, output):
    """Return the smoothed unigram BLEU of ``output``.

    ``reference`` is passed to ``sentence_bleu`` unchanged as the reference
    collection; only 1-gram precision is weighted and ``method1`` smoothing
    is applied.
    """
    unigram_weights = (1.0, 0.0, 0.0, 0.0)
    smoother = SmoothingFunction().method1
    return sentence_bleu(
        reference,
        output,
        weights=unigram_weights,
        smoothing_function=smoother,
    )
def compute_bleu(output, reference):
    """Return the smoothed BLEU-4 of ``output`` against one ``reference``.

    ``reference`` is wrapped in a one-element list; the 1- to 4-gram
    precisions are weighted equally and ``method1`` smoothing is applied.
    """
    uniform_weights = (0.25, 0.25, 0.25, 0.25)
    smoother = SmoothingFunction().method1
    references = [reference]
    return sentence_bleu(
        references,
        output,
        weights=uniform_weights,
        smoothing_function=smoother,
    )
def __init__(self, candidate, motion):
    """Build per-motion reference claims and write BLEU scores to a CSV.

    Reads ``../dataset/claims.txt`` (tab separated), cleans and tokenizes
    the claim in column 2 of every row after the first, and groups the
    tokenized claims by the lower-cased motion in column 0. Each candidate
    is then scored against the references of its motion and the scores are
    written to ``evaluation_<timestamp>.csv``.

    This code targets Python 2: it calls ``str.decode`` on the CSV fields
    and opens the CSV files in binary mode.

    Args:
        candidate: Sequence of candidate claim strings.
        motion: Sequence of motion strings, parallel to ``candidate``.

    Raises:
        KeyError: If a motion has no reference claims.
    """
    self.smoothing = SmoothingFunction().method2
    self.claims_list = '../dataset/claims.txt'
    self.candidate = candidate
    self.motion = motion
    self.reference = {}
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    with open('lexicon-dict.pkl', 'rb') as lexicon_file:
        self.lexicon_dictionary = pickle.load(lexicon_file)

    # Make evaluation data
    with open(self.claims_list, "rb") as claims_file:
        rows = csv.reader(claims_file, delimiter='\t')
        for row_idx, line in enumerate(rows):
            if row_idx == 0:
                # The first row is skipped (presumably a header).
                continue
            sentence_temp = unicodedata.normalize(
                'NFKD', line[2].decode('utf-8')).encode('ascii', 'ignore')
            # Strip punctuation first, then the [REF] placeholder.
            for char in '.?"\'()%$,':
                sentence_temp = sentence_temp.replace(char, '')
            cleaned_claim = sentence_temp.replace('[REF]', '').lower()
            sentence_motion = unicodedata.normalize(
                'NFKD', line[0].decode('utf-8')).encode('ascii', 'ignore')
            motion_key = sentence_motion.lower()

            tokenized_claim = word_tokenize(cleaned_claim)
            for pos, word in enumerate(tokenized_claim):
                if word not in self.lexicon_dictionary:
                    print("%s is not in dictionary" % word)
                    tokenized_claim[pos] = '<unk>'
            tokenized_claim.append('<eos>')
            print(tokenized_claim)

            # Claims with fewer than five tokens (including <eos>) are
            # not used as references.
            if len(tokenized_claim) < 5:
                continue
            # NOTE(review): the stored reference is a fresh tokenization of
            # the cleaned claim, without the <unk>/<eos> substitutions made
            # above -- confirm this is intended.
            self.reference.setdefault(motion_key, []).append(
                word_tokenize(cleaned_claim))

    # Write to CSV
    output_name = ('evaluation_' +
                   datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S") +
                   '.csv')
    # One column each for the individual 1- to 4-gram scores, then the
    # cumulative 4-gram score.
    ngram_weights = [
        (1, 0, 0, 0),
        (0, 1, 0, 0),
        (0, 0, 1, 0),
        (0, 0, 0, 1),
        (0.25, 0.25, 0.25, 0.25),
    ]
    with open(output_name, 'wb') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=' ')
        csvWriter.writerow([
            'No', 'Motion', 'Claim', 'Bleu Score', 'Unigram', 'Bigram',
            'Trigram', '4-Gram', 'Cumulative'
        ])
        for i, raw_candidate in enumerate(self.candidate):
            c = word_tokenize(raw_candidate.lower())
            references = self.reference[self.motion[i].lower()]
            # First score uses sentence_bleu's default weights.
            scores = [
                sentence_bleu(references, c,
                              smoothing_function=self.smoothing)
            ]
            scores.extend(
                sentence_bleu(references, c, weights=weights,
                              smoothing_function=self.smoothing)
                for weights in ngram_weights)
            csvWriter.writerow([i + 1, self.motion[i], raw_candidate] +
                               scores)
    print('Evaluation Done! Saved To Disk!')
def __init__(self):
    """Set up the scorers and the best-BLEU tracker.

    ``best_bleu`` starts at zero, ``smooth`` is NLTK's ``method1``
    smoothing function and ``rouge`` is a fresh ``Rouge`` instance.
    """
    self.best_bleu = 0.
    self.smooth = SmoothingFunction().method1
    self.rouge = Rouge()
import json import argparse import numpy as np from nltk import bleu from rouge import Rouge from collections import defaultdict from nltk.translate.bleu_score import SmoothingFunction smoothing = SmoothingFunction().method1 weights = [0.25] * 4 rouge = Rouge() def main(): parser = argparse.ArgumentParser() parser.add_argument("--out_dir", type=str, required=True, help="The directory of the outputs") args = parser.parse_args() print("\t".join(["Setup", "LM", "BLEU", "ROUGE"])) for setup in [ "rationale", "multi", "update_rationale", "update_type_rationale" ]: for lm in ["bart-large", "gpt2-xl"]: # Compute BLEU and ROUGE from the text predictions data = [
vocab, True) eval_scores = evaluate(target_words, predicted_words) #*print('Target words shape: ' + str(caption.size())) #*print('Target words: ' + str(target_words)) #*print('Predicted words are: ' + str(predicted_words)) for imgs, tgt, pdt in zip(img_paths, target_words, predicted_words): if imgs in target_caption_full.keys(): target_caption_full[imgs].extend(tgt) candidate_caption_full[imgs].extend([pdt]) else: candidate_caption_full[imgs] = [] target_caption_full[imgs] = tgt candidate_caption_full[imgs].append(pdt) sf = SmoothingFunction() #*bleu1_corpus.append(corpus_bleu(target_words, predicted_words, weights=(1, 0, 0, 0), smoothing_function=sf.method4)) #*bleu4_corpus.append(corpus_bleu(target_words, predicted_words, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=sf.method4)) bleu1.append(eval_scores['Bleu_1']) bleu2.append(eval_scores['Bleu_2']) bleu3.append(eval_scores['Bleu_3']) bleu4.append(eval_scores['Bleu_4']) cider.append(eval_scores['CIDEr']) rouge.append(eval_scores['ROUGE_L']) #*assert round(bleu1_corpus[-1], 3) == round(bleu1[-1], 3) #*assert round(bleu4_corpus[-1], 3) == round(bleu4[-1], 3) if (idx + 1) % 100 == 0: # 10 print( "Step %d - %0.4f test loss, %0.2f time, %.3f BLEU1, %.3f BLEU2, %.3f BLEU3, %.3f BLEU4, %.3f CIDEr, %.3f ROUGE_L."
def bleu4(reference_captions, predicted_caption):
    """Return the smoothed 4-gram BLEU of a caption, scaled to 0-100.

    Only the 4-gram precision is weighted (weights ``(0, 0, 0, 1)``), so
    this is the individual 4-gram score rather than the cumulative one.
    ``method1`` smoothing is applied.
    """
    four_gram_only = (0, 0, 0, 1)
    raw_score = sentence_bleu(
        reference_captions,
        predicted_caption,
        weights=four_gram_only,
        smoothing_function=SmoothingFunction().method1,
    )
    return 100 * raw_score