def validation_epoch_end(self, val_step_outputs):
    """Print sample translations and, when enabled, record a BLEU score.

    Every sentence in ``config.sentences`` is translated with the model
    itself (``self``) and printed. Afterwards, if ``config.COMPUTE_BLEU`` is
    set and ``self.nepochs`` is positive, a BLEU score is computed on
    ``self.test_data``, appended to ``self.bleu_scores``, printed, and the
    accumulated scores are handed to ``writeArrToCSV``.

    Args:
        val_step_outputs: Per-step validation outputs; not used here.

    Returns:
        None.
    """
    print("Translation Sample =================")

    for sentence in config.sentences:
        # The flag is re-read for every sentence; it selects between the
        # plain translator and the BPE translator.
        if config.USE_BPE == False:
            translator = translate_sentence
        else:
            translator = translate_sentence_bpe

        translated_sentence = translator(
            self,
            sentence,
            self.german_vocab,
            self.english_vocab,
            self.deviceLegacy,
            max_length=50,
        )
        print("Output", translated_sentence)

    bleu_requested = config.COMPUTE_BLEU == True and self.nepochs > 0
    if not bleu_requested:
        return

    bleu_score = computeBLEU(
        self.test_data,
        self,
        self.german_vocab,
        self.english_vocab,
        self.deviceLegacy,
    )
    self.bleu_scores.append(bleu_score)
    print("BLEU score: ", bleu_score)

    # NOTE(review): for an integer epoch counter this condition is always
    # true; it looks like a placeholder for "write every N epochs".
    if self.nepochs % 1 == 0:
        writeArrToCSV(self.bleu_scores)
def calculate_bleu(data, src_field, trg_field, model, device, max_len=100):
    """Translate every example in ``data`` and score the result with BLEU.

    Args:
        data: Iterable of examples whose attributes include ``src`` and
            ``trg`` (read through ``vars``).
        src_field: Source-side field forwarded to ``translate_sentence``.
        trg_field: Target-side field forwarded to ``translate_sentence``.
        model: Translation model forwarded to ``translate_sentence``.
        device: Device forwarded to ``translate_sentence``.
        max_len: Maximum decoding length forwarded to ``translate_sentence``.

    Returns:
        Whatever ``bleu_score`` returns for the collected hypotheses and
        references.
    """
    references = []
    hypotheses = []

    for example in tqdm(data):
        attributes = vars(example)
        source_tokens = attributes['src']
        target_tokens = attributes['trg']

        # translate_sentence returns a pair; the second item is not needed.
        hypothesis, _ = translate_sentence(source_tokens, src_field,
                                           trg_field, model, device, max_len)

        # Drop the final token (the original author's note: the <eos> token).
        hypotheses.append(hypothesis[:-1])
        # Each hypothesis is paired with a list holding its one reference.
        references.append([target_tokens])

    return bleu_score(hypotheses, references)
def validation_epoch_end(self, val_step_outputs):
    """Print the translation of one fixed German sentence.

    The model itself (``self``) is passed to ``translate_sentence`` together
    with ``self.german_vocab`` and ``self.english_vocab``.

    The device used to be hard-coded to ``"cuda"``, which fails on hosts
    without a usable GPU. It now falls back to ``"cpu"`` in that case and is
    unchanged (``"cuda"``) whenever CUDA is available.

    Args:
        val_step_outputs: Per-step validation outputs; not used here.

    Returns:
        None.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import torch

    print("Translation Sample =================")

    sentence = "ein pferd geht unter einer brücke neben einem boot."
    device = "cuda" if torch.cuda.is_available() else "cpu"

    translated_sentence = translate_sentence(self,
                                             sentence,
                                             self.german_vocab,
                                             self.english_vocab,
                                             device,
                                             max_length=50)
    print("Output", translated_sentence)
def main():
    """Load a trained translation model and print five sample translations.

    Command-line options:
        --model_config: path to the JSON file holding the training config.
        --model_weights: path to the saved model weights.

    The JSON config is loaded into a namespace and the command line is then
    parsed again on top of it, so explicit command-line values (and parser
    defaults) override entries with the same name in the JSON file.

    Five examples are drawn at random from the test set with
    ``np.random.randint``; for each, the source sentence, the reference
    translation and the model translation are printed.
    """
    parser = argparse.ArgumentParser(
        description='demonstration of machine translation algorithm')
    parser.add_argument('--model_config',
                        default='./checkpoints/config.json',
                        help='train config for model_weights')
    parser.add_argument('--model_weights',
                        default='./checkpoints/en_de_final.pt',
                        help='path for weights of the model')
    args = parser.parse_args()

    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # JSON is read as UTF-8 explicitly instead of the platform-default
    # encoding; the former single-argument os.path.join was a no-op.
    with open(args.model_config, 'rt', encoding='utf-8') as f:
        config_values = json.load(f)

    model_args = argparse.Namespace()
    model_args.__dict__.update(config_values)
    model_args = parser.parse_args(namespace=model_args)

    print('Loading models...')
    train_data, valid_data, test_data, src_lang, trg_lang = prepare_data()
    model = build_model(model_args, src_lang, trg_lang, len(src_lang.vocab),
                        len(trg_lang.vocab), device)
    # Weights are first mapped to CPU storage; load_state_dict then copies
    # them into the already-built model.
    model.load_state_dict(torch.load(args.model_weights, map_location='cpu'))
    model.eval()

    print('Evaluating 5 random sentence from test set:')
    for _ in range(5):
        random_element = vars(
            test_data.examples[np.random.randint(len(test_data))])
        input_sentence = random_element['src']
        print(colored('Input sentence: \n', 'yellow'),
              ' '.join(input_sentence))

        translation, _ = translate_sentence(input_sentence, src_lang,
                                            trg_lang, model, device)
        # cut off <eos> token
        translation = translation[:-1]

        print(colored('GT translation: \n', 'green'),
              ' '.join(random_element['trg']))
        print(colored('Model translation: \n', 'green'),
              ' '.join(translation))
def training_loop(train_dict, val_dict, idx_dict, encoder, decoder, criterion,
                  optimizer, opts):
    """Runs the main training loop; evaluates the model on the val set every epoch.

        * Prints training and val loss each epoch.
        * Prints qualitative translation results each epoch using TEST_SENTENCE
        * Saves an attention map for TEST_WORD_ATTN each epoch
        * Saves a checkpoint whenever the val loss improves on the best value
          seen so far.
        * Writes "<epoch> <train_loss> <val_loss>" lines to loss_log.txt inside
          opts.checkpoint_path; the file is closed when the loop ends or raises.

    Arguments:
        train_dict: The training word pairs, organized by source and target lengths.
        val_dict: The validation word pairs, organized by source and target lengths.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model (with or without attention) to generate output tokens.
        criterion: Used to compute the CrossEntropyLoss for each decoder output.
        optimizer: Implements a step rule to update the parameters of the encoder and decoder.
        opts: The command-line arguments. Fields read here: checkpoint_path,
            nepochs, lr_decay, batch_size, cuda, teacher_forcing_ratio,
            no_attention.
    """
    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    best_val_loss = 1e6
    train_losses = []
    val_losses = []

    log_path = os.path.join(opts.checkpoint_path, 'loss_log.txt')
    # The context manager guarantees the log file is closed (it used to leak).
    with open(log_path, 'w') as loss_log:
        for epoch in range(opts.nepochs):

            # Decay is applied at the start of every epoch, first one
            # included, and only to the first parameter group.
            optimizer.param_groups[0]['lr'] *= opts.lr_decay

            epoch_losses = []

            for key in train_dict:
                input_strings, target_strings = zip(*train_dict[key])
                input_tensors = [
                    torch.LongTensor(
                        utils.string_to_index_list(s, char_to_index,
                                                   end_token))
                    for s in input_strings
                ]
                target_tensors = [
                    torch.LongTensor(
                        utils.string_to_index_list(s, char_to_index,
                                                   end_token))
                    for s in target_strings
                ]

                num_tensors = len(input_tensors)
                num_batches = int(
                    np.ceil(num_tensors / float(opts.batch_size)))

                for batch_index in range(num_batches):
                    start = batch_index * opts.batch_size
                    end = start + opts.batch_size

                    inputs = utils.to_var(
                        torch.stack(input_tensors[start:end]), opts.cuda)
                    targets = utils.to_var(
                        torch.stack(target_tensors[start:end]), opts.cuda)

                    # The last batch of a bucket may hold fewer than
                    # opts.batch_size items, so read the size off the tensor.
                    BS = inputs.size(0)

                    encoder_annotations, encoder_hidden = encoder(inputs)

                    # The last hidden state of the encoder becomes the first
                    # hidden state of the decoder
                    decoder_hidden = encoder_hidden

                    start_vector = torch.ones(BS).long().unsqueeze(
                        1) * start_token  # BS x 1
                    decoder_input = utils.to_var(start_vector,
                                                 opts.cuda)  # BS x 1

                    loss = 0.0
                    seq_len = targets.size(
                        1)  # Gets seq_len from BS x seq_len

                    # One draw per batch decides teacher forcing for the
                    # whole sequence.
                    use_teacher_forcing = (np.random.rand() <
                                           opts.teacher_forcing_ratio)

                    # A distinct index name avoids shadowing the batch index.
                    for t in range(seq_len):
                        decoder_output, decoder_hidden, _ = decoder(
                            decoder_input, decoder_hidden,
                            encoder_annotations)

                        current_target = targets[:, t]
                        # cross entropy between the decoder distribution
                        # and GT
                        loss += criterion(decoder_output, current_target)

                        if use_teacher_forcing:
                            # With teacher forcing, use the ground-truth
                            # token to condition the next step
                            decoder_input = targets[:, t].unsqueeze(1)
                        else:
                            # Without teacher forcing, use the model's own
                            # predictions to condition the next step
                            ni = F.softmax(decoder_output,
                                           dim=1).data.max(1)[1]
                            decoder_input = utils.to_var(
                                ni.unsqueeze(1), opts.cuda)

                    loss /= float(seq_len)
                    epoch_losses.append(loss.item())

                    # Zero gradients
                    optimizer.zero_grad()
                    # Compute gradients
                    loss.backward()
                    # Update the parameters of the encoder and decoder
                    optimizer.step()

            train_loss = np.mean(epoch_losses)
            val_loss = evaluate(val_dict, encoder, decoder, idx_dict,
                                criterion, opts)

            if val_loss < best_val_loss:
                # Remember the new best value; previously it was never
                # updated, so a checkpoint was written on every epoch and
                # the saved model was simply the latest one.
                best_val_loss = val_loss
                checkpoint(encoder, decoder, idx_dict, opts)

            if not opts.no_attention:
                # Save attention maps for the fixed word TEST_WORD_ATTN
                # throughout training
                utils.visualize_attention(
                    TEST_WORD_ATTN,
                    encoder,
                    decoder,
                    idx_dict,
                    opts,
                    save=os.path.join(
                        opts.checkpoint_path,
                        'train_attns/attn-epoch-{}.png'.format(epoch)))

            gen_string = utils.translate_sentence(TEST_SENTENCE, encoder,
                                                  decoder, idx_dict, opts)
            print(
                "Epoch: {:3d} | Train loss: {:.3f} | Val loss: {:.3f} | Gen: {:20s}"
                .format(epoch, train_loss, val_loss, gen_string))

            loss_log.write('{} {} {}\n'.format(epoch, train_loss, val_loss))
            # Flush every epoch so the log survives an interrupted run.
            loss_log.flush()

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            save_loss_plot(train_losses, val_losses, opts)
# Optionally restore the model and optimizer state from the saved checkpoint.
if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# Pre-tokenized German sample sentence (one token per list element).
sentence1 = "ein pferd geht unter einer brücke neben einem boot .".split()

# Translate the sample once before training; the result is not printed here.
translated_sentence = translate_sentence(
    model,
    sentence1,
    german,
    english,
    device,
    max_length=50,
)

train(
    model,
    device,
    load_model,
    save_model,
    german,
    english,
    train_data,
    valid_data,
    test_data,
    batch_size,
)

# Running on the entire data set takes a while, so BLEU is computed on a
# slice (indices 1..99) of each split.
score = bleu(train_data[1:100], model, german, english, device)
print(f"Final Train Bleu score {score * 100:.2f}")

score = bleu(test_data[1:100], model, german, english, device)
def train(model, device, load_model, save_model, german_vocab, english_vocab,
          train_data, valid_data, test_data, batch_size):
    """Train ``model`` for ``num_epochs`` epochs with Adam.

    At the start of every epoch the function optionally saves a checkpoint,
    translates a fixed sample sentence, and prints BLEU on small slices of
    the training and test data. It then runs one pass over the training
    iterator and feeds the mean batch loss to a ReduceLROnPlateau scheduler.

    Args:
        model: Sequence-to-sequence model called as ``model(src, trg)``.
        device: Device the batches are moved to.
        load_model: When truthy, restore model and optimizer state from
            "my_checkpoint.pth.tar" before training.
        save_model: When truthy, save a checkpoint at the start of each epoch.
        german_vocab: Source vocabulary passed to the translation helpers.
        english_vocab: Target vocabulary; its ``stoi["<pad>"]`` entry is the
            index ignored by the loss.
        train_data: Training split.
        valid_data: Validation split (only used to build the iterators).
        test_data: Test split.
        batch_size: Batch size for all iterators.

    Returns:
        None.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if load_model:
        load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

    # Fixed sample used to eyeball translation quality once per epoch.
    sentence = "ein pferd geht unter einer brücke neben einem boot."

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=10, verbose=True)

    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=english_vocab.stoi["<pad>"])

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda example: len(example.src),
        device=device,
    )

    for epoch in range(num_epochs):
        print(f"[Epoch {epoch} / {num_epochs}]")

        if save_model:
            save_checkpoint({
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            })

        # --- per-epoch qualitative and BLEU report (evaluation mode) ---
        model.eval()
        translated_sentence = translate_sentence(model,
                                                 sentence,
                                                 german_vocab,
                                                 english_vocab,
                                                 device,
                                                 max_length=50)
        # NOTE(review): the first line below prints the *source* sentence
        # although its label says "Translated".
        print(f"Translated example sentence: \n {sentence}")
        print(f"Translated example sentence: \n {translated_sentence}")

        # Scoring the full splits takes a while, so only small slices are used.
        print("here1")
        score = bleu(train_data[1:10], model, german_vocab, english_vocab,
                     device)
        print(f"Train Bleu score {score * 100:.2f}")
        print("here2")
        score = bleu(test_data[1:50], model, german_vocab, english_vocab,
                     device)
        print(f"Test Bleu score {score * 100:.2f}")

        # --- one optimisation pass over the training iterator ---
        model.train()
        epoch_losses = []
        for batch in train_iterator:
            source = batch.src.to(device)
            target = batch.trg.to(device)

            # The decoder input is the target without its last position.
            logits = model(source, target[:-1, :])

            # Per the original author's note the output is
            # (trg_len, batch_size, output_dim); the loss wants (N, classes)
            # against (N,), so both are flattened. The target drops its first
            # position (the start token).
            flat_logits = logits.reshape(-1, logits.shape[2])
            flat_target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(flat_logits, flat_target)
            epoch_losses.append(loss.item())

            loss.backward()
            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

        mean_loss = sum(epoch_losses) / len(epoch_losses)
        scheduler.step(mean_loss)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Optionally restore the model and optimizer state from the saved checkpoint.
if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# Sample sentence (given as a raw string) translated once before training;
# the result is not printed here.
sentence = "is man lion a stuffed A at smiling."
translated_sentence = translate_sentence(
    model,
    sentence,
    german_vocab,
    english_vocab,
    device,
    max_length=50,
)

print("===============================going for training ")
train(
    model,
    device,
    load_model,
    save_model,
    german_vocab,
    english_vocab,
    train_data,
    valid_data,
    test_data,
    batch_size,
)
# running on entire test data takes a while
'nepochs':100, 'checkpoint_dir':"checkpoints", 'learning_rate':0.005, ## INCREASE BY AN ORDER OF MAGNITUDE 'lr_decay':0.99, 'batch_size':64, 'hidden_size':20, 'encoder_type': 'transformer', 'decoder_type': 'transformer', # options: rnn / rnn_attention / transformer 'num_transformer_layers': 3, } args.update(args_dict) print_opts(args) transformer_encoder, transformer_decoder = train(args) translated = translate_sentence(TEST_SENTENCE, transformer_encoder, transformer_decoder, None, args) print("source:\t\t{} \ntranslated:\t{}".format(TEST_SENTENCE, translated)) """Try translating different sentences by changing the variable TEST_SENTENCE. Identify two distinct failure modes and briefly describe them.""" TEST_SENTENCE = test_cases translated = translate_sentence(TEST_SENTENCE, transformer_encoder, transformer_decoder, None, args) print("source:\t\t{} \ntranslated:\t{}".format(TEST_SENTENCE, translated)) exit() """# Attention Visualizations One of the benefits of using attention is that it allows us to gain insight into the inner workings of the model. By visualizing the attention weights generated for the input tokens in each decoder step, we can see where the model focuses while producing each output token.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SRC, TRG, train_data, valid_data, test_data = getData(False)

src_vocab_size = len(SRC)
trg_vocab_size = len(TRG)
SRC_PAD_IDX = SRC.stoi[SRC.pad_token]
# NOTE(review): the name says "TRG" but the index is looked up in SRC;
# confirm whether TRG.stoi[TRG.eos_token] was intended.
TRG_EOS_TOKEN = SRC.stoi[SRC.eos_token]

model = Seq2Seq(SRC_PAD_IDX, src_vocab_size, trg_vocab_size, device,
                TRG_EOS_TOKEN).to(device)
# map_location keeps loading consistent with the CPU fallback above: without
# it, a checkpoint saved from a GPU cannot be loaded on a CPU-only host.
model.load_state_dict(torch.load('tut4-model.pt', map_location=device))

src = "ein pferd geht unter einer brücke neben einem boot ."
translation, attention = translate_sentence(model, src, SRC, TRG, device)
print(src)
print(translation)


def display_attention(sentence, translation, attention):
    # Draws the attention weights as a matrix image on a 10x10 inch figure.
    # NOTE(review): this definition may continue beyond the visible source;
    # only the statements that are visible are reproduced here, unchanged.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    # Drop axis 1, then move the weights to a NumPy array for plotting.
    attention = attention.squeeze(1).cpu().detach().numpy()
    cax = ax.matshow(attention, cmap='bone')
    ax.tick_params(labelsize=15)