def train(train_loader, validate_data, device, gradient_clipping=1,
          hidden_state=10, lr=0.001, opt="adam", epochs=600, batch_size=32):
    """Train the toy-task autoencoder, with periodic checkpoints and validation.

    Args:
        train_loader: iterable yielding training batches (tensors).
        validate_data: held-out tensor, moved to ``device`` and used by ``validation``.
        device: torch device to train on.
        gradient_clipping: max grad norm; a falsy value disables clipping.
        hidden_state: hidden size passed to ``EncoderDecoder``.
        lr: learning rate.
        opt: ``"adam"`` selects Adam; any other value selects RMSprop.
        epochs: number of training epochs to run.
        batch_size: forwarded to ``plot_validation_loss`` only (labelling).
    """
    model = EncoderDecoder(1, hidden_state, 1, 50).to(device)
    validate_data = validate_data.to(device)
    if opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        optimizer_name = 'adam'
    else:
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
        # BUG FIX: the fallback label used to be 'mse' (a loss, not an
        # optimizer), so RMSprop runs produced mislabelled files/plots.
        optimizer_name = 'rmsprop'
    mse = nn.MSELoss()
    min_loss = float("inf")          # best single-batch loss seen so far
    best_loss_global = float("inf")  # best epoch-mean loss seen so far
    min_in, min_out = None, None     # batch that achieved min_loss (unused here)
    validation_losses = []
    # BUG FIX: range(1, epochs) ran only epochs-1 epochs.
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for batch_idx, data in enumerate(train_loader):
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = mse(output, data)
            total_loss += loss.item()
            if loss.item() < min_loss:
                min_loss = loss.item()
                min_in, min_out = data, output
            loss.backward()
            if gradient_clipping:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_clipping)
            optimizer.step()
        epoch_loss = total_loss / len(train_loader)
        best_loss_global = min(best_loss_global, epoch_loss)
        print(f'Train Epoch: {epoch} \t loss: {epoch_loss}')
        # Checkpoint every 50 epochs.
        if epoch % 50 == 0:
            file_name = f'ae_toy_{optimizer_name}_lr={lr}_hidden_size={hidden_state}_' \
                        f'_gradient_clipping={gradient_clipping}'
            path = os.path.join("saved_models", "toy_task", file_name)
            create_folders(path)
            torch.save(
                model,
                os.path.join(path, f'epoch={epoch}_bestloss={best_loss_global}.pt'))
        # Validate every 10 epochs; ``validation`` appends into validation_losses.
        if epoch % 10 == 0:
            validation(model, mse, validate_data, validation_losses)
    plot_validation_loss(epochs, gradient_clipping, lr, optimizer_name,
                         validation_losses, batch_size, hidden_state)
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Assemble a full encoder-decoder Transformer from hyperparameters.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size.
        N: number of encoder and decoder layers.
        d_model: model (embedding) dimension.
        d_ff: inner dimension of the position-wise feed-forward nets.
        h: number of attention heads.
        dropout: dropout probability used throughout.

    Returns:
        An ``EncoderDecoder`` with Glorot-initialised weight matrices.
    """
    def clone(module):
        # Each layer gets its own parameters, so sub-modules are deep-copied.
        return copy.deepcopy(module)

    attention = MultiHeadedAttention(h, d_model)
    feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    pos_encoding = PositionalEncoding(d_model, dropout)

    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, clone(attention), clone(feed_forward), dropout), N),
        Decoder(DecoderLayer(d_model, clone(attention), clone(attention), clone(feed_forward), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), clone(pos_encoding)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), clone(pos_encoding)),
        Generator(d_model, tgt_vocab))

    # Glorot / fan_avg initialisation for every weight matrix, as in the
    # reference implementation this is based on.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model
def main(args):
    """Train the KTH frame-prediction network and plot train/dev loss curves.

    For every batch, every ordered frame pair (i, j) with i < j is used:
    frame i plus the elapsed time (j-i)*40ms is fed to the network, which
    must predict frame j.  Checkpoints every 10 epochs; evaluates on the
    dev set after every epoch.
    """
    num_frames = 15    # frames per clip used from each sample
    ms_per_frame = 40  # assumes 25 fps source video — TODO confirm
    network = EncoderDecoder(args).cuda()
    optimizer = torch.optim.Adam(network.parameters(), lr=args.lr, betas=(0.9, 0.99))
    criterion = nn.MSELoss()
    train_loader, dev_loader, test_loader = fetch_kth_data(args)
    # test_tens = next(iter(train_loader))['instance'][0, :, :, :, :].transpose(0, 1)
    # print(test_tens.shape)
    # save_image(test_tens, './img/test_tens.png')
    # print(next(iter(train_loader))['instance'][0, :, 0, :, :].shape)
    train_loss = []
    dev_loss = []
    for epoch in range(args.epochs):
        epoch_loss = 0
        batch_num = 0
        for item in train_loader:
            #label = item['label']
            # presumably item['instance'] is (batch, channels, frames, H, W) — verify against fetch_kth_data
            item = item['instance'].cuda()
            frames_processed = 0
            batch_loss = 0
            # fit a whole batch for all the different millisecond offsets
            for i in range(num_frames-1):
                for j in range(i+1, num_frames):
                    network.zero_grad()
                    frame_diff = j - i
                    # elapsed time between source and target frame, one value per batch element
                    time_delta = torch.tensor(frame_diff * ms_per_frame).float().repeat(args.batch_size).cuda()
                    time_delta.requires_grad = True
                    seq = item[:, :, i, :, :]
                    #print(seq.shape)
                    # downsample
                    #seq = F.interpolate(seq, size=(64, 64))
                    #print(seq.shape)
                    seq.requires_grad = True
                    seq_targ = item[:, :, j, :, :]
                    # downsample
                    #seq_targ = F.interpolate(seq_targ, size=(64, 64))
                    seq_targ.requires_grad = False
                    assert seq.requires_grad and time_delta.requires_grad, 'No Gradients'
                    outputs = network(seq, time_delta)
                    error = criterion(outputs, seq_targ)
                    error.backward()
                    optimizer.step()
                    batch_loss += error.cpu().item()
                    frames_processed += 1
                    # dump predictions from the first source frame for visual inspection
                    if i == 0:
                        save_image(outputs, '/scratch/eecs-share/dinkinst/kth/img/train_output_{}_epoch_{}.png'.format(j, epoch))
            batch_num += 1
            epoch_loss += batch_loss
            print('Epoch {} Batch #{} Total Error {}'.format(epoch, batch_num, batch_loss))
        # NOTE(review): frames_processed is reset per batch, so this "scaled"
        # loss divides by the LAST batch's pair count only — confirm intent.
        print('\nEpoch {} Total Loss {} Scaled Loss {}\n'.format(epoch, epoch_loss, epoch_loss/frames_processed))
        train_loss.append(epoch_loss)
        # checkpoint model and optimizer every 10 epochs
        if epoch % 10 == 0:
            torch.save(network.state_dict(), KTH_PATH+str('/model_new_{}.pth'.format(epoch)))
            torch.save(optimizer.state_dict(), KTH_PATH+str('/optim_new_{}.pth'.format(epoch)))
        dev_loss.append(eval_model(network, dev_loader, epoch))
        # eval_model presumably switches to eval mode; restore training mode
        network.train()
    plt.plot(range(args.epochs), train_loss)
    plt.grid()
    plt.savefig('/scratch/eecs-share/dinkinst/kth/img/loss_train.png', dpi=64)
    plt.close('all')
    plt.plot(range(args.epochs), dev_loss)
    plt.grid()
    plt.savefig('/scratch/eecs-share/dinkinst/kth/img/loss_dev.png', dpi=64)
    plt.close('all')
def main():
    """Train an NMT encoder-decoder in sub-epochs with checkpoint/resume.

    Loads train/eval data, builds the model from module-level config
    (hidden_size, attention, lr, ...), then trains `num_epochs` epochs,
    each split into sub-epochs of `subepoch_size` batches.  After every
    sub-epoch it evaluates, checkpoints, and saves the best model by
    eval loss or BLEU.
    """
    torch.manual_seed(10)  # fix seed for reproducibility
    torch.cuda.manual_seed(10)
    train_data, train_source_text, train_target_text = create_data(
        os.path.join(train_data_dir, train_dataset), lang)
    #dev_data, dev_source_text, dev_target_text = create_data(os.path.join(eval_data_dir, 'newstest2012_2013'), lang)
    eval_data, eval_source_text, eval_target_text = create_data(
        os.path.join(dev_data_dir, eval_dataset), lang)
    # pretrained embedding lookup tables, one row per vocabulary entry
    en_emb_lookup_matrix = train_source_text.vocab.vectors.to(device)
    target_emb_lookup_matrix = train_target_text.vocab.vectors.to(device)
    # module-level globals consumed elsewhere in this file
    global en_vocab_size
    global target_vocab_size
    en_vocab_size = train_source_text.vocab.vectors.size(0)
    target_vocab_size = train_target_text.vocab.vectors.size(0)
    if verbose:
        print('English vocab size: ', en_vocab_size)
        print(lang, 'vocab size: ', target_vocab_size)
        print_runtime_metric('Vocabs loaded')
    model = EncoderDecoder(en_emb_lookup_matrix, target_emb_lookup_matrix,
                           hidden_size, bidirectional, attention,
                           attention_type, decoder_cell_type).to(device)
    model.encoder.device = device
    criterion = nn.CrossEntropyLoss(
        ignore_index=1
    )  # ignore_index=1 comes from the target_data generation from the data iterator
    #optimiser = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-06, weight_decay=0) # This is the exact optimiser in the paper; rho=0.95
    optimiser = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = 10e+10  # dummy variable
    best_bleu = 0
    epoch = 1  # initial epoch id
    if resume:
        # Restore model/optimiser/metrics and work out which sub-epoch to
        # continue from.
        print('\n ---------> Resuming training <----------')
        checkpoint_path = os.path.join(save_dir, 'checkpoint.pth')
        checkpoint = torch.load(checkpoint_path)
        epoch = checkpoint['epoch']
        subepoch, num_subepochs = checkpoint['subepoch_num']
        model.load_state_dict(checkpoint['state_dict'])
        best_loss = checkpoint['best_loss']
        optimiser.load_state_dict(checkpoint['optimiser'])
        is_best = checkpoint['is_best']
        metric_store.load(os.path.join(save_dir, 'checkpoint_metrics.pickle'))
        if subepoch == num_subepochs:
            # checkpoint was taken at the end of an epoch: start the next one
            epoch += 1
            subepoch = 1
        else:
            subepoch += 1
    if verbose:
        print_runtime_metric('Model initialised')
    while epoch <= num_epochs:
        is_best = False  # best loss or not
        # Initialise the iterators (fresh shuffle each epoch via seed=epoch**2)
        train_iter = BatchIterator(train_data, batch_size, do_train=True, seed=epoch**2)
        num_subepochs = train_iter.num_batches // subepoch_size
        # train sub-epochs from start_batch
        # This allows subepoch training resumption
        # NOTE(review): when resume is True, subepoch is never reset to 1 on
        # subsequent epochs, so later epochs skip the inner loop — confirm.
        if not resume:
            subepoch = 1
        while subepoch <= num_subepochs:
            if verbose:
                print(' Running code on: ', device)
                print('------> Training epoch {}, sub-epoch {}/{} <------'.
                      format(epoch, subepoch, num_subepochs))
            mean_train_loss = train(model, criterion, optimiser, train_iter,
                                    train_source_text, train_target_text,
                                    subepoch, num_subepochs)
            if verbose:
                print_runtime_metric('Training sub-epoch complete')
                print('------> Evaluating sub-epoch {} <------'.format(subepoch))
            eval_iter = BatchIterator(eval_data, batch_size, do_train=False, seed=325632)
            mean_eval_loss, mean_eval_bleu, _, mean_eval_sent_bleu, _, _ = evaluate(
                model, criterion, eval_iter, eval_source_text.vocab,
                eval_target_text.vocab, train_source_text.vocab,
                train_target_text.vocab)  # here should be the eval data
            if verbose:
                print_runtime_metric('Evaluating sub-epoch complete')
            # A new best by either metric triggers a best-model save below.
            if mean_eval_loss < best_loss:
                best_loss = mean_eval_loss
                is_best = True
            if mean_eval_bleu > best_bleu:
                best_bleu = mean_eval_bleu
                is_best = True
            config_dict = {
                'train_dataset': train_dataset,
                'b_size': batch_size,
                'h_size': hidden_size,
                'bidirectional': bidirectional,
                'attention': attention,
                'attention_type': attention_type,
                'decoder_cell_type': decoder_cell_type
            }
            # Save the model and the optimiser state for resumption (after each sub-epoch)
            checkpoint = {
                'epoch': epoch,
                'subepoch_num': (subepoch, num_subepochs),
                'state_dict': model.state_dict(),
                'config': config_dict,
                'best_loss': best_loss,
                'best_BLEU': best_bleu,
                'optimiser': optimiser.state_dict(),
                'is_best': is_best
            }
            torch.save(checkpoint, os.path.join(save_dir, 'checkpoint.pth'))
            metric_store.log(mean_train_loss, mean_eval_loss)
            metric_store.save(os.path.join(save_dir, 'checkpoint_metrics.pickle'))
            if verbose:
                print('Checkpoint.')
            # Save the best model so far
            if is_best:
                save_dict = {
                    'state_dict': model.state_dict(),
                    'config': config_dict,
                    'epoch': epoch
                }
                torch.save(save_dict, os.path.join(save_dir, 'best_model.pth'))
                metric_store.save(os.path.join(save_dir, 'best_model_metrics.pickle'))
            if verbose:
                if is_best:
                    print('Best model saved!')
                print('Ep {} Sub-ep {}/{} Tr loss {} Eval loss {} Eval BLEU {} Eval sent BLEU {}'
                      .format(epoch, subepoch, num_subepochs,
                              round(mean_train_loss, 3),
                              round(mean_eval_loss, 3),
                              round(mean_eval_bleu, 4),
                              round(mean_eval_sent_bleu, 4)))
            subepoch += 1
        epoch += 1
def train(train_loader, test_loader, gradient_clipping=1, hidden_state_size=10,
          lr=0.001, epochs=100, classify=True):
    """Train the MNIST autoencoder, optionally with a classification head.

    Args:
        train_loader: DataLoader yielding (image, label) batches.
        test_loader: DataLoader used for validation after every epoch.
        gradient_clipping: max grad norm; falsy disables clipping.
        hidden_state_size: hidden size of the EncoderDecoder.
        lr: learning rate for Adam.
        epochs: number of training epochs.
        classify: when True the model also predicts the digit label and the
            loss combines reconstruction and classification terms.
    """
    model = EncoderDecoder(input_size=28, hidden_size=hidden_state_size, output_size=28, labels_num=10) if not classify \
        else EncoderDecoder(input_size=28, hidden_size=hidden_state_size, output_size=28, is_prediction=True, labels_num=10)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_name = "mse"
    min_loss = float("inf")  # best validation loss so far
    task_name = "classify" if classify else "reconstruct"
    validation_losses = []
    validation_accuracies = []
    tensorboard_writer = init_writer(results_path, lr, classify, hidden_state_size, epochs)
    # BUG FIX: range(1, epochs) trained only epochs-1 epochs.
    for epoch in range(1, epochs + 1):
        total_loss = 0
        total_batches = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            # treat each image as a 28-step sequence of 28-dim rows
            data_sequential = data.view(data.shape[0], 28, 28)
            optimizer.zero_grad()
            if classify:
                reconstructed_batch, batch_pred_probs = model(data_sequential)
                loss = model.loss(data_sequential, reconstructed_batch, target, batch_pred_probs)
            else:
                reconstructed_batch = model(data_sequential)
                loss = model.loss(data_sequential, reconstructed_batch)
            total_loss += loss.item()
            loss.backward()
            if gradient_clipping:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_clipping)
            optimizer.step()
            total_batches += 1
        epoch_loss = total_loss / total_batches
        tensorboard_writer.add_scalar('train_loss', epoch_loss, epoch)
        print(f'Train Epoch: {epoch} \t loss: {epoch_loss}')
        validation_loss = validation(model, test_loader, validation_losses, device,
                                     classify, validation_accuracies,
                                     tensorboard_writer, epoch)
        model.train()  # validation may leave the model in eval mode
        # Checkpoint every 5 epochs and whenever validation improves.
        if epoch % 5 == 0 or validation_loss < min_loss:
            file_name = f"ae_toy_{loss_name}_lr={lr}_hidden_size={hidden_state_size}_epoch={epoch}_gradient_clipping={gradient_clipping}.pt"
            path = os.path.join(results_path, "saved_models", "MNIST_task", task_name, file_name)
            # ROBUSTNESS: ensure the checkpoint directory exists (torch.save
            # raises FileNotFoundError otherwise).
            os.makedirs(os.path.dirname(path), exist_ok=True)
            torch.save(model, path)
            min_loss = min(validation_loss, min_loss)
    plot_validation_loss(epochs, gradient_clipping, lr, loss_name,
                         validation_losses, hidden_state_size, task_name)
    if classify:
        plot_validation_acc(epochs, gradient_clipping, lr, loss_name,
                            validation_accuracies, hidden_state_size, task_name)
def train(train_loader, validate_data, device, gradient_clipping=1, hidden_state_size=10,
          lr=0.001, opt="adam", epochs=1000, batch_size=32):
    """Train the toy-task autoencoder with inline validation every 20 epochs.

    Args:
        train_loader: iterable yielding training batches (tensors).
        validate_data: held-out tensor used for validation.
        device: torch device to train on.
        gradient_clipping: max grad norm; falsy disables clipping.
        hidden_state_size: hidden size of the EncoderDecoder.
        lr: learning rate.
        opt: ``"adam"`` selects Adam; any other value selects RMSprop.
        epochs: number of training epochs.
        batch_size: forwarded to the plotting helpers only (labelling).
    """
    model = EncoderDecoder(1, hidden_state_size, 1, 50).to(device)
    validate_data = validate_data.to(device)
    if opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        optimizer_name = 'adam'
    else:
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
        # BUG FIX: the fallback label used to be 'mse' (a loss, not an
        # optimizer), so RMSprop runs produced mislabelled files/plots.
        optimizer_name = 'rmsprop'
    mse = nn.MSELoss()
    min_loss = float("inf")          # best single-batch loss seen so far
    best_loss_global = float("inf")  # best epoch-mean loss seen so far
    min_in, min_out = None, None     # batch achieving min_loss, for plotting
    validation_losses = []
    for epoch in range(0, epochs):
        total_loss = 0
        for batch_idx, data in enumerate(train_loader):
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = mse(output, data)
            total_loss += loss.item()
            if loss.item() < min_loss:
                min_loss = loss.item()
                min_in, min_out = data, output
            loss.backward()
            if gradient_clipping:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_clipping)
            optimizer.step()
        epoch_loss = total_loss / len(train_loader)
        best_loss_global = min(best_loss_global, epoch_loss)
        print(f'Train Epoch: {epoch} \t loss: {epoch_loss}')
        # Checkpoint every 100 epochs.
        if epoch % 100 == 0:
            path = f'{results_path}saved_models/ae_toy_{optimizer_name}_lr={lr}_hidden_size={hidden_state_size}_' \
                   f'_gradient_clipping={gradient_clipping}_'
            create_folders(path)
            torch.save(model, path + f"/epoch={epoch}_bestloss={best_loss_global}.pt")
        # run validation every 20 epochs
        if epoch % 20 == 0:
            model.eval()
            # BUG FIX: validation used to build a gradient graph; evaluate
            # under no_grad.  (Also dropped the no-op mse.eval()/mse.train().)
            with torch.no_grad():
                output = model(validate_data)
                loss = mse(output, validate_data)
            validation_losses.append(loss.item())
            model.train()
    plot_sequence_examples(epochs, gradient_clipping, lr, min_in, min_out,
                           optimizer_name, batch_size)
    plot_validation_loss(epochs, gradient_clipping, lr, optimizer_name,
                         validation_losses, batch_size)
def main():
    """CLI entry point: train and evaluate the ar2en encoder-decoder.

    Parses hyperparameters from the command line, trains for ``-epochs``
    epochs with per-epoch dev evaluation, reports final test accuracy and
    writes loss/accuracy plots.

    Returns:
        Final test word accuracy (float).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-data', help="Path to ar2en dataset.", default='./ar2en_dataset')
    parser.add_argument('-embeddings_size', type=int, default=300)
    parser.add_argument('-layers', type=int, default=2)
    parser.add_argument('-hidden_sizes', type=int, default=300)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-epochs', type=int, default=20)
    parser.add_argument('-optimizer', choices=['sgd', 'adam'], default='adam')
    parser.add_argument('-learning_rate', type=float, default=0.001)
    parser.add_argument('-l2_decay', type=float, default=0.0)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument(
        '-cuda', action='store_true',
        help='Whether or not to use cuda for parallelization (if devices available)')
    parser.add_argument('-name', type=str, required=False, default=None,
                        help="Filename for the plot")
    parser.add_argument('-quiet', action='store_true', help='No execution output.')
    parser.add_argument(
        '-tqdm', action='store_true',
        help='Whether or not to use TQDM progress bar in training.')
    parser.add_argument(
        '-display_vocabularies', action="store_true",
        help="Only display the vocabularies (no further execution).")
    parser.add_argument(
        '-reverse_source_string', action="store_true",
        help="Whether or not to reverse the source arabic string.")
    parser.add_argument(
        '-bidirectional', action="store_true",
        help="Whether or not to use a bidirectional encoder LSTM.")
    parser.add_argument('-attention', type=str, choices=["dot", "general"],
                        required=False, default=None,
                        help="Attention mechanism in the decoder.")
    opt = parser.parse_args()

    # ############# #
    # 1 - Load Data #
    # ############# #
    dataset = Ar2EnDataset(opt.data, opt.reverse_source_string)
    if opt.display_vocabularies:
        sys.exit(0)
    dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=True)
    X_dev, y_dev = dataset.X_dev, dataset.y_dev
    X_test, y_test = dataset.X_test, dataset.y_test

    # ################ #
    # 2 - Create Model #
    # ################ #
    device = torch.device("cuda:0" if torch.cuda.is_available() and opt.cuda else "cpu")
    if not opt.quiet:
        print(f"Using device '{device}'", flush=True)
    model = EncoderDecoder(dataset.n_inputs, dataset.n_outputs,
                           opt.embeddings_size, opt.attention, opt.bidirectional,
                           opt.hidden_sizes, opt.layers, opt.dropout,
                           dataset.arabic_vocabulary, dataset.english_vocabulary,
                           device)

    # ############# #
    # 3 - Optimizer #
    # ############# #
    optimizer = {
        "adam": torch.optim.Adam,
        "sgd": torch.optim.SGD
    }[opt.optimizer](model.parameters(), lr=opt.learning_rate, weight_decay=opt.l2_decay)
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.english_vocabulary["$PAD"])

    # ###################### #
    # 4 - Train and Evaluate #
    # ###################### #
    epochs = torch.arange(1, opt.epochs + 1)
    train_mean_losses = []
    val_word_acc = []
    val_char_acc = []
    train_losses = []
    for epoch in epochs:
        if not opt.quiet:
            print('\nTraining epoch {}'.format(epoch), flush=True)
        # BUG FIX: the loader itself was rebound with `dataloader =
        # tqdm(dataloader)` every epoch, nesting progress bars and shadowing
        # the real DataLoader; wrap into a per-epoch iterator instead.
        if opt.tqdm:
            from tqdm import tqdm
            epoch_iter = tqdm(dataloader)
        else:
            epoch_iter = dataloader
        for X_batch, y_batch in epoch_iter:
            loss = train_batch(X_batch, y_batch, model, optimizer, criterion)
            train_losses.append(loss)
        # NOTE(review): train_losses accumulates across ALL epochs, so this is
        # a running mean over the whole run, not a per-epoch mean — confirm
        # this is intended before changing.
        mean_loss = torch.tensor(train_losses).mean().item()
        word_acc, char_acc = evaluate(model, X_dev, y_dev)
        train_mean_losses.append(mean_loss)
        val_word_acc.append(word_acc)
        val_char_acc.append(char_acc)
        if not opt.quiet:
            print('Training loss: %.4f' % mean_loss, flush=True)
            print('Valid word acc: %.4f' % val_word_acc[-1], flush=True)
            print('Valid char acc: %.4f' % val_char_acc[-1], flush=True)
    final_test_accuracy_words, final_test_accuracy_chars = evaluate(model, X_test, y_test)
    if not opt.quiet:
        print('\nFinal Test Word Acc: %.4f' % final_test_accuracy_words, flush=True)
        print('Final Test Char Acc: %.4f' % final_test_accuracy_chars, flush=True)

    # ######## #
    # 5 - Plot #
    # ######## #
    name = opt.name if opt.name is not None else "encoder_decoder"
    plot(epochs, train_mean_losses, ylabel='Loss', name=name + "_loss",
         title="Training Loss")
    plot(
        epochs, val_word_acc, ylabel='Word Val Acc', name=name + "_acc",
        title=f"Word Validation Accuracy\n(Final Word Test Accuracy: {round(final_test_accuracy_words,3)})")
    return final_test_accuracy_words
dataset = JIGSAWSegmentsDataset(dataset_path, dataset_tasks) dataloader = JIGSAWSegmentsDataloader(batch_size, input_length, output_length, dataset, scale=scale) model = EncoderDecoder(src_vocab, tgt_vocab, N=num_layers, input_size=feature_dim, hidden_layer=hidden_layer, h=num_heads, dropout=dropout, task_dim=task_dim) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=betas, eps=eps) running_loss_plot = train_epochs(dataloader, train_split, model, loss_function, optimizer, n_epochs=num_epochs, use_gpu=use_gpu, use_task=use_task) visulize_results( dataloader, 0, loss_function,
def train(train_loader, test_loader, gradient_clipping=1, hidden_state_size=10,
          lr=0.001, epochs=3000, is_prediction=False):
    """Train the S&P500 sequence autoencoder, optionally with prediction head.

    Args:
        train_loader: DataLoader yielding (sequence, target) batches.
        test_loader: DataLoader used for validation after every epoch.
        gradient_clipping: max grad norm; falsy disables clipping.
        hidden_state_size: hidden size of the EncoderDecoder.
        lr: learning rate for Adam.
        epochs: number of training epochs.
        is_prediction: when True the model also emits per-step predictions
            and the loss combines reconstruction and prediction terms.
    """
    model = EncoderDecoder(input_size=1, hidden_size=hidden_state_size, output_size=1, labels_num=1) if not is_prediction \
        else EncoderDecoder(input_size=1, hidden_size=hidden_state_size, output_size=1, is_prediction=True, labels_num=1, is_snp=True)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_name = "mse"
    min_loss = float("inf")  # best validation loss so far
    task_name = "classify" if is_prediction else "reconstruct"
    validation_losses = []
    tensorboard_writer = init_writer(lr, is_prediction, hidden_state_size, epochs, task_name)
    # BUG FIX: range(1, epochs) trained only epochs-1 epochs.
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            # reshape to (batch, seq_len, 1) feature sequences
            data_sequential = (data.view(data.shape[0], data.shape[1], 1)).to(device)
            target = target.to(device)
            optimizer.zero_grad()
            if is_prediction:
                reconstructed_batch, batch_preds = model(data_sequential)
                batch_preds = batch_preds.view(batch_preds.shape[0], batch_preds.shape[1])
                loss = model.loss(data_sequential, reconstructed_batch, target, batch_preds)
            else:
                reconstructed_batch = model(data_sequential)
                loss = model.loss(data_sequential, reconstructed_batch)
            total_loss += loss.item()
            loss.backward()
            if gradient_clipping:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_clipping)
            optimizer.step()
        epoch_loss = total_loss / len(train_loader)
        tensorboard_writer.add_scalar('train_loss', epoch_loss, epoch)
        print(f'Train Epoch: {epoch} \t loss: {epoch_loss}')
        validation_loss = validation(model, test_loader, validation_losses, device,
                                     is_prediction, tensorboard_writer, epoch)
        # CONSISTENCY: restore train mode after validation, as the MNIST
        # variant of this trainer does.
        model.train()
        # Checkpoint every 5 epochs and whenever validation improves.
        if epoch % 5 == 0 or validation_loss < min_loss:
            file_name = f"ae_s&p500_{loss_name}_lr={lr}_hidden_size={hidden_state_size}_epoch={epoch}_gradient_clipping={gradient_clipping}.pt"
            path = os.path.join(results_path, "saved_models", "s&p500_task", task_name, file_name)
            # ROBUSTNESS: ensure the checkpoint directory exists before saving.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            torch.save(model, path)
            min_loss = min(validation_loss, min_loss)
    plot_validation_loss(epochs, gradient_clipping, lr, loss_name,
                         validation_losses, hidden_state_size, task_name)
def main():
    """Train an MNIST autoencoder for 10 epochs, report train/val MSE, save model.

    Images are flattened to 784-dim vectors; the network reconstructs them and
    is optimised with Adam against mean MSE.  The whole model object is saved
    to ``model.pth`` at the end.
    """
    # Check if cuda is available
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    transform = transforms.Compose([transforms.ToTensor()])
    # Create the train test split
    # Automatically download if missing
    train_data = datasets.MNIST(root="data", train=True, transform=transform, download=True)
    val_data = datasets.MNIST(root="data", train=False, transform=transform, download=True)
    # Create dataloaders
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
    # Define model, loss function and optimizer
    net = EncoderDecoder()
    net.to(device)
    epochs = 10
    optimizer = Adam(net.parameters(), lr=0.001, weight_decay=1e-7)
    loss_fn = MSELoss(reduction="mean")
    # Training loop
    for i in range(epochs):
        # print(i)
        print("Epoch {}/{}".format(i + 1, epochs))
        epoch_loss = []
        counter = 0
        for imgs, labels in train_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)  # labels unused: this is pure reconstruction
            # flatten each image to a 784-dim vector
            imgs = imgs.reshape(imgs.shape[0], -1)
            counter += 1
            y_pred = net(imgs)
            # reconstruction loss against the (flattened) input itself
            loss = loss_fn(imgs, y_pred)
            epoch_loss.append(loss.item())
            print("{}/{}. Train Loss: {:.2f}".format(counter, len(train_data)//32, loss.item()), end="\r")
            loss.backward()
            optimizer.step()
            # zeroing after step() is fine here: grads are fresh at the
            # first backward and cleared before every subsequent one
            optimizer.zero_grad()
        epoch_loss = np.array(epoch_loss)
        print()
        print("Checking val loss")
        val_loss = []
        counter = 0
        for imgs, labels in val_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            imgs = imgs.reshape(imgs.shape[0], -1)
            counter += 1
            # no gradients needed for validation
            with torch.no_grad():
                y_pred = net(imgs)
                loss = loss_fn(imgs, y_pred)
            val_loss.append(loss.item())
            print("{}/{}. Train Loss: {:.2f}".format(counter, len(val_data)//32, loss.item()), end="\r")
        print()
        val_loss = np.array(val_loss)
        print("Training loss epoch: {:.2f}\tValidation loss: {:.2f}".format(epoch_loss.mean(), val_loss.mean()))
    # Save model (entire module object, not just the state_dict)
    torch.save(net, "model.pth")
def train(resume_training=True):
    """Train the seq2seq Transformer on the NMT dataset with checkpointing.

    Args:
        resume_training: when True and a non-empty checkpoint file exists,
            resume from it; otherwise initialise fresh (Xavier) weights and
            create the checkpoint file.
    """
    EMBEDDING_SIZE = 32
    num_hiddens, num_layers, dropout, batch_size, num_steps = EMBEDDING_SIZE, 2, 0.1, 64, 10
    lr, num_epochs, device = 0.005, 1000, d2lt.try_gpu()
    ffn_num_input, ffn_num_hiddens, num_heads = EMBEDDING_SIZE, 64, 4
    key_size, query_size, value_size = EMBEDDING_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE
    norm_shape = [EMBEDDING_SIZE]

    ### Load data
    data_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)
    encoder = TransformerEncoder(len(src_vocab), key_size, query_size, value_size,
                                 num_hiddens, norm_shape, ffn_num_input,
                                 ffn_num_hiddens, num_heads, num_layers, dropout)
    decoder = TransformerDecoder(len(tgt_vocab), key_size, query_size, value_size,
                                 num_hiddens, norm_shape, ffn_num_input,
                                 ffn_num_hiddens, num_heads, num_layers, dropout)

    ### Load model
    model = EncoderDecoder(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Load checkpoint (only if the file exists and is non-empty)
    if resume_training and PATH_MODEL.exists() and os.path.getsize(PATH_MODEL) > 0:
        model, optimizer, last_epoch = load_checkpoint(model, optimizer)
        print("Continue training from last checkpoint...")
    else:
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        # create an empty placeholder file for future checkpoints
        with open(PATH_MODEL, 'w') as fp:
            pass
        print('No prior checkpoint existed, created new save files for checkpoint.')
        model.apply(xavier_init_weights)
        last_epoch = 0

    ### Initialize loss function
    loss = MaskedSoftmaxCELoss()

    ### Train
    model.train()
    for epoch in range(last_epoch, num_epochs):
        timer = d2lt.Timer()
        metric = d2lt.Accumulator(2)  # sum of training loss, no. of tokens
        for batch in data_iter:
            # BUG FIX: gradients were never cleared, so they accumulated
            # across every batch and epoch of the run.
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2lt.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            print(f'epoch {epoch + 1} - '
                  f'loss {metric[0] / metric[1]:.5f}')
            ### Save checkpoint
            save_checkpoint(epoch, model, optimizer)
    print(f'loss {metric[0] / metric[1]:.5f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')