def train(config):

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Setup the model that we are going to use
    print("Initializing Vanilla RNN model...")
    model = VanillaRNN(
        seq_length=config.input_length,
        input_dim=config.input_dim,
        num_hidden=config.num_hidden,
        num_classes=config.num_classes,
        batch_size=config.batch_size,
        device=device
    )

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length+1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    loss_function = torch.nn.NLLLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Move to GPU
        batch_inputs = batch_inputs.unsqueeze(-1)  # add input dimensionality
        batch_inputs = batch_inputs.to(device)     # [batch_size, seq_length, 1]
        batch_targets = batch_targets.to(device)   # [batch_size]

        # Reset for next iteration
        model.zero_grad()

        # Forward pass
        log_probs = model(batch_inputs)

        # Compute the loss, gradients and update network parameters
        loss = loss_function(log_probs, batch_targets)
        loss.backward()

        ############################################################################
        # QUESTION: what happens here and why?
        ############################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        ############################################################################

        optimizer.step()

        predictions = torch.argmax(log_probs, dim=1)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / log_probs.size(0)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/float(t2-t1)

        if step % 10 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size, examples_per_second,
                      accuracy, loss
                  ))

        # Check if training is finished
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
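# The QUESTION in the block above asks what clip_grad_norm_ does: it rescales
# all gradients in place so that their joint L2 norm does not exceed max_norm,
# which guards against exploding gradients in recurrent nets. A minimal
# standalone sketch (the function and variable names below are illustrative,
# not part of this codebase):
import torch


def demo_grad_clipping(max_norm=1.0):
    # One parameter with a deliberately huge gradient (L2 norm = 200).
    w = torch.nn.Parameter(torch.ones(4))
    w.grad = torch.full((4,), 100.0)

    # Returns the total norm *before* clipping, then rescales w.grad in place.
    total_norm = torch.nn.utils.clip_grad_norm_([w], max_norm=max_norm)
    print(total_norm)     # tensor(200.)
    print(w.grad.norm())  # ~max_norm after clipping


demo_grad_clipping()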
def train(config, n_run):

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Train on the T-1 first digits
    config.input_length = config.input_length - 1

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim, config.num_hidden,
                           config.num_classes, config.batch_size, device=device)
    elif config.model_type == 'LSTM':
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device=device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length+1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

    model.to(device)

    train_loss = []
    train_acc = []
    t_loss = []
    t_acc = []

    # Convergence condition
    eps = 1e-6

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Clear stored gradients
        model.zero_grad()

        # Only for time measurement of step through network
        t1 = time.time()

        # Move inputs and labels to the device
        x = batch_inputs.to(device)
        y = batch_targets.to(device)

        # Forward pass
        pred = model.forward(x)
        loss = criterion(pred, y)
        t_loss.append(loss.item())

        optimizer.zero_grad()

        # Backward pass
        loss.backward()

        ############################################################################
        # QUESTION: what happens here and why?
        # ANSWER: torch.nn.utils.clip_grad_norm_() is used to prevent exploding
        # gradients by 'clipping' the norm of the gradients, restraining the
        # gradient values to a certain threshold. This essentially limits the
        # size of the updates of the parameters of every layer, ensuring that
        # the parameter values don't change too much from their previous values.
        ############################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        ############################################################################

        optimizer.step()

        accuracy = get_accuracy(pred, y, config.batch_size)
        t_acc.append(accuracy.item())

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/float(t2-t1)

        if step % 1000 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size, examples_per_second,
                      accuracy, loss
                  ))

        if step % 100 == 0:
            # Average loss and accuracy over the last 100 steps
            train_loss.append(np.mean(t_loss))
            train_acc.append(np.mean(t_acc))
            t_loss = []
            t_acc = []

            # Stop when the averaged loss has stopped changing
            if step > 0 and abs(train_loss[-1] - train_loss[-2]) < eps:
                break

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('\nDone training.\n')

    # Save trained model and results
    if config.model_type == 'RNN':
        # save model
        torch.save(model, "./Results/RNN/" + str(config.input_length) + "_RNN_model")
        # save train accuracy and loss
        np.save("./Results/RNN/" + str(config.input_length) + "_RNN_accuracy", train_acc)
        np.save("./Results/RNN/" + str(config.input_length) + "_RNN_loss", train_loss)

        # For SURFsara:
        # torch.save(model, str(config.input_length+1) + "_RNN_model_" + str(n_run))
        # np.save(str(config.input_length+1) + "_RNN_accuracy_" + str(n_run), train_acc)
        # np.save(str(config.input_length+1) + "_RNN_loss_" + str(n_run), train_loss)

    elif config.model_type == 'LSTM':
        # save model
        torch.save(model, "./Results/LSTM/" + str(config.input_length) + "_LSTM_model")
        # save train accuracy and loss
        np.save("./Results/LSTM/" + str(config.input_length) + "_LSTM_accuracy", train_acc)
        np.save("./Results/LSTM/" + str(config.input_length) + "_LSTM_loss", train_loss)
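# A note on the checkpointing above: torch.save(model, path) pickles the whole
# module, which ties the file to the exact class definition and directory
# layout at save time. A minimal sketch of the state_dict-based alternative
# that the PyTorch docs recommend (path names here are illustrative):
import torch


def save_checkpoint(model, path):
    # Persist only the learned parameter tensors, not the class itself.
    torch.save(model.state_dict(), path)


def load_checkpoint(model, path):
    # The model must be re-instantiated with the same architecture first.
    model.load_state_dict(torch.load(path))
    return model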
def train(config):

    np.random.seed(42)
    torch.manual_seed(42)

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Initialize the model that we are going to use
    if config.model_type == "RNN":
        print("Training VanillaRNN")
        print()
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, config.device)
    else:
        print("Training LSTM")
        print()
        model = LSTM(config.input_length, config.input_dim,
                     config.num_hidden, config.num_classes,
                     config.batch_size, config.device)

    model = model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length+1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    if config.optimizer == "adam":
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    pl_loss = []
    average_loss = []
    acc = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_targets = batch_targets.long()
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        # zero the parameter gradients
        model.zero_grad()

        # Forward and backward pass
        output = model(batch_inputs)
        out_loss = criterion(output, batch_targets)
        out_loss.backward()

        ############################################################################
        # QUESTION: what happens here and why?
        # ANSWER: helps prevent the exploding gradient problem in RNNs / LSTMs.
        ############################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        ############################################################################

        optimizer.step()

        loss = out_loss.item()

        # Get the predicted class per example
        softmax = torch.nn.Softmax(dim=1)
        predictions = torch.argmax(softmax(output), dim=1)
        # nonzero entries of (predictions - targets) mark wrong predictions
        correct = config.batch_size - len(torch.nonzero(predictions - batch_targets))
        accuracy = correct / config.batch_size

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/float(t2-t1)

        pl_loss.append(loss)
        # running mean over the last (up to) 99 batch losses
        average_loss.append(np.mean(pl_loss[:-100:-1]))
        acc.append(accuracy)

        if step % 10 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size, examples_per_second,
                      accuracy, loss
                  ))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

        # if step % 100 == 0:
        #     # save training loss
        #     plt.plot(pl_loss, 'r-', label="Batch loss", alpha=0.5)
        #     plt.plot(average_loss, 'g-', label="Average loss", alpha=0.5)
        #     plt.legend()
        #     plt.xlabel("Iterations")
        #     plt.ylabel("Loss")
        #     plt.title("Training Loss")
        #     plt.grid(True)
        #     # plt.show()
        #     plt.savefig(config.optimizer+"_loss_"+config.model_type+"_"+str(config.input_length)+".png")
        #     plt.close()

    ################################ training ################################
    # plt.plot(acc, 'g-', alpha=0.5)
    # plt.xlabel("Iterations")
    # plt.ylabel("Accuracy")
    # plt.title("Train Accuracy")
    # plt.grid(True)
    # plt.savefig("accuracy_"+config.sampling+"_"+str(config.temp)+".png")
    # plt.close()
    # fl = config.optimizer+"_acc_"+config.model_type+"_"+str(config.input_length)
    # np.savez(fl, acc=acc)

    print('Done training.')
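# The variant above applies a Softmax before taking the argmax. A tiny check
# (the tensor values are illustrative) showing this is redundant for
# prediction: softmax is strictly increasing per row, so the argmax of the
# probabilities equals the argmax of the raw logits.
import torch

logits = torch.tensor([[2.0, -1.0, 0.5],
                       [0.1,  3.0, 0.2]])
probs = torch.softmax(logits, dim=1)

assert torch.equal(logits.argmax(dim=1), probs.argmax(dim=1))
print(logits.argmax(dim=1))  # tensor([0, 1]) either way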
def train(config):

    assert config.model_type in ('RNN', 'LSTM')

    tol = 0.

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim, config.num_hidden,
                           config.num_classes, config.batch_size, device)
    else:
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # Seed with two dummy entries so the convergence check below is defined at
    # step 0; they are stripped from the returned lists.
    accuracies = [0, 1]
    losses = [0, 1]

    if config.quite:
        bar = tqdm(total=config.train_steps)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs[..., None]   # add the input dimension
        batch_inputs = batch_inputs.to(device)   # .to() is not in-place, so assign the result
        batch_targets = batch_targets.to(device)

        # FORWARD, BACKWARD, AND STEP
        out = model.forward(batch_inputs)
        model.zero_grad()
        loss = criterion(out, batch_targets)
        loss.backward()

        ############################################################################
        # QUESTION: what happens here and why?
        ############################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        ############################################################################

        optimizer.step()

        accuracy = (out.argmax(dim=1) == batch_targets.long()).float().mean()
        losses.append(loss.item())
        accuracies.append(accuracy.item())

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0 and not config.quite:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracies[-1], losses[-1]))

        if config.quite:
            bar.update()

        # Stop at the step budget, or when two consecutive losses are close
        if step == config.train_steps or np.isclose(losses[-1], losses[-2], rtol=tol):
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    return accuracies[2:], losses[2:]
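# A note on the stopping rule above: the third positional argument of
# np.isclose is rtol (it was passed tol positionally in the original), so with
# tol = 0. the check effectively falls back to the default absolute tolerance
# atol = 1e-8. A small sketch of a comparable but explicit check (the
# converged/eps names are illustrative):
import numpy as np


def converged(curr_loss, prev_loss, eps=1e-8):
    # Stop when two consecutive losses differ by less than eps.
    return abs(curr_loss - prev_loss) < eps


assert converged(0.5, 0.5 + 1e-9)
assert not converged(0.5, 0.6)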
def train(config):

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length=config.input_length,
                           input_dim=config.input_dim,
                           num_hidden=config.num_hidden,
                           num_classes=config.num_classes,
                           batch_size=config.batch_size)
    elif config.model_type == 'LSTM':
        model = LSTM(seq_length=config.input_length,
                     input_dim=config.input_dim,
                     num_hidden=config.num_hidden,
                     num_classes=config.num_classes,
                     batch_size=config.batch_size)

    experiment_label = "{}_".format(datetime.now().strftime("%Y-%m-%d %H:%M"))
    for key, value in vars(config).items():
        experiment_label += "{}={}_".format(key, value)

    # TensorBoard API
    cc = CrayonClient(hostname='18.202.229.41', port=6007)
    xp = cc.create_experiment(xp_name=experiment_label)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length+1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        if config.input_dim == 10:
            batch_inputs = one_hot(batch_inputs.type(torch.long), config.input_dim)

        p = model(batch_inputs)
        loss = criterion(p, batch_targets)

        predictions = get_predictions(p)
        accuracy = (predictions == batch_targets).sum().item() / len(predictions)

        # backward pass
        model.zero_grad()
        loss.backward()

        ############################################################################
        # QUESTION: what happens here and why?
        ############################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        ############################################################################

        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/float(t2-t1)

        if step % 10 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size, examples_per_second,
                      accuracy, loss
                  ))

        # post metrics to tensorboard
        xp.add_scalar_dict({
            'accuracy': accuracy,
            'loss': loss.item()
        }, wall_time=time.time(), step=step)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    _ = xp.to_zip(experiment_label + ".zip")

    print('Done training.')
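# The one_hot helper used above is defined elsewhere in this codebase. A
# minimal sketch of the equivalent behaviour with PyTorch's built-in, assuming
# digit inputs of shape [batch_size, seq_length] and 10 classes (the tensor
# values below are illustrative):
import torch
import torch.nn.functional as F

digits = torch.tensor([[3, 1, 4],
                       [1, 5, 9]])                    # [batch_size, seq_length]
encoded = F.one_hot(digits, num_classes=10).float()  # [batch_size, seq_length, 10]
print(encoded.shape)  # torch.Size([2, 3, 10])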
def train(config):

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim, config.num_hidden,
                           config.num_classes, config.batch_size, device)
    else:
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)
    # (RMSprop defaults: alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False)

    accuracies = []
    losses = []
    old_loss = float('inf')

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs[..., None]  # each input is a single number, so add the input dimension
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        batch_predictions = model.forward(batch_inputs)
        loss = criterion(batch_predictions, batch_targets)
        losses.append(loss.item())

        # Clear the gradients accumulated in the previous step before backprop
        model.zero_grad()
        loss.backward()

        # Clip the gradient norm to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

        # Step after clipping, so the update uses the clipped gradients
        optimizer.step()

        accuracy = accuracy_(batch_predictions, batch_targets)
        accuracies.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            with open(config.save_logs, 'a') as file:
                file.write(
                    "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                    "Accuracy = {:.2f}, Loss = {:.3f}"
                    .format(datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                            config.train_steps, config.batch_size,
                            examples_per_second, accuracy, loss) + '\n')

        # Stop at the step budget, or if two consecutive losses are identical
        if step == config.train_steps or old_loss == loss.item():
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break
        old_loss = loss.item()

    print('Done training.')

    return losses, accuracies
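# The variant above clears gradients before backward() and clips before
# step(); that ordering is the canonical one, since backward() accumulates
# gradients across calls and the optimizer should consume the clipped
# gradients. A runnable sketch of the pattern in isolation (the toy model,
# data, and max_norm value are illustrative, not taken from this codebase):
import torch

toy_model = torch.nn.Linear(4, 2)
toy_optimizer = torch.optim.RMSprop(toy_model.parameters(), lr=1e-3)
toy_criterion = torch.nn.CrossEntropyLoss()
x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))

toy_optimizer.zero_grad()                # 1. clear gradients from the last step
loss = toy_criterion(toy_model(x), y)    # 2. forward pass
loss.backward()                          # 3. compute gradients
torch.nn.utils.clip_grad_norm_(toy_model.parameters(), max_norm=10.0)  # 4. clip
toy_optimizer.step()                     # 5. apply the (clipped) update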