def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task and save the curves.

    Args:
        config: Run configuration. This function reads ``model_type``
            ('RNN' or 'LSTM'), ``device``, ``input_length``, ``input_dim``,
            ``num_hidden``, ``num_classes``, ``batch_size``,
            ``learning_rate``, ``max_norm`` and ``train_steps``.

    Raises:
        AssertionError: If ``config.model_type`` is not 'RNN' or 'LSTM'.

    Side effects:
        Prints progress every 100 steps. When ``step`` reaches
        ``config.train_steps`` the recorded steps, accuracies and losses are
        written to ``<model_type>_<input_length>.npz`` and training stops.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device on which to run the model;
    # if a GPU was chosen, check that CUDA is actually available.
    if str(config.device) != "cpu":
        if not torch.cuda.is_available():
            print('\n* GPU was selected but CUDA is not available.\nTraining on CPU ...')
            device = torch.device("cpu")
        else:
            print('\nCUDA is available! Training on GPU ...')
            device = torch.device(config.device)
    else:
        print('\nTraining on CPU ...')
        device = torch.device(config.device)

    # Initialize the model that we are going to use
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, device)

    # Print configuration
    print("Model Type: {!s:5} Input Length: {!s:5} Learning Rate: {}\n"
          .format(config.model_type, config.input_length,
                  config.learning_rate))

    model = torch.nn.DataParallel(model).to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length+1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    train_loss, train_accuracy, train_steps = [], [], []

    # Enable train mode
    model.train()

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Move tensors to the selected device
        batch_targets = batch_targets.long().to(device)
        batch_inputs = batch_inputs.to(device)

        # Forward pass and loss
        predictions = model(batch_inputs)
        loss = criterion(predictions, batch_targets)

        # Back-propagate
        loss.backward()

        # Rescale the gradients so their total norm is at most
        # config.max_norm; this helps prevent the exploding gradient problem
        # in RNNs / LSTMs.
        # ref: https://medium.com/usf-msds/deep-learning-best-practices-1-weight-initialization-14e5c0295b94
        # The in-place `clip_grad_norm_` replaces the deprecated
        # `clip_grad_norm`.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        # Update weights, then clear the gradients for the next step
        optimizer.step()
        optimizer.zero_grad()

        # Just for time measurement; a coarse clock can report zero elapsed
        # time, so avoid dividing by zero.
        t2 = time.time()
        elapsed = float(t2 - t1)
        examples_per_second = (config.batch_size / elapsed
                               if elapsed > 0 else float('inf'))

        if step % 10 == 0:
            # Store accuracy and loss
            train_steps.append(step)
            train_loss.append(loss.item())
            train_accuracy.append(accuracy(predictions, batch_targets))

        # Every multiple of 100 is also a multiple of 10, so the latest
        # stored values belong to this step.
        if step % 100 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size,
                      examples_per_second,
                      train_accuracy[-1], train_loss[-1]))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655

            # Save the recorded training accuracies and losses
            file_name = (str(config.model_type) + '_'
                         + str(config.input_length) + '.npz')
            np.savez(file_name,
                     train_steps=train_steps,
                     train_accuracy=train_accuracy,
                     train_loss=train_loss,
                     model_type=config.model_type,
                     input_length=config.input_length)
            break

    print('Done training.')
def train(config, device="cpu"):
    """Train a VanillaRNN or LSTM on the palindrome task with TensorBoard logging.

    Training stops when ``step`` reaches ``config.train_steps`` or when the
    mean accuracy over the last 200 steps changes by less than 1e-4 between
    two consecutive checks.

    Args:
        config: Run configuration. This function reads ``model_type``
            ('RNN' or 'LSTM'), ``input_length``, ``input_dim``,
            ``num_hidden``, ``num_classes``, ``batch_size``,
            ``learning_rate``, ``max_norm`` and ``train_steps``.
        device: Device string passed to the model and to ``Tensor.to``.
            'cpu' and 'cuda:0' additionally select the default tensor type.

    Raises:
        AssertionError: If ``config.model_type`` is not 'RNN' or 'LSTM'.

    Side effects:
        Writes TensorBoard scalars under ``tensorboard/<model>/<run_id>``,
        prints progress every 100 steps and calls ``save_results`` once
        training has finished.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Tensorboard summary writer
    run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_" +
                                     config.model_type.lower() + '_' +
                                     str(config.input_length))
    log_dir = 'tensorboard/' + config.model_type.lower() + '/' + run_id
    writer = SummaryWriter(log_dir=log_dir)

    # Close the writer even when training raises, so buffered events are
    # not lost.
    try:
        # Torch settings
        if device == 'cpu':
            torch.set_default_tensor_type(torch.FloatTensor)
        elif device == 'cuda:0':
            torch.set_default_tensor_type(torch.cuda.FloatTensor)

        # Initialize the model that we are going to use
        model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
        model = model_cls(config.input_length, config.input_dim,
                          config.num_hidden, config.num_classes,
                          config.batch_size, device=device).to(device)

        # Initialize the dataset and data loader (note the +1)
        dataset = PalindromeDataset(config.input_length + 1)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

        # Setup the loss and optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate)

        # Accuracy and loss to be saved
        accuracies = []
        losses = []

        # Useful for convergence check
        avg_range = 200
        last_accuracy = 0
        convergence_threshold = 1e-4
        # Stays None until the first convergence window has been evaluated.
        avg_accuracy = None

        model.train()

        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Only for time measurement of step through network
            t1 = time.time()

            # Move the batch to the selected device
            batch_inputs = batch_inputs.to(device=device)
            batch_targets = batch_targets.to(device=device)

            # Forward pass (calling the module, not .forward, so that any
            # registered hooks run)
            predictions = model(batch_inputs)

            # Compute loss
            loss = criterion(predictions, batch_targets)

            # Reset gradients before backwards pass
            optimizer.zero_grad()

            # Backward pass
            loss.backward()

            # Clipping gradients to avoid exploding gradient problem
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=config.max_norm)

            # Update weights
            optimizer.step()

            # Compute accuracy
            accuracy = get_accuracy(predictions, batch_targets)

            # Just for time measurement; a coarse clock can report zero
            # elapsed time, so avoid dividing by zero.
            t2 = time.time()
            elapsed = float(t2 - t1)
            examples_per_second = (config.batch_size / elapsed
                                   if elapsed > 0 else float('inf'))

            # Detach so the stored history does not keep the autograd graph
            # of every step alive.
            loss = loss.detach()

            # Add accuracy and loss to the writer
            writer.add_scalars('accuracy_and_loss', {
                'acc': accuracy,
                'loss': loss
            }, step)

            # Store accuracy and loss
            accuracies.append(accuracy)
            losses.append(loss)

            # Print information
            if step % 100 == 0:
                print(
                    "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                    "Accuracy = {:.2f}, Loss = {:.3f}".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                        config.train_steps, config.batch_size,
                        examples_per_second, accuracy, loss))

            # Check for convergence
            if step % avg_range == 0 and step != 0:
                avg_accuracy = np.mean(accuracies[-avg_range:])
                delta = np.abs(avg_accuracy - last_accuracy)
                if delta < convergence_threshold:
                    sign = "+" if avg_accuracy > last_accuracy else "-"
                    print("The model has converged with accuracy",
                          avg_accuracy, "(" + sign + str(delta) + ")")
                    break
                last_accuracy = avg_accuracy

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

        save_results(accuracies, losses, run_id, config.model_type,
                     config.input_length, last_accuracy)
    finally:
        writer.close()

    # Training may end before the first convergence window (fewer than
    # avg_range steps); report the mean over whatever was recorded instead of
    # failing on an unbound name.
    if avg_accuracy is None:
        avg_accuracy = (np.mean(accuracies[-avg_range:])
                        if accuracies else last_accuracy)
    print('Done training. Accuracy:', avg_accuracy)
def _evaluate_held_out(model, criterion, inputs_list, targets_list, device):
    """Return (mean accuracy, mean loss) of ``model`` over held-out batches."""
    batch_accuracies = []
    batch_losses = []
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for held_inputs, held_targets in zip(inputs_list, targets_list):
            held_inputs = held_inputs.to(device)
            held_targets = held_targets.to(device)
            output = model(held_inputs.transpose(0, 1).double())
            predicted = torch.argmax(output.transpose(0, 1), dim=0)
            correct = predicted == held_targets
            batch_accuracies.append(correct.sum().item() / len(correct))
            batch_losses.append(float(criterion(output, held_targets)))
    return (sum(batch_accuracies) / len(batch_accuracies),
            sum(batch_losses) / len(batch_losses))


def train(config, inp_len):
    """Train a VanillaRNN or LSTM on palindromes of length ``inp_len``.

    The first 50 batches from the data loader are not trained on; they are
    kept as a fixed held-out set. From then on every batch is a training
    step, and every 10th step the model is evaluated on the held-out set.
    Training stops when ``step`` equals ``config.train_steps`` or when more
    than 10 evaluations exist and the mean of the last three held-out
    accuracies is exactly 1.0.

    Args:
        config: Run configuration. This function reads ``model_type``
            ('RNN' or 'LSTM'), ``input_dim``, ``num_classes``,
            ``num_hidden``, ``batch_size``, ``learning_rate``, ``max_norm``
            and ``train_steps``.
        inp_len: Input length handed to the model; the dataset is built with
            ``inp_len + 1``.

    Raises:
        AssertionError: If ``config.model_type`` is not 'RNN' or 'LSTM'.

    Side effects:
        Prints progress and appends one summary line to ``LSTM.txt``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('Currently using: ', device)

    # Initialize the model that we are going to use
    input_length = inp_len
    input_dim = config.input_dim
    num_classes = config.num_classes
    num_hidden = config.num_hidden
    batch_size = config.batch_size
    learning_rate = config.learning_rate

    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(input_length, input_dim, num_hidden, num_classes,
                      batch_size, device).double()
    model = model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(inp_len + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    accuracy_list = []
    loss_list = []
    test_list_in = []
    test_list_ta = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        if step < 50:
            # The first 50 batches form the held-out evaluation set.
            test_list_in.append(batch_inputs)
            test_list_ta.append(batch_targets)
        else:
            # Only for time measurement of step through network
            t1 = time.time()

            model.train()
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            output = model(batch_inputs.transpose(0, 1).double())
            loss_for_backward = criterion(output, batch_targets)

            optimizer.zero_grad()
            loss_for_backward.backward()

            # Clip the total gradient norm to config.max_norm to guard
            # against exploding gradients. This has to happen after
            # backward() and before step(): clipping earlier only touches
            # stale gradients that zero_grad() then discards.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=config.max_norm)
            optimizer.step()

            output_indices = torch.argmax(output.transpose(0, 1), dim=0)
            correct_indices = output_indices == batch_targets
            accuracy = correct_indices.sum().item() / len(correct_indices)

            # Just for time measurement; a coarse clock can report zero
            # elapsed time, so avoid dividing by zero.
            t2 = time.time()
            elapsed = float(t2 - t1)
            examples_per_second = (config.batch_size / elapsed
                                   if elapsed > 0 else float('inf'))

            if step % 10 == 0:
                print(
                    "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                    "Accuracy = {:.2f}, Loss = {:.3f}".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                        config.train_steps, config.batch_size,
                        examples_per_second, accuracy,
                        loss_for_backward.item()))

                # Run a forward pass over the held-out set
                avg_test, avg_loss = _evaluate_held_out(
                    model, criterion, test_list_in, test_list_ta, device)
                print('Test Accuracy: ', avg_test)
                accuracy_list.append(avg_test)
                loss_list.append(avg_loss)

        if step == config.train_steps or (
                len(accuracy_list) > 10 and
                (sum(accuracy_list[-3:]) / len(accuracy_list[-3:])) == 1.0):
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    line = ' '.join(
        (str(config.model_type), 'Palindrome length:', str(input_length),
         'Accuracy:', str(accuracy_list), 'Loss', str(loss_list)))
    # NOTE: results are appended to 'LSTM.txt' for both model types.
    with open('LSTM.txt', 'a') as file:
        file.write(line + '\n')