def setup(self):
    """Checkpoint the run, then build the model, data loader, loss and optimizer.

    The hyper-parameters are read from ``self.config`` in the order given by
    the externally defined ``flags`` sequence. The results are stored on
    ``self`` as ``model``, ``data_loader``, ``criterion`` and ``optimizer``.
    """
    self.save_checkpoint(name='setup')

    # The unpacking order must mirror the order of ``flags`` exactly.
    hyper_params = itemgetter(*flags)(vars(self.config))
    (model_type, input_length, input_dim, num_classes, num_hidden,
     batch_size, learning_rate, train_steps, max_norm,
     wanted_device) = hyper_params
    assert model_type in ('RNN', 'LSTM')

    # Device the model runs on.
    # TODO: debug CUDA issues
    device = torch.device(wanted_device)

    # Both model classes are built from the same positional arguments.
    if model_type == 'LSTM':
        model_cls = LSTM
    else:
        model_cls = VanillaRNN
    self.model = model_cls(input_length, input_dim, num_hidden, num_classes,
                           batch_size, device)
    self.model.to(device)

    # Dataset and data loader (note the +1 on the sequence length).
    palindromes = PalindromeDataset(input_length + 1)
    self.data_loader = DataLoader(palindromes, batch_size, num_workers=1)

    # Loss and optimizer.
    self.criterion = torch.nn.CrossEntropyLoss()
    self.optimizer = torch.optim.RMSprop(self.model.parameters(),
                                         lr=learning_rate)
def train(config):
    """Plot the gradient magnitudes of a VanillaRNN next to those of an LSTM.

    Both models are built on the CPU from the same ``config`` values and fed
    through ``get_grads`` with a batch size of 1; the two results are drawn as
    scatter plots on a logarithmic y-axis.
    """
    cpu = torch.device("cpu")

    # Dataset and data loader (note the +1 on the sequence length).
    palindromes = PalindromeDataset(config.input_length + 1)
    loader = DataLoader(palindromes, 1, num_workers=1)

    # Positional arguments shared by both model constructors.
    model_args = (config.input_length, config.input_dim, config.num_hidden,
                  config.num_classes, cpu, True, False)

    # Build and measure one model at a time, VanillaRNN first.
    gradients = []
    for label, model_cls in (('VanillaRNN', VanillaRNN), ('LSTM', LSTM)):
        network = model_cls(*model_args)
        grads = get_grads(network, cpu, loader, palindromes,
                          config.num_classes)
        gradients.append((label, grads))

    import matplotlib.pyplot as plt

    for label, grads in gradients:
        plt.plot(grads, "o", label=label)
    plt.yscale("log")
    plt.xlabel("Time step")
    plt.ylabel("Gradient magnitude (log)")
    plt.title(
        "Comparison of gradient backprop in types of RNN\nInitialization: xavier_uniform, LSTM forget gate bias: 2"
    )
    plt.legend()
    plt.show()
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task.

    Progress is printed every 10 steps. Loss and accuracy are additionally
    logged to TensorBoard when ``config.tensorboard`` is set, or appended to
    the CSV file ``config.csv`` when ``config.record_plot`` is set.

    Args:
        config: Namespace providing ``model_type``, ``tensorboard``,
            ``summary``, ``record_plot``, ``csv``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``gradient_check``, ``learning_rate``,
            ``max_norm`` and ``train_steps``.

    Returns:
        The accuracy (0-dim tensor) of the last processed batch.
    """
    assert config.model_type in ('RNN', 'LSTM')

    csv_path = None
    if config.tensorboard:
        writer = SummaryWriter(config.summary +
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    elif config.record_plot:
        csv_path = config.csv
        # Write the header only once so that repeated runs append to one file.
        if not os.path.isfile(csv_path):
            with open(csv_path, 'w', newline='') as f:
                csv.writer(f).writerow(
                    ['model_type', 'step', 'input_length', 'accuracy', 'loss'])

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length,
                           config.input_dim,
                           config.num_hidden,
                           config.num_classes,
                           config.batch_size,
                           config.gradient_check,
                           device=device)
    elif config.model_type == 'LSTM':
        model = LSTM(config.input_length,
                     config.input_dim,
                     config.num_hidden,
                     config.num_classes,
                     config.batch_size,
                     config.gradient_check,
                     device=device)

    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)
    acc_check = []

    # Setup the loss
    criterion = torch.nn.CrossEntropyLoss()

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        out = model.forward(batch_inputs)
        loss = criterion(out, batch_targets)
        loss.backward()

        # Rescale the gradients so that their global norm never exceeds
        # max_norm; this keeps exploding gradients from producing huge
        # parameter updates. ``clip_grad_norm_`` is the in-place,
        # non-deprecated spelling of ``clip_grad_norm``.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        predictions = out.argmax(dim=-1)
        accuracy = (predictions == batch_targets).float().mean()
        acc_check.append(accuracy.detach().cpu().float())

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Model type {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, config.model_type,
                    examples_per_second, accuracy, loss))

        if config.tensorboard:
            writer.add_scalar('training_loss', loss, step)
            writer.add_scalar('accuracy', accuracy, step)
        elif config.record_plot:
            with open(csv_path, 'a', newline='') as f:
                csv.writer(f).writerow([
                    config.model_type, step, config.input_length,
                    accuracy.item(),
                    loss.item()
                ])

        # Early stop once the loss is tiny and more than five steps have run.
        # The previous condition compared tensors to the literal 1.0 with
        # ``is``, which is always False, so it reduced to exactly this test.
        if loss <= 1e-3 and len(acc_check) > 5:
            break

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    if config.tensorboard:
        # Flush pending events to disk.
        writer.close()

    return accuracy
def train(config):
    """Train one model per palindrome length and plot the accuracy reached.

    For every length in a hard-coded sweep a fresh model is trained until one
    of the convergence heuristics fires or ``config.train_steps`` is reached.
    The accuracy on a fixed batch of 4000 samples is recorded per length and
    the resulting curve is saved to ``figs/LSTM_Acc_basic.png`` and shown.

    Args:
        config: Namespace providing ``model_type``, ``device``, ``input_dim``,
            ``num_hidden``, ``num_classes``, ``batch_size``,
            ``learning_rate``, ``max_norm`` and ``train_steps``.
            ``model_type`` and ``input_length`` are overwritten here.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # The sweep is pinned to the LSTM; set this to 'RNN' to rerun it for the
    # vanilla RNN.
    config.model_type = 'LSTM'

    # Initialize the device which to run the model on
    # NOTE(review): the device is only handed to the model constructors;
    # batches are never moved to it -- confirm this is intended.
    device = torch.device(config.device)

    p_len_list = [4, 9, 14, 15, 16, 17, 18, 19, 24, 29, 34, 39, 44, 49]
    p_acc = []

    for in_len in p_len_list:
        print("The Palendrom length is: " + str(in_len+1))
        config.input_length = in_len
        np.random.seed(42)

        if config.model_type == 'RNN':
            model = VanillaRNN(config.input_length, config.input_dim,
                               config.num_hidden, config.num_classes, device)
        elif config.model_type == 'LSTM':
            model = LSTM(config.input_length, config.input_dim,
                         config.num_hidden, config.num_classes, device)

        # Initialize the dataset and data loader (note the +1)
        dataset = PalindromeDataset(config.input_length+1)

        # One large batch (4000 samples) serves as a fixed test set.
        data_loader = DataLoader(dataset, 4000, num_workers=1)
        (test_inputs, test_targets) = next(iter(data_loader))

        data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

        # Setup the loss and optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate)

        # Test accuracies measured so far for this length.
        test_acc = []

        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            model_out = model.forward(batch_inputs)
            loss = criterion(model_out, batch_targets)
            optimizer.zero_grad()
            loss.backward()

            # Cap the global gradient norm at max_norm to prevent exploding
            # gradients (``clip_grad_norm_`` replaces the deprecated
            # ``clip_grad_norm``).
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=config.max_norm)

            optimizer.step()

            if step % 10 == 0:
                model_out = model.forward(test_inputs)
                accuracy = np.average(
                    (torch.max(model_out, 1)[1] == test_targets))

                # Convergence is only tested after a warm-up period that is
                # longer for the longer palindromes.
                if (step > 2500 and in_len < 9) or (step > 4000 and in_len >= 9):
                    if accuracy == 1:
                        print(str(step))
                        print("We have convergence due to 1, accuracy is: " + str(accuracy))
                        p_acc.append(accuracy)
                        break
                    elif not all(x <= accuracy for x in test_acc[-5:]):
                        print(str(step))
                        print("We have convergence due to being worse than last 5, accuracy is: " + str(accuracy) + ". Best is: " + str(max(test_acc)))
                        p_acc.append(max(test_acc))
                        break
                    elif np.var(test_acc[-5:]) < 0.001:
                        print(str(step))
                        print("We have convergence due to variance low, accuracy is: " + str(accuracy) + ". Best is: " + str(max(test_acc)))
                        p_acc.append(max(test_acc))
                        break
                    else:
                        test_acc.append(accuracy)
                else:
                    test_acc.append(accuracy)

            if step == config.train_steps:
                # Out of steps: record the best accuracy seen so that every
                # length contributes exactly one point to the plot. (This
                # used to append to the misspelled name ``p_accc``.)
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                print("We havent converged, but we ran out of time")
                p_acc.append(max(test_acc))
                break

    print('Done training.')
    plt.plot(p_len_list, p_acc)
    plt.title("Accuracy for Different Lengths of Palindrome for Test Set")
    plt.xlabel("Palindrome Length")
    plt.ylabel("Accuracy")
    plt.savefig('figs/LSTM_Acc_basic.png')
    plt.show()
def train(config):
    """Train a VanillaRNN or LSTM and log the progress to a file and a plot.

    Every 100 steps the accuracy and loss are written to a text file inside
    ``MODEL_FOLDER``, added to a ``LossAccPlotter`` that saves into
    ``IMAGES_FOLDER``, and printed.

    Args:
        config: Namespace providing ``model_type``, ``input_length``,
            ``input_dim``, ``num_hidden``, ``num_classes``, ``batch_size``,
            ``optimizer`` (``"adam"`` selects Adam, anything else RMSprop),
            ``learning_rate``, ``max_norm`` and ``train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    os.makedirs(MODEL_FOLDER, exist_ok=True)
    os.makedirs(IMAGES_FOLDER, exist_ok=True)

    filename = config.model_type + '_nods' + '_length_input=' + str(
        config.input_length) + '_optimizer=' + config.optimizer + '_lr=' + str(
            config.learning_rate).replace('.', ',')
    print("Training " + config.model_type + " " + str(config.input_length) +
          " optimizer " + config.optimizer + ' lr ' +
          str(config.learning_rate))

    # The context manager closes the log file even if training raises.
    with open(MODEL_FOLDER + filename, 'w') as f:
        plotter = LossAccPlotter(config.model_type + ' input length ' +
                                 str(config.input_length) + ' optimizer ' +
                                 config.optimizer,
                                 IMAGES_FOLDER + filename,
                                 x_label="Steps",
                                 show_regressions=False)

        # Initialize the device which to run the model on
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Initialize the model that we are going to use
        if config.model_type == 'RNN':
            model = VanillaRNN(config.input_length, config.input_dim,
                               config.num_hidden, config.num_classes,
                               config.batch_size)
        elif config.model_type == 'LSTM':
            model = LSTM(config.input_length, config.input_dim,
                         config.num_hidden, config.num_classes,
                         config.batch_size)
        # The batches are moved to ``device`` below, so the model has to
        # live there as well.
        model.to(device)

        # Initialize the dataset and data loader (note the +1)
        dataset = PalindromeDataset(config.input_length + 1)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

        # Setup the loss and optimizer
        criterion = nn.CrossEntropyLoss()
        if config.optimizer == "adam":
            optimizer = optim.Adam(model.parameters(),
                                   lr=config.learning_rate)
        else:
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=config.learning_rate)

        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Only for time measurement of step through network
            t1 = time.time()

            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            predictions = model(batch_inputs)
            loss = criterion(predictions, batch_targets)
            accuracy = get_accuracy(predictions, batch_targets)

            optimizer.zero_grad()
            loss.backward()
            # Clip only after backward(): this is the point at which the
            # gradients of the current batch exist.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=config.max_norm)
            optimizer.step()

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if step % 100 == 0:
                info = "Train Step {:04d}/{:04d}: Accuracy = {:.2f}, Loss = {:.3f}".format(
                    step, config.train_steps, accuracy, loss)
                f.write(info + '\n')
                # item() yields a plain float on both CPU and CUDA.
                plotter.add_values(step,
                                   loss_train=loss.item(),
                                   acc_train=accuracy,
                                   redraw=False)
                print(
                    "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                    "Accuracy = {:.2f}, Loss = {:.3f}".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                        config.train_steps, config.batch_size,
                        examples_per_second, accuracy, loss))

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

        plotter.redraw(plot=False)

    print('Done training.')
def train(config):
    """Train a VanillaRNN or LSTM and record the accuracy in a CSV file.

    Every ``config.print_every`` steps the batch accuracy is printed and
    appended as a ``step;accuracy`` row to a file under ``results/``. After
    training, the 95th percentile of the recorded accuracies is printed.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``optim`` (``"RMSprop"`` or ``"Adam"``),
            ``learning_rate``, ``max_norm``, ``print_every`` and
            ``train_steps``.

    Raises:
        ValueError: If ``config.optim`` names an unsupported optimizer.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    # FIXME: config.device is parsed but then overridden by auto-detection.
    device = torch.device(config.device)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, device)
    elif config.model_type == 'LSTM':
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device)
    model = model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    if config.optim == "RMSprop":
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate)
    elif config.optim == "Adam":
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate)
    else:
        # Fail early instead of hitting an undefined ``optimizer`` later on.
        raise ValueError("Unsupported optimizer: {!r}".format(config.optim))

    output_file = "results/{}_len{}_{}_batch{}.csv".format(
        config.model_type, config.input_length, config.optim,
        config.batch_size)
    with open(output_file, "w") as f:
        f.write("step;accuracy\n")

    accuracies = []

    # Throughput is averaged over all steps since the previous report.
    t1 = time.time()
    steps_since_report = 0

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Move the batch to the device with the dtypes the model/loss expect.
        x = batch_inputs.to(device=device, dtype=torch.float32)
        y = batch_targets.to(device=device, dtype=torch.long)

        # Forward pass
        predictions = model(x)
        loss = criterion(predictions, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Rescale the gradients so that their global norm stays below
        # max_norm, which guards against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()
        steps_since_report += 1

        if step % config.print_every == 0:

            t2 = time.time()
            examples_per_second = (steps_since_report *
                                   config.batch_size) / float(t2 - t1)

            # Store a plain float so the values can be turned into a NumPy
            # array later, also when training on CUDA.
            accuracy = (torch.sum(predictions.argmax(dim=1) == y).to(
                torch.float32) / len(batch_inputs)).item()
            accuracies.append(accuracy)

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

            with open(output_file, "a") as f:
                f.write("%d;%f\n" % (step, accuracy))

            # Restart the measurement window.
            t1 = time.time()
            steps_since_report = 0

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    if accuracies:
        print(np.percentile(np.array(accuracies), 95))
def train(config):
    """Plot the gradient norms of the stored hidden states for an RNN and an LSTM.

    A single batch is pushed through each freshly initialised model, the loss
    is back-propagated once, and the norm of the gradient of every entry in
    ``model.h_list`` is plotted on a logarithmic y-axis. The figure is saved
    to ``figs/Gradient_Norms.png`` and shown.

    ``config.input_length`` is overwritten with 24.
    """
    assert config.model_type in ('RNN', 'LSTM')
    plt.figure(0)

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    in_len = 24
    print("The Palendrom length is: " + str(in_len + 1))
    config.input_length = in_len
    np.random.seed(42)

    def build(model_cls):
        """Instantiate a model class with the shared constructor arguments."""
        return model_cls(config.input_length, config.input_dim,
                         config.num_hidden, config.num_classes, device)

    def hidden_grad_norms(network):
        """Run one forward/backward pass and return the per-state gradient norms."""
        inputs, targets = next(iter(data_loader))
        outputs = network.forward(inputs)
        rmsprop = torch.optim.RMSprop(network.parameters(),
                                      lr=config.learning_rate)
        batch_loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        rmsprop.zero_grad()
        batch_loss.backward()
        return [torch.norm(h.grad) for h in network.h_list]

    # The RNN is created before the dataset, as in the original ordering.
    rnn = build(VanillaRNN)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    rnn_norms = hidden_grad_norms(rnn)
    plt.plot(np.arange(0, len(rnn_norms)), rnn_norms, label='RNN')

    lstm_norms = hidden_grad_norms(build(LSTM))

    print('Done training.')
    plt.plot(np.arange(0, len(lstm_norms)), lstm_norms, label='LSTM')
    plt.title("Gradient Norms through 50 Time Steps in the RNN and LSTM")
    plt.xlabel("Time Steps")
    plt.ylabel("Gradient Norms")
    plt.yscale('log')
    plt.legend()
    plt.savefig('figs/Gradient_Norms.png')
    plt.show()
def train(config, acc_th=0.99, epsilon=0.01):
    """Train a VanillaRNN or LSTM until the mean loss drops below ``epsilon``.

    Every 50 steps the mean loss and mean accuracy over the last 50 steps are
    recorded and printed. Training stops when ``config.train_steps`` is
    reached or the most recent mean loss is below ``epsilon``.

    Args:
        config: Namespace providing ``learning_rate``, ``input_length``,
            ``model_type``, ``device``, ``batch_size``, ``max_norm`` and
            ``train_steps``; its ``__dict__`` is also forwarded to the model
            constructor as keyword arguments.
        acc_th: Accuracy threshold; currently unused (the matching stop
            condition is disabled).
        epsilon: Mean-loss threshold below which training stops.

    Returns:
        Tuple ``(mean_loss_list, mean_accuracy_list)`` with one entry per
        50-step report.
    """
    learning_rate = config.learning_rate
    seq_length = config.input_length

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use; both classes accept the
    # config entries as keyword arguments.
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length, **config.__dict__)
    else:
        model = LSTM(seq_length, **config.__dict__)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss (cross entropy) and optimizer (RMSprop)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    loss_list = []
    accuracy_list = []
    mean_loss_list = []
    mean_accuracy_list = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        X = batch_inputs.to(device)
        y = batch_targets.to(device)

        if config.model_type == 'LSTM':
            if X.dim() != len(model.X_dimensions):
                # Append a trailing feature axis of size 1.
                X = X.view(X.size()[0], X.size()[1], 1)
                # NOTE(review): extra forward pass kept from the original;
                # its result is discarded -- confirm whether it is needed.
                model.forward(X)

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        outputs = model.forward(X)
        loss_current = criterion(outputs, y)
        # NOTE(review): retain_graph=True is kept from the original;
        # presumably the model reuses graph state across steps -- confirm.
        loss_current.backward(retain_graph=True)

        # Rescale the gradients so their global norm does not exceed
        # max_norm. This has to happen before optimizer.step(); clipping
        # after the update (as before) has no effect on training.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        loss = loss_current.detach().item()
        # Plain floats keep np.mean below working on any device.
        accuracy = ((outputs.argmax(dim=1) == y.long()).sum().float() /
                    float(y.shape[0])).item()
        loss_list.append(loss)
        accuracy_list.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 50 == 0:
            mean_loss_list.append(np.mean(loss_list[-50:]))
            mean_accuracy_list.append(np.mean(accuracy_list[-50:]))

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step == config.train_steps or mean_loss_list[-1] < epsilon:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            print(mean_loss_list[-1])
            print(mean_accuracy_list[-1])
            break

    print('Done training.')
    return mean_loss_list, mean_accuracy_list
def train(config):
    """Train ``config.avg_over`` fresh models and print their mean test accuracy.

    Each run trains a new VanillaRNN or LSTM and then evaluates it on one
    batch of 10000 samples. Training progress goes to stdout when
    ``config.train_log`` is ``"STDOUT"`` and to that file otherwise.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``linear``, ``learning_rate``, ``max_norm``,
            ``train_steps``, ``train_log`` and ``avg_over``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)
    test_loader = DataLoader(dataset, 10000, num_workers=1)

    outfile = None
    if config.train_log != "STDOUT":
        outfile = open(config.train_log, 'w')

    # try/finally makes sure the log file is closed even if training fails.
    try:
        criterion = nn.CrossEntropyLoss()
        accuracy_avg = 0

        for i in range(config.avg_over):
            # Initialize the model that we are going to use
            if config.model_type == 'RNN':
                model_cls = VanillaRNN
            else:
                model_cls = LSTM
            model = model_cls(config.input_length,
                              config.input_dim,
                              config.num_hidden,
                              config.num_classes,
                              device,
                              linear=config.linear)
            optimizer = optim.RMSprop(model.parameters(),
                                      config.learning_rate)
            model.to(device)

            for step, (batch_inputs,
                       batch_targets) in enumerate(data_loader):

                # Only for time measurement of step through network
                t1 = time.time()

                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)

                optimizer.zero_grad()
                pred = model.forward(batch_inputs)
                loss = criterion(pred, batch_targets)
                accuracy = acc(
                    pred,
                    F.one_hot(batch_targets,
                              num_classes=config.num_classes).float(),
                    config.num_classes)
                loss.backward()

                # Gradient clipping: repeatedly applying the same operation
                # in a deep computational graph can make the gradients very
                # large, so an update may overshoot into a region where the
                # loss rises again. Limiting the gradient norm bounds the
                # step size and keeps the optimisation well-behaved.
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=config.max_norm)

                optimizer.step()

                # Just for time measurement
                t2 = time.time()
                examples_per_second = config.batch_size / float(t2 - t1)

                if step % 10 == 0:
                    message = (
                        "[{}] Averaging Step: {} Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"), i,
                            step, config.train_steps, config.batch_size,
                            examples_per_second, accuracy, loss))
                    if outfile is not None:
                        outfile.write(message + "\n")
                    else:
                        print(message)

                if step == config.train_steps:
                    # If you receive a PyTorch data-loader error, check this bug report:
                    # https://github.com/pytorch/pytorch/pull/9655
                    break

            # Evaluate the trained model on one large test batch.
            test_inputs, test_targets = next(iter(test_loader))
            test_inputs = test_inputs.to(device)
            test_targets = test_targets.to(device)

            with torch.no_grad():
                pred = model.forward(test_inputs)
                accuracy = acc(
                    pred,
                    F.one_hot(test_targets,
                              num_classes=config.num_classes).float(),
                    config.num_classes)
                accuracy_avg += accuracy

        print(accuracy_avg / config.avg_over, end='')
    finally:
        if outfile is not None:
            outfile.close()
def train(config):
    """Train a VanillaRNN or LSTM on one-hot palindromes until the loss settles.

    After a minimum number of steps (depending on the model type and the
    sequence length) training stops once the current loss differs from the
    mean of the 100 losses before the last two by less than ``5e-4``, or when
    ``config.train_steps`` is reached.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.

    Returns:
        The trained model.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize params for models
    seq_length = config.input_length
    input_dim = config.input_dim
    num_hidden = config.num_hidden
    num_classes = config.num_classes

    print(seq_length, input_dim, num_classes, num_hidden)

    # Testing for convergence
    epsilon = 5e-4
    # Minimal number of steps the model definitely trains; the LSTM trains
    # slower and therefore needs more iterations.
    if seq_length < 30:
        if config.model_type == 'RNN':
            min_steps = 3000 if seq_length > 15 else 1000
        else:
            min_steps = 5000 if seq_length > 15 else 1500
    else:
        min_steps = 6500

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length, input_dim, num_hidden, num_classes,
                           device)
    else:
        model = LSTM(seq_length, input_dim, num_hidden, num_classes, device)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # Train losses and accuracies for debugging purposes
    accuracies, losses = [], []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Convert the inputs to a one-hot representation along a new last
        # axis of size num_classes.
        batch_inputs = torch.scatter(
            torch.zeros(*batch_inputs.size(), num_classes), 2,
            batch_inputs[..., None].to(torch.int64), 1).to(device)
        batch_targets = batch_targets.to(device)

        train_output = model.forward(batch_inputs)
        loss = criterion(train_output, batch_targets)

        optimizer.zero_grad()
        loss.backward()

        # Clip exploding gradients. This must sit between backward() and
        # step(): clipping before backward() only touches stale gradients
        # that zero_grad() discards right afterwards.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        accuracy = torch.sum(
            torch.eq(torch.argmax(train_output, dim=1),
                     batch_targets)).item() / train_output.size(0)
        accuracies.append(accuracy)
        losses.append(loss.item())

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 100 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step > min_steps and (np.absolute(
                np.mean(losses[-102:-2]) - losses[-1]) < epsilon):
            print("Convergence reached after {} steps".format(step))
            break

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    return model
def train(config):
    """Train a VanillaRNN or LSTM and optionally plot accuracy and loss.

    ``config.measure_type`` selects what is recorded:

    * ``0`` -- accuracy and loss of every step,
    * ``2`` -- every 10 steps, the average over the steps since the last
      record,
    * anything else -- every 10 steps, the values of that step.

    Unless ``config.overview_length`` is truthy, the recorded curves are
    plotted afterwards.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm``, ``train_steps``,
            ``measure_type`` and ``overview_length``.

    Returns:
        Tuple ``(list_train_acc, list_train_loss)`` of the recorded values.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the model that we are going to use
    if config.model_type == "RNN":
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, config.device)
    else:
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, config.device)

    # The model decides which device is actually used.
    device = model.device
    model = model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    list_train_acc = []
    list_train_loss = []
    # Steps at which a value was recorded; used as the x-axis of the plots so
    # that it always has the same length as the recorded values.
    eval_steps = []
    acc_average = []
    loss_average = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        output = model(batch_inputs)
        loss = criterion(output, batch_targets)

        model.zero_grad()
        loss.backward()
        # Keep only the value; holding on to the tensor would keep the
        # graph in memory.
        loss = loss.item()

        # Rescale the gradients so that their global norm does not exceed
        # max_norm. This limits the update size and handles exploding
        # gradients, which can get amplified with every time step.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        number_predictions = torch.argmax(output, dim=1)
        result = number_predictions == batch_targets
        accuracy = result.sum().item() / len(batch_targets)

        if config.measure_type == 2:
            acc_average.append(accuracy)
            loss_average.append(loss)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            if config.measure_type == 2:
                # Average over the values actually collected: the window
                # holds a single value at step 0 and ten afterwards.
                accuracy = sum(acc_average) / len(acc_average)
                loss = sum(loss_average) / len(loss_average)
                acc_average = []
                loss_average = []

            list_train_acc.append(accuracy)
            list_train_loss.append(loss)
            eval_steps.append(step)

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        elif config.measure_type == 0:
            # Track every step, also those between the 10-step reports.
            list_train_acc.append(accuracy)
            list_train_loss.append(loss)
            eval_steps.append(step)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    if not config.overview_length:
        print(len(list_train_acc))
        plt.plot(eval_steps, list_train_acc, label="Train accuracy")
        plt.xlabel("Step")
        plt.ylabel("Accuracy")
        plt.title("Training accuracies", fontsize=18, fontweight="bold")
        plt.legend()
        plt.show()

        plt.plot(eval_steps, list_train_loss, label="Train loss")
        plt.xlabel("Step")
        plt.ylabel("Loss")
        plt.title("Train loss", fontsize=18, fontweight="bold")
        plt.legend()
        plt.show()

    return (list_train_acc, list_train_loss)
def run(model_type, input_length, input_dim, num_classes, num_hidden,
        batch_size, learning_rate, train_steps, max_norm, device):
    """Train an RNN or LSTM on the palindrome task and return the last accuracy.

    Args:
        model_type: Either 'RNN' or 'LSTM'.
        input_length: Sequence length fed to the model; the dataset is built
            with ``input_length + 1``.
        input_dim, num_classes, num_hidden, batch_size: Forwarded to the
            model constructor.
        learning_rate: Learning rate for RMSprop.
        train_steps: Index of the step after which training stops.
        max_norm: Maximum total gradient norm allowed before the update.
        device: Anything accepted by ``torch.device``.

    Returns:
        Accuracy of the last processed batch as a Python float.
    """
    assert model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(device)

    # Initialize the model that we are going to use
    model_pars = [
        input_length, input_dim, num_hidden, num_classes, batch_size, device
    ]
    model_cls = LSTM if model_type == 'LSTM' else VanillaRNN
    model = model_cls(*model_pars)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(input_length + 1)
    data_loader = DataLoader(dataset, batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # The model lives on `device`, so the batch has to follow it.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        ys = model.forward(batch_inputs)
        loss = criterion(ys, batch_targets)
        loss.backward()

        # Rescale the gradients so their total norm is at most max_norm.
        # This must run after backward() (gradients exist) and before
        # step() (they are consumed); anywhere else it has no effect.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
        optimizer.step()

        predictions = ys.argmax(dim=-1)
        accuracy = (batch_targets == predictions).float().mean()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = batch_size / float(t2 - t1)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    train_steps, batch_size, examples_per_second,
                    accuracy, loss))

        if step == train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    return accuracy.item()
def train(config):
    """Train a model, pickling the config and the learning curves to disk.

    A run directory ``config.summary_path + config_to_str(config) + '/'`` is
    created; ``config.pkl`` is written before training and
    ``learning_curves.pkl`` (a ``(losses, accuracies)`` tuple sampled every
    10 steps) after the data loader is exhausted.

    Args:
        config: Namespace providing ``model_type``, ``summary_path``,
            ``device``, ``input_length``, ``input_dim``, ``num_hidden``,
            ``num_classes``, ``batch_size``, ``predict_half``,
            ``train_steps``, ``learning_rate`` and ``max_norm``.

    Raises:
        FileExistsError: If the run directory already exists.
    """
    assert config.model_type in ('RNN', 'LSTM')

    model_dir = config.summary_path + config_to_str(config) + '/'
    # No exist_ok on purpose: an existing run directory raises instead of
    # being reused.
    os.makedirs(model_dir)
    with open(model_dir + 'config.pkl', 'wb+') as fd:
        pickle.dump(config, fd)

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, config.device, config.predict_half)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1, config.batch_size,
                                config.train_steps)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer. The criterion follows the configured
    # device instead of being sent to CUDA unconditionally.
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

    losses = []
    accuracies = []
    half = config.input_length // 2

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # Plain tensors carry autograd state themselves; the Variable
        # wrapper is deprecated and not needed.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        batch_output = model.forward(batch_inputs)

        if config.predict_half:
            # MANY: the target sequence is the remaining input digits
            # followed by the final target digit.
            long_target = torch.cat(
                (batch_inputs[:, half + 1:],
                 batch_targets.unsqueeze(-1).float()), 1)
            long_predictions = batch_output[:, half:]
            loss = criterion(
                long_predictions.contiguous().view(
                    -1, long_predictions.size()[-1]),
                long_target.contiguous().view(-1).long())
            n_correct = torch.sum(
                long_predictions.argmax(2) == long_target.long())
            accuracy = float(n_correct) / (
                long_predictions.shape[0] * long_predictions.shape[1])
        else:
            # ONE: only the final digit is predicted.
            predictions = batch_output
            loss = criterion(predictions, batch_targets)
            accuracy = float(
                torch.sum(predictions.argmax(1) == batch_targets)
            ) / predictions.shape[0]

        loss.backward()
        # Cap the total gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            losses.append(loss.item())
            accuracies.append(accuracy)

        if step % 200 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size,
                      examples_per_second, accuracy, loss.item()))

    with open(model_dir + 'learning_curves.pkl', 'wb+') as fd:
        pickle.dump((losses, accuracies), fd)

    print('Done training.')
def train(config):
    """Sweep sequence lengths and record final accuracy, loss and step count.

    For every length ``T`` in ``range(config.min_len, config.max_len)`` four
    fresh models are trained with Adam. Each run stops at
    ``config.train_steps`` or at the first logged step (multiple of 10) whose
    batch accuracy equals 1. The final values of each run are collected per
    ``T``, printed, and written to ``lstm_xavier_<min_len>_<max_len>.txt``.
    ``config.input_length`` is overwritten with the current ``T``.
    """
    # set variables
    T_options = list(range(config.min_len, config.max_len))
    print("model:", config.model_type)
    print("min input length:", config.min_len)
    print("max input length:", config.max_len)

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def acc(predictions, targets):
        hits = predictions.argmax(dim=1) == targets
        return hits.float().mean().item()

    all_accuracies, all_losses, all_train_steps = [], [], []

    for T in T_options:
        accuracies = np.array([])
        losses = np.array([])
        final_accs, final_losses, final_train_steps = [], [], []
        config.input_length = T

        for i in range(4):
            print("Iteration", i, "with T:", T, "learning rate:",
                  config.learning_rate)

            # Initialize the dataset and data loader (note the +1)
            dataset = PalindromeDataset(config.input_length + 1)
            data_loader = DataLoader(dataset, config.batch_size,
                                     num_workers=1)

            # Initialize the model that we are going to use
            model_args = (config.input_length, config.input_dim,
                          config.num_hidden, config.num_classes, device)
            if config.model_type == "RNN":
                model = VanillaRNN(*model_args)
            elif config.model_type == "LSTM":
                model = LSTM(*model_args)
            model.to(device)

            # Setup the loss and optimizer
            criterion = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=config.learning_rate)

            for step, (batch_inputs, batch_targets) in enumerate(data_loader):
                # Only for time measurement of step through network
                t1 = time.time()

                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)

                p = model.forward(batch_inputs)
                loss = criterion(p, batch_targets)
                accuracy = acc(p, batch_targets)

                optimizer.zero_grad()
                loss.backward()
                # The gradients are clipped here to prevent exploding
                # gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=config.max_norm)
                optimizer.step()

                # Just for time measurement
                t2 = time.time()
                examples_per_second = config.batch_size / float(t2 - t1)

                on_log_step = step % 10 == 0
                if on_log_step:
                    print(
                        "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                            config.train_steps, config.batch_size,
                            examples_per_second, accuracy, loss))
                    accuracies = np.append(accuracies, accuracy)
                    losses = np.append(losses, loss.item())

                if step == config.train_steps or (on_log_step
                                                  and accuracy == 1):
                    # If you receive a PyTorch data-loader error, check this bug report:
                    # https://github.com/pytorch/pytorch/pull/9655
                    final_accs.append(accuracy)
                    final_losses.append(loss.item())
                    final_train_steps.append(step)
                    break

        all_accuracies.append(final_accs)
        all_losses.append(final_losses)
        all_train_steps.append(final_train_steps)

    print('Done training.')
    for collected in (all_accuracies, all_losses, all_train_steps):
        print(collected)

    report = [
        "accuracies \n", str(all_accuracies) + "\n",
        "losses \n", str(all_losses) + "\n",
        "train steps \n", str(all_train_steps) + "\n",
    ]
    with open("lstm_xavier_{}_{}.txt".format(config.min_len, config.max_len),
              "w") as output:
        for chunk in report:
            output.write(chunk)
class PalindromeExperiment(PytorchExperiment):
    """Experiment that trains an RNN or LSTM on the palindrome dataset."""

    def setup(self):
        """Build the model, data loader, loss and optimizer from the config."""
        self.save_checkpoint(name='setup')
        (model_type, input_length, input_dim, num_classes, num_hidden,
         batch_size, learning_rate, train_steps, max_norm,
         wanted_device) = itemgetter(*flags)(vars(self.config))
        assert model_type in ('RNN', 'LSTM')

        # Initialize the device which to run the model on
        # TODO: debug CUDA issues
        device = torch.device(wanted_device)

        # Initialize the model that we are going to use
        model_pars = [input_length, input_dim, num_hidden, num_classes,
                      batch_size, device]
        model_cls = LSTM if model_type == 'LSTM' else VanillaRNN
        self.model = model_cls(*model_pars)
        self.model.to(device)

        # Initialize the dataset and data loader (note the +1)
        dataset = PalindromeDataset(input_length + 1)
        self.data_loader = DataLoader(dataset, batch_size, num_workers=1)

        # Setup the loss and optimizer
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.RMSprop(self.model.parameters(),
                                             lr=learning_rate)
        # TODO: plot accuracy over input_length
        # TODO: increase learning_rate over input_length
        # TODO: compare result with RNN

    def train(self, epoch):
        """Run the training loop until ``train_steps`` is reached.

        Loss and accuracy of every step are buffered in ``results`` and
        handed to ``write_csv`` every 100 steps and once more at the end.
        Every 100 steps the metrics are also sent to the SummaryWriter, a
        log line is printed and a checkpoint is saved.

        Args:
            epoch: Unused by this method.
        """
        (model_type, input_length, input_dim, num_classes, num_hidden,
         batch_size, learning_rate, train_steps, max_norm,
         device) = itemgetter(*flags)(vars(self.config))

        with SummaryWriter('part1/train') as w:
            results = []
            for step, (batch_inputs, batch_targets) in enumerate(self.data_loader):
                # Only for time measurement of step through network
                t1 = time.time()

                self.optimizer.zero_grad()

                # Move to device. `.to` converts dtype and device in one go
                # without re-wrapping an existing tensor via torch.tensor().
                inputs = batch_inputs.to(device=device, dtype=torch.float)
                targets = batch_targets.to(device=device, dtype=torch.long)

                ys = self.model.forward(inputs)
                loss = self.criterion(ys, targets)
                loss.backward()

                # Clip the gradients so gradient explosion won't let us
                # overshoot the minimum. Clipping only has an effect between
                # backward() and step().
                # https://www.quora.com/What-is-gradient-clipping-and-why-is-it-necessary
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               max_norm=max_norm)
                self.optimizer.step()

                predictions = ys.argmax(dim=-1)
                accuracy = (targets == predictions).float().mean()

                # Just for time measurement
                t2 = time.time()
                examples_per_second = batch_size / float(t2 - t1)

                stats = {'loss': loss, 'accuracy': accuracy}
                results.append(
                    {'step': step, **{k: v.item() for k, v in stats.items()}})

                if step % 100 == 0:
                    w.add_scalars('metrics', stats, int(step / 10))
                    # TODO: check why self.add_result(...) per metric is slow!
                    self.elog.print(
                        "elog [{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                            train_steps, batch_size, examples_per_second,
                            accuracy, loss))
                    self.save_checkpoint(name='train', n_iter=step)
                    # write_csv's return value replaces the buffer.
                    results = write_csv(results, self.config)

                if step == train_steps:
                    # If you receive a PyTorch data-loader error, check this bug report:
                    # https://github.com/pytorch/pytorch/pull/9655
                    break

        print('Done training.')
        results = write_csv(results, self.config)

    def validate(self, epoch):
        """No validation is performed for this experiment."""
        pass
def grads_over_time(config):
    """Return the gradient norm of the loss w.r.t. each recorded hidden state.

    One sample (``dataset[0]``) is pushed through the model, the loss is
    back-propagated, and the L2 norm of ``.grad`` of every tensor in
    ``model.grad_hidden_list`` is collected in order.

    Args:
        config: Namespace providing ``model_type``, ``tensorboard``,
            ``summary``, ``record_plot``, ``csv``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``,
            ``num_classes``, ``batch_size``, ``gradient_check``,
            ``learning_rate`` and ``max_norm``.

    Returns:
        list[float]: One gradient norm per entry of ``model.grad_hidden_list``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    if config.tensorboard:
        # NOTE(review): created but not used further in this function.
        writer = SummaryWriter(config.summary +
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    elif config.record_plot:
        csv_path = config.csv
        if not os.path.isfile(csv_path):
            # Write the header once; the context manager closes the file
            # even if writing fails.
            with open(csv_path, 'w', newline='') as f:
                csv.writer(f).writerow(
                    ['model_type', 'step', 'input_length', 'accuracy', 'loss'])

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, config.gradient_check, device=device)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)
    model.to(device)

    # Initialize the dataset (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)

    # Setup the loss
    criterion = torch.nn.CrossEntropyLoss()

    # Build a batch of size one from the first sample.
    batch_inputs, batch_targets = dataset[0]
    batch_inputs = torch.from_numpy(batch_inputs).unsqueeze(0).to(device)
    batch_targets = torch.from_numpy(np.array([batch_targets])).to(device)

    out = model.forward(batch_inputs)
    loss = criterion(out, batch_targets)
    loss.backward()

    # Cap the total norm of the parameter gradients (in-place variant; the
    # un-suffixed clip_grad_norm is deprecated).
    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                   max_norm=config.max_norm)

    # NOTE(review): the parameter gradients are cleared before step(), so the
    # step is effectively a no-op. Kept in the original order because only
    # the hidden-state gradients below are of interest here.
    optimizer.zero_grad()
    optimizer.step()

    return [
        torch.norm(hidden_grad.grad, p=2).item()
        for hidden_grad in model.grad_hidden_list
    ]
def train(config):
    """Train an RNN or LSTM on the palindrome task, logging every 10 steps.

    Args:
        config: Namespace providing ``model_type``, ``input_length``,
            ``input_dim``, ``num_hidden``, ``num_classes``, ``batch_size``,
            ``learning_rate``, ``max_norm`` and ``train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize the model that we are going to use. Strings are compared
    # with ==; `is` tests object identity and is not reliable for str.
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(seq_length=config.input_length,
                      input_dim=config.input_dim,
                      num_hidden=config.num_hidden,
                      num_classes=config.num_classes,
                      batch_size=config.batch_size,
                      device=device)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), config.learning_rate)

    Accuracy = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Without this, backward() keeps adding to the gradients of all
        # previous steps.
        optimizer.zero_grad()

        y = model.forward(batch_inputs)
        loss = criterion(y, batch_targets)
        loss.backward()

        # Limits the size of the parameter updates by scaling the gradients
        # down. Must sit after loss.backward() and before optimizer.step().
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        loss = loss.item()
        hits = (np.argmax(y.cpu().detach().numpy(), axis=1)
                == batch_targets.cpu().detach().numpy())
        accuracy = np.sum(hits) / batch_targets.shape[0]
        Accuracy.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training. :)')
def train(config):
    """Plot per-time-step hidden-state gradient norms for an LSTM and an RNN.

    Both models take a single RMSprop step on the first batch of the data
    loader; afterwards the norm of ``h.grad`` for every entry of each model's
    ``all_h`` is plotted against positions ``1..config.input_length``. The
    loop stops after that first batch.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def acc(predictions, targets):
        matches = predictions.argmax(dim=1) == targets
        return matches.float().mean().item()

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss, the two models and their optimizers
    criterion = torch.nn.CrossEntropyLoss()
    lstm = LSTM(config.input_length, config.input_dim, config.num_hidden,
                config.num_classes)
    rnn = VanillaRNN(config.input_length, config.input_dim,
                     config.num_hidden, config.num_classes, device)
    optimizer_lstm = torch.optim.RMSprop(lstm.parameters(),
                                         lr=config.learning_rate)
    optimizer_rnn = torch.optim.RMSprop(rnn.parameters(),
                                        lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        print("step", step)

        # One update for the LSTM ...
        lstm_out = lstm.forward(batch_inputs)
        optimizer_lstm.zero_grad()
        loss_lstm = criterion(lstm_out, batch_targets)
        loss_lstm.backward()
        optimizer_lstm.step()

        # ... and one for the vanilla RNN.
        rnn_out = rnn.forward(batch_inputs)
        optimizer_rnn.zero_grad()
        loss_rnn = criterion(rnn_out, batch_targets)
        loss_rnn.backward()
        optimizer_rnn.step()

        # Gradient norm of every stored hidden state, in stored order.
        lstm_norms = [hidden.grad.norm().item() for hidden in lstm.all_h]
        rnn_norms = [hidden.grad.norm().item() for hidden in rnn.all_h]

        sequence = list(range(1, config.input_length + 1))
        plt.figure(figsize=(15, 6))
        for norms, tag in ((rnn_norms, "rnn"), (lstm_norms, "lstm")):
            plt.plot(sequence, norms, label=tag)
        plt.legend()
        plt.xlabel("sequence value")
        plt.ylabel("gradient norm")
        plt.show()

        # Only the first batch is needed for the plot.
        break

    print('Done training.')
def train(config):
    """Train until the loss drops to 0.001 or ``train_steps`` is reached.

    Args:
        config: Namespace providing ``model_type``, ``input_length``,
            ``input_dim``, ``num_hidden``, ``num_classes``, ``batch_size``,
            ``learning_rate``, ``max_norm`` and ``train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize the model that we are going to use
    model_cls = VanillaRNN if config.model_type == "RNN" else LSTM
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=0)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # `.to(device)` is a no-op on CPU, so no CUDA special-casing is needed.
    criterion.to(device)
    model.to(device)

    losses = []
    accuracies = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # The optimizer holds all model parameters, so one zero_grad suffices.
        optimizer.zero_grad()

        batch_inputs = batch_inputs.to(device=device, dtype=torch.float)
        batch_targets = batch_targets.to(device=device, dtype=torch.long)

        batch_predicted = model(batch_inputs)
        loss = criterion(batch_predicted, batch_targets)
        loss.backward()

        # Rescale gradients whose total norm exceeds max_norm so a single
        # update cannot blow up the parameters.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        loss = loss.item()
        correct_predicted = torch.sum(
            torch.argmax(batch_predicted, dim=1) == batch_targets).item()
        # Divide by the real batch size; a batch may be shorter than
        # config.batch_size.
        accuracy = float(correct_predicted) / batch_targets.size(0)

        losses.append(loss)
        accuracies.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 100 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.4f}, Loss = {:.4f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if loss <= 0.001:
            print('Model converged in {:04d} steps at loss {}'.format(
                step, loss))
            break

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
def train(config):
    """Train an RNN or LSTM, logging the accuracy of every step to ``writer``.

    ``writer`` is not defined in this function and is expected to exist at
    module level.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, device)
    else:
        model = LSTM(config.input_length, config.input_dim,
                     config.num_hidden, config.num_classes,
                     config.batch_size, device=device)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = CrossEntropyLoss()
    optimizer = RMSprop(model.parameters(), lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # The batch has to live on the same device as the model.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        prob = model.forward(batch_inputs)
        loss = criterion(prob, batch_targets)
        loss.backward()

        # Gradient clipping guards against *exploding* gradients by
        # rescaling them to a total norm of at most max_norm. It only has an
        # effect after backward() and before step().
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        # Divide by the real batch size; a batch may be shorter than
        # config.batch_size.
        accuracy = float(
            torch.sum(prob.argmax(dim=1) == batch_targets)
        ) / batch_targets.size(0)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        writer.add_scalar('Train/Accuracy', accuracy, step)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
def train(config):
    """Train an RNN or LSTM and print the best batch accuracy observed.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print('Running on {}'.format(device))

    # Initialize the model that we are going to use. Strings are compared
    # with ==; `is` tests identity, so a non-interned 'RNN' would silently
    # select the LSTM.
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, device)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    optimizer = torch.optim.RMSprop(model.parameters(), config.learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    accuracies = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        y_pred_batch = model(batch_inputs)
        loss = criterion(y_pred_batch, batch_targets)
        loss.backward()

        # Rescale the gradients to a total norm of at most max_norm before
        # they are applied (in-place variant; clip_grad_norm is deprecated).
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()
        # Clear the gradients for the next iteration.
        optimizer.zero_grad()

        loss = loss.item()
        predicted = np.argmax(y_pred_batch.cpu().detach().numpy(), axis=1)
        accuracy = np.sum(
            predicted == batch_targets.cpu().detach().numpy()
        ) / batch_targets.shape[0]

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        accuracies.append(accuracy)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    print("Length {} max. accuracy: {}".format(config.input_length,
                                               max(accuracies)))
def train(config):
    """Train an RNN or LSTM, then report accuracy on a fresh batch of 1000.

    Args:
        config: Namespace providing ``model_type``, ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    model = None
    if config.model_type == 'LSTM':
        model = LSTM(
            config.input_length,
            config.input_dim,
            config.num_hidden,
            config.num_classes,
            config.device,
        )
    elif config.model_type == 'RNN':
        model = VanillaRNN(
            config.input_length,
            config.input_dim,
            config.num_hidden,
            config.num_classes,
            config.device,
        )
    else:
        # Only reachable when assertions are disabled.
        print('Your model type input is neither \'RNN\' or \'LSTM\'')
        return

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        output = model.forward(batch_inputs)
        loss = criterion(output, batch_targets)
        loss.backward()

        # This 'clips' the norm of the gradients by scaling them all down by
        # the same factor until the norm is at most max_norm. In practice
        # this places a limit on the size of the parameter updates.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        with torch.no_grad():
            # The predicted class is the arg-max over the class dimension.
            # Softmax is monotonic per row, so it is not needed here (and a
            # softmax over dim=0 would mix scores across the batch).
            pred = output.argmax(dim=1)
            accuracy = (pred == batch_targets).float().mean().item()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    # Test memory capacity on one batch of 1000 newly drawn samples.
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, 1000, num_workers=1)
    batch_inputs, batch_targets = next(iter(data_loader))
    batch_inputs = batch_inputs.to(device)
    batch_targets = batch_targets.to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        output = model.forward(batch_inputs)
        pred = output.argmax(dim=1)
        accuracy = (pred == batch_targets).float().mean().item()

    print('T:', config.input_length + 1)
    print("Final accuracy:", accuracy)
def train(config):
    """Train an RNN or LSTM with TensorBoard logging.

    Scalars ``Accuracy`` and ``Loss`` are written every 10 steps to a
    SummaryWriter under ``runs/...``; the accuracy of the final step is
    appended to ``logs.txt``.

    Args:
        config: Namespace providing ``model_type``, ``batch_size``,
            ``input_dim`` (1 for raw digits, 10 for one-hot), ``input_length``,
            ``device``, ``num_hidden``, ``num_classes``, ``learning_rate``,
            ``max_norm`` and ``train_steps``.

    Raises:
        ValueError: If ``model_type`` is neither 'RNN' nor 'LSTM' (only
            reachable when assertions are disabled).
    """
    assert config.model_type in ('RNN', 'LSTM')

    exp_name = 'runs/{}_batch{}_dim{}_len{}_{}'.format(
        config.model_type, config.batch_size, config.input_dim,
        config.input_length, datetime.now().strftime("%Y-%m-%d %H:%M"))
    print(exp_name)
    print(config)
    writer = SummaryWriter(exp_name)

    def to_label(tensor):
        """Convert class scores to class indices (arg-max over last dim)."""
        _, tensor = tensor.max(-1)
        return tensor

    def get_accuracy(predictions, targets):
        """Return the fraction of predictions matching the targets."""
        correct = (to_label(predictions) == targets).type(torch.FloatTensor)
        return (correct.sum() / correct.shape[0]).item()

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model_cls = VanillaRNN
    elif config.model_type == 'LSTM':
        model_cls = LSTM
    else:
        raise ValueError
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, device)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        model.train()
        optimizer.zero_grad()

        batch_targets = batch_targets.to(device)
        if config.input_dim == 1:
            batch_inputs = batch_inputs.unsqueeze(-1).to(device)
        elif config.input_dim == 10:
            # One-hot encode the digits. The buffer is sized from the actual
            # batch so a batch shorter than config.batch_size still lines up
            # with its targets.
            temp = batch_inputs.type(torch.LongTensor).to(device)
            batch_inputs = torch.zeros(temp.size(0), config.input_length,
                                       config.input_dim).to(device)
            batch_inputs.scatter_(2, temp.unsqueeze(-1), 1)

        batch_outputs = model(batch_inputs)
        loss = criterion(batch_outputs, batch_targets)
        accuracy = get_accuracy(batch_outputs, batch_targets)
        loss.backward()

        # The gradients are clipped up to a certain threshold value (of the
        # vector norm) to avoid the exploding gradient problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        # Print and save data to Tensorboard
        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))
            writer.add_scalar('Accuracy', accuracy, step)
            writer.add_scalar('Loss', loss.item(), step)

        # Output final scores
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            with open("logs.txt", "a") as f:
                f.write("{} Accuracy: {}\n".format(exp_name, accuracy))
            break

    # Flush pending events and release the event file.
    writer.close()
    print('Done training.')
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome dataset until convergence.

    Reads ``model_type``, ``input_length``, ``input_dim``, ``num_hidden``,
    ``num_classes``, ``batch_size``, ``learning_rate``, ``max_norm`` and
    ``train_steps`` from ``config``.  Runs on CUDA when available, otherwise
    on the CPU.

    Training stops when ``config.train_steps`` is reached, or — after more
    than 10 steps — when the mean of the last 5 batch accuracies is at least
    0.98.

    Returns:
        tuple: ``(accuracy, loss)`` of the last processed batch, where
        ``accuracy`` is whatever ``calculate_accuracy`` returned and ``loss``
        is a Python float.

    Raises:
        ValueError: if ``config.model_type`` is neither ``'RNN'`` nor
            ``'LSTM'`` (only reachable when asserts are disabled).
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Constructor arguments shared by both model types
    model_kwargs = dict(seq_length=config.input_length,
                        input_dim=config.input_dim,
                        num_hidden=config.num_hidden,
                        num_classes=config.num_classes,
                        batch_size=config.batch_size,
                        device=device)

    # Initialize the model and its optimizer (the LSTM additionally uses
    # momentum and weight decay)
    if config.model_type == "RNN":
        model = VanillaRNN(**model_kwargs)
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate)
    elif config.model_type == "LSTM":
        model = LSTM(**model_kwargs)
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate,
                                        momentum=0.8,
                                        weight_decay=1e-4)
    else:
        raise ValueError(
            'Unknown model type: {}'.format(config.model_type))
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss (the optimizer is created above)
    criterion = torch.nn.CrossEntropyLoss()

    # for intermediate reporting and convergence checks
    intermediate_accuracies = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Call the module rather than .forward() so registered hooks run.
        out = model(batch_inputs)
        loss = criterion(out, batch_targets)

        optimizer.zero_grad()
        loss.backward()

        ######################################################################
        # QUESTION: what happens here and why?
        # ANSWER: the gradients are clipped / rescaled to a max norm, as
        # explained in slide 50 of lecture 6
        ######################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        accuracy = calculate_accuracy(out, batch_targets)
        intermediate_accuracies.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10000 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss.item()))

        # Stop on the step budget regardless of how small it is; previously
        # this check was nested under ``step > 10`` and so never fired for
        # train_steps <= 10.
        if step >= config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

        # Check for convergence: if the mean of the last 5 measured
        # accuracies is at least .98, we say it converged.  The slice is
        # [-5:] (not [-5:-1]) so that the newest accuracy is included and
        # exactly five values are averaged.
        if step > 10 and np.mean(intermediate_accuracies[-5:]) >= 0.98:
            break

    print('Done training.')
    print('finally accuracy:')
    print(accuracy)
    return accuracy, loss.item()
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome dataset and pickle accuracies.

    Reads ``model_type``, ``input_dim``, ``input_length``, ``num_hidden``,
    ``num_classes``, ``batch_size``, ``device``, ``learning_rate``,
    ``max_norm`` and ``train_steps`` from ``config``.

    The per-step batch accuracies are collected and, once training stops at
    ``step == config.train_steps``, dumped with ``pickle`` to
    ``<model_type>_<input_length>.pkl`` in the working directory.
    """
    assert config.model_type in ('RNN', 'LSTM')
    assert config.input_dim in (1, 10)

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    accuracies = []
    filename = f'{config.model_type}_{config.input_length}.pkl'

    # Initialize the model that we are going to use
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(seq_length=config.input_length,
                      input_dim=config.input_dim,
                      num_hidden=config.num_hidden,
                      num_classes=config.num_classes,
                      batch_size=config.batch_size,
                      device=config.device)
    model = torch.nn.DataParallel(model).to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    loss_criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    model.train()
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        batch_targets = batch_targets.to(device)

        # if input_dim == 10 - convert to one-hot, otherwise - add a
        # trailing feature dimension and use as it is
        if config.input_dim == 10 and batch_inputs.dim() < 3:
            batch_inputs = to_one_hot(batch_inputs).to(device)
        else:
            batch_inputs = batch_inputs.unsqueeze(2).to(device)

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        outputs = model(batch_inputs)

        loss = loss_criterion(outputs, batch_targets)
        accuracy = calculate_accuracy(outputs, batch_targets)
        accuracies.append(accuracy.item())
        loss.backward()

        ######################################################################
        # QUESTION: what happens here and why?
        # ANSWER: the gradients are rescaled so that their total norm does
        # not exceed max_norm, which guards against exploding gradients.
        ######################################################################
        # Clipping must happen AFTER backward() (so there are gradients to
        # clip) and BEFORE step(); doing it right after zero_grad() has no
        # effect.  clip_grad_norm_ replaces the deprecated clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    # Context manager guarantees the file is flushed and closed.
    with open(filename, 'wb') as f:
        pickle.dump(accuracies, f)

    print('Done training.')
def analyze_grads_over_time(config, pretrain_model=False):
    """Plot the L2 norm of hidden-state gradients per time step for RNN vs LSTM.

    For each of the two model types a single batch is pushed through
    ``model.analyze_hs_gradients``, the cross-entropy loss is backpropagated,
    and the gradient stored on every entry of ``model.h_states`` is averaged
    over dim 0 and reduced to its L2 norm.  Both curves are drawn into one
    figure, saved under ``part1/figures/`` and shown.

    Note that ``config`` is mutated: ``input_length`` is set to 150 and, when
    ``pretrain_model`` is true, ``model_type`` is set to the model currently
    being analysed.

    Args:
        config: configuration namespace; ``device``, ``input_dim``,
            ``num_hidden``, ``num_classes``, ``batch_size`` and
            ``learning_rate`` are read here.
        pretrain_model: when true the model is obtained from
            ``train(config)`` instead of being freshly initialised.
            NOTE(review): this assumes ``train`` returns the trained model —
            confirm against the ``train`` in use.
    """
    device = torch.device(config.device)
    config.input_length = 150

    seed = 42
    torch.manual_seed(seed)
    np.random.seed(seed)

    total_norms = []
    for m in ["RNN", "LSTM"]:
        if pretrain_model:
            # Train the model type of this iteration; without this the same
            # config.model_type would be trained twice and plotted under
            # both the "RNN" and the "LSTM" label.
            config.model_type = m
            model = train(config)
        else:
            # Initialize the model that we are going to use
            model_cls = VanillaRNN if m == 'RNN' else LSTM
            model = model_cls(config.input_length, config.input_dim,
                              config.num_hidden, config.num_classes, device)
            model.to(device)

        # Initialize the dataset and data loader (note the +1)
        dataset = PalindromeDataset(config.input_length + 1)
        data_loader = DataLoader(dataset, batch_size=config.batch_size,
                                 num_workers=1)

        # Setup the loss and optimizer (the optimizer is only used to reset
        # gradients before the backward pass)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=config.learning_rate)

        # Get single batch from dataloader
        batch_inputs, batch_targets = next(iter(data_loader))

        # convert to one-hot along a new last dimension
        batch_inputs = torch.scatter(
            torch.zeros(*batch_inputs.size(), config.num_classes), 2,
            batch_inputs[..., None].to(torch.int64), 1).to(device)
        batch_targets = batch_targets.to(device)

        train_output = model.analyze_hs_gradients(batch_inputs)
        loss = criterion(train_output, batch_targets)

        optimizer.zero_grad()
        loss.backward()

        # h_states holds (t, h) pairs; walk them last-to-first.  The gradient
        # is averaged over dim 0 (presumably the batch dimension — TODO
        # confirm) to get a more stable estimate before taking the L2 norm.
        gradient_norms = [
            torch.mean(h.grad, dim=0).norm(2).item()
            for _, h in reversed(model.h_states)
        ]
        print(len(gradient_norms))
        total_norms.append(gradient_norms)

    # Derived from the sequence length instead of repeating the literal 150.
    time_steps = np.arange(config.input_length)
    print(time_steps)

    fig = plt.figure(figsize=(15, 10), dpi=150)
    fig.suptitle('L2-norm of Gradients across Time Steps', fontsize=36)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(total_norms[0], linewidth=2, color="tomato", label="RNN")
    ax.plot(total_norms[1], linewidth=2, color="darkblue", label="LSTM")
    ax.tick_params(labelsize=16)
    ax.set_xticks(time_steps[::10])
    ax.set_xticklabels(time_steps[::10])
    ax.set_xlabel('Backpropagation Step', fontsize=24)
    ax.set_ylabel('Gradient Norm (L2)', fontsize=24)
    ax.legend(prop={'size': 16})

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs('part1/figures/', exist_ok=True)
    plt.savefig("part1/figures/Analyze_gradients_pt_{}.png".format(
        str(pretrain_model)))
    plt.show()