def eval(config):
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda:0') if use_cuda else torch.device('cpu')
    dtype = torch.cuda.LongTensor if use_cuda else torch.LongTensor

    # Initialize the dataset
    dataset = pickle.load(open(config.dataset_path, 'rb'))

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                config.lstm_num_hidden, config.dropout_keep_prob,
                                config.lstm_num_layers).to(device)
    model.load_state_dict(torch.load(config.model_path))
    model.eval()

    print('Evaluating: ')
    num_summaries = 5

    # Get random initial characters
    rand_chars = [dataset._char_to_ix[random.choice(dataset._chars)] for i in range(num_summaries)]

    # To tensor
    prev_pred = torch.Tensor(rand_chars).type(dtype)
    prev_pred_one_hot = to_one_hot(prev_pred, dataset.vocab_size, dtype)

    predictions = []
    for i in range(config.sample_length):
        # Batch size 1
        prev_pred_one_hot = torch.unsqueeze(prev_pred_one_hot, 1)
        if i == 0:
            y_pred, hidden = model(prev_pred_one_hot.float())
        else:
            y_pred, hidden = model(prev_pred_one_hot.float(), hidden)

        if config.sampling_method == 'temp':
            # Sample from the network output as a multinomial distribution
            output_dist = y_pred.data.div(config.temperature).exp()
            y_pred_batch_idx = output_dist.squeeze(1).multinomial(1).type(dtype)
        else:
            # Greedy decoding: take the argmax
            y_pred_batch_idx = y_pred.argmax(2).type(dtype)

        # Back to one-hot for the next step
        prev_pred_one_hot = to_one_hot(y_pred_batch_idx.flatten(), dataset.vocab_size, dtype)
        predictions.append(y_pred_batch_idx.flatten().cpu().detach().numpy())

    predictions = np.asarray(predictions).T
    summaries = [dataset.convert_to_string(pred) for pred in list(predictions)]
    print("{} \n".format('\n'.join(summaries)))

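# NOTE: `to_one_hot` is called above but not defined in this snippet. A minimal sketch of
# the assumed helper follows; it presumes `indices` is a 1-D tensor of character ids and
# returns a (len(indices), vocab_size) one-hot tensor cast to the requested tensor type.
def to_one_hot(indices, vocab_size, dtype=torch.LongTensor):
    # Allocate on the same device as the indices so scatter_ works for CPU and GPU inputs
    one_hot = torch.zeros(indices.shape[0], vocab_size, device=indices.device)
    one_hot.scatter_(1, indices.long().view(-1, 1), 1)
    return one_hot.type(dtype)
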
def generate_sequence(config, seed=0, temp=0, seq_length=30,
                      model_path='output_dir/kant_100_4.pt', init_char='t'):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Initialize the dataset
    dataset = TextDataset(config.txt_file, config.seq_length)

    # Initialize the model and load the trained weights
    model = TextGenerationModel(1, 1, dataset.vocab_size, config.lstm_num_hidden,
                                config.lstm_num_layers, config.device).to(device)
    model.load_state_dict(torch.load(model_path, map_location=config.device))
    model.eval()

    word_list = [dataset._char_to_ix[char] for char in init_char]
    state = model.init_state()

    for step in range(seq_length):
        last = torch.tensor([[word_list[step]]]).long().to(device)
        output, state = model.predict(last, state, temp=temp)

        if step + 1 >= len(word_list):
            if temp > 0:
                # Sample from the temperature-scaled distribution
                word_list.append(torch.multinomial(output.squeeze(), 1).item())
            else:
                # Greedy decoding
                word_list.append(torch.argmax(output).item())

    sequence = ''.join([dataset._ix_to_char[ix] for ix in word_list])
    return sequence

def eval():
    # Torch settings
    torch.set_default_tensor_type(torch.FloatTensor)

    # Initialize the dataset
    dataset = TextDataset(config.txt_file, config.seq_length)

    # Get temperature
    temp = config.temperature

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size)

    # Load model, if there's any model to load
    model, steps = load_model(model)
    print("Model trained for", steps, "steps")
    model.eval()

    try:
        while True:
            # Get input for the start of the sentence
            start = input("\nStart: ")

            # Convert input to one-hot representation (length x vocab_size)
            try:
                start_oh = get_one_hot(start, dataset)
            except KeyError:
                print("One or more characters were not recognized. Try again!")
                continue

            # Generate the rest of the sentence
            sentence = dataset.convert_to_string(
                model.cmd_generate(start_oh, temp, config.seq_length))

            print("Model says:\n")
            print(start + sentence)
    except KeyboardInterrupt:
        print("\n\n" + random.choice(quit_msgs))

def train(config):

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Save the instantiated dataset.
    with open('model_ckpt/train.dataset', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                config.lstm_num_hidden, config.lstm_num_layers, device,
                                config.dropout_keep_prob)

    # Setup the loss and optimizer
    # reduction='mean' (default) averages over all timesteps and batches as they are merged.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), config.learning_rate)
    # optimizer = optim.Adam(model.parameters(), config.learning_rate)

    # Create a tensor to hold the one-hot encoding for the batch inputs.
    onehot_batch_inputs = torch.FloatTensor(config.seq_length, config.batch_size, dataset.vocab_size)
    onehot_batch_inputs = onehot_batch_inputs.to(device)

    h_init = torch.zeros(config.lstm_num_layers, config.batch_size, config.lstm_num_hidden, device=device)
    c_init = torch.zeros(config.lstm_num_layers, config.batch_size, config.lstm_num_hidden, device=device)

    # Record the learning rate steps individually for learning rate decay.
    lr_step = 0
    lr = 1

    for epoch in np.arange(config.epochs):
        losses = []
        accs = []
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Only for time measurement of step through network
            t1 = time.time()

            model.train()

            # Convert the DataLoader output from a list of tensors to a tensor.
            batch_inputs = torch.stack(batch_inputs)
            batch_inputs = batch_inputs.to(device)

            # If the epoch is finished and there are not enough characters left, break the loop.
            if batch_inputs.shape[0] * batch_inputs.shape[1] != onehot_batch_inputs.shape[0] * onehot_batch_inputs.shape[1]:
                break

            # Zero the one-hot encoding and encode according to batch_inputs.
            onehot_batch_inputs.zero_()
            onehot_batch_inputs.scatter_(2, batch_inputs.unsqueeze_(-1), 1)

            # Convert the DataLoader output from a list of tensors to a tensor.
            batch_targets = torch.stack(batch_targets)
            batch_targets = batch_targets.to(device)

            # Learning rate decay.
            if lr_step % config.learning_rate_step == 0:
                optimizer = optim.RMSprop(model.parameters(), config.learning_rate * lr)
                lr *= config.learning_rate_decay

            optimizer.zero_grad()
            logits, _, _ = model(onehot_batch_inputs, h_init, c_init)

            # The seq_length and batch_size dimensions of the logits and batch_targets are
            # merged together, and the mean is computed over this new dimension.
            loss = criterion(logits.view(-1, dataset.vocab_size), batch_targets.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

            accuracy = accuracy_fn(logits.view(-1, dataset.vocab_size), batch_targets.view(-1))
            optimizer.step()

            losses.append(loss.item())
            accs.append(accuracy)

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if step % config.print_every == 0:
                print("[{}] Epoch {}/{}, Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M"), epoch + 1, config.epochs, step,
                          config.train_steps, config.batch_size, examples_per_second, accuracy, loss))

            if step % config.sample_every == 0:
                # Generate some sentences by sampling from the model
                model.eval()

                # Create a tensor to hold the generated samples.
                samples = torch.zeros((5, config.sample_length), dtype=torch.int, device=device)

                # Initialize the first characters for the samples.
                start_chars = torch.randint(dataset.vocab_size, size=(1, 5, 1), dtype=torch.long, device=device)
                samples[:, 0] = start_chars.squeeze()

                # One-hot encoding for the output characters of the LSTM network (one per time step).
                onehot_chars = torch.zeros((1, 5, dataset.vocab_size), device=device)
                onehot_chars.scatter_(2, start_chars, 1)

                last_h = torch.zeros(config.lstm_num_layers, 5, config.lstm_num_hidden, device=device)
                last_c = torch.zeros(config.lstm_num_layers, 5, config.lstm_num_hidden, device=device)

                for t in np.arange(config.sample_length - 1):
                    logits, last_h, last_c = model(onehot_chars, last_h, last_c)
                    next_chars = logits.squeeze().argmax(-1)
                    onehot_chars.zero_()
                    onehot_chars.scatter_(2, next_chars.view(1, 5, 1), 1)
                    samples[:, t + 1] = next_chars

                samples = samples.tolist()
                samples = [dataset.convert_to_string(sample) for sample in samples]

                # Output the samples into a text file.
                with open(config.summary_path + 'samples.txt', 'a') as txt_file:
                    txt_file.write('Epoch: {}\nStep: {}\n'.format(epoch + 1, step))
                    txt_file.writelines(map(lambda x: x + '\n', samples))

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

            lr_step += 1

        # After each training epoch, save the model and the training loss and accuracy.
        model.train()
        torch.save(model.state_dict(), 'model_ckpt/lstm_gen_epoch{}.ckpt'.format(epoch + 1))
        with open(config.summary_path + 'train_epoch{}.csv'.format(epoch + 1), 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(losses)
            csv_writer.writerow(accs)

    print('Done training.')

def train(config):

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True, num_workers=1)
    vocab_size = dataset.vocab_size
    # char2i = dataset._char_to_ix
    # i2char = dataset._ix_to_char

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, vocab_size,
                                config.lstm_num_hidden, config.lstm_num_layers, device)
    model.to(device)

    # Setup the loss and optimizer
    criterion = nn.NLLLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)
    logSoftmax = nn.LogSoftmax(dim=2)

    # Learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=config.learning_rate_step,
                                                   gamma=config.learning_rate_decay)

    step = 1

    # Optionally resume from a checkpoint
    if config.resume:
        if os.path.isfile(config.resume):
            print("Loading checkpoint '{}'".format(config.resume))
            checkpoint = torch.load(config.resume)
            step = checkpoint['step']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            print("Checkpoint loaded '{}', steps {}".format(config.resume, checkpoint['step']))

    if not os.path.isdir(config.summary_path):
        os.makedirs(config.summary_path)

    if config.sampling == "greedy":
        f = open(os.path.join(config.summary_path, "sampled_" + config.sampling + ".txt"), "w+")
    else:
        f = open(os.path.join(config.summary_path, "sampled_" + config.sampling + "_" + str(config.temp) + ".txt"), "w+")

    best_accuracy = 0.0
    pl_loss = []
    average_loss = []
    acc = []

    for epochs in range(30):

        if step == config.train_steps:
            print('Done training.')
            break

        for (batch_inputs, batch_targets) in data_loader:

            if config.batch_size != batch_inputs.size()[0]:
                print("batch mismatch")
                break

            # Only for time measurement of step through network
            t1 = time.time()

            model.hidden = model.init_hidden(config.batch_size)
            model.zero_grad()

            # Convert batch inputs to one-hot vectors
            batch_inputs = torch.zeros(config.batch_size, config.seq_length, vocab_size).scatter_(
                2, batch_inputs.unsqueeze(-1), 1.0)
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

            predictions, _ = model(batch_inputs)
            if config.sampling == "greedy":
                predictions = logSoftmax(predictions)
            else:
                predictions = logSoftmax(predictions / config.temp)

            loss = criterion(predictions.transpose(2, 1), batch_targets)

            _, predictions = torch.max(predictions, dim=2, keepdim=True)
            predictions = (predictions.squeeze(-1) == batch_targets).float()
            accuracy = torch.mean(predictions)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
            optimizer.step()
            lr_scheduler.step()

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            pl_loss.append(loss.item())
            average_loss.append(np.mean(pl_loss[:-100:-1]))
            acc.append(accuracy.item())

            if step % config.print_every == 0:
                print("[{}] Train Step {}/{}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M"), step, config.train_steps,
                          config.batch_size, examples_per_second, accuracy, loss.item()))

            if step % config.sample_every == 0:
                model.eval()
                with torch.no_grad():
                    char_ix = generate_sample(model, vocab_size, config.seq_length, device, config)
                    sentence = dataset.convert_to_string(char_ix)
                    f.write("--------------" + str(step) + "----------------\n")
                    f.write(sentence + "\n")
                    print(sentence)
                    print()
                model.train()

            # Save the training loss plot
            plt.plot(pl_loss, 'r-', label="Batch loss", alpha=0.5)
            plt.plot(average_loss, 'g-', label="Average loss", alpha=0.5)
            plt.legend()
            plt.xlabel("Iterations")
            plt.ylabel("Loss")
            plt.title("Training Loss")
            plt.grid(True)
            if config.sampling == "greedy":
                plt.savefig("loss_" + config.sampling + ".png")
            else:
                plt.savefig("loss_" + config.sampling + "_" + str(config.temp) + ".png")
            plt.close()

            # Save the training accuracy plot
            plt.plot(acc, 'g-', alpha=0.5)
            plt.xlabel("Iterations")
            plt.ylabel("Accuracy")
            plt.title("Train Accuracy")
            plt.grid(True)
            if config.sampling == "greedy":
                plt.savefig("accuracy_" + config.sampling + ".png")
            else:
                plt.savefig("accuracy_" + config.sampling + "_" + str(config.temp) + ".png")
            plt.close()

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

            step += 1

        save_checkpoint({
            'epoch': epochs + 1,
            'step': step,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'accuracy': accuracy
        }, config)

    f.close()

def train(config):

    # Print all configs to confirm parameter settings
    print_flags()

    assert config.sampling_method in ('greedy', 'random')
    assert config.generate_mode in ('generate', 'finish')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(filename=config.txt_file, seq_length=config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Initialize the model that we are going to use
    model = TextGenerationModel(batch_size=config.batch_size,
                                seq_length=config.seq_length,
                                vocabulary_size=dataset.vocab_size,
                                dropout=1 - config.dropout_keep_prob,
                                lstm_num_hidden=config.lstm_num_hidden,
                                lstm_num_layers=config.lstm_num_layers,
                                device=device)
    model.to(device)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    epoch = 10

    # Store some measures
    los = list()
    iteration = list()
    acc = list()
    max_step = 0

    for i in range(epoch):
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Only for time measurement of step through network
            t1 = time.time()

            model.train()
            optimizer.zero_grad()

            batch_inputs = torch.stack(batch_inputs).to(device)
            batch_targets = torch.stack(batch_targets).to(device)

            h_0 = torch.zeros(config.lstm_num_layers, batch_inputs.shape[1], config.lstm_num_hidden).to(device)
            c_0 = torch.zeros(config.lstm_num_layers, batch_inputs.shape[1], config.lstm_num_hidden).to(device)

            pred, _, _ = model(batch_inputs, h_0, c_0)
            accuracy = compute_accuracy(pred, batch_targets)
            pred = pred.permute(1, 2, 0)
            batch_targets = batch_targets.permute(1, 0)
            loss = criterion(pred, batch_targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
            optimizer.step()

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if (step + i * max_step) % config.print_every == 0:
                print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M"), step + i * max_step,
                          int(config.train_steps), config.batch_size, examples_per_second,
                          accuracy, loss))

            iteration.append(step + i * max_step)
            acc.append(accuracy)
            los.append(loss.item())
            if max_step < step:
                max_step = step

            if (step + i * max_step) % config.sample_every == 0:
                model.eval()
                batch_sample = 5

                if config.generate_mode == 'finish':
                    # Complete a given input sequence
                    generated = [dataset._char_to_ix[c] for c in config.input_seq]
                    generated = torch.LongTensor(generated).view(-1, 1).to(device)
                    for l in range(config.generate_length):
                        if l == 0:
                            h_s = torch.zeros(config.lstm_num_layers, 1, config.lstm_num_hidden).to(device)
                            c_s = torch.zeros(config.lstm_num_layers, 1, config.lstm_num_hidden).to(device)
                            gen, h_s, c_s = model(generated, h_s, c_s)
                            gen = torch.unsqueeze(gen[-1], 0)
                        else:
                            gen, h_s, c_s = model(gen, h_s, c_s)
                        if config.sampling_method == 'greedy':
                            gen = gen.argmax(dim=2)
                        else:
                            gen = nn.functional.softmax(gen / config.temperature, dim=2)
                            dist = torch.distributions.categorical.Categorical(gen)
                            gen = dist.sample()
                        generated = torch.cat((generated, gen))
                else:
                    # Generate sequences from random start characters
                    generated = [dataset._char_to_ix[random.choice(dataset._chars)] for c in range(batch_sample)]
                    generated = torch.LongTensor(generated).view(-1, batch_sample).to(device)
                    for l in range(config.generate_length - 1):
                        if l == 0:
                            h_s = torch.zeros(config.lstm_num_layers, batch_sample, config.lstm_num_hidden).to(device)
                            c_s = torch.zeros(config.lstm_num_layers, batch_sample, config.lstm_num_hidden).to(device)
                            gen, h_s, c_s = model(generated, h_s, c_s)
                        else:
                            gen, h_s, c_s = model(gen, h_s, c_s)
                        if config.sampling_method == 'greedy':
                            gen = gen.argmax(dim=2)
                        else:
                            gen = nn.functional.softmax(gen / config.temperature, dim=2)
                            dist = torch.distributions.categorical.Categorical(gen)
                            gen = dist.sample()
                        generated = torch.cat((generated, gen))

                generated = generated.t()
                sentence = [dataset.convert_to_string(idx) for idx in generated.tolist()]

                if config.sampling_method == 'random':
                    file_name = '{}/{}_{}_{}_{}.txt'.format(config.summary_path, config.generate_mode,
                                                            datetime.now().strftime("%Y-%m-%d"),
                                                            config.sampling_method, config.temperature)
                else:
                    file_name = '{}/{}_{}_{}.txt'.format(config.summary_path, config.generate_mode,
                                                         datetime.now().strftime("%Y-%m-%d"),
                                                         config.sampling_method)
                with open(file_name, 'a', encoding='utf-8') as file:
                    file.write('--------------\n')
                    file.write('Training Step: {}\n'.format(step + i * max_step))
                    file.write('--------------\n')
                    for sen in sentence:
                        file.write('{}\n'.format(sen))
                    file.write('\n')

            if (step + i * max_step) == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

        if (step + i * max_step) == config.train_steps:
            break

    print('Done training.')

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    axs[0].plot(iteration, acc)
    axs[0].set_xlabel('Iteration')
    axs[0].set_ylabel('Accuracy')
    axs[1].plot(iteration, los)
    axs[1].set_xlabel('Iteration')
    axs[1].set_ylabel('Loss')
    fig.tight_layout()
    plt.show()

def train(config, CHOICES):

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                config.temperature).cuda()
    if CHOICES['LOAD_BEST_MODEL']:
        model.load_state_dict(torch.load('./model_parameter.txt'))

    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)
    if CHOICES['LOAD_BEST_MODEL']:
        optimizer.load_state_dict(torch.load('./model_optimizer.txt'))

    accuracy_list = []
    loss_list = []
    string_list = []
    tmp_accuracy = 0
    a = 76

    while (tmp_accuracy == 0) or (accuracy_list[-1] > 0.85):
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Only for time measurement of step through network
            t1 = time.time()

            # seq_length x batch_size
            batch_inputs = torch.stack(batch_inputs)[:, :, None].view(config.seq_length, -1).to(device)
            batch_targets = torch.stack(batch_targets)[:, :, None].view(config.seq_length, -1).to(device)
            if int(batch_inputs.size()[1]) != config.batch_size:
                continue

            # seq_length x batch_size x vocab_size
            batch_inputs_onehot = one_hot(batch_inputs, dataset.vocab_size)

            optimizer.zero_grad()

            out = model(batch_inputs_onehot)
            loss_criterion = criterion(out, batch_targets)
            loss_criterion.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
            optimizer.step()

            loss = loss_criterion.item() / config.seq_length
            values, indices = torch.max(out, 1)
            accuracy = (indices[indices == batch_targets].size()[0]) / (config.batch_size * config.seq_length)

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if step % config.print_every == 0:
                print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M"), step, int(config.train_steps),
                          config.batch_size, examples_per_second, accuracy, loss))

            # Generate sentences
            if step % 50000 == 0 and CHOICES['GENERATE_FIVE_SENTENCES']:
                model.eval()
                test_input = (torch.Tensor(batch_inputs.size())).type(torch.LongTensor).to(device)
                a = a + 1
                test_input = test_input.fill_(a)
                output_string = generate_new_stings(model, test_input, dataset.vocab_size, config.seq_length)
                tmp = dataset.convert_to_string(output_string.cpu().numpy().tolist())
                string_list += [tmp]
                print(tmp)
                print('---')
                model.train()

                # Save parameters
                torch.save(model.state_dict(), './model_parameter{:d}.txt'.format(step))
                torch.save(optimizer.state_dict(), './model_optimizer{:d}.txt'.format(step))

            if CHOICES['DRAW_ACCURACY_PLOT']:
                accuracy_list += [accuracy]
                loss_list += [loss]

            if step == config.sample_every:
                # Generate some sentences by sampling from the model
                pass

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

            if CHOICES['GENERATE_FIVE_SENTENCES'] and (len(string_list) == 5):
                break

        if CHOICES['GENERATE_FIVE_SENTENCES'] and (len(string_list) == 5):
            break

        print("============ finish {} epoch ============ ".format(len(accuracy_list)))
        torch.save(model.state_dict(), './model_parameter.txt')
        torch.save(optimizer.state_dict(), './model_optimizer.txt')

    print('Done training.')

    if CHOICES['GENERATE_FIVE_SENTENCES']:
        if CHOICES['DRAW_ACCURACY_PLOT']:
            fig, ax = plt.subplots()
            ax.plot(np.arange(len(accuracy_list)), accuracy_list, 'r', label='accuracy')
            ax.plot(np.arange(len(accuracy_list)), loss_list, 'b', label='loss')
            legend = ax.legend(loc='upper center')
            plt.xlabel('Steps')
            plt.title('loss and accuracy of LSTM in 2000 steps')
            plt.show()

        for idx in range(5):
            print('====')
            print(string_list[idx])

def train(config):

    writer = torch.utils.tensorboard.SummaryWriter()

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size)

    # Initialize the model that we are going to use
    vocabulary_size = dataset.vocab_size
    model = TextGenerationModel(batch_size=config.batch_size, seq_length=config.seq_length,
                                vocabulary_size=vocabulary_size)
    model.to(device)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    accuracies = []
    losses = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Move to GPU
        batch_inputs = to_tensor_rep(batch_inputs).to(device)
        batch_targets = to_tensor_rep(batch_targets).to(device)

        # Reset for next iteration
        model.zero_grad()

        model_output = model(batch_inputs,
                             c_0=torch.zeros(config.lstm_num_layers, batch_inputs.shape[1],
                                             config.lstm_num_hidden, device=device),
                             h_0=torch.zeros(config.lstm_num_layers, batch_inputs.shape[1],
                                             config.lstm_num_hidden, device=device))

        # For each timestep, the cross-entropy loss is computed and subsequently averaged
        batch_losses = torch.zeros(config.seq_length, device=device)
        for i in range(config.seq_length):
            batch_losses[i] = criterion(model_output[i], batch_targets[i])
        loss = (1 / config.seq_length) * torch.sum(batch_losses)

        # Compute the gradients, clip them to prevent exploding gradients and backpropagate
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        optimizer.step()

        # Calculate accuracy
        predictions = torch.argmax(model_output, dim=2)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / (model_output.size(0) * model_output.size(1))

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if (step + 1) % config.print_every == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step, config.train_steps,
                      config.batch_size, examples_per_second, accuracy, loss))

        # Save loss and accuracy
        accuracies.append(accuracy)
        losses.append(loss.item())
        writer.add_scalar("loss", loss, step)
        writer.add_scalar("accuracy", accuracy, step)

        if (step + 1) % config.sample_every == 0:
            model.eval()
            generate_sequence(model, 62, dataset)
            model.train()

        if step == config.train_steps:
            break

    print('Done training.')

    # Make loss and accuracy plots
    x = np.arange(len(accuracies))
    plot_curve(x, accuracies, "Accuracy", "Training accuracy")
    plot_curve(x, losses, "Loss", "Training Loss")

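# NOTE: `to_tensor_rep` (used above) is not defined in this snippet. A minimal sketch of
# the assumed helper follows: it stacks the list of per-timestep index tensors produced
# by the DataLoader into a single LongTensor of shape (seq_length, batch_size), which is
# the layout the training loop above indexes into.
def to_tensor_rep(batch):
    # `batch` is assumed to be a list of length seq_length, each element a (batch_size,) tensor
    return torch.stack([t.long() for t in batch], dim=0)
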
def evaluate(config):

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Load the dataset
    with open(config.dataset, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                config.lstm_num_hidden, config.lstm_num_layers, device,
                                config.dropout_keep_prob)
    model.load_state_dict(torch.load(config.ckpt))

    # Generate some sentences by sampling from the model
    model.eval()

    # Create a tensor to hold the generated samples.
    samples = torch.zeros((config.sample_batch_size, config.sample_length), dtype=torch.int,
                          device=device, requires_grad=False)

    last_h = torch.zeros(config.lstm_num_layers, config.sample_batch_size, config.lstm_num_hidden,
                         device=device, requires_grad=False)
    last_c = torch.zeros(config.lstm_num_layers, config.sample_batch_size, config.lstm_num_hidden,
                         device=device, requires_grad=False)

    if config.pre_text:
        # Feed the given prefix through the network before sampling.
        pre_input = torch.tensor([dataset._char_to_ix[ch] for ch in config.pre_text] * 10,
                                 device=device,
                                 requires_grad=False).view(config.sample_batch_size, -1).t().unsqueeze(-1)
        onehot_pre_input = torch.zeros((pre_input.shape[0], pre_input.shape[1], dataset.vocab_size),
                                       device=device, requires_grad=False)
        onehot_pre_input.scatter_(2, pre_input, 1)
        logits, last_h, last_c = model(onehot_pre_input, last_h, last_c)
        logits = nn.functional.softmax(logits[-1, :, :].unsqueeze(-1) / config.temperature, dim=1)
        start_chars = logits.squeeze().argmax(-1)
        samples[:, 0] = start_chars
        onehot_chars = torch.zeros((1, config.sample_batch_size, dataset.vocab_size),
                                   device=device, requires_grad=False)
        onehot_chars.scatter_(2, start_chars.view(1, config.sample_batch_size, 1), 1)
    else:
        # Initialize the first characters for the samples.
        start_chars = torch.randint(dataset.vocab_size, size=(1, config.sample_batch_size, 1),
                                    dtype=torch.long, device=device, requires_grad=False)
        samples[:, 0] = start_chars.squeeze()
        # One-hot encoding for the output characters of the LSTM network (one per time step).
        onehot_chars = torch.zeros((1, config.sample_batch_size, dataset.vocab_size),
                                   device=device, requires_grad=False)
        onehot_chars.scatter_(2, start_chars, 1)

    for t in np.arange(config.sample_length - 1):
        logits, last_h, last_c = model(onehot_chars, last_h, last_c)
        logits = nn.functional.softmax(logits / config.temperature, dim=2)
        next_chars = logits.squeeze().argmax(-1)
        onehot_chars.zero_()
        onehot_chars.scatter_(2, next_chars.view(1, config.sample_batch_size, 1), 1)
        samples[:, t + 1] = next_chars

    samples = samples.tolist()
    samples = [dataset.convert_to_string(sample) for sample in samples]

    # Output the samples into a text file.
    with open(config.summary_path + 'samples.txt', 'a') as txt_file:
        txt_file.write('Temperature: {}\nSample length: {}\n'.format(config.temperature, config.sample_length))
        txt_file.writelines(map(lambda x: config.pre_text + x + '\n', samples))

    print('Done evaluation.')

def train():

    # Torch settings
    device = torch.device(config.device)
    if device.type == 'cpu':
        torch.set_default_tensor_type(torch.FloatTensor)
    elif device.type == 'cuda':
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
    dtype = torch.float

    # Tensorboard summary writer
    if config.tensorboard:
        run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_" + config.model_type.lower()
                                         + '_' + str(config.input_length))
        log_dir = 'tensorboard/' + config.model_type.lower() + '/' + run_id
        writer = SummaryWriter(log_dir=log_dir)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Model parameters
    lr = config.learning_rate
    lr_decay = config.learning_rate_decay
    lr_step = config.learning_rate_step
    dropout = 1.0 - config.dropout_keep_prob
    temp = [0.5, 1., 2.]
    assert config.sample_num % 3 == 0

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                dropout, device).to(device)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Characters used to start sentences (closing characters such as ')', '.' or others were removed)
    start_characters = ['1', '2', '3', '4', '5', '6', '7', '8', '9',
                        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                        '(', '[', '*', '-', '‘', '“']
    start_characters = list(set(start_characters) & set(dataset.vocab))

    # Store all generated sentences
    sentences = {}

    # Load model, if there's any model to load
    model, optimizer, sentences, start_step = load_model(model, optimizer, sentences, step=0)

    try:
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # If the model has been loaded, regulate step number accordingly
            step += start_step

            # Only for time measurement of step through network
            t1 = time.time()

            # Get batches as tensors of size (batch_size x seq_length)
            batch_inputs = torch.stack(batch_inputs).permute((1, 0))
            batch_targets = torch.stack(batch_targets).permute((1, 0)).to(device)

            # Convert batches to one-hot representation (batch_size x seq_length x vocab_size)
            batch_inputs = get_one_hot(batch_inputs, config.batch_size, config.seq_length,
                                       dataset.vocab_size).to(device)

            # Forward pass
            model.train()
            optimizer.zero_grad()
            predictions = model.forward(batch_inputs)

            # Compute loss
            loss = criterion(predictions.permute(0, 2, 1), batch_targets)

            # Backward pass
            loss.backward()

            # Clipping gradients to avoid exploding gradient problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

            # Update weights
            optimizer.step()

            # Compute accuracy
            accuracy = get_accuracy(predictions, batch_targets)

            # Add accuracy and loss to the writer
            if config.tensorboard:
                writer.add_scalars('Accuracy_and_Loss', {'accuracy': accuracy, 'loss': loss}, step)
                writer.add_scalar('Learning_Rate', lr, step)

            # Update learning rate
            if (step % lr_step == 0) and step != 0:
                lr *= lr_decay
                for group in optimizer.param_groups:
                    group['lr'] = lr

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if step % config.print_every == 0:
                print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M:%S"), step, int(config.train_steps),
                          config.batch_size, examples_per_second, accuracy, loss))

            if step % config.sample_every == 0:
                model.eval()

                # Store sentences for this step
                step_sentences = {temp[0]: [], temp[1]: [], temp[2]: []}

                # Get 6 random starter characters
                sample = random.sample(start_characters, config.sample_num)
                print()
                for idx, c in enumerate(sample):
                    # Temperature parameter
                    t = temp[int(idx / 2)]

                    # Character's one-hot representation
                    c_oh = torch.tensor(dataset.convert_to_one_hot(c), dtype=dtype).to(device)

                    # Returns a sentence of indexes and length 30
                    sentence = dataset.convert_to_string(model.generate(c_oh, t))
                    print("[t={:.1f}] {}".format(t, sentence.replace('\n', '\\n ')))
                    step_sentences[t].append(sentence)
                print()
                sentences[step] = step_sentences

            if (step % config.save_every == 0) and step != 0:
                save_model(model, optimizer, sentences, step)

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

        if config.tensorboard:
            writer.close()
        print('Done training.')

    except (KeyboardInterrupt, BrokenPipeError):
        if config.tensorboard:
            writer.close()
        print("\n" + random.choice(quit_msgs))

def train(config):

    # Create output directories (if they do not already exist)
    os.makedirs('./generated_text/', exist_ok=True)
    os.makedirs('./models/', exist_ok=True)
    os.makedirs('./part2/generated_text/', exist_ok=True)
    os.makedirs('./part2/models/', exist_ok=True)

    # Initialize the device which to run the model on.
    # If GPU was chosen, check if CUDA is available.
    if str(config.device) != "cpu":
        if not torch.cuda.is_available():
            print('\n* GPU was selected but CUDA is not available.\nTraining on CPU ...\n')
            device = torch.device("cpu")
        else:
            print('\n* CUDA is available! Training on GPU ...\n')
            device = torch.device(config.device)
    else:
        print('\n* Training on CPU ...\n')
        device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                lstm_num_hidden=config.lstm_num_hidden,
                                lstm_num_layers=config.lstm_num_layers,
                                drop_prob=1.0 - config.dropout_keep_prob,
                                device=device).to(device)

    # Setup the loss, optimizer and scheduler
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=config.learning_rate_step,
                                                gamma=config.learning_rate_decay)

    train_accuracy, train_loss = [], []
    eval_steps, eval_loss, eval_accuracy = [], [], []

    for epoch in range(config.epochs):

        # Print current epoch
        print('\n', '-' * 56, 'epoch: {}/{}'.format(epoch + 1, config.epochs), '-' * 56)

        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Enable train mode
            model.train()

            # Only for time measurement of step through network
            t1 = time.time()

            # batch_inputs has batch_size x seq_length dimensions
            batch_inputs = torch.stack(batch_inputs, dim=1).to(device)
            batch_targets = torch.stack(batch_targets, dim=1).to(device)

            # Update batch size
            # -- in case the last batch is smaller than the configured one
            config.batch_size = batch_inputs.shape[0]

            # Clear accumulated gradients
            optimizer.zero_grad()

            # Forward pass
            predictions = model(batch_inputs)

            # Calculate loss
            loss = criterion(predictions, batch_targets.view(config.batch_size * config.seq_length))

            # Store train accuracy and loss
            train_loss.append(loss.item())
            train_accuracy.append(accuracy(predictions, batch_targets))

            # Back-propagate
            loss.backward()

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

            # Update weights and scheduler
            optimizer.step()
            scheduler.step()

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if step % config.print_every == 0:
                print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M"), step, config.train_steps,
                          config.batch_size, examples_per_second, train_accuracy[-1], train_loss[-1]))

            if step % config.sample_every == 0:
                # Generate sentences by sampling from the model
                print("\n* Sampling...\n")

                # Model into evaluation mode
                model.eval()

                # Whether summaries are printed during training
                print_ = False

                # Tempering sampling
                betas = [0.5, 1, 2]
                for beta in betas:
                    tempering_sample(model, dataset, beta, config, device, epoch, step, print_)

                # Greedy sampling
                greedy_sample(model, dataset, config, device, epoch, step, print_)

                # Bonus part: generate text given a sentence
                sentence = 'They run into the train.'
                T = 2000
                sampling_methodes = ['top_k', 'beta']
                for sampling_meth in sampling_methodes:
                    gen_from_word(sentence, model, dataset, config, device, epoch, step,
                                  sampling_meth, T, print_)

                sentence = 'Anna'
                T = 2000
                sampling_methodes = ['top_k', 'beta']
                for sampling_meth in sampling_methodes:
                    gen_from_word(sentence, model, dataset, config, device, epoch, step,
                                  sampling_meth, T, print_)

                # Save the trained model -- checkpoint
                # save_model(epoch, step, model)

                # Save loss and accuracy
                eval_steps.append(step)
                eval_loss.append(train_loss[-1])
                eval_accuracy.append(train_accuracy[-1])
                np.savez('lstm.npz', eval_steps=eval_steps, eval_loss=eval_loss,
                         eval_accuracy=eval_accuracy)

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

    print('Done training.')

    # Save the trained model -- checkpoint
    save_model(epoch, step, model)

def train(config):

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    config.txt_file = './assets/book_EN_grimms_fairy_tails.txt'

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size, 64,
                                config.dropout_keep_prob, config.lstm_num_hidden,
                                config.lstm_num_layers, device)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    step = 0
    loss_list = []
    accuracy_list = []

    while step < 33600:
        for (batch_inputs, batch_targets) in data_loader:
            model.train()
            step += 1

            # Only for time measurement of step through network
            t1 = time.time()

            batch_inputs = torch.stack(batch_inputs).to(device)
            batch_targets = torch.stack(batch_targets, dim=1).to(device)

            model.zero_grad()
            pred, _ = model(batch_inputs)
            pred = pred.view(-1, dataset.vocab_size)
            batch_targets = batch_targets.view(-1)

            loss = criterion(pred, batch_targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

            predictions = torch.argmax(pred, dim=1)
            correct = (predictions == batch_targets).sum().item()
            accuracy = correct / pred.size(0)

            accuracy_list.append(accuracy)
            loss_list.append(loss.item())

            # Just for time measurement
            t2 = time.time()
            examples_per_second = 64 / float(t2 - t1)

            if (step + 1) % 60 == 0:
                print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                          datetime.now().strftime("%Y-%m-%d %H:%M"), step, 1000000, 64,
                          examples_per_second, accuracy, loss))

            if step % 11200 == 0:
                # Generate some sentences by sampling from the model
                model.eval()
                for i in range(5):
                    for temperature in [0, 0.5, 1.0, 2.0]:
                        for length in [30, 40, 60]:
                            sentence = generate_sequence(dataset, model, device, temperature, length)
                            with open('./summaries.txt', 'a', encoding='utf-8') as file:
                                file.write("{};{};{};{};{}\n".format(i, step, temperature, length, sentence))

            if step == 33600:
                # If you receive a PyTorch data-loader error,
                # check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

    plt.subplot(2, 1, 1)
    plt.plot(np.arange(len(accuracy_list)), accuracy_list, 'o-')
    plt.xlabel('Step')
    plt.ylabel('Accuracy')

    plt.subplot(2, 1, 2)
    plt.plot(np.arange(len(loss_list)), loss_list)
    plt.xlabel('Step')
    plt.ylabel('Loss')

    print('Done training.')

parser = argparse.ArgumentParser()
parser.add_argument(dest='path', type=str, help="Path to the trained model.")
parser.add_argument('-d', dest='data', type=str, default='assets/book_EN_grimms_fairy_tails.txt',
                    help="Path to the dataset.")
parser.add_argument('-t', dest='temperature', type=float, default=1, help="Sampling temperature.")
args = parser.parse_args()

checkpoint = torch.load(args.path)
dataset = TextDataset(args.data, 30)
model = TextGenerationModel(512, 30, 87, lstm_num_hidden=128).cuda()
model.load_state_dict(checkpoint['state_dict'])
model.eval()

# Randomly sample sequences from the model.
sample = model.sample(True, args.temperature)
sample = sample_text(dataset, sample)
for s in sample:
    print(s)

def train(config):

    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)
    vocabulary_size = dataset.vocab_size

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, vocabulary_size,
                                config.lstm_num_hidden, config.lstm_num_layers, device=device)

    # Setup the loss and optimizer
    optimizer = optim.RMSprop(model.parameters(), config.learning_rate)
    criterion = nn.CrossEntropyLoss()

    accuracies = []
    losses = []

    h0 = torch.zeros(config.lstm_num_layers, config.batch_size, config.lstm_num_hidden)
    c0 = torch.zeros(config.lstm_num_layers, config.batch_size, config.lstm_num_hidden)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        model.train()
        optimizer.zero_grad()

        prediction, _ = model(batch_inputs, h0, c0)
        loss = criterion(prediction.permute(1, 2, 0), batch_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        optimizer.step()

        _, prediction = prediction.max(-1)
        accuracy = (prediction.t() == batch_targets).sum().item() / (prediction.shape[0] * prediction.shape[1])
        accuracies.append(accuracy * 100)
        losses.append(loss.item())

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % config.print_every == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step, int(config.train_steps),
                      config.batch_size, examples_per_second, accuracy, loss))

        if step % config.sample_every == 0:
            # Temperature from [0.5, 1.0, 2.0]
            temp = 0.5
            model.eval()

            h1 = torch.zeros(config.lstm_num_layers, 1, config.lstm_num_hidden)
            c1 = torch.zeros(config.lstm_num_layers, 1, config.lstm_num_hidden)

            # Set the first character to a random symbol from the vocabulary
            symbol = torch.randint(low=0, high=dataset.vocab_size, size=(1, 1)).long().to(device)
            # Alternative initializations:
            # a random upper- or lowercase letter from the alphabet
            #   alphabet = list(string.ascii_uppercase)  # or list(string.ascii_lowercase)
            #   symbol = torch.tensor([dataset.convert_to_idx(alphabet[randrange(26)])])
            # or the fixed first character 'S'
            #   symbol = torch.tensor([dataset.convert_to_idx('S')])

            generated_text = []
            generated_text.append(symbol.item())
            generated_seq_length = 60

            for i in range(generated_seq_length):
                pred_symbol, (h1, c1) = model(symbol, h1, c1)
                # Greedy decoding (without using temperature)
                _, prediction_symbol = pred_symbol.max(-1)
                symbol = prediction_symbol
                # Using the temperature function instead:
                # symbol = torch.tensor([[sample(pred_symbol, temperature=temp)]])
                generated_text.append(symbol.item())

            # print(dataset.convert_to_string(generated_text))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

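# NOTE: the temperature-based `sample` helper referenced (commented out) in the loop above
# is not defined in this snippet. A minimal sketch under the assumption that `logits` is a
# tensor of unnormalized scores over the vocabulary:
def sample(logits, temperature=1.0):
    # Scale the logits by the temperature and normalize into a probability distribution
    probs = torch.softmax(logits.squeeze() / temperature, dim=-1)
    # Draw one character index from the resulting categorical distribution
    return torch.multinomial(probs, 1).item()
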
def train(config, seed=0):

    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, drop_last=True)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size,
                                config.lstm_num_hidden, config.lstm_num_layers,
                                config.device).to(device)
    if config.load_model == 'load':
        model.load_state_dict(torch.load('output_dir/kant.pt'))
        model.eval()

    # Setup the loss and optimizer
    criterion = nn.NLLLoss()
    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate)

    loss_history = []
    acc_history = []
    count = 1

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Move to GPU
        batch_inputs = torch.cat([x.float().unsqueeze(dim=0) for x in batch_inputs]).long().to(device)
        batch_targets = torch.cat([y.float().unsqueeze(dim=0) for y in batch_targets]).long().to(device)

        # Reset for next iteration
        model.zero_grad()

        # Forward pass
        log_probs = model(batch_inputs)
        loss = criterion(log_probs.transpose(1, 2), batch_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        optimizer.step()

        predictions = torch.argmax(log_probs, dim=-1)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / (log_probs.size(1) * log_probs.size(0))

        loss_history.append(loss.item())
        acc_history.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if config.load_model == 'save' and step % 7000 == 0:
            torch.save(model.state_dict(), f'output_dir/kant_{config.seq_length}_{count}.pt')
            count += 1

        if (step + 1) % config.print_every == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step, config.train_steps,
                      config.batch_size, examples_per_second, accuracy, loss))

        if (step + 1) % config.sample_every == 0:
            # Generate some sentences by sampling from the model
            pass

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error,
            # check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    if config.load_model == 'save':
        torch.save(model.state_dict(), f'output_dir/kant_{config.seq_length}_{count}.pt')

    print('Done training.')
    print('Final loss:', loss_history[-1])
    print('Final acc:', acc_history[-1])

    return loss_history, acc_history