def train_rrn(hyperparameters: dict, data: dict):
    """Train an RRN model on a pre-split train/valid/test dataset.

    :param hyperparameters: dict with keys 'model_name', 'device', 'dim_x',
        'dim_y', 'num_iters', 'train_size', 'valid_size', 'test_size',
        'batch_size', 'epochs', 'save_epochs', 'embed_size',
        'hidden_layer_size', 'learning_rate', 'weight_decay'
    :param data: dict with keys 'train_inputs', 'train_outputs',
        'valid_inputs', 'valid_outputs', 'test_inputs', 'test_outputs';
        each value is a list of puzzles accepted by encode_input /
        encode_output (presumably GridStrings — TODO confirm with callers)
    :return: the trained RRN model
    """
    model_name = hyperparameters['model_name']
    device = hyperparameters['device']
    dim_x = hyperparameters['dim_x']
    dim_y = hyperparameters['dim_y']
    num_iters = hyperparameters['num_iters']
    train_size = hyperparameters['train_size']
    valid_size = hyperparameters['valid_size']
    test_size = hyperparameters['test_size']
    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['epochs']
    save_epochs = hyperparameters['save_epochs']
    embed_size = hyperparameters['embed_size']
    hidden_layer_size = hyperparameters['hidden_layer_size']
    learning_rate = hyperparameters['learning_rate']
    weight_decay = hyperparameters['weight_decay']

    train_inputs = data['train_inputs']
    train_outputs = data['train_outputs']
    valid_inputs = data['valid_inputs']
    valid_outputs = data['valid_outputs']
    test_inputs = data['test_inputs']
    test_outputs = data['test_outputs']

    # Encode everything on CPU first; only the first *_size examples are
    # moved to the GPU below.
    all_train_x = torch.stack([encode_input(p) for p in train_inputs])
    all_train_y = torch.stack([encode_output(p) for p in train_outputs])
    all_valid_x = torch.stack([encode_input(p) for p in valid_inputs])
    all_valid_y = torch.stack([encode_output(p) for p in valid_outputs])
    all_test_x = torch.stack([encode_input(p) for p in test_inputs])
    all_test_y = torch.stack([encode_output(p) for p in test_outputs])

    model = RRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size,
                hidden_layer_size=hidden_layer_size).cuda(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)

    train_losses = []  # epoch x batch
    train_accuracies = []  # epoch x batch x grid x timestep
    valid_losses = []  # epoch x batch
    valid_accuracies = []  # epoch x batch x grid x timestep
    times = []

    train_x = all_train_x[:train_size].cuda(device)
    train_y = all_train_y[:train_size].cuda(device)
    valid_x = all_valid_x[:valid_size].cuda(device)
    valid_y = all_valid_y[:valid_size].cuda(device)
    test_x = all_test_x[:test_size].cuda(device)
    test_y = all_test_y[:test_size].cuda(device)

    def closure():
        # One full pass over the (shuffled) training set; gradients from all
        # batches accumulate before the single optimizer.step() applies them.
        optimizer.zero_grad()
        total_loss = 0.0
        shuffle_indices = np.arange(len(train_x))
        np.random.shuffle(shuffle_indices)
        for i in tqdm(range(0, len(train_x), batch_size), leave=False):
            x_batch = train_x[shuffle_indices[i:i + batch_size]]
            y_batch = train_y[shuffle_indices[i:i + batch_size]]
            loss, accuracies = get_performance(model, x_batch, y_batch,
                                               num_iters)
            loss.backward()
            # BUGFIX: accumulate a Python float, not the loss tensor itself.
            # Summing tensors kept every batch's autograd graph alive for the
            # whole epoch, leaking GPU memory.
            total_loss += float(loss)
            train_losses[-1].append(float(loss))
            train_accuracies[-1].append(accuracies)
        return total_loss

    for i in tqdm(range(epochs)):
        start_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        start_time = time.time()

        train_losses.append([])
        train_accuracies.append([])
        train_loss = optimizer.step(closure)
        # BUGFIX: this conversion was performed twice in the original
        # (once before and once after the validation pass); once is enough.
        train_accuracies[-1] = np.array(train_accuracies[-1])

        valid_loss, valid_accuracy = get_performance(model, valid_x, valid_y,
                                                     num_iters)
        valid_losses.append(float(valid_loss))
        valid_accuracies.append(valid_accuracy)

        # Report accuracy at the final reasoning timestep only.
        train_loss = round(float(train_loss), 3)
        train_accuracy = round(np.mean(train_accuracies[-1][:, :, -1]), 3)
        valid_loss = round(valid_losses[-1], 3)
        valid_accuracy = round(np.mean(valid_accuracies[-1][:, -1]), 3)

        end_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        end_time = time.time()
        runtime = end_time - start_time
        times.append({
            'start_time': start_time_str,
            'end_time': end_time_str,
            'runtime': runtime
        })
        print("({}s): Iter {}\t| TrLoss {}\t| VLoss {}\t| TrAcc {}\t| VAcc {}".
              format(round(runtime, 1), i, train_loss, valid_loss,
                     train_accuracy, valid_accuracy))

        # Periodically checkpoint the model and the full training history.
        if (i + 1) % save_epochs == 0:
            model_filename = SUDOKU_PATH + "/models/{}_{}.mdl".format(
                model_name, i + 1)
            train_data_filename = SUDOKU_PATH + "/pickles/{}.pkl".format(
                model_name)
            print("Saving model to {}".format(model_filename))
            torch.save(model.state_dict(), model_filename)
            with open(train_data_filename, 'wb') as f:
                pickle.dump(
                    {
                        'hyperparameters': hyperparameters,
                        'train_losses': train_losses,
                        'train_accuracies': train_accuracies,
                        'valid_losses': valid_losses,
                        'valid_accuracies': valid_accuracies,
                        'times': times
                    }, f)

    # Final held-out evaluation after training completes.
    test_loss, test_accuracy = get_performance(model, test_x, test_y,
                                               num_iters)
    test_loss = round(float(test_loss), 3)
    test_accuracy = round(np.mean(test_accuracy[:, -1]), 3)
    print("TeLoss {}\t| TeAcc {}".format(test_loss, test_accuracy))
    return model
def train_rrn(hyperparameters: dict, train_inputs: list, train_outputs: list,
              other_inputs: dict = None, other_outputs: dict = None):
    """Train an RRN model, periodically evaluating on named held-out sets.

    :param hyperparameters: Check below for what fields must exist in
        hyperparameters ('dim_x', 'dim_y', 'num_iters', 'batch_size',
        'epochs', 'valid_epochs', 'save_epochs', 'embed_size',
        'hidden_layer_size', 'learning_rate', 'weight_decay', and either
        'device' or a 'devices' list for multi-GPU DataParallel training)
    :param train_inputs: list of GridStrings
    :param train_outputs: list of GridStrings, corresponding in index to
        train_inputs
    :param other_inputs: dictionary of GridStrings where the key is name of
        the dataset
    :param other_outputs: dictionary of GridStrings where the key is name of
        the dataset, corresponding in index to inputs of same name
    :return: the trained RRN model
    """
    if other_inputs is None:
        other_inputs = {}
    if other_outputs is None:
        other_outputs = {}
    assert set(other_inputs.keys()) == set(other_outputs.keys())

    if not os.path.exists('./checkpoints'):
        os.makedirs('./checkpoints')
    if not os.path.exists('./logs'):
        os.makedirs('./logs')

    dim_x = hyperparameters['dim_x']
    dim_y = hyperparameters['dim_y']
    num_iters = hyperparameters['num_iters']
    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['epochs']
    valid_epochs = hyperparameters['valid_epochs']
    save_epochs = hyperparameters['save_epochs']
    embed_size = hyperparameters['embed_size']
    hidden_layer_size = hyperparameters['hidden_layer_size']
    learning_rate = hyperparameters['learning_rate']
    weight_decay = hyperparameters['weight_decay']

    # Use DataParallel only when more than one device id is given; the first
    # id in the list hosts the data and the (replicated) model.
    parallel = False
    if 'devices' in hyperparameters:
        if len(hyperparameters['devices']) > 1:
            devices = hyperparameters['devices']
            parallel = True
        device = hyperparameters['devices'][0]
    else:
        device = hyperparameters['device']

    train_x = torch.stack([encode_input(p) for p in train_inputs]).cuda(device)
    train_y = torch.stack(
        [encode_output(p) for p in train_outputs]).cuda(device)
    other_x = {}
    other_y = {}
    for k in other_inputs:
        other_x[k] = torch.stack(
            [encode_input(p) for p in other_inputs[k]]).cuda(device)
        other_y[k] = torch.stack(
            [encode_output(p) for p in other_outputs[k]]).cuda(device)

    model = RRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size,
                hidden_layer_size=hidden_layer_size).cuda(device)
    if parallel:
        model = nn.DataParallel(model, device_ids=devices)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)

    train_losses = []  # (epoch, )
    train_accuracies = []  # (epoch, grid, timestep)
    other_losses = {name: [] for name in other_x}  # (epoch, )
    other_accuracies = {name: [] for name in other_x}  # (epoch, grid, timestep)
    times = []

    def closure():
        # One full pass over the (shuffled) training set; gradients from all
        # batches accumulate before the single optimizer.step() applies them.
        optimizer.zero_grad()
        total_loss = 0.0
        epoch_accuracies = []
        shuffle_indices = np.arange(len(train_x))
        np.random.shuffle(shuffle_indices)
        for i in tqdm(range(0, len(train_x), batch_size), leave=False):
            x_batch = train_x[shuffle_indices[i:i + batch_size]]
            y_batch = train_y[shuffle_indices[i:i + batch_size]]
            loss, accuracies = get_performance(model=model, x=x_batch,
                                               y=y_batch, no_grad=False,
                                               num_iters=num_iters)
            loss.backward()
            # BUGFIX: accumulate a Python float, not the loss tensor itself.
            # Summing tensors kept every batch's autograd graph alive for the
            # whole epoch, leaking GPU memory.
            total_loss += float(loss)
            epoch_accuracies.append(accuracies)
        # BUGFIX: record exactly one entry per epoch, matching the
        # "(epoch, )" shape documented above; previously the running total
        # was appended once per batch.
        train_losses.append(float(total_loss))
        train_accuracies.append(np.concatenate(epoch_accuracies))
        return total_loss

    for i in tqdm(range(epochs)):
        start_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        start_time = time.time()

        train_loss = optimizer.step(closure)

        # Evaluate the held-out datasets on the first epoch and then every
        # valid_epochs epochs.
        run_validate = i == 0 or (i + 1) % valid_epochs == 0
        if run_validate:
            for name in other_x:
                loss, accuracy = get_performance(model=model, x=other_x[name],
                                                 y=other_y[name],
                                                 num_iters=num_iters,
                                                 no_grad=True)
                other_losses[name].append(float(loss))
                other_accuracies[name].append(accuracy)

        # Periodically checkpoint the model and the full training history.
        if (i + 1) % save_epochs == 0:
            model_filename = "./checkpoints/epoch_{}.mdl".format(i + 1)
            train_data_filename = "./logs/training.pkl"
            print("Saving model to {}".format(model_filename))
            torch.save(model.state_dict(), model_filename)
            with open(train_data_filename, 'wb') as f:
                pickle.dump(
                    {
                        'hyperparameters': hyperparameters,
                        'train_losses': train_losses,
                        'train_accuracies': train_accuracies,
                        'other_losses': other_losses,
                        'other_accuracies': other_accuracies,
                        'times': times
                    }, f)

        end_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        end_time = time.time()
        runtime = end_time - start_time
        times.append({
            'start_time': start_time_str,
            'end_time': end_time_str,
            'runtime': runtime
        })
        # Report accuracy at the final reasoning timestep only.
        print("duration: {}s\t iter: {}\t| loss: {}\t| accuracy: {}".format(
            round(runtime, 1), i, round(float(train_loss), 3),
            round(np.mean(train_accuracies[-1][:, -1]), 3)))
        if run_validate:
            for name in sorted(other_x):
                print("data: {}\t| loss: {}\t| accuracy: {}".format(
                    name, round(other_losses[name][-1], 3),
                    round(np.mean(other_accuracies[name][-1][:, -1]), 3)))

    # Save the final model once training completes.
    model_filename = "./model.mdl"
    print("Saving model to {}".format(model_filename))
    torch.save(model.state_dict(), model_filename)
    return model
other_x = {} other_y = {} for k in other_inputs: other_x[k] = torch.stack( [rrn_utils.encode_input(p) for p in other_inputs[k]]).cuda(device) other_y[k] = torch.stack( [rrn_utils.encode_output(p) for p in other_outputs[k]]).cuda(device) # model = EmbedRRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size, hidden_layer_size=hidden_layer_size).cuda(device) model = RRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size, hidden_layer_size=hidden_layer_size).cuda(device) optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # ones = torch.ones(10, 16).cuda(device) def closure(): optimizer.zero_grad() total_loss = 0 epoch_accuracies = [] shuffle_indices = np.arange(len(train_x)) np.random.shuffle(shuffle_indices) for i in tqdm(range(0, len(train_x), batch_size), leave=False): x_batch = train_x[shuffle_indices[i:i + batch_size]]