def create_encoder(all_train_dir='../small_embed/emb3/',
                   dataset_filename='../4x4_all_reimbed/data/datasets.pkl',
                   device=4,
                   num_steps=1000):
    """
    Fit a DigitEncoder around the embedding layer of a previously trained RRN
    and return frozen copies of its encoder and decoder.

    The defaults reproduce the values that used to be hard-coded here, so
    calling ``create_encoder()`` without arguments behaves as before.

    :param all_train_dir: directory holding ``model.mdl`` and ``logs/training.pkl`` of the trained RRN
    :param dataset_filename: file passed to ``Datasets.load``
    :param device: CUDA device index used while fitting the DigitEncoder
    :param num_steps: number of Adam steps, each over the full training tensor
    :return: ``(encoder, decoder)``; an ``nn.Embedding`` and an ``nn.Linear`` in eval mode whose
             parameters have ``requires_grad = False``. Neither is moved to a CUDA device here.
    :raises RuntimeError: if the loaded RRN has no sub-module named ``embed_layer``
    """
    import os  # local import: this block does not otherwise rely on a module-level `os`

    dataset = Datasets.load(dataset_filename)
    model_filename = os.path.join(all_train_dir, "model.mdl")
    train_log_filename = os.path.join(all_train_dir, "logs", "training.pkl")

    # NOTE: pickle.load executes arbitrary code; only point this at logs you produced yourself.
    with open(train_log_filename, 'rb') as f:
        train_log = pickle.load(f)
    hp = train_log['hyperparameters']

    # Rebuild the RRN with the hyperparameters it was trained with.
    model = rrn.RRN(dim_x=hp['dim_x'],
                    dim_y=hp['dim_y'],
                    embed_size=hp['embed_size'],
                    hidden_layer_size=hp['hidden_layer_size'])
    # The model stays on the CPU, so map the checkpoint there; this avoids needing
    # the GPU the checkpoint was saved from. strict=False is kept from the original:
    # keys that do not match the model are silently ignored.
    state_dict = torch.load(model_filename, map_location='cpu')
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    # Look the embedding layer up by name and fail loudly if it is missing,
    # instead of hitting an unbound local further down.
    orig_embed_layer = dict(model.named_modules()).get('embed_layer')
    if orig_embed_layer is None:
        raise RuntimeError(
            "model loaded from {} has no 'embed_layer' module".format(model_filename))

    # Inputs and targets are both built from the same *output* grids
    # (split_outputs[0]), exactly as in the original code.
    # NOTE(review): presumably deliberate, so the encoder/decoder pair learns to
    # reconstruct digits -- confirm.
    _, split_outputs = dataset.split_data([100])
    grids = split_outputs[0]
    train_x = torch.stack([rrn_utils.encode_input(p) for p in grids]).cuda(device)
    train_y = torch.stack([rrn_utils.encode_output(p) for p in grids]).cuda(device)

    digit_encoder = DigitEncoder(orig_embed_layer).cuda(device)
    optimizer = optim.Adam(digit_encoder.parameters())

    def closure():
        optimizer.zero_grad()
        predictions = digit_encoder(train_x)
        # F.cross_entropy expects the class axis at position 1, hence the permute.
        loss = F.cross_entropy(predictions.permute(0, 2, 1), train_y)
        loss.backward()
        return loss

    for _ in range(num_steps):
        optimizer.step(closure)

    # Copy the fitted weights into fresh stand-alone modules and freeze them.
    encoder = nn.Embedding(digit_encoder.num_embeddings, digit_encoder.embedding_dim)
    encoder.load_state_dict(digit_encoder.encoder.state_dict())
    encoder.eval()
    for p in encoder.parameters():
        p.requires_grad = False

    # The decoder has one output fewer than the encoder has embeddings.
    decoder = nn.Linear(digit_encoder.embedding_dim, digit_encoder.num_embeddings - 1)
    decoder.load_state_dict(digit_encoder.decoder.state_dict())
    decoder.eval()
    for p in decoder.parameters():
        p.requires_grad = False

    return encoder, decoder
train_inputs = split_inputs[0] train_outputs = split_outputs[0] other_inputs = {'validation': split_inputs[1]} other_outputs = {'validation': split_outputs[1]} model = RelNet(dim_x=hp['dim_x'], dim_y=hp['dim_y'], embed_size=hp['embed_size'], hidden_layer_size=hp['hidden_layer_size']).cuda(hp['device']) optimizer = optim.Adam(model.parameters(), lr=hp['learning_rate'], weight_decay=hp['weight_decay']) train_x_grid = torch.stack([rrn_utils.encode_input(p) for p in train_inputs]) train_x_prob = utils.puzzle_as_dist(train_x_grid).cuda(hp['device']) train_x_grid = train_x_grid.cuda(hp['device']) train_y = torch.stack([rrn_utils.encode_output(p) for p in train_outputs]).cuda(hp['device']) other_x_grid = {} other_x_prob = {} other_y = {} for k in other_inputs: other_x_grid[k] = torch.stack( [rrn_utils.encode_input(p) for p in other_inputs[k]]) other_x_prob[k] = utils.puzzle_as_dist(other_x_grid[k]).cuda(hp['device']) other_x_grid[k] = other_x_grid[k].cuda(hp['device']) other_y[k] = torch.stack([ rrn_utils.encode_output(p) for p in other_outputs[k]
other_outputs = {'validation': split_outputs[1]} dim_x = hyperparameters['dim_x'] dim_y = hyperparameters['dim_y'] num_iters = hyperparameters['num_iters'] batch_size = hyperparameters['batch_size'] epochs = hyperparameters['epochs'] valid_epochs = hyperparameters['valid_epochs'] save_epochs = hyperparameters['save_epochs'] embed_size = hyperparameters['embed_size'] hidden_layer_size = hyperparameters['hidden_layer_size'] learning_rate = hyperparameters['learning_rate'] weight_decay = hyperparameters['weight_decay'] device = hyperparameters['device'] train_x = torch.stack([rrn_utils.encode_input(p) for p in train_inputs]).cuda(device) train_y = torch.stack([rrn_utils.encode_output(p) for p in train_outputs]).cuda(device) other_x = {} other_y = {} for k in other_inputs: other_x[k] = torch.stack( [rrn_utils.encode_input(p) for p in other_inputs[k]]).cuda(device) other_y[k] = torch.stack( [rrn_utils.encode_output(p) for p in other_outputs[k]]).cuda(device) # model = EmbedRRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size, hidden_layer_size=hidden_layer_size).cuda(device) model = RRN(dim_x=dim_x, dim_y=dim_y,
def train_rrn(hyperparameters: dict,
              train_inputs: list,
              train_outputs: list,
              other_inputs: dict = None,
              other_outputs: dict = None):
    """
    Train an RRN whose embedding layer is copied from ``orig_embed_layer`` and kept frozen.

    Side effects: creates ``./checkpoints`` and ``./logs`` if needed; every ``save_epochs``
    epochs writes ``./checkpoints/epoch_{n}.mdl`` and overwrites ``./logs/training.pkl``;
    writes ``./model.mdl`` when training finishes.

    :param hyperparameters: must contain 'dim_x', 'dim_y', 'num_iters', 'batch_size', 'epochs',
        'valid_epochs', 'save_epochs', 'embed_size', 'hidden_layer_size', 'learning_rate',
        'weight_decay', and either 'devices' (sequence of CUDA device ids; the first one holds
        the data, more than one wraps the model in nn.DataParallel) or 'device' (a single id)
    :param train_inputs: list of GridStrings
    :param train_outputs: list of GridStrings, corresponding in index to train_inputs
    :param other_inputs: dictionary of GridStrings where the key is name of the dataset
    :param other_outputs: dictionary of GridStrings where the key is name of the dataset,
        corresponding in index to inputs of same name
    :return: the trained model (an nn.DataParallel wrapper when several devices were given)
    """
    if other_inputs is None:
        other_inputs = {}
    if other_outputs is None:
        other_outputs = {}
    assert set(other_inputs.keys()) == set(other_outputs.keys()), \
        "other_inputs and other_outputs must name the same datasets"

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs('./checkpoints', exist_ok=True)
    os.makedirs('./logs', exist_ok=True)

    dim_x = hyperparameters['dim_x']
    dim_y = hyperparameters['dim_y']
    num_iters = hyperparameters['num_iters']
    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['epochs']
    valid_epochs = hyperparameters['valid_epochs']
    save_epochs = hyperparameters['save_epochs']
    embed_size = hyperparameters['embed_size']
    hidden_layer_size = hyperparameters['hidden_layer_size']
    learning_rate = hyperparameters['learning_rate']
    weight_decay = hyperparameters['weight_decay']

    # 'devices' takes precedence over 'device'; data always lives on the first device.
    if 'devices' in hyperparameters:
        devices = hyperparameters['devices']
        parallel = len(devices) > 1
        device = devices[0]
    else:
        devices = None
        parallel = False
        device = hyperparameters['device']

    train_x = torch.stack([rrn_utils.encode_input(p) for p in train_inputs]).cuda(device)
    train_y = torch.stack([rrn_utils.encode_output(p) for p in train_outputs]).cuda(device)

    other_x = {}
    other_y = {}
    for k in other_inputs:
        other_x[k] = torch.stack(
            [rrn_utils.encode_input(p) for p in other_inputs[k]]).cuda(device)
        other_y[k] = torch.stack(
            [rrn_utils.encode_output(p) for p in other_outputs[k]]).cuda(device)

    model = RRN(dim_x=dim_x,
                dim_y=dim_y,
                embed_size=embed_size,
                hidden_layer_size=hidden_layer_size)
    # `orig_embed_layer` is not defined in this function.
    # NOTE(review): presumably a module-level global set up before calling -- confirm.
    model.embed_layer.load_state_dict(orig_embed_layer.state_dict())
    model.embed_layer.eval()
    for p in model.embed_layer.parameters():
        p.requires_grad = False  # keep the copied embedding fixed during training
    model = model.cuda(device)
    if parallel:
        # NOTE(review): state_dict() of a DataParallel wrapper prefixes keys with
        # "module."; loaders of the saved files must account for that -- confirm.
        model = nn.DataParallel(model, device_ids=devices)

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

    train_losses = []  # (epoch, )
    train_accuracies = []  # (epoch, grid, timestep)
    other_losses = {name: [] for name in other_x}  # (epoch, )
    other_accuracies = {name: [] for name in other_x}  # (epoch, grid, timestep)
    times = []

    def closure():
        # One pass over the shuffled training set. Gradients are zeroed once and
        # accumulated over all batches; the optimizer steps once per call.
        optimizer.zero_grad()
        total_loss = 0
        epoch_accuracies = []
        shuffle_indices = np.arange(len(train_x))
        np.random.shuffle(shuffle_indices)
        for start in tqdm(range(0, len(train_x), batch_size), leave=False):
            batch_indices = shuffle_indices[start:start + batch_size]
            loss, accuracies = rrn_utils.get_performance(model=model,
                                                         x=train_x[batch_indices],
                                                         y=train_y[batch_indices],
                                                         no_grad=False,
                                                         num_iters=num_iters)
            loss.backward()
            # Detach so the running total does not keep every batch's graph alive.
            total_loss = total_loss + loss.detach()
            epoch_accuracies.append(accuracies)
        # Log once per pass so both lists match their documented (epoch, ...) layout.
        train_losses.append(float(total_loss))
        train_accuracies.append(np.concatenate(epoch_accuracies))
        return total_loss

    for i in tqdm(range(epochs)):
        start_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        start_time = time.time()

        train_loss = optimizer.step(closure)

        # Validate on the first epoch and then every `valid_epochs` epochs.
        run_validate = i == 0 or (i + 1) % valid_epochs == 0
        if run_validate:
            for name in other_x:
                loss, accuracy = rrn_utils.get_performance(model=model,
                                                           x=other_x[name],
                                                           y=other_y[name],
                                                           num_iters=num_iters,
                                                           no_grad=True)
                other_losses[name].append(float(loss))
                other_accuracies[name].append(accuracy)

        if (i + 1) % save_epochs == 0:
            model_filename = "./checkpoints/epoch_{}.mdl".format(i + 1)
            train_data_filename = "./logs/training.pkl"
            print("Saving model to {}".format(model_filename))
            torch.save(model.state_dict(), model_filename)
            # The log is rewritten in full each time. `times` does not yet contain
            # the current epoch here (it is appended below), as in the original.
            with open(train_data_filename, 'wb') as f:
                pickle.dump(
                    {
                        'hyperparameters': hyperparameters,
                        'train_losses': train_losses,
                        'train_accuracies': train_accuracies,
                        'other_losses': other_losses,
                        'other_accuracies': other_accuracies,
                        'times': times
                    }, f)

        end_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        end_time = time.time()
        runtime = end_time - start_time
        times.append({
            'start_time': start_time_str,
            'end_time': end_time_str,
            'runtime': runtime
        })

        # Reported accuracy is the mean over grids of the last timestep column.
        print("duration: {}s\t iter: {}\t| loss: {}\t| accuracy: {}".format(
            round(runtime, 1), i, round(float(train_loss), 3),
            round(np.mean(train_accuracies[-1][:, -1]), 3)))
        if run_validate:
            for name in sorted(other_x):
                print("data: {}\t| loss: {}\t| accuracy: {}".format(
                    name, round(other_losses[name][-1], 3),
                    round(np.mean(other_accuracies[name][-1][:, -1]), 3)))

    model_filename = "./model.mdl"
    print("Saving model to {}".format(model_filename))
    torch.save(model.state_dict(), model_filename)
    return model
other_outputs = {'validation': split_outputs[1]} # model = RelNet(dim_x=hp['dim_x'], # dim_y=hp['dim_y'], # embed_size=hp['embed_size'], # hidden_layer_size=hp['hidden_layer_size']).cuda(hp['device']) model = RRN(dim_x=hp['dim_x'], dim_y=hp['dim_y'], embed_size=hp['embed_size'], hidden_layer_size=hp['hidden_layer_size']).cuda(hp['device']) optimizer = optim.Adam(model.parameters(), lr=hp['learning_rate'], weight_decay=hp['weight_decay']) train_x = torch.stack([rrn_utils.encode_input(p) for p in train_inputs]).cuda(hp['device']) # train_x = utils.one_hot_encode(train_x) train_y = torch.stack([rrn_utils.encode_output(p) for p in train_outputs]).cuda(hp['device']) other_x = {} other_y = {} for k in other_inputs: other_x[k] = torch.stack([ rrn_utils.encode_input(p) for p in other_inputs[k] ]).cuda(hp['device']) # other_x[k] = utils.one_hot_encode(other_x[k]) other_y[k] = torch.stack([ rrn_utils.encode_output(p) for p in other_outputs[k] ]).cuda(hp['device'])
train_outputs = split_outputs[0] other_inputs = {'validation': split_inputs[1]} other_outputs = {'validation': split_outputs[1]} model = RelNet(dim_x=hp['dim_x'], dim_y=hp['dim_y'], embed_size=hp['embed_size'], hidden_layer_size=hp['hidden_layer_size']).cuda(hp['device']) optimizer = optim.Adam(model.parameters(), lr=hp['learning_rate'], weight_decay=hp['weight_decay']) train_x = torch.stack([rrn_utils.encode_input(p) for p in train_inputs]) train_x = utils.puzzle_as_dist(train_x).cuda(hp['device']) train_y = torch.stack([rrn_utils.encode_output(p) for p in train_outputs]).cuda(hp['device']) other_x = {} other_y = {} for k in other_inputs: other_x[k] = torch.stack([rrn_utils.encode_input(p) for p in other_inputs[k]]) other_x[k] = utils.puzzle_as_dist(other_x[k]).cuda(hp['device']) other_y[k] = torch.stack([rrn_utils.encode_output(p) for p in other_outputs[k]]).cuda(hp['device']) train_losses = [] # (epoch) train_accuracies = [] # (epoch, grid, timestep) other_losses = {name: [] for name in other_x} # (epoch) other_accuracies = {name: [] for name in other_x} # (epoch, grid, timestep) times = []