def main(cfg):
    if cfg['model'] == 'mlp':
        net = MLP(300, 768, cfg['class_num'])
    elif cfg['model'] == 'cnn':
        net = CNN(300, 768, cfg['class_num'])
    elif cfg['model'] == 'lstm':
        net = LSTM(300, cfg['class_num'], cfg['device'])
    elif cfg['model'] == 'gru':
        net = GRU(300, cfg['class_num'], cfg['device'])
    else:
        raise Exception(f"model {cfg['model']} not available")

    if cfg['device'] == 'cuda':
        if len(cfg['gpu_ids']) == 1:
            torch.cuda.set_device(cfg['gpu_ids'][0])
            net = net.cuda()
        else:
            net = net.cuda()
            net = nn.DataParallel(net, device_ids=cfg['gpu_ids'])
        torch.backends.cudnn.benchmark = True

    if cfg['mode'] == 'train':
        train(cfg, net)
    elif cfg['mode'] == 'predict':
        predict(cfg, net, 'checkpoints/{}.pth'.format(cfg['model']))
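# NOTE: a plausible example of the cfg dict expected by main() above; the keys
# are inferred from the accesses in the function, the values are illustrative:
#
#     cfg = {
#         'model': 'mlp',       # one of 'mlp', 'cnn', 'lstm', 'gru'
#         'class_num': 10,
#         'device': 'cuda',     # or 'cpu'
#         'gpu_ids': [0],
#         'mode': 'train',      # or 'predict'
#     }
#     main(cfg)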
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # # Build data loader
    # dataset, targets = load_dataset()
    # np.save("__cache_dataset.npy", dataset)
    # np.save("__cache_targets.npy", targets)
    # return
    dataset = np.load("__cache_dataset.npy")
    targets = np.load("__cache_targets.npy")

    # Build the model
    mlp = MLP(args.input_size, args.output_size)
    mlp.load_state_dict(
        torch.load('_backup_model_statedict/mlp_100_4000_PReLU_ae_dd_final.pkl'))

    if torch.cuda.is_available():
        mlp.cuda()

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adagrad(mlp.parameters())

    # Train the model
    total_loss = []
    print(len(dataset))
    print(len(targets))
    sm = 100  # start saving models after 100 epochs
    for epoch in range(args.num_epochs):
        print("epoch " + str(epoch))
        avg_loss = 0
        for i in range(0, len(dataset), args.batch_size):
            # Forward, backward and optimize
            mlp.zero_grad()
            bi, bt = get_input(i, dataset, targets, args.batch_size)
            bi = to_var(bi)
            bt = to_var(bt)
            bo = mlp(bi)
            loss = criterion(bo, bt)
            avg_loss = avg_loss + loss.item()
            loss.backward()
            optimizer.step()
        print("--average loss:")
        print(avg_loss / (len(dataset) / args.batch_size))
        total_loss.append(avg_loss / (len(dataset) / args.batch_size))

        # Save the model
        if epoch == sm:
            model_path = 'mlp_100_4000_PReLU_ae_dd' + str(sm) + '.pkl'
            torch.save(mlp.state_dict(),
                       os.path.join(args.model_path, model_path))
            sm = sm + 50  # save the model every 50 epochs from epoch 100 onwards

    torch.save(total_loss, 'total_loss.dat')
    model_path = 'mlp_100_4000_PReLU_ae_dd_final.pkl'
    torch.save(mlp.state_dict(), os.path.join(args.model_path, model_path))
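# NOTE: `to_var` and `get_input` are helpers assumed by main() above but not
# defined in this file. Minimal sketches of plausible implementations:

def to_var(x):
    """Wrap a numpy array as a float tensor, moving it to the GPU if available."""
    x = torch.from_numpy(x).float()
    return x.cuda() if torch.cuda.is_available() else x


def get_input(i, dataset, targets, batch_size):
    """Slice one mini-batch (inputs, targets) starting at index i."""
    return dataset[i:i + batch_size], targets[i:i + batch_size]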
def main(opt):
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize,
                                    shuffle=True, num_workers=opt.workers,
                                    drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize,
                                    shuffle=True, num_workers=opt.workers,
                                    drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize,
                                   shuffle=True, num_workers=opt.workers,
                                   drop_last=True)

    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize,
                                  shuffle=False, num_workers=opt.workers,
                                  drop_last=False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = MLP(opt)
    net.double()
    print(net)

    criterion = nn.BCELoss()

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []

    for epoch in range(0, opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        valid_loss = valid(valid_dataloader, net, criterion, opt)
        test_loss = test(test_dataloader, net, criterion, opt)

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch': [i for i in range(1, len(train_loss_ls) + 1)],
                       'train_loss': train_loss_ls,
                       'valid_loss': valid_loss_ls,
                       'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    inference(all_dataloader, net, criterion, opt, OutputDir)
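# NOTE: EarlyStopping is not defined in this snippet. A minimal sketch that
# matches how it is used above (callable with (valid_loss, model, output_dir),
# exposes .early_stop, and writes 'checkpoint.pt'); the details are assumptions:

class EarlyStopping:
    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, valid_loss, model, output_dir):
        if self.best_loss is None or valid_loss < self.best_loss:
            # Validation loss improved: checkpoint the model and reset patience
            self.best_loss = valid_loss
            self.counter = 0
            torch.save(model.state_dict(), output_dir + '/checkpoint.pt')
            if self.verbose:
                print('Validation loss improved; checkpoint saved.')
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True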
def main():
    np.random.seed(args.seed)
    cur_acc = 0
    max_acc = 0
    num_param = 20
    cur_param = np.zeros(args.n_epoch)
    max_pt = np.zeros(args.n_epoch)

    # Random search: sample random sigmoid-shaped schedules and keep the one
    # that scores best under black_box_function.
    for iii in range(args.n_iter):
        for jjj in range(args.n_samples):
            cur_a = np.random.randn(10)
            cur_w = np.random.randn(10)
            cur_b = np.random.randn(10)
            x = np.arange(args.n_epoch) / args.n_epoch
            cur_rt = np.dot(np.outer(x, cur_w) + cur_b, cur_a)
            cur_rt = 1 / (1 + np.exp(-cur_rt))
            cur_param = cur_rt.copy()
            cur_acc = black_box_function(cur_param)
            if max_acc < cur_acc:
                max_acc = cur_acc
                max_pt = cur_param.copy()

    '''
    rate_schedule = np.ones(args.n_epoch) * forget_rate
    rate_schedule[:10] = np.arange(10, dtype=float) / 10 * forget_rate
    # rate_schedule[10:] = np.arange(args.n_epoch - 10, dtype=float) / (args.n_epoch - 10) * forget_rate + forget_rate
    rate_schedule = np.zeros(args.n_epoch)
    print(rate_schedule)
    '''
    rate_schedule = max_pt.copy()
    print('Final Schedule:', rate_schedule)

    mean_pure_ratio1 = 0
    mean_pure_ratio2 = 0

    print('building model...')
    cnn1 = MLP(n_outputs=num_classes)
    cnn1.cuda()
    print(cnn1.parameters)
    optimizer1 = torch.optim.Adam(cnn1.parameters(), lr=learning_rate)

    cnn2 = MLP(n_outputs=num_classes)
    cnn2.cuda()
    print(cnn2.parameters)
    optimizer2 = torch.optim.Adam(cnn2.parameters(), lr=learning_rate)

    epoch = 0
    train_acc1 = 0
    train_acc2 = 0

    # Evaluate models with random weights
    test_acc1, test_acc2 = evaluate(test_loader, cnn1, cnn2)
    print('Epoch [%d/%d] Test Accuracy on the %s test images: '
          'Model1 %.4f %% Model2 %.4f %% Pure Ratio1 %.4f %% Pure Ratio2 %.4f %%'
          % (epoch + 1, args.n_epoch, len(test_dataset),
             test_acc1, test_acc2, mean_pure_ratio1, mean_pure_ratio2))

    # Save results
    with open(txtfile, "a") as myfile:
        myfile.write(str(int(epoch)) + ' ' + str(train_acc1) + ' ' + str(train_acc2)
                     + ' ' + str(test_acc1) + ' ' + str(test_acc2)
                     + ' ' + str(mean_pure_ratio1) + ' ' + str(mean_pure_ratio2)
                     + ' ' + str(rate_schedule[epoch]) + "\n")

    # Training
    for epoch in range(1, args.n_epoch):
        # Train models
        cnn1.train()
        adjust_learning_rate(optimizer1, epoch)
        cnn2.train()
        adjust_learning_rate(optimizer2, epoch)
        train_acc1, train_acc2, pure_ratio_1_list, pure_ratio_2_list = train(
            train_loader, epoch, cnn1, optimizer1, cnn2, optimizer2, rate_schedule)

        # Evaluate models
        test_acc1, test_acc2 = evaluate(test_loader, cnn1, cnn2)

        # Save results
        mean_pure_ratio1 = sum(pure_ratio_1_list) / len(pure_ratio_1_list)
        mean_pure_ratio2 = sum(pure_ratio_2_list) / len(pure_ratio_2_list)
        print('Epoch [%d/%d] Test Accuracy on the %s test images: '
              'Model1 %.4f %% Model2 %.4f %%, Pure Ratio 1 %.4f %%, Pure Ratio 2 %.4f %%'
              % (epoch + 1, args.n_epoch, len(test_dataset),
                 test_acc1, test_acc2, mean_pure_ratio1, mean_pure_ratio2))
        with open(txtfile, "a") as myfile:
            myfile.write(str(int(epoch)) + ' ' + str(train_acc1) + ' ' + str(train_acc2)
                         + ' ' + str(test_acc1) + ' ' + str(test_acc2)
                         + ' ' + str(mean_pure_ratio1) + ' ' + str(mean_pure_ratio2)
                         + ' ' + str(rate_schedule[epoch]) + "\n")
def black_box_function(opt_param):
    mean_pure_ratio1 = 0
    mean_pure_ratio2 = 0

    print('building model...')
    cnn1 = MLP(n_outputs=num_classes)
    cnn1.cuda()
    print(cnn1.parameters)
    optimizer1 = torch.optim.Adam(cnn1.parameters(), lr=learning_rate)

    cnn2 = MLP(n_outputs=num_classes)
    cnn2.cuda()
    print(cnn2.parameters)
    optimizer2 = torch.optim.Adam(cnn2.parameters(), lr=learning_rate)

    rate_schedule = opt_param.copy()
    print('Schedule:', rate_schedule)

    epoch = 0
    train_acc1 = 0
    train_acc2 = 0

    # Evaluate models with random weights
    test_acc1, test_acc2 = evaluate(test_loader, cnn1, cnn2)
    print('Epoch [%d/%d] Test Accuracy on the %s test images: '
          'Model1 %.4f %% Model2 %.4f %% Pure Ratio1 %.4f %% Pure Ratio2 %.4f %%'
          % (epoch + 1, args.n_epoch, len(test_dataset),
             test_acc1, test_acc2, mean_pure_ratio1, mean_pure_ratio2))

    # Save results
    with open(txtfile, "a") as myfile:
        myfile.write(str(int(epoch)) + ' ' + str(train_acc1) + ' ' + str(train_acc2)
                     + ' ' + str(test_acc1) + ' ' + str(test_acc2)
                     + ' ' + str(mean_pure_ratio1) + ' ' + str(mean_pure_ratio2)
                     + ' ' + str(rate_schedule[epoch]) + "\n")

    # Training
    for epoch in range(1, args.n_epoch):
        # Train models
        cnn1.train()
        adjust_learning_rate(optimizer1, epoch)
        cnn2.train()
        adjust_learning_rate(optimizer2, epoch)
        train_acc1, train_acc2, pure_ratio_1_list, pure_ratio_2_list = train(
            train_loader, epoch, cnn1, optimizer1, cnn2, optimizer2, rate_schedule)

        # Evaluate models
        test_acc1, test_acc2 = evaluate(test_loader, cnn1, cnn2)

        # Save results
        mean_pure_ratio1 = sum(pure_ratio_1_list) / len(pure_ratio_1_list)
        mean_pure_ratio2 = sum(pure_ratio_2_list) / len(pure_ratio_2_list)
        print('Epoch [%d/%d] Test Accuracy on the %s test images: '
              'Model1 %.4f %% Model2 %.4f %%, Pure Ratio 1 %.4f %%, Pure Ratio 2 %.4f %%'
              % (epoch + 1, args.n_epoch, len(test_dataset),
                 test_acc1, test_acc2, mean_pure_ratio1, mean_pure_ratio2))
        with open(txtfile, "a") as myfile:
            myfile.write(str(int(epoch)) + ' ' + str(train_acc1) + ' ' + str(train_acc2)
                         + ' ' + str(test_acc1) + ' ' + str(test_acc2)
                         + ' ' + str(mean_pure_ratio1) + ' ' + str(mean_pure_ratio2)
                         + ' ' + str(rate_schedule[epoch]) + "\n")

    # Accuracies are reported in percent, so dividing the sum of the two final
    # test accuracies by 200 maps the objective to [0, 1].
    return (test_acc1 + test_acc2) / 200
# Prepare the MNIST datasets.
train_datasets = [get_dataset('mnist', permutation=p) for p in permutations]
test_datasets = [get_dataset('mnist', train=False, permutation=p) for p in permutations]

# Prepare the model.
mlp = MLP(
    DATASET_CONFIGS['mnist']['size'] ** 2,
    DATASET_CONFIGS['mnist']['classes'],
    hidden_size=args.hidden_size,
    hidden_layer_num=args.hidden_layer_num,
    hidden_dropout_prob=args.hidden_dropout_prob,
    input_dropout_prob=args.input_dropout_prob,
)

# Move the model to the GPU if needed.
if cuda:
    mlp.cuda()

# Run the experiment.
train(
    mlp, train_datasets, test_datasets,
    epochs_per_task=args.epochs_per_task,
    batch_size=args.batch_size,
    lr=args.lr,
    weight_decay=args.weight_decay,
    cuda=cuda,
)
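# NOTE: DATASET_CONFIGS is not defined in this snippet. For MNIST (28x28
# images, 10 classes, so an MLP input size of 28**2 = 784) it presumably
# looks something like:
#
#     DATASET_CONFIGS = {
#         'mnist': {'size': 28, 'classes': 10},
#     }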
import os

import torch
import torch.nn as nn
from torch.cuda.amp import GradScaler
from ignite.contrib.handlers.neptune_logger import *
from ignite.handlers import Checkpoint
from model import Data, MLP
from data import X, y
from parse import args

scaler = GradScaler()

model = MLP(n_neurons=[(20, 100), (100, 60), (60, 2)],
            activation=nn.LeakyReLU(),
            batch_norm=True,
            dropout=0.2)
model.cuda()

logger = NeptuneLogger(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                       project_name='vladimir.isakov/sandbox',
                       experiment_name='Run',
                       upload_source_files='./train.py',
                       # tags='v1',
                       params={'batch_size': args.batch_size,
                               'epochs': args.epochs,
                               'lr': args.lr,
                               'step_size': args.step_size,
                               'gamma': args.gamma,
                               'weight_decay': args.weight_decay,
                               'model': repr(model)})

# (The original snippet is cut off mid-call here; completing it with the
# hyperparameters logged above is an assumption.)
optimizer = torch.optim.Adam(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
def train_model(config, gpu_id, save_dir, exp_name):
    # Instantiating the model
    model_type = config.get('model_type', 'MLP')
    if model_type == "MLP":
        model = MLP(784, config["hidden_layers"], 10,
                    config["nonlinearity"], config["initialization"],
                    config["dropout"], verbose=True)
    elif model_type == "CNN":
        model = CNN(config["initialization"], config["is_batch_norm"], verbose=True)
    else:
        raise ValueError('config["model_type"] not supported : {}'.format(model_type))

    # Loading the MNIST dataset
    x_train, y_train, x_valid, y_valid, x_test, y_test = utils.load_mnist(
        config["data_file"], data_format=config["data_format"])

    if config['data_reduction'] != 1.:
        x_train, y_train = utils.reduce_trainset_size(x_train, y_train,
                                                      config['data_reduction'])

    # If a GPU is available, sends the model and dataset to the GPU
    if torch.cuda.is_available():
        model.cuda(gpu_id)
        x_train = torch.from_numpy(x_train).cuda(gpu_id)
        y_train = torch.from_numpy(y_train).cuda(gpu_id)
        x_valid = Variable(torch.from_numpy(x_valid), volatile=True).cuda(gpu_id)
        y_valid = Variable(torch.from_numpy(y_valid), volatile=True).cuda(gpu_id)
        x_test = Variable(torch.from_numpy(x_test), volatile=True).cuda(gpu_id)
        y_test = Variable(torch.from_numpy(y_test), volatile=True).cuda(gpu_id)
        print("Running on GPU")
    else:
        x_train = torch.from_numpy(x_train)
        y_train = torch.from_numpy(y_train)
        x_valid = Variable(torch.from_numpy(x_valid))
        y_valid = Variable(torch.from_numpy(y_valid))
        x_test = Variable(torch.from_numpy(x_test))
        y_test = Variable(torch.from_numpy(y_test))
        print("WATCH-OUT : torch.cuda.is_available() returned False. Running on CPU.")

    # Instantiate TensorDataset and DataLoader objects
    train_set = torch.utils.data.TensorDataset(x_train, y_train)
    loader = torch.utils.data.DataLoader(train_set, batch_size=config["mb_size"],
                                         shuffle=True)

    # Optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                          momentum=config['momentum'],
                          weight_decay=config['L2_hyperparam'] * (config['mb_size'] / x_train.size()[0]))
    loss_fn = nn.NLLLoss()

    # Records the model's performance
    train_tape = [[], []]
    valid_tape = [[], []]
    test_tape = [[], []]
    weights_tape = []

    def evaluate(data, labels):
        model.eval()
        if not isinstance(data, Variable):
            if torch.cuda.is_available():
                data = Variable(data, volatile=True).cuda(gpu_id)
                labels = Variable(labels, volatile=True).cuda(gpu_id)
            else:
                data = Variable(data)
                labels = Variable(labels)
        output = model(data)
        loss = loss_fn(output, labels)
        prediction = torch.max(output.data, 1)[1]
        accuracy = (prediction.eq(labels.data).sum() / labels.size(0)) * 100
        return loss.data[0], accuracy

    if not os.path.exists(os.path.join(save_dir, exp_name)):
        os.makedirs(os.path.join(save_dir, exp_name))

    # Record train accuracy
    train_loss, train_acc = evaluate(x_train, y_train)
    train_tape[0].append(train_loss)
    train_tape[1].append(train_acc)

    # Record valid accuracy
    valid_loss, valid_acc = evaluate(x_valid, y_valid)
    valid_tape[0].append(valid_loss)
    valid_tape[1].append(valid_acc)

    # Record test accuracy
    test_loss, test_acc = evaluate(x_test, y_test)
    test_tape[0].append(test_loss)
    test_tape[1].append(test_acc)

    # Record weights L2 norm
    weights_L2_norm = model.get_weights_L2_norm()
    weights_tape.append(float(weights_L2_norm.data.cpu().numpy()))

    print("BEFORE TRAINING \nLoss : {0:.3f} \nAcc : {1:.3f}".format(valid_loss, valid_acc))

    # TRAINING LOOP
    best_valid_acc = 0
    for epoch in range(1, config["max_epochs"]):
        start = time.time()
        model.train()
        for i, (x_batch, y_batch) in enumerate(loader):
            # pdb.set_trace()
            if torch.cuda.is_available():
                x_batch = Variable(x_batch).cuda(gpu_id)
                y_batch = Variable(y_batch).cuda(gpu_id)
            else:
                x_batch = Variable(x_batch)
                y_batch = Variable(y_batch)

            # Empties the gradients
            optimizer.zero_grad()

            # Feedforward through the model
            output = model(x_batch)

            # Computes the loss
            loss = loss_fn(output, y_batch)

            # Backpropagates to compute the gradients
            loss.backward()

            # Takes one training step
            optimizer.step()

        # Record weights L2 norm
        weights_L2_norm = model.get_weights_L2_norm()
        weights_tape.append(float(weights_L2_norm.data.cpu().numpy()))

        # Record train accuracy
        train_loss, train_acc = evaluate(x_train, y_train)
        train_tape[0].append(train_loss)
        train_tape[1].append(train_acc)

        # Record valid accuracy
        valid_loss, valid_acc = evaluate(x_valid, y_valid)
        valid_tape[0].append(valid_loss)
        valid_tape[1].append(valid_acc)

        # Record test accuracy
        test_loss, test_acc = evaluate(x_test, y_test)
        test_tape[0].append(test_loss)
        test_tape[1].append(test_acc)

        print("Epoch {0} \nLoss : {1:.3f} \nAcc : {2:.3f}".format(epoch, valid_loss, valid_acc))
        print("Time : {0:.2f}".format(time.time() - start))

        # Saves the model
        if valid_acc > best_valid_acc:
            print("NEW BEST MODEL")
            torch.save(model.state_dict(), os.path.join(save_dir, exp_name, "model"))
            best_valid_acc = valid_acc

        # Saves the graphs
        utils.save_results(train_tape, valid_tape, test_tape, weights_tape,
                           save_dir, exp_name, config)
        utils.update_comparative_chart(save_dir, config['show_test'])

    return
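# NOTE: get_weights_L2_norm() is a custom method of the model classes used
# above and is not shown in this file. A plausible sketch (an assumption,
# not the original implementation):
#
#     def get_weights_L2_norm(self):
#         squared_sum = 0
#         for param in self.parameters():
#             squared_sum = squared_sum + torch.sum(param ** 2)
#         return torch.sqrt(squared_sum)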
def train_model(config, gpu_id, save_dir, exp_name):
    # Instantiating the model
    model_type = config.get('model_type', 'MLP')
    if model_type == "MLP":
        model = MLP(config['input_size'], config["hidden_layers"], 1,
                    config["nonlinearity"], config["initialization"],
                    config["dropout"], verbose=True)
    elif model_type == "CNN":
        model = CNN(config["initialization"], config["is_batch_norm"], verbose=True)
    else:
        raise ValueError('config["model_type"] not supported : {}'.format(model_type))

    if config['resume']:
        model.load_state_dict(torch.load(os.path.join(save_dir, exp_name, "model")))

    # If a GPU is available, sends the model to the GPU
    if torch.cuda.is_available():
        model.cuda(gpu_id)
        print("USING GPU-{}".format(gpu_id))

    # Optimizer and loss function
    optimizer = optim.RMSprop(model.parameters(), lr=config['lr'])
    loss_fn = nn.CrossEntropyLoss()

    """ Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
    env = gym.make("Pong-v0")
    observation = env.reset()
    prev_x = None  # used in computing the difference frame
    y_list, LL_list, reward_list = [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 0
    start = time.time()

    # Initializing recorders
    update = 0
    loss_tape = []
    our_score_tape = []
    opponent_score_tape = []
    our_score = 0
    opponent_score = 0

    # TRAINING LOOP
    while update < config['max_updates']:
        if config['render']:
            env.render()

        # Preprocess the observation and set the network input to the difference image
        cur_x = utils.preprocess(observation, data_format=config['data_format'])
        if prev_x is None:
            x = np.zeros(cur_x.shape)
        else:
            x = cur_x - prev_x
        prev_x = cur_x

        x_torch = Variable(torch.from_numpy(x).float(), requires_grad=False)
        if config['data_format'] == "array":
            x_torch = x_torch.unsqueeze(dim=0).unsqueeze(dim=0)
        if torch.cuda.is_available():
            x_torch = x_torch.cuda(gpu_id)

        # Feedforward through the policy network
        action_prob = model(x_torch)

        # Sample an action from the returned probability
        if np.random.uniform() < action_prob.cpu().data.numpy():
            action = 2  # UP
        else:
            action = 3  # DOWN

        # Record the log-likelihoods
        y = 1 if action == 2 else 0  # a "fake label"
        NLL = -y * torch.log(action_prob) - (1 - y) * torch.log(1 - action_prob)
        LL_list.append(NLL)
        # grad that encourages the action that was taken to be taken
        # TODO: the tensor graph breaks here. Find a way to backpropagate the PG error.
        y_list.append(y)

        # Step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        # Record the reward (has to be done after we call step() to get the
        # reward for the previous action)
        reward_list.append(reward)

        if done:  # an episode finished (an episode ends when one of the players wins 21 games)
            episode_number += 1

            # Computes the loss and return for each step of the episode
            R = torch.zeros(1, 1)
            loss = 0
            for i in reversed(range(len(reward_list))):
                R = config['gamma'] * R + reward_list[i]
                Return_i = Variable(R)
                if torch.cuda.is_available():
                    Return_i = Return_i.cuda(gpu_id)
                loss = loss + (LL_list[i] * Return_i).sum()  # .expand_as(LL_list[i])
            loss = loss / len(reward_list)
            print(loss)

            # Backpropagates to compute the gradients
            loss.backward()
            y_list, LL_list, reward_list = [], [], []  # reset array memory

            # Performs a parameter update every config['mb_size'] episodes
            if episode_number % config['mb_size'] == 0:
                # Takes one training step
                optimizer.step()
                # Empties the gradients
                optimizer.zero_grad()
                stop = time.time()
                print("PARAMETER UPDATE ------------ {}".format(stop - start))
                start = time.time()
                utils.save_results(save_dir, exp_name, loss_tape, our_score_tape,
                                   opponent_score_tape, config)
                update += 1
                if update % 10 == 0:
                    torch.save(model.state_dict(),
                               os.path.join(save_dir, exp_name, "model_" + model.name()))

            # Records the average loss and score of the episode
            loss_tape.append(loss.cpu().data.numpy())
            our_score_tape.append(our_score)
            opponent_score_tape.append(opponent_score)
            our_score = 0
            opponent_score = 0

            # Boring book-keeping
            if running_reward is None:
                running_reward = reward_sum
            else:
                running_reward = running_reward * 0.99 + reward_sum * 0.01
            print('resetting env. episode reward total was {0:.2f}. running mean: {1:.2f}'
                  .format(reward_sum, running_reward))
            reward_sum = 0
            observation = env.reset()  # reset env
            prev_x = None

        if reward != 0:  # Pong has either +1 or -1 reward exactly when a game ends.
            if reward == -1:
                opponent_score += 1
                print('ep {0}: game finished, reward: {1:.2f}'.format(episode_number, reward))
            else:
                our_score += 1
                print('ep {0}: game finished, reward: {1:.2f} !!!!!!!!!'.format(episode_number, reward))
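# NOTE: utils.preprocess is not shown here. For Pong, a common choice
# (following Karpathy's pg-pong) crops, downsamples and binarizes the frame;
# this sketch is an assumption, not the original helper (assumes
# `import numpy as np`, as elsewhere in this file):

def preprocess(frame, data_format="vector"):
    """210x160x3 uint8 Pong frame -> 80x80 float image, or a flat 6400-vector."""
    frame = frame[35:195]        # crop the play area
    frame = frame[::2, ::2, 0]   # downsample by a factor of 2, keep one channel
    frame = frame.astype(np.float64).copy()
    frame[frame == 144] = 0      # erase background (type 1)
    frame[frame == 109] = 0      # erase background (type 2)
    frame[frame != 0] = 1        # paddles and ball set to 1
    if data_format == "array":
        return frame
    return frame.ravel()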
class REINFORCE:
    def __init__(self, obs_space_size, hidden_sizes, action_space_size,
                 learning_rate, use_cuda, gpu_id):
        self.action_space_size = action_space_size
        self.use_cuda = use_cuda
        self.gpu_id = gpu_id

        # Initializes the policy network and optimizer
        self.policy = MLP(obs_space_size, hidden_sizes, action_space_size,
                          "distribution", "relu", "standard",
                          name="PolicyNetwork", verbose=True)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=learning_rate)

        # Creates counters
        self.action_count = np.zeros(shape=(self.action_space_size, ))
        self.explore_count = 0
        self.exploit_count = 0

        # If a GPU is available, sends the model to the GPU
        if torch.cuda.is_available() and self.use_cuda:
            self.policy.cuda(gpu_id)
            print("USING GPU-{}".format(gpu_id))

        self.policy.train()

    def select_action(self, observation):
        # Transforms the state into a torch Variable
        x = Variable(torch.Tensor([observation]))
        if torch.cuda.is_available() and self.use_cuda:
            x = x.cuda(self.gpu_id)

        # Forward propagation through the policy network
        action_probs = self.policy(x)

        # Samples an action
        action = action_probs.multinomial().data

        # Negative log-likelihood of the sampled action
        NLL = -torch.log(action_probs[:, action[0, 0]]).view(1, -1)

        if int(action) == int(torch.max(action_probs, 1)[1].cpu().data):
            self.exploit_count += 1
        else:
            self.explore_count += 1
        self.action_count[int(action)] += 1

        return int(action), NLL

    def compute_gradients(self, reward_list, NLL_list, gamma):
        R = torch.zeros(1, 1)
        loss = 0

        # Iterates through the episode in reverse order to compute the return for each step
        for i in reversed(range(len(reward_list))):
            # Discounts the reward
            R = gamma * R + reward_list[i]
            Return_i = Variable(R)
            if torch.cuda.is_available() and self.use_cuda:
                Return_i = Return_i.cuda(self.gpu_id)

            # Loss is the NLL at each step weighted by the return for that step
            loss = loss + (NLL_list[i] * Return_i).squeeze()

        # Average to get the total loss
        loss = loss / len(reward_list)

        # Backpropagation to compute the gradients
        loss.backward()

        return loss.cpu().data.numpy()

    def update_parameters(self):
        # Clips the gradient and applies the update
        torch.nn.utils.clip_grad_norm(self.policy.parameters(), 40)
        self.optimizer.step()
        self.optimizer.zero_grad()

    def save_policy(self, directory):
        torch.save(self.policy.state_dict(),
                   os.path.join(directory, self.policy.name + "_ckpt.pkl"))

    def load_policy(self, directory):
        # Loads the weights written by save_policy()
        self.policy.load_state_dict(
            torch.load(os.path.join(directory, self.policy.name + "_ckpt.pkl")))

    def reset_counters(self):
        self.action_count = np.zeros(shape=(self.action_space_size, ))
        self.explore_count = 0
        self.exploit_count = 0
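# NOTE: a minimal usage sketch for the REINFORCE agent above, assuming a Gym
# environment with a discrete action space; the environment name, sizes and
# hyperparameters here are illustrative, not from the original code:

def run_cartpole_demo(n_episodes=1000):
    import gym
    env = gym.make("CartPole-v0")
    agent = REINFORCE(obs_space_size=4, hidden_sizes=[16], action_space_size=2,
                      learning_rate=1e-3, use_cuda=False, gpu_id=0)
    for episode in range(n_episodes):
        observation = env.reset()
        reward_list, NLL_list = [], []
        done = False
        while not done:
            action, NLL = agent.select_action(observation)
            observation, reward, done, info = env.step(action)
            reward_list.append(reward)
            NLL_list.append(NLL)
        # One gradient accumulation + parameter update per episode in this sketch
        agent.compute_gradients(reward_list, NLL_list, gamma=0.99)
        agent.update_parameters()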
def main():
    # Create a meta optimizer that wraps a model into a meta model
    # to keep track of the meta updates.
    # meta_model = FullyConnectedNN()
    meta_model = MLP()
    print(meta_model)
    if args.cuda:
        meta_model.cuda()

    meta_optimizer = MetaOptimizer(MetaModel(meta_model), args.num_layers, args.hidden_size)
    if args.cuda:
        meta_optimizer.cuda()

    optimizer = optim.Adam(meta_optimizer.parameters(), lr=1e-3)

    for epoch in range(args.max_epoch):
        decrease_in_loss = 0.0
        final_loss = 0.0
        train_iter = iter(train_loader)
        for i in range(args.updates_per_epoch):

            # Sample a new model
            # model = FullyConnectedNN()
            model = MLP()
            if args.cuda:
                model.cuda()

            x, y = next(train_iter)
            if args.cuda:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x), Variable(y)

            # Compute the initial loss of the model
            f_x = model(x)
            initial_loss = F.nll_loss(f_x, y)

            for k in range(args.optimizer_steps // args.truncated_bptt_step):
                # Keep states for truncated BPTT
                meta_optimizer.reset_lstm(keep_states=k > 0, model=model,
                                          use_cuda=args.cuda)

                loss_sum = 0
                prev_loss = torch.zeros(1)
                if args.cuda:
                    prev_loss = prev_loss.cuda()
                for j in range(args.truncated_bptt_step):
                    x, y = next(train_iter)
                    if args.cuda:
                        x, y = x.cuda(), y.cuda()
                    x, y = Variable(x), Variable(y)

                    # First we need to compute the gradients of the model
                    f_x = model(x)
                    loss = F.nll_loss(f_x, y)
                    model.zero_grad()
                    loss.backward()

                    # Perform a meta update using gradients from the model
                    # and return the current meta model saved in the optimizer
                    meta_model = meta_optimizer.meta_update(model, loss.data)

                    # Compute a loss for a step of the meta optimizer
                    f_x = meta_model(x)
                    loss = F.nll_loss(f_x, y)

                    loss_sum += (loss - Variable(prev_loss))
                    prev_loss = loss.data

                # Update the parameters of the meta optimizer
                meta_optimizer.zero_grad()
                loss_sum.backward()
                for param in meta_optimizer.parameters():
                    param.grad.data.clamp_(-1, 1)
                optimizer.step()

            # Compute the relative decrease in the loss function w.r.t. the
            # initial value
            decrease_in_loss += loss.data[0] / initial_loss.data[0]
            final_loss += loss.data[0]

        print("Epoch: {}, final loss {}, average final/initial loss ratio: {}".format(
            epoch, final_loss / args.updates_per_epoch,
            decrease_in_loss / args.updates_per_epoch))
def train(lr=args.lr,
          n_hidden=args.n_hidden,
          batch_size=args.batch_size,
          dropout=args.dropout,
          valid_freq=3000,
          disp_freq=1000,
          save_freq=100000,
          max_epochs=args.n_epoch,
          patience=15,
          save_name=args.save_name,
          save_dir=args.save_dir,
          device=args.device):

    # Load the train and valid datasets
    print('loading train')
    with open(args.train_path, 'rb') as f:
        train_val_y = pickle.load(f)
        train_val_x = pickle.load(f)

    print('loading english test')
    with open(args.en_test_path, 'rb') as f:
        en_test_y = pickle.load(f)
        en_test_x = pickle.load(f)

    print('loading french test')
    with open(args.fr_test_path, 'rb') as f:
        fr_test_y = pickle.load(f)
        fr_test_x = pickle.load(f)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1125)
    for train_index, test_index in sss.split(train_val_x, train_val_y):
        train_y = train_val_y[train_index]
        train_x = train_val_x[train_index]
        valid_y = train_val_y[test_index]
        valid_x = train_val_x[test_index]

    print('Number of training samples: %d' % train_x.shape[0])
    print('Number of validation samples: %d' % valid_x.shape[0])
    print('Number of english testing samples: %d' % en_test_x.shape[0])
    print('Number of french testing samples: %d' % fr_test_x.shape[0])
    print('-' * 100)

    kf_valid = get_minibatches_idx(len(valid_y), batch_size)
    kf_en_test = get_minibatches_idx(len(en_test_y), batch_size)
    kf_fr_test = get_minibatches_idx(len(fr_test_y), batch_size)

    # Loader parameter: use CUDA pinned memory for faster data loading
    pin_memory = (device == args.device)

    n_emb = train_x.shape[1]
    n_class = len(set(train_y))

    best_valid_acc = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    estop = False  # early stop switch

    net = MLP(n_mlp_layer=args.n_mlp_layers,
              n_hidden=args.n_hidden,
              dropout=args.dropout,
              n_class=n_class,
              n_emb=n_emb,
              device=args.device)

    if args.load_net != '':
        assert os.path.exists(args.load_net), 'Path to pretrained net does not exist'
        net.load_state_dict(torch.load(args.load_net))
        print('Loaded existing model stored at: ', args.load_net)

    if args.device == 'gpu':
        net = net.cuda()

    # Begin training
    net.train()
    print('-' * 100)
    print('Model structure: ')
    print('MLP baseline')
    print(net.main)
    print('-' * 100)
    print('Parameters for tuning: ')
    print(net.state_dict().keys())
    print('-' * 100)

    # Define the optimizer
    assert args.optimizer in ['SGD', 'Adam', 'RMSprop', 'LBFGS', 'Rprop',
                              'ASGD', 'Adadelta', 'Adagrad', 'Adamax'], \
        'Please choose one of the supported optimizers'
    if args.optimizer == 'SGD':
        optimizer = optim.SGD(lr=lr,
                              params=filter(lambda p: p.requires_grad, net.parameters()),
                              momentum=0.9)
    else:
        optimizer = getattr(optim, args.optimizer)(
            params=filter(lambda p: p.requires_grad, net.parameters()), lr=lr)

    # lambda1 = lambda epoch: epoch // 30
    lambda2 = lambda epoch: 0.98 ** epoch
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max')

    try:
        for eidx in range(max_epochs):
            scheduler.step()
            # print('Training mode on: ', net.training)
            start_time = time.time()
            n_samples = 0

            # Get a new shuffled index for the training set
            kf = get_minibatches_idx(len(train_y), batch_size, shuffle=True)

            for _, train_index in kf:
                # Remove gradients from the previous batch
                # net.zero_grad()
                optimizer.zero_grad()
                uidx += 1

                y_batch = torch.autograd.Variable(
                    torch.from_numpy(train_y[train_index]).long())
                x_batch = torch.autograd.Variable(
                    torch.from_numpy(train_x[train_index]).float())
                if net.device == 'gpu':
                    # The inputs must live on the same device as the model;
                    # the original moved only the labels.
                    x_batch = x_batch.cuda()
                    y_batch = y_batch.cuda()

                scores = net.forward(x_batch)
                loss = net.loss(scores, y_batch)
                loss.backward()
                optimizer.step()
                n_samples += len(x_batch)

                # For logging gradient information
                gradient = 0
                for name, w in net.named_parameters():
                    if w.grad is not None:
                        w_grad = torch.norm(w.grad.data, 2) ** 2
                        gradient += w_grad
                gradient = gradient ** 0.5

                if np.mod(uidx, disp_freq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', loss.data[0],
                          'Gradient ', gradient)

                if save_name and np.mod(uidx, save_freq) == 0:
                    print('Saving...')
                    torch.save(net.state_dict(),
                               '%s/%s_epoch%d_update%d.net' % (save_dir, save_name, eidx, uidx))

                if np.mod(uidx, valid_freq) == 0:
                    print("=" * 50)
                    print('Evaluation on validation set: ')
                    kf_valid = get_minibatches_idx(len(valid_y), batch_size)
                    top_1_acc, top_n_acc = eval.net_evaluation(net, kf_valid, valid_x, valid_y)
                    # scheduler.step(top_1_acc)

                    # Save the best-performing state_dict for testing
                    if best_valid_acc is None:
                        best_valid_acc = top_1_acc
                        best_state_dict = net.state_dict()
                        torch.save(best_state_dict, '%s/%s_best.net' % (save_dir, save_name))
                    else:
                        if top_1_acc > best_valid_acc:
                            print('Best validation performance so far, saving model parameters')
                            print("*" * 50)
                            bad_counter = 0  # reset counter
                            best_valid_acc = top_1_acc
                            best_state_dict = net.state_dict()
                            torch.save(best_state_dict, '%s/%s_best.net' % (save_dir, save_name))
                        else:
                            bad_counter += 1
                            print('Validation accuracy: ', 100 * top_1_acc)
                            print('Getting worse, patience left: ', patience - bad_counter)
                            print('Best validation accuracy now: ', 100 * best_valid_acc)

                            # Learning rate annealing
                            lr /= args.lr_anneal
                            print('Learning rate annealed to: ', lr)
                            print('*' * 100)
                            if args.optimizer == 'SGD':
                                optimizer = optim.SGD(
                                    lr=lr,
                                    params=filter(lambda p: p.requires_grad, net.parameters()),
                                    momentum=0.9)
                            else:
                                optimizer = getattr(optim, args.optimizer)(
                                    params=filter(lambda p: p.requires_grad, net.parameters()),
                                    lr=lr)

                            if bad_counter > patience:
                                print('-' * 100)
                                print('Early Stop!')
                                estop = True
                                break

            epoch_time = time.time() - start_time
            print('Epoch processing time: %.2f s' % epoch_time)
            print('Seen %d samples' % n_samples)
            if estop:
                break

        print('-' * 100)
        print('Training finished')
        best_state_dict = torch.load('%s/%s_best.net' % (save_dir, save_name))
        torch.save(net.state_dict(), '%s/%s_final.net' % (save_dir, save_name))
        net.load_state_dict(best_state_dict)

        print('Evaluation on validation set: ')
        kf_valid = get_minibatches_idx(len(valid_y), batch_size)
        eval.net_evaluation(net, kf_valid, valid_x, valid_y)

        # Evaluate the model on the test sets
        print('Evaluation on test set: ')
        print('Evaluation on English testset: ')
        eval.net_evaluation(net, kf_en_test, en_test_x, en_test_y)
        print('Evaluation on French testset: ')
        eval.net_evaluation(net, kf_fr_test, fr_test_x, fr_test_y)

    except KeyboardInterrupt:
        print('-' * 100)
        print("Training interrupted, saving final model...")
        best_state_dict = torch.load('%s/%s_best.net' % (save_dir, save_name))
        torch.save(net.state_dict(), '%s/%s_final.net' % (save_dir, save_name))
        net.load_state_dict(best_state_dict)

        print('Evaluation on validation set: ')
        kf_valid = get_minibatches_idx(len(valid_y), batch_size)
        eval.net_evaluation(net, kf_valid, valid_x, valid_y)

        # Evaluate the model on the test sets
        print('Evaluation on English testset: ')
        eval.net_evaluation(net, kf_en_test, en_test_x, en_test_y)
        print('Evaluation on French testset: ')
        eval.net_evaluation(net, kf_fr_test, fr_test_x, fr_test_y)
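# NOTE: get_minibatches_idx is assumed by the training loop above but not
# defined in this file. A plausible minimal implementation (assumes
# `import numpy as np`, as elsewhere in this file):

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Split indices 0..n-1 into minibatches; returns (batch_id, indices) pairs."""
    idx_list = np.arange(n, dtype="int64")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = [idx_list[i:i + minibatch_size]
                   for i in range(0, n, minibatch_size)]
    return list(enumerate(minibatches))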
def train(args, logger, model_save_dir, val_dataset, test_dataset, train_dataset):
    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    pretrain_embed = pickle.load(
        open('../{}/{}'.format(args.embed_dir, args.embed), 'rb'))
    try:
        pretrain_embed = torch.from_numpy(pretrain_embed).float()
    except:
        # already a tensor
        pretrain_embed = pretrain_embed.float()

    dataLoader = DataLoader(train_dataset, batch_size=args.batch_sz, shuffle=True)

    if args.model == 'MLP':
        model = MLP(args.hidden_dim, pretrain_embed)
    elif args.model == 'MLP3':
        model = MLP3Diff(args.hidden_dim, pretrain_embed)
    elif args.model == 'BiLinear':
        model = BiLinearDiff1(args.hidden_dim, pretrain_embed)
    else:
        model = BiLinearDiffH(args.hidden_dim, pretrain_embed)
    # model = ListMaxTransformer(args.hidden_dim, pretrain_embed)

    if torch.cuda.is_available():
        model.cuda()

    criterion = torch.nn.MSELoss()
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=args.gamma)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    best_dev_loss = float('+inf')
    best_dev_model = None
    best_dev_test_loss = 0
    counter = 0

    for epoch in range(1, args.n_epoch + 1):
        train_loss = 0
        model.train()
        iteration = 0
        for batch in dataLoader:
            # Empty the gradients from the previous batch (the original zeroed
            # them only once per epoch, letting gradients accumulate across batches)
            optimizer.zero_grad()
            x = torch.stack(batch['input'])  # 5 x bz
            y = batch['label'].float()  # bz
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()
            output = model(x)
            loss = criterion(output, y)
            train_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            iteration += 1
            # if iteration % args.iter_print == 0:
            #     logger.info('{}-{}-{}-{}'.format(epoch, iteration, train_loss, train_acc))

        train_loss = train_loss / len(dataLoader)
        dev_loss = val(model, val_dataset)
        test_loss = val(model, test_dataset)
        # scheduler.step()

        if dev_loss < best_dev_loss:
            best_dev_model = model.state_dict().copy()
            best_dev_loss = dev_loss
            best_dev_test_loss = test_loss
            counter = 0
        else:
            counter += 1

        if epoch % 5 == 0:
            logger.info('=================================================')
            logger.info('TRAIN: epoch:{}-loss:{}'.format(epoch, train_loss))
            logger.info('DEV: epoch:{}-loss:{}'.format(epoch, dev_loss))
            logger.info('TEST: epoch:{}-loss:{}'.format(epoch, test_loss))
            logger.info('BEST-DEV-LOSS: {}, BEST-DEV-TEST-LOSS:{}'.format(
                best_dev_loss, best_dev_test_loss))

        if counter > 40:
            break

    logger.info('===================[][][][][]====================')
    logger.info('TRAIN: epoch:{}-loss:{}'.format(epoch, train_loss))
    logger.info('DEV: epoch:{}-loss:{}'.format(epoch, dev_loss))
    logger.info('TEST: epoch:{}-loss:{}'.format(epoch, test_loss))
    logger.info('BEST-DEV-LOSS: {}, BEST-DEV-TEST-LOSS:{}'.format(
        best_dev_loss, best_dev_test_loss))

    torch.save(best_dev_model,
               model_save_dir + '/model-{}-{}-{}-{}.pt'.format(
                   best_dev_test_loss, args.lr, args.hidden_dim, args.gamma))

    del dataLoader
    del best_dev_model
    del model
    del train_dataset
    del val_dataset
    del test_dataset
def train_model(config, gpu_id, save_dir, exp_name):
    # Instantiating the model
    model = MLP(784, config["hidden_layers"], 10, config["activation"],
                config["initialization"], verbose=True)

    # Loading the MNIST dataset
    x_train, y_train, x_valid, y_valid, x_test, y_test = utils.load_mnist(
        config["data_file"])

    if config['data_reduction'] != 1.:
        x_train, y_train = utils.reduce_trainset_size(x_train, y_train,
                                                      config['data_reduction'])

    # If a GPU is available, sends the model and dataset to the GPU
    if torch.cuda.is_available():
        model.cuda(gpu_id)
        x_train = torch.from_numpy(x_train).cuda(gpu_id)
        y_train = torch.from_numpy(y_train).cuda(gpu_id)
        x_valid = Variable(torch.from_numpy(x_valid)).cuda(gpu_id)
        y_valid = Variable(torch.from_numpy(y_valid)).cuda(gpu_id)
        x_test = Variable(torch.from_numpy(x_test)).cuda(gpu_id)
        y_test = Variable(torch.from_numpy(y_test)).cuda(gpu_id)
        print("Running on GPU")
    else:
        x_train = torch.from_numpy(x_train)
        y_train = torch.from_numpy(y_train)
        x_valid = Variable(torch.from_numpy(x_valid))
        y_valid = Variable(torch.from_numpy(y_valid))
        x_test = Variable(torch.from_numpy(x_test))
        y_test = Variable(torch.from_numpy(y_test))
        print("WATCH-OUT : torch.cuda.is_available() returned False. Running on CPU.")

    # Instantiate TensorDataset and DataLoader objects
    train_set = torch.utils.data.TensorDataset(x_train, y_train)
    loader = torch.utils.data.DataLoader(train_set, batch_size=config["mb_size"],
                                         shuffle=True)

    # Optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                          momentum=config['momentum'])
    loss_fn = nn.NLLLoss()

    # Records the model's performance
    train_tape = [[], []]
    valid_tape = [[], []]
    test_tape = [[], []]

    def evaluate(data, labels):
        if not isinstance(data, Variable):
            if torch.cuda.is_available():
                data = Variable(data).cuda(gpu_id)
                labels = Variable(labels).cuda(gpu_id)
            else:
                data = Variable(data)
                labels = Variable(labels)
        output = model(data)
        loss = loss_fn(output, labels)
        prediction = torch.max(output.data, 1)[1]
        accuracy = (prediction.eq(labels.data).sum() / labels.size(0)) * 100
        return loss.data[0], accuracy

    # Record train accuracy
    train_loss, train_acc = evaluate(x_train, y_train)
    train_tape[0].append(train_loss)
    train_tape[1].append(train_acc)

    # Record valid accuracy
    valid_loss, valid_acc = evaluate(x_valid, y_valid)
    valid_tape[0].append(valid_loss)
    valid_tape[1].append(valid_acc)

    # Record test accuracy
    test_loss, test_acc = evaluate(x_test, y_test)
    test_tape[0].append(test_loss)
    test_tape[1].append(test_acc)

    print("BEFORE TRAINING \nLoss : {0:.3f} \nAcc : {1:.3f}".format(valid_loss, valid_acc))

    # TRAINING LOOP
    for epoch in range(1, config["max_epochs"]):
        start = time.time()
        for i, (x_batch, y_batch) in enumerate(loader):
            # pdb.set_trace()
            if torch.cuda.is_available():
                x_batch = Variable(x_batch).cuda(gpu_id)
                y_batch = Variable(y_batch).cuda(gpu_id)
            else:
                x_batch = Variable(x_batch)
                y_batch = Variable(y_batch)

            # Empties the gradients
            optimizer.zero_grad()

            # Feedforward through the model
            output = model(x_batch)

            # Computes the loss
            loss = loss_fn(output, y_batch)
            # print(i, loss)
            # if i % 10 == 0:
            #     print("LOSS : {}".format(loss))
            #     print("MAX : {}".format(torch.max(output)[0]))
            #     time.sleep(2)

            # Backpropagates to compute the gradients
            loss.backward()

            # Takes one training step
            optimizer.step()

        # Record train accuracy
        train_loss, train_acc = evaluate(x_train, y_train)
        train_tape[0].append(train_loss)
        train_tape[1].append(train_acc)

        # Record valid accuracy
        valid_loss, valid_acc = evaluate(x_valid, y_valid)
        valid_tape[0].append(valid_loss)
        valid_tape[1].append(valid_acc)

        # Record test accuracy
        test_loss, test_acc = evaluate(x_test, y_test)
        test_tape[0].append(test_loss)
        test_tape[1].append(test_acc)

        print("Epoch {0} \nLoss : {1:.3f} \nAcc : {2:.3f}".format(epoch, valid_loss, valid_acc))
        print("Time : {0:.2f}".format(time.time() - start))

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Saves the graphs
    utils.save_results(train_tape, valid_tape, test_tape, save_dir, exp_name, config)
    utils.update_comparative_chart(save_dir, config['show_test'])

    return