def test_agent(env, W, Q_w, Q_E, gamma, training_steps, max_ep_steps, device,
               n_test_runs, test_log_file):
    """
    This function tests the overall performance of the agent, given that it
    has already learnt the keyboard.

    env: Environment
    W: Set of candidate weight vectors over the cumulants
    Q_w: Q-function over weight vectors
    Q_E: Q-functions over all cumulants and options
    gamma: Discount factor
    training_steps: Number of training steps completed so far (used for logging)
    max_ep_steps: Maximum number of steps in an episode
    device: cpu or gpu
    n_test_runs: Number of episodes for which performance is tested
    test_log_file: Path to log file for test results
    """
    ep_returns = []
    for _ in range(n_test_runs):
        s = env.reset()
        s = torch.from_numpy(s).float().to(device)
        n_steps = 0
        ep_return = 0
        done = False
        while n_steps < max_ep_steps and not done:
            # Greedily pick a weight vector and run the keyboard with it until
            # the chosen option terminates.
            with torch.no_grad():
                q = Q_w(s)
            w = W[torch.argmax(q)]
            (s_next, done, r_prime, gamma_prime, n_steps,
             info) = option_keyboard(env, s, w, Q_E, gamma, n_steps,
                                     max_ep_steps, device)
            s_next = torch.from_numpy(s_next).float().to(device)
            s = (s_next if not done else
                 torch.from_numpy(env.reset()).float().to(device))
            ep_return += sum(info['rewards'])
        ep_returns.append(ep_return)

    print('Steps:', training_steps, 'Avg. return:',
          sum(ep_returns) / n_test_runs, 'Episodic return:', ep_returns)
    with open(test_log_file, 'a+b') as logfile:
        pickle.dump({'steps': training_steps, 'returns': ep_returns}, logfile)
    return ep_returns

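# A minimal usage sketch (names and values below are illustrative, not taken
# from this module): given keyboard value functions `value_fns` and the
# agent's Q-function `Q_w`, a periodic evaluation could look like
#
#     ep_returns = test_agent(env, W, Q_w, value_fns, gamma=0.99,
#                             training_steps=step, max_ep_steps=1000,
#                             device=device, n_test_runs=10,
#                             test_log_file='logs/test_returns.pkl')
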
def main():
    args = parser.parse_args()
    env = gym.make(args.env_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    set_global_seed(args.seed)
    d = env.num_resources()

    # Loading saved models and constant values
    hyperparams_file = open(
        os.path.join(args.saved_models.split('saved_models')[0],
                     'hyperparams'), 'rb')

    returns = []
    if args.save_path:
        fp = open(args.save_path, 'a+b')

    # Candidate weight vectors over two cumulants: every combination of
    # {-1, 0, 1} with a non-negative sum, excluding the all-zero vector.
    # (The hard-coded repeat=2 assumes an environment with two resources.)
    W = [x for x in product([-1, 0, 1], repeat=2) if sum(x) >= 0]
    W.remove((0, 0))
    W = np.array(W)

    hyperparams = pickle.load(hyperparams_file)
    hyperparams_file.close()
    gamma = hyperparams.gamma_ok
    max_ep_steps = hyperparams.max_steps_agent

    # One value function per cumulant (the keyboard) and the agent's
    # Q-function over weight vectors.
    value_fns = [
        ValueFunction(input_dim=env.observation_space.shape[0] + d,
                      action_dim=(env.action_space.n + 1),
                      n_options=d,
                      hidden=[64, 128],
                      batch_size=hyperparams.ok_batch_size,
                      gamma=gamma,
                      alpha=hyperparams.alpha_ok) for _ in range(d)
    ]
    Q_w = MlpDiscrete(input_dim=env.observation_space.shape[0],
                      output_dim=W.shape[0],
                      hidden=[64, 128])

    # Restore the saved keyboard value functions, mapping to CPU when no GPU
    # is available.
    for i in range(env.num_resources()):
        if not torch.cuda.is_available():
            checkpoint = torch.load(os.path.join(args.saved_models,
                                                 'value_fn_%d.pt' % (i + 1)),
                                    map_location=torch.device('cpu'))
        else:
            checkpoint = torch.load(
                os.path.join(args.saved_models, 'value_fn_%d.pt' % (i + 1)))
        value_fns[i].q_net.load_state_dict(checkpoint['Q'])
        value_fns[i].q_net.to(device)

    # Restore the saved agent.
    if not torch.cuda.is_available():
        checkpoint = torch.load(os.path.join(args.saved_models, 'agent.pt'),
                                map_location=torch.device('cpu'))
    else:
        checkpoint = torch.load(os.path.join(args.saved_models, 'agent.pt'))
    Q_w.load_state_dict(checkpoint['Q'])
    Q_w.to(device)

    # Run the test episodes.
    for _ in range(args.n_test_episodes):
        s = env.reset()
        done = False
        s = torch.from_numpy(s).float().to(device)
        n_steps = 0
        ret = 0
        while not done:
            w = W[torch.argmax(Q_w(s))]
            (s_next, done, _, _, n_steps,
             info) = option_keyboard(env, s, w, value_fns, gamma, n_steps,
                                     max_ep_steps, device, args.visualize)
            ret += sum(info['rewards'])
            s = torch.from_numpy(s_next).float().to(device)
        print('Episode return:', ret)
        returns.append(ret)

    returns = np.array(returns)
    print('Mean: %f, Std. dev: %f' % (returns.mean(), returns.std()))
    if args.save_path:
        pickle.dump({'Seed': args.seed, 'Returns': returns}, fp)
        fp.close()

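# Illustrative invocation (flag and file names are assumptions inferred from
# the attributes read off `args` above; the parser itself is defined outside
# this section):
#
#     python play_keyboard.py --env_name <env-id> --seed 0 \
#         --saved_models logs/player/saved_models \
#         --save_path logs/player/eval.pkl --n_test_episodes 100
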
def test_learning_options(env, Q_E, index, w, gamma, training_steps,
                          max_ep_steps, device, n_test_runs, log_file):
    """
    This function tests the performance of the agent for different weight
    vectors w. It is typically used to see how the options being learnt
    perform for w = (1, 1), which optimizes for all types of food items. We
    also record performance for w = (1, -1) and w = (-1, 1), since the learnt
    keyboard should perform reasonably well for every configuration of w that
    we consider.

    env: Environment
    Q_E: Q-functions over all cumulants and options
    index: Index of the cumulant for which performance is measured
    w: Weight vector (kept constant to measure keyboard performance)
    gamma: Discount factor
    training_steps: Number of training steps completed so far (used for logging)
    max_ep_steps: Maximum number of steps in an episode
    device: cpu or gpu
    n_test_runs: Number of episodes for which performance is tested
    log_file: Path to log file for test results
    """
    ep_returns = []
    cumulant_returns = []
    env.set_learning_options(w, True)
    for _ in range(n_test_runs):
        s = env.reset()
        s = torch.from_numpy(s).float().to(device)
        n_steps = 0
        ep_return = 0
        cumulant_return = 0
        done = False
        while n_steps < max_ep_steps and not done:
            (s_next, done, r_prime, gamma_prime, n_steps,
             info) = option_keyboard(env, s, w, Q_E, gamma, n_steps,
                                     max_ep_steps, device)
            s_next = torch.from_numpy(s_next).float().to(device)
            s = (s_next if not done else
                 torch.from_numpy(env.reset()).float().to(device))
            ep_return += sum(info['rewards'])
            # Return accumulated by the cumulant at `index` alone.
            cumulant_return += sum([
                info['env_info'][i]['rewards'][index]
                for i in range(len(info['env_info']))
            ])
        ep_returns.append(ep_return)
        cumulant_returns.append(cumulant_return)

    print('w:', w, 'Steps:', training_steps, 'Avg. return:',
          sum(ep_returns) / n_test_runs, 'Episodic return:', ep_returns,
          'Cumulant return:', cumulant_returns)
    with open(log_file, 'a+b') as logfile:
        pickle.dump(
            {
                'steps': training_steps,
                'returns': ep_returns,
                'cumulant_returns': cumulant_returns
            }, logfile)
    # Restore the default weight configuration before returning.
    env.set_learning_options(np.ones(len(w)), True)
    return ep_returns, cumulant_returns

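# A minimal usage sketch (values are illustrative): while training the
# keyboard, the options can be probed periodically with a fixed weight vector,
# e.g. w = (1, 1), via
#
#     test_learning_options(env, value_fns, index=0, w=np.array([1, 1]),
#                           gamma=0.99, training_steps=step,
#                           max_ep_steps=1000, device=device, n_test_runs=10,
#                           log_file='logs/options_w_1_1.pkl')
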
def keyboard_player(env, W, Q, alpha, eps, gamma, training_steps, batch_size,
                    pretrained_agent, max_ep_steps, device, test_interval,
                    n_test_runs, log_file, log_dir):
    """
    This function trains the player (agent) on top of the learnt keyboard by
    learning a Q-function Q_w over the candidate weight vectors in W.

    env: Environment
    W: Set of candidate weight vectors over the cumulants
    Q: Q-functions over all cumulants and options
    alpha: Learning rate
    eps: Exploration parameter over weight vector w
    gamma: Discount factor
    training_steps: Number of steps for which agent is to be trained
    batch_size: Batch size for updating Q-values
    pretrained_agent: Path to pretrained agent model
    max_ep_steps: Maximum number of steps in an episode
    device: cpu or gpu
    test_interval: Number of steps after which agent is tested
    n_test_runs: Number of episodes for which performance is tested
    log_file: File to store episode return logs
    log_dir: Directory where logs and intermediate models are saved
    """
    n = W.shape[0]
    Q_w = MlpDiscrete(input_dim=env.observation_space.shape[0],
                      output_dim=n,
                      hidden=[64, 128])
    Q_w.to(device)
    optimizer = Adam(Q_w.parameters(), lr=alpha)

    s = env.reset()
    s = torch.from_numpy(s).float().to(device)
    n_steps = 0
    best_avg_return = -float('inf')  # start below any achievable return
    q_loss = 0
    n_items_batch = 0
    done = False
    writer = {}
    writer['writer'] = SummaryWriter(os.path.join(log_dir, 'runs'))

    # Load pretrained agent, if available
    if pretrained_agent:
        checkpoint = torch.load(os.path.join(pretrained_agent, 'agent.pt'))
        n_steps = checkpoint['steps']
        Q_w.load_state_dict(checkpoint['Q'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_avg_return = checkpoint['best_avg_return']

    # Start learning
    while n_steps < training_steps:
        if done:
            s = torch.from_numpy(env.reset()).float().to(device)
            done = False
        q = Q_w(s)

        # Epsilon-greedy exploration over the weight vectors
        if np.random.binomial(1, eps):
            w_index = torch.tensor(np.random.randint(n)).to(device)
        else:
            w_index = torch.argmax(q)
        w = W[w_index]

        # Execute the chosen option via the keyboard; r_prime and gamma_prime
        # are the reward and discount accumulated over the option's duration.
        (s_next, done, r_prime, gamma_prime, n_steps,
         _) = option_keyboard(env, s, w, Q, gamma, n_steps, max_ep_steps,
                              device)
        s_next = torch.from_numpy(s_next).float().to(device)
        q_next = Q_w(s_next).detach()
        td_error = r_prime + gamma_prime * q_next.max() - q[w_index]
        q_loss += 0.5 * (td_error**2)
        n_items_batch += 1

        # Update the network once a full batch of TD errors has accumulated
        if n_items_batch == batch_size:
            optimizer.zero_grad()
            writer['writer'].add_scalar('agent/Q', q_loss.item(), n_steps + 1)
            q_loss.backward()
            optimizer.step()
            q_loss = 0
            n_items_batch = 0
        s = s_next

        # Test the agent at intermediate time steps and save current and best
        # models
        if n_steps % test_interval == 0:
            ep_returns = test_agent(env, W, Q_w, Q, gamma, n_steps,
                                    max_ep_steps, device, n_test_runs,
                                    log_file)
            writer['writer'].add_scalar('episode_returns/Agent',
                                        sum(ep_returns) / n_test_runs, n_steps)
            torch.save(
                {
                    'steps': n_steps,
                    'Q': Q_w.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_avg_return': best_avg_return
                }, os.path.join(log_dir, 'saved_models', 'agent.pt'))
            if sum(ep_returns) / n_test_runs > best_avg_return:
                best_avg_return = sum(ep_returns) / n_test_runs
                torch.save(
                    {
                        'steps': n_steps,
                        'Q': Q_w.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'best_avg_return': best_avg_return
                    },
                    os.path.join(log_dir, 'saved_models', 'best', 'agent.pt'))
    return Q_w

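# A minimal usage sketch (hyperparameter values are illustrative): with a
# learnt keyboard `value_fns` and candidate weight vectors W (built as in
# main()), the player could be trained with
#
#     Q_w = keyboard_player(env, W, value_fns, alpha=1e-3, eps=0.1,
#                           gamma=0.99, training_steps=500000, batch_size=32,
#                           pretrained_agent=None, max_ep_steps=1000,
#                           device=device, test_interval=10000,
#                           n_test_runs=10,
#                           log_file='logs/agent_returns.pkl',
#                           log_dir='logs/agent')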