class DQNTrainer:

    def __init__(self, env, args):
        super().__init__()
        self.model = DQN(env, args, Nash=False).to(args.device)
        self.target = DQN(env, args, Nash=False).to(args.device)
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
        self.args = args

    def push(self, s, a, r, s_, d):
        self.replay_buffer.push(s, a, r, s_, np.float32(d))

    def update(self):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            self.args.batch_size)
        state = torch.FloatTensor(np.float32(state)).to(self.args.device)
        next_state = torch.FloatTensor(np.float32(next_state)).to(
            self.args.device)
        action = torch.LongTensor(action).to(self.args.device)
        reward = torch.FloatTensor(reward).to(self.args.device)
        done = torch.FloatTensor(done).to(self.args.device)

        # Q-Learning with target network
        q_values = self.model(state)
        target_next_q_values = self.target(next_state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = target_next_q_values.max(1)[0]
        expected_q_value = reward + (
            self.args.gamma**self.args.multi_step) * next_q_value * (1 - done)

        # Huber Loss
        loss = F.smooth_l1_loss(q_value,
                                expected_q_value.detach(),
                                reduction='none')
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def act(self, s, args):
        return self.model.act(s, args)

    def save_model(self, model_path):
        torch.save(self.model.state_dict(), model_path + 'dqn')
        torch.save(self.target.state_dict(), model_path + 'dqn_target')
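# Usage sketch (not part of the original module): a minimal episode loop that
# drives DQNTrainer against a single-agent gym env. It reuses fields the
# trainer already reads (args.batch_size, args.device, args.lr); the names
# args.max_frames and args.update_target, and len() support on ReplayBuffer,
# are assumptions made for illustration only.
def train_dqn(env, args):
    trainer = DQNTrainer(env, args)
    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        action = trainer.act(state, args)
        next_state, reward, done, _ = env.step(action)
        trainer.push(state, action, reward, next_state, done)
        state = env.reset() if done else next_state

        # Only update once the buffer can fill a batch.
        if len(trainer.replay_buffer) > args.batch_size:
            trainer.update()

        # Periodically sync the target network with the online network.
        if frame_idx % args.update_target == 0:
            trainer.target.load_state_dict(trainer.model.state_dict())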
def main(args, idx):
    # Create summary writer
    writer_path = os.path.join(args.log_dir, args.task_id,
                               args.run_id + '-' + str(idx))
    writer = SummaryWriter(log_dir=writer_path)

    # Create training envs
    envs = make_vec_envs(args.task_id, args.seed, args.num_processes,
                         args.gamma, args.monitor_dir, args.device)
    obs_size = envs.observation_space.shape[0]
    act_size = envs.action_space.shape[0]

    # Create NN
    actor_critic = Policy(obs_size, act_size,
                          action_range=[envs.action_space.low[0],
                                        envs.action_space.high[0]])
    actor_critic.to(args.device)

    # Create ppo agent
    agent = PPO(
        actor_critic=actor_critic,
        device=args.device,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        clip_param=args.clip_param,
        ppo_epoch=args.ppo_epoch,
        num_mini_batch=args.num_mini_batch,
        value_loss_coef=args.value_loss_coef,
        entropy_coef=args.entropy_coef,
    )

    # Create replay buffer
    buffer = ReplayBuffer(args.num_steps, args.num_processes, obs_size,
                          act_size)
    buffer.to(args.device)

    # Reset envs
    obs = envs.reset()
    buffer.obs[0].copy_(obs)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in tqdm(range(num_updates)):
        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        # Collect trajectories and compute returns
        with torch.no_grad():
            for step in range(args.num_steps):
                # Sample actions
                action = actor_critic(buffer.obs[step])

                # Get trajectories from envs
                obs, reward, done, infos = envs.step(action)
                mask = torch.tensor(
                    [[0.0] if done_ else [1.0] for done_ in done],
                    dtype=torch.float,
                    device=args.device)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # Store trajectories
                buffer.insert(obs, action, reward, mask)

            # Compute returns
            batch_obs = buffer.obs.view(-1, obs_size)
            value = actor_critic.get_value(batch_obs).view(
                args.num_steps + 1, args.num_processes, 1)
            batch_obs = buffer.obs[:-1].view(-1, obs_size)
            batch_action = buffer.actions.view(-1, act_size)
            action_log_prob = actor_critic.get_act_log_prob(
                batch_obs, batch_action).view(args.num_steps,
                                              args.num_processes, 1)
            buffer.update_value_log_prob(value, action_log_prob)
            buffer.compute_returns(args.gamma, args.gae_lambda)

        # Update policy
        agent_output = agent.update(buffer)
        buffer.after_update()

        # Log stuff
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            speed = int(total_num_steps / (end - start))
            print("Updates {}, num timesteps {}, FPS {} \n"
                  "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}\n".format(
                      j, total_num_steps, speed, len(episode_rewards),
                      np.mean(episode_rewards), np.median(episode_rewards),
                      np.min(episode_rewards), np.max(episode_rewards)))
            writer.add_scalar('mean_reward', np.mean(episode_rewards),
                              total_num_steps)
            writer.add_scalar('speed', speed, total_num_steps)
            for key in agent_output.keys():
                writer.add_scalar(key, agent_output[key], total_num_steps)

            if args.task_id == 'Pendulum-v0' and np.mean(episode_rewards) > -250:
                break

    envs.close()
    writer.close()
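# Sketch of the linear LR decay helper referenced above. Only the call site
# update_linear_schedule(agent.optimizer, j, num_updates, args.lr) is given in
# this excerpt, so the body below is an assumption about the project's helper:
# anneal the learning rate linearly from initial_lr down to 0 over training.
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr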
class Base:

    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p
        self.model_dir = os.path.join(model_dir,
                                      f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action,
                         device)
        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []
        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0
        self.state = None

    def iterate(self):
        assert self.state is not None
        self.episode_steps += 1

        if self.is_random_action():
            action = self.env.action_space.sample()
        else:
            action = (self.algo.select_action(np.array(self.state)) +
                      np.random.normal(0,
                                       self.max_action * self.gaussian_std,
                                       size=self.action_dim)).clip(
                                           -self.max_action, self.max_action)

        next_state, reward, done, _ = self.env.step(action)
        done_bool = float(
            done) if self.episode_steps < self.env._max_episode_steps else 0

        self.storage.add(self.state, action, next_state, reward, done_bool)

        self.state = next_state
        self.episode_rewards += reward

        if done:
            print(f"Total T: {self.total_steps + 1} "
                  f"Episode Num: {self.episodes + 1} "
                  f"Episode T: {self.episode_steps} "
                  f"Reward: {self.episode_rewards:.3f}")
            # Reset environment
            self.state = self.env.reset()
            self.episode_rewards = 0
            self.episode_steps = 0
            self.episodes += 1

        self.total_steps += 1

    def evaluate(self, eval_episodes=10):
        eval_env = gym.make(self.env_name)
        eval_env.seed(self.seed + 100)

        avg_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = self.algo.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                avg_reward += reward

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward
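    # Sketch (assumption, not in the original excerpt): iterate() calls
    # self.is_random_action(), but the method is not shown here. Given the
    # start_timesteps and rand_action_p fields stored in __init__, a plausible
    # implementation samples uniformly during warm-up and with probability
    # rand_action_p afterwards.
    def is_random_action(self):
        if self.total_steps < self.start_timesteps:
            return True
        return np.random.uniform(0, 1) < self.rand_action_p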
def train(env, args, writer):
    # RL Model for Player 1
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    # RL Model for Player 2
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    # SL Model for Player 1, 2
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)

    if args.load_model and os.path.isfile(args.load_model):
        load_model(models={
            "p1": p1_current_model,
            "p2": p2_current_model
        },
                   policies={
                       "p1": p1_policy,
                       "p2": p2_policy
                   },
                   args=args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)

    # Replay Buffer for Reinforcement Learning - Best Response
    p1_replay_buffer = ReplayBuffer(args.buffer_size)
    p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Reservoir Buffer for Supervised Learning - Average Strategy
    # TODO(Aiden): How to set buffer size of SL?
    p1_reservoir_buffer = ReservoirBuffer(args.buffer_size)
    p2_reservoir_buffer = ReservoirBuffer(args.buffer_size)

    # Deque data structure for multi-step learning
    p1_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)

    p2_state_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    # RL Optimizer for Player 1, 2
    p1_rl_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_rl_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # SL Optimizer for Player 1, 2
    # TODO(Aiden): Is it necessary to separate learning rate for RL/SL?
    p1_sl_optimizer = optim.Adam(p1_policy.parameters(), lr=args.lr)
    p2_sl_optimizer = optim.Adam(p2_policy.parameters(), lr=args.lr)

    # Logging
    length_list = []
    p1_reward_list, p1_rl_loss_list, p1_sl_loss_list = [], [], []
    p2_reward_list, p2_rl_loss_list, p2_sl_loss_list = [], [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    tag_interval_length = 0
    prev_time = time.time()
    prev_frame = 1

    # Main Loop
    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        is_best_response = False
        # TODO(Aiden):
        # Action should be decided by a combination of Best Response and Average Strategy
        if random.random() > args.eta:
            p1_action = p1_policy.act(
                torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(
                torch.FloatTensor(p2_state).to(args.device))
        else:
            is_best_response = True
            epsilon = epsilon_by_frame(frame_idx)
            p1_action = p1_current_model.act(
                torch.FloatTensor(p1_state).to(args.device), epsilon)
            p2_action = p2_current_model.act(
                torch.FloatTensor(p2_state).to(args.device), epsilon)

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, info = env.step(actions)
        # print(actions) # {'1': 3, '2': 2}
        # print(p1_next_state) # [[[127 127 .....
        # print(reward, done, info) # [0 0] False None

        # Save current state, reward, action to deque for multi-step learning
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)

        p1_reward = reward[0] - 1 if args.negative else reward[0]
        p2_reward = reward[1] - 1 if args.negative else reward[1]
        p1_reward_deque.append(p1_reward)
        p2_reward_deque.append(p2_reward)

        p1_action_deque.append(p1_action)
        p2_action_deque.append(p2_action)

        # Store (state, action, reward, next_state) to Replay Buffer for Reinforcement Learning
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state,
                                  np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state,
                                  np.float32(done))

        # Store (state, action) to Reservoir Buffer for Supervised Learning
        if is_best_response:
            p1_reservoir_buffer.push(p1_state, p1_action)
            p2_reservoir_buffer.push(p2_state, p2_action)

        (p1_state, p2_state) = (p1_next_state, p2_next_state)

        # Logging
        p1_episode_reward += p1_reward
        p2_episode_reward += p2_reward
        tag_interval_length += 1
        if info is not None:
            length_list.append(tag_interval_length)
            tag_interval_length = 0

        # Episode done. Reset environment and clear logging records
        if done or tag_interval_length >= args.max_tag_interval:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            writer.add_scalar("p1/episode_reward", p1_episode_reward,
                              frame_idx)
            writer.add_scalar("p2/episode_reward", p2_episode_reward,
                              frame_idx)
            writer.add_scalar("data/tag_interval_length", tag_interval_length,
                              frame_idx)
            p1_episode_reward, p2_episode_reward, tag_interval_length = 0, 0, 0
            p1_state_deque.clear(), p2_state_deque.clear()
            p1_reward_deque.clear(), p2_reward_deque.clear()
            p1_action_deque.clear(), p2_action_deque.clear()

        if (len(p1_replay_buffer) > args.rl_start
                and len(p1_reservoir_buffer) > args.sl_start
                and frame_idx % args.train_freq == 0):

            # Update Best Response with Reinforcement Learning
            loss = compute_rl_loss(p1_current_model, p1_target_model,
                                   p1_replay_buffer, p1_rl_optimizer, args)
            p1_rl_loss_list.append(loss.item())
            writer.add_scalar("p1/rl_loss", loss.item(), frame_idx)

            loss = compute_rl_loss(p2_current_model, p2_target_model,
                                   p2_replay_buffer, p2_rl_optimizer, args)
            p2_rl_loss_list.append(loss.item())
            writer.add_scalar("p2/rl_loss", loss.item(), frame_idx)

            # Update Average Strategy with Supervised Learning
            loss = compute_sl_loss(p1_policy, p1_reservoir_buffer,
                                   p1_sl_optimizer, args)
            p1_sl_loss_list.append(loss.item())
            writer.add_scalar("p1/sl_loss", loss.item(), frame_idx)

            loss = compute_sl_loss(p2_policy, p2_reservoir_buffer,
                                   p2_sl_optimizer, args)
            p2_sl_loss_list.append(loss.item())
            writer.add_scalar("p2/sl_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        # Logging and Saving models
        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time,
                      (p1_reward_list, p2_reward_list), length_list,
                      (p1_rl_loss_list, p2_rl_loss_list),
                      (p1_sl_loss_list, p2_sl_loss_list))
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_rl_loss_list.clear(), p2_rl_loss_list.clear()
            p1_sl_loss_list.clear(), p2_sl_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(models={
                "p1": p1_current_model,
                "p2": p2_current_model
            },
                       policies={
                           "p1": p1_policy,
                           "p2": p2_policy
                       },
                       args=args)

        # Render if rendering argument is on
        if args.render:
            env.render()

    save_model(models={
        "p1": p1_current_model,
        "p2": p2_current_model
    },
               policies={
                   "p1": p1_policy,
                   "p2": p2_policy
               },
               args=args)
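# Sketches of two helpers referenced in train() but not shown in this excerpt.
# Both bodies are assumptions inferred from the call sites (epsilon_scheduler
# returns a function of the frame index; multi_step_reward folds a deque of
# rewards into one n-step return) and assume `import math` at module level.
def epsilon_scheduler(eps_start, eps_final, eps_decay):
    def epsilon_by_frame(frame_idx):
        # Exponential decay from eps_start towards eps_final.
        return eps_final + (eps_start - eps_final) * math.exp(
            -1. * frame_idx / eps_decay)
    return epsilon_by_frame


def multi_step_reward(rewards, gamma):
    # Discounted sum of the rewards currently held in the deque.
    ret = 0.
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret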
class Algo(object):

    def __init__(self, env_name, seed, buffer_dir, summary_dir, max_timesteps,
                 eval_freq, batch_size, state_dim, action_dim, device, gamma,
                 tau, lmbda):
        self.env_name = env_name
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.max_timesteps = max_timesteps
        self.eval_freq = eval_freq
        self.gamma = gamma
        self.tau = tau
        self.lmbda = lmbda

        self.store = ReplayBuffer(batch_size, state_dim, action_dim, device)
        self.store.load(buffer_dir)

        self.training_iters = 0
        self.writer = SummaryWriter(log_dir=summary_dir)

    def run(self):
        while self.training_iters < self.max_timesteps:
            self.train(iterations=int(self.eval_freq))
            self.eval_policy(self.env_name, self.seed)
            self.training_iters += self.eval_freq
            print(f"Training iterations: {self.training_iters}")

    def eval_policy(self, env_name, seed, eval_episodes=10):
        eval_env = gym.make(env_name)
        eval_env.seed(seed + 100)

        avg_reward = 0.
        avg_q = 0
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action, q = self.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                avg_reward += reward
                avg_q += q

        avg_reward /= eval_episodes
        avg_q /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        self.writer.add_scalar('eval/return', avg_reward, self.training_iters)
        self.writer.add_scalar('eval/Estimate Q', avg_q, self.training_iters)

    def update_vae(self, state, action):
        recon, mean, std = self.vae(state, action)
        recon_loss = F.mse_loss(recon, action)
        kl_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) -
                          std.pow(2)).mean()
        vae_loss = recon_loss + 0.5 * kl_loss

        # >> norms
        norms = 0
        for param in self.vae.parameters():
            norms += torch.sum(torch.square(param))
        # >> norms

        loss = (
            vae_loss
            # + 1e-4 * norms
        )

        self.vae_optimizer.zero_grad()
        loss.backward()
        self.vae_optimizer.step()

    def update_critic(self, state, action, next_state, next_action, reward,
                      not_done):
        with torch.no_grad():
            next_q1, next_q2 = self.critic_target(next_state, next_action)
            next_q = self.lmbda * torch.min(next_q1, next_q2) + \
                (1. - self.lmbda) * torch.max(next_q1, next_q2)
            next_q = next_q.reshape(self.batch_size,
                                    -1).max(1)[0].reshape(-1, 1)
            target_q = reward + not_done * self.gamma * next_q

        curr_q1, curr_q2 = self.critic(state, action)
        critic_loss = F.mse_loss(curr_q1, target_q) + F.mse_loss(curr_q2,
                                                                 target_q)

        # # >> norms
        # norms = 0
        # for param in self.critic.parameters():
        #     norms += torch.sum(torch.square(param))
        # # >> norms

        loss = (
            critic_loss
            # + 1e-5 * norms
        )

        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, state):
        sampled_actions = self.vae.decode(state)
        perturbed_actions = self.actor(state, sampled_actions)
        actor_loss = -self.critic.q1(state, perturbed_actions).mean()

        # # >> norms
        # norms = 0
        # for param in self.critic.parameters():
        #     norms += torch.sum(torch.square(param))
        # # >> norms

        loss = (
            actor_loss
            # + 1e-5 * norms
        )

        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()

    def update_targets(self):
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
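    # Sketch (assumption): run() calls train() and select_action(), which are
    # not part of this excerpt and are presumably provided by a subclass. A
    # minimal BCQ-style train() could wire the update_* methods together as
    # below; the store.sample() signature and the 10-candidate repeat are
    # assumptions chosen to match the per-state max taken in update_critic.
    def train(self, iterations):
        for _ in range(iterations):
            # Sample a batch from the offline buffer (signature assumed).
            state, action, next_state, reward, not_done = self.store.sample()

            # Fit the generative model to the behaviour data.
            self.update_vae(state, action)

            with torch.no_grad():
                # Build candidate next actions by decoding and perturbing
                # several samples per next state.
                repeated_next_state = torch.repeat_interleave(
                    next_state, 10, dim=0)
                next_action = self.actor_target(
                    repeated_next_state, self.vae.decode(repeated_next_state))

            self.update_critic(state, action, repeated_next_state,
                               next_action, reward, not_done)
            self.update_actor(state)
            self.update_targets()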