import copy

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# ReplayBuffer, EpisodeStats, print_stats and tn are assumed to be provided
# by the project's utility modules.


class TD3:

    def __init__(self, actor, critic, reward_fun,
                 gamma=0.99, tau=0.005,
                 # policy_noise=0.2, noise_clip=0.5,
                 policy_freq=2,
                 max_buffer_size=1e6, batch_size=64, lr=3e-4):

        self._actor = actor
        self._actor_target = copy.deepcopy(self._actor)
        self._actor_optimizer = torch.optim.Adam(self._actor.parameters(), lr=lr)

        self._critic = critic
        self._critic_target = copy.deepcopy(self._critic)
        self._critic_loss = nn.MSELoss()
        self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=lr)

        self.reward_fun = reward_fun

        self._gamma = gamma
        self._tau = tau
        self._policy_freq = policy_freq

        self._rbuffer_max_size = max_buffer_size
        self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
        self._batch_size = batch_size

        self._steps = 0
        self._run = 0

    def get_action(self, s, deterministic=False):
        return self._actor.get_action(s, deterministic=deterministic)

    def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes),
                             episode_loss=np.zeros(episodes))

        self._run += 1

        for e in range(episodes):
            s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

            for t in range(time_steps):
                a = self._actor.get_action(s, deterministic=False)
                ns, r, d, _ = env.step(tn(a))

                stats.episode_rewards[e] += r
                stats.episode_lengths[e] = t

                self._steps += 1

                self._replay_buffer.add_transition(s, a, ns, r, d)

                # Sample replay buffer
                b_states, b_actions, b_nstates, b_rewards, b_terminal = \
                    self._replay_buffer.random_next_batch(self._batch_size)

                # Get action according to target actor policy
                b_nactions = self._actor_target.get_action(b_nstates, deterministic=False)

                # Compute the target Q value from the target critic (clipped double-Q)
                target_Q1, target_Q2 = self._critic_target(b_nstates, b_nactions)
                target_Q = torch.min(target_Q1, target_Q2).reshape((-1))
                target_Q = b_rewards + (1 - b_terminal) * self._gamma * target_Q
                target_Q = target_Q.reshape((-1, 1)).detach()

                # Get current Q estimates from the critic
                current_Q1, current_Q2 = self._critic(b_states, b_actions)

                # Compute critic loss
                critic_loss = self._critic_loss(current_Q1, target_Q) + \
                    self._critic_loss(current_Q2, target_Q)
                stats.episode_loss[e] += critic_loss.item()

                # Optimize the critic
                self._critic_optimizer.zero_grad()
                critic_loss.backward()
                self._critic_optimizer.step()

                # Delayed policy updates
                if self._steps % self._policy_freq == 0:
                    # Compute the actor loss by the deterministic policy gradient
                    actor_loss = -self._critic.Q1(
                        b_states, self._actor.get_action(b_states, deterministic=True)).mean()

                    # Optimize the actor
                    self._actor_optimizer.zero_grad()
                    actor_loss.backward()
                    self._actor_optimizer.step()

                    # Soft-update the target models
                    soft_update(self._critic_target, self._critic, self._tau)
                    soft_update(self._actor_target, self._actor, self._tau)

                if d:
                    break

                s = ns

            pr_stats = {'run': self._run, 'steps': int(stats.episode_lengths[e] + 1),
                        'episode': e + 1, 'episodes': episodes,
                        'reward': stats.episode_rewards[e], 'loss': stats.episode_loss[e]}
            print_stats(pr_stats)

        return stats

    def reset_parameters(self):
        self._actor.reset_parameters()
        self._actor_target.reset_parameters()
        self._critic.reset_parameters()
        self._critic_target.reset_parameters()

        hard_update(self._actor_target, self._actor)
        hard_update(self._critic_target, self._critic)

        self._steps = 0
        self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
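# The agents above call soft_update(target, source, tau) and
# hard_update(target, source), which are not defined in this file. Below is a
# minimal sketch of what these helpers are assumed to do (Polyak averaging of
# target-network parameters, and a full parameter copy); the project's own
# utility module may provide different implementations, in which case these
# sketches are redundant.
def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)


def hard_update(target, source):
    # Copy all source parameters into the target network verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)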
class DQN:

    def __init__(self, policy, action_fun, q, q_target, state_dim, action_dim, gamma,
                 double_q=True, reward_fun=None,
                 replay_buffer=False, max_buffer_size=1e6, batch_size=64,
                 tau=0.01, lr=1e-4):

        self._q = q
        self._q_target = q_target

        self._pi = policy
        self._action_fun = action_fun

        self.reward_fun = reward_fun

        self._doubleQ = double_q

        if torch.cuda.is_available():
            self._q.cuda()
            self._q_target.cuda()

        self._gamma = gamma
        self._tau = tau

        self._state_dim = state_dim
        self._action_dim = action_dim

        self._use_rbuffer = replay_buffer
        if self._use_rbuffer:
            self._rbuffer_max_size = max_buffer_size
            self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
            self._batch_size = batch_size

        self._learning_rate = lr
        self._loss_function = nn.MSELoss()
        self._q_optimizer = optim.Adam(self._q.parameters(), lr=self._learning_rate)

        self._run = 0

    def _get_action(self, s, deterministic=False):
        return self._pi.get_action(s, deterministic=deterministic)

    def get_action(self, s, deterministic=False):
        return self._action_fun.act2env(self._get_action(s, deterministic=deterministic))

    def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes),
                             episode_loss=np.zeros(episodes))

        self._run += 1

        for e in range(episodes):
            s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)
            total_r = 0

            # Step the policy once per episode to advance the epsilon scheduler
            epsilon = self._pi.epsilon()
            # print("\t\t\tStep: {:5d} Epsilon: {:6.5f}".format(t, epsilon))
            self._pi.step()

            for t in range(time_steps):
                a = self._get_action(s)
                ns, r, d, _ = env.step(self._action_fun.act2env(a))

                stats.episode_rewards[e] += r
                stats.episode_lengths[e] = t
                total_r += r

                if self._use_rbuffer:
                    self._replay_buffer.add_transition(s, a, ns, r, d)
                    b_states, b_actions, b_nstates, b_rewards, b_terminal = \
                        self._replay_buffer.random_next_batch(self._batch_size)
                    dim = 1
                else:
                    b_states = s
                    b_actions = a
                    b_nstates = ns
                    b_rewards = r
                    b_terminal = d
                    dim = 0

                if self._doubleQ:
                    # Q-values of the next states from the online network [Q],
                    # used only to determine the optimal next actions
                    q_nstates = self._q(b_nstates)

                    # Optimal action prediction [Q]
                    nactions = torch.argmax(q_nstates, dim=dim)
                    if self._use_rbuffer:
                        nactions = [torch.arange(self._batch_size).long(), nactions]

                    # Q-values from the [Q_target] network, evaluated at the
                    # action indices selected by the [Q] network
                    q_target_nstates = self._q_target(b_nstates)[nactions]
                else:
                    q_target_nstates = self._q_target(b_nstates)
                    # torch.max with dim returns (values, indices); keep only the values
                    q_target_nstates = torch.max(q_target_nstates, dim=dim)[0]

                target_prediction = b_rewards + (1 - b_terminal) * self._gamma * q_target_nstates

                if self._use_rbuffer:
                    q_actions = [torch.arange(self._batch_size).long(), b_actions.long()]
                else:
                    q_actions = b_actions

                current_prediction = self._q(b_states)[q_actions]

                loss = self._loss_function(current_prediction, target_prediction.detach())
                stats.episode_loss[e] += loss.item()

                self._q_optimizer.zero_grad()
                loss.backward()
                self._q_optimizer.step()

                soft_update(self._q_target, self._q, self._tau)

                if d:
                    break

                s = ns

            pr_stats = {'run': self._run, 'steps': int(stats.episode_lengths[e] + 1),
                        'episode': e + 1, 'episodes': episodes,
                        'reward': stats.episode_rewards[e], 'loss': stats.episode_loss[e]}
            print_stats(pr_stats, ', Epsilon: {:6.5f}'.format(epsilon))

        return stats

    def reset_parameters(self):
        self._q.reset_parameters()
        self._q_target.reset_parameters()
        hard_update(self._q_target, self._q)
        self._pi.reset_parameters()

        if self._use_rbuffer:
            self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
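# Illustration only (not part of the agent): a small standalone demo of the
# double-DQN target computation used in DQN.train above, on dummy tensors.
# The names (q_nstates, q_target_nstates_all, ...) are hypothetical; the point
# is the torch.arange advanced-indexing trick that picks, for every sample in
# the batch, the target-network Q-value of the action chosen by the online
# network.
if __name__ == '__main__':
    batch_size, n_actions = 4, 3
    q_nstates = torch.rand(batch_size, n_actions)              # online net, next states
    q_target_nstates_all = torch.rand(batch_size, n_actions)   # target net, next states
    rewards = torch.rand(batch_size)
    terminal = torch.zeros(batch_size)
    gamma = 0.99

    # Online network selects the greedy next actions ...
    nactions = torch.argmax(q_nstates, dim=1)
    # ... and the target network evaluates them (double-DQN decoupling).
    q_target_nstates = q_target_nstates_all[torch.arange(batch_size).long(), nactions]
    target_prediction = rewards + (1 - terminal) * gamma * q_target_nstates
    print(target_prediction.shape)  # torch.Size([4])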