class DDPGAgentVersion3(BaseAgent):
    def __init__(self, state_size, action_size, num_agents, random_seed,
                 lr_actor=1e-4, lr_critic=1e-3, fc1_units=400, fc2_units=300,
                 buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3,
                 max_norm=1.0, learn_period=20, learn_sampling_num=10):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of parallel agents in the environment
            random_seed (int): random seed
            lr_actor (float): learning rate of the actor optimizer
            lr_critic (float): learning rate of the critic optimizer
            fc1_units (int): size of the first hidden layer
            fc2_units (int): size of the second hidden layer
            buffer_size (int): replay buffer capacity
            batch_size (int): mini-batch size used for learning
            gamma (float): discount factor
            tau (float): soft-update interpolation parameter
            max_norm (float): value of clip_grad_norm_ for the critic optimizer
            learn_period (int): learn every `learn_period` time steps
            learn_sampling_num (int): number of mini-batches sampled per learning step
        """
        super().__init__()
        self.state_size = state_size
        self.num_agents = num_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        # Actor network (with target network)
        self.actor_local = DDPGActor(state_size, action_size, random_seed,
                                     fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = DDPGActor(state_size, action_size, random_seed,
                                      fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = DDPGCritic(state_size, action_size, random_seed,
                                       fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = DDPGCritic(state_size, action_size, random_seed,
                                        fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        # Ornstein-Uhlenbeck noise process for action exploration
        # (an illustrative sketch of such a process appears at the end of this file)
        self.exploration_mu = 0
        self.exploration_theta = 0.15  # (Lillicrap et al., 2016)
        self.exploration_sigma = 0.2   # (Lillicrap et al., 2016)
        self.noise = OUNoiseMultivariate((num_agents, action_size), random_seed,
                                         mu=self.exploration_mu,
                                         theta=self.exploration_theta,
                                         sigma=self.exploration_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        # Discount factor for future rewards
        self.gamma = gamma

        # Soft-update parameter
        self.tau = tau

        self.batch_size = batch_size

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        # Save each agent's experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i, :], actions[i, :], rewards[i], next_states[i, :], dones[i])

        # Learn every `learn_period` time steps, once enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (time_step % self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # --- Train critic ---
        # Loss = MSE between Q_target (1-step TD bootstrap) and Q_local (current estimate)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        # --- Train actor (policy gradient) ---
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft-update critic target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # Soft-update actor target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'actor': self.actor_target, 'critic': self.critic_target}
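

# --- Illustration only: multivariate Ornstein-Uhlenbeck noise -----------------
# OUNoiseMultivariate is defined elsewhere in this repository. The sketch below
# (deliberately named OUNoiseMultivariateSketch so it does not shadow the real
# class) only illustrates what such a process typically looks like, assuming the
# constructor signature implied by the call site in __init__:
# OUNoiseMultivariate((num_agents, action_size), seed, mu=..., theta=..., sigma=...).
# The actual implementation may differ.
class OUNoiseMultivariateSketch:
    """Hypothetical Ornstein-Uhlenbeck process sampled as an array of shape `size`."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new state."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(self.size)
        self.state = self.state + dx
        return self.state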
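

# --- Illustration only: driving the agent -------------------------------------
# A minimal sketch of a training loop for DDPGAgentVersion3. The environment
# interface assumed here (env.reset()/env.step() returning per-agent arrays of
# shape (num_agents, state_size) plus rewards and done flags) and the function
# name train_ddpg_example are hypothetical, not part of this repository.
def train_ddpg_example(env, agent, n_episodes=200, max_t=1000):
    """Hypothetical training loop; returns the per-episode mean score across agents."""
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                    # assumed shape: (num_agents, state_size)
        agent.reset()                           # reset the exploration noise process
        episode_scores = np.zeros(agent.num_agents)
        for t in range(max_t):
            actions = agent.act(states)         # noisy actions, clipped to [-1, 1]
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones, t)
            episode_scores += rewards
            states = next_states
            if np.any(dones):
                break
        scores.append(episode_scores.mean())
    return scores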