import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, OUNoise, ReplayBuffer, initialize_weights and the hyperparameter
# constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC,
# WEIGHT_DECAY, UPDATE_EVERY, N_LEARNING, THETA, SIGMA) are provided by sibling
# modules of this repository and are expected to be imported here.


class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, device="cpu"):
        """Initialize an Agent object.

        Params
        ------
        state_size : int
            dimension of each state
        action_size : int
            dimension of each action
        random_seed : int
            random seed
        device : str
            device used for tensors, "cpu" or "cuda"
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        random.seed(random_seed)
        self.device = device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_local.apply(initialize_weights)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target.apply(initialize_weights)
        self.actor_target.eval()
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_local.apply(initialize_weights)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target.apply(initialize_weights)
        self.critic_target.eval()
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed + 1, mu=0., theta=THETA, sigma=SIGMA)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed + 2, self.device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save the experience in replay memory, and use random samples from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn, if enough samples are available in memory
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(N_LEARNING):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return the action for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ------
        experiences : Tuple[torch.Tensor]
            tuple of (s, a, r, s', done) tuples
        gamma : float
            discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ------
        local_model : PyTorch model
            weights will be copied from
        target_model : PyTorch model
            weights will be copied to
        tau : float
            interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
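# --------------------------------------------------------------------------- #
# The Agent classes in this section reference hyperparameter constants and an
# initialize_weights helper that are defined elsewhere in the repository. The
# block below is only an illustrative sketch with commonly used DDPG settings:
# the names match what the code expects, but the values and the initializer
# body are assumptions, not the project's actual configuration.
# --------------------------------------------------------------------------- #
import torch.nn as nn

BUFFER_SIZE = int(1e6)   # replay buffer size (assumed)
BATCH_SIZE = 128         # minibatch size (assumed)
GAMMA = 0.99             # discount factor (assumed)
TAU = 1e-3               # soft-update interpolation factor (assumed)
LR_ACTOR = 1e-4          # actor learning rate (assumed)
LR_CRITIC = 1e-3         # critic learning rate (assumed)
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer (assumed)
UPDATE_EVERY = 1         # environment steps between learning updates (assumed)
N_LEARNING = 1           # learning passes per update (assumed)
THETA = 0.15             # Ornstein-Uhlenbeck mean-reversion rate (assumed)
SIGMA = 0.2              # Ornstein-Uhlenbeck noise scale (assumed)


def initialize_weights(layer):
    """Plausible stand-in for the repository's weight initializer (assumption)."""
    if isinstance(layer, nn.Linear):
        nn.init.xavier_uniform_(layer.weight)
        nn.init.zeros_(layer.bias)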
class Agent:
    """Two-agent DDPG: each agent keeps its own actor, while both critics are
    centralized and condition on the joint observations and joint actions."""

    def __init__(self, n_state, n_action, n_agents, random_seed, device="cpu"):
        """Initialize an Agent object.

        Params
        ------
        n_state : int
            dimension of each state
        n_action : int
            dimension of each action
        n_agents : int
            number of agents
        random_seed : int
            random seed
        device : str
            device used for tensors, "cpu" or "cuda"
        """
        self.n_state = n_state
        self.n_action = n_action
        self.n_agents = n_agents
        self.random_seed = random_seed
        np.random.seed(random_seed)
        self.device = device

        # Networks for the first agent:
        # local actor, local critic, target actor, target critic
        self.actor_local1 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_local1.apply(initialize_weights)
        self.critic_local1 = Critic(self.n_state * self.n_agents,
                                    self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local1.apply(initialize_weights)
        self.actor_target1 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_target1.apply(initialize_weights)
        self.actor_target1.eval()
        self.critic_target1 = Critic(self.n_state * self.n_agents,
                                     self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target1.apply(initialize_weights)
        self.critic_target1.eval()

        # Networks for the second agent:
        # local actor, local critic, target actor, target critic
        self.actor_local2 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_local2.apply(initialize_weights)
        self.critic_local2 = Critic(self.n_state * self.n_agents,
                                    self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local2.apply(initialize_weights)
        self.actor_target2 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_target2.apply(initialize_weights)
        self.actor_target2.eval()
        self.critic_target2 = Critic(self.n_state * self.n_agents,
                                     self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target2.apply(initialize_weights)
        self.critic_target2.eval()

        # Optimizers
        self.actor_optimizer1 = optim.Adam(self.actor_local1.parameters(), lr=LR_ACTOR)
        self.actor_optimizer2 = optim.Adam(self.actor_local2.parameters(), lr=LR_ACTOR)
        self.critic_optimizer1 = optim.Adam(self.critic_local1.parameters(),
                                            lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        self.critic_optimizer2 = optim.Adam(self.critic_local2.parameters(),
                                            lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process (one sample covers both agents' actions)
        self.noise = OUNoise(n_action * 2, random_seed + 1, mu=0., theta=THETA, sigma=SIGMA)

        # Replay buffer
        self.memory = ReplayBuffer(n_action, BUFFER_SIZE, BATCH_SIZE, random_seed + 2, self.device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save the experience in replay memory, and use random samples from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn, if enough samples are available in memory
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(N_LEARNING):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return one action per agent for the given per-agent observations."""
        state0 = torch.from_numpy(state[0]).unsqueeze(dim=0).float().to(self.device)
        state1 = torch.from_numpy(state[1]).unsqueeze(dim=0).float().to(self.device)
        self.actor_local1.eval()
        self.actor_local2.eval()
        with torch.no_grad():
            action0 = self.actor_local1(state0).cpu().data.numpy()
            action1 = self.actor_local2(state1).cpu().data.numpy()
        action = np.vstack([action0, action1])
        self.actor_local1.train()
        self.actor_local2.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update each agent's actor and centralized critic from a batch of joint experiences.

        Params
        ------
        experiences : Tuple[torch.Tensor]
            tuple of (s, a, r, s', done) tuples
        gamma : float
            discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critics ---------------------------- #
        # Get predicted next-state actions and Q values from the target models;
        # the joint observation is split into the two agents' 24-dimensional slices
        with torch.no_grad():
            actions_next1 = self.actor_target1(next_states[:, 0:24])
            actions_next2 = self.actor_target2(next_states[:, 24:])
            actions_next = torch.cat((actions_next1, actions_next2), dim=1)
            Q_targets_next1 = self.critic_target1(next_states, actions_next)
            Q_targets_next2 = self.critic_target2(next_states, actions_next)
        # Compute Q targets for current states (y_i), one per agent
        Q_targets1 = rewards[:, 0].unsqueeze(dim=1) + (
            gamma * Q_targets_next1 * (1 - dones[:, 0].unsqueeze(dim=1)))
        Q_targets2 = rewards[:, 1].unsqueeze(dim=1) + (
            gamma * Q_targets_next2 * (1 - dones[:, 1].unsqueeze(dim=1)))
        # Compute critic losses
        Q_expected1 = self.critic_local1(states, actions)
        Q_expected2 = self.critic_local2(states, actions)
        critic_loss1 = F.mse_loss(Q_expected1, Q_targets1.detach())
        critic_loss2 = F.mse_loss(Q_expected2, Q_targets2.detach())
        # Minimize the losses
        self.critic_optimizer1.zero_grad()
        critic_loss1.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local1.parameters(), 1)
        self.critic_optimizer1.step()

        self.critic_optimizer2.zero_grad()
        critic_loss2.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local2.parameters(), 1)
        self.critic_optimizer2.step()

        # ---------------------------- update actors ---------------------------- #
        # Compute actor losses from the joint predicted actions
        actions_pred1 = self.actor_local1(states[:, 0:24])
        actions_pred2 = self.actor_local2(states[:, 24:])
        actions_pred = torch.cat((actions_pred1, actions_pred2), dim=1)

        actor_loss1 = -self.critic_local1(states, actions_pred).mean()
        self.actor_optimizer1.zero_grad()
        actor_loss1.backward(retain_graph=True)
        self.actor_optimizer1.step()

        actor_loss2 = -self.critic_local2(states, actions_pred).mean()
        self.actor_optimizer2.zero_grad()
        actor_loss2.backward(retain_graph=True)
        self.actor_optimizer2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local1, self.critic_target1, TAU)
        self.soft_update(self.actor_local1, self.actor_target1, TAU)
        self.soft_update(self.critic_local2, self.critic_target2, TAU)
        self.soft_update(self.actor_local2, self.actor_target2, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ------
        local_model : PyTorch model
            weights will be copied from
        target_model : PyTorch model
            weights will be copied to
        tau : float
            interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
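# --------------------------------------------------------------------------- #
# Minimal usage sketch for the two-agent Agent above, assuming a Tennis-style
# environment in which each of the two agents receives a 24-dimensional
# observation (implied by the states[:, 0:24] / states[:, 24:] slices) and a
# 2-dimensional continuous action. The env_reset/env_step helpers, the action
# size of 2, the episode count, and the flattening of the joint transition
# before it is stored are assumptions about the surrounding training code, not
# part of this repository.
# --------------------------------------------------------------------------- #
import numpy as np

N_EPISODES = 2000  # assumed training length

agent = Agent(n_state=24, n_action=2, n_agents=2, random_seed=0, device="cpu")

for episode in range(1, N_EPISODES + 1):
    states = env_reset()            # hypothetical helper returning a (2, 24) array
    agent.reset()                   # reset the OU noise process each episode
    scores = np.zeros(2)
    while True:
        actions = agent.act(states)                       # (2, 2) array clipped to [-1, 1]
        next_states, rewards, dones = env_step(actions)   # hypothetical helper
        # Store the joint transition: the centralized critics expect the joint
        # state/action, so observations and actions are flattened before saving,
        # while per-agent rewards and dones are kept as length-2 vectors.
        agent.step(states.reshape(-1), actions.reshape(-1),
                   np.asarray(rewards), next_states.reshape(-1), np.asarray(dones))
        scores += rewards
        states = next_states
        if np.any(dones):
            break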