class Agent:
    """DDPG-style agent with local and target actor/critic networks.

    The agent stores transitions in an injected replay buffer, selects
    actions with the local actor (optionally perturbed by an injected noise
    process) and trains both networks from sampled mini-batches.
    """

    def __init__(self, replay_buffer, noise, state_dim, action_dim, seed,
                 fc1_units=256, fc2_units=128, device="cpu",
                 lr_actor=1e-4, lr_critic=1e-3, batch_size=128,
                 discount=0.99, tau=1e-3):
        """Build the networks and optimizers and sync targets with locals.

        Params
        ======
            replay_buffer: object providing ``add(...)``, ``size()`` and
                ``sample(batch_size)``
            noise: object providing ``sample()`` and ``reset()``
            state_dim, action_dim, fc1_units, fc2_units, seed: forwarded to
                the ``Actor`` and ``Critic`` constructors
            device: torch device the networks and batches are moved to
            lr_actor, lr_critic: Adam learning rates
            batch_size: minimum buffer size before learning, and sample size
            discount: discount factor used in the TD target
            tau: interpolation factor for the soft target update
        """
        torch.manual_seed(seed)

        self.actor_local = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_local = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(), lr=lr_critic)

        self.actor_target = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_target = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)

        self.buffer = replay_buffer
        self.noise = noise
        self.device = device
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau

        # Start with target networks that are exact copies of the local ones.
        Agent.hard_update(model_local=self.actor_local, model_target=self.actor_target)
        Agent.hard_update(model_local=self.critic_local, model_target=self.critic_target)

    def step(self, states, actions, rewards, next_states, dones):
        """Store one transition per zipped entry, then learn once if possible.

        Learning happens only when the buffer holds at least ``batch_size``
        transitions.
        """
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.buffer.add(state=state, action=action, reward=reward,
                            next_state=next_state, done=done)

        if self.buffer.size() >= self.batch_size:
            experiences = self.buffer.sample(self.batch_size)
            self.learn(self.to_tensor(experiences))

    def to_tensor(self, experiences):
        """Convert a 5-tuple of numpy arrays to float tensors on ``self.device``.

        ``dones`` is cast to ``uint8`` first so boolean flags become 0/1.
        """
        states, actions, rewards, next_states, dones = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones.astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def act(self, states, add_noise=True):
        """Return the local actor's actions for ``states``, clipped to [-1, 1].

        Params
        ======
            states (numpy.ndarray): input passed to the local actor
            add_noise (bool): add ``self.noise.sample()`` before clipping
        """
        states = torch.from_numpy(states).float().to(device=self.device)

        self.actor_local.eval()
        with torch.no_grad():
            # .cpu() is required before .numpy() when the actor lives on a
            # non-CPU device; it is a no-op for CPU tensors.
            actions = self.actor_local(states).cpu().numpy()
        self.actor_local.train()

        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def learn(self, experiences):
        """Run one critic update, one actor update and a soft target update.

        q_target = r + discount * critic_target(s', actor_target(s')) * (1 - done)

        Params
        ======
            experiences: tuple of tensors (states, actions, rewards,
                next_states, dones)
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic. The TD target is a constant with respect to the
        # optimisation, so it is built without autograd; this keeps the
        # backward pass out of the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            q_target_next = self.critic_target(next_states, next_actions)
            q_target = rewards + self.discount * q_target_next * (1.0 - dones)

        q_local = self.critic_local(states, actions)
        critic_loss = F.mse_loss(input=q_local, target=q_target)
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor: ascend the critic's value of the actor's own actions.
        actor_objective = self.critic_local(states, self.actor_local(states)).mean()
        self.actor_local.zero_grad()
        (-actor_objective).backward()
        self.actor_optimizer.step()

        Agent.soft_update(model_local=self.critic_local, model_target=self.critic_target,
                          tau=self.tau)
        Agent.soft_update(model_local=self.actor_local, model_target=self.actor_target,
                          tau=self.tau)

    @staticmethod
    def soft_update(model_local, model_target, tau):
        """Blend parameters in place: target = tau * local + (1 - tau) * target."""
        for local_param, target_param in zip(model_local.parameters(),
                                             model_target.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    @staticmethod
    def hard_update(model_local, model_target):
        """Copy the local parameters into the target (soft update with tau=1)."""
        Agent.soft_update(model_local=model_local, model_target=model_target, tau=1.0)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()
class Agent():
    """DDPG-style agent that learns in bursts every ``update_interval`` steps.

    Holds local/target actor and critic networks, an ``OUNoise`` process and
    an injected replay memory. Actor and critic losses from every learning
    pass are appended to ``actor_losses`` / ``critic_losses``.
    """

    def __init__(self, state_size, action_size, replay_memory, random_seed=0, nb_agent=20,
                 bs=128, gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-4,
                 wd_actor=0, wd_critic=0, clip_actor=None, clip_critic=None,
                 update_interval=20, update_times=10):
        """Initialize an Agent object.

        Params
        ======
            state_size, action_size, random_seed: forwarded to ``Actor`` / ``Critic``
            replay_memory: object providing ``add(...)``, ``sample(n)`` and ``len()``
            nb_agent (int): first dimension of the noise process shape
            bs (int): mini-batch size
            gamma (float): discount factor
            tau (float): soft-update interpolation factor
            lr_actor, lr_critic (float): Adam learning rates
            wd_actor, wd_critic (float): Adam weight decay
            clip_actor, clip_critic: max gradient norm; falsy disables clipping
            update_interval (int): learn every this many calls to ``step``
            update_times (int): learning passes per learning burst
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(random_seed)
        self.seed = random_seed
        self.nb_agent = nb_agent
        self.bs = bs
        self.update_interval = update_interval
        self.update_times = update_times
        self.timestep = 0

        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.wd_critic = wd_critic
        self.wd_actor = wd_actor
        self.clip_critic = clip_critic
        self.clip_actor = clip_actor
        self.actor_losses = []
        self.critic_losses = []

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor,
                                          weight_decay=self.wd_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                           weight_decay=self.wd_critic)

        # Noise process
        self.noise = OUNoise((self.nb_agent, action_size), random_seed)

        # Replay memory
        self.memory = replay_memory

    def step(self, states, actions, rewards, next_states, dones):
        """Save experiences in replay memory and periodically learn from samples.

        Every ``update_interval``-th call runs up to ``update_times`` learning
        passes, each only if the memory holds more than ``bs`` entries.
        """
        self.timestep += 1

        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        if self.timestep % self.update_interval != 0:
            return

        # Learn, if enough samples are available in memory
        for _ in range(self.update_times):
            if len(self.memory) > self.bs:
                experiences = self.memory.sample(self.bs)
                self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy, clipped to [-1, 1]."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset_noise(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        and γ is ``self.gamma``.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a fixed regression target, so it is computed without
        # autograd; backward() then never reaches the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.clip_actor:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), self.clip_actor)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        # Keep detached numpy copies of the losses for later inspection.
        self.actor_losses.append(actor_loss.detach().cpu().numpy())
        self.critic_losses.append(critic_loss.detach().cpu().numpy())

    def soft_update(self, local_model, target_model):
        """Soft update model parameters using ``self.tau`` as τ.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data
                                    + (1.0 - self.tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(random_seed)
        self.seed = random_seed
        self.eps = eps_start
        self.eps_decay = 1 / (eps_p * LEARN_NUM)  # set decay rate based on epsilon end target

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, agent_number, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn.

        Runs ``LEARN_NUM`` learning passes for ``agent_number`` once the memory
        holds more than ``BATCH_SIZE`` entries.
        """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at learning interval settings.
        # NOTE(review): the interval is the literal 1, so for integer timesteps
        # the modulo test is always true.
        if len(self.memory) > BATCH_SIZE and timestep % 1 == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for all agents as per current policy, given their respective states.

        The result has shape ``(num_agents, action_size)`` and is clipped to [-1, 1].
        """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and store it in that agent's row
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions, scaled by the current epsilon
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actions_next)
        where ``actions_next`` combines actor_target(next_state) with a slice of
        the sampled ``actions`` (see the construction below).

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            agent_number (int): 0 places this agent's actions in the leading
                columns; any other value places them in the trailing columns
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a fixed regression target, so it is computed without
        # autograd; backward() then never reaches the target networks.
        with torch.no_grad():
            # Get predicted next-state actions from the target actor
            actions_next = self.actor_target(next_states)
            # Construct next actions vector relative to the agent; the other slot
            # is filled from the sampled `actions`.
            # NOTE(review): the slice width is the literal 2 — presumably the
            # per-agent action size; confirm before using another action_size.
            if agent_number != 0:
                actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
            else:
                actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
            # Compute Q targets for current states (y_i)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        if agent_number != 0:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        else:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter, never dropping below eps_end
        self.eps -= self.eps_decay
        self.eps = max(self.eps, eps_end)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)