import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from typing import Mapping

# Actor, Critic, Ornstein (Ornstein-Uhlenbeck noise process) and ReplayBuffer
# are assumed to be importable from elsewhere in the project.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DDPGAgent(object):
    """Interacts with and learns from the environment."""

    def __init__(self, id, state_size, action_size, seed, memory, num_agents,
                 hyperparameters: Mapping[str, float]):
        """Initialize a DDPG agent object.

        Params
        ======
            id (int): agent's id
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): replay buffer to store the experience of this agent
            num_agents (int): number of agents in the environment
            hyperparameters (Mapping[str, float]): hyperparameter values of the model.
                The expected keys are:
                - batch_size (int): minibatch size
                - lr_actor (float): learning rate of the actor
                - lr_critic (float): learning rate of the critic
                - gamma (float): discount factor
                - weight_decay (float): critic L2 weight decay
                - tau (float): value for soft update of target parameters
                - update_frequency (int): how many steps must be executed before learning starts
                - n_learns (int): how many learning passes per update
        """
        self.id = id
        self.__name__ = 'DDPG'
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = hyperparameters['gamma']
        self.batch_size = int(hyperparameters['batch_size'])
        self.tau = hyperparameters['tau']
        self.update_frequency = int(hyperparameters['update_frequency'])
        self.n_learns = int(hyperparameters['n_learns'])

        # Actor network (with target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters['lr_actor'])

        # Critic network (with target network)
        self.critic_local = Critic(state_size, action_size, num_agents, seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters['lr_critic'],
            weight_decay=hyperparameters['weight_decay'])

        # Noise process
        self.noise = Ornstein(action_size)

        # Replay memory
        self.memory = memory

        # Initialize the time step (for every update_frequency steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done,
             other_states, other_actions, other_next_states):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        self.memory.add(state, action, reward, next_state, done,
                        other_states, other_actions, other_next_states)

        self.t_step = (self.t_step + 1) % self.update_frequency
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            for _ in range(self.n_learns):
                if len(self.memory) > self.batch_size:
                    experiences = self.memory.sample(self.batch_size)
                    self.learn(experiences, self.gamma)

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, _, _, _, _, other_states, _, _ = experiences
        self.update_critic(experiences, gamma)
        self.update_actor(states, other_states)
        self.update_target_networks()

    def update_critic(self, experiences, gamma):
        """Update the critic network given the experiences."""
        (states, actions, rewards, next_states, dones,
         other_states, other_actions, other_next_states) = experiences

        all_states = torch.cat((states, other_states), dim=1).to(device)
        all_actions = torch.cat((actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((next_states, other_next_states), dim=1).to(device)

        # Next actions must come from the target actor evaluated at the *next* states,
        # matching the Q_targets formula in learn()
        local_all_next_actions = []
        local_all_next_actions.append(self.actor_target(next_states))
        local_all_next_actions.append(self.actor_target(other_next_states))
        all_next_actions = torch.cat(local_all_next_actions, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, states, other_states):
        """Update the actor network using the sampled policy gradient."""
        all_states = torch.cat((states, other_states), dim=1).to(device)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss; the other agent's predicted actions are detached so
        # only this agent's policy receives gradients
        actions_pred = self.actor_local(states)
        other_actions_pred = self.actor_local(other_states).detach()
        actions_pred = torch.cat((actions_pred, other_actions_pred), dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
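# --------------------------------------------------------------------------- #
# Usage sketch for the multi-agent DDPGAgent above (illustration only).
# The ReplayBuffer instance passed in, the per-agent environment API
# (env.reset() / env.step() returning per-agent arrays) and the hyperparameter
# values are assumptions, not part of the original code.
# --------------------------------------------------------------------------- #
def _maddpg_usage_sketch(env, replay_buffer, num_agents=2, state_size=24, action_size=2):
    hyperparameters = {
        'batch_size': 128, 'lr_actor': 1e-4, 'lr_critic': 1e-3, 'gamma': 0.99,
        'weight_decay': 0.0, 'tau': 1e-3, 'update_frequency': 2, 'n_learns': 3,
    }
    agents = [DDPGAgent(i, state_size, action_size, seed=0, memory=replay_buffer,
                        num_agents=num_agents, hyperparameters=hyperparameters)
              for i in range(num_agents)]

    states = env.reset()                                    # assumed shape: (num_agents, state_size)
    for _ in range(1000):
        actions = np.vstack([agent.act(states[i]) for i, agent in enumerate(agents)])
        next_states, rewards, dones = env.step(actions)     # assumed env API
        for i, agent in enumerate(agents):
            j = 1 - i                                       # index of the other agent (2-agent case)
            agent.step(states[i], actions[i], rewards[i], next_states[i], dones[i],
                       states[j], actions[j], next_states[j])
        states = next_states
        if np.any(dones):
            states = env.reset()
            for agent in agents:
                agent.reset()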
# --------------------------------------------------------------------------- #
# A second, single-agent DDPG implementation for a Gym-style environment.
# It relies on its own Actor, Critic, OUNoise and ExperienceReplayLog classes,
# whose constructor signatures differ from those used above.
# --------------------------------------------------------------------------- #
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen,
                 critic_learning_rate, actor_learning_rate, max_action=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action
        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks (and their target copies)
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)

        # Copy local network parameters into the target networks
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        # Set optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        if self.noisy:
            # Add exploration noise and advance the noise schedule
            action = self.noise.get_action(action, self.iter)
            self.iter = self.iter + 1
        return action

    def update(self, batch_size):
        # Sample a batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, masks = \
            self.replay_buffer.sample(batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Q value updates
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        # Mask out the bootstrap term on terminal transitions
        # (masks is assumed to be 1 - done, as the name suggests)
        expected_Q = reward_batch + self.gamma * next_Q * masks

        # Update critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # Update actor network
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Soft-update actor and critic target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
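# --------------------------------------------------------------------------- #
# Training loop sketch for the single-agent DDPGAgent above (illustration
# only). It assumes a Gym-style environment and that ExperienceReplayLog
# exposes push(state, action, reward, next_state, done) and __len__; those
# names are assumptions, not confirmed by the original code.
# --------------------------------------------------------------------------- #
def _ddpg_training_sketch(env, max_episodes=100, max_steps=500, batch_size=64):
    agent = DDPGAgent(env, gamma=0.99, tau=1e-2, buffer_maxlen=100_000,
                      critic_learning_rate=1e-3, actor_learning_rate=1e-4)
    agent.noisy = True                                      # enable exploration noise

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
            episode_reward += reward

            if len(agent.replay_buffer) > batch_size:       # assumed __len__ support
                agent.update(batch_size)

            state = next_state
            if done:
                break
        print(f"episode {episode}: reward {episode_reward:.2f}")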