class DDPGAgent():
    def __init__(self, state_dim, action_dim, random_seed):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.seed = random.seed(random_seed)

        # Actor network with its target network
        self.actor_local = Actor(state_dim, action_dim, random_seed).to(device)
        self.actor_target = Actor(state_dim, action_dim, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network with its target network
        self.critic_local = Critic(state_dim, action_dim, random_seed).to(device)
        self.critic_target = Critic(state_dim, action_dim, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise
        self.noise = OUNoise(action_dim, random_seed)
        self.epsilon = EPSILON

        # Replay memory
        self.memory = ReplayBuffer(action_dim, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, timestamp):
        """Save experience in replay memory, and use a random sample from memory to learn."""
        # Save experience
        self.memory.add(state, action, reward, next_state, done)

        # Learn (if there are enough samples in memory)
        if len(self.memory) > BATCH_SIZE and timestamp % LEARN_EVERY == 0:
            for _ in range(LEARN_NUMBER):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return actions for the given state from the current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples."""
        states, actions, rewards, next_states, dones = experiences

        # UPDATE CRITIC #
        actions_next = self.actor_target(next_states.to(device))
        Q_targets_next = self.critic_target(next_states.to(device), actions_next.to(device))
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_local.parameters(), 1)  # Clip the gradient when updating the critic network
        self.critic_optimizer.step()

        # UPDATE ACTOR #
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # UPDATE TARGET NETWORKS #
        self.soft_update(self.critic_local, self.critic_target, RHO)
        self.soft_update(self.actor_local, self.actor_target, RHO)

        # UPDATE EPSILON AND NOISE #
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, rho):
        """Soft update model parameters: target = rho * target + (1 - rho) * local."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(rho * target_param.data + (1.0 - rho) * local_param.data)
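
# A minimal training-loop sketch for the agent above, shown only to illustrate how
# `act`, `step`, and `reset` are meant to interact. Assumptions not taken from the
# code above: a Gym-style `env` with `reset()`/`step()`, and the hyperparameters
# (BUFFER_SIZE, BATCH_SIZE, GAMMA, LEARN_EVERY, ...) defined elsewhere in this module.
def train(env, agent, n_episodes=200, max_t=1000):
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()                                   # reset OU noise at episode start
        score = 0.0
        for t in range(max_t):
            action = agent.act(state)                   # policy action plus exploration noise
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, t)  # store transition, maybe learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores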
class DDPGAgent:
    def __init__(self, output_dim, input_dim, name, hidden=256, lr_actor=1.0e-3,
                 lr_critic=1.0e-3, tau=1.0e-2, seed=10):
        super(DDPGAgent, self).__init__()
        self.seed = seed

        self.actor = Actor(input_dim, hidden, output_dim, seed).to(device)
        self.critic = Critic(input_dim=input_dim, action_dim=output_dim, hidden=hidden,
                             seed=seed, output_dim=1).to(device)
        self.target_actor = Actor(input_dim, hidden, output_dim, seed).to(device)
        self.target_critic = Critic(input_dim=input_dim, action_dim=output_dim, hidden=hidden,
                                    seed=seed, output_dim=1).to(device)

        self.name = name
        self.noise = OUNoise(output_dim, seed)
        self.tau = tau
        self.epsilon = EPSILON
        self.gamma = GAMMA
        self.clipgrad = CLIPGRAD

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)

    def act(self, state, add_noise=True):
        """Return actions for given state from current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().squeeze(0).data.numpy()
        self.actor.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        states, actions, rewards, next_states, dones = experiences

        # UPDATE CRITIC #
        actions_next = self.target_actor(next_states.to(device))
        Q_targets_next = self.target_critic(next_states.to(device), actions_next.to(device))
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic.parameters(), self.clipgrad)
        self.critic_optimizer.step()

        # UPDATE ACTOR #
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # clip_grad_norm_(self.actor.parameters(), self.clipgrad)
        self.actor_optimizer.step()

        # UPDATE TARGET NETWORKS #
        self.soft_update(self.critic, self.target_critic)
        self.soft_update(self.actor, self.target_actor)

        # UPDATE EPSILON AND NOISE #
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
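
# A hedged sketch of how this per-agent variant might be driven in a two-player
# setting: one DDPGAgent per player, each acting on its own observation slice.
# The state/action sizes, the agent count, and the helper names below are
# illustrative assumptions, not part of the implementation above; `np` is assumed
# to be imported at the top of the module as in the rest of this file.
def make_agents(state_size=24, action_size=2, num_agents=2):
    """Build one independent DDPG agent per player."""
    return [DDPGAgent(output_dim=action_size, input_dim=state_size, name="agent_%d" % i)
            for i in range(num_agents)]

def act_all(agents, states, add_noise=True):
    """Each agent acts on its own row of the joint observation matrix."""
    return np.vstack([agent.act(states[i], add_noise) for i, agent in enumerate(agents)])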