class AgentDQ(AgentAbstract):
    """
    Implement Deep Q-Net with Fixed TD-Target computation and Experience Replay

    Fixed TD-Target: the TD-Error is computed on a target (offline) and a local (online)
    network, where the local network weights are copied to the target network every
    `update_every` batches
    """

    def __init__(self, state_size, action_size, gamma, hidden_layers, drop_p,
                 batch_size, learning_rate, soft_upd_param, update_every,
                 buffer_size, seed):
        super(AgentDQ, self).__init__(state_size, action_size, gamma, hidden_layers,
                                      drop_p, batch_size, learning_rate,
                                      soft_upd_param, update_every, buffer_size, seed)

        # Q-Network Architecture
        self.qnetwork_local = QNetwork(self.state_size, self.action_size, self.seed,
                                       self.hidden_layers, self.drop_p).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, self.seed,
                                        self.hidden_layers, self.drop_p).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)

        # Experience Replay
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def _forward_local(self, states, actions):
        """
        Returns
        ======
            ps_local (torch.tensor)
        """
        ps_local = self.qnetwork_local.forward(states).gather(1, actions)
        return ps_local

    def _forward_targets(self, rewards, next_states, dones):
        """
        Use the Fixed TD-Target algorithm

        Returns
        ======
            ps_target (torch.tensor)
        """
        # Fixed Q-Targets:
        # use the target network to compute r + gamma * max_a q_est(s', a, w-);
        # the target tensor is detached so no gradient flows through it in backprop
        ps_target = rewards + self.gamma * (1 - dones) * \
            self.qnetwork_target.forward(next_states).detach().max(dim=1)[0].view(-1, 1)
        return ps_target
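
# A minimal, self-contained sketch of the fixed TD-target computation used by AgentDQ above,
# with a toy nn.Linear standing in for QNetwork (the real QNetwork, ReplayBuffer and `device`
# come from the surrounding module and are only assumed here).
import torch
import torch.nn as nn

state_size, action_size, gamma, batch = 4, 2, 0.99, 5
q_target_net = nn.Linear(state_size, action_size)   # stand-in for qnetwork_target

rewards = torch.rand(batch, 1)
next_states = torch.rand(batch, state_size)
dones = torch.zeros(batch, 1)                        # 0 = non-terminal, 1 = terminal

# r + gamma * (1 - done) * max_a Q_target(s', a), detached so no gradient flows
# through the target network
ps_target = rewards + gamma * (1 - dones) * \
    q_target_net(next_states).detach().max(dim=1)[0].view(-1, 1)
print(ps_target.shape)   # torch.Size([5, 1]) -- one TD target per sampled transition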
class AgentDoubleDQ(AgentAbstract):
    """
    Implement Dueling Q-Net with Double Q-Net (fixed) TD-Target computation and Experience Replay

    Double Q-Net: split action selection and Q evaluation into two steps
    """

    def __init__(self, state_size, action_size, gamma, hidden_layers, drop_p,
                 batch_size, learning_rate, soft_upd_param, update_every,
                 buffer_size, seed):
        super(AgentDoubleDQ, self).__init__(state_size, action_size, gamma, hidden_layers,
                                            drop_p, batch_size, learning_rate,
                                            soft_upd_param, update_every, buffer_size, seed)

        # Q-Network Architecture: Dueling Q-Nets
        self.qnetwork_local = QNetwork(self.state_size, self.action_size, self.seed,
                                       self.hidden_layers, self.drop_p).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, self.seed,
                                        self.hidden_layers, self.drop_p).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)

        # Experience Replay
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def _forward_local(self, states, actions):
        """
        Returns
        ======
            ps_local (torch.tensor)
        """
        ps_local = self.qnetwork_local.forward(states).gather(1, actions)
        return ps_local

    def _forward_targets(self, rewards, next_states, dones):
        """
        Use the Double Q-Net algorithm

        Returns
        ======
            ps_target (torch.tensor)
        """
        # Double Q-Net:
        # 1) select the greedy next action with the local (online) network
        ps_actions = self.qnetwork_local.forward(next_states).detach().max(dim=1)[1].view(-1, 1)
        # 2) evaluate that action with the target (offline) network
        ps_target = rewards + self.gamma * (1 - dones) * \
            self.qnetwork_target.forward(next_states).detach().gather(1, ps_actions)
        return ps_target
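
# Hedged sketch contrasting the Double Q-Net target above with the plain fixed target,
# on dummy tensors; the two nn.Linear layers are toy stand-ins for qnetwork_local /
# qnetwork_target (none of the module's real classes are used here).
import torch
import torch.nn as nn

torch.manual_seed(0)
state_size, action_size, gamma, batch = 4, 3, 0.99, 5
q_local = nn.Linear(state_size, action_size)
q_target = nn.Linear(state_size, action_size)

rewards = torch.rand(batch, 1)
next_states = torch.rand(batch, state_size)
dones = torch.zeros(batch, 1)

# Plain DQN: the target network both selects and evaluates the next action
dqn_target = rewards + gamma * (1 - dones) * \
    q_target(next_states).detach().max(dim=1)[0].view(-1, 1)

# Double DQN: the local network selects the action, the target network evaluates it
best_actions = q_local(next_states).detach().max(dim=1)[1].view(-1, 1)
ddqn_target = rewards + gamma * (1 - dones) * \
    q_target(next_states).detach().gather(1, best_actions)

# The Double DQN target is never larger, which is what reduces overestimation bias
print((ddqn_target <= dqn_target).all())   # tensor(True)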
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get targets by doing a forward pass of the next states in the target network
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target.forward(next_states),
                                       dim=1, keepdim=True)[0]
        # distinguish the cases in which next states are terminal and those which are not;
        # for terminal next states the targets are only the one-step rewards
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # get outputs by a forward pass of the states in the local network
        # Note: the qnetwork returns, for a given state, all action values for that state.
        # Since we know which action was taken in each state, we gather the corresponding action values
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        # compute the mean squared error of the Bellman equation
        loss = F.mse_loss(Q_expected, Q_targets)
        # clear the gradient buffers from the previous iteration
        self.optimizer.zero_grad()
        # backprop the error through the local network
        loss.backward()
        # update the weights of the local network by taking one SGD step
        self.optimizer.step()

        # update the target network by copying the latest weights of the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = tau*θ_local + (1 - tau)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
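
# A small, self-contained check of the soft-update rule above,
# θ_target = tau*θ_local + (1 - tau)*θ_target, on two toy nn.Linear models standing in for
# qnetwork_local / qnetwork_target (tau = 1e-3 is only an illustrative value, not TAU itself).
import torch
import torch.nn as nn

tau = 1e-3
local = nn.Linear(4, 2)
target = nn.Linear(4, 2)
before = [p.data.clone() for p in target.parameters()]

for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# each target parameter moved a fraction tau of the way towards its local counterpart
for old, target_param, local_param in zip(before, target.parameters(), local.parameters()):
    assert torch.allclose(target_param.data, old + tau * (local_param.data - old))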
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.q_optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Policy Network
        self.policy_network_local = PolicyNetwork(state_size, action_size, seed).to(device)
        self.policy_network_target = PolicyNetwork(state_size, action_size, seed).to(device)
        self.policy_optimizer = optim.Adam(self.policy_network_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # Action selection
        self.noise_scale = START_NOISE_SCALE

    def step(self, states, actions, rewards, next_states, dones):
        # With multiple arms we need to save each experience separately in the
        # replay buffer
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for _ in range(20):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
        """
        state = torch.from_numpy(state).float().to(device)
        self.qnetwork_local.eval()
        self.policy_network_local.eval()
        with torch.no_grad():
            action = self.policy_network_local(state).cpu().data.numpy()
        self.qnetwork_local.train()
        self.policy_network_local.train()

        # Add noise to the policy that decays to 0 over time to encourage exploration
        noise = np.random.normal(loc=0, scale=self.noise_scale,
                                 size=(1, self.action_size))
        action += noise
        self.noise_scale *= NOISE_DECAY

        return np.clip(action, a_min=-1, a_max=1)

    def learn(self, experiences, gamma):
        """Update the value (critic) and policy (actor) parameters using a given batch of
        experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update the Q-network (critic)
        argmax_a_next = self.policy_network_target.forward(next_states)
        best_next_Q = self.qnetwork_target.forward(next_states, argmax_a_next)
        Q_target = rewards + gamma * best_next_Q * (1 - dones)
        Q_current = self.qnetwork_local.forward(states, actions)

        self.q_optimizer.zero_grad()
        criterion = torch.nn.MSELoss()
        loss = criterion(Q_current, Q_target.detach())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.q_optimizer.step()

        # Update the policy network (actor)
        argmax_a = self.policy_network_local.forward(states)
        action_values = self.qnetwork_local.forward(states, argmax_a)

        self.policy_optimizer.zero_grad()
        loss = -action_values.mean()  # Negative b/c we're doing gradient ascent
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_network_local.parameters(), 1)
        self.policy_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
        self.soft_update(self.policy_network_local, self.policy_network_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
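
# Hedged sketch of the actor update used above: maximize Q(s, μ(s)) by minimising its
# negative. The tanh policy and the critic below are toy stand-ins for PolicyNetwork /
# QNetwork (which take (state, action) pairs in the real module); nothing here is the
# module's actual implementation.
import torch
import torch.nn as nn
import torch.optim as optim

state_size, action_size, batch = 4, 2, 8
policy = nn.Sequential(nn.Linear(state_size, action_size), nn.Tanh())   # μ(s) in [-1, 1]
critic = nn.Linear(state_size + action_size, 1)                          # Q(s, a)
policy_opt = optim.Adam(policy.parameters(), lr=1e-3)

states = torch.rand(batch, state_size)
actions = policy(states)                                   # differentiable w.r.t. policy weights
q_values = critic(torch.cat([states, actions], dim=1))     # critic evaluates μ(s)

policy_opt.zero_grad()
loss = -q_values.mean()      # negative sign -> gradient *ascent* on the expected Q value
loss.backward()              # gradients flow back through the critic into the policy
policy_opt.step()            # only the policy parameters are updated in this step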
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, td_target_type="DQN"):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            td_target_type (str): "DQN" or "Double DQN", the TD-target computation to use
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        assert td_target_type in {"DQN", "Double DQN"}
        self.td_target_type = td_target_type

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        criterion = torch.nn.MSELoss()
        # reuse the optimizer created in __init__; recreating it on every call would
        # discard the Adam moment estimates
        self.optimizer.zero_grad()

        if self.td_target_type == "DQN":
            # compute the Q target using the target network only
            best_next_Q = (
                self.qnetwork_target.forward(next_states)
                .detach()
                .max(1)[0]
                .unsqueeze(1)
            )
        elif self.td_target_type == "Double DQN":
            # select the best next actions using the current (local) network
            best_next_actions = (
                self.qnetwork_local.forward(next_states)
                .detach()
                .max(1)[1]
                .reshape(-1, 1)
            )
            # use the target network to evaluate those actions
            best_next_Q = (
                self.qnetwork_target.forward(next_states)
                .detach()
                .gather(1, best_next_actions)
            )

        Q_target = rewards + gamma * best_next_Q * (1 - dones)
        Q_current = self.qnetwork_local.forward(states).gather(1, actions)

        loss = criterion(Q_current, Q_target)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(
            target_model.parameters(), local_model.parameters()
        ):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data
            )
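
# Hedged usage sketch: an epsilon-greedy training loop driving the Agent above.
# Assumptions (not part of the module): a Gymnasium environment is available
# ("CartPole-v1" is only illustrative), and Agent plus its hyperparameter constants
# are importable from the surrounding module.
import gymnasium as gym

env = gym.make("CartPole-v1")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              seed=0,
              td_target_type="Double DQN")

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(500):
    state, _ = env.reset(seed=episode)
    score = 0.0
    for _ in range(1000):
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.step(state, action, reward, next_state, done)   # store + (maybe) learn
        state, score = next_state, score + reward
        if done:
            break
    eps = max(eps_end, eps_decay * eps)                       # decay exploration over episodes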