class AgentDQ(AgentAbstract):
    """
    Implement Deep Q-Net with Fixed TD-Target computation and Experience Replay

    Fixed TD-Target: the TD-Error is computed on a target (offline) and a local (online) network,
    where the local network weights are copied to the target network every `update_every` batches
    """

    def __init__(self, state_size, action_size, gamma, hidden_layers, drop_p, batch_size,
                 learning_rate, soft_upd_param, update_every, buffer_size, seed):

        super(AgentDQ, self).__init__(state_size, action_size, gamma, hidden_layers, drop_p,
                                      batch_size, learning_rate, soft_upd_param, update_every,
                                      buffer_size, seed)

        # Q-Network Architecture
        self.qnetwork_local = QNetwork(self.state_size, self.action_size, self.seed,
                                       self.hidden_layers, self.drop_p).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, self.seed,
                                        self.hidden_layers, self.drop_p).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)

        # Experience Replay
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def _forward_local(self, states, actions):
        """
        Returns
        ======
            ps_local (torch.tensor)
        """
        ps_local = self.qnetwork_local.forward(states).gather(1, actions)

        return ps_local

    def _forward_targets(self, rewards, next_states, dones):
        """
        Use Fixed TD-Target Algorithm

        Returns
        ======
            ps_target (torch.tensor)
        """
        # Fixed Q-Targets
        # Use the target network to compute r + gamma * max_a q_est(s', a; w-);
        # this tensor is detached so no gradients flow through the target network in backprop
        ps_target = rewards + self.gamma * (1 - dones) * \
            self.qnetwork_target.forward(next_states).detach().max(dim=1)[0].view(-1, 1)

        return ps_target
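# --------------------------------------------------------------------------- #
# The agents in this file depend on a ReplayBuffer exposing add / sample /
# __len__ that is not shown in this section. The sketch below is only a minimal
# illustration of that assumed interface (uniform sampling, batched float
# tensors on `device`, long-typed actions for the discrete-action agents); the
# actual buffer used by these agents may differ.
# --------------------------------------------------------------------------- #
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class UniformReplayBufferSketch:
    """Illustrative stand-in for the ReplayBuffer interface assumed above."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)   # oldest experiences are dropped when full
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a minibatch and stack it into (batch_size, ...) tensors
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)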
class AgentDoubleDQ(AgentAbstract):
    """
    Implement Dueling Q-Net with Double Q-Net (fixed) TD-Target computation and Experience Replay

    Double Q-Net: split action selection and Q evaluation into two steps
    """

    def __init__(self, state_size, action_size, gamma, hidden_layers, drop_p, batch_size,
                 learning_rate, soft_upd_param, update_every, buffer_size, seed):

        super(AgentDoubleDQ, self).__init__(state_size, action_size, gamma, hidden_layers, drop_p,
                                            batch_size, learning_rate, soft_upd_param, update_every,
                                            buffer_size, seed)

        # Q-Network Architecture: Dueling Q-Nets
        self.qnetwork_local = QNetwork(self.state_size, self.action_size, self.seed,
                                       self.hidden_layers, self.drop_p).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, self.seed,
                                        self.hidden_layers, self.drop_p).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)

        # Experience Replay
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def _forward_local(self, states, actions):
        """
        Returns
        ======
            ps_local (torch.tensor)
        """
        ps_local = self.qnetwork_local.forward(states).gather(1, actions)

        return ps_local

    def _forward_targets(self, rewards, next_states, dones):
        """
        Use Double Q-Net Algorithm

        Returns
        ======
            ps_target (torch.tensor)
        """
        # Step 1: select the greedy next actions with the local (online) network
        ps_actions = self.qnetwork_local.forward(next_states).detach().max(dim=1)[1].view(-1, 1)
        # Step 2: evaluate those actions with the target network
        ps_target = rewards + self.gamma * (1 - dones) * \
            self.qnetwork_target.forward(next_states).detach().gather(1, ps_actions)

        return ps_target
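# --------------------------------------------------------------------------- #
# Toy comparison (illustrative only) of the two target computations above:
# AgentDQ takes the max over the target network's own estimates, while
# AgentDoubleDQ lets the local network pick the action and the target network
# evaluate it. The two-layer MLPs below are hypothetical stand-ins for QNetwork.
# --------------------------------------------------------------------------- #
import torch
import torch.nn as nn

torch.manual_seed(0)
state_size, action_size, batch, gamma = 4, 3, 5, 0.99

q_local = nn.Sequential(nn.Linear(state_size, 16), nn.ReLU(), nn.Linear(16, action_size))
q_target = nn.Sequential(nn.Linear(state_size, 16), nn.ReLU(), nn.Linear(16, action_size))

rewards = torch.rand(batch, 1)
next_states = torch.rand(batch, state_size)
dones = torch.zeros(batch, 1)

# Fixed TD-target (AgentDQ): max over the target network's estimates
dq_target = rewards + gamma * (1 - dones) * \
    q_target(next_states).detach().max(dim=1)[0].view(-1, 1)

# Double Q-Net target (AgentDoubleDQ): local network selects, target network evaluates
greedy_actions = q_local(next_states).detach().max(dim=1)[1].view(-1, 1)
double_dq_target = rewards + gamma * (1 - dones) * \
    q_target(next_states).detach().gather(1, greedy_actions)

print(dq_target.squeeze(), double_dq_target.squeeze())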
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, mode="train"):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            mode (str): if "eval", the agent will not learn or collect experiences
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = PrioritizedBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # Caches the expected action value of the last act
        self.last_action_value = None

        self.set_mode(mode)

    def step(self, state, action, reward, next_state, done):
        if self.mode == "train":
            error = self.calculate_error_eval(state, action, reward, next_state, done)
            # Save experience in replay memory
            self.memory.add(state, action, reward, next_state, done, error)

            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # If enough samples are available in memory, get random subset and learn
                if len(self.memory) > BATCH_SIZE:
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        if self.mode == "eval":
            eps = 0
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            action = random.choice(np.arange(self.action_size))

        self.last_action_value = action_values[0][action].item()
        return action

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN target: the local network selects the greedy next action,
        # the target network evaluates it
        with torch.no_grad():
            local_max = self.qnetwork_local(next_states).argmax(1).unsqueeze(1)
            targets = self.qnetwork_target(next_states).gather(1, local_max)
            target_values = rewards + gamma * targets * (1 - dones)

        predicted_values = self.qnetwork_local(states).gather(1, actions)

        criterion = torch.nn.MSELoss()
        loss = criterion(predicted_values, target_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # Feed the fresh TD errors back to the prioritized buffer
        self.memory.update_errors((target_values - predicted_values).detach().squeeze())

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def calculate_error_eval(self, state, action, reward, next_state, done):
        """Calculates the TD error for a given step."""
        self.qnetwork_target.eval()
        next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            target = self.qnetwork_target(next_state).max()
            target_value = reward + GAMMA * target * (1 - done)
        error = (target_value - self.last_action_value).item()
        self.qnetwork_target.train()
        return error

    def save(self, path=""):
        torch.save(self.qnetwork_local.state_dict(), path + "checkpoint_local.pth")
        torch.save(self.qnetwork_target.state_dict(), path + "checkpoint_target.pth")

    def load(self, path=""):
        self.qnetwork_local.load_state_dict(torch.load(path + "checkpoint_local.pth"))
        self.qnetwork_target.load_state_dict(torch.load(path + "checkpoint_target.pth"))

    def set_mode(self, mode):
        if mode not in {"train", "eval"}:
            raise ValueError("mode must be one of [train, eval]")
        self.mode = mode
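# --------------------------------------------------------------------------- #
# The PrioritizedBuffer used by the agent above (add with a TD error, sample,
# update_errors, __len__) is not shown in this section. The sketch below is a
# guess at a minimal proportional-prioritization buffer matching that interface;
# the real implementation may differ (e.g. a sum tree and importance-sampling
# weights), so treat it purely as an illustration.
# --------------------------------------------------------------------------- #
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class PrioritizedBufferSketch:
    """Illustrative stand-in for the PrioritizedBuffer interface assumed above."""

    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, eps=1e-5):
        self.memory = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.alpha = alpha            # 0 = uniform sampling, 1 = fully prioritized
        self.eps = eps                # keeps zero-error transitions sampleable
        self.last_indices = None      # indices of the most recently sampled batch
        random.seed(seed)
        np.random.seed(seed)

    def add(self, state, action, reward, next_state, done, error):
        self.memory.append(Experience(state, action, reward, next_state, done))
        self.priorities.append((abs(error) + self.eps) ** self.alpha)

    def sample(self):
        probs = np.array(self.priorities) / np.sum(self.priorities)
        self.last_indices = np.random.choice(len(self.memory), self.batch_size, p=probs)
        batch = [self.memory[i] for i in self.last_indices]
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def update_errors(self, errors):
        # Refresh the priorities of the transitions sampled in the last batch
        for i, error in zip(self.last_indices, errors.detach().cpu().numpy()):
            self.priorities[i] = (abs(float(error)) + self.eps) ** self.alpha

    def __len__(self):
        return len(self.memory)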
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network (critic)
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.q_optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Policy Network (actor)
        self.policy_network_local = PolicyNetwork(state_size, action_size, seed).to(device)
        self.policy_network_target = PolicyNetwork(state_size, action_size, seed).to(device)
        self.policy_optimizer = optim.Adam(self.policy_network_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Action selection: Gaussian exploration noise that decays over time
        self.noise_scale = START_NOISE_SCALE

    def step(self, states, actions, rewards, next_states, dones):
        # With multiple arms we need to save each experience separately
        # in the replay buffer
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for _ in range(20):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
        """
        state = torch.from_numpy(state).float().to(device)
        self.qnetwork_local.eval()
        self.policy_network_local.eval()
        with torch.no_grad():
            action = self.policy_network_local(state).cpu().data.numpy()
        self.qnetwork_local.train()
        self.policy_network_local.train()

        # Add noise to the policy that decays to 0 over time to encourage exploration
        noise = np.random.normal(loc=0, scale=self.noise_scale, size=(1, self.action_size))
        action += noise
        self.noise_scale *= NOISE_DECAY

        return np.clip(action, a_min=-1, a_max=1)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update the Q-network (critic)
        argmax_a_next = self.policy_network_target.forward(next_states)
        best_next_Q = self.qnetwork_target.forward(next_states, argmax_a_next)
        Q_target = rewards + gamma * best_next_Q * (1 - dones)
        Q_current = self.qnetwork_local.forward(states, actions)

        self.q_optimizer.zero_grad()
        criterion = torch.nn.MSELoss()
        loss = criterion(Q_current, Q_target.detach())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.q_optimizer.step()

        # Update the policy network (actor)
        argmax_a = self.policy_network_local.forward(states)
        action_values = self.qnetwork_local.forward(states, argmax_a)

        self.policy_optimizer.zero_grad()
        loss = -action_values.mean()  # negative because we perform gradient ascent
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_network_local.parameters(), 1)
        self.policy_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
        self.soft_update(self.policy_network_local, self.policy_network_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
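# --------------------------------------------------------------------------- #
# Hypothetical training-loop sketch for the actor-critic agent above. It assumes
# a vectorized environment `env` whose reset() returns an array of per-arm states
# and whose step(actions) returns (next_states, rewards, dones), one row per arm.
# Function and variable names here are illustrative, not part of the agent code.
# --------------------------------------------------------------------------- #
import numpy as np


def train_multi_arm(env, agent, n_episodes=200, max_t=1000):
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                      # shape: (num_arms, state_size)
        episode_scores = np.zeros(len(states))
        for _ in range(max_t):
            actions = np.vstack([agent.act(s) for s in states])
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            episode_scores += rewards
            states = next_states
            if np.any(dones):
                break
        scores.append(episode_scores.mean())
        print(f"Episode {episode}\taverage score: {scores[-1]:.2f}")
    return scores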
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed,
                 local_filename=None, target_filename=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            local_filename (str): optional checkpoint to load into the local network
            target_filename (str): optional checkpoint to load into the target network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)

        # If checkpoint filenames are given, load them
        if local_filename is not None:
            self.qnetwork_local.load_state_dict(torch.load(local_filename))
        if target_filename is not None:
            self.qnetwork_target.load_state_dict(torch.load(target_filename))

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, local_save_filename, target_save_filename):
        torch.save(self.qnetwork_local.state_dict(), local_save_filename)
        torch.save(self.qnetwork_target.state_dict(), target_save_filename)
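# --------------------------------------------------------------------------- #
# Brief, hypothetical checkpointing round trip for the agent above. The sizes,
# filenames, and placeholder observation are illustrative; the module-level
# constants (LR, BUFFER_SIZE, ...) are assumed to be defined elsewhere.
# --------------------------------------------------------------------------- #
import numpy as np

agent = Agent(state_size=37, action_size=4, seed=0)
# ... train for a while ...
agent.save("checkpoint_local.pth", "checkpoint_target.pth")

# Later: restore both networks directly through the constructor
restored = Agent(state_size=37, action_size=4, seed=0,
                 local_filename="checkpoint_local.pth",
                 target_filename="checkpoint_target.pth")

state = np.zeros(37, dtype=np.float32)      # placeholder observation
action = restored.act(state, eps=0.0)       # greedy action from the restored policy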
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get targets by a forward pass of the next states through the target network
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target.forward(next_states),
                                       dim=1, keepdim=True)[0]

        # Distinguish between terminal and non-terminal next states:
        # for terminal states the target is only the one-step reward
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get outputs by a forward pass of the states through the local network.
        # Note: the Q-network returns, for a given state, the values of all actions;
        # since we know which action was taken, we gather the corresponding action values
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        # Compute the mean squared error of the Bellman equation
        loss = F.mse_loss(Q_expected, Q_targets)

        # Clear the gradient buffer from the previous iteration
        self.optimizer.zero_grad()
        # Backprop the error through the local network
        loss.backward()
        # Update the weights of the local network by taking one SGD step
        self.optimizer.step()

        # Update the target network by blending in the latest weights of the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
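# --------------------------------------------------------------------------- #
# Standalone toy check (not part of the agent) of what the soft update does:
# every parameter of the target network moves a fraction tau of the way toward
# the corresponding local parameter, θ_target ← τ·θ_local + (1 − τ)·θ_target.
# --------------------------------------------------------------------------- #
import torch
import torch.nn as nn

local = nn.Linear(2, 2, bias=False)
target = nn.Linear(2, 2, bias=False)
tau = 0.1

expected = tau * local.weight.data + (1.0 - tau) * target.weight.data
for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

print(torch.allclose(target.weight.data, expected))   # True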
class Agent():
    """The agent object to interact with the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE, gamma=GAMMA, tau=TAU, lr=LR, update_every=4):
        '''Initialize the agent.

        Params
        ======
            state_size (int): dimension of state space
            action_size (int): dimension of action space
            seed (int): random seed
            buffer_size (int): maximum size of the replay buffer
            batch_size (int): size of each training batch
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): the learning rate
            update_every (int): how often to update the network
        '''
        # Initialize important parameters (store the hyperparameters so the
        # constructor arguments are actually used instead of the module constants)
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every

        # Initialize Q-Networks
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Initialize replay memory buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every `update_every` steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `update_every` time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
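# --------------------------------------------------------------------------- #
# Hypothetical instantiation of the configurable agent above; the overridden
# values are illustrative, and the defaults fall back to the module-level
# constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR).
# --------------------------------------------------------------------------- #
agent = Agent(state_size=8, action_size=4, seed=42,
              batch_size=128,      # larger minibatches
              gamma=0.95,          # shorter effective horizon
              tau=5e-3,            # faster-tracking target network
              lr=1e-4,             # smaller Adam step size
              update_every=8)      # learn half as often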
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, td_target_type="DQN"):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            td_target_type (str): "DQN" or "Double DQN" target computation
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        assert td_target_type in {"DQN", "Double DQN"}
        self.td_target_type = td_target_type

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        criterion = torch.nn.MSELoss()
        # Reuse the optimizer created in __init__ so Adam's moment estimates persist
        # across updates (creating a new optimizer here would reset them every call).
        self.optimizer.zero_grad()

        if self.td_target_type == "DQN":
            # Compute the Q target using the target network only
            best_next_Q = (
                self.qnetwork_target.forward(next_states)
                .detach()
                .max(1)[0]
                .unsqueeze(1)
            )
        elif self.td_target_type == "Double DQN":
            # Select the best actions using the current (local) network
            best_next_actions = (
                self.qnetwork_local.forward(next_states)
                .detach()
                .max(1)[1]
                .reshape(-1, 1)
            )
            # Use the target network to evaluate those actions
            best_next_Q = (
                self.qnetwork_target.forward(next_states)
                .detach()
                .gather(1, best_next_actions)
            )

        Q_target = rewards + gamma * best_next_Q * (1 - dones)
        Q_current = self.qnetwork_local.forward(states).gather(1, actions)

        loss = criterion(Q_current, Q_target)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data)
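# --------------------------------------------------------------------------- #
# Hypothetical training loop for the switchable DQN / Double DQN agent above,
# assuming a Gymnasium-style environment (reset() returns (obs, info) and
# step() returns a 5-tuple). Episode counts and the epsilon schedule are
# illustrative, not taken from the agent code.
# --------------------------------------------------------------------------- #
import numpy as np


def train_dqn(env, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0,
                  td_target_type="Double DQN")
    eps = eps_start
    scores = []
    for episode in range(1, n_episodes + 1):
        state, _ = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)     # anneal exploration over episodes
    return agent, scores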