class Agent(): """Interacts with and learns from the environment""" def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')): """DQN agent Args: state_size (int): dimension of each state action_size (int): dimension of each action (or the number of action choices) seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.device = device # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, fc1_units=fc1_units, fc2_units=fc2_units).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, fc1_units=fc1_units, fc2_units=fc2_units).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Initialze qnetwork_target parameters to qnetwork_local self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device) # Initialize the time step counter (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subnet and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Args: state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # Set qnetwork_local to evaluation mode self.qnetwork_local.eval() # This operation should not be included in gradient calculation with torch.no_grad(): action_values = self.qnetwork_local(state) # Set back qnetwork_local to training mode self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Args: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q tagets for current states with actual rewards Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ----- Update the target network ----- self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. theta_target = tau * theta_local + (1 - tau) * theta_target Args: local_model (torch.nn.Module): weights will be copied from target_model (torch.nn.MOdule): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed) self.qnetwork_target = QNetwork(state_size, action_size, seed) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.q_local = QNetwork(state_size, action_size, seed)
        self.q_target = QNetwork(state_size, action_size, seed)
        self.optimizer = optim.Adam(self.q_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0  # time-step counter for the UPDATE_EVERY schedule

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, epsilon):
        state = torch.from_numpy(state).float().unsqueeze(0)  # add batch dimension
        self.q_local.eval()  # set q_local to evaluation mode (same as q_local.train(False))
        with torch.no_grad():  # no gradients needed when only selecting an action
            action_values = self.q_local(state)
        self.q_local.train()  # back to training mode
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences

        # TD target (Double DQN): select the greedy action with the local network and
        # evaluate it with the target network; both terms are detached so no gradient
        # flows through the target.
        best_actions = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
        evaluations = self.q_target(next_states).detach().gather(1, best_actions)
        Q_target = rewards + evaluations * gamma * (1 - dones)  # target is zero past terminal states

        # Currently predicted Q value
        Q_expected = self.q_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.q_local, self.q_target)

    def soft_update(self, local_model, target_model, tau=TAU):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
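# Most of the agents in this file share a ReplayBuffer with the interface used above
# (add, sample, __len__). A minimal sketch is shown here; the tensor dtypes, the
# uniform-sampling strategy, and the omission of device handling are assumptions based
# on how the samples are consumed.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)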
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, lr_decay=0.985): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ print("Running on: " + str(device)) self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network if USING_DUELING: self.qnetwork_local = DuelQNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = DuelQNetwork(state_size, action_size, seed).to(device) else: self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.lr_scheduler = optim.lr_scheduler.ExponentialLR( self.optimizer, lr_decay) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences if USING_DOUBLE_DQN: # Getting the actions with maximum reward from the local model next_actions_local = self.qnetwork_local(next_states).max( dim=1, keepdim=True)[1] # Get the reward from the target model for the selected actions Q_targets_next = self.qnetwork_target(next_states).gather( 1, next_actions_local) else: # Max predicted values for the next state from the target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Q targets for current state Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Q expected values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets) # Minimizing loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
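# The USING_DUELING branch above expects a DuelQNetwork class. A minimal sketch of a
# dueling architecture (Wang et al., 2016) with the same constructor signature is
# given below; the hidden-layer size is an assumption.
import torch
import torch.nn as nn
import torch.nn.functional as F


class DuelQNetwork(nn.Module):
    """Dueling Q-network: separate state-value and advantage streams."""

    def __init__(self, state_size, action_size, seed, fc_units=64):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.feature = nn.Linear(state_size, fc_units)
        self.value = nn.Linear(fc_units, 1)                 # state-value stream V(s)
        self.advantage = nn.Linear(fc_units, action_size)   # advantage stream A(s, a)

    def forward(self, state):
        x = F.relu(self.feature(state))
        value = self.value(x)
        advantage = self.advantage(x)
        # Subtracting the mean advantage keeps the V/A decomposition identifiable.
        return value + advantage - advantage.mean(dim=1, keepdim=True)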
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma, double_dqn=True): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences if (double_dqn): # --------------- # double DQN # --------------- # get the Q values for best actions in observations # based off the current Q network # max(Q(s', a', theta_i)) wrt a' Q_local_values = self.qnetwork_local(next_states).detach() _, a_prime = Q_local_values.max(1) # get Q values from frozen network (i.e. 
target network) for next state and chosen action # Q(s',argmax(Q(s',a', theta_i), theta_i_frozen)) (argmax wrt a') Q_target_values = self.qnetwork_target(next_states).detach() Q_target_s_a_prime = Q_target_values.gather( 1, a_prime.unsqueeze(1)) #Q_target_s_a_prime = Q_target_s_a_prime.squeeze() #print('Q_target_s_a_prime', Q_target_s_a_prime.size()) # Compute Q targets for next states Q_target_s_a_prime = rewards + (gamma * Q_target_s_a_prime * (1 - dones)) #print('Q_target_s_a_prime2', Q_target_s_a_prime.size()) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) #print('Q_expected', Q_expected.size()) # Compute loss loss = F.mse_loss(Q_expected, Q_target_s_a_prime) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() else: # --------------- # regular DQN # --------------- # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) #print('Q_targets_next', Q_targets_next.size()) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) #print('Q_targets', Q_targets.size()) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) #print('Q_expected', Q_expected.size()) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
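# A tiny, self-contained illustration of the tensor mechanics used in the double-DQN
# branch above: argmax over the local network's next-state values picks the action
# index, and gather() reads the target network's value for that index. The numbers
# below are made up purely to show the shapes.
import torch

q_local_next = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # local net, batch of 2 states, 2 actions
q_target_next = torch.tensor([[0.9, 2.5], [1.8, 0.4]])  # target net, same shape

a_prime = q_local_next.max(1)[1]                         # tensor([1, 0]), shape (2,)
q_sel = q_target_next.gather(1, a_prime.unsqueeze(1))    # tensor([[2.5], [1.8]]), shape (2, 1)
print(a_prime, q_sel)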
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, lr_decay=0.9999, double_dqn=False, duel_dqn=False, prio_exp=False): """Initialize an Agent object. Params ====== state_size (int): Dimension of each State action_size (int): Dimension of each Action seed (int): Random Seed lr_decay (float): Decay float for alpha learning rate DOUBLE DQN (boolean): Indicator for Double Deep Q-Network DUEL DQN (boolean): Indicator for Duel Deep Q-Network PRIORITISED_EXPERIENCE (boolean): Indicator for Prioritized Experience Replay """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.lr_decay = lr_decay self.DOUBLE_DQN = double_dqn self.DUEL_DQN = duel_dqn self.PRIORITISED_EXPERIENCE = prio_exp # Determine Deep Q-Network for use if self.DUEL_DQN: self.qnetwork_local = DuelQNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = DuelQNetwork(state_size, action_size, seed).to(device) else: self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) # Initialize Optimizer self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Determine if Prioritized Experience will be used if self.PRIORITISED_EXPERIENCE: self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, alpha=0.6, beta=0.4, beta_anneal=1.0001) else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if self.PRIORITISED_EXPERIENCE: states, actions, rewards, next_states, dones, weights = experiences else: states, actions, rewards, next_states, dones = experiences if self.DOUBLE_DQN: # Select max Action for Next State from Local NN max_action = self.qnetwork_local(next_states).detach().max( 1)[1].unsqueeze(1) # Evaluate max Action with Target NN Q_targets_next = self.qnetwork_target(next_states).gather( 1, max_action) else: # Get Max Predicted Q values for next state from Target NN Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Predicted Q values for current state Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get Expected Q values from Local NN Q_expected = self.qnetwork_local(states).gather(1, actions) if self.PRIORITISED_EXPERIENCE: td_error = (Q_expected - Q_targets).squeeze_() # Compute TD Error td_error_detached = td_error.detach() self.memory.update_probabilities( td_error_detached) # Update Probabilities loss = ((td_error**2) * weights).mean() # Compute Weighted Loss else: loss = F.mse_loss(Q_expected, Q_targets) # Compute Loss # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- Update Target Network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
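# The PRIORITISED_EXPERIENCE branch above expects a PrioritizedReplayBuffer exposing
# add / sample / update_probabilities / __len__. The sketch below uses plain
# proportional prioritisation over a list (no sum-tree) and remembers the indices of
# the last sampled batch so update_probabilities can reassign their priorities; the
# implementation details are assumptions about the original class, not its actual code.
import random
from collections import namedtuple

import numpy as np
import torch


class PrioritizedReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed,
                 alpha=0.6, beta=0.4, beta_anneal=1.0001, eps=1e-5):
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.alpha, self.beta, self.beta_anneal, self.eps = alpha, beta, beta_anneal, eps
        self.memory, self.priorities = [], []
        self.last_indices = None
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)
        np.random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        max_prio = max(self.priorities, default=1.0)   # new samples get the current max priority
        if len(self.memory) >= self.buffer_size:
            self.memory.pop(0)
            self.priorities.pop(0)
        self.memory.append(self.experience(state, action, reward, next_state, done))
        self.priorities.append(max_prio)

    def sample(self):
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.memory), self.batch_size, p=probs)
        self.last_indices = idx
        # Importance-sampling weights, annealed towards beta = 1 over time.
        weights = (len(self.memory) * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        self.beta = min(1.0, self.beta * self.beta_anneal)
        batch = [self.memory[i] for i in idx]
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long()
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float()
        weights = torch.from_numpy(weights).float()
        return states, actions, rewards, next_states, dones, weights

    def update_probabilities(self, td_errors):
        # New priority = |TD error| + eps for each transition in the last sampled batch.
        for i, err in zip(self.last_indices, td_errors.cpu().numpy()):
            self.priorities[i] = float(abs(err)) + self.eps

    def __len__(self):
        return len(self.memory)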
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local( state) # same as self.qnetwork_local.forward(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss # "*** YOUR CODE HERE ***" qs_local = self.qnetwork_local.forward(states) qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] qsa_local = qsa_local.reshape((BATCH_SIZE, 1)) # print(qsa_local.shape) # # DQN Target # qs_target = self.qnetwork_target.forward(next_states) # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) # TD_target = rewards + gamma * qsa_target # #print(qsa_target.shape, TD_target.shape, rewards.shape) # # Double DQN Target ver 1 # qs_target = self.qnetwork_target.forward(next_states) # if random.random() > 0.5: # _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)] # else: # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) # #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] # ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) # TD_target = rewards + gamma * qsa_target # Double DQN Target ver 2 (based upon double dqn paper) qs_target = self.qnetwork_target.forward(next_states) _, qsa_local_argmax_a = torch.max( qs_local, dim=1) # using the greedy policy (q-learning) qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] qsa_target = qsa_target * ( 1 - dones.reshape(BATCH_SIZE) ) # target qsa value is zero when episode is complete qsa_target = qsa_target.reshape((BATCH_SIZE, 1)) TD_target = rewards + gamma * qsa_target # print(qsa_target.shape, TD_target.shape, rewards.shape) # #Udacity's approach # # Get max predicted Q values (for next states) from target model # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # # Compute Q targets for current states # TD_target = rewards + (gamma * Q_targets_next * (1 - dones)) # # Get expected Q values from local model # qsa_local = self.qnetwork_local(states).gather(1, actions) # diff = qsa_local - TD_target # loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar loss = F.mse_loss( qsa_local, TD_target) # much faster than the above loss function # print(loss) # minimize the loss self.optimizer.zero_grad() # clears the gradients loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DQNAgent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. :param state_size: (int) dimension of each state :param action_size: (int) dimension of each action :param seed: (int) random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=PARAM.LR) # Replay memory self.memory = ReplayBuffer(action_size, PARAM.BUFFER_SIZE, PARAM.BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): """ Adds the current state-action value to the memory and lets the agent learn if UPDATE_EVERY many steps are taken and the memory has more entries then BATCH_SIZE. :param state: current state :param action: taken action :param reward: received reward :param next_state: next state seen after action :param done: boolean if the episode ended after the action """ # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY if self.t_step == 0: if len(self.memory) > PARAM.BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, PARAM.GAMMA) def act(self, state, eps=0.): """ Returns actions for given state as per current policy. :param state: (array_like) current state :param eps: (float) epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def get_dqg_target(self, next_states, rewards, gamma, dones): """ Gets the state-action value of the target network. That is, the current estimate of the target network for the next state including the seen reward. :param next_states: next state for each entry in the sampled mini batch :param rewards: rewards seen for each sample in the mini batch :param gamma: decay factor for current estimate :param dones: indicator if the episode ended for each sample in the mini batch :return: """ # Get predicted Q values qtarget_values = self.qnetwork_target(next_states).detach() # get max of it best_qtarget_value = qtarget_values.max(1) # reduce one dimension best_qtarget_value = best_qtarget_value[0] # reshape to 2d matrix with one value in it for 1st dimension (so difference can be calculated) # >>> torch.unsqueeze(x, 1) # tensor([[ 1], # [ 2], # [ 3], # [ 4]]) best_qtarget_value = best_qtarget_value.unsqueeze(1) # use vector formulation of: # if dones == 1: # Q_targets = rewards # else: # Q_targets = rewards + (gamma * best_qtarget_value) q_targets = rewards + (gamma * best_qtarget_value * (1 - dones)) return q_targets def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples. 
:param experiences: (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples :param gamma: (float) discount factor """ states, actions, rewards, next_states, dones = experiences Q_targets = self.get_dqg_target(next_states, rewards, gamma, dones) # Get expected Q values q_exp = self.qnetwork_local(states) # gets the q values along dimention 1 according to the actions, which is used as index # >>> t = torch.tensor([[1,2],[3,4]]) # >>> torch.gather(t, 1, torch.tensor([[0],[1]])) # tensor([[ 1], # [ 4]]) q_exp = q_exp.gather(1, actions) # compute loss loss = F.mse_loss(q_exp, Q_targets) # reset optimizer gradient self.optimizer.zero_grad() # do backpropagation loss.backward() # do optimize step self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target :param local_model: (PyTorch model) weights will be copied from :param target_model: (PyTorch model) weights will be copied to :param tau: (float) interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
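# The DQNAgent above reads its hyperparameters from a PARAM object rather than from
# module-level constants. A minimal stand-in with the attributes the class accesses is
# sketched below; the concrete values are assumptions.
from types import SimpleNamespace

PARAM = SimpleNamespace(
    BUFFER_SIZE=int(1e5),  # replay buffer size
    BATCH_SIZE=64,         # minibatch size
    GAMMA=0.99,            # discount factor
    TAU=1e-3,              # soft-update interpolation factor
    LR=5e-4,               # learning rate
    UPDATE_EVERY=4,        # learning frequency in environment steps
)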
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, buffer_size, batch_size, lr, tau, sequential_sampling_fre): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed buffer_size(int):replay buffer size batch_size(int): minibatch size lr(float):learning rate tau(float):for soft update of target parameters sequential_sampling_fre(int):Ratio of random sampling to sequential sampling """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.batch_size = batch_size self.buffer_size = buffer_size self.tau = tau # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, sequential_sampling_fre) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_local_argmax = self.qnetwork_local(next_states).max(1)[1].unsqueeze( 1) Q_targets_next_states = self.qnetwork_target( next_states).detach().gather(1, Q_local_argmax) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next_states * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, model='DQN', buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, update_every=4, pretrained_model_file=None): if model not in ('DQN', 'DDQN'): raise ValueError('Current model supports DQN or DDQN') """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed model (str): currently suports DQN and DDQN buffer size (int): replay buffer size batch size (int): minibatch size gamma (float): discount factor tau (float): for soft update of target parameters lr (float): learning rate update_every (int): how often to update the network pretrained_model_file (str): filepath to .pth file with pretrained model weights """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.model = model # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) if pretrained_model_file: weights = torch.load(pretrained_model_file) self.qnetwork_local.load_state_dict(weights) self.qnetwork_target.load_state_dict(weights) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model if self.model == 'DQN': Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) if self.model == 'DDQN': argmax_actions = self.qnetwork_local(next_states).detach().max( 1)[1].unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).gather( 1, argmax_actions) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
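# Example usage of the configurable agent above: train once, save the local network's
# weights, then rebuild an agent that starts from those weights. The state/action sizes
# and the checkpoint file name are placeholders.
agent = Agent(state_size=8, action_size=4, seed=0, model='DDQN')
# ... training loop runs here ...
torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')

# Later, restore both networks from the saved weights via pretrained_model_file.
pretrained_agent = Agent(state_size=8, action_size=4, seed=0, model='DDQN',
                         pretrained_model_file='checkpoint.pth')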
class agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, duel, fc1_units, fc2_units, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action fc1_units : number of nodes in the first hidden layer fc2_units : number of nodes in the second hidden layer seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) #Chose betweeen regulat Q-Network or duel architecture #if(duel): # self.qnetwork_local = Duel_QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device) # self.qnetwork_target = Duel_QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device) #else: # self.qnetwork_local = QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device) # self.qnetwork_target = QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device) self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Visualize network print(self.qnetwork_local) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state_size, state, action, reward, next_state, done, dqn): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() if (dqn): self.DQN_learn(experiences, state_size, GAMMA) else: self.DDQN_learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().to(device) # Model eval notify layers in model.py that it is eval mode self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def DQN_learn(self, experiences, state_size, gamma): """Learn using the DQN algorithm. Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences states = states.view(BATCH_SIZE, 4, state_size[0], state_size[1]) next_states = next_states.view(BATCH_SIZE, 4, state_size[0], state_size[1]) # # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from the local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss using element-wise mean squared error. 
loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def DDQN_learn(self, experiences, gamma): """DDQN version Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model # DQN #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) #DDQN Q_local_argmax = self.qnetwork_local(next_states).detach().max( 1)[1].unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).gather( 1, Q_local_argmax) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. teta_target = ro*teta_local + (1 - ro)*teta_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
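# DQN_learn above reshapes each state batch to (BATCH_SIZE, 4, H, W), i.e. four stacked
# frames, which implies a convolutional Q-network for pixel observations. A minimal
# sketch of such a network is given below; the layer sizes, the 84x84 input resolution,
# and the constructor signature are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvQNetwork(nn.Module):
    """Convolutional Q-network for 4-frame stacked image observations."""

    def __init__(self, action_size, seed, in_channels=4):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 512)   # 7x7 spatial size assumes 84x84 inputs
        self.fc2 = nn.Linear(512, action_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)   # flatten
        x = F.relu(self.fc1(x))
        return self.fc2(x)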
class SAC(object): def __init__(self): self.gamma = 0.99 self.tau = 0.005 self.alpha = 0.2 self.lr = 0.003 self.target_update_interval = 1 self.device = torch.device("cpu") # 8 phases self.num_inputs = 8 self.num_actions = 1 self.hidden_size = 256 self.critic = QNetwork(self.num_inputs, self.num_actions, self.hidden_size).to(self.device) self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr) self.critic_target = QNetwork(self.num_inputs, self.num_actions, self.hidden_size).to(self.device) hard_update(self.critic_target, self.critic) # Copy the parameters of critic to critic_target self.target_entropy = -torch.Tensor([1.0]).to(self.device).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr) self.policy = GaussianPolicy(self.num_inputs, self.num_actions, self.hidden_size).to(self.device) self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr) def select_action(self, state): state = torch.FloatTensor(state).to(self.device) # TODO _, _, action = self.policy.sample(state) return action.detach().cpu().numpy()[0] # action is a CUDA tensor, you should do .detach().cpu().numpy(), when # you need a numpy def update_parameters(self, memory, batch_size, updates): # Sample a batch from memory state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample( batch_size=batch_size) action_batch = np.expand_dims(action_batch, axis=1) state_batch = torch.FloatTensor(state_batch).to(self.device) next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) action_batch = torch.FloatTensor(action_batch).to(self.device) reward_batch = torch.FloatTensor(reward_batch).to( self.device).unsqueeze(1) mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1) # Unsqueeze: add one dimension to the index with torch.no_grad(): next_state_action, next_state_log_pi, _ = self.policy.sample( next_state_batch) qf1_next_target, qf2_next_target = self.critic_target( next_state_batch, next_state_action) min_qf_next_target = torch.min( qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi next_q_value = reward_batch + mask_batch * self.gamma * ( min_qf_next_target) qf1, qf2 = self.critic( state_batch, action_batch ) # Two Q-functions to mitigate positive bias in the policy improvement step qf1_loss = F.mse_loss( qf1, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf2_loss = F.mse_loss( qf2, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf_loss = qf1_loss + qf2_loss self.critic_optimizer.zero_grad() # Clear the cumulative grad qf_loss.backward() # Get grad via backward() self.critic_optimizer.step() # Update the para via grad pi, log_pi, _ = self.policy.sample(state_batch) qf1_pi, qf2_pi = self.critic(state_batch, pi) min_qf_pi = torch.min(qf1_pi, qf2_pi) policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() # automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() # TODO self.alpha_optimizer.zero_grad() alpha_loss.backward() self.alpha_optimizer.step() self.alpha = self.log_alpha.exp() alpha_tlogs = self.alpha.clone() # For TensorboardX logs if updates % self.target_update_interval == 0: soft_update(self.critic_target, self.critic, self.tau) return qf1_loss.item(), qf2_loss.item(), policy_loss.item( 
), alpha_loss.item(), alpha_tlogs.item() # Save model parameters def save_model(self, env_name, suffix="", actor_path=None, critic_path=None): # Create a dir package in the current location if not os.path.exists('models/'): os.makedirs('models/') if actor_path is None: actor_path = "models/sac_actor_{}_{}".format(env_name, suffix) if critic_path is None: critic_path = "models/sac_critic_{}_{}".format(env_name, suffix) print('Saving models to {} and {}'.format(actor_path, critic_path)) torch.save(self.policy.state_dict(), actor_path) # state_dict() stores the parameters of layers and optimizers which have grad torch.save(self.critic.state_dict(), critic_path) # Load model parameters def load_model(self, actor_path, critic_path): print('Loading models from {} and {}'.format(actor_path, critic_path)) if actor_path is not None: self.policy.load_state_dict(torch.load(actor_path)) if critic_path is not None: self.critic.load_state_dict(torch.load(critic_path)) def get_alpha(self): return self.alpha
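# The SAC class above calls hard_update() and soft_update() helpers that are defined
# elsewhere. Minimal versions consistent with how they are called (target network first,
# source network second) are sketched here.
def hard_update(target, source):
    """Copy source parameters into target (used once at initialisation)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak averaging: theta_target = tau * theta_source + (1 - tau) * theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)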
class soft_actor_critic_agent(object): def __init__(self, num_inputs, action_space, \ device, hidden_size, seed, lr, gamma, tau, alpha): self.gamma = gamma self.tau = tau self.alpha = alpha self.device = device self.seed = seed self.seed = torch.manual_seed(seed) torch.cuda.manual_seed(seed) #torch.cuda.manual_seed_all(seed) #torch.backends.cudnn.deterministic=True self.critic = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(device=self.device) self.critic_optim = Adam(self.critic.parameters(), lr=lr) self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(self.device) hard_update(self.critic_target, self.critic) # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper self.target_entropy = -torch.prod( torch.Tensor(action_space.shape).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = Adam([self.log_alpha], lr=lr) self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0], \ hidden_size, action_space).to(self.device) self.policy_optim = Adam(self.policy.parameters(), lr=lr) def select_action(self, state, eval=False): state = torch.FloatTensor(state).to(self.device).unsqueeze(0) if eval == False: action, _, _ = self.policy.sample(state) else: _, _, action = self.policy.sample(state) return action.detach().cpu().numpy()[0] def update_parameters(self, memory, batch_size, updates): # Sample a batch from memory state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample( batch_size=batch_size) state_batch = torch.FloatTensor(state_batch).to(self.device) next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) action_batch = torch.FloatTensor(action_batch).to(self.device) reward_batch = torch.FloatTensor(reward_batch).to( self.device).unsqueeze(1) mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1) with torch.no_grad(): next_state_action, next_state_log_pi, _ = self.policy.sample( next_state_batch) qf1_next_target, qf2_next_target = self.critic_target( next_state_batch, next_state_action) min_qf_next_target = torch.min( qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi next_q_value = reward_batch + mask_batch * self.gamma * ( min_qf_next_target) # Two Q-functions to mitigate positive bias in the policy improvement step qf1, qf2 = self.critic(state_batch, action_batch) qf1_loss = F.mse_loss(qf1, next_q_value) qf2_loss = F.mse_loss(qf2, next_q_value) pi, log_pi, _ = self.policy.sample(state_batch) qf1_pi, qf2_pi = self.critic(state_batch, pi) min_qf_pi = torch.min(qf1_pi, qf2_pi) policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() self.critic_optim.zero_grad() qf1_loss.backward() self.critic_optim.step() self.critic_optim.zero_grad() qf2_loss.backward() self.critic_optim.step() self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_tlogs = self.alpha.clone() # For TensorboardX logs soft_update(self.critic_target, self.critic, self.tau)
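# Both SAC classes above sample (state, action, reward, next_state, mask) batches as
# NumPy arrays from a replay memory. A minimal memory with that sample() signature is
# sketched below; the push() method and the mask convention (mask = 1 for non-terminal
# transitions) are assumptions about the original class.
import random
import numpy as np


class ReplayMemory:
    def __init__(self, capacity, seed=0):
        random.seed(seed)
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, mask):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, mask)
        self.position = (self.position + 1) % self.capacity   # circular buffer

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, mask = map(np.stack, zip(*batch))
        return state, action, reward, next_state, mask

    def __len__(self):
        return len(self.buffer)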
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network # TODO: initialize action-value function Q with random weights theta self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) # TODO: initialize target action-value function Qhat with weights theta_=theta self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory # TODO: initialize replay memory D to capacity N (circular queue) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # TODO: set s_t1=s_t,a_t,x_t1 and preprocess f_t1=f(s_t1) # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # DONE. TODO: if episode terminates at step j+1, set y_j = r_j # else set y_j = r_j + gamma*max(Qhat(f_j1,a_;theta_)) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # DONE. TODO: every C steps reset Qhat = Q # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: # DONE. TODO: sample random minibatch of transitions (f_j,a_j,r_j,f_j1) from D experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ # Unpack the experiences tuple states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # DONE. TODO: perform a gradient descent step on (y_j - Q(f_j,a_j;theta))^2 with # respect to the network parameters theta # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every LEARN_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Learn every UPDATE_EVERY time steps. # self.t_step = (self.t_step + 1) % LEARN_EVERY # if self.t_step == 0: self.t_step += 1 if done: for _ in range(self.t_step // SOFT_UPDATE_EVERY): # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) # you can use learn_DDQN to enable double q-learning. but on lunarlander, at least, # I don't see any benefit # self.learn_DDQN(experiences, GAMMA) self.t_step = 0 def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_local(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # print(loss) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() def learn_DDQN(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get index of maximum value for next state from Q_expected Q_argmax = self.qnetwork_local(next_states).detach() _, a_prime = Q_argmax.max(1) # print (self.qnetwork_local(states).detach()) # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().gather( 1, a_prime.unsqueeze(1)) # print (Q_targets_next.shape) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # print (Q_targets.shape) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # print (Q_expected.shape) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """ Deep Reinforcement Learning agent that interacts with and learns from the environment. Uses the Double DQN algorithm (see https://arxiv.org/abs/1509.06461) with a Dueling DQN model (see https://arxiv.org/abs/1511.06581). """ def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Use Double DQN: Get predicted actions from local network model local_actions = self.qnetwork_local(next_states).detach().argmax( dim=1).unsqueeze(1) # Get predicted Q values (for next states) from target model using predicted actions Q_targets_next = self.qnetwork_target(next_states).gather( 1, local_actions).detach() # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(object): def __init__(self, n_states, n_actions, hidden_dim, lr, device): """Agent class that choose action and train Args: n_states (int): input dimension n_actions (int): output dimension hidden_dim (int): hidden dimension """ self.device = device self.q_local = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device) self.q_target = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device) self.mse_loss = torch.nn.MSELoss() self.optim = optim.Adam(self.q_local.parameters(), lr=lr) self.n_states = n_states self.n_actions = n_actions # ReplayMemory: trajectory is saved here self.replay_memory = ReplayMemory(10000) def get_action(self, state, eps, check_eps=True): """Returns an action Args: state : 2-D tensor of shape (n, input_dim) eps (float): eps-greedy for exploration Returns: int: action index """ global steps_done sample = random.random() if check_eps==False or sample > eps: with torch.no_grad(): return self.q_local(Variable(state).type(FloatTensor)).data.max(1)[1].view(1, 1) else: ## return LongTensor([[random.randrange(2)]]) return torch.tensor([[random.randrange(self.n_actions)]], device=self.device) def learn(self, experiences, gamma): """Prepare minibatch and train them Args: experiences (List[Transition]): batch of `Transition` gamma (float): Discount rate of Q_target """ if len(self.replay_memory.memory) < BATCH_SIZE: return; transitions = self.replay_memory.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) states = torch.cat(batch.state) actions = torch.cat(batch.action) rewards = torch.cat(batch.reward) next_states = torch.cat(batch.next_state) dones = torch.cat(batch.done) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to newtork q_local (current estimate) Q_expected = self.q_local(states).gather(1, actions) Q_targets_next = self.q_target(next_states).detach().max(1)[0] # Compute the expected Q values Q_targets = rewards + (gamma * Q_targets_next * (1-dones)) self.q_local.train(mode=True) self.optim.zero_grad() loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1)) # backpropagation of loss to NN loss.backward() self.optim.step() def soft_update(self, local_model, target_model, tau): """ tau (float): interpolation parameter""" for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def hard_update(self, local, target): for target_param, param in zip(target.parameters(), local.parameters()): target_param.data.copy_(param.data)
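# --- Hedged sketch: the Transition / ReplayMemory pair assumed above ---------
# The agent above unpacks `Transition(*zip(*transitions))` and reads
# `self.replay_memory.memory`, but neither type is shown in this snippet.
# This is a minimal version consistent with those call sites; the `push`
# method name is an assumption.
import random
from collections import namedtuple, deque

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)   # circular buffer of Transitions

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)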
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # initialize local and target Q-Networks self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # initialize replay buffer self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # initialize time step self.t_step = 0 # initialize parameters self.buffer_size = BUFFER_SIZE self.batch_size = BATCH_SIZE self.gamma = GAMMA self.tau = TAU self.lr = LR self.update_every = UPDATE_EVERY def step(self, state, action, reward, next_state, done): # store experience tuple in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # perform learning step every UPDATE_EVERY time steps self.t_step += 1 is_time_to_update_weights = (self.t_step % UPDATE_EVERY) == 0 if is_time_to_update_weights: # if enough samples in replay_buffer, # get random batch and perform one learning step if len(self.replay_buffer) > BATCH_SIZE: experiences = self.replay_buffer.sample() self.learn(experiences, GAMMA) def act(self, state, epsilon=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # epsilon-greedy action selection action_values = action_values.cpu().data.numpy()[0] optimal_action = np.argmax(action_values) random_action = np.random.choice(np.arange(self.action_size)) action = np.random.choice([optimal_action, random_action], p=[1-epsilon, epsilon]) return np.int32(action) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # get max predicted Q values for next states from target models Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # compute loss loss = F.mse_loss(Q_expected, Q_targets) # minimize loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # target network soft update self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, lr_decay=0.9999): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed lr_decay (float): multiplicative factor of learning rate decay """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) print("Running on: "+str(device)) # Q-Network hidden_layers = [128, 32] if USE_DUELING_NETWORK: hidden_state_value = [64, 32] self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, hidden_layers, hidden_state_value).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, hidden_layers, hidden_state_value).to(device) self.qnetwork_target.eval() else: self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_layers).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_layers).to(device) self.qnetwork_target.eval() self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay) # Replay memory if USE_PRIORITIZED_REPLAY: self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device, alpha=0.6, beta=0.4, beta_scheduler=1.0) else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ # Epsilon-greedy action selection if random.random() > eps: state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, w = experiences ## Compute and minimize the loss with torch.no_grad(): ### Use of Double DQN method if USE_DOUBLE_DQN: ## Select the greedy actions using the QNetwork Local # calculate the pair action/reward for each of the next_states next_action_rewards_local = self.qnetwork_local(next_states) # select the action with the maximum reward for each of the next actions greedy_actions_local = next_action_rewards_local.max(dim=1, keepdim=True)[1] ## Get the rewards for the greedy actions using the QNetwork Target # calculate the pair action/reward for each of the next_states next_action_rewards_target = self.qnetwork_target(next_states) # get the target reward for each of the greedy actions selected following the local network target_rewards = next_action_rewards_target.gather(1, greedy_actions_local) ### Use of Fixed Q-Target else: # calculate the pair action/reward for each of the next_states next_action_rewards = self.qnetwork_target(next_states) # select the maximum reward for each of the next actions target_rewards = next_action_rewards.max(dim=1, keepdim=True)[0] ## Calculate the discounted target rewards target_rewards = rewards + (gamma * target_rewards * (1 - dones)) # calculate the pair action/rewards for each of the states expected_action_rewards = self.qnetwork_local(states) # shape: [batch_size, action_size] # get the reward for each of the actions expected_rewards = expected_action_rewards.gather(1, actions) # shape: [batch_size, 1] if USE_PRIORITIZED_REPLAY: target_rewards.sub_(expected_rewards) target_rewards.squeeze_() target_rewards.pow_(2) with torch.no_grad(): td_error = target_rewards.detach() td_error.pow_(0.5) self.memory.update_priorities(td_error) target_rewards.mul_(w) loss = target_rewards.mean() else: # calculate the loss loss = F.mse_loss(expected_rewards, target_rewards) # perform the back-propagation self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.lr_scheduler.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
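# --- Hedged sketch: a dueling head in the spirit of Wang et al. (2016) -------
# The DuelingQNetwork used above is not shown in this snippet. A minimal
# dueling architecture splits a shared body into value and advantage streams
# and recombines them as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
import torch.nn as nn

class DuelingHead(nn.Module):
    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)                 # V(s)
        self.advantage = nn.Linear(hidden, action_size)   # A(s, a)

    def forward(self, state):
        x = self.body(state)
        v = self.value(x)                                 # [batch, 1]
        a = self.advantage(x)                             # [batch, action_size]
        return v + a - a.mean(dim=1, keepdim=True)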
class SAC(object): """ SAC class from Haarnoja et al. (2018) We leave the option to use automatice_entropy_tuning to avoid selecting entropy rate alpha """ def __init__(self, num_inputs, action_space, args): #self.n_flow = args.n_flows #assert self.n_flow == 0 self.num_inputs = num_inputs #self.flow_family = args.flow_family self.num_layers = args.num_layers self.args = args self.gamma = args.gamma self.tau = args.tau self.alpha = args.alpha self.target_update_interval = args.target_update_interval self.automatic_entropy_tuning = args.automatic_entropy_tuning self.device = torch.device("cuda" if args.cuda else "cpu") self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device) self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device) hard_update(self.critic_target, self.critic) if self.automatic_entropy_tuning: self.target_entropy = -torch.prod( torch.Tensor(action_space.shape).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = Adam([self.log_alpha], lr=args.lr) self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, self.num_layers, args).to(self.device) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) def select_action(self, state, eval=False): """ Select action for a state (Train) Sample an action from NF{N(mu(s),Sigma(s))} (Eval) Pass mu(s) through NF{} """ state = torch.FloatTensor(state).to(self.device).unsqueeze(0) if not eval: self.policy.train() action, _, _, _, _ = self.policy.evaluate(state) else: self.policy.eval() action, _, _, _, _ = self.policy.evaluate(state, eval=True) action = action.detach().cpu().numpy() return action[0] def update_parameters(self, memory, batch_size, updates): """ Update parameters of SAC-NF Exactly like SAC, but keep two separate Adam optimizers for the Gaussian policy AND the NF layers .backward() on them sequentially """ state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample( batch_size=batch_size) state_batch = torch.FloatTensor(state_batch).to(self.device) next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) action_batch = torch.FloatTensor(action_batch).to(self.device) reward_batch = torch.FloatTensor(reward_batch).to( self.device).unsqueeze(1) mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1) # for visualization info = {} ''' update critic ''' with torch.no_grad(): next_state_action, next_state_log_pi, _, _, _ = self.policy.evaluate( next_state_batch) qf1_next_target, qf2_next_target = self.critic_target( next_state_batch, next_state_action) min_qf_next_target = torch.min( qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi next_q_value = reward_batch + mask_batch * self.gamma * ( min_qf_next_target) qf1, qf2 = self.critic( state_batch, action_batch ) # Two Q-functions to mitigate positive bias in the policy improvement step qf1_loss = F.mse_loss( qf1, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf2_loss = F.mse_loss( qf2, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] pi, log_pi, _, _, _ = self.policy.evaluate(state_batch) qf1_pi, qf2_pi = self.critic(state_batch, pi) min_qf_pi = torch.min(qf1_pi, qf2_pi) policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean( ) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] nf_loss = ((self.alpha * 
log_pi) - min_qf_pi).mean() # update self.critic_optim.zero_grad() qf1_loss.backward() self.critic_optim.step() self.critic_optim.zero_grad() qf2_loss.backward() self.critic_optim.step() self.policy_optim.zero_grad() policy_loss.backward() #retain_graph=True) self.policy_optim.step() if self.automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_tlogs = self.alpha.clone() # For TensorboardX logs else: alpha_loss = torch.tensor(0.).to(self.device) alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs # update target value fuctions if updates % self.target_update_interval == 0: soft_update(self.critic_target, self.critic, self.tau) return qf1_loss.item(), qf2_loss.item(), policy_loss.item( ), alpha_loss.item(), alpha_tlogs.item(), info def save_model(self, info): """ Save the weights of the network (actor and critic separately) """ # policy save_checkpoint( { **info, 'state_dict': self.policy.state_dict(), 'optimizer': self.policy_optim.state_dict(), }, self.args, filename='policy-ckpt.pth.tar') # critic save_checkpoint( { **info, 'state_dict': self.critic.state_dict(), 'optimizer': self.critic_optim.state_dict(), }, self.args, filename='critic-ckpt.pth.tar') save_checkpoint( { **info, 'state_dict': self.critic_target.state_dict(), #'optimizer' : self.critic_optim.state_dict(), }, self.args, filename='critic_target-ckpt.pth.tar') def load_model(self, args): """ Jointly or separately load actor and critic weights """ # policy load_checkpoint( model=self.policy, optimizer=self.policy_optim, opt=args, device=self.device, filename='policy-ckpt.pth.tar', ) # critic load_checkpoint( model=self.critic, optimizer=self.critic_optim, opt=args, device=self.device, filename='critic-ckpt.pth.tar', ) load_checkpoint( model=self.critic_target, #optimizer=self.critic_optim, opt=args, device=self.device, filename='critic_target-ckpt.pth.tar', )
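# --- Hedged sketch: the hard_update / soft_update helpers called by SAC ------
# The SAC class above calls hard_update(target, source) once at construction
# and soft_update(target, source, tau) every target_update_interval updates;
# the helpers themselves are not shown. A minimal version consistent with
# those call sites:
def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak averaging: theta_target <- tau*theta_source + (1 - tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)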
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences Q_expected = self.qnetwork_local(states).gather(1, actions) actions_value = self.qnetwork_local.forward(next_states) next_action = torch.unsqueeze(torch.max(actions_value, 1)[1], 1) next_q = self.qnetwork_target.forward(next_states).gather( 1, next_action) Q_targets = rewards + GAMMA * next_q * (1 - dones) loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=42, hidden_layers=[32, 8]): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # detect GPU device self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE, self.device, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_step, done): # Save experience in replay memory self.memory.add(state, action, reward, next_step, done) # Learn every UPDATE_EVERY time steps. self.t_step += 1 if self.t_step % UPDATE_EVERY == 0: if self.memory.length > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, next_states, rewards, dones = experiences self.qnetwork_target.eval() with torch.no_grad(): # get the max expected q-values Q_expected = self.qnetwork_local( next_states ) # gather = multiindex selector, dim=1 indices = actions action_argmax = torch.max(Q_expected, dim=1, keepdim=True)[1] Q_max_expected = Q_expected.gather(1, action_argmax) # get max predicted q-values for next states from target model (action with max value per state) # detach gets the tensor value, unsqueeze makes a matrix with one column Q_targets_next = self.qnetwork_target(next_states) # q-target for current state targets = rewards + gamma * Q_max_expected * ( 1 - dones) #consider only not dones self.qnetwork_target.train() expected = self.qnetwork_local(states).gather(1, actions) loss = torch.sum((expected - targets)**2) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def train(self, env, brain_name, n_episodes=2000, timesteps=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        '''Train the network applying experience replay.

        Params
        ======
            env: Unity environment the agent interacts with
            brain_name (str): name of the brain used to read observations and rewards
            n_episodes (int): number of games played
            timesteps (int): max number of steps to be played in a game
            eps_start (float): initial proportion of random actions in epsilon-greedy action selection
            eps_end (float): final proportion of random actions in epsilon-greedy action selection
            eps_decay (float): epsilon decay rate
        '''
        scores = []
        last_scores = deque(maxlen=100)
        eps = eps_start
        for i_episode in range(n_episodes):
            env_status = env.reset(train_mode=True)[brain_name]
            state = env_status.vector_observations[0]  # get initial state
            score = 0
            for _ in range(timesteps):
                action = self.act(state, eps).astype(int)
                env_status = env.step(action)[brain_name]
                next_state = env_status.vector_observations[0]
                reward = env_status.rewards[0]
                done = env_status.local_done[0]
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores.append(score)
            last_scores.append(score)
            eps = max(eps_end, eps * eps_decay)  # decrease epsilon
            print('\rEpisode {}\tScores mean: {:.2f}'.format(i_episode, np.mean(last_scores)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tLast 100 scores mean: {:.2f}'.format(i_episode, np.mean(last_scores)))
            if np.mean(last_scores) >= 13.0:
                print('\nEnvironment solved in {:d} episodes!\tScores mean: {:.2f}'.format(
                    i_episode - 100, np.mean(last_scores)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores
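# --- Hedged sketch: inspecting the scores returned by train() above ----------
# train() returns the per-episode scores; a typical way to look at them is to
# plot the raw curve together with a moving average. The window size below is
# illustrative.
import numpy as np
import matplotlib.pyplot as plt

def plot_scores(scores, window=100):
    plt.plot(scores, alpha=0.4, label='score')
    if len(scores) >= window:
        avg = np.convolve(scores, np.ones(window) / window, mode='valid')
        plt.plot(range(window - 1, len(scores)), avg, label='{}-episode mean'.format(window))
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.legend()
    plt.show()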
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # Get max predicted Q values for next states from the target model (frozen weights) # # next_states is 64x8 # self.qnetwork_target(next_states) is 64x4 # detach() returns a tensor copy detached from the graph (no gradient) # max(1)[0] returns the the max value in given dim (max value indexes in 2nd array) # => This returns an array of 64 values # Unsqueeze(1)returns a new Tensor of size one inserted at the given position # => This returns a 64X1 tensor Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model (being trained) # x.gather(1, actions) returns a tensor (located on the current device) that is the result of # concataining the input tensor values along the provided dimensions (here the dim indexes are the taken actions indexes) Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): # ------------------- train with mini-batch sample of experiences ------------------- # if len(self.memory) > BATCH_SIZE: # If enough samples are available in memory, get random subset and learn experiences = self.memory.sample() self.learn(experiences, GAMMA) # ------------------- update target network ----------------------------------------- # self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If C (UPDATE_EVERY) steps have been reached, blend weights to the target network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model # - qnetwork_target : apply forward pass for the whole mini-batch # - detach : do not backpropagate # - max : get maximizing action for each sample of the mini-batch (dim=1) # - [0].unsqueeze(1) : transform output into a flat array Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states (y) # - dones : detect if the episode has finished Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model (Q(Sj, Aj, w)) # - gather : for each sample select only the output value for action Aj Q_expected = self.qnetwork_local(states).gather(1, actions) # Optimize over (yj-Q(Sj, Aj, w))^2 # * compute loss loss = F.mse_loss(Q_expected, Q_targets) # * minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, mode='DQN'): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size """ Set Tuning and Hyperparameters """ self.mode = mode self.losses = [] self.ddqn_enabled = False self.ddqn_counter = 0 self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) print("Parameters = {}".format(self.qnetwork_local.parameters())) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def process_action(self, value): L = [ np.array([1, 0, 0]), np.array([-1, 0, 0]), np.array([0, 1, 0]), np.array([0, 0, 1]) ] return L[value] def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model if self.mode == 'DQN': argmax_actions_locals_next = self.qnetwork_local(next_states).max( 1)[1].unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).gather( 1, argmax_actions_locals_next) if self.mode == 'DDQN': Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) self.losses.append(float(loss)) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ # the state size and action size will be used to generate the Q Network self.state_size = state_size self.action_size = action_size ### random.seed(seed) generates sequence of random numbers by performing some operation on initial value. #If same initial value is used, it will generate the same sequence of random numbers self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = Replay_Buffer(action_size, Buffer_Size, Batch_Size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def select_act(self, state, eps=0.): " selects action based on state and epsilon" # get the state array from env, convert to tensor state = torch.from_numpy(state).float().unsqueeze(0).to(device) # unsqueeze(0) adds a singleton dimension at 0 positon # useful because the states are in batches # to(device) moves the tensor to the device memory, cpu or cuda ## put network in eval mode self.qnetwork_local.eval() #get last_layer of the network to retrive index of the max reward with torch.no_grad( ): # torch.no_grad() prevents calculating gradients in the following block, so no backward_pass. action_values = self.qnetwork_local(state) self.qnetwork_local.train() if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return np.random.randint(self.action_size) # select an action #random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_next_states = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # detach returns a new tensor detachd from the current graph # final layer is (batch_size ,action_size)i.e. (64,4), max(1), will find max in the second dim(1) # the new tensor is (64,), we then add a singleton dimensin to it with unsqueeze # Q_targets_next is the max reward of the four actons for each of the 64 states Q_target = rewards + (gamma * Q_next_states * (1 - dones)) Q_expected = self.qnetwork_local(states).gather(1, actions) #gather rearranges values in the dimension (1 here) of the input tensor (64,4), #as per the indices in the index tensor provided, actions here...actions carries the index of the next action taken # given the state in states. SO only one value will be provided..it coud be either of 0,1,2,3..based on def act and state #therefore output is 64,1.with reward corresponding to only that action chosen after the state. 
        # The rewards produced by qnetwork_local are compared with Q_target to calculate the loss,
        # then we update the parameters to minimize that loss.
        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # self.t_step increases by 1 after every step() call, i.e. every time step.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > Batch_Size:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
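# --- Tiny standalone illustration of the gather(1, actions) step -------------
# The comments in learn() above describe what gather does; concretely, it
# picks, for each row of a batch of Q-values, the column of the action that
# was actually taken.
import torch

q_values = torch.tensor([[0.1, 0.5, 0.2, 0.9],   # Q(s0, .)
                         [0.7, 0.3, 0.8, 0.4]])  # Q(s1, .)
actions = torch.tensor([[3],                     # action taken in s0
                        [2]])                    # action taken in s1
print(q_values.gather(1, actions))               # tensor([[0.9000], [0.8000]])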
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) #self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR, momentum=0.95) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences, idxs, ws = self.memory.sample() self.learn(experiences, idxs, ws, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, idxs, ws, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss next_action_values_local = self.qnetwork_local(states).gather(1, actions) # Only change proposed for Double DQN: Get maximizing future actions from local network and get their # corresponding values from target network. Compare then these to the local taken actions. 
local_max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1) next_action_values_target = self.qnetwork_target(next_states).detach().gather(1, local_max_actions) ''' print(next_action_values_local.shape) print(next_action_values_local[0][:]) print(next_action_values_local.gather(1, actions).shape) print(actions[0][0]) print(next_action_values_local.gather(1, actions)[0][0]) ''' y = rewards + (gamma * next_action_values_target*(1 - dones)) # Local network will be actualized, target one is used as ground truth ws = torch.from_numpy(ws.astype(float)).float().to(device) loss = F.mse_loss(ws*next_action_values_local, ws*y) errors = np.abs(y.cpu().data.numpy() - next_action_values_local.cpu().data.numpy()) self.memory.memory.update_batch(idxs, errors) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # # Copy from local to target network parameters self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def adjust_learning_rate(self, episode, val): print("adjusting learning rate!") for param_group in self.optimizer.param_groups: param_group['lr'] = val
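# --- Hedged sketch: where the `ws` importance-sampling weights come from -----
# The prioritized buffer that returns (experiences, idxs, ws) above is not
# shown. In prioritized experience replay (Schaul et al., 2016) the sampling
# probabilities and importance-sampling weights are usually computed as below;
# the function name and signature are illustrative only.
import numpy as np

def is_weights(priorities, batch_indices, alpha=0.6, beta=0.4):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()                          # P(i) = p_i^alpha / sum_k p_k^alpha
    n = len(priorities)
    weights = (n * probs[batch_indices]) ** (-beta)
    return weights / weights.max()                # normalize so the largest weight is 1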
class Agent: def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steos self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """ Returns action for given state as per current policy """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values for next states from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update target network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
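# --- Hedged sketch: a driver loop for an agent like the one above ------------
# None of these snippets include the training loop itself. The sketch below
# assumes the classic LunarLander-v2 task, the pre-0.26 Gym step API, and an
# illustrative epsilon schedule / solve threshold; none of these come from the
# snippet above.
import gym
import numpy as np
from collections import deque

env = gym.make('LunarLander-v2')
agent = Agent(state_size=8, action_size=4, seed=0)

scores_window = deque(maxlen=100)
eps = 1.0
for i_episode in range(1, 2001):
    state = env.reset()
    score = 0
    for _ in range(1000):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_window.append(score)
    eps = max(0.01, 0.995 * eps)                  # decay epsilon each episode
    if np.mean(scores_window) >= 200.0:           # LunarLander "solved" threshold
        break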
class DDQNPERAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, tor_dstate, srpt_pens, lrn_rate, hsize1, hsize2, seed=0): """Initialize a DDQN Agent object with PER (Prioritized Experience Replay) support. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action tor_dstate (float): tolerance for deciding whether two states are the same srpt_pens (array_like): penalty (negative reward) values for undesirable actions lrn_rate (float): learning rate for Q-Network training hsize1 (int): size of the first hidden layer of the Q-Network hsize2 (int): size of the second hidden layer of the Q-Network seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.tor_dstate = tor_dstate self.srpt_pens = srpt_pens self.lrn_rate = lrn_rate self.hsize1 = hsize1 self.hsize2 = hsize2 self.seed = seed if seed is not None: random.seed(seed) # Each penalty value adds a vector of action_size to signal which action causes the penalty. self.aug_state_size = state_size + len(srpt_pens) * action_size # Set up Q-Networks. self.qnetwork_local = QNetwork(self.aug_state_size, action_size, hsize1, hsize2, seed).to(device) self.qnetwork_local.initialize_weights( ) # initialize network with random weights self.qnetwork_target = QNetwork(self.aug_state_size, action_size, hsize1, hsize2, seed=None).to(device) self.qnetwork_target.update_weights( self.qnetwork_local) # copy network weights to target network self.qnetwork_target.eval() self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lrn_rate) # Store trained Q-model when the environment is solved. self.qnetwork_solved = None # Set up experience replay memory. self.ebuffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed) # Initialize interval steps. self.l_step = 0 # for learning every LEARN_EVERY time steps self.t_step = 0 # for updating target network every UPDATE_EVERY learnings def reset_epsisode(self, state, srpt_det=0): """Re-initialize buffers after environment reset for a new episode. Params ====== state (array_like): initial state after environment reset srpt_det (int): number of repeated state types to be checked for post-processing """ self.srpt_det = 0 if len(self.srpt_pens) == 0: # State repeat detection for post-processing is active only when state repeat penalty option is off. self.srpt_det = srpt_det else: # This is used to signal self.step() hasn't been run yet. self.next_aug_state = None if len(self.srpt_pens) > 0 or self.srpt_det > 0: self.state_buffer = deque(maxlen=2) buffer_size = 2 * (max(len(self.srpt_pens), self.srpt_det) - 1) self.smsta_buffer = deque(maxlen=max(2, buffer_size)) # The initial state will be pushed to the buffer again and be compared to this state in the process of # selecting the first action. So add 1 to the initial state here to ensure the states are different # enough for the first comparison. self.state_buffer.append(np.array(state) + 1) # Any position and orientation can be the initial simulated state here. It is like putting in a # coordinate system (origin and x-direction) for a 2-D plane and all the other simulated states # in the episode will be specified based on this reference coordinate system. self.smsta_buffer.append((np.array([0, 0]), 0)) def step(self, state, action, reward, next_state, done): """Update replay memory and parameters of Q-Network by training. 
Params ====== state (array_like): starting state of the step action (int): action performed in the step reward (float): reward from the action next_state (array_like): resulting state of the action in the step done (bool): indicator for whether next_state is terminal (i.e., end of episode) or not """ if len(self.srpt_pens) > 0: # Augment state vector and modify reward using state repeat penalty values. self.state_buffer.append(np.array(next_state)) self.next_aug_state = self.augment_state(next_state) state = self.aug_state next_state = self.next_aug_state reward = self.modify_reward(reward, state, action) # Save experience in replay memory. self.ebuffer.add(state, action, reward, next_state, done) # Learn every LEARN_EVERY steps after memory reaches batch_size. if len(self.ebuffer.memory) >= self.ebuffer.batch_size: self.l_step += 1 self.l_step %= LEARN_EVERY if self.l_step == 0: experiences, weights = self.ebuffer.sample() self.learn(experiences, weights, GAMMA) def augment_state(self, state): """Augment state vector to penalize undesirable actions. Params ====== state (array_like): original state vector to be augmented Returns ====== aug_state (numpy.ndarray): augmented state vector """ # Each penalty value adds a vector of action_size to signal which action causes the penalty. aug_state = np.concatenate( (state, np.zeros((len(self.srpt_pens) * self.action_size, )))) # Detect situation where the two preceeding observed states (not augmented) are essentially the # same, which indicates the agent is either stucked at a wall or in some kind of undesirable # blind spot. The next action to avoid (i.e., to be penalized) is the one that will keep the # agent stuck or in blind spot. avoid_action = self.get_avoid_action() if avoid_action != ACT_INVALID: aug_state[self.state_size + avoid_action] = 1 if avoid_action != ACT_INVALID or len(self.srpt_pens) == 1: return aug_state # If agent is not stuck or in blind spot and there are more penalty values, continue to check # state repeats separated by more than two actions. Assuming NUM_ORIS is even, states separated # by odd number of actions won't repeat. So only even number of actions needs to be checked. for action in range(self.action_size): nxt_sta = self.sim_step(action) for act_cnt in range(2, 2 * len(self.srpt_pens), 2): if self.is_state_repeated(act_cnt, nxt_sta): aug_state[self.state_size + (act_cnt // 2) * self.action_size + action] = 1 # signal undesirable action break return aug_state def modify_reward(self, reward, aug_state, action): """Modify reward to penalized undesirable action. Params ====== reward (float): original reward aug_state (numpy.ndarray): augmented state vector action (int): action performed Returns ====== reward (float): modified reward """ # Penalize undesirable action when it doesn't earn a reward or cause a penalty. If it earns a positive # reward or causes a more negative reward, leave the reward unchanged. if reward <= 0: for i, penalty in enumerate(self.srpt_pens): if aug_state[self.state_size + i * self.action_size + action] > 0: # action is undesirable reward = min(reward, penalty) break return reward def sim_step(self, action): """Advance simulated state (position and orientation) for one step by the action. Params ====== action (int): action to advance the simulated state Returns pos, ori (numpy.ndarray, int): resulting simulated state """ # An action can either be a move or turn (but not both) with the type of actions (including non-actions) # identified by the action code. 
        pos, ori = self.smsta_buffer[-1]
        act_code = ACT_CODES[action]
        pos = pos + act_code[0] * ORIVEC_TABLE[ori]
        ori = (ori + act_code[1]) % NUM_ORIS
        return pos, ori

    def is_state_repeated(self, act_cnt, nxt_sta):
        """Check whether the next state repeats the past state separated by the specified number of actions.

        Params
        ======
            act_cnt (int): number of actions separating the past state to be checked and the next state
            nxt_sta (numpy.ndarray, int): next state resulting from an action

        Returns
        ======
            repeated (bool): indicator for repeated state
        """
        repeated = False
        if act_cnt <= len(self.smsta_buffer):
            chk_sta = self.smsta_buffer[-act_cnt]  # past state to be checked
            if chk_sta[1] == nxt_sta[1]:
                if np.linalg.norm(nxt_sta[0] - chk_sta[0]) <= self.tor_dstate:
                    repeated = True
        return repeated

    def act(self, state, eps=0.0):
        """Select an action for the given state as per the current epsilon-greedy policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for adjusting epsilon-greedy action selection

        Returns
        ======
            action (int): the chosen action
        """
        # If the agent is in testing mode, self.step() won't be invoked and some of the operations done there
        # need to be done here.
        if (len(self.srpt_pens) > 0 and self.next_aug_state is None) or self.srpt_det > 0:
            # Push the current state into the state buffer for comparison with the previous state if it is not
            # already pushed by self.step() in the agent training process.
            self.state_buffer.append(np.array(state))
        if len(self.srpt_pens) > 0:
            if self.next_aug_state is None:
                self.aug_state = self.augment_state(state)
            else:
                self.aug_state = self.next_aug_state
            state = self.aug_state

        if self.srpt_det == 0:  # no checking for repeated states (observed or simulated)
            # Randomly select action.
            action = random.choice(np.arange(self.action_size))
            # Epsilon-greedy action selection.
            if random.random() >= eps:
                state = torch.from_numpy(state).float().to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action = self.qnetwork_local(state).squeeze().argmax().cpu().item()
            if len(self.srpt_pens) > 0:
                # Update simulated state buffer with the result of the chosen action.
                nxt_sta = self.sim_step(action)
                self.smsta_buffer.append(nxt_sta)
            return action

        # This is the post-processing of the epsilon-greedy policy to avoid repeated states within a short
        # series of actions. This option is set in self.reset_episode() for each episode and is only active
        # when the option of penalizing undesirable actions, which is set for the class object, is disabled
        # (i.e., len(self.srpt_pens) == 0). To accommodate the post-processing of the selected actions, the
        # random policy is modified to randomly assign rankings to all the available actions.

        # Randomly assign rankings to action candidates.
        ranked_actions = np.random.permutation(self.action_size)
        # Epsilon-greedy action selection.
        if random.random() >= eps:
            state = torch.from_numpy(state).float().to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                neg_act_qvals = -self.qnetwork_local(state).squeeze()
                ranked_actions = neg_act_qvals.argsort().cpu().numpy().astype(int)

        # Post-process the ranked action candidates to remove undesirable actions.
        avoid_action = self.get_avoid_action()
        action = self.select_nosrpt_action(avoid_action, ranked_actions)
        return action

    def get_avoid_action(self):
        """Determine the next action to avoid, i.e., one that would keep the agent stuck or in a blind spot.
        Returns
        ======
            avoid_action (int): next action to avoid
        """
        avoid_action = ACT_INVALID  # used to signal the agent is not stuck or in a blind spot
        if np.linalg.norm(self.state_buffer[1] - self.state_buffer[0]) <= self.tor_dstate:
            sim_sta0 = self.smsta_buffer[-2]
            sim_sta1 = self.smsta_buffer[-1]
            if sim_sta0[1] == sim_sta1[1]:  # action is not a turn, must be a move
                # Agent is stuck at a wall.
                dpos = sim_sta1[0] - sim_sta0[0]
                mcode = np.around(np.dot(dpos, ORIVEC_TABLE[sim_sta0[1]])).astype(int)  # dot(mcode*(cos, sin), (cos, sin)) = mcode
                avoid_action = AVOID_MOVE_TABLE[mcode + 1]
                # It is reasonable to backtrack to get unstuck, except to the last state, which the agent is
                # stuck in (as the new reference, it can be any state).
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
            else:  # action is a turn
                # Agent is in a blind spot (turned, but observed the same state).
                tcode = sim_sta1[1] - sim_sta0[1]
                avoid_action = AVOID_TURN_TABLE[(tcode + 1) % NUM_ORIS]
                # It is reasonable to backtrack to get out of the blind spot, except to the last two states,
                # which represent the blind spot.
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
                self.smsta_buffer.append(sim_sta1)
        return avoid_action

    def select_nosrpt_action(self, avoid_action, ranked_actions):
        """Select an action that avoids repeated states (i.e., loops) within a short series of actions.

        Params
        ======
            avoid_action (int): action to avoid if the agent is stuck or in a blind spot
            ranked_actions (array_like): action candidates ranked by decreasing Q-values

        Returns
        ======
            action (int): the selected action
        """
        action = ranked_actions[0]
        if action == avoid_action:
            action = ranked_actions[1]
        nxt_sta = self.sim_step(action)

        # If a repeated observed state is detected (signaled by avoid_action != ACT_INVALID), the action selected
        # for avoiding the repeated state will be used, since it is more important to free an agent that is
        # stuck or in a blind spot than to go back further to check for repeated simulated states. So the check
        # for repeated simulated states separated by 2 or more actions only occurs when avoid_action == ACT_INVALID.
        if avoid_action == ACT_INVALID and self.srpt_det > 1:
            act_heapq = []
            for action in ranked_actions:
                nxt_sta = self.sim_step(action)
                for act_cnt in range(2, 2 * self.srpt_det, 2):  # assuming NUM_ORIS is even, only even numbers of actions need checking
                    if self.is_state_repeated(act_cnt, nxt_sta):
                        # Simulated state repeated; record the candidate and go check the next action.
                        heapq.heappush(act_heapq, [-act_cnt, action, nxt_sta])
                        break
                else:
                    # No repeated state detected, action is found.
                    break
            else:
                # No action can satisfy all the no-repeated-state conditions, so select the action that repeats the
                # state separated by the most actions (i.e., a long loop is more acceptable than a short loop).
                action, nxt_sta = heapq.heappop(act_heapq)[1:]

        self.smsta_buffer.append(nxt_sta)  # update simulated state buffer with the result of the chosen action
        return action

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple (s, a, r, s', done) of batched experience data
            is_weights (torch.Tensor): importance sampling weights for the batched experiences
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN method for obtaining target Q-values.
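        # Double DQN decouples action selection from action evaluation: the local (online) network chooses the
        # greedy action for each next state, and the target network supplies the Q-value of that chosen action.
        # This reduces the overestimation bias of vanilla DQN, which takes the max over the target network's own
        # estimates. In equation form:
        #     y = r + gamma * Q_target(s', argmax_a Q_local(s', a)) * (1 - done)
        # which is what the block below computes under torch.no_grad().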
        self.qnetwork_local.eval()
        with torch.no_grad():
            maxq_actions = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
            qouts_next_states = self.qnetwork_target(next_states).gather(1, maxq_actions).squeeze()
            qouts_target = rewards + gamma * qouts_next_states * (1 - dones)

        # Obtain current Q-values and their difference from the target Q-values.
        self.qnetwork_local.train()
        qouts_states = self.qnetwork_local(states).gather(1, actions).squeeze()
        delta_qouts = qouts_states - qouts_target

        # Calculate the weighted sum of squared losses.
        wsqr_loss = is_weights * delta_qouts**2  # weighted squared loss
        loss_sum = wsqr_loss.sum()

        # Update model parameters by minimizing the loss sum.
        self.optimizer.zero_grad()
        loss_sum.backward()
        self.optimizer.step()

        # Update priorities of the replay memory with the negative absolute TD errors
        # (a sketch of the buffer interface assumed here is given after this class).
        neg_prios = -torch.abs(delta_qouts.detach())
        self.ebuffer.update_priorities(neg_prios.cpu().numpy())

        # Update target network.
        self.t_step += 1
        self.t_step %= UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.update_weights(self.qnetwork_local, TAU)

    def update_beta(self, beta):
        """Update importance sampling weights of the memory buffer with a new beta.

        Params
        ======
            beta (float): new beta value
        """
        if beta != self.ebuffer.beta:
            self.ebuffer.beta = beta
            if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
                self.ebuffer.update_is_weights()

    def copy_solved_qnet(self):
        """Copy the current local Q-Network to the solved Q-Network while the local Q-Network continues training."""
        if self.qnetwork_solved is None:
            self.qnetwork_solved = QNetwork(self.aug_state_size, self.action_size, self.hsize1, self.hsize2, seed=None).to(device)
        self.qnetwork_solved.update_weights(self.qnetwork_local)  # copy local network weights to solved network

    def save_qnet(self, model_name):
        """Save Q-Network parameters into a file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # Save the CPU version since it can be used with or without a GPU.
        if self.qnetwork_solved is not None:
            torch.save(self.qnetwork_solved.cpu().state_dict(), model_name + '.pth')
            self.qnetwork_solved = self.qnetwork_solved.to(device)
        else:
            torch.save(self.qnetwork_local.cpu().state_dict(), model_name + '.pth')
            self.qnetwork_local = self.qnetwork_local.to(device)

    def load_qnet(self, model_name):
        """Load Q-Network parameters from a file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # The saved QNetwork is always the CPU version.
        qnetwork_loaded = QNetwork(self.aug_state_size, self.action_size, self.hsize1, self.hsize2, seed=None)
        qnetwork_loaded.load_state_dict(torch.load(model_name + '.pth'))
        self.qnetwork_local.update_weights(qnetwork_loaded.to(device))  # copy loaded network weights to local network
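# The ReplayBuffer used by DDQNPERAgent is defined elsewhere in this project; only its interface is visible
# above: add(), sample() returning (experiences, is_weights), update_priorities() for the most recently sampled
# batch (called with negative absolute TD errors), update_is_weights(), and .beta / .memory / .batch_size
# attributes. The class below is a minimal, hedged sketch of a proportional prioritized replay buffer with that
# interface; the name and every implementation detail are illustrative assumptions, not the project's actual code.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class SimplePERBuffer():
    """Illustrative proportional prioritized experience replay buffer (sketch only)."""

    Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

    def __init__(self, buffer_size, batch_size, seed=None, alpha=0.6, beta=0.4, eps_prio=1e-3, device=torch.device('cpu')):
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
        self.memory = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.alpha, self.beta, self.eps_prio = alpha, beta, eps_prio
        self.device = device
        self.last_idxs = None
        self.last_probs = None
        self.is_weights = None

    def add(self, state, action, reward, next_state, done):
        # New experiences get the current maximum priority so that each is sampled at least once.
        max_prio = max(self.priorities) if self.priorities else 1.0
        self.memory.append(self.Experience(state, action, reward, next_state, done))
        self.priorities.append(max_prio)

    def sample(self):
        # Sampling probabilities P(i) = p_i^alpha / sum_k p_k^alpha.
        prios = np.array(self.priorities, dtype=np.float64) ** self.alpha
        self.last_probs = prios / prios.sum()
        self.last_idxs = np.random.choice(len(self.memory), self.batch_size, p=self.last_probs)
        self.update_is_weights()
        batch = [self.memory[i] for i in self.last_idxs]
        states = torch.tensor(np.vstack([e.state for e in batch]), dtype=torch.float32, device=self.device)
        actions = torch.tensor(np.vstack([e.action for e in batch]), dtype=torch.int64, device=self.device)
        rewards = torch.tensor([e.reward for e in batch], dtype=torch.float32, device=self.device)
        next_states = torch.tensor(np.vstack([e.next_state for e in batch]), dtype=torch.float32, device=self.device)
        dones = torch.tensor([float(e.done) for e in batch], dtype=torch.float32, device=self.device)
        return (states, actions, rewards, next_states, dones), self.is_weights

    def update_is_weights(self):
        # Importance-sampling weights w_i = (N * P(i))^(-beta), normalized by the largest weight in the batch.
        if self.last_idxs is not None:
            weights = (len(self.memory) * self.last_probs[self.last_idxs]) ** (-self.beta)
            self.is_weights = torch.tensor(weights / weights.max(), dtype=torch.float32, device=self.device)

    def update_priorities(self, neg_prios):
        # The agent above passes negative absolute TD errors; store them back as positive priorities.
        for idx, neg_prio in zip(self.last_idxs, neg_prios):
            self.priorities[idx] = abs(float(neg_prio)) + self.eps_prio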
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute the TD targets from the target network and the expected Q-values from the local network,
        # then minimize the mean squared error between them.
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
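# A quick numeric check of the update rules above: with gamma = 0.99, a reward of 1.0, done = 0, and a maximum
# target-network Q-value of 2.0 for the next state, the TD target is 1.0 + 0.99 * 2.0 = 2.98; for a terminal
# transition (done = 1) it collapses to the reward alone. Likewise, soft_update with a small tau (e.g., 1e-3)
# moves each target parameter only 0.1% of the way toward the local parameter per update,
#     theta_target <- 0.001 * theta_local + 0.999 * theta_target,
# which is why the target network changes slowly and keeps the bootstrapped targets stable.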
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=0, double_dqn=False, dueling=False, per=False, per_args=(0.2, 0.01, 2e-5)):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): whether to implement Double DQN (default=False)
            dueling (bool): whether to implement Dueling DQN
            per (bool): whether to implement Prioritized Experience Replay
            per_args (tuple): (alpha, beta, beta_increment) for PER
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.per = per
        self.gamma = GAMMA

        # suffix for the checkpoint file name
        self.output_name = ''
        self.output_name += '_double' if double_dqn else ''
        self.output_name += '_dueling' if dueling else ''
        self.output_name += '_per' if per else ''

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, dueling=dueling).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, dueling=dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, *per_args)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def train(self, env, n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Deep Q-Learning.

        Params
        ======
            env (UnityEnvironment): Bananas environment
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # list containing scores from each episode
        scores = []
        # list containing window-averaged scores
        avg_scores = []
        # last 100 scores
        scores_window = deque(maxlen=100)
        # initialize epsilon
        eps = eps_start

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                # get the next state
                next_state = env_info.vector_observations[0]
                # get the reward
                reward = env_info.rewards[0]
                # see if the episode has finished
                done = env_info.local_done[0]
                self.step((state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break
            # save the most recent score
            scores_window.append(score)
            scores.append(score)
            avg_scores.append(np.mean(scores_window))
            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(), f'./checkpoints/checkpoint{self.output_name}.pth')
                break
        return scores, avg_scores

    def step(self, experience):
        """Save experience in replay memory and learn.
        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using a sampled batch of experience tuples."""
        # if using PER, the sample also returns buffer indices and importance-sampling weights
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = self.memory.sample()
        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # if Double DQN
        if self.double_dqn:
            # Get predicted Q values (for next actions chosen by the local model) from the target model
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(1, next_actions).detach()
        else:
            # Get max predicted Q values (for next states) from the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if self.per:
            # Weight the per-sample squared errors by the importance-sampling weights before averaging.
            weights = torch.FloatTensor(is_weights).to(device).reshape(-1, 1)
            loss = (weights * F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if PER, update priorities with the new absolute TD errors
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
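# A hedged usage sketch for this variant. The Unity Banana environment build path below is an assumption, and the
# state/action sizes (37 and 4) are the usual values for that environment; adjust both for your setup. The
# ./checkpoints/ directory must exist, since train() saves the weights there when the score threshold is reached.
#
#     from unityagents import UnityEnvironment
#
#     env = UnityEnvironment(file_name='Banana.app')   # path is an assumption
#     agent = Agent(state_size=37, action_size=4, seed=0,
#                   double_dqn=True, dueling=True, per=True)   # enable all three extensions
#     scores, avg_scores = agent.train(env, n_episodes=1000)
#     env.close()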