# QNetwork, ddqn_ReplayBuffer, device and the hyperparameters (BUFFER_SIZE,
# BATCH_SIZE, LR, GAMMA, UPDATE_EVERY, TAU) are expected to be defined elsewhere
# in this module.
class ddqn_Agent():
    """Interacts with and learns from the environment using double Q-learning."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.wQ = 0
        self.wQ1 = 0
        self.wQ2 = 0

        # Q-Networks (two independent estimators, updated symmetrically)
        self.qnetwork_Qa = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_Qb = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer_Qa = optim.Adam(self.qnetwork_Qa.parameters(), lr=LR)
        self.optimizer_Qb = optim.Adam(self.qnetwork_Qb.parameters(), lr=LR)

        # Replay memory
        self.memory = ddqn_ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def print_result(self):
        somme = self.wQ1 + self.wQ2 + 0.0000001
        print("wQ1=", self.wQ1 / somme, " wQ2=", self.wQ2 / somme)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act_a(self, state, eps=0.):
        """Returns actions for given state as per the current policy of network Qa.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_Qa.eval()
        with torch.no_grad():
            action_values = self.qnetwork_Qa(state)
        self.qnetwork_Qa.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def act_b(self, state, eps=0.):
        """Returns actions for given state as per the current policy of network Qb.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_Qb.eval()
        with torch.no_grad():
            action_values = self.qnetwork_Qb(state)
        self.qnetwork_Qb.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def act(self, state, eps=0.):
        # Pick one of the two networks uniformly at random to act with.
        self.wQ = np.random.choice([0, 1])
        if self.wQ:
            return self.act_a(state, eps)
        else:
            return self.act_b(state, eps)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.wQ:  # wQ is 0 or 1, chosen uniformly at random in act()
            # Bootstrap the target from the other network (Qb)
            yj = self.qnetwork_Qb.forward(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * yj * (1.0 - dones)
            # Get expected Q values from network Qa
            Q_expected = self.qnetwork_Qa.forward(states).gather(1, actions)
            # Compute loss: mean squared error
            loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.optimizer_Qa.zero_grad()
            loss.backward()
            self.optimizer_Qa.step()
            # ------------------- update the other network ------------------- #
            self.soft_update(self.qnetwork_Qa, self.qnetwork_Qb, TAU)
        else:
            # Bootstrap the target from the other network (Qa)
            yj = self.qnetwork_Qa.forward(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * yj * (1.0 - dones)
            # Get expected Q values from network Qb
            Q_expected = self.qnetwork_Qb.forward(states).gather(1, actions)
            # Compute loss: mean squared error
            loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.optimizer_Qb.zero_grad()
            loss.backward()
            self.optimizer_Qb.step()
            # ------------------- update the other network ------------------- #
            self.soft_update(self.qnetwork_Qb, self.qnetwork_Qa, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
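# --- Usage sketch (not part of the original agent) ----------------------------
# A minimal training loop showing how ddqn_Agent above is expected to be driven.
# The gym-style `env`, the episode/step counts and the epsilon schedule are
# illustrative assumptions; the agent itself is used exactly as defined above.
def train_ddqn(env, agent, n_episodes=500, max_t=1000,
               eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)                       # pick Qa or Qb at random, then eps-greedy
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # store transition, maybe learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                      # decay exploration over episodes
    return scores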
class Agent(): def __init__(self, state_size, action_size, seed): """ """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.q_network_local = QNetwork(state_size, action_size, seed).to(device) self.q_network_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.q_network_local.parameters(), lr=LR) # Replay buffer self.memory = ReplayBuffer(BATCH_SIZE, action_size, BUFFER_SIZE, seed) # Ini self.t_step = 0 def soft_update(self, local_network, target_network, tau): """Soft update model parameters """ for target_param, local_param in zip(target_network.parameters(), local_network.parameters()): target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data) def learn(self, experiences, gamma): """ """ states, actions, rewards, next_states, dones = experiences Q_expected = self.q_network_local.forward(states).gather(1, actions) Q_targets_next = self.q_network_target.forward(next_states).detach().max(1)[0].unsqueeze(1) Q_targets = rewards + gamma*Q_targets_next*(1-dones) self.optimizer.zero_grad() loss = F.mse_loss(Q_expected, Q_targets) loss.backward() self.optimizer.step() # Update the target network self.soft_update(self.q_network_local, self.q_network_target, TAU) def act(self, state, epsilon=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.q_network_local.eval() with torch.no_grad(): action_values = self.q_network_local(state) self.q_network_local.train() do_exploration = (random.random()<epsilon) if do_exploration: action = np.random.randint(self.action_size) else: action = np.argmax(action_values.cpu().data.numpy()) return action def step(self, state, action, reward, next_state, done): # save experience in replay memory self.memory.add(state, action, reward, next_state, done) # update self.t_step self.t_step = (self.t_step+1) % UPDATE_EVERY # learn from this batch if self.t_step == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA)
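# --- Sketch of the replay buffer this agent expects (assumption) --------------
# ReplayBuffer is referenced above but not shown here.  This is a minimal stand-in
# with the same interface (add / sample / __len__) and the constructor argument
# order used by the class above; the real implementation may differ and probably
# also moves the sampled tensors to the training device.
import random
from collections import deque, namedtuple

import numpy as np
import torch

class SimpleReplayBuffer:
    def __init__(self, batch_size, action_size, buffer_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long()
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)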
class DQN(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, config): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.config = config self.state_size = state_size self.action_size = action_size nodes = self.config.get("nodes", [128, 64]) self.seed = self.config.get("seed", 0) lr = self.config.get("lr", 1e-4) memory_size = self.config.get("memory_size", 100000) self.batch_size = self.config.get("batch_size", 256) self.discount = self.config.get("discount", 0.9) self.tau = self.config.get("tau", 0.001) self.epsilon = self.config.get("epsilon", 0.1) self.epsilon_end = self.config.get("epsilon_end", 0.0001) self.epsilon_decay = self.config.get("epsilon_decay", 0.995) self.learn_every = self.config.get("learn_every", 4) self.dqn = self.config.get("dqn", "simple") self.per = self.config.get("per", False) np.random.seed(self.seed) random.seed(self.seed) torch.manual_seed(self.seed) # Q-Network if self.dqn == "dueling": self.qnetwork_local = Dueling_QNetwork(state_size, action_size, self.seed).to(device) self.qnetwork_target = Dueling_QNetwork(state_size, action_size, self.seed).to(device) else: self.qnetwork_local = QNetwork(state_size, action_size, self.seed, nodes=nodes).to(device) self.qnetwork_target = QNetwork(state_size, action_size, self.seed, nodes=nodes).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) #self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr= lr) # Replay memory if self.per: self.memory = Memory(memory_size) else: self.memory = ReplayBuffer(memory_size, self.batch_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.scores = [] def add_sample(self, state, action, reward, next_state, done): if self.per == False: self.memory.add((state, action, reward, next_state, 1 * done)) else: target = self.qnetwork_local( Variable(torch.FloatTensor(state)).to(device)).data old_val = target[action] target_val = self.qnetwork_target( Variable(torch.FloatTensor(next_state)).to(device)).data if done: target[action] = reward else: target[action] = reward + self.discount * torch.max(target_val) error = abs(old_val - target[action]) self.memory.add(error, (state, action, reward, next_state, 1 * done)) def act(self, state, add_noise=True): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() #set to eval mode with torch.no_grad(): action_values = self.qnetwork_local.forward(state) self.qnetwork_local.train() # set to training mode # Epsilon-greedy action selection if add_noise == False: eps = 0.0 else: eps = self.epsilon if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.add_sample(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. 
self.t_step = (self.t_step + 1) % self.learn_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: self.learn() def learn(self): if self.per: mini_batch, idxs, is_weights = self.memory.sample(self.batch_size) mini_batch = np.array(mini_batch).transpose() states = torch.from_numpy(np.vstack( mini_batch[0])).float().to(device) actions = torch.from_numpy(np.vstack( mini_batch[1])).long().to(device) rewards = torch.from_numpy(np.vstack( mini_batch[2])).float().to(device) next_states = torch.from_numpy(np.vstack( mini_batch[3])).float().to(device) dones = torch.from_numpy( np.vstack(mini_batch[4]).astype(np.uint8)).float().to(device) else: states, actions, rewards, next_states, dones = self.memory.sample() # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions.long()) if self.dqn == "simple": # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) elif self.dqn == "double": # Double DQN _, Q_targets_next = self.qnetwork_local(next_states).detach().max( 1) # Get argmax Q_targets_next = self.qnetwork_target(next_states).detach().gather( 1, Q_targets_next.unsqueeze(1)) elif self.dqn == "dueling": # Dueling _, Q_targets_next = self.qnetwork_local(next_states).detach().max( 1) # Get argmax Q_targets_next = self.qnetwork_target(next_states).detach().gather( 1, Q_targets_next.unsqueeze(1)) else: raise OSError( 'Error in DQN: {}. Options: simple, double, dueling.'.format( self.dqn)) # Compute Q targets for current states Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones)) """ # update priority if self.per: error= abs(Q_expected - Q_targets) errors = error.data.cpu().numpy() for i in range(len(idxs)): idx = idxs[i] self.memory.update(idx, errors[i]) """ # Compute loss loss = F.mse_loss(Q_expected, Q_targets) #loss = F.smooth_l1_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data) def reset(self): pass def update(self, score): self.scores.append(score) self.epsilon = max(self.epsilon_end, self.epsilon_decay * self.epsilon) # decrease epsilon def save(self, filename): torch.save(self.qnetwork_local.state_dict(), filename) def load(self, filename): self.qnetwork_local.load_state_dict(torch.load(filename))
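# --- Example configuration for the DQN class above ----------------------------
# Every key below is read via config.get(...) in __init__; the values shown are
# the defaults from that method or plausible placeholders, not values taken from
# a specific experiment.
example_dqn_config = {
    "nodes": [128, 64],        # hidden layer sizes passed to QNetwork
    "seed": 0,
    "lr": 1e-4,
    "memory_size": 100000,
    "batch_size": 256,
    "discount": 0.9,
    "tau": 0.001,
    "epsilon": 0.1,
    "epsilon_end": 0.0001,
    "epsilon_decay": 0.995,
    "learn_every": 4,
    "dqn": "double",           # "simple", "double" or "dueling"
    "per": False,              # True switches to the prioritized Memory
}
# Illustrative construction (state/action sizes are placeholders):
# agent = DQN(state_size=8, action_size=4, config=example_dqn_config)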
class PrioritizedAgent: '''Interact with and learn from the environment. The agent uses prioritized experience replay. ''' def __init__(self, state_size, action_size, seed, is_double_q=False): '''Initialize an Agent. Params ====== state_size (int): the dimension of the state action_size (int): the number of actions seed (int): random seed ''' self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.t_step = 0 # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP) self.running_loss = 0 self.training_cnt = 0 self.is_double_q = is_double_q self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device) self.qnetowrk_target = QNetwork(self.state_size, self.action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE, seed) def act(self, state, mode, epsilon=None): '''Returns actions for given state as per current policy. Params ====== state (array): current state mode (string): train or test epsilon (float): for epsilon-greedy action selection ''' state = torch.from_numpy(state).float().unsqueeze(0).to( device) # shape of state (1, state) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local.forward(state) self.qnetwork_local.train() if mode == 'test': action = np.argmax(action_values.cpu().data.numpy() ) # pull action values from gpu to local cpu elif mode == 'train': if random.random() <= epsilon: # random action action = random.choice(np.arange(self.action_size)) else: # greedy action action = np.argmax(action_values.cpu().data.numpy( )) # pull action values from gpu to local cpu return action def step(self, state, action, reward, next_state, done): # add new experience in memory self.prioritized_memory.add(state, action, reward, next_state, done) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: # If enough samples are available in memory, get random subset and learn if len(self.prioritized_memory) >= BUFFER_SIZE: idxes, experiences, is_weights = self.prioritized_memory.sample( device) self.learn(experiences, GAMMA, is_weights=is_weights, leaf_idxes=idxes) def learn(self, experiences, gamma, is_weights, leaf_idxes): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor is_weights (tensor array): importance-sampling weights for prioritized experience replay leaf_idxes (numpy array): indexes for update priorities in SumTree """ states, actions, rewards, next_states, dones = experiences q_local_chosen_action_values = self.qnetwork_local.forward( states).gather(1, actions) q_target_action_values = self.qnetowrk_target.forward( next_states).detach() if self.is_double_q == True: q_local_next_actions = self.qnetwork_local.forward( next_states).detach().max(1)[1].unsqueeze( 1) # shape (batch_size, 1) q_target_best_action_values = q_target_action_values.gather( 1, q_local_next_actions) # Double DQN elif self.is_double_q == False: q_target_best_action_values = q_target_action_values.max( 1)[0].unsqueeze(1) # shape (batch_size, 1) rewards = rewards.tanh( ) # rewards are clipped to be in [-1,1], referencing from original paper q_target_values = rewards + gamma * q_target_best_action_values * ( 1 - dones) # zero value for terminal state td_errors = (q_target_values - q_local_chosen_action_values).tanh( ) # TD-errors are clipped to be in [-1,1], referencing from original paper abs_errors = td_errors.abs().cpu().data.numpy() # pull back to cpu self.prioritized_memory.batch_update( leaf_idxes, abs_errors) # update priorities in SumTree loss = (is_weights * (td_errors**2)).mean( ) # adjust squared TD loss by Importance-Sampling Weights self.running_loss += float(loss.cpu().data.numpy()) self.training_cnt += 1 self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # if self.t_step % UPDATE_EVERY_STEP == 0: self.update(self.qnetwork_local, self.qnetowrk_target) def update(self, local_netowrk, target_network): """Hard update model parameters, as indicated in original paper. Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to """ for local_param, target_param in zip(local_netowrk.parameters(), target_network.parameters()): target_param.data.copy_(local_param.data)
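# --- Importance-sampling weights (background sketch, assumption) --------------
# PrioritizedMemory.sample() above is expected to return `is_weights` computed
# roughly as in the prioritized experience replay paper:
#     P(i) = p_i^alpha / sum_k p_k^alpha
#     w_i  = (N * P(i))^(-beta) / max_j w_j
# The helper below only illustrates that formula; it is not the class used above.
import numpy as np

def importance_sampling_weights(priorities, alpha=0.6, beta=0.4):
    """`priorities` holds the priority p_i of every transition currently stored."""
    p = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = p / p.sum()
    weights = (len(p) * probs) ** (-beta)
    return weights / weights.max()      # normalize so the largest weight is 1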
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) #expliding else: return random.choice(np.arange(self.action_size)) #exploration def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" """ for i in range(BATCH_SIZE): if not dones[i]: max_val = self.qnetwork_target(next_states[i]) best_val = max_val.argmax() target = rewards[i] + gamma*(max_val[best_val]) else: target = rewards[i] current = self.qnetwork_local(states[i])[actions[i]] #current = self.qnetwork_local(states).gather(-1, actions.reshape(actions.size()[0], 1)) self.loss = F.mse_loss(target, current) #self.loss.requires_grad = True self.optimizer.zero_grad() self.loss.backward() self.optimizer.step() """ current = self.qnetwork_local(states).gather( -1, actions.reshape(actions.size()[0], 1)) target1 = self.qnetwork_local.forward(next_states) max_val = target1.argmax(dim=-1) final = target1.gather(-1, max_val.reshape(max_val.shape[0], 1)) target2 = self.qnetwork_target.forward(next_states) max_val = target2.argmax(dim=-1) final2 = target2.gather(-1, max_val.reshape(max_val.shape[0], 1)) data = torch.cat([final, final2], 1) min_val = data.argmin(dim=-1) final = data.gather(-1, min_val.reshape(min_val.shape[0], 1)) target = rewards + gamma * final * (1 - dones) self.loss = F.mse_loss(current, target) self.optimizer.zero_grad() self.loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
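# --- Equivalent, more compact target for the learn() above (sketch) -----------
# The argmax/gather/cat/argmin sequence in the learn() above builds the
# element-wise minimum of the two bootstrapped values max_a Q_local(s', a) and
# max_a Q_target(s', a).  The standalone helper below computes the same target
# directly with torch.min; the networks and tensors are passed in explicitly,
# and the bootstrap is computed under no_grad so no gradient flows through it.
import torch

def min_double_q_target(q_local, q_target, rewards, next_states, dones, gamma):
    with torch.no_grad():
        next_q_local = q_local(next_states).max(1)[0].unsqueeze(1)
        next_q_target = q_target(next_states).max(1)[0].unsqueeze(1)
        bootstrap = torch.min(next_q_local, next_q_target)
    return rewards + gamma * bootstrap * (1 - dones)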
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" self.qnetwork_local.train() #state_action_values = self.qnetwork_local.forward(states) #computing max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target.forward( next_states).detach().max(1)[0].unsqueeze(1) #compute targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) #computing best Q-action value (for each state) from local model Q_expected = self.qnetwork_local.forward(states).gather(1, actions) #loss = self.criterion(state_action_values, expected_state_action_values) loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
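# --- Checkpointing sketch (assumption, not part of the original class) --------
# The agent above keeps its weights in qnetwork_local / qnetwork_target; a common
# way to persist and restore them is via state_dict.  The file name is
# illustrative, and `device` is the module-level device used by the agents here.
import torch

def save_checkpoint(agent, path="checkpoint.pth"):
    torch.save(agent.qnetwork_local.state_dict(), path)

def load_checkpoint(agent, path="checkpoint.pth"):
    state_dict = torch.load(path, map_location=device)
    agent.qnetwork_local.load_state_dict(state_dict)
    agent.qnetwork_target.load_state_dict(state_dict)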
class Agent(): def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) for target_param, param in zip(self.qnetwork_local.parameters(), self.qnetwork_target.parameters()): target_param.data.copy_(param) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences actions = actions.view(actions.size(0), 1) dones = dones.view(dones.size(0), 1) curr_Q = self.qnetwork_local.forward(states).gather(1, actions) next_Q = self.qnetwork_target.forward(next_states) max_next_Q = torch.max(next_Q, 1)[0] max_next_Q = max_next_Q.view(max_next_Q.size(0), 1) expected_Q = rewards + (1 - dones) * gamma * max_next_Q loss = F.mse_loss(curr_Q, expected_Q.detach()) # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Q_targets = rewards + (gamma * Q_targets_next * (1-dones)) # Q_expected = self.qnetwork_local(states).gather(1, actions) # loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) for target_param, param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()): target_param.data.copy_(TAU * param + (1 - TAU) * target_param) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed) self.qnetwork_target = QNetwork(state_size, action_size, seed) self.qnetwork_local.load_model("./dqn_LL_model data.pickle") self.qnetwork_target.load_model("./dqn_LL_model data.pickle") # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.loss = 0 self.loss_list = [] def step(self, state, action, reward, next_state, done, t_step): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = t_step if self.t_step % UPDATE_EVERY == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > 100 * BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. """ action_values = self.qnetwork_local.forward(state) # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples """ states, actions, rewards, next_states, dones = experiences for time in range(BATCH_SIZE): # compute Q_target from the target network inputing next_state Q_target_av = np.max( self.qnetwork_target.forward(next_states[time])) Q_target = rewards[time] + gamma * (Q_target_av) * ( 1 - dones[time]) # if done, than the second will not be added # compute the Q_expected Q_expected = self.qnetwork_local.forward( states[time] ) # get q value for corrosponding action along dimension 1 of 64,4 matrix self.qnetwork_local.backward(Q_target, "MSE", actions[time]) self.loss_list.append((Q_target - Q_expected[actions[time]])**2) self.loss = np.mean(self.loss_list) self.qnetwork_local.step() self.loss_list.clear() # update target network # if self.t_step % UPDATE_FREQUENCY == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = tau*θ_local + (1 - tau)*θ_target """ self.qnetwork_target.soft_update(local_model, TAU)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Parameters: ========== state_size (int): This is the dimension of each state. action_size (int): This is the dimension of each action. seed (int): This is the random seed. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network (local and target one) self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) # mounting an Adam optimizer for the backward propagation self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # mounting an MSE Loss function self.criterion = nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps). self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory. self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn. if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Parameters: ========== state (array_like): The current state. eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences self.optimizer.zero_grad() # Forward and backward passes output = self.qnetwork_local.forward(states).gather(1, actions) loss = self.criterion(output, self.targets(gamma, rewards, next_states, dones)) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def targets(self, gamma, rewards, next_states, dones): with torch.no_grad(): q = self.qnetwork_target.forward(next_states) y = torch.add(rewards, torch.mul(torch.max(q, dim=1, keepdim=True)[0], gamma)) * (1 - dones) return y def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Parameters: ========== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class PrioritizedAgent: '''Interact with and learn from the environment.''' def __init__(self, state_size, action_size, seed, is_prioritized_sample=False): '''Initialize an Agent. Params ====== state_size (int): the dimension of the state action_size (int): the number of actions seed (int): random seed ''' self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.t_step = 0 # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP) self.is_prioritized_sample = is_prioritized_sample self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device) self.qnetowrk_target = QNetwork(self.state_size, self.action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if self.is_prioritized_sample == False: self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed) else: self.replay_memory = PrioritizedReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed) def act(self, state, epsilon=0.): '''Returns actions for given state as per current policy. Params ====== state (array-like): current state epsilon (float): for epsilon-greedy action selection ''' state = torch.from_numpy(state).float().unsqueeze(0).to(device) # shape of state (1, state) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local.forward(state) self.qnetwork_local.train() if random.random() <= epsilon: # random action action = random.choice(np.arange(self.action_size)) else: # greedy action action = np.argmax(action_values.cpu().data.numpy()) # pull action values from gpu to local cpu return action def step(self, state, action, reward, next_state, done): # add new experience in memory self.replay_memory.add(state, action, reward, next_state, done) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: # If enough samples are available in memory, get random subset and learn if len(self.replay_memory) >= BUFFER_SIZE and self.is_prioritized_sample==False: experiences = self.replay_memory.sample(device) self.learn(experiences, GAMMA) elif len(self.replay_memory) >= BUFFER_SIZE and self.is_prioritized_sample==True: batch_idx, experiences, batch_ISWeights = self.replay_memory.sample(device) self.learn(experiences, GAMMA, ISWeights=batch_ISWeights, leaf_idxes=batch_idx) def learn(self, experiences, gamma, ISWeights=None, leaf_idxes=None): """Update value parameters using given batch of experience tuples. If is_prioritized_sample, then weights update is adjusted by ISWeights. In addition, Double DQN is optional. 
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            ISWeights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree
        """
        # compute and minimize the loss
        if self.is_prioritized_sample == False:
            states, actions, rewards, next_states, dones = experiences

            q_local_chosen_action_values = self.qnetwork_local.forward(states).gather(1, actions)
            q_target_action_values = self.qnetowrk_target.forward(next_states).detach()  # detach from graph, don't backpropagate
            q_target_best_action_values = q_target_action_values.max(1)[0].unsqueeze(1)  # shape (batch_size, 1)
            q_target_values = rewards + gamma * q_target_best_action_values * (1 - dones)  # zero value for terminal state

            loss = F.mse_loss(q_local_chosen_action_values, q_target_values)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        else:
            states, actions, rewards, next_states, dones = experiences

            q_local_chosen_action_values = self.qnetwork_local.forward(states).gather(1, actions)
            #q_local_next_actions = self.qnetwork_local.forward(next_states).detach().max(1)[1].unsqueeze(1)  # shape (batch_size, 1)
            q_target_action_values = self.qnetowrk_target.forward(next_states).detach()
            q_target_best_action_values = q_target_action_values.max(1)[0].unsqueeze(1)  # shape (batch_size, 1)
            #q_target_best_action_values = q_target_action_values.gather(1, q_local_next_actions)  # Double DQN
            q_target_values = rewards + gamma * q_target_best_action_values * (1 - dones)  # zero value for terminal state

            abs_errors = torch.abs(q_target_values - q_local_chosen_action_values).cpu().data.numpy()  # pull back to cpu
            self.replay_memory.batch_update(leaf_idxes, abs_errors)  # update priorities in SumTree

            # element-wise squared TD error against the full target (reward + bootstrap),
            # re-weighted by the importance-sampling weights
            loss = F.mse_loss(q_local_chosen_action_values, q_target_values, reduction='none')
            loss = (ISWeights * loss).mean()  # adjust TD loss by importance-sampling weights
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetowrk_target)

    def update(self, local_network, target_network):
        """Hard update model parameters, as indicated in the original paper.

        Params
        ======
            local_network (PyTorch model): weights will be copied from
            target_network (PyTorch model): weights will be copied to
        """
        for local_param, target_param in zip(local_network.parameters(), target_network.parameters()):
            target_param.data.copy_(local_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs net = nn.DataParallel(self.qnetwork_local) if torch.cuda.is_available(): print("using GPUs!") net.cuda() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # target net update # Get max predicted Q values (for next states) from target model qs_local = self.qnetwork_local.forward(states) qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] Q_expected = qsa_local.reshape((BATCH_SIZE, 1)) qs_target = self.qnetwork_target.forward(next_states) _, qsa_local_argmax_a = torch.max( qs_local, dim=1) #using the greedy policy (q-learning) qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] qsa_target = qsa_target * ( 1 - dones.reshape(BATCH_SIZE) ) #target qsa value is zero when episode is complete qsa_target = qsa_target.reshape((BATCH_SIZE, 1)) Q_targets = rewards + gamma * qsa_target # Compute loss loss = F.mse_loss(Q_expected, Q_targets) #logger.info('mse: {}'.format(delta)) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # evolutionary step - increase survival chances #logger.info('avg reward: {} mse:{}'.format(delta, np.mean(experiences.rewards()))) # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBufferWithPriority(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences_with_index = self.memory.sample() self.learn(experiences_with_index, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences_with_index, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences_with_index (Tuple[torch.Variable]): tuple of (s, a, r, s', done, index, weightsIS) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, index, weightsIS = experiences_with_index ## TODO: compute and minimize the loss # Get max predicted Q values (for next states) from target model ### Regular DQN # Q_targets_next = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1) ### Double DQN with torch.no_grad(): estimated_action = self.qnetwork_local(next_states).argmax(dim=1, keepdim=True) Q_targets_next = self.qnetwork_target.forward(next_states).gather(1, estimated_action) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local.forward(states).gather(1, actions) # Compute importance-sampling weight # Compute loss loss_fn = nn.MSELoss(reduce=False) loss = loss_fn(Q_expected, Q_targets) weighted_loss = torch.sum(torch.from_numpy(weightsIS).float().to(device) * loss) # Update priority according to TD error self.memory.update_priority(list(loss.detach().cpu().numpy().squeeze()**ALPHA+EPS), index) # Minimize the loss self.optimizer.zero_grad() weighted_loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
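# --- Priority update used above (sketch) ---------------------------------------
# The call to memory.update_priority() in the learn() above stores
# (delta^2)^ALPHA + EPS as the new priority of each sampled transition, where
# delta is the TD error and ALPHA, EPS are module-level hyperparameters.  For
# comparison, the proportional variant in the PER paper uses p_i = |delta_i| + eps
# and samples with probability P(i) proportional to p_i^alpha.  The helper below
# only restates the expression used above.
import numpy as np

def new_priorities(td_errors, alpha, eps):
    td_errors = np.asarray(td_errors, dtype=np.float64)
    return (td_errors ** 2) ** alpha + eps      # matches the expression in learn()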
class Agent(): """ Agent used to interact with and learns from the environment """ def __init__(self, state_size, action_size, config): """ Initialize an Agent object """ self.state_size = state_size self.action_size = action_size self.config = config # logging for this class self.logger = logging.getLogger(self.__class__.__name__) # gpu support self.device = pick_device(config, self.logger) ## Q-Networks self.qnetwork_local = QNetwork(state_size, action_size, config).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, config).to(self.device) ## Get optimizer for local network self.optimizer = getattr(optim, config["optimizer"]["optimizer_type"])( self.qnetwork_local.parameters(), betas=tuple(config["optimizer"]["betas"]), **config["optimizer"]["optimizer_params"]) ## Replay memory self.memory = ReplayBuffer( config=config, action_size=action_size, buffer_size=int(config["DQN"]["buffer_size"]), batch_size=config["trainer"]["batch_size"] ) ## Initialize time step (for update every `update_every` steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every `update_every` time steps self.t_step = (self.t_step + 1) % self.config["DQN"]["update_every"] if (self.t_step == 0): # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.config["trainer"]["batch_size"]: experiences = self.memory.sample() self.learn(experiences, self.config["DQN"]["gamma"]) def act(self, state, epsilon): """ Returns actions for given state as per current policy """ # pdb.set_trace() # Convert state to tensor state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) ## Evaluation mode self.qnetwork_local.eval() with torch.no_grad(): # Forward pass of local qnetwork action_values = self.qnetwork_local.forward(state) ## Training mode self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > epsilon: # Choose the best action (exploitation) return np.argmax(action_values.cpu().data.numpy()) else: # Choose random action (exploration) return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples """ states, actions, rewards, next_states, dones = experiences ## TD target # Get max predicted Q-values (for next states) from target model # Q_targets_next = torch.argmax(self.qnetwork_target(next_states).detach(), dim=1).unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) Q_targets_next = Q_targets_next.type(torch.FloatTensor) # Compute Q-targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) ## old value # Get expected Q-values from local model Q_expected = torch.gather(self.qnetwork_local(states), dim=1, index=actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target network with a soft update self.soft_update(self.qnetwork_local, self.qnetwork_target, self.config["DQN"]["tau"])
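# --- Example of the nested config read by the Agent above ----------------------
# Only the keys accessed in the class above are shown, with illustrative values;
# QNetwork and pick_device may read additional keys that are not visible here.
example_nested_config = {
    "optimizer": {
        "optimizer_type": "Adam",            # looked up via getattr(optim, ...)
        "betas": [0.9, 0.999],
        "optimizer_params": {"lr": 5e-4},
    },
    "DQN": {
        "buffer_size": 100000,
        "update_every": 4,
        "gamma": 0.99,
        "tau": 1e-3,
    },
    "trainer": {
        "batch_size": 64,
    },
}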
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64, fc3_units=None, double_q=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.double_q = double_q # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units, fc2_units, fc3_units).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units, fc2_units, fc3_units).to(device) if torch.cuda.is_available(): self.qnetwork_local.cuda() self.qnetwork_target.cuda() else: self.qnetwork_local.cpu() self.qnetwork_target.cpu() self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR, weight_decay=WD) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def get_action(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) with torch.no_grad(): output = self.qnetwork_local.forward(state) action_values = self.qnetwork_local.forward(state) if random.random() <= eps: return np.random.choice(np.arange(self.action_size)) else: return output.argmax().item() def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # double q learning argmax_a = self.qnetwork_local.forward(next_states).detach().argmax( dim=1).unsqueeze(dim=1) a_val = self.qnetwork_target.forward(next_states).detach() Q_targets_next = a_val.gather(1, argmax_a) Q_targets = rewards + GAMMA * Q_targets_next Q_expected = self.qnetwork_local.forward(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
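# --- Terminal-state masking (sketch) --------------------------------------------
# The double-DQN target in the learn() above omits the (1 - dones) factor that the
# other agents in this file apply, so terminal transitions still bootstrap from the
# next state.  The standalone helper below shows the usual masked form; whether the
# omission above is intentional is not clear from the source.
import torch

def double_dqn_target(q_local, q_target, rewards, next_states, dones, gamma):
    with torch.no_grad():
        best_actions = q_local(next_states).argmax(dim=1, keepdim=True)
        bootstrap = q_target(next_states).gather(1, best_actions)
    return rewards + gamma * bootstrap * (1 - dones)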
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Initialize both the target and the local Q networks self.qnetwork_local = QNetwork(state_size, action_size).to(device) self.qnetwork_target = QNetwork(state_size, action_size).to(device) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, MINIBATCH_SIZE, seed) # The Optimizer used is Adam self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LEARNING_RATE) # Initialize time step (for updating every UPDATE_EVERY steps) # Used to determine when the agent starts learning self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # check if there are enough examples in memory if len(self.memory) > MINIBATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): # Aquire an action by passing the current state to the local network action_values = self.qnetwork_local.forward(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from the target model
        # by running a forward pass on the batch of experiences
        Q_targets_next = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def hard_update(self, local_model, target_model):
        """Hard update the biases and weights from the local network to the target network."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update the weights and biases from the local network to the target
        network using the interpolation factor tau."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save_weights(self, path='./p1_weights'):
        """Save the trained weights of the current local Q network."""
        torch.save(self.qnetwork_local.state_dict(), path)

    def load_saved_weights(self, path='./p1_weights'):
        """Load the saved weights into the local and target Q networks."""
        self.qnetwork_local.load_state_dict(torch.load(path))
        self.qnetwork_local.eval()
        self.qnetwork_target.load_state_dict(torch.load(path))
        self.qnetwork_target.eval()
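# --- Evaluation sketch for the checkpoint helpers above (assumption) -----------
# After training, save_weights() persists the local network; the helper below
# reloads it and runs a few purely greedy episodes.  The gym-style `env`, the
# episode count and the default path are illustrative assumptions; the agent is
# passed in explicitly so this works with the class defined directly above.
def evaluate(agent, env, n_episodes=3, path='./p1_weights'):
    agent.load_saved_weights(path)
    scores = []
    for _ in range(n_episodes):
        state, score, done = env.reset(), 0.0, False
        while not done:
            action = agent.act(state, eps=0.0)     # greedy policy at evaluation time
            state, reward, done, _ = env.step(action)
            score += reward
        scores.append(score)
    return scores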
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local( state) # same as self.qnetwork_local.forward(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        # "*** YOUR CODE HERE ***"
        qs_local = self.qnetwork_local.forward(states)
        qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long),
                             actions.reshape(BATCH_SIZE)]
        qsa_local = qsa_local.reshape((BATCH_SIZE, 1))
        # print(qsa_local.shape)

        # # DQN Target
        # qs_target = self.qnetwork_target.forward(next_states)
        # qsa_target, _ = torch.max(qs_target, dim=1)  # using the greedy policy (q-learning)
        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE))  # target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        # TD_target = rewards + gamma * qsa_target
        # # print(qsa_target.shape, TD_target.shape, rewards.shape)

        # # Double DQN Target ver 1
        # qs_target = self.qnetwork_target.forward(next_states)
        # if random.random() > 0.5:
        #     _, qsa_target_argmax_a = torch.max(qs_target, dim=1)  # using the greedy policy (q-learning)
        #     qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)]
        # else:
        #     _, qsa_local_argmax_a = torch.max(qs_local, dim=1)  # using the greedy policy (q-learning)
        #     # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]
        #     ## qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]
        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE))  # target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        # TD_target = rewards + gamma * qsa_target

        # Double DQN Target ver 2 (based upon the Double DQN paper)
        qs_target = self.qnetwork_target.forward(next_states)
        _, qsa_local_argmax_a = torch.max(qs_local, dim=1)  # using the greedy policy (q-learning)
        qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long),
                               qsa_local_argmax_a.reshape(BATCH_SIZE)]
        qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE))  # target qsa value is zero when episode is complete
        qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        TD_target = rewards + gamma * qsa_target
        # print(qsa_target.shape, TD_target.shape, rewards.shape)

        # # Udacity's approach
        # # Get max predicted Q values (for next states) from target model
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # # Compute Q targets for current states
        # TD_target = rewards + (gamma * Q_targets_next * (1 - dones))
        # # Get expected Q values from local model
        # qsa_local = self.qnetwork_local(states).gather(1, actions)

        # diff = qsa_local - TD_target
        # loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff)  # loss is now a scalar
        loss = F.mse_loss(qsa_local, TD_target)  # much faster than the above loss function
        # print(loss)

        # minimize the loss
        self.optimizer.zero_grad()  # clears the gradients
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
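# The "ver 2" target above indexes the target network's output with the local
# network's greedy action. A minimal, self-contained sketch of the same Double DQN
# target written with gather(); the tensors and their shapes are illustrative
# assumptions, not part of the class above.
import torch

batch_size, num_actions, gamma = 4, 3, 0.99
qs_local = torch.randn(batch_size, num_actions)    # Q_local(s', .)
qs_target = torch.randn(batch_size, num_actions)   # Q_target(s', .)
rewards = torch.randn(batch_size, 1)
dones = torch.zeros(batch_size, 1)

# Select the action with the local network, evaluate it with the target network.
best_actions = qs_local.argmax(dim=1, keepdim=True)   # (batch, 1) indices
qsa_target = qs_target.gather(1, best_actions)        # (batch, 1) values
td_target = rewards + gamma * qsa_target * (1 - dones)
print(td_target.shape)  # torch.Size([4, 1])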
class Agent():
    """Interacts with the environment, acts and learns from it."""

    def __init__(self, state_size, action_size, seed):
        """Initialize the Agent object:
        1. agent variables, 2. local and target QNetworks, 3. optimizer, 4. replay buffer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(seed)
        self.t_step = 0
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, eps=0.):
        """Read the state, pass it through the local network to get action values,
        then, based on eps, return either the greedy action (argmax) or a random action."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            # Sample uniformly over all actions (the original excluded action 0).
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Perform a step: 1. add the experience to the replay buffer, 2. learn."""
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % LEARN_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        """Compute the MSE between expected and target Q values, take an optimizer step
        on the local Q network, then soft-update the target Q network."""
        states, actions, rewards, next_states, done = experiences
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Max predicted Q values for the next states from the target model
        # (max over the action dimension, keeping the values, not the indices).
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - done))
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Perform an in-place interpolation of the target parameters towards the local parameters, weighted by tau."""
        for local_params, target_params in zip(local_model.parameters(), target_model.parameters()):
            target_params.data.copy_(tau * local_params.data + (1 - tau) * target_params.data)
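# For context, a minimal sketch of the kind of epsilon-greedy training loop these
# agents are driven by. The environment interface (env.reset(), env.step() returning
# a gym-style 4-tuple), the episode count, and the decay constants are illustrative
# assumptions, not part of any of the original repositories.
def train(agent, env, n_episodes=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Illustrative driver: act, step the environment, store/learn, decay epsilon."""
    eps = eps_start
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()              # assumed gym-style API
        score, done = 0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # anneal exploration
    return scores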
class Agent(object):
    def __init__(self, state_size, action_size, mem_length=100000, ddqn=True):
        self.gamma = 0.99
        self.batch_size = 64
        self.action_size = action_size
        self.ddqn = ddqn
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        if ddqn:
            self.model = DuelingQNetwork(state_size, action_size).to(self.device)
            self.target_model = DuelingQNetwork(state_size, action_size).to(self.device)
            self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
            self.experience = self.ddqn_experience
        else:
            self.model = QNetwork(state_size, action_size).to(self.device)
            self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
            self.experience = self.dqn_experience

        # replay memory
        self.memory = deque(maxlen=mem_length)

    def act(self, state, eps=0):
        # epsilon greedy
        if random.random() < eps:
            return random.choice(np.arange(self.action_size))

        # state to predict action from
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)
        self.model.train()
        return np.argmax(action_values.cpu().data.numpy())

    def ddqn_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) < self.batch_size:
            return

        # get random batch
        states, actions, rewards, next_states, terminals = self.get_batch()

        # Get expected Q values from local model
        expected = self.model(states).gather(1, actions)

        # Double DQN: select next actions with the online model ...
        Q = self.model(next_states).detach()
        # ... and evaluate them with the target model
        targets_next = self.target_model(next_states).detach()
        targets_next = targets_next.gather(1, Q.max(1)[1].unsqueeze(1))

        # Compute Q targets for current states
        targets = rewards + (self.gamma * targets_next * (1 - terminals))

        # compute loss
        loss = functional.mse_loss(expected, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft-update the target network (tau is an interpolation factor, not a learning rate)
        tau = 0.001
        for target_param, primary_param in zip(self.target_model.parameters(), self.model.parameters()):
            target_param.data.copy_(tau * primary_param.data + (1 - tau) * target_param.data)

    def dqn_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) < self.batch_size:
            return

        # get random batch
        states, actions, rewards, next_states, terminals = self.get_batch()

        Q = self.model.forward(states)
        Q = Q.gather(1, actions).squeeze(1)
        next_Q = self.model.forward(next_states)
        # Detach the bootstrap target and zero it at episode boundaries.
        max_next_Q = torch.max(next_Q, 1)[0].detach()
        expected = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - terminals.squeeze(1))

        # update model
        loss = functional.mse_loss(Q, expected)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def get_batch(self):
        experiences = np.array(random.sample(self.memory, k=self.batch_size))
        experiences = [np.vstack(experiences[:, i]) for i in range(5)]

        # convert data to tensors
        states = torch.FloatTensor(experiences[0]).to(self.device)
        actions = torch.LongTensor(experiences[1]).to(self.device)
        rewards = torch.FloatTensor(experiences[2]).to(self.device)
        next_states = torch.FloatTensor(experiences[3]).to(self.device)
        terminals = torch.FloatTensor(experiences[4].astype(np.uint8)).to(self.device)
        return states, actions, rewards, next_states, terminals
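# The ddqn=True branch above assumes a DuelingQNetwork defined elsewhere in that
# repository. A minimal sketch of what such a dueling architecture typically looks
# like; the layer sizes and structure are illustrative assumptions, not the
# original author's model.
import torch
import torch.nn as nn

class DuelingQNetwork(nn.Module):
    """Illustrative dueling head: a shared trunk feeding separate value and advantage streams."""

    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.feature = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        self.advantage = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, action_size))

    def forward(self, state):
        x = self.feature(state)
        value = self.value(x)          # V(s): (batch, 1)
        advantage = self.advantage(x)  # A(s, a): (batch, action_size)
        # Combine with the mean-advantage baseline, as in the dueling DQN paper.
        return value + advantage - advantage.mean(dim=1, keepdim=True)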
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, priority, B_P):
        self.memory.add(state, action, reward, next_state, done, priority)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(B_P)
                self.learn(experiences, GAMMA)

    def priority(self, states, actions, rewards, next_states):
        if len(states.shape) == 1:
            # only a single experience tuple to evaluate;
            # need to format the variables accordingly:
            states = torch.from_numpy(states).float().unsqueeze(0).to(device)
            next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(device)
            rewards = torch.tensor([[rewards]], dtype=torch.float).to(device)  # scalar value
            actions = torch.tensor([[actions]], dtype=torch.uint8).to(device)  # scalar value
        action_local = self.qnetwork_local.forward(next_states).argmax(1)
        max_q = self.qnetwork_target.forward(next_states)[np.arange(action_local.shape[0]), action_local]
        delta = (rewards.squeeze() + GAMMA * max_q) - self.qnetwork_local(states)[
            np.arange(actions.shape[0]), actions.byte().squeeze().cpu().numpy()]
        priority = torch.abs(delta) + E_p
        return priority.squeeze().tolist()

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones, weights, experience_indices = experiences
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = WeightedMSE(Q_expected, Q_targets, weights)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities in the replay buffer ------------------- #
        new_priorities = self.priority(states, actions, rewards, next_states)
        for count, idx in enumerate(experience_indices):
            self.memory.memory[idx] = self.memory.memory[idx]._replace(priority=new_priorities[count])

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def priority(self, states, actions, rewards, next_states):
        if len(states.shape) == 1:
            # only a single experience tuple to evaluate;
            # need to format the variables accordingly:
            states = torch.from_numpy(states).float().unsqueeze(0).to(device)
            next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(device)
            rewards = torch.tensor([[rewards]], dtype=torch.float).to(device)  # scalar value
            actions = torch.tensor([[actions]], dtype=torch.uint8).to(device)  # scalar value
        action_local = self.qnetwork_local.forward(next_states).argmax(1)
        max_q = self.qnetwork_target.forward(next_states)[np.arange(action_local.shape[0]), action_local]
        delta = (rewards.squeeze() + GAMMA * max_q) - self.qnetwork_local(states)[
            np.arange(actions.shape[0]), actions.byte().squeeze().cpu().numpy()]
        priority = torch.abs(delta) + E_PRIORITY
        return priority.squeeze().tolist()

    def step(self, state, action, reward, next_state, done, priority, b_priority):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done, priority)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(b_priority)  # needs b_priority to compute weights
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, experience_indices = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Double DQN target: select the next action with the local network,
        # evaluate it with the target network.
        actions_local = self.qnetwork_local.forward(next_states).argmax(1)
        max_q_values = self.qnetwork_target.forward(next_states)[
            np.arange(actions_local.shape[0]), actions_local]
        td_target = rewards.squeeze() + gamma * max_q_values * (1 - dones.squeeze())

        predicted_q_values = self.qnetwork_local.forward(states)
        predicted_q_values = predicted_q_values[
            np.arange(predicted_q_values.shape[0]), actions.squeeze()]

        self.optimizer.zero_grad()  # must zero the gradients each time, otherwise they get summed
        # Forward and backward passes
        loss = WeightedMSE(predicted_q_values, td_target, weights)
        loss.backward()             # backward pass to compute the gradients
        self.optimizer.step()       # take a step using the learning rate and computed gradients

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities in the replay buffer ------------------- #
        new_priorities = self.priority(states, actions, rewards, next_states)
        for count, idx in enumerate(experience_indices):
            self.memory.memory[idx] = self.memory.memory[idx]._replace(priority=new_priorities[count])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
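# Both prioritized-replay agents above call a WeightedMSE loss that is defined
# elsewhere in their repositories. A minimal sketch of an importance-sampling
# weighted squared error under that assumption; the exact reduction used in the
# originals may differ.
import torch

def WeightedMSE(q_expected, q_targets, weights):
    """Illustrative importance-sampling weighted MSE for prioritized replay."""
    # Element-wise squared TD errors, scaled by the per-sample IS weights, then averaged.
    return (weights * (q_expected - q_targets) ** 2).mean()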
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        hidden_layers = [128, 64]
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialise the target and local networks with the same weights
        self.hard_update(self.qnetwork_local, self.qnetwork_target)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

    def update(self):
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if np.random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Double DQN: choose the greedy next action with the local network,
        # evaluate it with the target network.
        max_actions = self.qnetwork_local.forward(next_states).detach().max(1)[1].unsqueeze(1)
        output_target = self.qnetwork_target.forward(next_states).gather(1, max_actions)
        td_target = rewards + gamma * (output_target * (1 - dones))
        output_local = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(output_local, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)
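# This variant passes a hidden_layers list into QNetwork, so it assumes a constructor
# that builds the hidden stack from that list. A minimal sketch of such a network;
# the structure below is an illustrative assumption, not the original author's model.
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Illustrative MLP whose hidden sizes come from a hidden_layers list, e.g. [128, 64]."""

    def __init__(self, state_size, action_size, hidden_layers, seed):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        sizes = [state_size] + list(hidden_layers)
        self.hidden = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes[:-1], sizes[1:])])
        self.output = nn.Linear(sizes[-1], action_size)

    def forward(self, state):
        x = state
        for layer in self.hidden:
            x = F.relu(layer(x))
        return self.output(x)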
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, use_ddqn=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.use_ddqn = use_ddqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                if self.use_ddqn:
                    # Use double DQN for training if selected
                    self.learn_ddqn(experiences, GAMMA)
                else:
                    self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # compute and minimize the loss
        qvalue_target = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1)
        y = rewards + gamma * qvalue_target * (1 - dones)
        qvalue = self.qnetwork_local.forward(states).gather(1, actions)
        # print("Best actions:")
        # print(qvalue)
        loss = F.mse_loss(y, qvalue)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # for param in self.qnetwork_local.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learn_ddqn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples, using the double DQN method.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get the best actions from the local model to use in the double DQN
        best_actions_local = self.qnetwork_local(next_states).detach().argmax(dim=1).unsqueeze(1)
        # Get predicted Q values (for next states) from the target model,
        # evaluated at the actions selected by the local model
        qvalue_target = self.qnetwork_target(next_states).detach().gather(1, best_actions_local)

        # compute and minimize the loss
        y = rewards + gamma * qvalue_target * (1 - dones)
        qvalue = self.qnetwork_local.forward(states).gather(1, actions)
        # print("Best actions:")
        # print(qvalue)
        loss = F.mse_loss(y, qvalue)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # for param in self.qnetwork_local.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
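# A toy sketch (not part of the agent above) of why Double DQN decouples action
# selection from evaluation: with noisy value estimates, max_a Q_target(s', a)
# overestimates on average, while evaluating the local argmax under the target
# network damps that bias. The values below are synthetic for illustration only.
import torch

torch.manual_seed(0)
true_q = torch.zeros(5)                  # all actions are equally good in truth
q_local = true_q + 0.5 * torch.randn(5)  # two independent noisy estimates
q_target = true_q + 0.5 * torch.randn(5)

dqn_bootstrap = q_target.max()               # max over the same noisy estimate (biased upward)
ddqn_bootstrap = q_target[q_local.argmax()]  # select with local, evaluate with target
print(float(dqn_bootstrap), float(ddqn_bootstrap))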
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating the target network every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn at every time step; the target network is only updated every UPDATE_EVERY steps.
        # self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # if self.t_step == 0:
        self.t_step += 1
        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        # train loop
        # states and next_states: (batch_size x num_states)
        # actions and rewards: (batch_size x 1)

        # forward pass
        # use the local network to compute q_est(s, w)[action]
        ps_local = self.qnetwork_local.forward(states).gather(1, actions)
        # use the target network to compute r + g*max(q_est(s', a, w-));
        # this tensor should be detached from backward computations
        ps_target = rewards + gamma * (1 - dones) * self.qnetwork_target.forward(
            next_states).detach().max(dim=1)[0].view(-1, 1)

        # compute loss
        loss = F.mse_loss(ps_local, ps_target)

        # backward pass
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if (self.t_step % UPDATE_EVERY) == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
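# All of the agents above reference module-level imports and hyperparameter constants
# defined elsewhere in their respective repositories. A sketch of typical values for
# this kind of project; the exact numbers in each original repository may differ.
import random
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 64          # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR = 5e-4                # learning rate
UPDATE_EVERY = 4         # how often to trigger a learning (or target-update) step

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")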