class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed,
                 Double_DQN=False, Priority_Replay_Paras=(0.01, 0.6, 0.4)):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            Double_DQN (bool): whether to use Double DQN targets
            Priority_Replay_Paras (tuple of float): (e, a, b) parameters for
                prioritized experience replay (typical defaults shown)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.BUFFER_SIZE = BUFFER_SIZE

        # setting optional extra techniques
        self.Double_DQN = Double_DQN
        self.prio_e, self.prio_a, self.prio_b = Priority_Replay_Paras

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   Double_DQN, Priority_Replay_Paras)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences, experience_indexes, priorities = self.memory.sample()
                self.learn(experiences, experience_indexes, priorities, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, experience_indexes, priorities, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            experience_indexes (array_like): buffer indices of the sampled experiences
            priorities (torch.Tensor): sampling priorities of the batch
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## compute and minimize the loss
        # Q values of the actions actually taken
        Q_s_a = self.qnetwork_local(states).gather(1, actions)

        # Get max predicted Q values (for next states) from target model
        if self.Double_DQN:
            # double DQN uses the local network for selecting the best action
            # and evaluates it with the target network
            best_actions = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
            Q_s_next = self.qnetwork_target(next_states).gather(1, best_actions)
        else:
            Q_s_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
        targets = rewards + gamma * Q_s_next.detach() * (1 - dones)

        # element-wise squared TD errors
        losses = (Q_s_a - targets) ** 2

        # importance-sampling weights, w_i = (1/(N * P(i)))^beta,
        # following the Prioritized Experience Replay paper
        importance_weights = (((1 / self.BUFFER_SIZE) * (1 / priorities)) ** self.prio_b).unsqueeze(1)
        loss = (importance_weights * losses).mean()

        # calculate gradients and do a step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # recompute priorities and update them in the buffer
        target_priorities = abs(Q_s_a - targets).detach().cpu().numpy() + self.prio_e
        self.memory.update_priority(experience_indexes, target_priorities)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def update_beta(self, interpolation):
        """Update priority beta for unbiased Q updates.

        Params
        ======
            interpolation (float): number between 0 and 1 specifying how much
                to interpolate toward beta = 1
        """
        self.prio_b += (1 - self.prio_b) * interpolation
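# The classes in this file rely on module-level hyperparameters and helper
# classes (QNetwork, ReplayBuffer, PrioritizedReplayBuffer, ...) defined in the
# surrounding project. As a hedged, minimal sketch, the assumed imports and
# constants look roughly like the following; the values are common defaults
# chosen for illustration, not taken from the source.
import copy
import os
import pickle
import random
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 64          # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR = 5e-4                # learning rate
UPDATE_EVERY = 4         # how often to run a learning step

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE = device          # some classes below use the upper-case alias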
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## compute and minimize the loss
        # expected Q values for the actions actually taken
        Q_predicted = torch.gather(self.qnetwork_local(states), 1, actions)

        # max predicted Q values for the next states from the target model,
        # zeroed out for terminal states
        Q_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_actual = rewards + gamma * Q_next * (1 - dones)

        criterion = nn.MSELoss()
        loss = criterion(Q_predicted, Q_actual)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Agent that interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize the Agent.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Apply the use of a Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step, updated according to UPDATE_EVERY
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, np.int32(action), reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:  # Only learn every UPDATE_EVERY time steps
            if len(self.memory) > BATCH_SIZE:  # Need enough samples to make a batch
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions as determined by the policy and the specific state the agent is in.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update parameters from a given batch.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, double_dqn=True):
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        self.t_step = 0

    def save(self, path, *data):
        torch.save(self.qnetwork_local.state_dict(), path / "model_checkpoint.local")
        torch.save(self.qnetwork_target.state_dict(), path / "model_checkpoint.target")
        torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(torch.load(path / 'model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(torch.load(path / 'model_checkpoint.target'))
            self.optimizer.load_state_dict(torch.load(path / 'model_checkpoint.optimizer'))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except Exception:
            print("No checkpoint file was found")
            return defaults

    def step(self, state, action, reward, next_state, done, train=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if train and len(self.memory) > BATCH_SIZE and self.t_step == 0:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN: select the best next action with the local network,
            # evaluate it with the target network
            Q_best_action = self.qnetwork_local(next_states).detach().max(1)[1]
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, Q_best_action.unsqueeze(-1))
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
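# Hedged usage sketch for the checkpointing Agent above: `path` must support
# the `/` operator (e.g. pathlib.Path), and the *data varargs round-trip
# arbitrary training metadata through the pickle file. The sizes and metadata
# names below are illustrative assumptions, not taken from the source.
from pathlib import Path

ckpt_dir = Path("checkpoints")
ckpt_dir.mkdir(exist_ok=True)

agent = Agent(state_size=8, action_size=4)
i_episode, scores = agent.load(ckpt_dir, 1, [])   # defaults returned if no checkpoint exists
agent.save(ckpt_dir, i_episode, scores)           # metadata is restored by the next load()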
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Note: this variant stores no done flag, so the Q target always
        bootstraps from the next state.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s') tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next)
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def extractPolicy(self):
        """Extract the greedy policy over a 9x9 grid of (a, h) states."""
        policy = np.zeros((9, 9)) - 1
        for a in range(9):
            for h in range(9):
                state = torch.from_numpy(np.asarray([a, h])).float().unsqueeze(0).to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action_values = self.qnetwork_local(state)
                self.qnetwork_local.train()
                max_action = np.argmax(action_values.cpu().data.numpy())
                policy[a, h] = max_action
        return policy

    def processPolicy(self, policy):
        """Render the policy grid as rows of a LaTeX table."""
        results = ''
        print(policy)
        for a in range(9):
            results += '{} & '.format(a)
            for h in range(9):
                action = policy[a, h]
                assert action in [0, 1, 2]
                if action == 0:
                    results += '\\ag'
                elif action == 1:
                    results += '\\ob'
                else:
                    results += '\\wt'
                results += ' & '
            results = results[:-2]
            results += '\\\\ \n'
        print(results)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(DEVICE)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory,
            # get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)

        # Notify all layers to work in eval mode
        self.qnetwork_local.eval()
        # Deactivate autograd engine -> reduces memory & speeds up computation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # Re-enable train mode in all layers
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        criterion = torch.nn.MSELoss()

        ## Move input and label tensors to the correct device
        self.qnetwork_local.to(DEVICE)
        self.qnetwork_target.to(DEVICE)
        inputs = next_states.to(DEVICE)

        ## Select max predicted Q value for next state using the target model
        with torch.no_grad():
            next_target = self.qnetwork_target(inputs)
            next_q_target = next_target.max(1)[0].unsqueeze(1)

        ## Calculate Q targets
        target_q = rewards + (gamma * next_q_target * (1 - dones))

        ## Use the local model to get the expected Q value
        expected_q = self.qnetwork_local(states).gather(1, actions)

        ## Compute and minimize the loss
        loss = criterion(expected_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, prioritized=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            prioritized (bool): whether to use proportional prioritized experience replay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.prioritized = prioritized
        if self.prioritized:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, double=False):
        """Gather experience for each step and learn from it.

        Params
        ======
            state, action, reward, next_state, done: one transition
            double (bool): whether to use Double DQN targets
        """
        # Save experience to the Replay Buffer
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) >= BATCH_SIZE:
                self.learn(self.memory.sample(), GAMMA, double)

    def act(self, state, epsilon=0.0):
        """Get action from the epsilon-greedy policy.

        Params
        ======
            state (array_like): current state
            epsilon (float): for epsilon-greedy action selection
        """
        _state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(_state)
        self.qnetwork_local.train()

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, double=False):
        """Learn from sampled experiences and update the weights of the local model.

        Params
        ======
            experiences (*array_like): (s, a, r, s', done)
            gamma (float): discount rate
            double (bool): whether to use Double DQN targets
        """
        if self.prioritized:
            states, actions, rewards, next_states, dones, is_weights, sample_idx = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        self.qnetwork_local.train()
        self.qnetwork_target.eval()

        q_expected = self.qnetwork_local(states).gather(1, actions)

        if double:
            # Double DQN: select next actions with the local network,
            # evaluate them with the target network
            self.qnetwork_local.eval()
            with torch.no_grad():
                _, next_actions = self.qnetwork_local(next_states).max(1)
                q_target_next = self.qnetwork_target(next_states).gather(1, next_actions.unsqueeze(1).long())
            self.qnetwork_local.train()
        else:
            q_target_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        q_target = rewards + gamma * q_target_next * (1 - dones)

        if self.prioritized:
            diff = q_target - q_expected
            loss = 0.5 * torch.pow(diff, 2)  # mean squared error
            loss = (is_weights * loss).mean()
            self.memory.update_priority(diff.abs().detach().squeeze(1).cpu().data.numpy(), sample_idx)
        else:
            loss = F.mse_loss(q_expected, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update(self.qnetwork_local, self.qnetwork_target, TAU)

    def update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for local_params, target_params in zip(local_model.parameters(), target_model.parameters()):
            target_params.data.copy_(tau * local_params.data + (1. - tau) * target_params.data)

    def save(self, path):
        """Save model parameters.

        Params
        ======
            path (str): path to save the model to, with extension ".pt" or ".pth"
        """
        torch.save(self.qnetwork_local.state_dict(), path)
        print("Model saved as {}".format(path))

    def load(self, path, device='cpu'):
        """Load model parameters.

        Params
        ======
            path (str): path to load the model from, with extension ".pt" or ".pth"
            device (str): device to map the loaded tensors onto
        """
        self.qnetwork_local.load_state_dict(torch.load(path, map_location=device))
        print("Model loaded from {} on {}".format(path, device))
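# A hedged sketch of how an agent like the one above is typically driven in a
# classic Gym-style training loop with epsilon decay. The `env` API (reset
# returning a state, step returning a 4-tuple) and all hyperparameter values
# here are illustrative assumptions, not taken from the source.
def train(agent, env, n_episodes=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration over episodes
    agent.save("checkpoint.pth")
    return scores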
class Agent():
    """An agent to interact with the environment and learn from it."""

    def __init__(self, state_size, action_size, seed):
        """Initialization function.

        Params
        ======
            state_size (int): dim of each state
            action_size (int): dim of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # define Q-Network
        if USE_DUELING_NETWORK:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, 128, 32, 64, 32).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, 128, 32, 64, 32).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if USE_PRIORITIZED_REPLAY:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device,
                                                  alpha=.6, beta=.4, beta_scheduler=1.)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # initial time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # save experience in memory replay
        self.memory.add(state, action, reward, next_state, done)

        # learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # when the memory is full enough
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Return actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples
            gamma (float): discount factor
        """
        if USE_PRIORITIZED_REPLAY:
            states, actions, rewards, next_states, dones, w = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            if USE_DOUBLE_DQN:
                # choose the greedy actions with the local network,
                # evaluate them with the target network
                Q_local_next = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
                Q_target_next = self.qnetwork_target(next_states).gather(1, Q_local_next)
            else:
                # get max predicted Q values (for next states) from target model
                Q_target_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

            # compute Q targets for current states
            Q_target = rewards + (gamma * Q_target_next * (1 - dones))

        # get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if USE_PRIORITIZED_REPLAY:
            # importance-weighted squared TD errors; the absolute TD errors
            # feed the updated priorities
            TD_error = (Q_target - Q_expected).squeeze()
            self.memory.update_priorities(TD_error.detach().abs())
            loss = (w * TD_error.pow(2)).mean()
        else:
            # compute loss
            loss = F.mse_loss(Q_expected, Q_target)

        # minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau*theta_local + (1 - tau)*theta_target

        Params
        ======
            local_model (pytorch model): weights will be copied from
            target_model (pytorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
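# The prioritized branch above weights squared TD errors by importance-sampling
# weights `w`. As a hedged illustration of the underlying formulas from the
# Prioritized Experience Replay paper (Schaul et al., 2015): sampling
# probabilities P(i) = p_i^alpha / sum_k p_k^alpha and weights
# w_i = (N * P(i))^(-beta), normalized by their maximum. The function name and
# example priorities are illustrative only.
def per_probs_and_weights(priorities, alpha=0.6, beta=0.4):
    p = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = p / p.sum()                       # P(i)
    n = len(priorities)
    weights = (n * probs) ** (-beta)          # importance-sampling weights
    return probs, weights / weights.max()     # normalize for training stability

probs, weights = per_probs_and_weights([2.0, 1.0, 0.5, 0.1])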
class AgentVanilla():
    """Interacts with and learns from the environment.

    This class implements deep Q-learning without experience replay:
    it learns directly from each newest transition.
    """

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=LR)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        experience = (state, action, reward, next_state, done)
        self.learn(experience, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork.eval()
        with torch.no_grad():
            action_values = self.qnetwork(state)
        self.qnetwork.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experience, gamma):
        """Update value parameters using a single experience tuple.

        Params
        ======
            experience (Tuple): (s, a, r, s', done)
            gamma (float): discount factor
        """
        state, action, reward, next_state, done = experience

        with torch.no_grad():
            Q_next = self.qnetwork(torch.Tensor([next_state]).to(device))
            Q_val = reward + gamma * (1 - done) * Q_next.max(dim=1)[0]

        Q1 = self.qnetwork(torch.Tensor([state]).to(device))
        Q_expected = Q1.squeeze()[action].unsqueeze(dim=0)

        loss = F.mse_loss(Q_expected, Q_val.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class AgentEr():
    """Interacts with and learns from the environment.

    This class implements deep Q-learning with experience replay.
    """

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork.eval()
        with torch.no_grad():
            action_values = self.qnetwork(state)
        self.qnetwork.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            Q_next = self.qnetwork(next_states)
            Q_vals = rewards + gamma * (1 - dones) * Q_next.max(dim=1)[0].unsqueeze(1)

        Q1 = self.qnetwork(states)
        Q_expected = Q1.gather(dim=1, index=actions)

        loss = F.mse_loss(Q_expected, Q_vals.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
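# ReplayBuffer is referenced throughout this file but defined elsewhere in the
# project. A minimal, hedged sketch of a uniform replay buffer compatible with
# the agents above; the field names and tensor shapes are assumptions inferred
# from how sample() results are consumed, not the project's actual class.
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch, stacked into batch-first tensors
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)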
class Agent(object):
    def __init__(self, n_states, n_actions, hidden_dim):
        """Agent class that chooses actions and trains.

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """
        self.q_local = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)
        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=LEARNING_RATE)
        self.n_states = n_states
        self.n_actions = n_actions

        # ReplayMemory: the trajectory is saved here
        self.replay_memory = ReplayMemory(10000)

    def get_action(self, state, eps, check_eps=True):
        """Returns an action.

        Args:
            state: 2-D tensor of shape (n, input_dim)
            eps (float): epsilon, for eps-greedy exploration

        Returns:
            int: action index
        """
        sample = random.random()

        if check_eps == False or sample > eps:
            with torch.no_grad():
                # greedy action from the current estimate
                return self.q_local(state.float().to(device)).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device)

    def learn(self, experiences, gamma):
        """Prepare a minibatch and train on it.

        Note: the batch is drawn from self.replay_memory; the `experiences`
        argument is kept for interface compatibility and is unused.

        Args:
            experiences (List[Transition]): batch of `Transition`
            gamma (float): discount rate for Q_target
        """
        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Compute Q(s_t, a): the model computes Q(s_t), then we select the
        # columns of the actions taken. These are the actions which would have
        # been taken for each batch state according to network q_local (current estimate).
        Q_expected = self.q_local(states).gather(1, actions)

        Q_targets_next = self.q_target(next_states).detach().max(1)[0]

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        self.q_local.train(mode=True)
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))

        # backpropagation of the loss through the network
        loss.backward()
        self.optim.step()

    def soft_update(self, local_model, target_model, tau):
        """tau (float): interpolation parameter"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_update(self, local, target):
        for target_param, param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        # optimizer = tf.train.RMSPropOptimizer(learning_rate=LR)
        optimizer = tf.train.AdamOptimizer(learning_rate=LR)
        self.Qnetwork = QNetwork(state_size=state_size,
                                 action_size=action_size,
                                 optimizer=optimizer,
                                 gamma=GAMMA,
                                 tau=TAU,
                                 minibatch_size=BATCH_SIZE,
                                 neurons_of_layers=NEURONS_OF_LAYERS,
                                 with_bn=WITH_BN)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn from a random batch on every step, once enough samples exist
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Update the target network every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # ------------------- update target network ------------------- #
            self.Qnetwork.update_target_network()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        if len(state.shape) == 1:
            # make it batch-like
            state = state[np.newaxis, :]

        # Epsilon-greedy action selection
        if random.random() > eps:
            action_values = self.Qnetwork.get_action(state)
            return np.argmax(action_values)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple): tuple of (s, a, r, s', done) tuples
        """
        current_loss = self.Qnetwork.train(experiences)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, buf_size, gamma, tau, update_t,
                 lr, batch_size, fc1_units, fc2_units, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buf_size (int): replay buffer size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            update_t (int): how often to update the network
            lr (float): learning rate
            batch_size (int): minibatch size
            fc1_units (int): number of nodes in first hidden layer of Q network
            fc2_units (int): number of nodes in second hidden layer of Q network
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buf_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.batch_size = batch_size
        self.update_t = update_t
        self.gamma = gamma
        self.tau = tau

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_t
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
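# Note on soft_update above: with tau = 1.0 the interpolation reduces to a hard
# copy of the local weights into the target network. A hedged sketch of the
# free-standing hard_update/soft_update helpers assumed by the SAC class below
# (the argument order matches its hard_update(target, source) call):
def hard_update(target, source):
    # equivalent to a soft update with tau = 1.0
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)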
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_checkpoint(self, env_name, suffix="", ckpt_path=None):
        if not os.path.exists('checkpoints/'):
            os.makedirs('checkpoints/')
        if ckpt_path is None:
            ckpt_path = "checkpoints/sac_checkpoint_{}_{}".format(env_name, suffix)
        print('Saving models to {}'.format(ckpt_path))
        torch.save({'policy_state_dict': self.policy.state_dict(),
                    'critic_state_dict': self.critic.state_dict(),
                    'critic_target_state_dict': self.critic_target.state_dict(),
                    'critic_optimizer_state_dict': self.critic_optim.state_dict(),
                    'policy_optimizer_state_dict': self.policy_optim.state_dict()}, ckpt_path)

    # Load model parameters
    def load_checkpoint(self, ckpt_path, evaluate=False):
        print('Loading models from {}'.format(ckpt_path))
        if ckpt_path is not None:
            checkpoint = torch.load(ckpt_path)
            self.policy.load_state_dict(checkpoint['policy_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.critic_target.load_state_dict(checkpoint['critic_target_state_dict'])
            self.critic_optim.load_state_dict(checkpoint['critic_optimizer_state_dict'])
            self.policy_optim.load_state_dict(checkpoint['policy_optimizer_state_dict'])

            if evaluate:
                self.policy.eval()
                self.critic.eval()
                self.critic_target.eval()
            else:
                self.policy.train()
                self.critic.train()
                self.critic_target.train()
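# Hedged usage sketch for the SAC class above: `args` is whatever namespace the
# training script builds (typically argparse). The values below are common SAC
# defaults and are assumptions for illustration, not taken from the source;
# GaussianPolicy, QNetwork, and a MuJoCo-enabled gym install are assumed.
from argparse import Namespace
import gym

args = Namespace(gamma=0.99, tau=0.005, alpha=0.2, policy="Gaussian",
                 target_update_interval=1, automatic_entropy_tuning=True,
                 cuda=torch.cuda.is_available(), hidden_size=256, lr=3e-4)

env = gym.make("HalfCheetah-v2")
agent = SAC(env.observation_space.shape[0], env.action_space, args)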
class PrioritizedAgent:
    '''Interact with and learn from the environment.

    The agent uses prioritized experience replay.
    '''

    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0

        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE, seed)

    def act(self, state, mode, epsilon=None):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): train or test
            epsilon (float): for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # shape (1, state_size)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if mode == 'test':
            action = np.argmax(action_values.cpu().data.numpy())  # pull action values from gpu to local cpu
        elif mode == 'train':
            if random.random() <= epsilon:  # random action
                action = random.choice(np.arange(self.action_size))
            else:  # greedy action
                action = np.argmax(action_values.cpu().data.numpy())  # pull action values from gpu to local cpu
        return action

    def step(self, state, action, reward, next_state, done):
        # add new experience to memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # learn only once the memory holds BUFFER_SIZE experiences
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                idxes, experiences, is_weights = self.prioritized_memory.sample(device)
                self.learn(experiences, GAMMA, is_weights=is_weights, leaf_idxes=idxes)

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            is_weights (torch.Tensor): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for updating priorities in the SumTree
        """
        states, actions, rewards, next_states, dones = experiences

        q_local_chosen_action_values = self.qnetwork_local.forward(states).gather(1, actions)
        q_target_action_values = self.qnetwork_target.forward(next_states).detach()

        if self.is_double_q:
            # Double DQN: choose actions with the local network,
            # evaluate them with the target network
            q_local_next_actions = self.qnetwork_local.forward(next_states).detach().max(1)[1].unsqueeze(1)  # shape (batch_size, 1)
            q_target_best_action_values = q_target_action_values.gather(1, q_local_next_actions)
        else:
            q_target_best_action_values = q_target_action_values.max(1)[0].unsqueeze(1)  # shape (batch_size, 1)

        # rewards are squashed into [-1, 1] with tanh, a smooth variant of the
        # reward clipping in the original paper
        rewards = rewards.tanh()
        q_target_values = rewards + gamma * q_target_best_action_values * (1 - dones)  # zero value for terminal state

        # TD errors are squashed into [-1, 1] with tanh, a smooth variant of the
        # error clipping in the original paper
        td_errors = (q_target_values - q_local_chosen_action_values).tanh()
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(leaf_idxes, abs_errors)  # update priorities in SumTree

        # adjust squared TD loss by the importance-sampling weights
        loss = (is_weights * (td_errors ** 2)).mean()
        self.running_loss += float(loss.cpu().data.numpy())
        self.training_cnt += 1

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetwork_target)

    def update(self, local_network, target_network):
        """Hard update model parameters, as in the original paper.

        Params
        ======
            local_network (PyTorch model): weights will be copied from
            target_network (PyTorch model): weights will be copied to
        """
        for local_param, target_param in zip(local_network.parameters(), target_network.parameters()):
            target_param.data.copy_(local_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA,
                 target_update_every=T_UPDATE, tau=TAU, lr=LR,
                 weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY,
                 priority_eps=P_EPS, a=A, initial_beta=INIT_BETA, clip=CLIP, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update (learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            clip (float): gradient norm clipping (`None` to disable)
        """
        if kwds != {}:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        if clip:
            assert isinstance(clip, (int, float)) and clip >= 0

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.start_since = start_since
        self.gamma = gamma
        self.target_update_every = target_update_every
        self.tau = tau
        self.lr = lr
        self.weight_decay = weight_decay
        self.update_every = update_every
        self.priority_eps = priority_eps
        self.a = a
        self.beta = initial_beta
        self.clip = clip

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, a, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, self.gamma)
                self.memory.update_priorities(indices, new_priorities)

        # Update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Double DQN target: the local network selects the greedy action,
            # the target network evaluates it
            best_actions = self.qnetwork_local(next_states).argmax(dim=1, keepdim=True)
            target = rewards + gamma * (1 - dones) * self.qnetwork_target(next_states).gather(dim=1, index=best_actions)

        pred = self.qnetwork_local(states)

        diff = target.sub(pred.gather(dim=1, index=actions))
        new_priorities = diff.detach().abs().add(self.priority_eps).cpu().numpy().reshape((-1,))

        loss = diff.pow(2).mul(is_weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
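# The agent above never changes `self.beta` on its own, while the PER paper
# recommends annealing beta toward 1 over training to fully correct the
# sampling bias late in training. A hedged sketch of a linear schedule the
# training loop could apply (function and parameter names are assumptions):
def anneal_beta(agent, step, total_steps, initial_beta=0.4):
    agent.beta = min(1.0, initial_beta + (1.0 - initial_beta) * step / total_steps)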
class Agent(): def __init__(self, state_size, action_size, behavior_name, index_player, replay_memory_size=1e4, batch_size=128, gamma=0.99, learning_rate=1e-3, target_tau=1e-3, update_rate=4, seed=0): self.state_size = state_size self.current_state = [] self.action_size = action_size self.buffer_size = int(replay_memory_size) self.batch_size = batch_size self.gamma = gamma self.learn_rate = learning_rate self.tau = target_tau self.update_rate = update_rate self.seed = random.seed(seed) self.behavior_name = behavior_name self.index_player = index_player self.close_ball_reward = 0 self.touch_ball_reward = 0 """ We define two networks: (a) a network whose weights are updated every (step % update_rate == 0) steps, and (b) a target network whose weights are moved toward network (a) at a slower (target_tau) rate. """ self.network = QNetwork(state_size, action_size, seed).to(device) self.target_network = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step (for updating every update_rate steps) self.t_step = 0 def load_model(self, path_model, path_target=None): self.network.load_state_dict(torch.load(path_model)) if path_target is not None: self.target_network.load_state_dict(torch.load(path_target)) def model_step(self, state, action, reward, next_state): # save experience in replay memory self.memory.add(state, action, reward, next_state) # learn every update_rate time steps; the counter wraps at 1003 so the stp % 1000 check in learn() still fires self.t_step = (self.t_step + 1) % 1003 if self.t_step % self.update_rate == 0: # if enough samples are available in memory, get a random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step) def choose_action(self, state, eps=0.0): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.network.eval() with torch.no_grad(): action_values = self.network(state) self.network.train() # epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) # return a number from 0 to action_size else: return random.choice(np.arange(self.action_size)) # return a number from 0 to action_size def learn(self, experiences, gamma, stp): states, actions, rewards, next_states = experiences # Get Q values for (s, a) from the model network and max Q values for (s', a') from the target network self.network.train() Q_sa = self.network(states).gather(1, actions) Q_sa_prime_target_values = self.target_network(next_states).max(1)[0].to(device).float().detach() # compute Q targets for current states Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1) # Compute loss (error) criterion = torch.nn.MSELoss(reduction='sum') loss = criterion(Q_sa, Q_sa_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target network if stp % 1000 == 0: self.soft_update(self.network, self.target_network, self.tau) def soft_update(self, local_model, target_model, tau): """ local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def Read(self): decision_steps, terminal_steps = env.get_steps(self.behavior_name) try: signal_front = np.array(sensor_front_sig(decision_steps.obs[0][self.index_player, :])) # 3 x 11 x 8 signal_back = np.array(sensor_back_sig(decision_steps.obs[1][self.index_player, :])) # 3 x 3 x 8 r = np.concatenate((signal_front, signal_back), axis=1) self.current_state = r count_close_to_ball = 0 count_touch_ball = 0 count_back_touch = 0 count_back_close = 0 self.rew_d_to_our_post = 0 self.rew_for_ball_dist = -0.1 for i in range(len(signal_front[0])): if signal_front[0][i][0] == 1.0: count_close_to_ball += 1 self.rew_for_ball_dist = max(0.3 * (1 - signal_front[0][i][7]), self.rew_for_ball_dist) if signal_front[0][i][7] <= 0.02: count_touch_ball += 1 if signal_front[0][i][1] == 1.0: self.rew_d_to_our_post = -0.1 if signal_front[0][i][2] == 1.0: self.rew_d_to_our_post = 0.1 for i in range(len(signal_back[0])): if signal_back[0][i][0] == 1.0: count_back_close += 1 if signal_back[0][i][7] <= 0.03: count_back_touch += 1 self.back_touch = 1 if count_back_touch > 0 else 0 self.back_close = 1 if count_back_close > 0 else 0 # reward for kicking the ball self.touch_ball_reward = 2.5 if count_touch_ball > 0 else 0 # penalize if the ball is not in view; penalize when it is only behind us self.close_ball_reward = -0.15 if count_close_to_ball == 0 else 0.2 if count_back_close > 0: self.close_ball_reward = -0.15 return self.current_state except Exception: self.touch_ball_reward = 0 self.close_ball_reward = 0 return self.current_state def upd_after_goal(self, n_upds): self.memory.upd_goal(n_upds) def we_goll(self): self.memory.we_goll() def us_goll(self): self.memory.us_goll()
class Agent: """Interacts with and learns from the environment.""" def __init__(self, action_size, seed, state_size, visual): """Initialize an Agent object. Params ====== action_size (int): dimension of each action seed (int): random seed state_size (int): dimension of each state. Note this can be None if visual is true visual (bool): whether to train the agent on visual pixels or vector observations """ if not visual: self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed) self.qnetwork_local = self.qnetwork_local.to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed) self.qnetwork_target = self.qnetwork_target.to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.beta_start = 0.4 # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, GAMMA) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.batch_no = 0 self.beta_batch_nos = 50_000 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: beta = min(1.0, self.beta_start + (self.batch_no / self.beta_batch_nos) * (1 - self.beta_start)) self.batch_no += 1 experiences = self.memory.sample(beta) self._learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def _learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, sample_indices, weight_update_weights = experiences # Get max predicted Q values (for next states) from target model q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0] # Compute Q targets for current states q_targets = rewards.squeeze(1) + (gamma * q_targets_next * (1 - dones.squeeze(1))) # Get expected Q values from local model q_expected = self.qnetwork_local(states).gather(1, actions).squeeze(1) # Compute loss loss = (q_expected - q_targets.detach()).pow(2) * weight_update_weights prios = loss + 1e-5 loss = loss.mean() # Minimize the loss self.optimizer.zero_grad() loss.backward() self.memory.update_priorities(prios.data.cpu().numpy(), sample_indices) self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
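# --- Added illustration (mine, not the author's) -----------------------------
# Quick numeric check of the beta schedule used in step() above: beta ramps
# linearly from beta_start to 1.0 over beta_batch_nos learning batches, then caps.
beta_start, beta_batch_nos = 0.4, 50_000
for batch_no in (0, 12_500, 25_000, 50_000, 75_000):
    beta = min(1.0, beta_start + (batch_no / beta_batch_nos) * (1 - beta_start))
    print(batch_no, round(beta, 3))  # 0.4, 0.55, 0.7, 1.0, 1.0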
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.criterion = torch.nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: # print('obtaining experiences') experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" self.optimizer.zero_grad() # obtain the estimated q-values of the states from the local network # Then obtain the q_value corresponding to the action taken estimated_q = self.qnetwork_local(states)[ range(BATCH_SIZE), actions.view(-1)].view( -1, 1 ) # there is simpler way to do this by using the .gather() method # estimated_q = self.qnetwork_local(states).gather(1, actions) # print(estimated_q) # print(estimated_q.size(0) == states.size(0)) # now compute the target q-value using the target qnetwork in eval mode self.qnetwork_target.eval() with torch.no_grad(): next_q_max = torch.max(self.qnetwork_target(next_states).detach(), axis=1).values.view(-1, 1) self.qnetwork_target.train() # if done then next_q_max should be zero # print(dones.size()) # print(next_q_max.size()) next_q_max *= (1 - dones) # target value is the sum of rewards and next_q_max discounted by GAMMA targets = rewards + GAMMA * next_q_max # now compute the loss loss = self.criterion(estimated_q, targets) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
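# --- Added sanity check (not from the original notebook) ----------------------
# The learn() above extracts Q(s, a) by fancy indexing and notes that .gather()
# is the simpler route; this confirms the two forms produce identical tensors.
import torch

BATCH, N_ACTIONS = 4, 3
q = torch.randn(BATCH, N_ACTIONS)                    # mock qnetwork_local(states)
actions = torch.randint(0, N_ACTIONS, (BATCH, 1))    # mock sampled actions
via_index = q[range(BATCH), actions.view(-1)].view(-1, 1)
via_gather = q.gather(1, actions)
assert torch.equal(via_index, via_gather)            # identical (BATCH, 1) tensors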
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # Get max predicted Q values for next states from the target model (frozen weights) # # next_states is 64x8 # self.qnetwork_target(next_states) is 64x4 # detach() returns a tensor copy detached from the graph (no gradient) # max(1)[0] returns the the max value in given dim (max value indexes in 2nd array) # => This returns an array of 64 values # Unsqueeze(1)returns a new Tensor of size one inserted at the given position # => This returns a 64X1 tensor Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model (being trained) # x.gather(1, actions) returns a tensor (located on the current device) that is the result of # concataining the input tensor values along the provided dimensions (here the dim indexes are the taken actions indexes) Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
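# --- Added shape walk-through (illustration only) -----------------------------
# Mirrors the comments in learn() above: with 64 next_states and 4 actions,
# max(1)[0] yields shape (64,) and unsqueeze(1) makes it (64, 1), ready to be
# combined with rewards and dones of the same shape.
import torch

next_q = torch.randn(64, 4)              # mock qnetwork_target(next_states)
max_next = next_q.detach().max(1)[0]     # shape (64,)
q_targets_next = max_next.unsqueeze(1)   # shape (64, 1)
rewards, dones = torch.randn(64, 1), torch.zeros(64, 1)
q_targets = rewards + 0.99 * q_targets_next * (1 - dones)
print(max_next.shape, q_targets_next.shape, q_targets.shape)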
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Minimum priority self.eps = 0.0001 def step(self, state, action, reward, next_state, done): # Save experience in replay memory start_time = time.time() self.memory.add(state, action, reward, next_state, done) # print("Sample add time {:.4f}".format(start_time - time.time())) # Learn every UPDATE_EVERY time steps. self.t_step += 1 if len(self.memory) > 0 and (self.t_step % UPDATE_EVERY == 0): # start_time = time.time() experiences = self.memory.sample() # print("Sample time {:.4f}".format(start_time - time.time())) self.learn(experiences, GAMMA, self.t_step) self.memory.updateBeta() if self.t_step % 1e3 == 0: self.memory.reorder() def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma, t_step): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, indices, weights = experiences # Get best actions from local network target_actions = self.qnetwork_local(next_states).detach().max( 1)[1].unsqueeze(1) # And use them to evaluate the target network Q_targets_next = self.qnetwork_target(next_states).detach().gather( 1, target_actions) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # TD Error td_error = (Q_targets - Q_expected).abs().detach().numpy() + self.eps start_time = time.time() self.memory.setPriority(indices, td_error) # print("Update time {:.4f}".format(start_time - time.time())) start_time = time.time() # Compute loss loss = F.mse_loss(weights * Q_expected, weights * Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # print("Backprop time {:.4f}".format(start_time - time.time())) if (t_step % UPDATE_EVERY) == 0: # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.loss = torch.nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory # it seems every element of memory is only one single state, action, reward ... not a series of them. self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() # print(experiences) self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # print("states", states.shape) # print("actions", actions) # print("rewards", rewards.shape) # print("next_states", next_states.shape) ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # the most confusing thing here is that we are training the local model, but our objective is the target model ? # the target model parameters only get updated once in a while. best_action_q_value_target, _ = torch.max(self.qnetwork_target(next_states),1) best_action_q_value_target = best_action_q_value_target.view(-1,1) action_q_values_local = self.qnetwork_local(states).gather(1, actions.view(-1,1)) # action_q_values_target = self.qnetwork_target(states)[actions,] # print("action_q_values_local", best_action_q_value_local.shape) # print("action_q_values_target", action_q_values_target.shape) # error = action_q_values_target - (rewards + gamma*best_action_q_value_local) loss = self.loss(action_q_values_local, rewards + gamma*best_action_q_value_target* (1 - dones)) # loss = torch.sum(error**2) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class AgentPR: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBufferPR(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def priority(self, state, action, reward, next_state, done, gamma, alpha): """ Description: Calculates priority of a given (state action, reward, next_state, done)-tuple. The priority if made non-zero given a positive constant EPS and the significance of the TD error is controlled with the alpha paramteter. Input: state: action: reward: next_state: done: gamma: discount factor alpha: Weighting factor of the experience replays. I.e. how much should we care about the priorities. alpha=0: not to much. alpha=1: quite a bit. """ # taget values state = torch.from_numpy(np.vstack([state])).float().to(device) action = torch.from_numpy(np.vstack([action])).long().to(device) reward = torch.from_numpy(np.vstack([reward])).float().to(device) next_state = torch.from_numpy(np.vstack([next_state ])).float().to(device) done = torch.from_numpy(np.vstack([done]).astype( np.uint8)).float().to(device) qs_target = self.qnetwork_target(next_state).detach() qmax, qmax_index = torch.max(qs_target, axis=1) qmax = qmax.unsqueeze(1) y = reward + gamma * qmax * (1 - done) y_hat = self.qnetwork_local(state).gather(1, action) # delta = TD error (used for prioritized replay) delta = y - y_hat return ((abs(delta) + EPS).detach()**alpha).item() def step(self, state, action, reward, next_state, done, beta): # Calculate priority of the new sample priority = self.priority(state, action, reward, next_state, done, GAMMA, ALPHA) # Save experience in replay memory self.memory.add(state, action, reward, next_state, done, priority) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.prioritized_sample() #experiences = self.memory.prioritized_sample(ALPHA, EPS) self.learn(experiences, GAMMA, ALPHA, beta) def act(self, state, eps=0.): """Returns actions for given state as per current policy. 
Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()).astype(np.int32) # must be int32 to work with Unity else: return random.choice(np.arange(self.action_size)).astype(np.int32) # must be int32 to work with Unity def loss_function(self, y, y_hat, imp_w): """ imp_w: importance-sampling weights for this sample """ return torch.sum(imp_w * torch.clamp((y - y_hat).pow(2), 0, 1)) # Following the clipping approach suggested in: https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf def learn(self, experiences, gamma, alpha, beta): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor alpha (float): priority exponent (the priorities themselves are computed in step) beta (float): bias correction of importance-sampling weights. beta = 1: full bias correction. beta = 0: no bias correction. Weights are rescaled by the maximum priority to keep outliers from impacting the update too heavily. """ states, actions, rewards, next_states, dones, priorities, idxs = experiences # target values qs_target = self.qnetwork_target(next_states).detach() qmax, qmax_index = torch.max(qs_target, dim=1) qmax = qmax.unsqueeze(1) y = rewards + gamma * qmax * (1 - dones) y_hat = self.qnetwork_local(states).gather(1, actions) # delta = TD error (used for prioritized replay) delta = y - y_hat # Importance-sampling weights if self.memory.max_priority: imp_w = (BUFFER_SIZE * priorities)**(-beta) / self.memory.max_priority else: imp_w = torch.ones((len(y), 1)) # uniform weights until a max priority is known imp_w = imp_w.to(device) # Set gradients to zero self.optimizer.zero_grad() # Calculate the loss between target and estimate, weighted by the importance-sampling distribution loss = self.loss_function(y, y_hat, imp_w) # Backpropagation with loss function loss.backward() # Update weights self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
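# --- Added numeric sketch (mine, not the author's) ----------------------------
# The importance-sampling correction applied in learn() above follows the PER
# formula w_i = (N * P(i))**(-beta). This toy example rescales by max(w) so the
# largest weight is 1; the class instead divides by memory.max_priority, which
# likewise only rescales the effective learning rate.
import numpy as np

N, beta = 8, 0.5
priorities = np.array([0.50, 0.25, 0.15, 0.10])  # sampling probabilities P(i)
w = (N * priorities) ** (-beta)
w /= w.max()                                     # rare samples get the largest weight
print(np.round(w, 3))                            # [0.447 0.632 0.816 1.   ]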
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, learning_rate=LR, update_every=UPDATE_EVERY, discount_factor=GAMMA): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) #saving hyperparams self.update_every = update_every self.discount_factor = discount_factor # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, 64, 128).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, 64, 128).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate) # Replay memory self.memory = PrioretizedReplayBuffer( BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.loss_track = [] def eval_action_values(self, state, qnetwork): """ Helper method to evaluate model on given state and return action state values Params ==== state (Torch tensor) - current env state model (QNetwork) - one of the Q networks (qnetwork_local, qnetwork_target) """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) qnetwork.eval() # setting model to inference with torch.no_grad(): action_values = qnetwork(state) qnetwork.train() # setting model back to training return action_values def load_model_weights(self, weights_file): state_dict = torch.load(weights_file) self.qnetwork_local.load_state_dict(state_dict) def step(self, state, action, reward, next_state, done): # calculate TD error in order to save the experience with correct priority into PrioritiedReplayBuffer Q_target_vals = self.eval_action_values(state, self.qnetwork_target).numpy() Q_vals = self.eval_action_values(state, self.qnetwork_local).numpy()[0] td_error = reward + GAMMA*np.max(Q_target_vals) - Q_vals[action] if done != 0 else reward - Q_vals[action] # Save experience in replay memory self.memory.add(state, action, reward, next_state, done, td_error) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, self.discount_factor) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # loss_fn = self.loss_fn ## this is required as we're not learning qnetwork_targets weights # with torch.no_grad(): # Q_target = rewards + gamma * (torch.max(self.qnetwork_target(next_states), dim=1)[0].view(64,1))*(1 - dones) # Q_target[dones == True] = rewards[dones == True] # Q_pred = torch.max(self.qnetwork_local(states), dim=1)[0].view(64,1) ## Double Q-Learning implementation # Find action with highest value using Q network under training (argmax on qnetwork_local) for each S' best_actions_by_local_nn = torch.max(self.qnetwork_local(next_states).detach(), dim=1)[1].unsqueeze(1) # Then use Target Q-network (one not trained atm) to predict Q values for each (S', best_action) pair, which hopefully should be less noisy than Qnetwork_local would predict action_values_by_target_nn = self.qnetwork_target(next_states).detach().gather(1, best_actions_by_local_nn) # once action_values are predicted using Q_target = rewards + gamma * action_values_by_target_nn * (1 - dones) Q_pred = self.qnetwork_local(states).gather(1, actions) self.optimizer.zero_grad() loss = F.mse_loss(Q_pred, Q_target) self.loss_track.append(loss.item()) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class AgentDQN(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBufferDQN(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() #experiences = self.memory.prioritized_sample(ALPHA, EPS) self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()).astype( np.int32) # must be in32 to work with unity else: return random.choice(np.arange(self.action_size)).astype( np.int32) # must be in32 to work with unity def loss_function(self, y, y_hat): return torch.sum((y - y_hat).pow(2)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # taget values qs_target = self.qnetwork_target(next_states).detach() qmax, qmax_index = torch.max(qs_target, axis=1) qmax = qmax.unsqueeze(1) #qmax = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) y = rewards + gamma * qmax * (1 - dones) # current estimate #y_all = self.qnetwork_local(states) #y_hat = torch.Tensor([y_all[i][action] for i, action in enumerate(actions)]).unsqueeze(1) #print('y_hat v1', y_hat) y_hat = self.qnetwork_local(states).gather(1, actions) #print('y_hat v2', y_hat) # Set gradients to zero self.optimizer.zero_grad() # Calculate the loss between target and estimate loss = self.loss_function(y, y_hat) # Backpropagation with loss function loss.backward() # Upate weights self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
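# --- Added verification (illustration only) -----------------------------------
# Every agent in this file uses the same soft update; this confirms that after
# one call each target parameter equals tau * local + (1 - tau) * old_target.
import torch
import torch.nn as nn

tau = 0.1
local, target = nn.Linear(4, 2), nn.Linear(4, 2)
old = [p.clone() for p in target.parameters()]
for t_p, l_p in zip(target.parameters(), local.parameters()):
    t_p.data.copy_(tau * l_p.data + (1.0 - tau) * t_p.data)
for t_p, l_p, o_p in zip(target.parameters(), local.parameters(), old):
    assert torch.allclose(t_p, tau * l_p + (1.0 - tau) * o_p)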
class ddqn_Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.wQ = 0 self.wQ1 = 0 self.wQ2 = 0 # Q-Networks self.qnetwork_Qa = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_Qb = QNetwork(state_size, action_size, seed).to(device) self.optimizer_Qa = optim.Adam(self.qnetwork_Qa.parameters(), lr=LR) self.optimizer_Qb = optim.Adam(self.qnetwork_Qb.parameters(), lr=LR) # Replay memory self.memory = ddqn_ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def print_result(self): somme = self.wQ1 + self.wQ2 + 0.0000001 print("qQ1=", self.wQ1/somme, " qQ2=", self.wQ2/somme) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act_a(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_Qa.eval() with torch.no_grad(): action_values = self.qnetwork_Qa(state) self.qnetwork_Qa.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def act_b(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_Qb.eval() with torch.no_grad(): action_values = self.qnetwork_Qb(state) self.qnetwork_Qb.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def act(self, state, eps=0.): self.wQ = np.random.choice([0, 1]) if self.wQ: return self.act_a(state, eps) else: return self.act_b(state, eps) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## compute and minimize the loss if self.wQ: # wQ takes either 0 or 1, chosen uniformly at random in act() yj = self.qnetwork_Qb(next_states).detach().max(1)[0].unsqueeze(1) Q_targets = rewards + gamma*yj*(1.0 - dones) # Get expected Q values from the network being trained Q_expected = self.qnetwork_Qa(states).gather(1, actions) # Compute loss: mean square error by element loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer_Qa.zero_grad() loss.backward() self.optimizer_Qa.step() # ------------------- update the other network ------------------- # self.soft_update(self.qnetwork_Qa, self.qnetwork_Qb, TAU) else: yj = self.qnetwork_Qa(next_states).detach().max(1)[0].unsqueeze(1) Q_targets = rewards + gamma*yj*(1.0 - dones) # Get expected Q values from the network being trained Q_expected = self.qnetwork_Qb(states).gather(1, actions) # Compute loss: mean square error by element loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer_Qb.zero_grad() loss.backward() self.optimizer_Qb.step() # ------------------- update the other network ------------------- # self.soft_update(self.qnetwork_Qb, self.qnetwork_Qa, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
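# --- Added sketch (mine) of the coin-flip scheme above -------------------------
# On each update one of the two symmetric networks is trained while the other
# supplies the bootstrap value. Note that ddqn_Agent bootstraps with the other
# network's own max; van Hasselt's double Q-learning would instead evaluate the
# other network at the online network's argmax, i.e.
# q_other(s2).gather(1, q_online(s2).argmax(1, keepdim=True)).
import random
import torch
import torch.nn.functional as F

def coin_flip_update(q_a, q_b, opt_a, opt_b, batch, gamma=0.99):
    states, actions, rewards, next_states, dones = batch
    online, other, opt = (q_a, q_b, opt_a) if random.random() < 0.5 else (q_b, q_a, opt_b)
    yj = other(next_states).detach().max(1)[0].unsqueeze(1)   # bootstrap from the other net
    targets = rewards + gamma * yj * (1.0 - dones)
    loss = F.mse_loss(online(states).gather(1, actions), targets)
    opt.zero_grad()
    loss.backward()
    opt.step()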
class DQNAgent: def __init__(self, state_size, action_size, buffer_size=int(1e5), batch_size=64, gamma=.99, tau=1e-3, lr=5e-4, update_every=4, use_double=False, use_dueling=False, use_priority=False, use_noise=False, seed=42): """Deep Q-Network Agent Args: state_size (int) action_size (int) buffer_size (int): Experience Replay buffer size batch_size (int) gamma (float): discount factor, used to balance immediate and future reward tau (float): interpolation parameter for soft-updating the target network lr (float): neural network learning rate update_every (int): how often the agent learns use_double (bool): whether or not to use the Double DQN improvement use_dueling (bool): whether or not to use the dueling network improvement use_priority (bool): whether or not to use prioritized experience replay use_noise (bool): whether or not to use noisy nets for exploration seed (int) """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.use_double = use_double self.use_dueling = use_dueling self.use_priority = use_priority self.use_noise = use_noise random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) # Q-Network if use_dueling: self.qn_local = DuelingQNetwork(state_size, action_size, noisy=use_noise).to(device) else: self.qn_local = QNetwork(state_size, action_size, noisy=use_noise).to(device) if use_dueling: self.qn_target = DuelingQNetwork(state_size, action_size, noisy=use_noise).to(device) else: self.qn_target = QNetwork(state_size, action_size, noisy=use_noise).to(device) # Initialize target model parameters with local model parameters self.soft_update(1.0) # TODO: make the optimizer configurable self.optimizer = optim.Adam(self.qn_local.parameters(), lr=lr) if use_priority: self.memory = PrioritizedReplayBuffer(buffer_size, batch_size) else: self.memory = ReplayBuffer(buffer_size, batch_size) # Initialize time step (for updating every update_every steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): """Step performed by the agent after interacting with the environment and receiving feedback Args: state (array_like) action (int) reward (float) next_state (array_like) done (bool) """ # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every update_every time steps.
self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: if self.use_priority: experiences, indices, weights = self.memory.sample() self.learn(experiences, indices, weights) else: experiences = self.memory.sample() self.learn(experiences) def act(self, state, eps=0.): """Given a state, decide the next action to take Args: state (array_like) eps (float): controls how often we explore instead of taking the greedy action Returns: int: action to take """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qn_local.eval() with torch.no_grad(): action_values = self.qn_local(state) self.qn_local.train() if self.use_noise: return np.argmax(action_values.cpu().numpy()) else: # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, indices=None, weights=None): """Use a batch of experiences to calculate TD errors and update Q networks Args: experiences: tuple with state, action, reward, next_state and done indices (Numpy array): array of indices to update priorities (only used with PER) weights (Numpy array): importance-sampling weights (only used with PER) """ states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) if self.use_priority: weights = torch.from_numpy(np.vstack(weights)).float().to(device) if self.use_double: # uses Double Deep Q-Network # Get the best action using local model best_action = self.qn_local(next_states).argmax(-1, keepdim=True) # Evaluate the action using target model max_q = self.qn_target(next_states).detach().gather(-1, best_action) else: # normal Deep Q-Network # Get max predicted Q value (for next states) from target model max_q = self.qn_target(next_states).detach().max(-1, keepdim=True)[0] # Compute Q targets for current states q_targets = rewards + (self.gamma * max_q * (1 - dones)) # Get expected Q values from local model q_expected = self.qn_local(states).gather(-1, actions) # Compute loss...
if self.use_priority: # Calculate TD error to update priorities weighted_td_errors = weights * (q_targets - q_expected)**2 loss = weighted_td_errors.mean() else: loss = F.mse_loss(q_expected, q_targets) # ...and minimize self.optimizer.zero_grad() loss.backward() self.optimizer.step() if self.use_priority: self.memory.update(indices, weighted_td_errors.detach().cpu().numpy()) # Update target network self.soft_update(self.tau) def soft_update(self, tau): """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target Args: local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(self.qn_target.parameters(), self.qn_local.parameters()): target_param.data.copy_(tau * local_param + (1.0 - tau) * target_param) def make_filename(self, filename): filename = 'noisy_' + filename if self.use_noise else filename filename = 'dueling_' + filename if self.use_dueling else filename filename = 'double_' + filename if self.use_double else filename filename = 'prioritized_' + filename if self.use_priority else filename return filename def save_weights(self, filename='local_weights.pth', path='weights'): filename = self.make_filename(filename) torch.save(self.qn_local.state_dict(), '{}/{}'.format(path, filename)) def load_weights(self, filename='local_weights.pth', path='weights'): self.qn_local.load_state_dict( torch.load('{}/{}'.format(path, filename))) def summary(self): print('DQNAgent:') print('========') print('') print('Using Double:', self.use_double) print('Using Dueling:', self.use_dueling) print('Using Priority:', self.use_priority) print('Using Noise:', self.use_noise) print('') print(self.qn_local)
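# --- Added usage sketch (hypothetical, not from the source) --------------------
# How the configurable DQNAgent above might be driven; QNetwork/DuelingQNetwork
# and the replay buffers are assumed to come from the same repo, and numpy is
# assumed imported as np as elsewhere in this file. The improvement flags can be
# mixed freely; summary() reports which ones are active.
agent = DQNAgent(state_size=8, action_size=4, use_double=True, use_dueling=True, use_priority=False, seed=42)
agent.summary()                                            # prints the active flags
action = agent.act(np.random.randn(8).astype(np.float32), eps=0.1)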
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.loss = None self.loss_list = None self.exp = None # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # self.memory.add2(state, action, reward, next_state, done,None) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn #### Original # if len(self.memory) > BATCH_SIZE: # experiences = self.memory.sample() # self.learn(experiences, GAMMA) #### Testing if len(self.memory.memory2) > BATCH_SIZE: experiences = self.memory.sample2() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) self.loss=loss loss_list = ((Q_expected-Q_targets)**2)**.5 self.loss_list = loss_list # for s,a,r,n,d,ll in zip(states, actions, rewards, next_states, dones,loss_list): # self.memory.add2(s,a,r,n,d,ll) for i in range(len(states)): self.memory.add2(states[i], actions[i], rewards[i], next_states[i], dones[i],loss_list[i].detach().numpy()) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.t_step = 0 def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
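# --- Added smoke test (illustration, not from the source) ----------------------
# Feeds the compact agent above random transitions and lets it take a learning
# step once the buffer holds more than BATCH_SIZE samples; handy for catching
# shape or device errors before a real training run.
import numpy as np

def smoke_test(agent, state_dim=8, steps=200):
    state = np.random.randn(state_dim).astype(np.float32)
    for _ in range(steps):
        action = agent.act(state, eps=1.0)   # act randomly while the buffer fills
        next_state = np.random.randn(state_dim).astype(np.float32)
        agent.step(state, int(action), float(np.random.rand()), next_state, False)
        state = next_state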
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network hidden_layers = [128,64] self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) #Initialising target and local environment with same weights self.hard_update(self.qnetwork_local,self.qnetwork_target) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) def update(self): if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if np.random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" max_actions = self.qnetwork_local.forward(next_states).detach().max(1)[1].unsqueeze(1) output_target = self.qnetwork_target.forward(next_states).gather(1,max_actions) td_target = rewards + gamma*(output_target*(1-dones)) output_local= self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(output_local,td_target) self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1) self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def hard_update(self,local_model,target_model): for target_param,local_param in zip(target_model.parameters(),local_model.parameters()): target_param.data.copy_(local_param)