class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values for next states from the target model (frozen weights) Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model (being trained) # x.gather(1, actions) returns a tensor (located on the current device) that is the result of # concataining the input tensor values along the provided dimensions (here the dim indexes are the taken actions indexes) Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DQNAgent: device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') def __init__(self, osize, asize, seed, buffersize=int(1e6), gamma=0.99, epsilon=0.05, epsilondecay=1e6, epsilonmin=0.1, minibatchsize=128, lr=0.01, tau=0.01): """ Initialize DQN agent parameters. """ # initialize agent parameters self.osize = osize self.asize = asize self.gamma = gamma self.epsilon0 = epsilon self.epsilon = epsilon self.epsilondecay = epsilondecay self.epsilonmin = epsilonmin self.minibatchsize = minibatchsize self.lr = lr self.tau = tau self.stepcount = 0 self.loss_log = [] # set the random seed self.seed = torch.manual_seed(seed) # create local and target Q networks self.Q = QNetwork(osize, asize).to(self.device) self.targetQ = QNetwork(osize, asize).to(self.device) # initialize optimizer self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr) # initialize experience replay self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed) def step(self, state, action, reward, next_state, done): """ Step the agent, and learn if necessary. """ # add experience to replay self.replay.add(state, action, reward, next_state, done) # learn from experiences if len(self.replay) > self.minibatchsize: # create mini batch for learning experiences = self.replay.sample(self.device) # train the agent self.learn(experiences) # increase step count self.stepcount += 1 # decay epsilon decayed_epsilon = self.epsilon * (1 - self.epsilondecay) self.epsilon = max(self.epsilonmin, decayed_epsilon) def get_action(self, state): """ Get an epsilon greedy action. """ # convert network input to torch variable x = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # obtain network output self.Q.eval() with torch.no_grad(): # do not calculate network gradients which will speed things up y = self.Q(x) self.Q.train() # select action if random.random() > self.epsilon: # epsilon greedy action action = np.argmax(y.cpu().data.numpy()) # action is actually action index else: # random action selection action = np.random.choice(np.arange(self.asize)) return action def learn(self, experiences): """ Learn using Double DQN algorithm. """ # unpack experience states, actions, rewards, next_states, dones = experiences # get the greedy next actions from the local network a_max = torch.argmax(self.Q(next_states).detach(), dim=1, keepdim=True) # evaluate those actions with the target Q network target_q = self.targetQ(next_states).detach().gather(1, a_max) # calculate target and local Qs target = rewards + self.gamma * target_q * (1 - dones) local = self.Q(states).gather(1, actions) # calculate loss loss = F.mse_loss(local, target) self.loss_log.append(loss.cpu().data.numpy()) # perform gradient descent step self.optimizer.zero_grad() # reset the gradients to zero loss.backward() self.optimizer.step() # soft update target network for target_params, params in zip(self.targetQ.parameters(), self.Q.parameters()): target_params.data.copy_(self.tau * params.data + (1 - self.tau) * target_params.data)
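# DQNAgent above exposes a step/get_action interface, but no training loop is
# shown in this section. A minimal sketch of how it could be driven by an
# old-style gym environment; `env`, the episode count and the step limit are
# assumptions made for illustration only.
def train_dqn(env, agent, n_episodes=500, max_t=1000):
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores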
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE, buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE, tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, clip=CLIP, **kwds): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed batch_size (int): size of each sample batch buffer_size (int): size of the experience memory buffer start_since (int): number of steps to collect before start training gamma (float): discount factor target_update_every (int): how often to update the target network tau (float): target network soft-update parameter lr (float): learning rate weight_decay (float): weight decay for optimizer update_every (int): update(learning and target update) interval clip (float): gradient norm clipping (`None` to disable) """ if kwds != {}: print("Ignored keyword arguments: ", end='') print(*kwds, sep=', ') assert isinstance(state_size, int) assert isinstance(action_size, int) assert isinstance(seed, int) assert isinstance(batch_size, int) and batch_size > 0 assert isinstance(buffer_size, int) and buffer_size >= batch_size assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1 assert isinstance(target_update_every, int) and target_update_every > 0 assert isinstance(tau, (int, float)) and 0 <= tau <= 1 assert isinstance(lr, (int, float)) and lr >= 0 assert isinstance(weight_decay, (int, float)) and weight_decay >= 0 assert isinstance(update_every, int) and update_every > 0 if clip: assert isinstance(clip, (int, float)) and clip >= 0 self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.batch_size = batch_size self.buffer_size = buffer_size self.start_since = start_since self.gamma = gamma self.target_update_every = target_update_every self.tau = tau self.lr = lr self.weight_decay = weight_decay self.update_every = update_every # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict()) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps) self.u_step = 0 self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.u_step = (self.u_step + 1) % self.update_every if self.u_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) >= self.start_since: experiences = self.memory.sample() self.learn(experiences, GAMMA) # update the target network every TARGET_UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.target_update_every if self.t_step == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def act(self, state, eps=0.): """Returns actions for given state as per current policy. 
Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences with torch.no_grad(): target = rewards + gamma * self.qnetwork_target(next_states).max(dim=1, keepdim=True)[0] * (1 - dones) pred = self.qnetwork_local(states) loss = F.mse_loss(pred.gather(dim=1, index=actions), target) self.optimizer.zero_grad() loss.backward() if CLIP: torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), CLIP) self.optimizer.step() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
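# Every agent in this section samples from a ReplayBuffer with an add / sample /
# __len__ interface that is defined elsewhere. A minimal uniform-sampling sketch
# of such a buffer, assuming the (state, action, reward, next_state, done) tuple
# layout and the [batch_size, 1] column shapes that the learn() methods above
# expect; it reuses the `device` from the preamble sketch near the top of this
# section, and is named differently from the original class to mark it as a sketch.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class UniformReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)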
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) # Target or w- self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory if (PRIORITIZED_REPLY_ENABLED): self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.B = .001 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random # subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if (PRIORITIZED_REPLY_ENABLED): states, actions, rewards, next_states, dones, a_probs = experiences else: states, actions, rewards, next_states, dones = experiences if (DOUBLE_DQN_ENABLED): # We will use the local paramters to get the next best action # We will then get the Q(s', a ) from the target # Get max predicted Q values (for next states) from target model # execute prediction for next states with torch.no_grad(): Q_targets_next = self.qnetwork_local(next_states).detach() # Returns the maximum value of each row of the input tensor in the # given dimension dim. The second return value is the index # location of each maximum value found (argmax). Q_targets_next = np.argmax(Q_targets_next, axis=1) Q_targets_next_prime = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next_prime[list(range(0, len(states))), Q_targets_next].reshape( len(states), 1) else: # Get max predicted Q values (for next states) from target model # execute prediction for next states Q_targets_next = self.qnetwork_target(next_states) # Detaches the Tensor from the graph that created it, making it # a leaf. Q_targets_next = Q_targets_next.detach() # Returns the maximum value of each row of the input tensor in # the given dimension dim. The second return value is the index # location of each maximum value found (argmax). 
Q_targets_next = Q_targets_next.max(1)[0] # Returns a new tensor with a dimension of size one inserted at the # specified position. # Before unsqueeze torch.Size([64]) Q_targets_next = Q_targets_next.unsqueeze(1) # After unsqueeze torch.Size([64, 1]) # Compute Q targets for current states # start with the rewards Q_targets = rewards # add gamma * max Q of the next state, but only if not done Q_targets += (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model # do a forward pass Q_expected = self.qnetwork_local(states) # Gathers values along an axis specified by dim. # Before gather torch.Size([64, 4]) Q_expected = Q_expected.gather(1, actions) # After gather torch.Size([64, 1]) # Calculate the loss if (PRIORITIZED_REPLY_ENABLED): td_error = (Q_expected - Q_targets).detach().abs() + E_REPLAY impSampleWeigt = torch.tensor( ((1 / np.array(a_probs)) * (1 / BUFFER_SIZE))**self.B).float() for i in range(len(experiences)): self.memory.update(states[i], actions[i], rewards[i], next_states[i], dones[i], td_error[i]) loss = F.mse_loss(Q_expected, Q_targets, reduction='none') impSampleWeigt = torch.unsqueeze(impSampleWeigt, 1) if self.B < 0.998: self.B += .001 loss = torch.mean(loss * impSampleWeigt) else: # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
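# The prioritized-replay branch above scales the elementwise loss by importance
# sampling weights of the form (1 / (N * P(i)))**beta, with the exponent self.B
# annealed toward 1. A standalone sketch of that weighting, following the
# prioritized-experience-replay formulation (which additionally normalizes by the
# largest weight); `probs` and `buffer_len` are assumed inputs.
import torch


def importance_sampling_weights(probs, buffer_len, beta):
    probs = torch.as_tensor(probs, dtype=torch.float32)
    weights = (buffer_len * probs).pow(-beta)
    return weights / weights.max()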
class Agent(): """ Interacts with and learns from the environment """ def __init__(self, state_size=4 * 4, action_size=4, seed=42, fc1_units=256, fc2_units=256, fc3_units=256, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, lr=LR, use_expected_rewards=True, predict_steps=2, gamma=GAMMA, tau=TAU): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed fc*_units (int): size of the respective layer buffer_size (int): number of steps to save in replay buffer batch_size (int): self-explanatory lr (float): learning rate use_expected_rewards (bool): whether to predict the weighted sum of future rewards or just for current step predict_steps (int): for how many steps to predict the expected rewards """ TAU = tau GAMMA = gamma self.state_size = state_size self.action_size = action_size self.seed = seed random.seed(seed) np.random.seed(seed) self.batch_size = batch_size self.losses = [] self.use_expected_rewards = use_expected_rewards self.current_iteration = 0 # Game scores self.scores_list = [] self.last_n_scores = deque(maxlen=50) self.mean_scores = [] self.max_score = 0 self.min_score = 1000 self.best_score_board = [] # Rewards self.total_rewards_list = [] self.last_n_total_rewards = deque(maxlen=50) self.mean_total_rewards = [] self.max_total_reward = 0 self.best_reward_board = [] # Max cell value on game board self.max_vals_list = [] self.last_n_vals = deque(maxlen=50) self.mean_vals = [] self.max_val = 0 self.best_val_board = [] # Number of steps per episode self.max_steps_list = [] self.last_n_steps = deque(maxlen=50) self.mean_steps = [] self.max_steps = 0 self.total_steps = 0 self.best_steps_board = [] self.actions_avg_list = [] self.actions_deque = { 0: deque(maxlen=50), 1: deque(maxlen=50), 2: deque(maxlen=50), 3: deque(maxlen=50) } # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units=fc1_units, fc2_units=fc2_units, fc3_units=fc3_units).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units=fc1_units, fc2_units=fc2_units, fc3_units=fc3_units).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) lr_s = lambda epoch: 0.998**( epoch % 1000) if epoch < 100000 else 0.999**(epoch % 1000) self.lr_decay = optim.lr_scheduler.StepLR(self.optimizer, 1000, 0.9999) # Replay buffer self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) # Initialize time step self.t_step = 0 self.steps_ahead = predict_steps def save(self, name): """Saves the state of the model and stats Params ====== name (str): name of the agent version used in dqn function """ torch.save(self.qnetwork_local.state_dict(), base_dir + '/network_local_%s.pth' % name) torch.save(self.qnetwork_target.state_dict(), base_dir + '/network_target_%s.pth' % name) torch.save(self.optimizer.state_dict(), base_dir + '/optimizer_%s.pth' % name) torch.save(self.lr_decay.state_dict(), base_dir + '/lr_schd_%s.pth' % name) state = { 'state_size': self.state_size, 'action_size': self.action_size, 'seed': self.seed, 'batch_size': self.batch_size, 'losses': self.losses, 'use_expected_rewards': self.use_expected_rewards, 'current_iteration': self.current_iteration, # Game scores 'scores_list': self.scores_list, 'last_n_scores': self.last_n_scores, 'mean_scores': self.mean_scores, 'max_score': self.max_score, 'min_score': self.min_score, 'best_score_board': self.best_score_board, # Rewards 'total_rewards_list': self.total_rewards_list, 'last_n_total_rewards': 
self.last_n_total_rewards, 'mean_total_rewards': self.mean_total_rewards, 'max_total_reward': self.max_total_reward, 'best_reward_board': self.best_reward_board, # Max cell value on game board 'max_vals_list': self.max_vals_list, 'last_n_vals': self.last_n_vals, 'mean_vals': self.mean_vals, 'max_val': self.max_val, 'best_val_board': self.best_val_board, # Number of steps per episode 'max_steps_list': self.max_steps_list, 'last_n_steps': self.last_n_steps, 'mean_steps': self.mean_steps, 'max_steps': self.max_steps, 'total_steps': self.total_steps, 'best_steps_board': self.best_steps_board, 'actions_avg_list': self.actions_avg_list, 'actions_deque': self.actions_deque, # Replay buffer 'memory': self.memory.dump(), # Initialize time step 't_step': self.t_step, 'steps_ahead': self.steps_ahead } with open(base_dir + '/agent_state_%s.pkl' % name, 'wb') as f: pickle.dump(state, f) def load(self, name): """Saves the state of the model and stats Params ====== name (str): name of the agent version used in dqn function """ self.qnetwork_local.load_state_dict( torch.load(base_dir + '/network_local_%s.pth' % name)) self.qnetwork_target.load_state_dict( torch.load(base_dir + '/network_target_%s.pth' % name)) self.optimizer.load_state_dict( torch.load(base_dir + '/optimizer_%s.pth' % name)) self.lr_decay.load_state_dict( torch.load(base_dir + '/lr_schd_%s.pth' % name)) with open(base_dir + '/agent_state_%s.pkl' % name, 'rb') as f: state = pickle.load(f) self.state_size = state['state_size'] self.action_size = state['action_size'] self.seed = state['seed'] random.seed(self.seed) np.random.seed(self.seed) self.batch_size = state['batch_size'] self.losses = state['losses'] self.use_expected_rewards = state['use_expected_rewards'] self.current_iteration = state['current_iteration'] # Game scores self.scores_list = state['scores_list'] self.last_n_scores = state['last_n_scores'] self.mean_scores = state['mean_scores'] self.max_score = state['max_score'] self.min_score = state['min_score'] if 'min_score' in state.keys( ) else state['max_score'] self.best_score_board = state['best_score_board'] # Rewards self.total_rewards_list = state['total_rewards_list'] self.last_n_total_rewards = state['last_n_total_rewards'] self.mean_total_rewards = state['mean_total_rewards'] self.max_total_reward = state['max_total_reward'] self.best_reward_board = state['best_reward_board'] # Max cell value on game board self.max_vals_list = state['max_vals_list'] self.last_n_vals = state['last_n_vals'] self.mean_vals = state['mean_vals'] self.max_val = state['max_val'] self.best_val_board = state['best_val_board'] # Number of steps per episode self.max_steps_list = state['max_steps_list'] self.last_n_steps = state['last_n_steps'] self.mean_steps = state['mean_steps'] self.max_steps = state['max_steps'] self.total_steps = state['total_steps'] self.best_steps_board = state['best_steps_board'] self.actions_avg_list = state['actions_avg_list'] self.actions_deque = state['actions_deque'] # Replay buffer self.memory.load(state['memory']) # Initialize time step self.t_step = state['t_step'] self.steps_ahead = state['steps_ahead'] def step(self, state, action, reward, next_state, done, error, action_dist): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done, error, action_dist, None) def act(self, state, eps=0.): """Returns actions for given state as per current policy. 
Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() return action_values.cpu().data.numpy() def learn(self, learn_iterations, mode='board_max', save_loss=True, gamma=GAMMA, weight=None): if self.use_expected_rewards: self.memory.calc_expected_rewards(self.steps_ahead, weight) self.memory.add_episode_experiences() losses = [] if len(self.memory) > self.batch_size: if learn_iterations is None: learn_iterations = self.learn_iterations for i in range(learn_iterations): states, actions, rewards, next_states, dones = self.memory.sample( mode=mode) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, rewards) losses.append(loss.detach().numpy()) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.lr_decay.step() if save_loss: self.losses.append(np.mean(losses)) else: self.losses.append(0) def soft_update(self, local_model, target_model, tau): """NOT USED ANYMORE Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
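# Unlike the other agents in this section, the act() above returns the full vector
# of action values instead of a chosen action index. A minimal sketch of turning
# that output into an epsilon-greedy choice; the function name and the eps handling
# are assumptions for illustration.
import random

import numpy as np


def select_action(agent, state, eps=0.0):
    action_values = agent.act(state)  # shape (1, action_size)
    if random.random() > eps:
        return int(np.argmax(action_values))
    return random.randrange(agent.action_size)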
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.loss = torch.nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss ''' 1. Get the target q-values 2. Get the current q-values 3. compute the loss 4. update the weights using adam optimizer (don't forget to set zero grad) ''' # the current_q_values tensor will have a shape of 64, 1 current_q_values = self.qnetwork_local(states).gather(1, actions) target_q_values = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(-1) td_target = rewards + (gamma * target_q_values) output = self.loss(td_target, current_q_values) # emptying out the gradients before calculating again to ensure there is no wierd addition self.optimizer.zero_grad() # caculating the gradients of the loss function with respect to the weights output.backward() # updating the weights using the stochastic gradient descent self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Parameters: ========== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network (local and target) self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Parameters: ========== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Parameters: ========== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Forward and backward passes output = self.qnetwork_local.forward(states).gather(1, actions) # MSE Loss implementation: self.criterion = nn.MSELoss() loss = self.criterion(output, self.targets(gamma, rewards, next_states, dones)) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def targets(self, gamma, rewards, next_states, dones): with torch.no_grad(): q = self.qnetwork_target.forward(next_states) y = rewards + torch.mul(torch.max(q, dim=1, keepdim=True)[0], gamma) * (1 - dones) return y def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Parameters: ========== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs net = nn.DataParallel(self.qnetwork_local) if torch.cuda.is_available(): print("using GPUs!") net.cuda() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # target net update # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) #logger.info('mse: {}'.format(delta)) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # evolutionary step - increase survival chances #logger.info('avg reward: {} mse:{}'.format(delta, np.mean(experiences.rewards()))) # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
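# In the __init__ above, nn.DataParallel(self.qnetwork_local) is assigned to a local
# variable `net` that is never used again, so the parallel wrapper has no effect on
# training. A minimal sketch of wrapping a network so the wrapped copy is the one
# that is actually kept and optimized; treating that as the intent is an assumption.
import torch
import torch.nn as nn


def wrap_for_multi_gpu(module):
    if torch.cuda.device_count() > 1:
        module = nn.DataParallel(module)
    return module.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))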
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # use helper calc_loss function & not this directly self.criterion = nn.MSELoss() # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ # first unsqueeze to make it a batch with one sample in it state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) # set the mode back to training mode self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: # output of model needs to be Q values of actions 0 - (n-1) And output[ind_x] needs to correspond to action_x return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) # helper for learn func # y_j does not depend on the weight parameters that gradient descent will be training def calc_y_j(self, r_j, dones, gamma, target_out): # 1 or 0 flag; if episode doesn't terminate at j+1 (aka done == False), y_j product now already includes the gamma multiplication factor # use [[x] for x in y] kind of list comprehension because need them to be batch_size by 1 like r_j # use .to(device) to move to gpu (not just setting device arg when creating a tensor) dones_flags = torch.Tensor([[0] if done == True else [gamma] for done in dones]).float().to(device) max_q_target_out = torch.Tensor([[torch.max(q_for_all_actions)] for q_for_all_actions in target_out]).float().to(device) # RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead. #dones_flags = torch.from_numpy(np.vstack([0 if done == True else gamma for done in dones])).float().to(device) #max_q_target_out = torch.from_numpy(np.vstack([torch.max(q_for_all_actions) for q_for_all_actions in target_out])).float().to(device) y_j = r_j + dones_flags * max_q_target_out return y_j # helper for learn func def calc_loss(self, y_j, pred_out, actions): # need pred_out_actions_taken to be a tensor & built by combining (concatenating) other tensors to maintain gradients # actions is batch_size by 1- only have to iterate through rows for i in range(actions.shape[0]): # action taken. 
is an index for which col to look at in pred_out (pred_out is batch_size by n_actions] action_ind = actions[i, 0].item() if i == 0: # need to take h from 0 dimensional to 2 dimensional pred_out_actions_taken = pred_out[i, action_ind].unsqueeze(0).unsqueeze(0) else: # concat along dim 0 -> vertically stack rows pred_out_actions_taken = torch.cat((pred_out_actions_taken, pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)), dim=0) # loss is MSE between pred_out_actions_taken (input) and y_j (target) return self.criterion(pred_out_actions_taken, y_j) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ # torch states, actions, rewards, next_states, dones = experiences "*** YOUR CODE HERE ***" ## TODO: compute and minimize the loss # vstack takes one argument (sequence)- stacks the pieces of that argument vertically # SELF NOTE: think about what (if anything additional) needs .todevice() # make sure to zero the gradients self.optimizer.zero_grad() # q_network_local model output from forward pass pred_out = self.qnetwork_local(states) target_out = self.qnetwork_target(next_states) # compute the loss for q_network_local vs q_network_target y_j = self.calc_y_j(rewards, dones, gamma, target_out) # calc gradient & take step down the gradient loss = self.calc_loss(y_j, pred_out, actions) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
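# calc_y_j and calc_loss above assemble the target and the taken-action Q values
# with Python loops over the batch. A vectorized sketch of the same computation
# using torch.gather, assuming actions, rewards and dones are [batch_size, 1]
# tensors (actions of integer dtype, dones as 0/1 floats):
import torch
import torch.nn.functional as F


def dqn_loss(pred_out, target_out, actions, rewards, dones, gamma):
    y_j = rewards + gamma * target_out.detach().max(1, keepdim=True)[0] * (1 - dones)
    q_taken = pred_out.gather(1, actions.long())
    return F.mse_loss(q_taken, y_j)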
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, 64, 64).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, 64, 64).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.loss_fn = torch.nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), clip_value = 1) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # # Get max predicted Q values (for next states) from target model # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # # Compute Q targets for current states # Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # # Get expected Q values from local model # Q_expected = self.qnetwork_local(states).gather(1, actions) # # Compute loss # loss = F.mse_loss(Q_expected, Q_targets) # # Minimize the loss # self.optimizer.zero_grad() # loss.backward() # self.optimizer.step() optimizer = self.optimizer loss_fn = self.loss_fn ## this is required as we're not learning qnetwork_targets weights # with torch.no_grad(): # Q_target = rewards + gamma * (torch.max(self.qnetwork_target(next_states), dim=1)[0].view(64,1))*(1 - dones) # Q_target[dones == True] = rewards[dones == True] # Q_pred = torch.max(self.qnetwork_local(states), dim=1)[0].view(64,1) ## Double DQNs #argmax on Target W best_actions_by_local_nn = torch.max( self.qnetwork_local(next_states).detach(), dim=1)[1].unsqueeze(1) action_values_by_target_nn = self.qnetwork_target( next_states).detach().gather(1, best_actions_by_local_nn) Q_target = rewards + gamma * action_values_by_target_nn * (1 - dones) Q_pred = self.qnetwork_local(states).gather(1, actions) optimizer.zero_grad() loss = loss_fn(Q_pred, Q_target) loss.backward() optimizer.step() # print("Loss=", loss.item()) # print("Loss=", loss, # "Local params L2=", torch.norm(self.qnetwork_local.parameters(), 2), # "Local params grad L2=", torch.norm(self.qnetwork_local.parameters().grad, 2)) # with torch.no_grad(): # for param in self.qnetwork_local.parameters(): # param -= learning_rate * param.grad # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
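# This agent, and several others in this section, compute the same Double DQN
# target: the local network chooses the greedy next action and the target network
# evaluates it. A standalone sketch of that target, assuming rewards and dones are
# [batch_size, 1] float tensors:
import torch


def double_dqn_target(q_local, q_target, next_states, rewards, dones, gamma):
    with torch.no_grad():
        best_actions = q_local(next_states).argmax(dim=1, keepdim=True)
        next_q = q_target(next_states).gather(1, best_actions)
        return rewards + gamma * next_q * (1 - dones)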
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Initialize learning step for updating beta self.learn_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get prioritized subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA, BETA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma, beta): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor beta (float): initial value for beta, which controls how much importance weights affect learning """ states, actions, rewards, next_states, dones, probabilities, indices = experiences if double_dqn: # Get the Q values for each next_state, action pair from the # local/online/behavior Q network: Q_targets_next_local = self.qnetwork_local( next_states).detach() # Get the corresponding best action for those next_states: _, a_prime = Q_targets_next_local.max(1) # Get the Q values from the target Q network but following a_prime, # which belongs to the local network, not the target network: Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, a_prime.unsqueeze(1)) else: # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target( next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute and update new priorities new_priorities = (abs(Q_expected - Q_targets) + EPSILON_PER).detach() self.memory.update_priority(new_priorities, indices) # Update beta parameter (b). 
By default beta will reach 1 after # 25,000 training steps (~325 episodes in the Banana environment): b = min(1.0, beta + self.learn_step * (1.0 - beta) / BETA_ITERS) self.learn_step += 1 # Compute and apply importance sampling weights to TD Errors ISweights = (((1 / len(self.memory)) * (1 / probabilities))**b) max_ISweight = torch.max(ISweights) ISweights /= max_ISweight Q_targets *= ISweights Q_expected *= ISweights # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
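# The update above multiplies both Q_expected and Q_targets by the importance
# sampling weights before taking an unweighted MSE, which effectively applies the
# weights squared to the TD errors. A sketch of the more common formulation that
# weights the squared TD errors directly; treating that as the intended behaviour
# is an assumption.
def weighted_mse_loss(q_expected, q_targets, is_weights):
    return (is_weights * (q_expected - q_targets) ** 2).mean()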
class Agent(): def __init__(self, state_size, action_size, hidden_layers, buffer_size=int(1e6), batch_size=32, gamma=.99, tau=1, lr=2.5e-4, update_local=4, update_target=10000, ddqn=False, seed=1): """Initialize Agent object Params ====== state_size (int): Dimension of states action_size (int): Dimension of actions hidden_layers (list of ints): number of nodes in the hidden layers buffer_size (int): size of replay buffer batch_size (int): size of sample gamma (float): discount factor tau (float): (soft) update of target parameters lr (float): learning rate update_local (int): update local after every x steps update_target (int): update target after every x steps ddqn (boolean): Double Deep Q-Learning seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Hyperparameters self.buffer_size = buffer_size # replay buffer self.batch_size = batch_size # minibatch size self.gamma = gamma # discount factor self.tau = tau # (soft) update of target parameters self.lr = lr # learning rate self.update_local = update_local # update local network after every x steps self.update_target = update_target # update target network with local network weights # Q Network self.qnet_local = \ QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnet_target = \ QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=lr) # Replay buffer self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) # Initialize time step self.t_step = 0 # Double Deep Q-Learning flag self.ddqn = ddqn def step(self, state, action, reward, next_state, done): # Save experience in replay buffer self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE LOCAL time steps self.t_step += 1 if self.t_step % self.update_local == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: sample = self.memory.sample() if self.t_step % self.update_target == 0: do_target_update = True else: do_target_update = False self.__learn(sample, self.gamma, do_target_update) def act(self, state, epsilon=0): """Returns action given a state according to local Q Network (current policy) Params ====== state (array_like): current state epsilon (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnet_local.eval() with torch.no_grad(): action_values = self.qnet_local(state) self.qnet_local.train() # Epsilon greedy action selection if random.random() > epsilon: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def __learn(self, sample, gamma, do_target_update): """Update value parameters using given batch of sampled experiences tuples Params ====== sample (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = sample if not self.ddqn: # Get max predicted Q values (for next states) from target model Q_targets_next = \ self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1) else: # Get actions (for next states) with max Q values from local net next_actions = \ self.qnet_local(next_states).detach().max(1)[1].unsqueeze(1) # Get predicted Q values from target model Q_targets_next = \ self.qnet_target(next_states).gather(1, next_actions) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next 
* (1 - dones)) # Get expected Q values from local model Q_expected = self.qnet_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update target network if do_target_update: self.__target_net_update(self.qnet_local, self.qnet_target, self.tau) def __target_net_update(self, local_net, target_net, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_net (PyTorch model): weights will be copied from target_net (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param \ in zip(target_net.parameters(), local_net.parameters()): target_param.data.\ copy_(tau*local_param.data + (1.0 - tau)*target_param.data) def get_info(self): output = """ Replay Buffer size: {} \n Batch size: {} \n Discount factor: {} \n tau: {} \n Learning Rate: {} \n Update local network after every {} steps \n Update target network with local network parameters after every {} steps \n DDQN: {} """ print( output.format(self.buffer_size, self.batch_size, self.gamma, self.tau, self.lr, self.update_local, self.update_target, self.ddqn))
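# act() above takes epsilon as an argument, so the exploration schedule lives in
# the training loop rather than in the agent. A minimal sketch of a multiplicative
# decay that could drive it; the start, end and decay values are illustrative
# assumptions.
def epsilon_schedule(eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    while True:
        yield eps
        eps = max(eps_end, eps * eps_decay)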
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory # self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. # Note that train + update is made every C iterations here. # In the algorithm, train is assumed to be done every iteration, whereas # update is done every C iterations. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() # ------------------- train with mini-batch sample of experiences ------------------- # self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model # - qnetwork_target : apply forward pass for the whole mini-batch # - detach : do not backpropagate # - max : get maximizing action for each sample of the mini-batch (dim=1) # - [0].unsqueeze(1) : transform output into a flat array Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states (y) # - dones : detect if the episode has finished Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model (Q(Sj, Aj, w)) # - gather : for each sample select only the output value for action Aj Q_expected = self.qnetwork_local(states).gather(1, actions) # Optimize over (yj-Q(Sj, Aj, w))^2 # * compute loss loss = F.mse_loss(Q_expected, Q_targets) # * minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
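# --- Usage sketch (added for illustration, not part of the original code). It shows how
# Agent.act() and Agent.step() above are meant to be driven by a training loop. It assumes a
# classic Gym-style environment API (reset() -> state, step() -> (state, reward, done, info))
# and that QNetwork, ReplayBuffer and the module-level constants (BUFFER_SIZE, BATCH_SIZE,
# GAMMA, LR, TAU, UPDATE_EVERY, device) are defined as in the rest of this file. The
# environment name and the epsilon schedule are assumptions, chosen only for the example.
import gym

env = gym.make("CartPole-v1")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              seed=0)

eps, eps_min, eps_decay = 1.0, 0.01, 0.995
for episode in range(500):
    state = env.reset()
    score = 0.0
    while True:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store experience + learn
        state, score = next_state, score + reward
        if done:
            break
    eps = max(eps_min, eps_decay * eps)                       # decay exploration over episodes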
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, checkpoint_path=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) # Load the model only when the checkpoint is available if checkpoint_path is not None: self.qnetwork_local.load_state_dict(torch.load(checkpoint_path, map_location=device)) print("Checkpoint loaded successfully") self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences self.optimizer.zero_grad() # Compute the target values (no gradient needed) with torch.no_grad(): # Double DQN: the local network selects the greedy action, the target network evaluates it ddqn_max_indices = self.qnetwork_local(next_states).max(dim=1)[1] target_op = self.qnetwork_target(next_states) target_op = target_op.gather(1, ddqn_max_indices.view(-1, 1)) ''' # DQN target_op = self.qnetwork_target(next_states).max(dim=1)[0].view(-1,1) ''' targets = rewards + target_op * (1 - dones) * gamma predictions = self.qnetwork_local(states) predictions = predictions.gather(1, actions.view(-1, 1)) loss = torch.nn.MSELoss()(predictions, targets) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, compute_weights=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.compute_weights = compute_weights # Algorithms to enable during training self.PrioritizedReplayBuffer = True # Use False to enable uniform sampling self.HardTargetUpdate = True # Use False to enable soft target update # building the policy and target Q-networks for the agent, such that the target Q-network is kept frozen to avoid the training instability issues # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) # main policy network self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) # target network self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.criterion = nn.MSELoss() # Replay memory # building the experience replay memory used to avoid training instability issues # Below: PER self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, seed, compute_weights) # Below: Uniform by method defined in this script #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_NN_EVERY steps) self.t_step_nn = 0 # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps) self.t_step_mem_par = 0 # Initialize time step (for updating every UPDATE_MEM_EVERY steps) self.t_step_mem = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_NN_EVERY time steps. self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY if self.t_step_mem_par == 0: self.memory.update_parameters() if self.t_step_nn == 0: # If enough samples are available in memory, get random subset and learn if self.memory.experience_count > EXPERIENCES_PER_SAMPLING: sampling = self.memory.sample() self.learn(sampling, GAMMA) if self.t_step_mem == 0: self.memory.update_memory_sampling() def act(self, state, eps=0.): """A function to select an action based on the Epsilon greedy policy. Epislon percent of times the agent will select a random action while 1-Epsilon percent of the time the agent will select the action with the highest Q value as predicted by the neural network. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) # here we calculate action values (Q values) self.qnetwork_local.eval( ) # model deactivate norm, dropout etc. layers as it is expected with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train( ) # model.train() sets the modules in the network in training mode # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.cpu().numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, sampling, gamma): """Update value parameters using given batch of experience tuples. Function for training the neural network. 
The function will update the weights of the network Params ====== sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, weights, indices = sampling # Max predicted Q values for the next states, from the (frozen) target Q network q_target = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # TD targets for the current states expected_values = rewards + gamma * q_target * (1 - dones) # Q values predicted by the local (policy) network for the taken actions output = self.qnetwork_local(states).gather(1, actions) # computing the loss loss = F.mse_loss(output, expected_values) # Loss Function: Mean Square Error if self.compute_weights: with torch.no_grad(): weight = sum(np.multiply(weights, loss.data.cpu().numpy())) loss *= weight # Minimizing the loss by optimizer self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) # ------------------- update priorities ------------------- # delta = abs(expected_values - output.detach()).cpu().numpy() #print("delta", delta) self.memory.update_priorities(delta, indices) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) # def hard_update(self): # """ This hard_update method performs direct update of target network # weight update from local network weights instantly""" # Write the algorithm here def load_models(self, policy_net_filename, target_net_filename): """ Function to load the parameters of the policy and target models """ print('Loading model...') self.qnetwork_local.load_model(policy_net_filename) self.qnetwork_target.load_model(target_net_filename)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, compute_weights=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.compute_weights = compute_weights # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.criterion = nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, seed, compute_weights) # Initialize time step (for updating every UPDATE_NN_EVERY steps) self.t_step_nn = 0 # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps) self.t_step_mem_par = 0 # Initialize time step (for updating every UPDATE_MEM_EVERY steps) self.t_step_mem = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_NN_EVERY time steps. self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY if self.t_step_mem_par == 0: self.memory.update_parameters() if self.t_step_nn == 0: # If enough samples are available in memory, get random subset and learn if self.memory.experience_count > EXPERIENCES_PER_SAMPLING: sampling = self.memory.sample() self.learn(sampling, GAMMA) if self.t_step_mem == 0: self.memory.update_memory_sampling() def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, sampling, gamma): """Update value parameters using given batch of experience tuples. Params ====== sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, weights, indices = sampling # Compute and minimize the loss q_target = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) expected_values = rewards + gamma * q_target * (1 - dones) output = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(output, expected_values) if self.compute_weights: with torch.no_grad(): weight = sum(np.multiply(weights, loss.data.cpu().numpy())) loss *= weight self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) # ------------------- update priorities ------------------- # delta = abs(expected_values - output.detach()).cpu().numpy() #print("delta", delta) self.memory.update_priorities(delta, indices) def soft_update(self, local_model, target_model, tau): """Soft update model parameters.
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
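# --- Illustration only: the two agents above rely on a ReplayBuffer that maintains priorities
# and importance-sampling weights. This small numpy sketch shows the standard prioritized-replay
# arithmetic (Schaul et al., 2015) they are built on; the alpha/beta values and function names
# are assumptions, not taken from that ReplayBuffer implementation.
import numpy as np

def per_probabilities(td_errors, alpha=0.6, eps=1e-5):
    # priority p_i = (|delta_i| + eps)^alpha, normalised into a sampling distribution
    p = (np.abs(td_errors) + eps) ** alpha
    return p / p.sum()

def importance_weights(probs, picked_idx, beta=0.4):
    # w_i = (N * P(i))^(-beta), scaled by the max weight for stability
    n = len(probs)
    w = (n * probs[picked_idx]) ** (-beta)
    return w / w.max()

probs = per_probabilities(np.array([0.5, 0.1, 2.0, 0.05]))
idx = np.random.choice(len(probs), size=2, p=probs)
weights = importance_weights(probs, idx)   # would multiply the per-sample loss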
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, update_every=4, double_dqn=False, dueling_dqn=False, prioritized_replay=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor tau (float): for soft update of target parameters lr (float): learning rate update_every (int): how often to update the network double_dqn (bool) use double Q-network when 'True' dueling_dqn (bool): use dueling Q-network when 'True' prioritized_replay (bool): use prioritized replay buffer when 'True' """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.double_dqn = double_dqn self.dueling_dqn = dueling_dqn self.prioritized_replay = prioritized_replay # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, prioritized_replay=prioritized_replay) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for a given state as per the current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using a given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if self.prioritized_replay: states, actions, rewards, next_states, dones, indices, weights = experiences else: states, actions, rewards, next_states, dones = experiences # Get max predicted Q-values (for next states) from target model if self.double_dqn: # Use local model to choose an action, and target model to evaluate that action Q_local_max = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_local_max) else: Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q-targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q-values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) if self.prioritized_replay: priorities = np.sqrt(loss.detach().cpu().data.numpy()) self.memory.update_priorities(indices, priorities) loss = loss * weights loss = loss.mean() # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): where weights will be copied from target_model (PyTorch model): where weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
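# --- Example configuration (illustrative values only): the constructor above exposes each DQN
# extension as a flag, so a Double + Dueling DQN agent with prioritized replay can be built as
# follows. The state/action sizes are assumptions for the example.
agent = Agent(state_size=37,
              action_size=4,
              seed=0,
              buffer_size=int(1e5),
              batch_size=64,
              gamma=0.99,
              tau=1e-3,
              lr=5e-4,
              update_every=4,
              double_dqn=True,
              dueling_dqn=True,
              prioritized_replay=True)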
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, temperature, type_of_update, type_of_loss): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.type_of_loss = type_of_loss print(self.type_of_loss) self.type_of_update = type_of_update print(self.type_of_update) self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.temperature = temperature # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.total_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY self.total_step = (self.total_step + 1) % TARGET_UPDATE if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: # import pdb # pdb.set_trace() # whole_experiences = list(self.memory[:]) # pdb.set_trace() #sample_weights = self.learn_sample_weights(self.memory.memory, GAMMA) #sample_weights=torch.nn.functional.softmax(sample_weights/self.temperature) # if min(sample_weights)<10*(-6): # addition=min(sample_weights) # else: # addition=10*(-6) # sample_weights=sample_weights+addition/100.0 #sample_weights=sample_weights.detach().numpy() #experiences = self.memory.sample(sample_weights) experiences = self.memory.sample() self.learn(experiences, GAMMA) if self.type_of_update == 'hard': if self.total_step == 0: self.hard_update(self.qnetwork_local, self.qnetwork_target) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) # def learn_sample_weights(self, experiences, gamma): # states, actions, rewards, next_states, dones = turn_experiences_into_subcategories(experiences) # # Get max predicted Q values (for next states) from target model # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # # Compute Q targets for current states # Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # # Get expected Q values from local model # Q_expected = self.qnetwork_local(states).gather(1, actions) # sample_weights=abs(Q_targets-Q_expected).reshape(-1,) # return sample_weights def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model if self.type_of_update == 'same': Q_targets_next = self.qnetwork_local(next_states).detach().max( 1)[0].unsqueeze(1) else: Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) #import pdb #pdb.set_trace() sample_weights = abs(Q_targets - Q_expected).reshape(-1, ) # Compute loss if self.type_of_loss == 'mse': loss = F.mse_loss(Q_expected, Q_targets) elif self.type_of_loss == 'huber': loss = F.smooth_l1_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # if self.type_of_update == 'soft': self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def hard_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, lr_decay=0.9999): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network if USE_DUELING_NETWORK: self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, [128, 32], [64, 32]).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, [128, 32], [64, 32]).to(device) self.qnetwork_target.eval() else: self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units=128, fc2_units=32).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units=128, fc2_units=32).to(device) self.qnetwork_target.eval() self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.lr_scheduler = optim.lr_scheduler.ExponentialLR( self.optimizer, lr_decay) # Replay memory if USE_PRIORITIZED_REPLAY_BUFFER: self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device, alpha=0.6, beta=0.4, beta_scheduler=1.0) else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, w = experiences if USE_DOUBLE_DQN: # Double DQN: the local network selects the greedy action, the target network evaluates it self.qnetwork_local.eval() Q_local = self.qnetwork_local(next_states) greedy_actions = torch.argmax(Q_local, dim=1).unsqueeze(1) self.qnetwork_local.train() Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, greedy_actions) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) else: # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss if USE_PRIORITIZED_REPLAY_BUFFER: Q_targets.sub_(Q_expected) Q_targets.squeeze_() Q_targets.pow_(2) with torch.no_grad(): td_error = Q_targets.detach() td_error.pow_(0.5) self.memory.update_priorities(td_error) Q_targets.mul_(w) loss = Q_targets.mean() else: loss = F.mse_loss(Q_expected, Q_targets) # Back-propagation self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.lr_scheduler.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
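# --- Note on the learning-rate schedule above: lr_scheduler.step() is called after every
# learn() call, so the rate shrinks geometrically. Illustrative numbers only (the value of LR
# is assumed here):
lr0, lr_decay, n_updates = 5e-4, 0.9999, 10_000
lr_after = lr0 * lr_decay ** n_updates   # roughly 1.8e-4 after 10,000 updates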
class ddqn_dual_Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = dqn_ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" #detach: Returns a new Tensor, detached from the current graph. #The result will never require gradient. yj=self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1) # print("shape",self.qnetwork_target.forward(next_states).detach()) Q_targets=rewards+gamma*yj*(1.0-dones) # Get expected Q values from local model Q_expected = self.qnetwork_local.forward(states).gather(1, actions) # Compute loss: Mean Square Error by element loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. teta_target = ro*teta_local + (1 - ro)*teta_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
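# --- Side note (illustration, not part of the agent above): despite the ddqn_dual_Agent name,
# the target yj above is the plain DQN target (max over the target network). Double DQN differs
# only in which network selects the greedy action. A minimal sketch, assuming torch is imported
# as elsewhere in this file:
def dqn_and_double_dqn_targets(q_local, q_target, rewards, next_states, dones, gamma):
    """Return (dqn_target, double_dqn_target) for the same batch of transitions."""
    with torch.no_grad():
        # DQN: the target network both selects and evaluates the greedy action
        dqn = rewards + gamma * q_target(next_states).max(1)[0].unsqueeze(1) * (1 - dones)
        # Double DQN: the local network selects, the target network evaluates
        best_a = q_local(next_states).argmax(1, keepdim=True)
        ddqn = rewards + gamma * q_target(next_states).gather(1, best_a) * (1 - dones)
    return dqn, ddqn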
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() # this changes the local net to eval mode with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # this returns the local net back to train mode # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Compute and minimize the loss # Get max predicted Q values (for next states) from target model target_q_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) """ # disregard the action index, keep only the best value # why a batch of next states? the target network returns Q values for every action of each next state; max then picks the best one per sample # explanation on detach (https://discuss.pytorch.org/t/detach-no-grad-and-requires-grad/16915/7) """ # Compute Q targets for current states target_q = rewards + (gamma * target_q_next * (1 - dones)) # Get expected Q values from local model expected_q = self.qnetwork_local(states).gather(1, actions) """ gather is used here (instead of detach + max as for the target) because only the Q value of the action actually taken matters # explanation on gather (https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms) """ # Compute loss loss = F.mse_loss(expected_q, target_q) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters.
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, enable_curiosity): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.enable_curiosity = enable_curiosity # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) # Curiosity Elements self.fwd_model = FwdModel(state_size, action_size, seed).to(device) self.inverse_model = InverseModel(state_size, action_size, seed).to(device) ##Optimizer params_to_opt = list(self.qnetwork_local.parameters()) + list( self.fwd_model.parameters()) + list( self.inverse_model.parameters()) self.optimizer = optim.Adam(params_to_opt, lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.loss_list = [] def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) ##Now get the result of evaluating the forward model ##FIXME: do I need to normalize state? Probably! 
act_onehot = torch.FloatTensor(BATCH_SIZE, self.action_size).to(device) act_onehot.zero_() act_onehot.scatter_(1, actions, 1) ns_expected = self.fwd_model(states, act_onehot) ##Now get the result of evaluating the inverse model a_expected = self.inverse_model(states, next_states) # Compute loss #exploration loss loss1 = F.mse_loss(Q_expected, Q_targets) #inverse model loss criterion = torch.nn.CrossEntropyLoss() loss2 = criterion(a_expected, torch.squeeze(actions)) #forward model loss loss3 = F.mse_loss(ns_expected, next_states) if self.enable_curiosity: loss1 = loss1 * EXTRINSIC_WEIGHT loss2 = loss2 * INVERSE_WEIGHT loss3 = loss3 * FORWARD_WEIGHT loss = loss1 + loss2 + loss3 else: loss = loss1 # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.loss_list.append((loss1, loss2, loss3)) # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
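# --- Illustration of the one-hot encoding used above: scatter_(1, actions, 1) writes a 1 at
# each sampled action index (actions has shape [BATCH_SIZE, 1]). Tiny standalone example with
# made-up values:
import torch

actions = torch.tensor([[2], [0]])                  # two samples, action indices 2 and 0
onehot = torch.zeros(2, 4).scatter_(1, actions, 1)
# onehot -> tensor([[0., 0., 1., 0.],
#                   [1., 0., 0., 0.]])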
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) print("Running on: " + str(device)) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target.eval() self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## DONE: compute and minimize the loss "*** YOUR CODE HERE ***" with torch.no_grad(): # calculate the target rewards for the next_states target_rewards = self.qnetwork_target(next_states) # select the maximum reward for each next_state target_rewards = target_rewards.max(1)[0] # change shape: [batch_size] --> [batch_size, 1] target_rewards = target_rewards.unsqueeze(1) # calculate the discounted target rewards target_rewards = rewards + (gamma * target_rewards * (1 - dones)) # calculate the expected rewards for each action for the states expected_rewards = self.qnetwork_local( states) # shape: [batch_size, action_size] # get the reward for the action selected for each state expected_rewards = expected_rewards.gather( 1, actions) # shape: [batch_size, 1] # calculate the loss loss = F.mse_loss(expected_rewards, target_rewards) # perform the back-propagation self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Vanilla(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, hidden_layers, seed, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA, lr=LR, update_every=UPDATE_EVERY): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.hidden_layers = hidden_layers self.buffer_size = int(buffer_size) self.batch_size = batch_size self.gamma = gamma self.lr = lr self.update_every = update_every # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step (for updating every update_every steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every update_every time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() # Is line below required? Don't think so looks like no-op ... #action_values = self.qnetwork_local(experiences[0]) self.learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ # "unsqueeze" set the batch_size dim which is one here state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss # Should we multiply by weights now -- TEMP loss = F.mse_loss(Q_expected, Q_targets) # Somewhere here update priorities -- TEMP # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
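# --- Illustration of the soft target update used throughout these agents: with tau = 1e-3 the
# target parameters move only 0.1% of the way toward the local parameters per update. Made-up
# values, standalone:
import torch

target_w = torch.tensor([1.00, 2.00])
local_w = torch.tensor([0.00, 4.00])
tau = 1e-3
target_w = tau * local_w + (1.0 - tau) * target_w   # -> tensor([0.9990, 2.0020])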
class AbstractAgent(metaclass=ABCMeta): """Abstract Base Agent""" def __init__(self, state_size, action_size, memory, seed, configs): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.lr = configs['LR'] self.update_every = configs['UPDATE_EVERY'] self.batch_size = configs['BATCH_SIZE'] self.gamma = configs['GAMMA'] self.tau = configs['TAU'] # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory # ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.memory = memory # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Loss self.criterion = nn.MSELoss() def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self._learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) @abstractmethod def _learn(self, experiences, gamma): raise NotImplementedError
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local( state) #same as self.qnetwork_local.forward(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss #"*** YOUR CODE HERE ***" qs_local = self.qnetwork_local.forward(states) qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] qsa_local = qsa_local.reshape((BATCH_SIZE, 1)) #print(qsa_local.shape) # # DQN Target # qs_target = self.qnetwork_target.forward(next_states) # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) # TD_target = rewards + gamma * qsa_target # #print(qsa_target.shape, TD_target.shape, rewards.shape) # # Double DQN Target ver 1 # qs_target = self.qnetwork_target.forward(next_states) # if random.random() > 0.5: # _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)] # else: # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) # #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] # ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) # TD_target = rewards + gamma * qsa_target # Double DQN Target ver 2 (based upon double dqn paper) qs_target = self.qnetwork_target.forward(next_states) _, qsa_local_argmax_a = torch.max( qs_local, dim=1) #using the greedy policy (q-learning) qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] qsa_target = qsa_target * ( 1 - dones.reshape(BATCH_SIZE) ) #target qsa value is zero when episode is complete qsa_target = qsa_target.reshape((BATCH_SIZE, 1)) TD_target = rewards + gamma * qsa_target #print(qsa_target.shape, TD_target.shape, rewards.shape) # #Udacity's approach # # Get max predicted Q values (for next states) from target model # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # # Compute Q targets for current states # TD_target = rewards + (gamma * Q_targets_next * (1 - dones)) # # Get expected Q values from local model # qsa_local = self.qnetwork_local(states).gather(1, actions) #diff = qsa_local - TD_target #loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar loss = F.mse_loss(qsa_local, TD_target) #much faster than the above loss function #print(loss) #minimize the loss self.optimizer.zero_grad() #clears the gradients loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
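# --- Equivalence note (illustration): the fancy indexing used in the learn() method above,
#   qs[torch.arange(BATCH_SIZE), actions.reshape(BATCH_SIZE)].reshape(BATCH_SIZE, 1)
# selects the same per-row Q values as the gather() call used by the other agents. Tiny
# standalone check with made-up values:
import torch

qs = torch.tensor([[0.1, 0.9], [0.7, 0.3]])
a = torch.tensor([[1], [0]])
v1 = qs[torch.arange(2), a.reshape(2)].reshape(2, 1)
v2 = qs.gather(1, a)
assert torch.equal(v1, v2)   # both tensor([[0.9000], [0.7000]])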
class AgentUniform(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, hidden_layers, lr=5e-4): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = seed self.gamma = GAMMA # Q-Network self.lr = lr self.qnetwork_local = QNetwork(state_size, action_size, self.seed, hidden_layers).to(device) self.qnetwork_target = QNetwork(state_size, action_size, self.seed, hidden_layers).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.checkpoint = { "input_size": self.state_size, "output_size": self.action_size, "hidden_layers": [each.out_features for each in self.qnetwork_local.hidden_layers], "state_dict": self.qnetwork_local.state_dict() } self.checkpointfile = 'vanilla_dpq.pth' def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn NUM_LEARNS times par every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0 and len(self.memory) >= MIN_BUF_SIZE: for i in range(NUM_LEARNS): experiences = self.memory.sample() self.learn(experiences) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()).astype(np.int32) else: return random.choice(np.arange(self.action_size)).astype(np.int32) def learn(self, experiences): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def update_qtarget(self): for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()): target_param.data.copy_(local_param.data) def set_lr(self, lr): self.lr = lr def load_model(self, filepath): checkpoint = torch.load(filepath) self.qnetwork_local = QNetwork(checkpoint["input_size"], checkpoint["output_size"], self.seed, checkpoint["hidden_layers"]) self.qnetwork_local.load_state_dict(checkpoint["state_dict"]) def get_gamma(self): return self.gamma def save_model(self): torch.save(self.checkpoint, self.checkpointfile)
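# --- Illustrative sketch (not part of AgentUniform) ---------------------------------
# The checkpoint dictionary built in __init__ stores the architecture next to the
# state_dict so the network can be rebuilt before loading weights. The nn.Sequential
# stand-in and the file name below are assumptions used only for demonstration; they
# are not the original QNetwork.
import torch
from torch import nn

net = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))
checkpoint = {
    "input_size": 8,
    "output_size": 4,
    "hidden_layers": [64],
    "state_dict": net.state_dict(),
}
torch.save(checkpoint, "checkpoint_demo.pth")

restored = torch.load("checkpoint_demo.pth")
rebuilt = nn.Sequential(
    nn.Linear(restored["input_size"], restored["hidden_layers"][0]),
    nn.ReLU(),
    nn.Linear(restored["hidden_layers"][0], restored["output_size"]),
)
rebuilt.load_state_dict(restored["state_dict"])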
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network h_size = 128 self.qnetwork_local = QNetwork(state_size, action_size, h_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, h_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.criterion = torch.nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # get predictions (actions with their values?) with torch.no_grad(): action_values = self.qnetwork_target( next_states) #.detach().max(1)[0].unsqueeze(1) max_idx = torch.argmax(action_values, 1).unsqueeze(1) y_targets = action_values.gather(1, max_idx) values = rewards + (1 - dones) * gamma * ( y_targets) # if done (=1) then we just use reward value y_expected = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(y_expected, values) self.optimizer.zero_grad() # back prop loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
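# --- Quick check with throwaway tensors (not part of the agent above) ---------------
# In the learn() above, both the argmax and the gather are applied to the *target*
# network's output, which is equivalent to taking .max(1) directly, so this variant
# still computes a plain DQN target rather than a Double DQN target.
import torch

q_next = torch.tensor([[0.1, 0.7], [0.9, 0.2]])
via_gather = q_next.gather(1, q_next.argmax(1).unsqueeze(1))
via_max = q_next.max(1)[0].unsqueeze(1)
assert torch.equal(via_gather, via_max)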
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, fc1_units = 64, fc2_units = 64): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed fc1_units (int): fully connected layer 1 size fc2_units (int): fully connected layer 2 size """ self.state_size = state_size self.action_size = action_size random.seed(seed) self.seed = random.randint(1,1000) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device) print(self.qnetwork_local.parameters) self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: # return np.argmax(action_values.cpu().data.numpy()).astype('int') #int32 here makes unity env happy return int(np.argmax(action_values.cpu().data.numpy())) #try casting this to native python int to make unity happy else: # return random.choice(np.arange(self.action_size)).astype('int') return int(random.choice(np.arange(self.action_size))) #try casting this to native python int to make unity happy ######################## # the commented return lines are believed to cause errors' in Udacity reviewer's hardware. int64 that comes out of the argmax function # by default appears to have cause the same issues on my own hardware. Casting to int32 fixed that for me. Now trying native python int ######################## def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
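# --- Standalone illustration of the dtype issue mentioned in act() ------------------
# np.argmax returns a NumPy integer (typically np.int64), which some environment
# front-ends reject; wrapping it in int() yields a native Python int. The array below
# is an arbitrary example.
import numpy as np

action_values = np.array([0.1, 0.5, 0.2, 0.2])
a = np.argmax(action_values)
print(type(a))       # <class 'numpy.int64'> on most platforms
print(type(int(a)))  # <class 'int'>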
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Standard DQN update, filled in to match the other agents in this file.
        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute and minimize the loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
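# --- Hedged sketch of a driver loop for the step()/act() interface ------------------
# Not taken from the original notebook: the gym-style env.reset()/env.step() API, the
# episode count, and the epsilon schedule below are assumptions; they only show how an
# agent with this interface is typically trained.
def train(agent, env, n_episodes=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()          # assumes a classic gym-style reset
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)  # assumes 4-tuple step API
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration each episode
    return scores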
class DDQNAgentPrioExpReplay: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=PARAM.LR) # Replay memory self.memory = PrioritizedReplayBuffer(action_size, 20000, PARAM.BATCH_SIZE, 0, PARAM.PROBABILITY_EXPONENT) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.eps = 1 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > PARAM.BATCH_SIZE: experiences, experience_indices, importance_weights = self.memory.sample( ) self.learn(experiences, experience_indices, importance_weights, PARAM.GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ self.eps = eps state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def get_ddqn_targets(self, next_states, rewards, gamma, dones): # get best action according to online value function approximation q_online = self.qnetwork_local(next_states).detach() q_online = q_online.argmax(1) # get value of target function at position of best online action q_target = self.qnetwork_target(next_states).detach() q_target = q_target.index_select(1, q_online)[:, 0] # reshape q_target = q_target.unsqueeze(1) # calculate more correct q-value given the current reward Q_targets = rewards + (gamma * q_target * (1 - dones)) return Q_targets def learn(self, experiences, experience_indices, importance_weights, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences Q_targets = self.get_ddqn_targets(next_states, rewards, gamma, dones) # Get expected Q values q_exp = self.qnetwork_local(states) # print(q_exp) # gets the q values along dimension 1 according to the actions, which is used as index # >>> t = torch.tensor([[1,2],[3,4]]) # >>> torch.gather(t, 1, torch.tensor([[0],[1]])) # tensor([[ 1], # [ 4]]) q_exp = q_exp.gather(1, actions) # print(q_exp) error = torch.abs(q_exp - Q_targets) with torch.no_grad(): # update priority # we need ".cpu()" here because the values need to be copied to memory before converting them to numpy, # else they are just present in the GPU errors = np.squeeze(error.cpu().data.numpy()) self.memory.set_priorities(experience_indices, errors) # compute loss squared_error = torch.mul(error, error) with torch.no_grad(): w = torch.from_numpy( importance_weights**(1 - self.eps)).float().to(device) w = w.detach() squared_error = torch.squeeze(squared_error) weighted_squared_error = torch.mul(squared_error, w) loss = torch.mean(weighted_squared_error) #loss = F.mse_loss(q_exp, Q_targets) # reset optimizer gradient self.optimizer.zero_grad() # do backpropagation loss.backward() # do optimize step self.optimizer.step() # ------------------- update target network ------------------- # # according to the algorithm in # https://proceedings.neurips.cc/paper/2010/file/091d584fced301b442654dd8c23b3fc9-Paper.pdf # one should update randomly in ether direction #update_direction = np.random.binomial(1, 0.5) #if update_direction == 0: # self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU) #else: # self.soft_update(self.qnetwork_target, self.qnetwork_local, PARAM.TAU) self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
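# --- Hedged sketch of proportional prioritized sampling -----------------------------
# The PrioritizedReplayBuffer used by DDQNAgentPrioExpReplay is not shown in this file;
# the snippet below only illustrates the usual scheme: priorities p_i are raised to an
# exponent alpha to form sampling probabilities, and importance weights
# (N * P(i))^(-beta) correct the resulting bias. The alpha, beta, and priority values
# are illustrative assumptions.
import numpy as np

priorities = np.array([0.5, 2.0, 1.0, 0.1])
alpha, beta, batch = 0.6, 0.4, 2

probs = priorities**alpha / np.sum(priorities**alpha)
idx = np.random.choice(len(priorities), size=batch, p=probs)
weights = (len(priorities) * probs[idx]) ** (-beta)
weights /= weights.max()  # normalize so the largest weight is 1
print(idx, weights)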