def _optimize(self): """Sample batch from experience replay pool and update the policy""" if len(self.memory) < self.BATCH_SIZE: return transitions, weights, idxes = self.memory.sample( self.BATCH_SIZE, self.beta.anneal()) batch = Transition(*zip(*transitions)) states = torch.cat(batch.state) actions = torch.cat(batch.action) rewards = torch.cat(batch.reward) weights = torch.tensor(weights, dtype=torch.float32, device=self.device) q = self._q(states, actions) expected_q = self._expected_q(batch.next_state, rewards) # update the priority of each transition td_error = expected_q - q new_priorities = torch.abs(td_error) + self.eps self.memory.update_priorities(idxes, new_priorities.flatten()) # Compute Huber loss loss = F.smooth_l1_loss(q, expected_q, reduction='none') loss = (weights * loss).mean() # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step()
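# The snippet above assumes a prioritized replay buffer whose sample() returns
# (transitions, importance-sampling weights, indexes), plus update_priorities()
# and an annealed beta schedule. Those definitions are not part of this
# collection, so the following is only a minimal sketch under those assumed
# interfaces (class and parameter names here are hypothetical); a real
# implementation would typically use a sum tree for O(log n) sampling.
import numpy as np


class LinearBeta:
    """Anneal the importance-sampling exponent beta from beta0 towards 1."""

    def __init__(self, beta0=0.4, steps=100_000):
        self.beta0, self.steps, self.t = beta0, steps, 0

    def anneal(self):
        self.t = min(self.t + 1, self.steps)
        return self.beta0 + (1.0 - self.beta0) * self.t / self.steps


class PrioritizedReplay:
    """Proportional prioritized replay with O(n) sampling (kept simple on purpose)."""

    def __init__(self, capacity, alpha=0.6):
        self.capacity, self.alpha = capacity, alpha
        self.data, self.priorities, self.pos = [], [], 0

    def __len__(self):
        return len(self.data)

    def push(self, transition):
        # New transitions get the current maximum priority so they are replayed at least once
        max_p = max(self.priorities, default=1.0)
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(max_p)
        else:
            self.data[self.pos] = transition
            self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta):
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idxes = np.random.choice(len(self.data), batch_size, p=probs)
        # Importance-sampling weights, normalized so the largest weight is 1
        weights = (len(self.data) * probs[idxes]) ** (-beta)
        weights /= weights.max()
        return [self.data[i] for i in idxes], weights, idxes

    def update_priorities(self, idxes, priorities):
        for i, p in zip(idxes, priorities):
            self.priorities[int(i)] = float(p)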
def optimize():
    if len(replaymemory) < BATCH_SIZE:
        return

    # Sample a batch of transitions and transpose it into a Transition of batches
    transitions = replaymemory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    actions = tuple(map(lambda a: torch.tensor([[a]], device=device), batch.action))
    rewards = tuple(map(lambda r: torch.tensor([r], device=device), batch.reward))
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(actions)
    reward_batch = torch.cat(rewards)
    next_state_batch = batch.next_state
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state_batch)),
                                  dtype=torch.bool, device=device)
    non_final_next_states = torch.cat(
        [s for s in next_state_batch if s is not None]).to(device)

    # Policy net outputs Q values for the current states; gather the taken actions
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Target net outputs Q values for the next states (zero for terminal states)
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def train(self, ds):
    dataset = ds.get_dataset()
    loss = 0
    q_score = 0
    act = 0
    N = 10000
    l, s, n = 0, 0, 0
    for i in range(30_000_000):
        state, action, reward, next_state, raw = next(dataset)
        self.agent.push(Transition(state, action, reward, next_state), i)
        if i % 2 == 0 and i:
            l, s = self.agent.train(i)
            if l:
                loss += l
                q_score += s
                n += 1
        if i % N == 0:
            act = self.agent.act(state, i)
            # print(time.time() - self.t)
            if n:
                print('step %d, train: %d, act:%.2f, score:%.2f, loss:%.4f'
                      % (i, n, act, q_score / n, loss / n))
            self.t = time.time()
            loss, q_score, n = 0, 0, 0
            act = 0.0
            # print('update model file')
            self.save()
def train(self, env: gym.Env, n_steps):
    rewards = []
    steps = []
    episode_rewards = []
    state = np_to_unsq_tensor(env.reset())
    loop_range = tqdm.tqdm(range(n_steps))
    for step in loop_range:
        with torch.no_grad():
            z = self.z_net(state)
        if random.random() < self.epsilon:
            # Random action
            action = torch.LongTensor([[env.action_space.sample()]])
        else:
            action = select_argmax_action(z, self.atoms)
        next_state, reward, done, info = env.step(squeeze_np(action))
        next_state = np_to_unsq_tensor(next_state) if not done else None
        self.replay_buffer.remember(
            Transition(state, action, torch.tensor([[reward]]), next_state))
        state = next_state

        # Perform training step
        self._train_step(step)

        # Update episode stats
        episode_rewards.append(reward)
        if done:
            state = np_to_unsq_tensor(env.reset())
            rewards.append(sum(episode_rewards))
            steps.append(step)
            episode_rewards = []
            loop_range.set_description(f'Reward {rewards[-1]}')
    return Plot(steps, rewards, None)
def update(self):
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))

    # Compute Q(s', a') for all a'
    # TODO: Use a target network???
    next_qvals = self.network(batch.next_state, batch.next_acts)
    # Take the max over next q-values
    next_qvals = torch.tensor([vals.max() for vals in next_qvals], device=device)
    # Zero all the next_qvals that are done
    next_qvals = next_qvals * (
        1 - torch.tensor(batch.done, dtype=torch.float, device=device))
    targets = torch.tensor(batch.reward, dtype=torch.float, device=device) + self.gamma * next_qvals

    # Next compute Q(s, a)
    # Nest each action in a list - so that it becomes the only admissible cmd
    nested_acts = tuple([[a] for a in batch.act])
    qvals = self.network(batch.state, nested_acts)
    # Combine the qvals: Maybe just do a greedy max for generality
    qvals = torch.cat(qvals)

    # Compute Huber loss
    loss = F.smooth_l1_loss(qvals, targets.detach())
    self.optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
    self.optimizer.step()
    return loss.item()
def fit_buffer(self):
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))

    # Update actor and critic according to the batch
    actor_loss, critic_loss = self.agent.update_params(batch)
    self.metrics['actor_loss'].append(actor_loss)
    self.metrics['critic_loss'].append(critic_loss)
def update(self):
    if len(self.memory) < self.BATCH_SIZE:
        return

    # get training batch
    transitions = self.memory.sample(self.BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward).unsqueeze(1)
    next_state = torch.cat(batch.next_state)

    # update value network
    state_action = torch.cat((state_batch, action_batch), dim=1)
    state_action_value = self.value_network(state_action)
    next_action = self.action_target_network(next_state).detach()
    next_state_action = torch.cat((next_state, next_action), dim=1)
    next_state_action_value = self.value_target_network(next_state_action).detach()
    expected_state_action_value = (self.DISCOUNT * next_state_action_value) + reward_batch
    value_loss = self.criterion(state_action_value, expected_state_action_value)
    self.value_optimizer.zero_grad()
    value_loss.backward()
    self.value_optimizer.step()

    # update action network
    optim_action = self.action_network(state_batch)
    optim_state_action = torch.cat((state_batch, optim_action), dim=1)
    action_loss = -self.value_network(optim_state_action)
    action_loss = action_loss.mean()
    self.action_optimizer.zero_grad()
    action_loss.backward()
    self.action_optimizer.step()

    # update target network
    soft_update(self.value_target_network, self.value_network, 0.01)
    soft_update(self.action_target_network, self.action_network, 0.01)
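# soft_update() is called above but not defined in this collection. Below is a
# minimal sketch of the usual Polyak-averaging helper, assuming the argument
# order implied by the call sites (target network first, source network second);
# the actual project's helper may differ.
import torch


def soft_update(target_net, source_net, tau):
    """theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(),
                                              source_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * source_param.data)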
def optimize(self, step):
    if len(self.memory) < self.batch_size * 10:
        return
    transitions = self.memory.sample(self.batch_size)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for a
    # detailed explanation). This converts a batch-array of Transitions
    # to a Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    next_state = torch.FloatTensor(batch.next_state).to(device)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
    non_final_next_states = torch.cat([s for s in next_state if s is not None])
    state_batch = torch.FloatTensor(batch.state).to(device)
    action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device)
    reward_batch = torch.FloatTensor(batch.reward).to(device)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    policy_out = self.policy_net(state_batch)
    l = policy_out.size(0)
    state_action_values = policy_out[95:l:96].gather(
        1, action_batch.reshape((self.batch_size, 1)))
    state_action_values = state_action_values.squeeze(-1)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(self.batch_size, device=device)
    next_state_values[non_final_mask] = self.target_net(next_state)[95:l:96].max(1)[0].detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * self.gamma) + reward_batch

    # Compute the loss
    loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()

    if step % self.T == 0:
        # Polyak (soft) update of the target network
        tau = 0.001
        target_update = copy.deepcopy(self.target_net.state_dict())
        for k in target_update.keys():
            target_update[k] = self.target_net.state_dict()[k] * (1 - tau) \
                + self.policy_net.state_dict()[k] * tau
        self.target_net.load_state_dict(target_update)
def update(self):
    if len(self.memory) < self.batch_size:
        return
    batch_loss = None
    num_per_step = int(self.batch_size / self.accummulate_step)
    for _ in range(self.accummulate_step):
        transitions = self.memory.sample(num_per_step)
        batch = Transition(*zip(*transitions))

        # Compute Q(s', a') for all a'
        # TODO: Use a target network???
        next_history = []
        for act, history in zip(batch.act, batch.history):
            next_history.append(history + [act])
        next_qvals = self.network(batch.next_state, batch.next_acts, next_history)
        # Take the max over next q-values
        next_qvals = torch.tensor([vals.max() for vals in next_qvals], device=device)
        # Zero all the next_qvals that are done
        next_qvals = next_qvals * (
            1 - torch.tensor(batch.done, dtype=torch.float, device=device))
        targets = torch.tensor(batch.reward, dtype=torch.float, device=device) + self.gamma * next_qvals

        # Next compute Q(s, a)
        # Nest each action in a list - so that it becomes the only admissible cmd
        nested_acts = tuple([[a] for a in batch.act])
        qvals = self.network(batch.state, nested_acts, batch.history)
        # Combine the qvals: Maybe just do a greedy max for generality
        qvals = torch.cat(qvals)

        # Compute Huber loss and accumulate it over the mini-batches
        loss = F.smooth_l1_loss(qvals, targets.detach())
        if batch_loss is None:
            batch_loss = loss
        else:
            batch_loss += loss

    # Average over the accumulation steps before backpropagating
    batch_loss /= self.accummulate_step
    self.optimizer.zero_grad()
    batch_loss.backward()
    nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
    self.optimizer.step()
    # self.scheduler.step()
    return batch_loss.item()
def optimize_model(self, config):
    # transitions = self.memory.sample(config.batch_size)
    # PrioritizedReplayMemory
    transitions, weights, indices = self.memory.sample(config.batch_size, config.beta)
    transitions = self.transition_to_tensor(transitions)
    batch = Transition(*zip(*transitions))
    loss, weights_loss = self.get_loss(batch, config, weights, config.gamma)

    # N Step
    transitions_n, _, _ = self.memory_n.sample_from_indices(
        config.batch_size, config.beta, indices)
    transitions_n = self.transition_to_tensor(transitions_n)
    batch_n = Transition(*zip(*transitions_n))
    gamma_n = config.gamma**config.n_step
    loss_n, weights_loss_n = self.get_loss(batch_n, config, weights, gamma_n)
    weights_loss += weights_loss_n

    self.optimizer.zero_grad()
    # loss.backward()
    # PrioritizedReplayMemory
    weights_loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()

    # PrioritizedReplayMemory
    loss_for_prior = loss.detach().cpu().numpy()
    new_priorities = loss_for_prior + config.prior_eps
    self.memory.update_priorities(indices, new_priorities)
    # N Step
    self.memory_n.update_priorities(indices, new_priorities)

    # Noisy Net
    self.policy_net.reset_noise()
    self.target_net.reset_noise()
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for a
    # detailed explanation). This converts a batch-array of Transitions
    # to a Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
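# Most of the snippets in this collection rely on the tutorial-style Transition
# namedtuple and ReplayMemory buffer from the PyTorch DQN tutorial. A minimal
# sketch is given below for reference; the field order must match how each
# project pushes its transitions, so adjust it accordingly.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Fixed-size cyclic buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)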
def optimize():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)

    actual_q = policy_net(states).gather(1, actions)
    expected_q_value = expected_q(batch.next_state, rewards)

    # loss between actual q and expected q
    loss = F.smooth_l1_loss(actual_q, expected_q_value)

    # optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
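# expected_q() (and the _expected_q() method in the related snippets) is not
# defined in this collection. The sketch below is only a plausible guess at the
# one-step TD target it computes, assuming module-level target_net, BATCH_SIZE,
# GAMMA, and device like the other snippets, with terminal next states given as
# None; the original helper may differ.
def expected_q(next_states, rewards):
    non_final_mask = torch.tensor([s is not None for s in next_states],
                                  dtype=torch.bool, device=device)
    non_final_next_states = torch.cat([s for s in next_states if s is not None])
    next_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    # r + gamma * max_a Q_target(s', a), with 0 for terminal s';
    # unsqueeze to match the (batch, 1) shape of the gathered Q values.
    return (next_values * GAMMA + rewards).unsqueeze(1)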
def step(self):
    self.env.steps_done += 1
    x = self.env.input().reshape(self.n_agents, -1).cuda(self.device)
    phi = []
    for i in range(self.n_agents):
        with torch.no_grad():
            phi.append(self.Encoder(x[i]))

    # after encoding
    n = torch.cat(phi, dim=0).reshape(self.n_agents, -1)

    # state: each agent observes its own input plus the other agents' encodings
    s = []
    for i in range(self.n_agents):
        ni = torch.cat((n[0:i], n[i + 1:])).reshape(-1)
        s.append(torch.cat((x[i], ni)))
    s = torch.cat(s).reshape(self.n_agents, -1)

    # epsilon-greedy
    actions = self.select_action(s)

    # collect rewards
    rewards = self.env.step(actions)
    if rewards == -1:
        return "done"

    # state_{t+1}
    x_tp1 = self.env.input().reshape(self.n_agents, -1).cuda(self.device)
    phi_tp1 = []
    for i in range(self.n_agents):
        with torch.no_grad():
            phi_tp1.append(self.Encoder(x_tp1[i]))
    n_tp1 = torch.cat(phi_tp1, dim=0).reshape(self.n_agents, -1)
    s_tp1 = []
    for i in range(self.n_agents):
        ni = torch.cat((n_tp1[0:i], n_tp1[i + 1:])).reshape(-1)
        s_tp1.append(torch.cat((x_tp1[i], ni)))
    s_tp1 = torch.cat(s_tp1).reshape(self.n_agents, -1)

    # build the Transition tuple for each agent
    res = []
    for i in range(self.n_agents):
        res.append(
            Transition(state=s[i],
                       action=actions[i],
                       next_state=s_tp1[i],
                       reward=rewards[i]))
    return res
def _optimize(self):
    if len(self.memory) < self.BATCH_SIZE:
        return
    transitions = self.memory.sample(self.BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)

    # calculate q value and expected q value
    q = self._q(states, actions)
    expected_q = self._expected_q(batch.next_state, rewards)
    loss = F.smooth_l1_loss(q, expected_q)

    # optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def learn(self, mem):
    transitions = mem.sample(self.batch_size)
    batch = Transition(*zip(*transitions))  # Transpose the batch

    states = Variable(torch.stack(batch.state, 0))
    actions = Variable(torch.LongTensor(batch.action).unsqueeze(1))
    rewards = Variable(torch.Tensor(batch.reward))
    non_final_mask = torch.ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))  # Only process non-terminal next states
    next_states = Variable(
        torch.stack(tuple(s for s in batch.next_state if s is not None), 0),
        volatile=True)  # Prevent backpropagating through expected action values

    Qs = self.policy_net(states).gather(1, actions)  # Q(s_t, a_t; θpolicy)
    next_state_argmax_indices = self.policy_net(next_states).max(
        1, keepdim=True)[1]  # Argmax action selection using the policy network: argmax_a[Q(s_t+1, a; θpolicy)]
    Qns = Variable(torch.zeros(self.batch_size))  # Q(s_t+1, a) = 0 if s_t+1 is terminal
    Qns[non_final_mask] = self.target_net(next_states).gather(
        1, next_state_argmax_indices)  # Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget)
    Qns.volatile = False  # Remove volatile flag to prevent propagating it through the loss
    target = rewards + (self.discount * Qns)  # Double-Q target: Y = r + γ.Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget)

    loss = F.smooth_l1_loss(Qs, target)  # Huber loss on TD-error δ: δ = Y - Q(s_t, a_t)
    # TODO: TD-error clipping?
    self.policy_net.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm)  # Clamp gradients
    self.optimiser.step()
def optimize_model(policy_net, optimizer):
    # first sample a batch
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # non_final_mask marks every item whose next_state is not None as True
    non_final_mask = tuple(map(lambda s: s is not None, batch.next_state))
    non_final_mask = torch.tensor(non_final_mask, device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # policy_net(state_batch) returns the values of all actions;
    # gather picks out the value of the action that was actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # compute V(s_{t+1}); for terminal states s_{t+1} we keep V(s_{t+1}) = 0
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def optimize(self, batch_size, global_step=None):
    if len(self.memory) < batch_size:
        return None
    self.memory.batch_size = batch_size
    for transitions_batch in self.memory:
        # Transform the list of tuples into a tuple of lists.
        # Explanation here: https://stackoverflow.com/a/19343/3343043
        batch = Transition(*zip(*transitions_batch))

        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.model(state_batch).gather(
            1, action_batch.reshape(batch_size, 1))

        # Compute the expected Q values
        with torch.no_grad():
            next_state_values = self.model(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.writer.add_scalar('training/loss', loss.item(), global_step)
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=2)
        self.optimizer.step()
def update(self):
    if len(self.memory) < self.BATCH_SIZE:
        print("[Warning] Replay memory holds fewer samples than the batch size!")
        return

    # get training batch
    transitions = self.memory.sample(self.BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    final_mask = torch.cat(batch.done)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = self.policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(self.BATCH_SIZE, 1, device=device)
    next_state_values[final_mask.bitwise_not()] = self.target_net(
        non_final_next_states).max(1, True)[0].detach()
    expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()

    self.update_count += 1
    if self.update_count % self.TARGET_UPDATE == 0:
        self.update_target_net()
def optimize_model(policy_net, target_net, optimizer, memory):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # compute a mask of non-final states
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # the model computes Q(s_t); use it to compute Q(s_t, a)
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}): all final states have 0 value
    # double Q-learning: the policy net picks the action, the target net evaluates it
    policy_best_actions = policy_net(non_final_next_states).argmax(dim=1)
    i = torch.arange(len(policy_best_actions))
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states)[i, policy_best_actions].detach()

    # expected Q values
    expected_state_action_values = reward_batch + (next_state_values * GAMMA)

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(dim=1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def optimize(self):
    transitions = self.memory.sample(500)
    normalized_transitions = Transition(*zip(*transitions))
def optimize_model(
    dqn_net,
    target_net,
    memory,
    learning_rate,
    batch_size,
    size_board,
    gamma,
    optimizer,
    device,
):
    # Sample batch
    tree_indexes, memory_batch, batch_ISWeights = memory.sample(batch_size)
    samples = Transition(*zip(*memory_batch))
    states_batch = samples.state
    actions_batch = samples.action
    rewards_batch = samples.reward
    next_states_batch = samples.next_state
    dones_batch = samples.done
    target_qs_batch = []

    torch_next_states_batch = (torch.from_numpy(
        np.asarray(next_states_batch)).float().to(device))

    # Get Q values for the next state
    q_next_state = dqn_net(torch_next_states_batch, batch_size, size_board)
    # TODO: remove the detach later and test
    q_target_next_state = (target_net(torch_next_states_batch, batch_size,
                                      size_board).cpu().detach())

    for i in range(0, len(memory_batch)):
        terminal = dones_batch[i]
        # Get the index of the max action value
        action = np.argmax(q_next_state[i].cpu().detach().numpy())
        # If we are in a terminal state, the target is just the reward
        if terminal:
            target_qs_batch.append(rewards_batch[i])
        else:
            target = rewards_batch[i] + gamma * q_target_next_state[i][action]
            target_qs_batch.append(target)

    targets_batch = np.array([each for each in target_qs_batch])

    torch_states_batch = torch.from_numpy(np.asarray(states_batch)).float().to(device)
    output = dqn_net(torch_states_batch, batch_size, size_board)

    torch_actions_batch = torch.from_numpy(np.asarray(actions_batch))
    torch_actions_batch = torch_actions_batch.unsqueeze(0)
    torch_actions_batch = torch_actions_batch.view(batch_size, 1)

    # Q is our predicted Q value
    q_values = output.gather(1, torch_actions_batch.to(device))
    q_values = q_values.float()

    # Absolute error, used to update the sum tree
    absolute_errors = (
        torch.abs(q_values - torch.from_numpy(targets_batch).view(
            batch_size, 1).float().to(device)).cpu().detach().numpy())

    torch_batch_ISWeights = torch.from_numpy(batch_ISWeights).to(device)

    # Importance-sampling-weighted mean squared error
    diff_target = q_values - torch.from_numpy(targets_batch).view(
        batch_size, 1).float().to(device)
    squared_diff = diff_target**2
    weighted_squared_diff = squared_diff * torch_batch_ISWeights

    # Loss
    loss = torch.mean(weighted_squared_diff)

    # Optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Squeeze absolute errors
    absolute_errors = np.squeeze(absolute_errors, 1)

    # Memory tree update
    memory.batch_update(tree_indexes, absolute_errors)

    return loss.cpu().detach().numpy()
def optimize_model(self):
    if (len(self.memory) < self.batch_size
            or self.train_step % self.update_every != 0):
        return
    transitions = self.memory.sample(self.batch_size)
    # transpose the batch so that we get a Transition of batch arrays
    batch = Transition(*zip(*transitions))

    # mask for non-final states: these are states where we will have
    # another move to make after the current move
    non_final_mask = torch.logical_not(
        torch.tensor(batch.done, dtype=torch.bool, device=self.device))

    # get the next states that are not final
    non_final_next_state_batch = torch.tensor(
        [s['obs'] for s in batch.next_state],
        dtype=torch.float,
        device=self.device)[non_final_mask]

    # get the legal actions for these non-final next states
    non_final_next_state_legal_actions = torch.stack([
        torch.tensor(self.pad_actions(s['legal_actions']),
                     dtype=torch.long,
                     device=self.device) for s in batch.next_state
    ], dim=0)[non_final_mask]

    # get the state, action, and reward batches
    state_batch = torch.tensor([s['obs'] for s in batch.state],
                               dtype=torch.float,
                               device=self.device)
    action_batch = torch.tensor(batch.action,
                                dtype=torch.long,
                                device=self.device).view(-1, 1)
    reward_batch = torch.tensor(batch.reward,
                                dtype=torch.float,
                                device=self.device)

    # compute Q(s_t, a)
    state_action_values = self.policy_net(state_batch).gather(1, action_batch)

    # compute max_a Q(s_{t+1}, a)
    # if s_{t+1} is a final state, then Q(s_{t+1}, a) is 0
    next_state_values = torch.zeros(self.batch_size, device=self.device)
    if non_final_next_state_batch.size()[0] != 0:
        # get predicted rewards for all non-final next states
        next_state_all_values = self.target_net(non_final_next_state_batch).detach()
        # only select rewards for valid actions
        next_state_valid_values = next_state_all_values.gather(
            1, non_final_next_state_legal_actions)
        # get the max reward for a valid action
        next_state_values[non_final_mask] = next_state_valid_values.max(1)[0]

    expected_state_action_values = (next_state_values * self.gamma) + reward_batch

    # minimize Q(s_t, a) - (reward + gamma * max_a Q(s_{t+1}, a)):
    # the predicted total reward for choosing action a at state s_t
    # should equal the actual reward for that action plus the
    # predicted total reward for choosing the best action at state s_{t+1}
    loss = self.criterion(state_action_values,
                          expected_state_action_values.unsqueeze(1))
    self.loss_sum += loss.item()

    self.optimizer.zero_grad()
    loss.backward()
    self.weight_updates += 1
    if self.clip_grads:
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
    self.optimizer.step()

    if self.weight_updates % self.target_update == 0:
        self.target_net.load_state_dict(self.policy_net.state_dict())
def update(self, state, action, reward, state_next, done):
    record = Transition(state, action, reward, state_next, done)
    self._memory.add(record)
    if len(self._memory) >= self._batch_size:
        self.train_step()
            mask = np.array([not done])
            nextState = stateToTensor(nextState, desiredGoal)
            nextStateNumpy = nextState.cpu().numpy()
            reward = calcReward(state[0, -3:], desiredGoal, orginalDistance)
            episodeReward = reward
            shortMemory.push(state, action, mask, nextStateNumpy, reward)
            if done:
                break
            else:
                state = nextState.to(device)
            if len(memory) > batchSize:
                for _ in range(updatesPerStep):
                    transition = memory.sample(batchSize)
                    batch = Transition(*zip(*transition))
                    valueLoss = agent.updateParameters(batch, device)
                    valueLossEp += valueLoss
        memory.append(shortMemory)
        rewards.append(episodeReward)
        if episode % checkEvery == 0:
            testRewards = []
            for _ in range(numberOfTests):
                state = env.reset()
                startingPositionPuck = state["achieved_goal"]
                orginalDistance = np.linalg.norm(startingPositionPuck - desiredGoal)
                while True:
                    state = stateToTensor(state, desiredGoal).to(device=device)
def compute_bellman_residual(self, batch, target_state_action_value=None):
    # Concatenate the batch elements, casting them to tensors if needed
    if not isinstance(batch.current_state, torch.Tensor):
        # logger.info("Casting the batch to torch.tensor")
        current_state = torch.cat(
            tuple(torch.tensor([batch.current_state], dtype=torch.float))).to(self.device)
        current_future_pos = torch.cat(
            tuple(torch.tensor([batch.current_future_pos], dtype=torch.float))).to(self.device)
        current_past_pos = torch.cat(
            tuple(torch.tensor([batch.current_past_pos], dtype=torch.float))).to(self.device)
        action = torch.tensor(batch.action, dtype=torch.long).to(self.device)
        reward = torch.tensor(batch.reward, dtype=torch.float).to(self.device)
        next_state = torch.cat(
            tuple(torch.tensor([batch.next_state], dtype=torch.float))).to(self.device)
        next_future_pos = torch.cat(
            tuple(torch.tensor([batch.next_future_pos], dtype=torch.float))).to(self.device)
        next_past_pos = torch.cat(
            tuple(torch.tensor([batch.next_past_pos], dtype=torch.float))).to(self.device)
        terminal = torch.tensor(batch.terminal, dtype=torch.bool).to(self.device)
        batch = Transition(current_state, current_future_pos, current_past_pos,
                           action, reward,
                           next_state, next_future_pos, next_past_pos,
                           terminal, batch.info)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    current_state_action_values, current_trajectory = self.value_net(
        batch.current_state, batch.current_past_pos, batch.action)
    state_action_values = current_state_action_values.gather(
        1, batch.action.unsqueeze(1)).squeeze(1)

    if target_state_action_value is None:
        with torch.no_grad():
            # Compute V(s_{t+1}) for all next states.
            next_state_values = torch.zeros(batch.reward.shape).to(self.device)
            if self.config["double"]:
                # Double Q-learning: pick the best actions from the policy network
                next_state_action_values, next_trajectory = self.value_net(
                    batch.next_state, batch.next_past_pos)
                _, best_actions = next_state_action_values.max(1)
                # Double Q-learning: estimate the action values from the target network
                next_target_state_action_values, next_target_trajectory = self.target_net(
                    batch.next_state, batch.next_past_pos)
                best_values = next_target_state_action_values.gather(
                    1, best_actions.unsqueeze(1)).squeeze(1)
            else:
                next_state_action_values, next_trajectory = self.target_net(
                    batch.next_state, batch.next_past_pos)
                best_values, _ = next_state_action_values.max(1)
            next_state_values[~batch.terminal] = best_values[~batch.terminal]
            # Compute the expected Q values
            target_state_action_value = batch.reward + self.config["gamma"] * next_state_values

    # Compute the loss
    rl_loss = self.rl_lossFunction(state_action_values, target_state_action_value)
    predict_loss = self.predict_lossfunction(current_trajectory, batch.current_future_pos)
    self.writer.add_scalar('step/rl_loss', rl_loss, self.step)
    self.writer.add_scalar('step/predict_loss', predict_loss, self.step)
    return rl_loss + predict_loss, target_state_action_value, batch
def sample_minibatch(self):
    if len(self.memory) < self.config["batch_size"]:
        return None
    transitions = self.memory.sample(self.config["batch_size"])
    return Transition(*zip(*transitions))
            duration.append(lifespan[i][-1])
            lifespan[i].append(0)
        if lifespan[i][-1] > 0:  # discard the transition when the episode hit the 500-step cap
            memory.push(s[i], a[i], r[i], s_gotten[i], done[i])

    if frame_count > initial_exploration:
        eps -= 0.00005
        eps = max(eps, 0.1)
        batch = memory.sample(batch_size)
        s = torch.FloatTensor([*batch.s]).to(device)
        a = torch.LongTensor([*batch.a]).unsqueeze(-1).to(device)
        r = torch.FloatTensor([*batch.r]).unsqueeze(-1).to(device)
        ns = torch.FloatTensor([*batch.ns]).to(device)
        nt = torch.BoolTensor(np.array(batch.nt).tolist()).unsqueeze(-1).to(device)
        agent.train_model(Transition(s, a, r, ns, nt), solver, gamma, F.mse_loss)
        if frame_count % update_target == 0:
            agent.update()

    if len(duration) > 100:
        score = np.array(duration)[-100:].mean()
        print('score:', score)
        if score > 498:
            break

    frame_count += 1

envs.close()

env = gym.make(env_name)
s = env.reset()
while True:
    preprocessed_s = torch.FloatTensor(s).unsqueeze(0).to(device)
    a = agent.response(preprocessed_s)
def train_dqn(settings):
    required_settings = [
        "batch_size",
        "checkpoint_frequency",
        "device",
        "eps_start",
        "eps_end",
        "eps_cliff",
        "eps_decay",
        "gamma",
        "log_freq",
        "logs_dir",
        "lr",
        "max_steps",
        "memory_size",
        "model_name",
        "num_episodes",
        "out_dir",
        "target_net_update_freq",
    ]
    if not settings_is_valid(settings, required_settings):
        raise Exception(
            f"Settings object {settings} missing some required settings.")

    batch_size = settings["batch_size"]
    checkpoint_frequency = settings["checkpoint_frequency"]
    device = settings["device"]
    eps_start = settings["eps_start"]
    eps_end = settings["eps_end"]
    eps_cliff = settings["eps_cliff"]
    # eps_decay = settings["eps_decay"]
    gamma = settings["gamma"]
    logs_dir = settings["logs_dir"]
    log_freq = settings["log_freq"]
    lr = settings["lr"]
    max_steps = settings["max_steps"]
    memory_size = settings["memory_size"]
    model_name = settings["model_name"]
    num_episodes = settings["num_episodes"]
    out_dir = settings["out_dir"]
    target_net_update_freq = settings["target_net_update_freq"]

    # Initialize environment
    env = gym.make("StarGunner-v0")

    # Initialize model
    num_actions = env.action_space.n
    settings["num_actions"] = num_actions
    policy_net = DQN(settings).to(device)
    target_net = DQN(settings).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Initialize memory
    logging.info("Initializing memory.")
    memory = ReplayMemory(memory_size)
    memory.init_with_random((1, 3, 84, 84), num_actions)
    logging.info("Finished initializing memory.")

    # Initialize other model ingredients
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # Initialize tensorboard
    writer = SummaryWriter(logs_dir)

    # Loop over episodes
    policy_net.train()
    steps_done = 0
    log_reward_acc = 0.0
    log_steps_acc = 0
    for episode in tqdm(range(num_episodes)):
        state = process_state(env.reset()).to(device)
        reward_acc = 0.0
        loss_acc = 0.0

        # Loop over steps in episode
        for t in range(max_steps):
            with torch.no_grad():
                Q = policy_net.forward(state.type(torch.float))

            # Get best predicted action and perform it
            if steps_done < eps_cliff:
                epsilon = -(eps_start - eps_end) / eps_cliff * steps_done + eps_start
            else:
                epsilon = eps_end
            if random.random() < epsilon:
                predicted_action = torch.tensor([env.action_space.sample()]).to(device)
            else:
                predicted_action = torch.argmax(Q, dim=1)
            next_state, raw_reward, done, info = env.step(predicted_action.item())

            # Note that next state could also be a difference
            next_state = process_state(next_state)
            reward = torch.tensor([clamp_reward(raw_reward)])

            # Save to memory
            memory.push(state.to("cpu"), predicted_action.to("cpu"), next_state, reward)

            # Move to next state
            state = next_state.to(device)

            # Sample from memory
            batch = Transition(*zip(*memory.sample(batch_size)))

            # Mask terminal states (adapted from
            # https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)
            final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device,
                dtype=torch.bool,
            )
            state_batch = torch.cat(batch.state).type(torch.float).to(device)
            next_state_batch = torch.cat(batch.next_state).type(torch.float).to(device)
            action_batch = torch.cat(batch.action).to(device)
            reward_batch = torch.cat(batch.reward).to(device)

            # Compute Q(s_t, a) for the actions that were taken
            Q_actual = policy_net(state_batch).gather(
                1, action_batch.view(action_batch.shape[0], 1))

            # Compute max_a Q(s_{t+1}, a) from the target network
            Q_next_pred = target_net(next_state_batch)
            Q_max = torch.max(Q_next_pred, dim=1)[0].detach()
            target = reward_batch + gamma * Q_max * final_mask.to(Q_max.dtype)

            # Calculate loss
            loss = F.smooth_l1_loss(Q_actual, target.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Clamp gradient to avoid gradient explosion
            for param in policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()

            # Store stats
            loss_acc += loss.item()
            reward_acc += raw_reward
            steps_done += 1

            if steps_done % target_net_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

            # Exit if in terminal state
            if done:
                logging.debug(
                    f"Episode {episode} finished after {t} timesteps with reward {reward_acc}."
                )
                break

        logging.debug(f"Loss: {loss_acc / t}")

        # Save model checkpoint
        if (episode != 0) and (episode % checkpoint_frequency == 0):
            save_model_checkpoint(
                policy_net,
                optimizer,
                episode,
                loss,
                f"{out_dir}/checkpoints/{model_name}_{episode}",
            )

        # Log to tensorboard
        log_reward_acc += reward_acc
        log_steps_acc += t
        writer.add_scalar("Loss / Timestep", loss_acc / t, episode)
        if episode % log_freq == 0:
            writer.add_scalar("Reward", log_reward_acc / log_freq, episode)
            writer.add_scalar("Reward / Timestep", log_reward_acc / log_steps_acc, episode)
            writer.add_scalar("Duration", log_steps_acc / log_freq, episode)
            writer.add_scalar("Steps", log_reward_acc / log_steps_acc, steps_done)
            log_reward_acc = 0.0
            log_steps_acc = 0

    # Save model
    save_model(policy_net, f"{out_dir}/{model_name}.model")

    # Report final stats
    logging.info(f"Steps Done: {steps_done}")
    env.close()
    return policy_net