def prepare_dqn_transitions(self, hps, decoder_states, greedy_samples, vsize_extended):
    """Prepare the experiences for this batch.

    Args:
      hps: model parameters
      decoder_states: decoder output states (max_dec_steps, batch_size, hidden_dim)
      greedy_samples: tokens selected through greedy selection, a list of size
        batch_size where each element contains max_dec_steps tokens.

    Returns:
      transitions: List of experiences collected for this batch
        (batch_size, k, max_dec_steps)
    """
    # All variables must have the shape (batch_size, k, <=max_dec_steps, feature_len).
    decoder_states = np.transpose(np.stack(decoder_states), [1, 0, 2])  # now of shape (batch_size, <=max_dec_steps, hidden_dim)
    greedy_samples = np.stack(greedy_samples)  # now of shape (batch_size, <=max_dec_steps)
    dec_length = decoder_states.shape[1]
    hidden_dim = decoder_states.shape[-1]

    # Tile the decoder state tensor to shape (batch_size, k, <=max_dec_steps, hidden_dim).
    _decoder_states = np.expand_dims(decoder_states, 1)
    _decoder_states = np.concatenate([_decoder_states] * hps.k, axis=1)  # shape (batch_size, k, <=max_dec_steps, hidden_dim)

    # TODO: to use time as a categorical feature:
    # features = np.concatenate([self.times, _decoder_states], axis=-1)  # shape (batch_size, k, <=max_dec_steps, hidden_dim + <=max_dec_steps)
    features = _decoder_states  # shape (batch_size, k, <=max_dec_steps, hidden_dim)

    # TODO: do this in parallel?
    transitions = []  # (h_t, w_t, h_{t+1}, r_t, q_t, done)
    for i in range(self._hps.batch_size):
        for k in range(self._hps.k):
            for t in range(self._hps.max_dec_steps):
                action = greedy_samples[i, k, t]
                done = (t == (self._hps.max_dec_steps - 1) or action == 3)  # 3 is the id for [STOP] in our vocabulary, used to stop decoding
                if done:
                    state = features[i, k, t]
                    state_prime = np.zeros((features.shape[-1]))
                    action_prime = 3  # 3 is the id for [STOP] in our vocabulary, used to stop decoding
                    if self._hps.calculate_true_q:
                        # Use the true q_values that we calculated to train the DQN network.
                        transitions.append(Transition(state, action, state_prime, action_prime,
                                                      self.r_values[i, k, t, action], self.q_values[i, k, t], True))
                    else:
                        # Update the q_values later, after collecting the q_estimates from the DQN network.
                        transitions.append(Transition(state, action, state_prime, action_prime,
                                                      self.r_values[i, k, t], np.zeros((vsize_extended)), True))
                else:
                    state = features[i, k, t]
                    state_prime = features[i, k, t + 1]
                    action_prime = greedy_samples[i, k, t + 1]
                    if self._hps.calculate_true_q:
                        # Use the true q_values that we calculated to train the DQN network.
                        transitions.append(Transition(state, action, state_prime, action_prime,
                                                      self.r_values[i, k, t, action], self.q_values[i, k, t], False))
                    else:
                        # Update the q_values later, after collecting the q_estimates from the DQN network.
                        transitions.append(Transition(state, action, state_prime, action_prime,
                                                      self.r_values[i, k, t], np.zeros((vsize_extended)), False))
    return transitions
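# The `Transition` constructor used above appears to take seven positional fields
# (state, action, next state, next action, reward, q_values, done). Its definition
# is not part of this excerpt; the following is a minimal sketch of what it might
# look like, not the original declaration.
from collections import namedtuple

Transition = namedtuple(
    'Transition',
    ['state', 'action', 'state_prime', 'action_prime', 'reward', 'q_values', 'done'])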
def learn(self, experiences, gamma):
    """Prepare a minibatch and train on it.

    Args:
      experiences (List[Transition]): batch of `Transition`
      gamma (float): Discount rate of Q_target
    """
    if len(self.replay_memory.memory) < BATCH_SIZE:
        return

    transitions = self.replay_memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)
    next_states = torch.cat(batch.next_state)
    dones = torch.cat(batch.done)

    # Compute Q(s_t, a): the model computes Q(s_t), then we select the columns
    # of the actions taken. These are the actions that would have been taken
    # for each batch state according to the q_local network (current estimate).
    Q_expected = self.q_local(states).gather(1, actions)
    Q_targets_next = self.q_target(next_states).detach().max(1)[0]
    Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

    # self.q_local.train(mode=True)
    self.optim.zero_grad()
    loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
    loss.backward()
    self.optim.step()
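# `self.replay_memory` above is assumed to expose a `.memory` container, a
# `.push(...)` method, and a `.sample(batch_size)` method. A minimal sketch of
# such a cyclic buffer (hypothetical, not the original implementation):
import random

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, transition):
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition  # overwrite the oldest entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)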
def optimize_model(losses):
    global n_step
    if len(memory) < learning_param.BATCH_SIZE:
        return

    # Sample a batch of transitions.
    transitions = memory.sample(learning_param.BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for a
    # detailed explanation). This converts a batch-array of Transitions
    # into a Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Concatenate the batch elements.
    state_batch = torch.cat([s.unsqueeze(0) for s in batch.state])
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Predicted Q-values for the current states; the Q-value of the chosen action
    # is selected with gather so that gradients flow back into policy_net
    # (rebuilding the values with torch.tensor would detach them from the graph).
    predicted_values = policy_net(state_batch)
    state_action_values = predicted_values.gather(1, action_batch.unsqueeze(1)).squeeze(1)

    # Compute V(s_{t+1}) for all next states based on the "older" target_net,
    # selecting the best value with max(1). The result is detached so that no
    # gradient flows into target_net. (Note: this snippet does not mask out
    # final states, whose value would normally be set to zero.)
    next_state_values = torch.cat(
        [target_net(s.unsqueeze(0)).max(1).values for s in batch.next_state]).detach()
    # Compute the expected Q values.
    expected_state_action_values = (next_state_values * learning_param.GAMMA) + reward_batch

    # Compute Huber loss.
    loss = F.smooth_l1_loss(state_action_values.unsqueeze(1),
                            expected_state_action_values.unsqueeze(1))
    if n_step % print_freq == 0:
        print("Loss:", loss.item())
        losses.append(loss.item())

    # Optimize the model, clamping gradients (not the weights) to [-1, 1].
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
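# optimize_model() relies on a separate `target_net` that is held fixed between
# updates, but the excerpt does not show how it is refreshed. A common companion
# step is a periodic hard copy of the policy weights; this is a sketch, and
# `i_episode` and `TARGET_UPDATE` are assumptions, not names from the original:
if i_episode % TARGET_UPDATE == 0:
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()  # the target network is only used for inference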
def learn(self, experiences, gamma):
    """Prepare a minibatch and train on it.

    Args:
      experiences (List[Transition]): Minibatch of `Transition`
      gamma (float): Discount rate of Q_target
    """
    if len(self.replay_memory.memory) < BATCH_SIZE:
        return

    transitions = self.replay_memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)
    next_states = torch.cat(batch.next_state)
    dones = torch.cat(batch.done)

    # Compute Q(s_t, a): the model computes Q(s_t), then we select the columns
    # of the actions taken. These are the actions that would have been taken
    # for each batch state according to policy_net.
    # Double DQN: use the local model to choose the action, and the target
    # model to evaluate that action.
    Q_max_action = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
    Q_targets_next = self.q_target(next_states).gather(1, Q_max_action).reshape(-1)

    # Compute the expected Q values.
    Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
    Q_expected = self.q_local(states).gather(1, actions)  # current estimate

    # self.q_local.train(mode=True)
    self.optim.zero_grad()
    loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
    # Backpropagate the loss through the network.
    loss.backward()
    self.optim.step()
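# The double-DQN `learn` above also needs the target network to track the local
# network over time, which is not shown here. One common choice is a soft
# (Polyak) update; this is a sketch, and `soft_update` and the mixing factor
# `tau` are assumptions, not part of the original code:
def soft_update(q_local, q_target, tau=1e-3):
    """Blend local parameters into the target: theta_target <- tau*theta_local + (1-tau)*theta_target."""
    for target_param, local_param in zip(q_target.parameters(), q_local.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)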
# ==================================================
# Get the tuple (s, a, r, s', done)
# ==================================================
action = param.act(obs)
next_obs, reward, done, _ = env.step(action)
# No need to keep track of max time-steps, because the environment
# is wrapped with TimeLimit automatically (timeout after 1000 steps).
total_reward += reward

# ==================================================
# Store it in the buffer
# ==================================================
buf.push(Transition(obs, action, reward, next_obs, done))

# ==================================================
# Update the parameters
# ==================================================
if buf.ready_for(batch_size):
    param.update_networks(buf.sample(batch_size))
    total_updates += 1

# ==================================================
# Check done
# ==================================================
if done:
    break
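# `param.act(obs)` in the loop body above is not defined in this excerpt. A
# minimal epsilon-greedy action selector of the kind typically used here follows
# as a sketch; `q_net`, `epsilon`, and `num_actions` are assumptions, not names
# taken from the original code:
import random
import torch

def act(q_net, obs, epsilon, num_actions):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if random.random() < epsilon:
        return random.randrange(num_actions)
    with torch.no_grad():
        q_values = q_net(torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0))
    return int(q_values.argmax(dim=1).item())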
def train_on_minibatches():
    for i in range(args['replay_num_updates']):
        transitions = replay_buffer.sample(args['batch_size'])
        batch = Transition(*zip(*transitions))
        agent.update_parameters(batch)
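# `args`, `replay_buffer`, and `agent` in train_on_minibatches come from the
# surrounding script. As a sketch, the hyperparameter dict it reads might look
# like this (the values are illustrative assumptions, not taken from the original):
args = {
    'replay_num_updates': 4,   # gradient updates per call
    'batch_size': 64,          # transitions sampled per update
}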