Example no. 1
  def prepare_dqn_transitions(self, hps, decoder_states, greedy_samples, vsize_extended):
    """Prepare the experiences for this batch
    Args:
      hps: model hyperparameters
      decoder_states: decoder output states (max_dec_steps, batch_size, hidden_dim)
      greedy_samples: tokens selected through greedy sampling; a list of length batch_size,
        each element containing max_dec_steps tokens.
      vsize_extended: size of the extended vocabulary (length of a per-step q-value vector)

    Returns:
      transitions:
        List of experiences collected for this batch (batch_size, k, max_dec_steps)
    """
    # all variables must have the shape (batch_size, k, <=max_dec_steps, feature_len)
    decoder_states = np.transpose(np.stack(decoder_states),[1,0,2]) # now of shape (batch_size, <=max_dec_steps, hidden_dim)
    greedy_samples = np.stack(greedy_samples) # now of shape (batch_size, <=max_dec_steps)

    dec_length = decoder_states.shape[1]
    hidden_dim = decoder_states.shape[-1]

    # modifying decoder state tensor to shape (batch_size, k, <=max_dec_steps, hidden_dim)
    _decoder_states = np.expand_dims(decoder_states, 1)
    _decoder_states = np.concatenate([_decoder_states] * hps.k, axis=1) # shape (batch_size, k, <=max_dec_steps, hidden_dim)
    # TODO: enable if we want to use time as a categorical feature
    #features = np.concatenate([self.times, _decoder_states], axis=-1) # shape (batch_size, k, <=max_dec_steps, hidden_dim + <=max_dec_steps)
    features = _decoder_states # shape (batch_size, k, <=max_dec_steps, hidden_dim)

    ### TODO: do it in parallel???
    transitions = [] # (h_t, w_t, h_{t+1}, r_t, q_t, done)
    for i in range(self._hps.batch_size):
      for k in range(self._hps.k):
        for t in range(self._hps.max_dec_steps):
          action = greedy_samples[i,k,t]
          done = (t==(self._hps.max_dec_steps-1) or action==3) # 3 is the id for [STOP] in our vocabulary, used to stop decoding
          if done:
            state = features[i,k,t]
            state_prime = np.zeros((features.shape[-1]))
            action_prime = 3 # 3 is the id for [STOP] in our vocabulary, used to stop decoding
            if self._hps.calculate_true_q:
              # We use the true q_values that we calculated to train DQN network.
              transitions.append(Transition(state, action, state_prime, action_prime, self.r_values[i,k,t,action], self.q_values[i,k,t], True))
            else:
              # We update the q_values later, after collecting the q_estimates from DQN network.
              transitions.append(Transition(state, action, state_prime, action_prime, self.r_values[i,k,t], np.zeros((vsize_extended)), True))
          else:
            state = features[i,k,t]
            state_prime = features[i,k,t+1]
            action_prime = greedy_samples[i,k,t+1]
            if self._hps.calculate_true_q:
              # We use the true q_values that we calculated to train DQN network.
              transitions.append(Transition(state, action, state_prime, action_prime,self.r_values[i,k,t,action], self.q_values[i,k,t], False))
            else:
              # We update the q_values later, after collecting the q_estimates from DQN network.
              transitions.append(Transition(state, action, state_prime, action_prime,self.r_values[i,k,t], np.zeros((vsize_extended)), False))

    return transitions
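
Note: the snippets in this listing all rely on a Transition container defined elsewhere. For Example no. 1 it takes seven positional fields (state, action, next state, next action, reward, q-values, done). A minimal sketch of such a definition, with field names assumed from the call sites above rather than taken from the original source:

from collections import namedtuple

# Field names are assumptions inferred from the positional arguments used in
# prepare_dqn_transitions; the original definition may differ.
Transition = namedtuple(
    'Transition',
    ('state', 'action', 'state_prime', 'action_prime', 'reward', 'q_values', 'done'))
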
Example no. 2
    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): batch of `Transition`
        gamma (float): Discount rate of Q_target
        """

        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)

        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of the actions taken. These are the actions which would've been
        # taken for each batch state according to network q_local (current estimate)
        Q_expected = self.q_local(states).gather(1, actions)

        Q_targets_next = self.q_target(next_states).detach().max(1)[0]

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        #self.q_local.train(mode=True)
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
        loss.backward()
        self.optim.step()
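
Both learn variants in this listing draw minibatches from self.replay_memory, whose implementation is not shown. A minimal sketch of a compatible buffer, assuming only the memory attribute and sample method that the code above actually uses:

import random
from collections import deque

class ReplayMemory:
    """Fixed-size FIFO store of Transition tuples (illustrative capacity)."""

    def __init__(self, capacity=10000):
        # Exposed as .memory so `len(self.replay_memory.memory)` works as above
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        # Append one Transition, evicting the oldest when full
        self.memory.append(transition)

    def sample(self, batch_size):
        # Uniformly sample a minibatch of Transitions
        return random.sample(self.memory, batch_size)
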
Example no. 3
def optimize_model(losses):
    global n_step
    if len(memory) < learning_param.BATCH_SIZE:
        return
    # sample a batch of transitions
    transitions = memory.sample(learning_param.BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Concatenate the batch elements
    state_batch = torch.cat([s.unsqueeze(0) for s in batch.state])
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    #print(state_batch.shape)

    # predicted Q-value for each state and its chosen action; using gather keeps
    # the computation graph intact so gradients can flow back into policy_net
    # (rebuilding the values with torch.tensor(...) would detach them)
    predicted_values = policy_net(state_batch)
    state_action_values = predicted_values.gather(
        1, action_batch.view(-1, 1)).squeeze(1)

    # Compute V(s_{t+1}) for all next states based on the "older" target_net,
    # selecting their best value with max(1).values (note that final states are
    # not masked out in this snippet).
    next_state_values = torch.cat(
        [target_net(s.unsqueeze(0)).max(1).values for s in batch.next_state])
    # Compute the expected Q values
    expected_state_action_values = (next_state_values *
                                    learning_param.GAMMA) + reward_batch

    #print("expected :", next_state_values[0].item(), "new :", expected_state_action_values[0].item(), "différence :", next_state_values[0].item()-expected_state_action_values[0].item())

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values.unsqueeze(1),
                            expected_state_action_values.unsqueeze(1))
    if n_step % print_freq == 0:
        print("                                  Loss : ", loss.item())
    losses.append(loss.item())

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)  # clip gradients to stabilize training
    optimizer.step()
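
As the Stack Overflow answer linked above explains, Transition(*zip(*transitions)) transposes a list of per-step tuples into a single tuple of per-field batches. A small self-contained illustration with toy values (the four fields here are only for the demo):

from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

transitions = [
    Transition(state=(0.0,), action=1, next_state=(0.1,), reward=1.0),
    Transition(state=(0.1,), action=0, next_state=(0.2,), reward=0.0),
]

# zip(*transitions) regroups the samples field by field, so each attribute of
# `batch` is now a tuple holding that field for the whole minibatch.
batch = Transition(*zip(*transitions))
print(batch.action)  # (1, 0)
print(batch.reward)  # (1.0, 0.0)
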
Example no. 4
    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): Minibatch of `Transition`
        gamma (float): Discount rate of Q_target
        """

        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)

        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of the actions taken, according to q_local.
        # Double DQN: use the local model to choose the next action, and the
        # target model to evaluate that action

        Q_max_action = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.q_target(next_states).gather(1, Q_max_action).reshape(-1)

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.q_local(states).gather(1, actions)  ## current

        #self.q_local.train(mode=True)
        self.optim.zero_grad()

        #print('Q_expected.shape: ', Q_expected.shape)
        #print('Q_targets_next.shape: ', Q_targets_next.shape)
        #print('Q_targets.shape: ', Q_targets.shape)

        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))

        # backpropagation of loss to NN
        loss.backward()
        self.optim.step()
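
Example no. 4 relies on a separate q_target network (the Double DQN evaluator), but how it is kept in sync with q_local is not shown here. Two common schemes, sketched under the assumption that both networks share the same architecture, are a periodic hard copy and a soft (Polyak) update:

def hard_update(q_target, q_local):
    # Copy the online network's weights into the target network wholesale,
    # typically every fixed number of learning steps.
    q_target.load_state_dict(q_local.state_dict())

def soft_update(q_target, q_local, tau=1e-3):
    # Blend target weights toward the online weights after every learn() call:
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for t_param, l_param in zip(q_target.parameters(), q_local.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
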
Example no. 5
        # ==================================================
        # getting the tuple (s, a, r, s', done)
        # ==================================================

        action = param.act(obs)
        next_obs, reward, done, _ = env.step(action)
        # no need to keep track of max time-steps, because the environment
        # is wrapped with TimeLimit automatically (timeout after 1000 steps)

        total_reward += reward

        # ==================================================
        # storing it to the buffer
        # ==================================================

        buf.push(Transition(obs, action, reward, next_obs, done))

        # ==================================================
        # update the parameters
        # ==================================================

        if buf.ready_for(batch_size):
            param.update_networks(buf.sample(batch_size))
            total_updates += 1

        # ==================================================
        # check done
        # ==================================================

        if done: break
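
The loop body above assumes a buffer object buf with push, ready_for, and sample methods, none of which appear in the snippet. A minimal sketch of that interface (only the method names come from the usage above; the implementation is an assumption):

import random
from collections import deque

class Buffer:
    """Replay buffer matching the push / ready_for / sample calls above."""

    def __init__(self, capacity=100000):
        self._storage = deque(maxlen=capacity)

    def push(self, transition):
        # Store one (s, a, r, s', done) Transition
        self._storage.append(transition)

    def ready_for(self, batch_size):
        # True once enough experience has been collected to form a minibatch
        return len(self._storage) >= batch_size

    def sample(self, batch_size):
        # Uniform random minibatch of Transitions
        return random.sample(self._storage, batch_size)
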
Example no. 6
def train_on_minibatches():
    for i in range(args['replay_num_updates']):
        transitions = replay_buffer.sample(args['batch_size'])
        batch = Transition(*zip(*transitions))
        agent.update_parameters(batch)