    def compute_trajectory_actions_prob(self, trajectories: List[Trajectory], return_info=False) -> torch.Tensor:
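        """Compute the log-probability of each trajectory's action sequence under the current policy.

        Returns a (batch_size,) tensor of per-trajectory log-probabilities; if `return_info` is True,
        also returns a dict of intermediate tensors (logits, log-probs, masks, and the context encoding).
        """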
        contexts = [traj.context for traj in trajectories]
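        # encode the trajectory contexts once; the encoding conditions the initial state and every decoder step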
        context_encoding = self.encoder.encode(contexts)
        state_tm1 = init_state = self.decoder.get_initial_state(context_encoding)

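        # batch the per-trajectory observations and target actions, padded to the longest action sequence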
        batched_observation_seq, tgt_actions_info = Trajectory.to_batched_sequence_tensors(trajectories,
                                                                                           self.memory_size)

        # move the batched observation tensors onto the model's device
        batched_observation_seq.to(self.device)

        # tgt_action_id (batch_size, max_action_len)
        # tgt_action_mask (batch_size, max_action_len)
        tgt_action_id, tgt_action_mask = tgt_actions_info['tgt_action_ids'], tgt_actions_info['tgt_action_mask']
        tgt_action_id = tgt_action_id.to(self.device)
        tgt_action_mask = tgt_action_mask.to(self.device)

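        # unroll the decoder over the padded action sequence, collecting action logits at each step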
        max_time_step = batched_observation_seq.read_ind.size(1)
        action_logits = []
        for t in range(max_time_step):
            obs_slice_t = batched_observation_seq.slice(t)

            # mem_logits: (batch_size, memory_size)
            mem_logits, state_t = self.decoder.step(obs_slice_t, state_tm1, context_encoding)

            action_logits.append(mem_logits)
            state_tm1 = state_t

        # stack per-step logits to (max_action_len, batch_size, memory_size),
        # then permute to (batch_size, max_action_len, memory_size)
        action_logits = torch.stack(action_logits, dim=0).permute(1, 0, 2)

        # (batch_size, max_action_len, memory_size)
        action_log_probs = nn_util.masked_log_softmax(action_logits, batched_observation_seq.valid_action_mask)

        # (batch_size, max_action_len)
        tgt_action_log_probs = torch.gather(action_log_probs, dim=-1, index=tgt_action_id.unsqueeze(-1)).squeeze(
            -1) * tgt_action_mask

        # (batch_size)
        traj_log_prob = tgt_action_log_probs.sum(dim=-1)

        if return_info:
            info = dict(
                action_log_probs=action_log_probs,
                tgt_action_id=tgt_action_id,
                tgt_action_mask=tgt_action_mask,
                action_logits=action_logits,
                valid_action_mask=batched_observation_seq.valid_action_mask,
                context_encoding=context_encoding
            )

            return traj_log_prob, info

        return traj_log_prob
    def step_and_get_action_scores_t(self, observations_t, state_tm1,
                                     context_encoding):
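        """Run a single decoder step and return log-probabilities over memory entries, with invalid actions masked out."""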
        mem_logits, state_t = self.step(observations_t,
                                        state_tm1,
                                        context_encoding=context_encoding)

        # (batch_size, mem_size)
        action_score_t = nn_util.masked_log_softmax(
            mem_logits, mask=observations_t.valid_action_mask)

        return action_score_t, state_t

    def sample_action(self, logits, valid_action_mask, return_log_prob=False):
        """Sample one action per batch element from the masked categorical distribution over `logits`.

        logits: (batch_size, action_num)
        valid_action_mask: (batch_size, action_num)
        """

        # mask out invalid actions before normalizing; masked_fill (non in-place)
        # avoids mutating the caller's logits tensor
        logits = logits.masked_fill(~valid_action_mask.bool(), -math.inf)
        p_actions = F.softmax(logits, dim=-1)
        # (batch_size, 1)
        sampled_actions = torch.multinomial(p_actions, num_samples=1)

        if return_log_prob:
            log_p_actions = nn_util.masked_log_softmax(logits, mask=valid_action_mask)
            log_prob = torch.gather(log_p_actions, dim=1, index=sampled_actions).squeeze(-1)

            return sampled_actions.squeeze(-1), log_prob

        return sampled_actions.squeeze(-1)
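
A minimal sketch of how `sample_action` can be driven from a decoding loop; `agent`, `obs_t`, `state_tm1`, and `context_encoding` are assumed names used only for illustration, not part of the original source:

    # one decoding step (illustrative sketch): compute masked logits, then sample a valid action
    mem_logits, state_t = agent.decoder.step(obs_t, state_tm1, context_encoding)
    action_t, log_prob_t = agent.sample_action(mem_logits, obs_t.valid_action_mask,
                                               return_log_prob=True)
    state_tm1 = state_t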