def compute_trajectory_actions_prob(self, trajectories: List[Trajectory], return_info=False) -> torch.Tensor:
    """Compute the log-probability of the action sequence of each trajectory in the batch."""
    contexts = [traj.context for traj in trajectories]
    context_encoding = self.encoder.encode(contexts)
    state_tm1 = init_state = self.decoder.get_initial_state(context_encoding)

    batched_observation_seq, tgt_actions_info = Trajectory.to_batched_sequence_tensors(
        trajectories, self.memory_size)

    # move the batched observation tensors to the model's device
    batched_observation_seq.to(self.device)

    # tgt_action_ids: (batch_size, max_action_len)
    # tgt_action_mask: (batch_size, max_action_len)
    tgt_action_id, tgt_action_mask = tgt_actions_info['tgt_action_ids'], tgt_actions_info['tgt_action_mask']
    tgt_action_id = tgt_action_id.to(self.device)
    tgt_action_mask = tgt_action_mask.to(self.device)

    # unroll the decoder over the observation sequence, collecting per-step action logits
    max_time_step = batched_observation_seq.read_ind.size(1)
    action_logits = []
    for t in range(max_time_step):
        obs_slice_t = batched_observation_seq.slice(t)

        # mem_logits: (batch_size, memory_size)
        mem_logits, state_t = self.decoder.step(obs_slice_t, state_tm1, context_encoding)

        action_logits.append(mem_logits)
        state_tm1 = state_t

    # (batch_size, max_action_len, memory_size)
    action_logits = torch.stack(action_logits, dim=0).permute(1, 0, 2)

    # (batch_size, max_action_len, memory_size)
    action_log_probs = nn_util.masked_log_softmax(action_logits, batched_observation_seq.valid_action_mask)

    # gather the log-prob of each target action and zero out padded time steps
    # (batch_size, max_action_len)
    tgt_action_log_probs = torch.gather(
        action_log_probs, dim=-1, index=tgt_action_id.unsqueeze(-1)).squeeze(-1) * tgt_action_mask

    # (batch_size,)
    traj_log_prob = tgt_action_log_probs.sum(dim=-1)

    if return_info:
        info = dict(
            action_log_probs=action_log_probs,
            tgt_action_id=tgt_action_id,
            tgt_action_mask=tgt_action_mask,
            action_logits=action_logits,
            valid_action_mask=batched_observation_seq.valid_action_mask,
            context_encoding=context_encoding
        )

        return traj_log_prob, info

    return traj_log_prob
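# --- Illustrative sketch (standalone, not part of the agent above): how a batch of
# trajectory log-probs is assembled from per-step action logits and a validity mask.
# Tensor names (`logits`, `valid_mask`, `tgt_ids`, `tgt_mask`) are hypothetical, and the
# inlined masked log-softmax is only an assumption about what `nn_util.masked_log_softmax`
# does, not the project's actual helper.
import torch
import torch.nn.functional as F

def trajectory_log_prob_sketch(logits, valid_mask, tgt_ids, tgt_mask):
    """
    logits:     (batch_size, max_action_len, memory_size) raw action scores
    valid_mask: (batch_size, max_action_len, memory_size) 1 for valid memory slots
    tgt_ids:    (batch_size, max_action_len) gold action indices (long)
    tgt_mask:   (batch_size, max_action_len) 1 for real (non-padding) time steps
    """
    # invalid memory slots get -inf, hence zero probability after the softmax
    masked_logits = logits.masked_fill(~valid_mask.bool(), float('-inf'))
    log_probs = F.log_softmax(masked_logits, dim=-1)

    # log-prob of each gold action: (batch_size, max_action_len)
    tgt_log_probs = log_probs.gather(-1, tgt_ids.unsqueeze(-1)).squeeze(-1)

    # zero out padded time steps (torch.where avoids -inf * 0 = nan), then sum over time
    tgt_log_probs = torch.where(tgt_mask.bool(), tgt_log_probs, torch.zeros_like(tgt_log_probs))
    return tgt_log_probs.sum(dim=-1)  # (batch_size,)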
def step_and_get_action_scores_t(self, observations_t, state_tm1, context_encoding):
    mem_logits, state_t = self.step(observations_t, state_tm1, context_encoding=context_encoding)

    # (batch_size, mem_size)
    action_score_t = nn_util.masked_log_softmax(mem_logits, mask=observations_t.valid_action_mask)

    return action_score_t, state_t
def sample_action(self, logits, valid_action_mask, return_log_prob=False):
    """
    Sample one action per example from the masked action distribution.

    logits: (batch_size, action_num)
    valid_action_mask: (batch_size, action_num)
    """
    # mask out invalid actions in-place so they receive zero probability mass;
    # ~mask.bool() works for both float and bool masks (1 - bool_tensor errors in newer PyTorch)
    logits.masked_fill_(~valid_action_mask.bool(), -math.inf)
    p_actions = F.softmax(logits, dim=-1)

    # (batch_size, 1)
    sampled_actions = torch.multinomial(p_actions, num_samples=1)

    if return_log_prob:
        log_p_actions = nn_util.masked_log_softmax(logits, mask=valid_action_mask)
        # (batch_size,)
        log_prob = torch.gather(log_p_actions, dim=1, index=sampled_actions).squeeze(-1)

        return sampled_actions.squeeze(-1), log_prob

    return sampled_actions.squeeze(-1)
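# --- Illustrative sketch (standalone, not part of the agent above): the masked-sampling
# pattern used in `sample_action`, written with plain torch ops. The function and argument
# names here are hypothetical.
import math
import torch
import torch.nn.functional as F

def masked_sample_sketch(action_logits, valid_mask):
    # invalid actions get -inf logits and therefore zero probability
    masked_logits = action_logits.masked_fill(~valid_mask.bool(), -math.inf)
    p_actions = F.softmax(masked_logits, dim=-1)

    # (batch_size, 1): one sampled action index per example
    sampled = torch.multinomial(p_actions, num_samples=1)

    # log-prob of the sampled action, e.g. for a policy-gradient update
    log_p = F.log_softmax(masked_logits, dim=-1).gather(1, sampled).squeeze(-1)
    return sampled.squeeze(-1), log_p

# Example usage (hypothetical shapes):
#   logits = torch.randn(2, 4)
#   mask = torch.tensor([[1, 1, 0, 1], [0, 1, 1, 1]])
#   actions, log_p = masked_sample_sketch(logits, mask)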