Example no. 1
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)

        # TODO: need to decide which action to take
        pi, value = self.model(*model_inputs)  # Extrinsic policy and value.
        dist_info = DistInfo(prob=pi)

        if self.dual_model:
            # Second forward pass for the intrinsic policy and value.
            int_pi, int_value = self.model_int(*model_inputs)
            dist_int_info = DistInfo(prob=int_pi)
            # Sample from the extrinsic policy in eval mode, otherwise from
            # the intrinsic (exploration) policy.
            if self._mode == "eval":
                action = self.distribution.sample(dist_info)
            else:
                action = self.distribution.sample(dist_int_info)
        else:
            action = self.distribution.sample(dist_info)

        if self.dual_model:
            agent_info = AgentInfoTwin(dist_info=dist_info,
                                       value=value,
                                       dist_int_info=dist_int_info,
                                       int_value=int_value)
        else:
            agent_info = AgentInfo(dist_info=dist_info, value=value)

        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
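For reference, the AgentInfo and AgentInfoTwin containers returned above are typically declared as rlpyt namedarraytuples. The sketch below shows the standard AgentInfo used by rlpyt's policy-gradient agents; AgentInfoTwin is a hypothetical definition that simply mirrors the four fields filled in by the dual-model branch.

# Minimal sketch, assuming rlpyt's namedarraytuple containers. AgentInfoTwin
# is a guess that mirrors the fields used in the dual-model branch above.
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["dist_info", "value"])
AgentInfoTwin = namedarraytuple(
    "AgentInfoTwin", ["dist_info", "value", "dist_int_info", "int_value"])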
Example no. 2
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)  # Gaussian policy parameters and value.
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
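The Gaussian variant above relies on rlpyt's Gaussian distribution to turn the (mean, log_std) pair into an action; the following is a minimal self-contained sketch of that sampling call, assuming the default distribution configuration (no fixed std, no clipping or squashing).

# With the defaults, the action is drawn as mean + exp(log_std) * noise
# from the fields carried in DistInfoStd.
import torch
from rlpyt.distributions.gaussian import Gaussian, DistInfoStd

dist = Gaussian(dim=2)
dist_info = DistInfoStd(mean=torch.zeros(1, 2), log_std=torch.zeros(1, 2))
action = dist.sample(dist_info)  # shape (1, 2)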
Example no. 3
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        pi, value = self.model(*model_inputs)  # Categorical action probabilities and value.
        dist_info = DistInfo(prob=pi)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
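The discrete variant above uses rlpyt's Categorical distribution for both steps that differ from the Gaussian case: one-hot encoding the previous action for the model input, and sampling integer action indices from the probability vector in DistInfo. A small sketch of both calls follows.

import torch
from rlpyt.distributions.categorical import Categorical, DistInfo

dist = Categorical(dim=4)
onehot = dist.to_onehot(torch.tensor([2]))   # one-hot encoding of action index 2
probs = torch.full((1, 4), 0.25)             # uniform action probabilities
action = dist.sample(DistInfo(prob=probs))   # integer index in [0, 4)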
Example no. 4
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
        observation = observation.mul_(1. / 255)  # From [0, 255] to [0, 1], in place.
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        pi, value = self.model(*model_inputs)
        dist_info = DistInfo(prob=pi)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
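Example no. 4 differs from the previous discrete case only in its image preprocessing; the sketch below isolates that cast-and-rescale step on a dummy uint8 frame stack.

# Frames arrive as uint8 in [0, 255] and are cast to float, then scaled
# in place to [0, 1], before being fed to the network.
import torch

obs = torch.randint(0, 256, (4, 84, 84), dtype=torch.uint8)  # e.g. 4 stacked 84x84 frames
obs = obs.type(torch.float).mul_(1. / 255)
assert 0. <= obs.min() and obs.max() <= 1.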
Example no. 5
    def step(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute the policy's action distribution from the inputs and sample
        an action. Calls the model to produce mean, log_std, and value
        estimate. Moves inputs to the agent's device and returns outputs on
        the requested device (CPU by default), for the sampler.  (no grad)
        """
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device=device)
        return AgentStep(action=action, agent_info=agent_info)
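Example no. 5 spells out the device handling in its docstring; the sketch below shows that round trip in isolation with rlpyt's buffer_to helper, using placeholder tensors and the CPU device on both ends.

# The input tuple is moved to the agent's device for the forward pass, and
# the outputs are moved back to the device requested by the sampler.
import torch
from rlpyt.utils.buffer import buffer_to

inputs = (torch.zeros(1, 3), torch.zeros(1, 1), torch.zeros(1, 1))
on_device = buffer_to(inputs, device="cpu")   # "cuda" for a GPU agent
outputs = buffer_to(on_device, device="cpu")  # back to the sampler's device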
Example no. 6
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        # observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
        # observation = observation.mul_(1. / 255)  # From [0, 255] to [0, 1], in place.
        if len(observation.shape) == 3:
            # Single (C, H, W) observation: add a batch dim for the
            # augmentation, then remove it again.
            observation = self.aug_obs(observation.unsqueeze(0)).squeeze(0)
        else:
            observation = self.aug_obs(observation)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        # The latent and reconstruction outputs are returned by the model but
        # unused during sampling.
        pi, value, latent, reconstruction = self.model(*model_inputs)
        dist_info = DistInfo(prob=pi)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
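The last example augments the observation before the forward pass; the sketch below isolates the unsqueeze/squeeze batching trick for a single (C, H, W) observation, with aug_obs as a hypothetical identity stand-in for the real augmentation.

import torch

def aug_obs(x):  # hypothetical placeholder for a batched (N, C, H, W) augmentation
    return x

obs = torch.zeros(3, 84, 84)
if len(obs.shape) == 3:
    # Temporarily add a leading batch dimension, then drop it again.
    obs = aug_obs(obs.unsqueeze(0)).squeeze(0)
else:
    obs = aug_obs(obs)
assert obs.shape == (3, 84, 84)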