def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    if self.dual_model:
        # Intrinsic (exploration) head; only queried when a dual model exists.
        pi_int, int_value = self.model_int(*model_inputs)
        dist_int_info = DistInfo(prob=pi_int)
        # TODO: need to decide which action to take
        if self._mode == "eval":
            action = self.distribution.sample(dist_info)
        else:
            action = self.distribution.sample(dist_int_info)
    else:
        action = self.distribution.sample(dist_info)
    if self.dual_model:
        agent_info = AgentInfoTwin(dist_info=dist_info, value=value,
            dist_int_info=dist_int_info, int_value=int_value)
    else:
        agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)
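# Hedged sketch (not part of the original agent): the dual-model branch above samples
# from the extrinsic policy in "eval" mode and from the intrinsic policy otherwise.
# The standalone function below reproduces that selection rule with plain
# torch.distributions.Categorical in place of rlpyt's distribution object; the names
# `pi`, `pi_int`, and `mode` are illustrative assumptions.
import torch
from torch.distributions import Categorical

def select_dual_action(pi: torch.Tensor, pi_int: torch.Tensor, mode: str) -> torch.Tensor:
    """Sample from the extrinsic policy when evaluating, else from the intrinsic one."""
    probs = pi if mode == "eval" else pi_int
    return Categorical(probs=probs).sample()

# Example: batch of 2 states, 4 discrete actions.
# pi = torch.softmax(torch.randn(2, 4), dim=-1)
# pi_int = torch.softmax(torch.randn(2, 4), dim=-1)
# a = select_dual_action(pi, pi_int, mode="sample")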
def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    mu, log_std, value = self.model(*model_inputs)
    dist_info = DistInfoStd(mean=mu, log_std=log_std)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)
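# Hedged sketch (assumption, not rlpyt's implementation): `self.distribution.to_onehot`
# above converts integer previous actions into one-hot vectors before they are fed to
# the model. torch.nn.functional.one_hot gives the same effect for a known action count.
import torch
import torch.nn.functional as F

def to_onehot_example(prev_action: torch.Tensor, num_actions: int) -> torch.Tensor:
    """Map integer actions of shape (...,) to float one-hot vectors of shape (..., num_actions)."""
    return F.one_hot(prev_action.long(), num_classes=num_actions).float()

# Example: to_onehot_example(torch.tensor([0, 2, 1]), num_actions=4)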
def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
    observation = observation.mul_(1. / 255)  # From [0-255] to [0-1], in place.
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)
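# Hedged sketch: the two lines above cast uint8 frames to float and rescale to [0, 1]
# before the forward pass. A minimal, self-contained version of that preprocessing
# (the frame shape in the example is an assumption for illustration):
import torch

def normalize_uint8_frames(obs: torch.Tensor) -> torch.Tensor:
    """Convert uint8 pixel observations in [0, 255] to float32 in [0, 1]."""
    return obs.to(torch.float32).mul_(1.0 / 255)

# Example: normalize_uint8_frames(torch.randint(0, 256, (4, 84, 84), dtype=torch.uint8))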
def step(self, observation, prev_action, prev_reward, device="cpu"):
    """
    Compute the policy's action distribution from inputs, and sample an action.
    Calls the model to produce mean, log_std, and value estimate.
    Moves inputs to the agent's device and returns outputs on the requested
    device (CPU by default), for the sampler.  (no grad)
    """
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    mu, log_std, value = self.model(*model_inputs)
    dist_info = DistInfoStd(mean=mu, log_std=log_std)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device=device)
    return AgentStep(action=action, agent_info=agent_info)
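# Hedged sketch: DistInfoStd carries (mean, log_std) and the agent's Gaussian
# distribution samples from it. An equivalent draw with torch.distributions.Normal
# (names here are illustrative, not rlpyt's internals):
import torch
from torch.distributions import Normal

def sample_gaussian_action(mu: torch.Tensor, log_std: torch.Tensor) -> torch.Tensor:
    """Draw an action from N(mu, exp(log_std))."""
    return Normal(loc=mu, scale=log_std.exp()).sample()

# Example: sample_gaussian_action(torch.zeros(3), torch.full((3,), -0.5))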
def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    # observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
    # observation = observation.mul_(1. / 255)  # From [0-255] to [0-1], in place.
    if len(observation.shape) == 3:
        # Single (C, H, W) observation: add a batch dim for the augmentation, then drop it.
        observation = self.aug_obs(observation.unsqueeze(0)).squeeze(0)
    else:
        observation = self.aug_obs(observation)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value, latent, reconstruction = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)
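# Hedged sketch: `self.aug_obs` above expects a batched input, so a single observation
# is wrapped with unsqueeze(0) and unwrapped afterwards. The helper below shows that
# batching pattern around an arbitrary augmentation callable; `random_shift` is a
# hypothetical stand-in, not the augmentation used by the original agent.
import torch
import torch.nn.functional as F

def random_shift(x: torch.Tensor, pad: int = 4) -> torch.Tensor:
    """Pad with replicated borders, then crop back to the original size at a random offset."""
    n, c, h, w = x.shape
    x = F.pad(x, (pad, pad, pad, pad), mode="replicate")
    top = torch.randint(0, 2 * pad + 1, (1,)).item()
    left = torch.randint(0, 2 * pad + 1, (1,)).item()
    return x[:, :, top:top + h, left:left + w]

def augment_maybe_unbatched(obs: torch.Tensor, aug) -> torch.Tensor:
    """Apply a batch-only augmentation to either a (C, H, W) or (N, C, H, W) tensor."""
    if obs.dim() == 3:
        return aug(obs.unsqueeze(0)).squeeze(0)
    return aug(obs)

# Example: augment_maybe_unbatched(torch.rand(3, 84, 84), random_shift)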