 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     probs, value, rnn_state = self.model(*agent_inputs,
                                          self.prev_rnn_state)
     dist_info = DistInfo(prob=probs)
     if self._mode == 'sample':
         action = self.distribution.sample(dist_info)
     elif self._mode == 'eval':
         action = torch.argmax(probs, dim=-1)
     else:
         raise ValueError(f"Unrecognized sampling mode: {self._mode}")
     # Model handles None, but Buffer does not, make zeros if needed:
     if self.prev_rnn_state is None:
         prev_rnn_state = buffer_func(rnn_state, torch.zeros_like)
     else:
         prev_rnn_state = self.prev_rnn_state
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case: model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     agent_info = AgentInfoRnn(dist_info=dist_info,
                               value=value,
                               prev_rnn_state=prev_rnn_state)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)
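Note: all of the step() methods in this listing lean on rlpyt's buffer helpers (buffer_to, buffer_func, buffer_method), which apply an operation to every tensor leaf of a possibly nested (named)tuple. The sketch below illustrates that behavior under simplifying assumptions (plain tuples/namedtuples of tensors only; the sketch_-prefixed names are hypothetical); it is not rlpyt's actual implementation, which also handles None fields and namedarraytuples.

# Minimal, hedged sketch of the buffer helpers used above (illustration only).
import torch


def sketch_buffer_to(buffer_, device=None):
    """Recursively move every tensor leaf of a (nested) tuple to `device`."""
    if isinstance(buffer_, torch.Tensor):
        return buffer_.to(device)
    if isinstance(buffer_, tuple):  # Includes namedtuples.
        contents = tuple(sketch_buffer_to(b, device) for b in buffer_)
        return type(buffer_)(*contents) if hasattr(buffer_, "_fields") else contents
    return buffer_


def sketch_buffer_func(buffer_, func, *args):
    """Apply `func` (e.g. torch.zeros_like) to every tensor leaf."""
    if isinstance(buffer_, torch.Tensor):
        return func(buffer_, *args)
    if isinstance(buffer_, tuple):
        contents = tuple(sketch_buffer_func(b, func, *args) for b in buffer_)
        return type(buffer_)(*contents) if hasattr(buffer_, "_fields") else contents
    return buffer_


def sketch_buffer_method(buffer_, method_name, *args):
    """Call `method_name` (e.g. "transpose") on every tensor leaf."""
    if isinstance(buffer_, torch.Tensor):
        return getattr(buffer_, method_name)(*args)
    if isinstance(buffer_, tuple):
        contents = tuple(sketch_buffer_method(b, method_name, *args) for b in buffer_)
        return type(buffer_)(*contents) if hasattr(buffer_, "_fields") else contents
    return buffer_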
Example #2
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)

        pi, value = self.model(*model_inputs)
        dist_info = DistInfo(prob=pi)

        if self.dual_model:
            # Only query the intrinsic model when it exists; use the
            # extrinsic policy in eval mode, the intrinsic one otherwise.
            pi_int, int_value = self.model_int(*model_inputs)
            dist_int_info = DistInfo(prob=pi_int)
            if self._mode == "eval":
                action = self.distribution.sample(dist_info)
            else:
                action = self.distribution.sample(dist_int_info)
        else:
            action = self.distribution.sample(dist_info)

        if self.dual_model:
            agent_info = AgentInfoTwin(dist_info=dist_info,
                                       value=value,
                                       dist_int_info=dist_int_info,
                                       int_value=int_value)
        else:
            agent_info = AgentInfo(dist_info=dist_info, value=value)

        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
Example #3
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     action = self.model(*model_inputs)
     agent_info = EmptyAgentInfo()
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #4
 def step(self, observation, prev_action, prev_reward, device="cpu"):
     """
     Compute policy's action distribution from inputs, and sample an
     action. Calls the model to produce mean, log_std, value estimate, and
     next recurrent state.  Moves inputs to device and returns outputs back
     to CPU, for the sampler.  Advances the recurrent state of the agent.
     (no grad)
     """
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu, log_std, value, rnn_state = self.model(*agent_inputs,
                                                self.prev_rnn_state)
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     action = self.distribution.sample(dist_info)
     # Model handles None, but Buffer does not, make zeros if needed:
     if self.prev_rnn_state is None:
         prev_rnn_state = buffer_func(rnn_state, torch.zeros_like)
     else:
         prev_rnn_state = self.prev_rnn_state
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case: model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     agent_info = AgentInfoRnn(dist_info=dist_info,
                               value=value,
                               prev_rnn_state=prev_rnn_state)
     action, agent_info = buffer_to((action, agent_info), device=device)
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)
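Note: the [N,B,H] --> [B,N,H] transpose above is needed because PyTorch RNNs keep the layer dimension (N) leading on their hidden state, while the sampler's storage buffers index by batch (environment) first. A shape-only illustration, with hypothetical sizes:

# Shape illustration for the rnn_state transpose (sizes are made up).
import torch

n_layers, batch, hidden = 2, 4, 8
h = torch.zeros(n_layers, batch, hidden)  # [N, B, H], as returned by nn.LSTM.
h_for_storage = h.transpose(0, 1)         # [B, N, H], batch-leading for the buffer.
assert h_for_storage.shape == (batch, n_layers, hidden)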
Example #5
 def step(self, observation, prev_action, prev_reward, device="cpu"):
     """
     Compute policy's option and action distributions from inputs.
     Calls model to get mean, std for all pi_w, q, beta for all options, pi over options
     Moves inputs to device and returns outputs back to CPU, for the
     sampler.  (no grad)
     """
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu, log_std, beta, q, pi = self.model(*model_inputs)
     dist_info_omega = DistInfo(prob=pi)
     new_o, terminations = self.sample_option(
         beta, dist_info_omega)  # Sample terminations and options
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     dist_info_o = DistInfoStd(mean=select_at_indexes(new_o, mu),
                               log_std=select_at_indexes(new_o, log_std))
     action = self.distribution.sample(dist_info_o)
     agent_info = AgentInfoOC(dist_info=dist_info,
                              dist_info_o=dist_info_o,
                              q=q,
                              value=(pi * q).sum(-1),
                              termination=terminations,
                              dist_info_omega=dist_info_omega,
                              prev_o=self._prev_option,
                              o=new_o)
     action, agent_info = buffer_to((action, agent_info), device=device)
     self.advance_oc_state(new_o)
     return AgentStep(action=action, agent_info=agent_info)
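Note: select_at_indexes picks, for each batch element, the mu/log_std row belonging to that element's sampled option. A minimal stand-in, under the assumption that the option index selects the dimension right after the batch dims (illustration only; not rlpyt's select_at_indexes):

# Hypothetical stand-in for select_at_indexes (illustration only).
import torch


def sketch_select_at_indexes(indexes, tensor):
    """indexes: [B] of option ids; tensor: [B, num_options, ...]."""
    dim = len(indexes.shape)
    assert indexes.shape == tensor.shape[:dim]
    num = tensor.shape[dim]
    t_flat = tensor.view((-1, num) + tensor.shape[dim + 1:])
    s_flat = t_flat[torch.arange(t_flat.shape[0]), indexes.view(-1)]
    return s_flat.view(tensor.shape[:dim] + tensor.shape[dim + 1:])


mu = torch.randn(4, 3, 2)              # Batch of 4, 3 options, 2-D action means.
new_o = torch.tensor([0, 2, 1, 2])     # Sampled option per batch element.
mu_o = sketch_select_at_indexes(new_o, mu)  # -> shape [4, 2]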
Example #6
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     action, action_probs, log_action_probs = self.model(*model_inputs)
     dist_info = DistInfo(prob=action_probs)
     agent_info = AgentInfo(dist_info=dist_info)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #7
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu = self.model(*model_inputs)
     action = self.distribution.sample(DistInfo(mean=mu))
     agent_info = AgentInfo(mu=mu)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #8
 def to_agent_step(self, output):
     """Convert the output of the NN model into step info for the agent.
     """
     q = output
     # q = q.cpu()
     action = self.distribution.sample(q)
     agent_info = AgentInfo(q=q)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #9
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        # mean, log_std = self.model(*model_inputs)
        # dist_info = DistInfoStd(mean=mean, log_std=log_std)
        # action = self.distribution.sample(dist_info)
        if self.random_actions_for_pretraining:
            action = torch.randint_like(prev_action, 15)
            action = buffer_to(action, device="cpu")
            return AgentStep(action=action,
                             agent_info=AgentInfo(dist_info=None))

        pi, _, _ = self.model(*model_inputs)
        dist_info = DistInfo(prob=pi)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)
Example #10
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu, log_std, value = self.model(*model_inputs)
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info, value=value)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #11
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     alpha, beta = self.model(*model_inputs)
     dist_info = DistInfoStd(alpha=alpha, beta=beta)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #12
 def to_agent_step(self, output):
     """Convert the output of the NN model into step info for the agent.
     """
     p = output
     # p = p.cpu()
     action = self.distribution.sample(p)
     agent_info = AgentInfo(p=p)  # Only change from DQN: q -> p.
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #13
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     q = self.model(*model_inputs)
     q = q.cpu()
     action = self.distribution.sample(q)
     agent_info = AgentInfo(q=q)
     # action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #14
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     p = self.model(*model_inputs)
     p = p.cpu()
     action = self.distribution.sample(p)
     agent_info = AgentInfo(p=p)  # Only change from DQN: q -> p.
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #15
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     pi, value = self.model(*model_inputs)
     dist_info = DistInfo(prob=pi)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info, value=value)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #16
 def step(self, observation, prev_action, prev_reward):
     """Computes distribution parameters (mu) for state/observation,
     returns (gaussian) sampled action."""
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu = self.model(*model_inputs)
     action = self.distribution.sample(DistInfo(mean=mu))
     agent_info = AgentInfo(mu=mu)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #17
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to(observation, device=self.device)
     mean, log_std, sym_features = self.model(model_inputs,
                                              "pi",
                                              extract_sym_features=True)
     dist_info = DistInfoStd(mean=mean, log_std=log_std)
     action = self.distribution.sample(dist_info)
     agent_info = SafeSacAgentInfo(dist_info=dist_info,
                                   sym_features=sym_features)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #18
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mean, log_std = self.pi_model(*model_inputs)
     dist_info = DistInfoStd(mean=mean, log_std=log_std)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     if np.any(np.isnan(action.numpy())):
         breakpoint()
     return AgentStep(action=action, agent_info=agent_info)
Example #19
 def step(self, observation, prev_action, prev_reward):
     """Computes Q-values for states/observations and selects actions by
     epsilon-greedy. (no grad)"""
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     q = self.model(*model_inputs)
     q = q.cpu()
     action = self.distribution.sample(q)
     agent_info = AgentInfo(q=q)
     # action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
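Note: for the DQN-style agents, self.distribution.sample(q) is epsilon-greedy over the Q-values. A hedged sketch of that sampling rule (not rlpyt's EpsilonGreedy class; epsilon here is a made-up constant):

# Sketch of epsilon-greedy action selection over Q-values of shape [B, A].
import torch


def sketch_epsilon_greedy_sample(q, epsilon=0.1):
    greedy = torch.argmax(q, dim=-1)                   # Best action per row.
    random = torch.randint(q.shape[-1], greedy.shape)  # Uniform random action.
    explore = torch.rand(greedy.shape) < epsilon       # Per-row coin flip.
    return torch.where(explore, random, greedy)


q = torch.randn(5, 4)  # Batch of 5 states, 4 actions.
actions = sketch_epsilon_greedy_sample(q, epsilon=0.1)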
Example #20
 def step(self, observation, prev_action, prev_reward):
     observation, prev_action, prev_reward = buffer_to(
         (observation, prev_action, prev_reward), device=self.device)
     # self.model includes encoder + actor MLP.
     mean, log_std, latent, conv = self.model(observation, prev_action,
                                              prev_reward)
     dist_info = DistInfoStd(mean=mean, log_std=log_std)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info,
                            conv=conv if self.store_latent else None)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #21
 def step(self, observation, prev_action, prev_reward):
     """Compute the discrete distribution for the Q-value for each
     action for each state/observation (no grad)."""
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     p = self.model(*model_inputs)
     p = p.cpu()
     action = self.distribution.sample(p)
     agent_info = AgentInfo(p=p)  # Only change from DQN: q -> p.
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
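Note: in the distributional ("q -> p") agents, p is a categorical distribution over a fixed return support for each action; action selection typically first collapses it to expected Q-values. A sketch with assumed support settings (V_min, V_max, n_atoms are hypothetical here):

# Sketch: collapse a return distribution p [B, A, n_atoms] to expected Q-values.
import torch

V_min, V_max, n_atoms = -10.0, 10.0, 51    # Hypothetical support settings.
z = torch.linspace(V_min, V_max, n_atoms)  # Return support (atoms).

p = torch.softmax(torch.randn(5, 4, n_atoms), dim=-1)  # Dummy distribution.
q = torch.tensordot(p, z, dims=1)          # Expected return per action, [B, A].
greedy_action = torch.argmax(q, dim=-1)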
Example #22
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.format_actions(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     pi, ext_value, int_value = self.model(*model_inputs)
     dist_info = DistInfo(prob=pi)
     action = self.distribution.sample(dist_info)
     agent_info = IntAgentInfo(dist_info=dist_info,
                               ext_value=ext_value,
                               int_value=int_value)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #23
 def step(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, ), device=self.device)[0]
     mu, log_std, value, sym_features = self.model(
         model_inputs, extract_sym_features=True)
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     action = self.distribution.sample(dist_info)
     action = action.clamp(-1, 1)
     agent_info = SafeAgentInfo(dist_info=dist_info,
                                value=value,
                                sym_features=sym_features)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #24
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     pi, value, conv = self.model(*model_inputs)
     if self._act_uniform:
         pi[:] = 1. / pi.shape[-1]  # uniform
     dist_info = DistInfo(prob=pi)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfoConv(dist_info=dist_info, value=value,
         conv=conv if self.store_latent else None)  # Don't write extra data.
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #25
 def step(self, observation, prev_action=None, prev_reward=None):
     pi, value, sym_features = self.model(
         observation.to(device=self.device), extract_sym_features=True)
     dist_info = DistInfo(prob=pi)
     action = self.distribution.sample(dist_info)
     # sym_features should either always be returned or never be returned.
     if sym_features is not None:
         agent_info = SafeAgentInfo(dist_info=dist_info,
                                    value=value,
                                    sym_features=sym_features)
     else:
         agent_info = AgentInfo(dist_info=dist_info, value=value)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #26
 def to_agent_step(self, output):
     """Convert the output of the NN model into step info for the agent.
     """
     q, rnn_state = output
     # q = q.cpu()
     action = self.distribution.sample(q)
     prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state, torch.zeros_like)
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case, model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     prev_rnn_state, action, q = buffer_to((prev_rnn_state, action, q), device="cpu")
     agent_info = AgentInfo(q=q, prev_rnn_state=prev_rnn_state)
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)
Example #27
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
     observation = observation.mul_(1. / 255)  # From [0-255] to [0-1], in place.
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     pi, value = self.model(*model_inputs)
     dist_info = DistInfo(prob=pi)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info, value=value)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #28
 def step(self, observation, prev_action, prev_reward, device="cpu"):
     """
     Compute policy's action distribution from inputs, and sample an
     action. Calls the model to produce mean, log_std, and value estimate.
     Moves inputs to device and returns outputs back to CPU, for the
     sampler.  (no grad)
     """
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu, log_std, value = self.model(*model_inputs)
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info, value=value)
     action, agent_info = buffer_to((action, agent_info), device=device)
     return AgentStep(action=action, agent_info=agent_info)
Example #29
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     #observation = observation.type(torch.float)  # Expect torch.uint8 inputs
     #observation = observation.mul_(1. / 255)  # From [0-255] to [0-1], in place.
     if len(observation.shape) == 3:
         observation = self.aug_obs(observation.unsqueeze(0)).squeeze(0)
     else:
         observation = self.aug_obs(observation)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     pi, value, latent, reconstruction = self.model(*model_inputs)
     dist_info = DistInfo(prob=pi)
     action = self.distribution.sample(dist_info)
     agent_info = AgentInfo(dist_info=dist_info, value=value)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #30
 def step(self, observation, prev_action, prev_reward):
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     mu, log_std, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state)
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     action = self.distribution.sample(dist_info)
     # Model handles None, but Buffer does not, make zeros if needed:
     prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state, torch.zeros_like)
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case: model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     agent_info = AgentInfoRnn(dist_info=dist_info, value=value,
         prev_rnn_state=prev_rnn_state)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)