def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    agent_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    probs, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state)
    dist_info = DistInfo(prob=probs)
    if self._mode == "sample":
        action = self.distribution.sample(dist_info)
    elif self._mode == "eval":
        action = torch.argmax(probs, dim=-1)
    else:
        raise ValueError(f"Unknown sampling mode: {self._mode}")
    # Model handles None, but Buffer does not, make zeros if needed:
    if self.prev_rnn_state is None:
        prev_rnn_state = buffer_func(rnn_state, torch.zeros_like)
    else:
        prev_rnn_state = self.prev_rnn_state
    # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
    # (Special case: model should always leave B dimension in.)
    prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
    agent_info = AgentInfoRnn(dist_info=dist_info, value=value,
        prev_rnn_state=prev_rnn_state)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    self.advance_rnn_state(rnn_state)  # Keep on device.
    return AgentStep(action=action, agent_info=agent_info)

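# A minimal standalone sketch (not part of the agent above) of what the
# rnn-state transpose is doing: the model keeps the recurrent state in
# [N, B, H] layout (layers, batch, hidden), while per-step storage wants
# the batch dimension first, [B, N, H].  The (h, c) namedtuple below is an
# illustrative assumption standing in for whatever leaf structure
# buffer_method() walks; it is not the agent's actual buffer type.
import torch
from collections import namedtuple

RnnState = namedtuple("RnnState", ["h", "c"])  # LSTM-style hidden/cell pair.

N, B, H = 1, 4, 16  # layers, batch, hidden size (example values).
state = RnnState(h=torch.zeros(N, B, H), c=torch.zeros(N, B, H))

# Leaf-wise transpose, analogous to buffer_method(state, "transpose", 0, 1):
stored = RnnState(*(s.transpose(0, 1) for s in state))
assert stored.h.shape == (B, N, H) and stored.c.shape == (B, N, H)
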
def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    # TODO: need to decide which action to take.
    pi, value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    if self.dual_model:
        # Query the intrinsic model once and build its distribution.
        pi_int, int_value = self.model_int(*model_inputs)
        dist_int_info = DistInfo(prob=pi_int)
        if self._mode == "eval":
            action = self.distribution.sample(dist_info)
        else:
            action = self.distribution.sample(dist_int_info)
        agent_info = AgentInfoTwin(dist_info=dist_info, value=value,
            dist_int_info=dist_int_info, int_value=int_value)
    else:
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    action = self.model(*model_inputs)
    agent_info = EmptyAgentInfo()
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward, device="cpu"): """ Compute policy's action distribution from inputs, and sample an action. Calls the model to produce mean, log_std, value estimate, and next recurrent state. Moves inputs to device and returns outputs back to CPU, for the sampler. Advances the recurrent state of the agent. (no grad) """ agent_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu, log_std, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state) dist_info = DistInfoStd(mean=mu, log_std=log_std) action = self.distribution.sample(dist_info) # Model handles None, but Buffer does not, make zeros if needed: prev_rnn_state = self.prev_rnn_state if self.prev_rnn_state is not None else buffer_func( rnn_state, torch.zeros_like) # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage. # (Special case: model should always leave B dimension in.) prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1) agent_info = AgentInfoRnn(dist_info=dist_info, value=value, prev_rnn_state=prev_rnn_state) action, agent_info = buffer_to((action, agent_info), device=device) self.advance_rnn_state(rnn_state) # Keep on device. return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward, device="cpu"): """ Compute policy's option and action distributions from inputs. Calls model to get mean, std for all pi_w, q, beta for all options, pi over options Moves inputs to device and returns outputs back to CPU, for the sampler. (no grad) """ model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu, log_std, beta, q, pi = self.model(*model_inputs) dist_info_omega = DistInfo(prob=pi) new_o, terminations = self.sample_option( beta, dist_info_omega) # Sample terminations and options dist_info = DistInfoStd(mean=mu, log_std=log_std) dist_info_o = DistInfoStd(mean=select_at_indexes(new_o, mu), log_std=select_at_indexes(new_o, log_std)) action = self.distribution.sample(dist_info_o) agent_info = AgentInfoOC(dist_info=dist_info, dist_info_o=dist_info_o, q=q, value=(pi * q).sum(-1), termination=terminations, dist_info_omega=dist_info_omega, prev_o=self._prev_option, o=new_o) action, agent_info = buffer_to((action, agent_info), device=device) self.advance_oc_state(new_o) return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    action, action_probs, log_action_probs = self.model(*model_inputs)
    dist_info = DistInfo(prob=action_probs)
    agent_info = AgentInfo(dist_info=dist_info)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    mu = self.model(*model_inputs)
    action = self.distribution.sample(DistInfo(mean=mu))
    agent_info = AgentInfo(mu=mu)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def to_agent_step(self, output):
    """Convert the output of the NN model into step info for the agent."""
    q = output
    # q = q.cpu()
    action = self.distribution.sample(q)
    agent_info = AgentInfo(q=q)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    # mean, log_std = self.model(*model_inputs)
    # dist_info = DistInfoStd(mean=mean, log_std=log_std)
    # action = self.distribution.sample(dist_info)
    if self.random_actions_for_pretraining:
        # Uniform random actions in [0, 15); assumes a 15-action discrete space.
        action = torch.randint_like(prev_action, 15)
        action = buffer_to(action, device="cpu")
        return AgentStep(action=action, agent_info=AgentInfo(dist_info=None))
    pi, _, _ = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    mu, log_std, value = self.model(*model_inputs)
    dist_info = DistInfoStd(mean=mu, log_std=log_std)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    alpha, beta = self.model(*model_inputs)
    dist_info = DistInfoStd(alpha=alpha, beta=beta)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def to_agent_step(self, output):
    """Convert the output of the NN model into step info for the agent."""
    p = output
    # p = p.cpu()
    action = self.distribution.sample(p)
    agent_info = AgentInfo(p=p)  # Only change from DQN: q -> p.
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    q = self.model(*model_inputs)
    q = q.cpu()
    action = self.distribution.sample(q)
    agent_info = AgentInfo(q=q)
    # action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    p = self.model(*model_inputs)
    p = p.cpu()
    action = self.distribution.sample(p)
    agent_info = AgentInfo(p=p)  # Only change from DQN: q -> p.
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward): """Computes distribution parameters (mu) for state/observation, returns (gaussian) sampled action.""" model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu = self.model(*model_inputs) action = self.distribution.sample(DistInfo(mean=mu)) agent_info = AgentInfo(mu=mu) action, agent_info = buffer_to((action, agent_info), device="cpu") return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to(observation, device=self.device)
    mean, log_std, sym_features = self.model(model_inputs, "pi",
        extract_sym_features=True)
    dist_info = DistInfoStd(mean=mean, log_std=log_std)
    action = self.distribution.sample(dist_info)
    agent_info = SafeSacAgentInfo(dist_info=dist_info,
        sym_features=sym_features)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    mean, log_std = self.pi_model(*model_inputs)
    dist_info = DistInfoStd(mean=mean, log_std=log_std)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    if np.any(np.isnan(action.numpy())):
        breakpoint()  # Debugging guard: stop if the sampled action contains NaNs.
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward): """Computes Q-values for states/observations and selects actions by epsilon-greedy. (no grad)""" prev_action = self.distribution.to_onehot(prev_action) model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) q = self.model(*model_inputs) q = q.cpu() action = self.distribution.sample(q) agent_info = AgentInfo(q=q) # action, agent_info = buffer_to((action, agent_info), device="cpu") return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward):
    observation, prev_action, prev_reward = buffer_to(
        (observation, prev_action, prev_reward), device=self.device)
    # self.model includes encoder + actor MLP.
    mean, log_std, latent, conv = self.model(observation, prev_action,
        prev_reward)
    dist_info = DistInfoStd(mean=mean, log_std=log_std)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info,
        conv=conv if self.store_latent else None)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward): """Compute the discrete distribution for the Q-value for each action for each state/observation (no grad).""" prev_action = self.distribution.to_onehot(prev_action) model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) p = self.model(*model_inputs) p = p.cpu() action = self.distribution.sample(p) agent_info = AgentInfo(p=p) # Only change from DQN: q -> p. action, agent_info = buffer_to((action, agent_info), device="cpu") return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward):
    prev_action = self.format_actions(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, ext_value, int_value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = IntAgentInfo(dist_info=dist_info, ext_value=ext_value,
        int_value=int_value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    model_inputs = buffer_to((observation,), device=self.device)[0]
    mu, log_std, value, sym_features = self.model(model_inputs,
        extract_sym_features=True)
    dist_info = DistInfoStd(mean=mu, log_std=log_std)
    action = self.distribution.sample(dist_info)
    action = action.clamp(-1, 1)
    agent_info = SafeAgentInfo(dist_info=dist_info, value=value,
        sym_features=sym_features)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value, conv = self.model(*model_inputs)
    if self._act_uniform:
        pi[:] = 1. / pi.shape[-1]  # uniform
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfoConv(dist_info=dist_info, value=value,
        conv=conv if self.store_latent else None)  # Don't write extra data.
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action=None, prev_reward=None):
    pi, value, sym_features = self.model(
        observation.to(device=self.device), extract_sym_features=True)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    # Either sym_features should always be given or never.
    if sym_features is not None:
        agent_info = SafeAgentInfo(dist_info=dist_info, value=value,
            sym_features=sym_features)
    else:
        agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def to_agent_step(self, output):
    """Convert the output of the NN model into step info for the agent."""
    q, rnn_state = output
    # q = q.cpu()
    action = self.distribution.sample(q)
    prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state,
        torch.zeros_like)
    # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
    # (Special case, model should always leave B dimension in.)
    prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
    prev_rnn_state, action, q = buffer_to((prev_rnn_state, action, q),
        device="cpu")
    agent_info = AgentInfo(q=q, prev_rnn_state=prev_rnn_state)
    self.advance_rnn_state(rnn_state)  # Keep on device.
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
    observation = observation.mul_(1. / 255)  # From [0-255] to [0-1], in place.
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward, device="cpu"): """ Compute policy's action distribution from inputs, and sample an action. Calls the model to produce mean, log_std, and value estimate. Moves inputs to device and returns outputs back to CPU, for the sampler. (no grad) """ model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu, log_std, value = self.model(*model_inputs) dist_info = DistInfoStd(mean=mu, log_std=log_std) action = self.distribution.sample(dist_info) agent_info = AgentInfo(dist_info=dist_info, value=value) action, agent_info = buffer_to((action, agent_info), device=device) return AgentStep(action=action, agent_info=agent_info)
def step(self, observation, prev_action, prev_reward):
    prev_action = self.distribution.to_onehot(prev_action)
    # observation = observation.type(torch.float)  # Expect torch.uint8 inputs.
    # observation = observation.mul_(1. / 255)  # From [0-255] to [0-1], in place.
    if len(observation.shape) == 3:
        observation = self.aug_obs(observation.unsqueeze(0)).squeeze(0)
    else:
        observation = self.aug_obs(observation)
    model_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    pi, value, latent, reconstruction = self.model(*model_inputs)
    dist_info = DistInfo(prob=pi)
    action = self.distribution.sample(dist_info)
    agent_info = AgentInfo(dist_info=dist_info, value=value)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    return AgentStep(action=action, agent_info=agent_info)

def step(self, observation, prev_action, prev_reward):
    agent_inputs = buffer_to((observation, prev_action, prev_reward),
        device=self.device)
    mu, log_std, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state)
    dist_info = DistInfoStd(mean=mu, log_std=log_std)
    action = self.distribution.sample(dist_info)
    # Model handles None, but Buffer does not, make zeros if needed:
    prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state,
        torch.zeros_like)
    # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
    # (Special case: model should always leave B dimension in.)
    prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
    agent_info = AgentInfoRnn(dist_info=dist_info, value=value,
        prev_rnn_state=prev_rnn_state)
    action, agent_info = buffer_to((action, agent_info), device="cpu")
    self.advance_rnn_state(rnn_state)  # Keep on device.
    return AgentStep(action=action, agent_info=agent_info)