def evaluate(
    self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
    """
    Run the policy on a batch of decision requests without tracking gradients.

    :param decision_requests: DecisionSteps object containing inputs.
    :param global_agent_ids: Agent ids handed to ``self.retrieve_memories`` to
        fetch the memories that are fed to the network.
    :return: Dict with the keys "action", "pre_action", "log_probs",
        "entropy" and "learning_rate", plus "memory_out" when
        ``self.use_recurrent`` is truthy.
    """
    split_obs, action_masks = self._split_decision_step(decision_requests)
    vector_inputs = [torch.as_tensor(split_obs.vector_observations)]
    visual_inputs = [
        torch.as_tensor(frame) for frame in split_obs.visual_observations
    ]
    # The retrieved memories get an extra leading dimension of size 1.
    prev_memories = torch.as_tensor(
        self.retrieve_memories(global_agent_ids)
    ).unsqueeze(0)

    with torch.no_grad():
        action, log_probs, entropy, next_memories = self.sample_actions(
            vector_inputs,
            visual_inputs,
            masks=action_masks,
            memories=prev_memories,
        )

    run_out = {
        "action": ModelUtils.to_numpy(action),
        # Todo - make pre_action difference
        "pre_action": ModelUtils.to_numpy(action),
        "log_probs": ModelUtils.to_numpy(log_probs),
        "entropy": ModelUtils.to_numpy(entropy),
        "learning_rate": 0.0,
    }
    if self.use_recurrent:
        # Drop the leading dimension that was added before sampling.
        run_out["memory_out"] = ModelUtils.to_numpy(next_memories).squeeze(0)
    return run_out
def evaluate(
    self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
    """
    Run the policy on a batch of decision requests without tracking gradients.

    :param decision_requests: DecisionSteps object containing inputs.
    :param global_agent_ids: Agent ids handed to ``self.retrieve_memories`` to
        fetch the memories that are fed to the network.
    :return: Dict with the keys "action", "env_action", "log_probs",
        "entropy" and "learning_rate", plus "memory_out" when
        ``self.use_recurrent`` is truthy.
    """
    raw_obs = decision_requests.obs
    action_masks = self._extract_masks(decision_requests)
    obs_tensors = [torch.as_tensor(np_ob) for np_ob in raw_obs]
    # The retrieved memories get an extra leading dimension of size 1.
    prev_memories = torch.as_tensor(
        self.retrieve_memories(global_agent_ids)
    ).unsqueeze(0)

    with torch.no_grad():
        action, log_probs, entropy, next_memories = self.sample_actions(
            obs_tensors, masks=action_masks, memories=prev_memories
        )

    run_out = {
        "action": action.to_action_tuple(),
        # This is the clipped action which is not saved to the buffer
        # but is exclusively sent to the environment.
        "env_action": action.to_action_tuple(clip=self._clip_action),
        "log_probs": log_probs.to_log_probs_tuple(),
        "entropy": ModelUtils.to_numpy(entropy),
        "learning_rate": 0.0,
    }
    if self.use_recurrent:
        # Drop the leading dimension that was added before sampling.
        run_out["memory_out"] = ModelUtils.to_numpy(next_memories).squeeze(0)
    return run_out
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Assert that both policies return identical log-probabilities when they
    are given the same generated input.

    :param policy1: Policy whose behavior spec and memories define the input.
    :param policy2: Policy evaluated on that same input.
    """
    steps, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    split_obs, action_masks = policy1._split_decision_step(steps)
    vector_inputs = [torch.as_tensor(split_obs.vector_observations)]
    visual_inputs = [
        torch.as_tensor(frame) for frame in split_obs.visual_observations
    ]
    agent_ids = list(steps.agent_id)
    shared_memories = torch.as_tensor(
        policy1.retrieve_memories(agent_ids)
    ).unsqueeze(0)

    collected_log_probs = []
    with torch.no_grad():
        # Both policies see exactly the same tensors, masks and memories.
        for policy in (policy1, policy2):
            _, log_probs, _, _, _ = policy.sample_actions(
                vector_inputs,
                visual_inputs,
                masks=action_masks,
                memories=shared_memories,
                all_log_probs=True,
            )
            collected_log_probs.append(log_probs)

    first_log_probs, second_log_probs = collected_log_probs
    np.testing.assert_array_equal(first_log_probs, second_log_probs)
def compute_gradient_magnitude(
    self, policy_batch: AgentBuffer, expert_batch: AgentBuffer
) -> torch.Tensor:
    """
    Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability
    esp. for off-policy. Gradients are taken w.r.t. a random interpolation
    between the policy inputs and the expert inputs.

    :param policy_batch: Buffer supplying one end of the interpolation (state
        inputs and, when actions are used, actions and done flags).
    :param expert_batch: Buffer supplying the other end of the interpolation.
    :return: Mean over the batch of ``(safe_norm - 1) ** 2``.
    """

    def _random_mix(policy_part, expert_part):
        # Element-wise random convex combination of the two tensors.
        weight = torch.rand(policy_part.shape)
        return weight * policy_part + (1 - weight) * expert_part

    state_pairs = zip(
        self.get_state_inputs(policy_batch),
        self.get_state_inputs(expert_batch),
    )
    interp_inputs = []
    for policy_input, expert_input in state_pairs:
        mixed_input = _random_mix(policy_input, expert_input)
        mixed_input.requires_grad = True  # For gradient calculation
        interp_inputs.append(mixed_input)

    if self._settings.use_actions:
        policy_action = self.get_action_input(policy_batch)
        expert_action = self.get_action_input(expert_batch)
        mixed_actions = _random_mix(policy_action, expert_action)

        policy_dones = torch.as_tensor(
            policy_batch[BufferKey.DONE], dtype=torch.float
        ).unsqueeze(1)
        expert_dones = torch.as_tensor(
            expert_batch[BufferKey.DONE], dtype=torch.float
        ).unsqueeze(1)
        mixed_dones = _random_mix(policy_dones, expert_dones)

        action_inputs = torch.cat([mixed_actions, mixed_dones], dim=1)
        action_inputs.requires_grad = True
        hidden, _ = self.encoder(interp_inputs, action_inputs)
        encoder_input = tuple(interp_inputs + [action_inputs])
    else:
        hidden, _ = self.encoder(interp_inputs)
        encoder_input = tuple(interp_inputs)

    if self._settings.use_vail:
        use_vail_noise = True
        z_mu = self._z_mu_layer(hidden)
        noise = torch.randn_like(z_mu)
        hidden = z_mu + noise * self._z_sigma * use_vail_noise

    estimate = self._estimator(hidden).squeeze(1).sum()
    # NOTE(review): only the gradient w.r.t. the first entry of
    # encoder_input is kept ([0]) - confirm this is intended.
    gradient = torch.autograd.grad(
        estimate, encoder_input, create_graph=True
    )[0]
    # Norm's gradient could be NaN at 0. Use our own safe_norm
    squared_sum = torch.sum(gradient**2, dim=1)
    safe_norm = (squared_sum + self.EPSILON).sqrt()
    return torch.mean((safe_norm - 1) ** 2)
def get_state_encoding(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Encode the observations stored in the mini batch.

    :param mini_batch: Buffer read under "vector_obs" and under one
        "visual_obs<i>" key per visual processor of the state encoder.
    :return: The first value returned by the state encoder's forward pass.
    """
    encoder = self._state_encoder
    visual_count = len(encoder.visual_processors)
    vector_tensors = [
        torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float)
    ]
    visual_tensors = []
    for i in range(visual_count):
        visual_tensors.append(
            torch.as_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
        )
    hidden, _ = encoder.forward(
        vec_inputs=vector_tensors, vis_inputs=visual_tensors
    )
    return hidden
def compute_estimate(
    self, mini_batch: AgentBuffer, use_vail_noise: bool = False
) -> torch.Tensor:
    """
    Given a mini_batch, computes the estimate (How much the discriminator
    believes the data was sampled from the demonstration data).

    :param mini_batch: The AgentBuffer of data
    :param use_vail_noise: Only when using VAIL : If true, will sample the
        code, if false, will return the mean of the code.
    :return: The pair ``(estimate, z_mu)``; ``z_mu`` is None unless VAIL is
        enabled. NOTE(review): the signature still annotates a single Tensor
        although a tuple is returned - update the annotation once ``Tuple``
        is confirmed to be imported in this module.
    """
    vec_inputs, vis_inputs = self.get_state_inputs(mini_batch)
    if self._settings.use_actions:
        actions = self.get_action_input(mini_batch)
        dones = torch.as_tensor(
            mini_batch["done"], dtype=torch.float
        ).unsqueeze(1)
        action_inputs = torch.cat([actions, dones], dim=1)
        hidden, _ = self.encoder(vec_inputs, vis_inputs, action_inputs)
    else:
        hidden, _ = self.encoder(vec_inputs, vis_inputs)

    z_mu: Optional[torch.Tensor] = None
    if self._settings.use_vail:
        z_mu = self._z_mu_layer(hidden)
        # Reparameterised sample: torch.normal() does not propagate a useful
        # gradient to its mean/std, which would cut the estimate off from
        # z_mu, the encoder and _z_sigma. With use_vail_noise False the noise
        # term is multiplied by zero and the mean is returned unchanged.
        hidden = z_mu + torch.randn_like(z_mu) * self._z_sigma * use_vail_noise
    estimate = self._estimator(hidden)
    return estimate, z_mu
def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Build the action Tensor for a mini batch.

    In the continuous case this corresponds to the action itself; in the
    discrete case to the concatenation of one hot action Tensors.

    :param mini_batch: Buffer whose "actions" entry is converted to a float
        tensor and passed through the action flattener.
    :return: Output of the action flattener.
    """
    raw_actions = torch.as_tensor(mini_batch["actions"], dtype=torch.float)
    return self._action_flattener.forward(raw_actions)
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Assert that both policies return identical discrete log-probabilities
    when they are given the same generated input.

    :param policy1: Policy whose behavior spec and memories define the input.
    :param policy2: Policy evaluated on that same input.
    """
    # Move both actors to the default device before sampling.
    for policy in (policy1, policy2):
        policy.actor = policy.actor.to(default_device())

    steps, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    raw_obs = steps.obs
    action_masks = policy1._extract_masks(steps)
    agent_ids = list(steps.agent_id)
    shared_memories = torch.as_tensor(
        policy1.retrieve_memories(agent_ids)
    ).unsqueeze(0)
    obs_tensors = [ModelUtils.list_to_tensor(single_obs) for single_obs in raw_obs]

    collected = []
    with torch.no_grad():
        for policy in (policy1, policy2):
            _, log_probs, _, _ = policy.sample_actions(
                obs_tensors, masks=action_masks, memories=shared_memories
            )
            collected.append(log_probs)

    first_log_probs, second_log_probs = collected
    np.testing.assert_array_equal(
        ModelUtils.to_numpy(first_log_probs.all_discrete_tensor),
        ModelUtils.to_numpy(second_log_probs.all_discrete_tensor),
    )
def list_to_tensor(
    ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = None
) -> torch.Tensor:
    """
    Turn a list of numpy arrays into one tensor.

    The list is first combined into a single numpy array, which is MUCH
    faster than calling as_tensor on the list directly.

    :param ndarray_list: The arrays to combine.
    :param dtype: Optional dtype forwarded to ``torch.as_tensor``.
    :return: Tensor built from the combined array.
    """
    combined = np.asanyarray(ndarray_list)
    return torch.as_tensor(combined, dtype=dtype)
def update_normalization(self, vector_obs: np.ndarray) -> None:
    """
    If this policy normalizes vector observations, this will update the norm
    values in the graph.

    The observations are only converted to a tensor when normalization is
    actually performed, and the ``vector_obs`` argument is no longer rebound
    to a value of a different type.

    :param vector_obs: The vector observations to add to the running estimate
        of the distribution.
    """
    if not (self.use_vec_obs and self.normalize):
        return
    # update_normalization is given a one-element list holding the tensor.
    self.actor_critic.update_normalization([torch.as_tensor(vector_obs)])
def _extract_masks(self, decision_requests: DecisionSteps) -> np.ndarray:
    """
    Build the action mask for a batch of decision requests.

    NOTE(review): the annotation says np.ndarray, but the value returned is
    either None or a torch tensor.

    :param decision_requests: Steps whose ``action_mask`` (if any) is used.
    :return: None when the action spec has no discrete actions; otherwise a
        tensor of ones, or ``1 - concatenated action_mask`` when the requests
        carry an action mask.
    """
    if not (self.behavior_spec.action_spec.discrete_size > 0):
        return None

    flat_branch_count = np.sum(self.act_size)
    mask = torch.ones([len(decision_requests), flat_branch_count])
    requested_mask = decision_requests.action_mask
    if requested_mask is not None:
        # The incoming mask is inverted after joining along axis 1.
        joined = np.concatenate(requested_mask, axis=1)
        mask = torch.as_tensor(1 - joined)
    return mask
def compute_gradient_magnitude(
    self, policy_batch: AgentBuffer, expert_batch: AgentBuffer
) -> torch.Tensor:
    """
    Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability
    esp. for off-policy. Compute gradients w.r.t randomly interpolated input.

    :param policy_batch: Buffer supplying one end of the interpolation (state
        encoding and, when actions are used, actions and done flags).
    :param expert_batch: Buffer supplying the other end of the interpolation.
    :return: Mean over the batch of ``(safe_norm - 1) ** 2``.
    """
    policy_obs = self.get_state_encoding(policy_batch)
    expert_obs = self.get_state_encoding(expert_batch)
    obs_epsilon = torch.rand(policy_obs.shape)
    encoder_input = obs_epsilon * policy_obs + (1 - obs_epsilon) * expert_obs
    if self._settings.use_actions:
        policy_action = self.get_action_input(policy_batch)
        expert_action = self.get_action_input(expert_batch)
        action_epsilon = torch.rand(policy_action.shape)
        policy_dones = torch.as_tensor(
            policy_batch["done"], dtype=torch.float
        ).unsqueeze(1)
        expert_dones = torch.as_tensor(
            expert_batch["done"], dtype=torch.float
        ).unsqueeze(1)
        dones_epsilon = torch.rand(policy_dones.shape)
        encoder_input = torch.cat(
            [
                encoder_input,
                action_epsilon * policy_action
                + (1 - action_epsilon) * expert_action,
                dones_epsilon * policy_dones
                + (1 - dones_epsilon) * expert_dones,
            ],
            dim=1,
        )
    hidden = self.encoder(encoder_input)
    if self._settings.use_vail:
        use_vail_noise = True
        z_mu = self._z_mu_layer(hidden)
        # Reparameterised sample: torch.normal() does not propagate a useful
        # gradient to its mean, so the gradient of the estimate w.r.t.
        # encoder_input would carry no information and the penalty would be
        # meaningless under VAIL.
        hidden = z_mu + torch.randn_like(z_mu) * self._z_sigma * use_vail_noise
    estimate = self._estimator(hidden).squeeze(1).sum()
    gradient = torch.autograd.grad(
        estimate, encoder_input, create_graph=True
    )[0]
    # Norm's gradient could be NaN at 0. Use our own safe_norm
    safe_norm = (torch.sum(gradient**2, dim=1) + self.EPSILON).sqrt()
    gradient_mag = torch.mean((safe_norm - 1) ** 2)
    return gradient_mag
def list_to_tensor_list(
    ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32
) -> List[torch.Tensor]:
    """
    Converts a list of numpy arrays into a list of tensors, one per array.

    :param ndarray_list: The arrays to convert; each one is converted on its
        own, so the arrays may have different shapes.
    :param dtype: dtype forwarded to ``torch.as_tensor`` for every array
        (float32 by default; None keeps each array's own dtype).
    :return: A list with one tensor per input array, in the same order.
    """
    return [
        torch.as_tensor(np.asanyarray(_arr), dtype=dtype)
        for _arr in ndarray_list
    ]
def forward(self, action: torch.Tensor) -> torch.Tensor:
    """
    Flatten an action tensor.

    :param action: The action tensor to flatten.
    :return: ``action`` itself when the specs report a continuous action;
        otherwise the one-hot encodings of the action (cast to long),
        concatenated along dim 1.
    """
    if self._specs.is_action_continuous():
        return action

    branch_indices = torch.as_tensor(action, dtype=torch.long)
    one_hot_branches = ModelUtils.actions_to_onehot(
        branch_indices, self._specs.discrete_action_branches
    )
    return torch.cat(one_hot_branches, dim=1)
def _split_decision_step(
    self, decision_requests: DecisionSteps
) -> Tuple[SplitObservations, np.ndarray]:
    """
    Split the observations of the decision requests and build the action mask.

    NOTE(review): the annotation says np.ndarray for the mask, but the value
    returned is either None or a torch tensor.

    :param decision_requests: Steps providing ``obs`` and ``action_mask``.
    :return: ``(split observations, mask)``. The mask is None when
        ``self.use_continuous_act`` is truthy; otherwise a tensor of ones, or
        ``1 - concatenated action_mask`` when the requests carry a mask.
    """
    split_obs = SplitObservations.from_observations(decision_requests.obs)
    if self.use_continuous_act:
        return split_obs, None

    flat_branch_count = np.sum(self.act_size)
    action_mask = torch.ones([len(decision_requests), flat_branch_count])
    requested_mask = decision_requests.action_mask
    if requested_mask is not None:
        # The incoming mask is inverted after joining along axis 1.
        joined = np.concatenate(requested_mask, axis=1)
        action_mask = torch.as_tensor(1 - joined)
    return split_obs, action_mask
def __init__(
    self,
    observation_specs: List[ObservationSpec],
    network_settings: NetworkSettings,
    action_spec: ActionSpec,
):
    """
    Build the observation encoder, the two entity embeddings, the
    self-attention block, the linear encoder and (optionally) the LSTM.

    :param observation_specs: Specs handed to the ObservationEncoder.
    :param network_settings: Supplies normalize, hidden_units, num_layers,
        vis_encode_type and the optional memory settings.
    :param action_spec: Its discrete branches and continuous size determine
        the width of the observation+action entities.
    """
    super().__init__()
    memory_settings = network_settings.memory
    self.normalize = network_settings.normalize
    self.use_lstm = memory_settings is not None
    self.h_size = network_settings.hidden_units
    self.m_size = 0 if memory_settings is None else memory_settings.memory_size
    self.action_spec = action_spec

    self.observation_encoder = ObservationEncoder(
        observation_specs,
        self.h_size,
        network_settings.vis_encode_type,
        self.normalize,
    )
    self.processors = self.observation_encoder.processors

    # Modules for multi-agent self-attention
    obs_only_ent_size = self.observation_encoder.total_enc_size
    # Entities that carry an action are wider by the flattened action size.
    flat_action_size = (
        sum(self.action_spec.discrete_branches)
        + self.action_spec.continuous_size
    )
    q_ent_size = obs_only_ent_size + flat_action_size
    embedding_size = self.h_size

    self.obs_encoder = EntityEmbedding(obs_only_ent_size, None, embedding_size)
    self.obs_action_encoder = EntityEmbedding(q_ent_size, None, embedding_size)
    self.self_attn = ResidualSelfAttention(embedding_size)
    self.linear_encoder = LinearEncoder(
        embedding_size,
        network_settings.num_layers,
        self.h_size,
        kernel_gain=(0.125 / self.h_size) ** 0.5,
    )

    self.lstm = (
        LSTM(self.h_size, self.m_size) if self.use_lstm else None
    )  # type: ignore

    # Non-trainable parameter, initialised to 1.
    self._current_max_agents = torch.nn.Parameter(
        torch.as_tensor(1), requires_grad=False
    )
def forward(self, action: AgentAction) -> torch.Tensor:
    """
    Returns a tensor corresponding to the flattened action.

    :param action: An AgentAction object
    :return: The continuous tensor (when the specs have continuous actions)
        followed by the one-hot encoded discrete tensor (when the specs have
        discrete actions), concatenated along dim 1.
    """
    specs = self._specs
    flattened_parts: List[torch.Tensor] = []
    if specs.continuous_size > 0:
        flattened_parts.append(action.continuous_tensor)
    if specs.discrete_size > 0:
        discrete_indices = torch.as_tensor(
            action.discrete_tensor, dtype=torch.long
        )
        one_hot_branches = ModelUtils.actions_to_onehot(
            discrete_indices, specs.discrete_branches
        )
        flattened_parts.append(torch.cat(one_hot_branches, dim=1))
    return torch.cat(flattened_parts, dim=1)
def forward(
    self,
    obs_only: List[List[torch.Tensor]],
    obs: List[List[torch.Tensor]],
    actions: List[AgentAction],
    memories: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Encode a set of entities with self-attention. The encoding is returned
    together with the memories (updated by the LSTM when memory is enabled).

    :param obs_only: Observations to be processed that do not have
        corresponding actions. These are encoded with the obs_encoder.
    :param obs: Observations to be processed that do have corresponding
        actions. After concatenation with actions, these are processed with
        obs_action_encoder.
    :param actions: After concatenation with obs, these are processed with
        obs_action_encoder.
    :param memories: If using memory, a Tensor of initial memories.
    :param sequence_length: If using memory, the sequence length.
    :return: ``(encoding, memories)``; the rescaled agent count is appended
        to the encoding as the last column.
    """
    attn_masks = []
    attn_inputs = []

    if obs:
        obs_attn_mask = self._get_masks_from_nans(obs)
        obs = self._copy_and_remove_nans_from_obs(obs, obs_attn_mask)
        # One entity per (observation, action) pair: encoded obs + flat action.
        obs_action_entities = [
            torch.cat(
                [
                    self.observation_encoder(inputs),
                    action.to_flat(self.action_spec.discrete_branches),
                ],
                dim=1,
            )
            for inputs, action in zip(obs, actions)
        ]
        f_inp = torch.stack(obs_action_entities, dim=1)
        attn_masks.append(obs_attn_mask)
        attn_inputs.append(self.obs_action_encoder(None, f_inp))

    if obs_only:
        obs_only_attn_mask = self._get_masks_from_nans(obs_only)
        obs_only = self._copy_and_remove_nans_from_obs(
            obs_only, obs_only_attn_mask
        )
        obs_entities = [self.observation_encoder(inputs) for inputs in obs_only]
        g_inp = torch.stack(obs_entities, dim=1)
        attn_masks.append(obs_only_attn_mask)
        attn_inputs.append(self.obs_encoder(None, g_inp))

    encoded_entity = torch.cat(attn_inputs, dim=1)
    encoded_state = self.self_attn(encoded_entity, attn_masks)

    # Count the entries whose mask value is 0 in each row.
    flipped_masks = 1 - torch.cat(attn_masks, dim=1)
    num_agents = torch.sum(flipped_masks, dim=1, keepdim=True)
    batch_max_agents = torch.max(num_agents).item()
    if batch_max_agents > self._current_max_agents:
        self._current_max_agents = torch.nn.Parameter(
            torch.as_tensor(batch_max_agents), requires_grad=False
        )
    # Rescale the count to 2 * n / max - 1, which is +1 when the count
    # equals the current maximum.
    num_agents = num_agents * 2.0 / self._current_max_agents - 1

    encoding = self.linear_encoder(encoded_state)
    if self.use_lstm:
        # Resize to (batch, sequence length, encoding size)
        encoding = encoding.reshape([-1, sequence_length, self.h_size])
        encoding, memories = self.lstm(encoding, memories)
        encoding = encoding.reshape([-1, self.m_size // 2])
    encoding = torch.cat([encoding, num_agents], dim=1)
    return encoding, memories
def update_normalization(self, buffer: AgentBuffer) -> None:
    """
    Update the normalization of every VectorInput processor with the
    matching observations from the buffer.

    :param buffer: Buffer from which one observation entry per processor is
        read via ``ObsUtil.from_buffer``.
    """
    processors = self.processors
    buffer_obs = ObsUtil.from_buffer(buffer, len(processors))
    for observation, processor in zip(buffer_obs, processors):
        if not isinstance(processor, VectorInput):
            # Only VectorInput processors are updated here.
            continue
        processor.update_normalization(torch.as_tensor(observation))
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
    """
    Set up the SAC optimizer: hyperparameters, value/target networks,
    entropy coefficients, target entropies and the three Adam optimizers.

    :param policy: Policy whose actor_critic, behavior_spec and
        network_settings are used to build the networks and parameter lists.
    :param trainer_params: Trainer settings; its ``hyperparameters`` are cast
        to SACSettings and its ``reward_signals`` supply the gammas.
    """
    super().__init__(policy, trainer_params)
    hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
    self.tau = hyperparameters.tau
    self.init_entcoef = hyperparameters.init_entcoef
    self.policy = policy
    policy_network_settings = policy.network_settings
    # NOTE(review): self.tau is assigned a second time here with the same
    # value; the duplicate looks redundant.
    self.tau = hyperparameters.tau
    self.burn_in_ratio = 0.0

    # Non-exposed SAC parameters
    self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
    self.continuous_target_entropy_scale = 1.0

    self.stream_names = list(self.reward_signals.keys())
    # Use to reduce "survivor bonus" when using Curiosity or GAIL.
    self.gammas = [
        _val.gamma for _val in trainer_params.reward_signals.values()
    ]
    # Per reward stream: 1 when done flags are used, 0 when the signal sets
    # ignore_done.
    self.use_dones_in_backup = {
        name: int(not self.reward_signals[name].ignore_done)
        for name in self.stream_names
    }
    self._action_spec = self.policy.behavior_spec.action_spec

    self.value_network = TorchSACOptimizer.PolicyValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.sensor_specs,
        policy_network_settings,
        self._action_spec,
    )

    self.target_network = ValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.sensor_specs,
        policy_network_settings,
    )
    # presumably a rate of 1.0 copies the critic's weights into the target
    # network - confirm against ModelUtils.soft_update
    ModelUtils.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0)

    # We create one entropy coefficient per action, whether discrete or continuous.
    # Both are stored as the log of init_entcoef; the discrete one has one
    # entry per discrete branch.
    _disc_log_ent_coef = torch.nn.Parameter(
        torch.log(
            torch.as_tensor([self.init_entcoef] * len(self._action_spec.discrete_branches))),
        requires_grad=True,
    )
    _cont_log_ent_coef = torch.nn.Parameter(torch.log(
        torch.as_tensor([self.init_entcoef])), requires_grad=True)
    self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
        discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef)
    # Continuous target: -scale * continuous_size (as float32).
    _cont_target = (
        -1 * self.continuous_target_entropy_scale *
        np.prod(self._action_spec.continuous_size).astype(np.float32))
    # Discrete targets: scale * log(branch size), one per branch.
    _disc_target = [
        self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
        for i in self._action_spec.discrete_branches
    ]
    self.target_entropy = TorchSACOptimizer.TargetEntropy(
        continuous=_cont_target, discrete=_disc_target)
    # Policy optimizer: network body + action model parameters.
    policy_params = list(
        self.policy.actor_critic.network_body.parameters()) + list(
            self.policy.actor_critic.action_model.parameters())
    # Value optimizer: value network + the policy's critic parameters.
    value_params = list(self.value_network.parameters()) + list(
        self.policy.actor_critic.critic.parameters())

    logger.debug("value_vars")
    for param in value_params:
        logger.debug(param.shape)
    logger.debug("policy_vars")
    for param in policy_params:
        logger.debug(param.shape)

    self.decay_learning_rate = ModelUtils.DecayedValue(
        hyperparameters.learning_rate_schedule,
        hyperparameters.learning_rate,
        1e-10,
        self.trainer_settings.max_steps,
    )
    # All three optimizers start from the same configured learning rate.
    self.policy_optimizer = torch.optim.Adam(
        policy_params, lr=hyperparameters.learning_rate)
    self.value_optimizer = torch.optim.Adam(
        value_params, lr=hyperparameters.learning_rate)
    self.entropy_optimizer = torch.optim.Adam(
        self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate)
    self._move_to_device(default_device())