def sac_value_loss( self, log_probs: torch.Tensor, values: Dict[str, torch.Tensor], q1p_out: Dict[str, torch.Tensor], q2p_out: Dict[str, torch.Tensor], loss_masks: torch.Tensor, discrete: bool, ) -> torch.Tensor: min_policy_qs = {} with torch.no_grad(): _ent_coef = torch.exp(self._log_ent_coef) for name in values.keys(): if not discrete: min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) else: action_probs = log_probs.exp() _branched_q1p = ModelUtils.break_into_branches( q1p_out[name] * action_probs, self.act_size) _branched_q2p = ModelUtils.break_into_branches( q2p_out[name] * action_probs, self.act_size) _q1p_mean = torch.mean( torch.stack([ torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p ]), dim=0, ) _q2p_mean = torch.mean( torch.stack([ torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p ]), dim=0, ) min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) value_losses = [] if not discrete: for name in values.keys(): with torch.no_grad(): v_backup = min_policy_qs[name] - torch.sum( _ent_coef * log_probs, dim=1) value_loss = 0.5 * ModelUtils.masked_mean( torch.nn.functional.mse_loss(values[name], v_backup), loss_masks) value_losses.append(value_loss) else: branched_per_action_ent = ModelUtils.break_into_branches( log_probs * log_probs.exp(), self.act_size) # We have to do entropy bonus per action branch branched_ent_bonus = torch.stack([ torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True) for i, _lp in enumerate(branched_per_action_ent) ]) for name in values.keys(): with torch.no_grad(): v_backup = min_policy_qs[name] - torch.mean( branched_ent_bonus, axis=0) value_loss = 0.5 * ModelUtils.masked_mean( torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), loss_masks, ) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): raise UnityTrainerException("Inf found") return value_loss
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ Updates model using buffer. :param num_sequences: Number of trajectories in batch. :param batch: Experience mini-batch. :param update_target: Whether or not to update target value network :param reward_signal_batches: Minibatches to use for updating the reward signals, indexed by name. If none, don't update the reward signals. :return: Output from update process. """ rewards = {} for name in self.reward_signals: rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"]) vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])] next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])] act_masks = ModelUtils.list_to_tensor(batch["action_mask"]) if self.policy.use_continuous_act: actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) else: actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) memories_list = [ ModelUtils.list_to_tensor(batch["memory"][i]) for i in range( 0, len(batch["memory"]), self.policy.sequence_length) ] # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true. offset = 1 if self.policy.sequence_length > 1 else 0 next_memories_list = [ ModelUtils.list_to_tensor( batch["memory"][i] [self.policy.m_size // 2:]) # only pass value part of memory to target network for i in range(offset, len(batch["memory"]), self.policy.sequence_length) ] if len(memories_list) > 0: memories = torch.stack(memories_list).unsqueeze(0) next_memories = torch.stack(next_memories_list).unsqueeze(0) else: memories = None next_memories = None # Q network memories are 0'ed out, since we don't have them during inference. q_memories = (torch.zeros_like(next_memories) if next_memories is not None else None) vis_obs: List[torch.Tensor] = [] next_vis_obs: List[torch.Tensor] = [] if self.policy.use_vis_obs: vis_obs = [] for idx, _ in enumerate( self.policy.actor_critic.network_body.visual_processors): vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx]) vis_obs.append(vis_ob) next_vis_ob = ModelUtils.list_to_tensor( batch["next_visual_obs%d" % idx]) next_vis_obs.append(next_vis_ob) # Copy normalizers from policy self.value_network.q1_network.network_body.copy_normalization( self.policy.actor_critic.network_body) self.value_network.q2_network.network_body.copy_normalization( self.policy.actor_critic.network_body) self.target_network.network_body.copy_normalization( self.policy.actor_critic.network_body) ( sampled_actions, log_probs, entropies, sampled_values, _, ) = self.policy.sample_actions( vec_obs, vis_obs, masks=act_masks, memories=memories, seq_len=self.policy.sequence_length, all_log_probs=not self.policy.use_continuous_act, ) if self.policy.use_continuous_act: squeezed_actions = actions.squeeze(-1) q1p_out, q2p_out = self.value_network( vec_obs, vis_obs, sampled_actions, memories=q_memories, sequence_length=self.policy.sequence_length, ) q1_out, q2_out = self.value_network( vec_obs, vis_obs, squeezed_actions, memories=q_memories, sequence_length=self.policy.sequence_length, ) q1_stream, q2_stream = q1_out, q2_out else: with torch.no_grad(): q1p_out, q2p_out = self.value_network( vec_obs, vis_obs, memories=q_memories, sequence_length=self.policy.sequence_length, ) q1_out, q2_out = self.value_network( vec_obs, vis_obs, memories=q_memories, sequence_length=self.policy.sequence_length, ) q1_stream = self._condense_q_streams(q1_out, actions) q2_stream = self._condense_q_streams(q2_out, actions) with torch.no_grad(): target_values, _ = self.target_network( next_vec_obs, next_vis_obs, memories=next_memories, sequence_length=self.policy.sequence_length, ) masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) use_discrete = not self.policy.use_continuous_act dones = ModelUtils.list_to_tensor(batch["done"]) q1_loss, q2_loss = self.sac_q_loss(q1_stream, q2_stream, target_values, dones, rewards, masks) value_loss = self.sac_value_loss(log_probs, sampled_values, q1p_out, q2p_out, masks, use_discrete) policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete) entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete) total_value_loss = q1_loss + q2_loss + value_loss decay_lr = self.decay_learning_rate.get_value( self.policy.get_current_step()) ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr) self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() ModelUtils.update_learning_rate(self.value_optimizer, decay_lr) self.value_optimizer.zero_grad() total_value_loss.backward() self.value_optimizer.step() ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr) self.entropy_optimizer.zero_grad() entropy_loss.backward() self.entropy_optimizer.step() # Update target network self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau) update_stats = { "Losses/Policy Loss": policy_loss.item(), "Losses/Value Loss": value_loss.item(), "Losses/Q1 Loss": q1_loss.item(), "Losses/Q2 Loss": q2_loss.item(), "Policy/Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef)).item(), "Policy/Learning Rate": decay_lr, } return update_stats
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): super().__init__(policy, trainer_params) hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters) self.tau = hyperparameters.tau self.init_entcoef = hyperparameters.init_entcoef self.policy = policy self.act_size = policy.act_size policy_network_settings = policy.network_settings self.tau = hyperparameters.tau self.burn_in_ratio = 0.0 # Non-exposed SAC parameters self.discrete_target_entropy_scale = 0.2 # Roughly equal to e-greedy 0.05 self.continuous_target_entropy_scale = 1.0 self.stream_names = list(self.reward_signals.keys()) # Use to reduce "survivor bonus" when using Curiosity or GAIL. self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()] self.use_dones_in_backup = { name: int(not self.reward_signals[name].ignore_done) for name in self.stream_names } # Critics should have 1/2 of the memory of the policy critic_memory = policy_network_settings.memory if critic_memory is not None: critic_memory = attr.evolve( critic_memory, memory_size=critic_memory.memory_size // 2 ) value_network_settings = attr.evolve( policy_network_settings, memory=critic_memory ) self.value_network = TorchSACOptimizer.PolicyValueNetwork( self.stream_names, self.policy.behavior_spec.observation_shapes, value_network_settings, self.policy.behavior_spec.action_type, self.act_size, ) self.target_network = ValueNetwork( self.stream_names, self.policy.behavior_spec.observation_shapes, value_network_settings, ) self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0) self._log_ent_coef = torch.nn.Parameter( torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))), requires_grad=True, ) if self.policy.use_continuous_act: self.target_entropy = torch.as_tensor( -1 * self.continuous_target_entropy_scale * np.prod(self.act_size[0]).astype(np.float32) ) else: self.target_entropy = [ self.discrete_target_entropy_scale * np.log(i).astype(np.float32) for i in self.act_size ] policy_params = list(self.policy.actor_critic.network_body.parameters()) + list( self.policy.actor_critic.distribution.parameters() ) value_params = list(self.value_network.parameters()) + list( self.policy.actor_critic.critic.parameters() ) logger.debug("value_vars") for param in value_params: logger.debug(param.shape) logger.debug("policy_vars") for param in policy_params: logger.debug(param.shape) self.decay_learning_rate = ModelUtils.DecayedValue( hyperparameters.learning_rate_schedule, hyperparameters.learning_rate, 1e-10, self.trainer_settings.max_steps, ) self.policy_optimizer = torch.optim.Adam( policy_params, lr=hyperparameters.learning_rate ) self.value_optimizer = torch.optim.Adam( value_params, lr=hyperparameters.learning_rate ) self.entropy_optimizer = torch.optim.Adam( [self._log_ent_coef], lr=hyperparameters.learning_rate )
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ Performs update on model. :param batch: Batch of experiences. :param num_sequences: Number of sequences to process. :return: Results of update. """ # Get decayed parameters decay_lr = self.decay_learning_rate.get_value( self.policy.get_current_step()) decay_eps = self.decay_epsilon.get_value( self.policy.get_current_step()) decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) returns = {} old_values = {} for name in self.reward_signals: old_values[name] = ModelUtils.list_to_tensor( batch[f"{name}_value_estimates"]) returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"]) vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])] act_masks = ModelUtils.list_to_tensor(batch["action_mask"]) if self.policy.use_continuous_act: actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) else: actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) memories = [ ModelUtils.list_to_tensor(batch["memory"][i]) for i in range( 0, len(batch["memory"]), self.policy.sequence_length) ] if len(memories) > 0: memories = torch.stack(memories).unsqueeze(0) if self.policy.use_vis_obs: vis_obs = [] for idx, _ in enumerate( self.policy.actor_critic.network_body.visual_processors): vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx]) vis_obs.append(vis_ob) else: vis_obs = [] log_probs, entropy, values = self.policy.evaluate_actions( vec_obs, vis_obs, masks=act_masks, actions=actions, memories=memories, seq_len=self.policy.sequence_length, ) loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) value_loss = self.ppo_value_loss(values, old_values, returns, decay_eps, loss_masks) policy_loss = self.ppo_policy_loss( ModelUtils.list_to_tensor(batch["advantages"]), log_probs, ModelUtils.list_to_tensor(batch["action_probs"]), loss_masks, ) loss = (policy_loss + 0.5 * value_loss - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)) # Set optimizer learning rate ModelUtils.update_learning_rate(self.optimizer, decay_lr) self.optimizer.zero_grad() loss.backward() self.optimizer.step() update_stats = { # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow. # TODO: After PyTorch is default, change to something more correct. "Losses/Policy Loss": torch.abs(policy_loss).item(), "Losses/Value Loss": value_loss.item(), "Policy/Learning Rate": decay_lr, "Policy/Epsilon": decay_eps, "Policy/Beta": decay_bet, } for reward_provider in self.reward_signals.values(): update_stats.update(reward_provider.update(batch)) return update_stats
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ Updates model using buffer. :param num_sequences: Number of trajectories in batch. :param batch: Experience mini-batch. :param update_target: Whether or not to update target value network :param reward_signal_batches: Minibatches to use for updating the reward signals, indexed by name. If none, don't update the reward signals. :return: Output from update process. """ rewards = {} for name in self.reward_signals: rewards[name] = ModelUtils.list_to_tensor( batch[RewardSignalUtil.rewards_key(name)]) n_obs = len(self.policy.behavior_spec.observation_specs) current_obs = ObsUtil.from_buffer(batch, n_obs) # Convert to tensors current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] next_obs = ObsUtil.from_buffer_next(batch, n_obs) # Convert to tensors next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) actions = AgentAction.from_buffer(batch) memories_list = [ ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) ] # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true. value_memories_list = [ ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) for i in range(0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length) ] if len(memories_list) > 0: memories = torch.stack(memories_list).unsqueeze(0) value_memories = torch.stack(value_memories_list).unsqueeze(0) else: memories = None value_memories = None # Q and V network memories are 0'ed out, since we don't have them during inference. q_memories = (torch.zeros_like(value_memories) if value_memories is not None else None) # Copy normalizers from policy self.q_network.q1_network.network_body.copy_normalization( self.policy.actor.network_body) self.q_network.q2_network.network_body.copy_normalization( self.policy.actor.network_body) self.target_network.network_body.copy_normalization( self.policy.actor.network_body) self._critic.network_body.copy_normalization( self.policy.actor.network_body) sampled_actions, log_probs, _, _, = self.policy.actor.get_action_and_stats( current_obs, masks=act_masks, memories=memories, sequence_length=self.policy.sequence_length, ) value_estimates, _ = self._critic.critic_pass( current_obs, value_memories, sequence_length=self.policy.sequence_length) cont_sampled_actions = sampled_actions.continuous_tensor cont_actions = actions.continuous_tensor q1p_out, q2p_out = self.q_network( current_obs, cont_sampled_actions, memories=q_memories, sequence_length=self.policy.sequence_length, q2_grad=False, ) q1_out, q2_out = self.q_network( current_obs, cont_actions, memories=q_memories, sequence_length=self.policy.sequence_length, ) if self._action_spec.discrete_size > 0: disc_actions = actions.discrete_tensor q1_stream = self._condense_q_streams(q1_out, disc_actions) q2_stream = self._condense_q_streams(q2_out, disc_actions) else: q1_stream, q2_stream = q1_out, q2_out with torch.no_grad(): # Since we didn't record the next value memories, evaluate one step in the critic to # get them. if value_memories is not None: # Get the first observation in each sequence just_first_obs = [ _obs[::self.policy.sequence_length] for _obs in current_obs ] _, next_value_memories = self._critic.critic_pass( just_first_obs, value_memories, sequence_length=1) else: next_value_memories = None target_values, _ = self.target_network( next_obs, memories=next_value_memories, sequence_length=self.policy.sequence_length, ) masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE]) q1_loss, q2_loss = self.sac_q_loss(q1_stream, q2_stream, target_values, dones, rewards, masks) value_loss = self.sac_value_loss(log_probs, value_estimates, q1p_out, q2p_out, masks) policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks) entropy_loss = self.sac_entropy_loss(log_probs, masks) total_value_loss = q1_loss + q2_loss if self.policy.shared_critic: policy_loss += value_loss else: total_value_loss += value_loss decay_lr = self.decay_learning_rate.get_value( self.policy.get_current_step()) ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr) self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() ModelUtils.update_learning_rate(self.value_optimizer, decay_lr) self.value_optimizer.zero_grad() total_value_loss.backward() self.value_optimizer.step() ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr) self.entropy_optimizer.zero_grad() entropy_loss.backward() self.entropy_optimizer.step() # Update target network ModelUtils.soft_update(self._critic, self.target_network, self.tau) update_stats = { "Losses/Policy Loss": policy_loss.item(), "Losses/Value Loss": value_loss.item(), "Losses/Q1 Loss": q1_loss.item(), "Losses/Q2 Loss": q2_loss.item(), "Policy/Discrete Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef.discrete)).item(), "Policy/Continuous Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef.continuous)).item(), "Policy/Learning Rate": decay_lr, } return update_stats
def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: with torch.no_grad(): rewards = ModelUtils.to_numpy( self._network.compute_reward(mini_batch)) rewards = np.minimum(rewards, 1.0 / self.strength) return rewards * self._has_updated_once
def sac_value_loss( self, log_probs: ActionLogProbs, values: Dict[str, torch.Tensor], q1p_out: Dict[str, torch.Tensor], q2p_out: Dict[str, torch.Tensor], loss_masks: torch.Tensor, ) -> torch.Tensor: min_policy_qs = {} with torch.no_grad(): _cont_ent_coef = self._log_ent_coef.continuous.exp() _disc_ent_coef = self._log_ent_coef.discrete.exp() for name in values.keys(): if self._action_spec.discrete_size <= 0: min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) else: disc_action_probs = log_probs.all_discrete_tensor.exp() _branched_q1p = ModelUtils.break_into_branches( q1p_out[name] * disc_action_probs, self._action_spec.discrete_branches, ) _branched_q2p = ModelUtils.break_into_branches( q2p_out[name] * disc_action_probs, self._action_spec.discrete_branches, ) _q1p_mean = torch.mean( torch.stack([ torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p ]), dim=0, ) _q2p_mean = torch.mean( torch.stack([ torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p ]), dim=0, ) min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) value_losses = [] if self._action_spec.discrete_size <= 0: for name in values.keys(): with torch.no_grad(): v_backup = min_policy_qs[name] - torch.sum( _cont_ent_coef * log_probs.continuous_tensor, dim=1) value_loss = 0.5 * ModelUtils.masked_mean( torch.nn.functional.mse_loss(values[name], v_backup), loss_masks) value_losses.append(value_loss) else: disc_log_probs = log_probs.all_discrete_tensor branched_per_action_ent = ModelUtils.break_into_branches( disc_log_probs * disc_log_probs.exp(), self._action_spec.discrete_branches, ) # We have to do entropy bonus per action branch branched_ent_bonus = torch.stack([ torch.sum(_disc_ent_coef[i] * _lp, dim=1, keepdim=True) for i, _lp in enumerate(branched_per_action_ent) ]) for name in values.keys(): with torch.no_grad(): v_backup = min_policy_qs[name] - torch.mean( branched_ent_bonus, axis=0) # Add continuous entropy bonus to minimum Q if self._action_spec.continuous_size > 0: v_backup += torch.sum( _cont_ent_coef * log_probs.continuous_tensor, dim=1, keepdim=True, ) value_loss = 0.5 * ModelUtils.masked_mean( torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), loss_masks, ) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): raise UnityTrainerException("Inf found") return value_loss
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): super().__init__(policy, trainer_params) reward_signal_configs = trainer_params.reward_signals reward_signal_names = [ key.value for key, _ in reward_signal_configs.items() ] if policy.shared_critic: raise UnityTrainerException( "SAC does not support SharedActorCritic") self._critic = ValueNetwork( reward_signal_names, policy.behavior_spec.observation_specs, policy.network_settings, ) hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters) self.tau = hyperparameters.tau self.init_entcoef = hyperparameters.init_entcoef self.policy = policy policy_network_settings = policy.network_settings self.tau = hyperparameters.tau self.burn_in_ratio = 0.0 # Non-exposed SAC parameters self.discrete_target_entropy_scale = 0.2 # Roughly equal to e-greedy 0.05 self.continuous_target_entropy_scale = 1.0 self.stream_names = list(self.reward_signals.keys()) # Use to reduce "survivor bonus" when using Curiosity or GAIL. self.gammas = [ _val.gamma for _val in trainer_params.reward_signals.values() ] self.use_dones_in_backup = { name: int(not self.reward_signals[name].ignore_done) for name in self.stream_names } self._action_spec = self.policy.behavior_spec.action_spec self.q_network = TorchSACOptimizer.PolicyValueNetwork( self.stream_names, self.policy.behavior_spec.observation_specs, policy_network_settings, self._action_spec, ) self.target_network = ValueNetwork( self.stream_names, self.policy.behavior_spec.observation_specs, policy_network_settings, ) ModelUtils.soft_update(self._critic, self.target_network, 1.0) # We create one entropy coefficient per action, whether discrete or continuous. _disc_log_ent_coef = torch.nn.Parameter( torch.log( torch.as_tensor([self.init_entcoef] * len(self._action_spec.discrete_branches))), requires_grad=True, ) _cont_log_ent_coef = torch.nn.Parameter(torch.log( torch.as_tensor([self.init_entcoef])), requires_grad=True) self._log_ent_coef = TorchSACOptimizer.LogEntCoef( discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef) _cont_target = ( -1 * self.continuous_target_entropy_scale * np.prod(self._action_spec.continuous_size).astype(np.float32)) _disc_target = [ self.discrete_target_entropy_scale * np.log(i).astype(np.float32) for i in self._action_spec.discrete_branches ] self.target_entropy = TorchSACOptimizer.TargetEntropy( continuous=_cont_target, discrete=_disc_target) policy_params = list(self.policy.actor.parameters()) value_params = list(self.q_network.parameters()) + list( self._critic.parameters()) logger.debug("value_vars") for param in value_params: logger.debug(param.shape) logger.debug("policy_vars") for param in policy_params: logger.debug(param.shape) self.decay_learning_rate = ModelUtils.DecayedValue( hyperparameters.learning_rate_schedule, hyperparameters.learning_rate, 1e-10, self.trainer_settings.max_steps, ) self.policy_optimizer = torch.optim.Adam( policy_params, lr=hyperparameters.learning_rate) self.value_optimizer = torch.optim.Adam( value_params, lr=hyperparameters.learning_rate) self.entropy_optimizer = torch.optim.Adam( self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate) self._move_to_device(default_device())
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ Performs update on model. :param batch: Batch of experiences. :param num_sequences: Number of sequences to process. :return: Results of update. """ # Get decayed parameters decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step()) decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) returns = {} old_values = {} for name in self.reward_signals: old_values[name] = ModelUtils.list_to_tensor( batch[f"{name}_value_estimates"] ) returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"]) n_obs = len(self.policy.behavior_spec.observation_specs) current_obs = ObsUtil.from_buffer(batch, n_obs) # Convert to tensors current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] act_masks = ModelUtils.list_to_tensor(batch["action_mask"]) actions = AgentAction.from_dict(batch) memories = [ ModelUtils.list_to_tensor(batch["memory"][i]) for i in range(0, len(batch["memory"]), self.policy.sequence_length) ] if len(memories) > 0: memories = torch.stack(memories).unsqueeze(0) log_probs, entropy, values = self.policy.evaluate_actions( current_obs, masks=act_masks, actions=actions, memories=memories, seq_len=self.policy.sequence_length, ) old_log_probs = ActionLogProbs.from_dict(batch).flatten() log_probs = log_probs.flatten() loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) value_loss = self.ppo_value_loss( values, old_values, returns, decay_eps, loss_masks ) policy_loss = self.ppo_policy_loss( ModelUtils.list_to_tensor(batch["advantages"]), log_probs, old_log_probs, loss_masks, ) loss = ( policy_loss + 0.5 * value_loss - decay_bet * ModelUtils.masked_mean(entropy, loss_masks) ) # Set optimizer learning rate ModelUtils.update_learning_rate(self.optimizer, decay_lr) self.optimizer.zero_grad() loss.backward() self.optimizer.step() update_stats = { # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow. # TODO: After PyTorch is default, change to something more correct. "Losses/Policy Loss": torch.abs(policy_loss).item(), "Losses/Value Loss": value_loss.item(), "Policy/Learning Rate": decay_lr, "Policy/Epsilon": decay_eps, "Policy/Beta": decay_bet, } for reward_provider in self.reward_signals.values(): update_stats.update(reward_provider.update(batch)) return update_stats
def _evaluate_by_sequence( self, tensor_obs: List[torch.Tensor], initial_memory: torch.Tensor ) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]: """ Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the intermediate memories for the critic. :param tensor_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's observations for this trajectory. :param initial_memory: The memory that preceeds this trajectory. Of shape (1,1,<mem_size>), i.e. what is returned as the output of a MemoryModules. :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial memories to be used during value function update, and the final memory at the end of the trajectory. """ num_experiences = tensor_obs[0].shape[0] all_next_memories = AgentBufferField() # In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and # trajectory is of length 10, the 1st sequence is [pad,pad,obs]. # Compute the number of elements in this padded seq. leftover = num_experiences % self.policy.sequence_length # Compute values for the potentially truncated initial sequence seq_obs = [] first_seq_len = leftover if leftover > 0 else self.policy.sequence_length for _obs in tensor_obs: first_seq_obs = _obs[0:first_seq_len] seq_obs.append(first_seq_obs) # For the first sequence, the initial memory should be the one at the # beginning of this trajectory. for _ in range(first_seq_len): all_next_memories.append( ModelUtils.to_numpy(initial_memory.squeeze())) init_values, _mem = self.critic.critic_pass( seq_obs, initial_memory, sequence_length=first_seq_len) all_values = { signal_name: [init_values[signal_name]] for signal_name in init_values.keys() } # Evaluate other trajectories, carrying over _mem after each # trajectory for seq_num in range( 1, math.ceil( (num_experiences) / (self.policy.sequence_length))): seq_obs = [] for _ in range(self.policy.sequence_length): all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) start = seq_num * self.policy.sequence_length - ( self.policy.sequence_length - leftover) end = (seq_num + 1) * self.policy.sequence_length - ( self.policy.sequence_length - leftover) for _obs in tensor_obs: seq_obs.append(_obs[start:end]) values, _mem = self.critic.critic_pass( seq_obs, _mem, sequence_length=self.policy.sequence_length) for signal_name, _val in values.items(): all_values[signal_name].append(_val) # Create one tensor per reward signal all_value_tensors = { signal_name: torch.cat(value_list, dim=0) for signal_name, value_list in all_values.items() } next_mem = _mem return all_value_tensors, all_next_memories, next_mem
def get_trajectory_value_estimates( self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool, agent_id: str = "", ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]: """ Get value estimates and memories for a trajectory, in batch form. :param batch: An AgentBuffer that consists of a trajectory. :param next_obs: the next observation (after the trajectory). Used for boostrapping if this is not a termiinal trajectory. :param done: Set true if this is a terminal trajectory. :param agent_id: Agent ID of the agent that this trajectory belongs to. :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)], the final value estimate as a Dict of [name, float], and optionally (if using memories) an AgentBufferField of initial critic memories to be used during update. """ n_obs = len(self.policy.behavior_spec.observation_specs) if agent_id in self.critic_memory_dict: memory = self.critic_memory_dict[agent_id] else: memory = (torch.zeros((1, 1, self.critic.memory_size)) if self.policy.use_recurrent else None) # Convert to tensors current_obs = [ ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs) ] next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] next_obs = [obs.unsqueeze(0) for obs in next_obs] # If we're using LSTM, we want to get all the intermediate memories. all_next_memories: Optional[AgentBufferField] = None # To prevent memory leak and improve performance, evaluate with no_grad. with torch.no_grad(): if self.policy.use_recurrent: ( value_estimates, all_next_memories, next_memory, ) = self._evaluate_by_sequence(current_obs, memory) else: value_estimates, next_memory = self.critic.critic_pass( current_obs, memory, sequence_length=batch.num_experiences) # Store the memory for the next trajectory. This should NOT have a gradient. self.critic_memory_dict[agent_id] = next_memory next_value_estimate, _ = self.critic.critic_pass(next_obs, next_memory, sequence_length=1) for name, estimate in value_estimates.items(): value_estimates[name] = ModelUtils.to_numpy(estimate) next_value_estimate[name] = ModelUtils.to_numpy( next_value_estimate[name]) if done: for k in next_value_estimate: if not self.reward_signals[k].ignore_done: next_value_estimate[k] = 0.0 if agent_id in self.critic_memory_dict: self.critic_memory_dict.pop(agent_id) return value_estimates, next_value_estimate, all_next_memories