def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_sac_update_reward_signals(
    dummy_config, curiosity_dummy_config, discrete  # noqa: F811
):
    # Add a Curiosity module
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
    )

    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )

    # Mock out reward signal eval
    update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS
    ]
    update_buffer[RewardSignalUtil.rewards_key("curiosity")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS
    ]
    return_stats = optimizer.update_reward_signals(
        {"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
    )
    required_stats = [
        "Losses/Curiosity Forward Loss",
        "Losses/Curiosity Inverse Loss",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    torch.manual_seed(0)
    # Test evaluate
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=12
    )
    # Mock out reward signal eval
    update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS
    ]
    # Mock out value memories
    update_buffer[BufferKey.CRITIC_MEMORY] = update_buffer[BufferKey.MEMORY]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Losses/Q1 Loss",
        "Losses/Q2 Loss",
        "Policy/Continuous Entropy Coeff",
        "Policy/Discrete Entropy Coeff",
        "Policy/Learning Rate",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
def test_poca_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    optimizer = create_test_poca_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
        num_other_agents_in_group=NUM_AGENTS,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        BufferKey.ENVIRONMENT_REWARDS,
        [
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(
        update_buffer,
        BufferKey.MEMORY,
        [BufferKey.CRITIC_MEMORY, BufferKey.BASELINE_MEMORY],
    )
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_ppo_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
        ],
    )
    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
        ],
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def _update_sac_policy(self) -> bool:
    """
    Uses update_buffer to update the policy. We sample the update_buffer and update
    until the steps_per_update ratio is met.
    """
    has_updated = False
    self.cumulative_returns_since_policy_update.clear()
    n_sequences = max(
        int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
    )
    batch_update_stats: Dict[str, list] = defaultdict(list)
    while (
        self._step - self.hyperparameters.buffer_init_steps
    ) / self.update_steps > self.steps_per_update:
        logger.debug(f"Updating SAC policy at step {self._step}")
        buffer = self.update_buffer
        if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
            sampled_minibatch = buffer.sample_mini_batch(
                self.hyperparameters.batch_size,
                sequence_length=self.policy.sequence_length,
            )
            # Get rewards for each reward signal
            for name, signal in self.optimizer.reward_signals.items():
                sampled_minibatch[RewardSignalUtil.rewards_key(name)] = (
                    signal.evaluate(sampled_minibatch) * signal.strength
                )
            update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
            for stat_name, value in update_stats.items():
                batch_update_stats[stat_name].append(value)

        self.update_steps += 1

        for stat, stat_list in batch_update_stats.items():
            self._stats_reporter.add_stat(stat, np.mean(stat_list))
        has_updated = True

    if self.optimizer.bc_module:
        update_stats = self.optimizer.bc_module.update()
        for stat, val in update_stats.items():
            self._stats_reporter.add_stat(stat, val)

    # Truncate the update buffer if necessary. Truncate more than we need to, to avoid
    # truncating a large buffer at each update.
    if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
        self.update_buffer.truncate(
            int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
        )
    return has_updated
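# The docstring above says updates continue "until the steps_per_update ratio is met".
# A minimal sketch (not from the codebase) of that bookkeeping, with illustrative names:
# keep updating while environment steps taken past warm-up, per completed update, still
# exceed the configured steps_per_update.
def should_update_sketch(
    env_step: int, buffer_init_steps: int, update_steps: int, steps_per_update: float
) -> bool:
    # Mirrors the while-condition above; update_steps is assumed to be >= 1.
    return (env_step - buffer_init_steps) / update_steps > steps_per_update


# Example: with buffer_init_steps=0, update_steps=50, steps_per_update=2.0, updates
# continue only while more than 100 environment steps have been collected.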
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)

    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes the policy trained by the wrapped PPO trainer.
    # This queue should remain empty after the snapshot swap.
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # The ghost trainer should use this queue for the ghost policy swap.
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # Check that the ghost trainer swap pushes to the ghost queue and not the trainer queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    copy_buffer_fields(
        buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.rewards_key("extrinsic"),
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.rewards_key("curiosity"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Performs update on model.
    :param batch: Batch of experiences.
    :param num_sequences: Number of sequences to process.
    :return: Results of update.
    """
    # Get decayed parameters
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
    decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
    returns = {}
    old_values = {}
    for name in self.reward_signals:
        old_values[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.value_estimates_key(name)]
        )
        returns[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.returns_key(name)]
        )

    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)

    memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    # Get value memories
    value_memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]
    if len(value_memories) > 0:
        value_memories = torch.stack(value_memories).unsqueeze(0)

    log_probs, entropy = self.policy.evaluate_actions(
        current_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=self.policy.sequence_length,
    )
    values, _ = self.critic.critic_pass(
        current_obs,
        memories=value_memories,
        sequence_length=self.policy.sequence_length,
    )
    old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
    log_probs = log_probs.flatten()
    loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    value_loss = self.ppo_value_loss(
        values, old_values, returns, decay_eps, loss_masks
    )
    policy_loss = self.ppo_policy_loss(
        ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
        log_probs,
        old_log_probs,
        loss_masks,
    )
    loss = (
        policy_loss
        + 0.5 * value_loss
        - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
    )

    # Set optimizer learning rate
    ModelUtils.update_learning_rate(self.optimizer, decay_lr)
    self.optimizer.zero_grad()
    loss.backward()

    self.optimizer.step()
    update_stats = {
        # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
        # TODO: After PyTorch is default, change to something more correct.
        "Losses/Policy Loss": torch.abs(policy_loss).item(),
        "Losses/Value Loss": value_loss.item(),
        "Policy/Learning Rate": decay_lr,
        "Policy/Epsilon": decay_eps,
        "Policy/Beta": decay_bet,
    }

    for reward_provider in self.reward_signals.values():
        update_stats.update(reward_provider.update(batch))

    return update_stats
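# A minimal, self-contained sketch of the clipped surrogate objective that a helper like
# ppo_policy_loss above is expected to compute. The function name, the explicit `epsilon`
# argument, and the masking scheme are illustrative assumptions, not the repo's exact API.
import torch


def clipped_surrogate_loss_sketch(
    advantages: torch.Tensor,
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    loss_masks: torch.Tensor,
    epsilon: float,
) -> torch.Tensor:
    # Probability ratio r_t = pi_new(a|s) / pi_old(a|s), computed in log space for stability.
    ratio = torch.exp(log_probs - old_log_probs)
    surrogate_unclipped = ratio * advantages
    surrogate_clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # Take the pessimistic (minimum) surrogate, average over valid timesteps, and negate
    # because the optimizer minimizes the loss.
    surrogate = torch.min(surrogate_unclipped, surrogate_clipped)
    valid = loss_masks.float()
    return -(surrogate * valid).sum() / valid.sum().clamp(min=1.0)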
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    :param batch: Experience mini-batch.
    :param num_sequences: Number of trajectories in batch.
    :return: Output from update process.
    """
    rewards = {}
    for name in self.reward_signals:
        rewards[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.rewards_key(name)]
        )

    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

    next_obs = ObsUtil.from_buffer_next(batch, n_obs)
    # Convert to tensors
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)

    memories_list = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    value_memories_list = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]
    offset = 1 if self.policy.sequence_length > 1 else 0
    next_value_memories_list = [
        ModelUtils.list_to_tensor(
            batch[BufferKey.CRITIC_MEMORY][i]
        )  # only pass value part of memory to target network
        for i in range(
            offset, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]

    if len(memories_list) > 0:
        memories = torch.stack(memories_list).unsqueeze(0)
        value_memories = torch.stack(value_memories_list).unsqueeze(0)
        next_value_memories = torch.stack(next_value_memories_list).unsqueeze(0)
    else:
        memories = None
        value_memories = None
        next_value_memories = None

    # Q and V network memories are 0'ed out, since we don't have them during inference.
    q_memories = (
        torch.zeros_like(next_value_memories)
        if next_value_memories is not None
        else None
    )

    # Copy normalizers from policy
    self.q_network.q1_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self.q_network.q2_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self.target_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self._critic.network_body.copy_normalization(self.policy.actor.network_body)
    sampled_actions, log_probs, _, _ = self.policy.actor.get_action_and_stats(
        current_obs,
        masks=act_masks,
        memories=memories,
        sequence_length=self.policy.sequence_length,
    )
    value_estimates, _ = self._critic.critic_pass(
        current_obs, value_memories, sequence_length=self.policy.sequence_length
    )

    cont_sampled_actions = sampled_actions.continuous_tensor
    cont_actions = actions.continuous_tensor
    q1p_out, q2p_out = self.q_network(
        current_obs,
        cont_sampled_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
        q2_grad=False,
    )
    q1_out, q2_out = self.q_network(
        current_obs,
        cont_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
    )

    if self._action_spec.discrete_size > 0:
        disc_actions = actions.discrete_tensor
        q1_stream = self._condense_q_streams(q1_out, disc_actions)
        q2_stream = self._condense_q_streams(q2_out, disc_actions)
    else:
        q1_stream, q2_stream = q1_out, q2_out

    with torch.no_grad():
        target_values, _ = self.target_network(
            next_obs,
            memories=next_value_memories,
            sequence_length=self.policy.sequence_length,
        )
    masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE])

    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
    entropy_loss = self.sac_entropy_loss(log_probs, masks)

    total_value_loss = q1_loss + q2_loss
    if self.policy.shared_critic:
        policy_loss += value_loss
    else:
        total_value_loss += value_loss

    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

    ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
    self.value_optimizer.zero_grad()
    total_value_loss.backward()
    self.value_optimizer.step()

    ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
    self.entropy_optimizer.zero_grad()
    entropy_loss.backward()
    self.entropy_optimizer.step()

    # Update target network
    ModelUtils.soft_update(self._critic, self.target_network, self.tau)
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Discrete Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.discrete)
        ).item(),
        "Policy/Continuous Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.continuous)
        ).item(),
        "Policy/Learning Rate": decay_lr,
    }

    return update_stats
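# A minimal sketch of what a Polyak ("soft") target-network update such as
# ModelUtils.soft_update above typically does; this is an assumption about the helper's
# behavior, and the codebase's implementation may differ in details (e.g. buffer handling).
import torch


def soft_update_sketch(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # target_params <- tau * source_params + (1 - tau) * target_params
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.mul_(1.0 - tau).add_(tau * src_param.data)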
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Get all value estimates
    (
        value_estimates,
        baseline_estimates,
        value_next,
        value_memories,
        baseline_memories,
    ) = self.optimizer.get_trajectory_and_baseline_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.next_group_obs,
        trajectory.all_group_dones_reached
        and trajectory.done_reached
        and not trajectory.interrupted,
    )

    if value_memories is not None and baseline_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
        agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories)

    for name, v in value_estimates.items():
        agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(v)
        agent_buffer_trajectory[RewardSignalUtil.baseline_estimates_key(name)].extend(
            baseline_estimates[name]
        )
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
            np.mean(baseline_estimates[name]),
        )
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
            np.mean(value_estimates[name]),
        )

    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    self.collected_group_rewards[agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.GROUP_REWARD]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
            evaluate_result
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute lambda returns and advantage
    tmp_advantages = []
    for name in self.optimizer.reward_signals:
        local_rewards = np.array(
            agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(),
            dtype=np.float32,
        )

        baseline_estimate = agent_buffer_trajectory[
            RewardSignalUtil.baseline_estimates_key(name)
        ].get_batch()
        v_estimates = agent_buffer_trajectory[
            RewardSignalUtil.value_estimates_key(name)
        ].get_batch()

        lambd_returns = lambda_return(
            r=local_rewards,
            value_estimates=v_estimates,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
            value_next=value_next[name],
        )

        local_advantage = np.array(lambd_returns) - np.array(baseline_estimate)

        agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(lambd_returns)
        agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
            local_advantage
        )
        tmp_advantages.append(local_advantage)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
        # Remove dead agents from group reward recording
        if not trajectory.all_group_dones_reached:
            self.collected_group_rewards.pop(agent_id)

    # If the whole team is done, average the remaining group rewards.
    if trajectory.all_group_dones_reached and trajectory.done_reached:
        self.stats_reporter.add_stat(
            "Environment/Group Cumulative Reward",
            self.collected_group_rewards.get(agent_id, 0),
            aggregation=StatsAggregationMethod.HISTOGRAM,
        )
        self.collected_group_rewards.pop(agent_id)
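# A minimal sketch of the TD(lambda) return that a helper like lambda_return above is
# expected to compute: returns are built backwards, bootstrapping the final step from
# value_next. Function and argument names mirror the call site but are illustrative
# assumptions, not necessarily the repo's exact implementation.
import numpy as np


def lambda_return_sketch(r, value_estimates, gamma, lambd, value_next=0.0):
    r = np.asarray(r, dtype=np.float32)
    value_estimates = np.asarray(value_estimates, dtype=np.float32)
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(len(r) - 1)):
        # G_t = r_t + gamma * [(1 - lambda) * V(s_{t+1}) + lambda * G_{t+1}]
        returns[t] = (
            r[t]
            + gamma * lambd * returns[t + 1]
            + gamma * (1.0 - lambd) * value_estimates[t + 1]
        )
    return returns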
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Check if we used group rewards, warn if so.
    self._warn_if_group_reward(agent_buffer_trajectory)

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Get all value estimates
    (
        value_estimates,
        value_next,
        value_memories,
    ) = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.done_reached and not trajectory.interrupted,
    )
    if value_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)

    for name, v in value_estimates.items():
        agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(v)
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
            np.mean(v),
        )

    # Evaluate all reward functions
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
            evaluate_result
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute GAE and returns
    tmp_advantages = []
    tmp_returns = []
    for name in self.optimizer.reward_signals:
        bootstrap_value = value_next[name]

        local_rewards = agent_buffer_trajectory[
            RewardSignalUtil.rewards_key(name)
        ].get_batch()
        local_value_estimates = agent_buffer_trajectory[
            RewardSignalUtil.value_estimates_key(name)
        ].get_batch()

        local_advantage = get_gae(
            rewards=local_rewards,
            value_estimates=local_value_estimates,
            value_next=bootstrap_value,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
        )
        local_return = local_advantage + local_value_estimates
        # This is later used as a target for the different value estimates
        agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(local_return)
        agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
            local_advantage
        )
        tmp_advantages.append(local_advantage)
        tmp_returns.append(local_return)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
    agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
    agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)

    self._append_to_update_buffer(agent_buffer_trajectory)

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
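# A minimal sketch of Generalized Advantage Estimation as a helper like get_gae above is
# expected to compute it: a discounted, lambda-weighted sum of TD residuals, bootstrapped
# from value_next. Names mirror the call site but are illustrative assumptions, not
# necessarily the repo's exact code.
import numpy as np


def get_gae_sketch(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.append(np.asarray(value_estimates, dtype=np.float32), value_next)
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        # A_t = delta_t + gamma * lambda * A_{t+1}
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    return advantages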