def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
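For context, the copy_buffer_fields test helper mirrors one buffer field into several destination keys so that returns and value estimates do not have to be computed for real. A minimal sketch of that behavior, assuming AgentBuffer fields expose list-like access and a set() method as used elsewhere in these tests (an illustration, not necessarily the exact helper):

def copy_buffer_fields_sketch(buffer, src_key, dst_keys):
    # Reuse the values stored under src_key as stand-ins for every dst_key,
    # e.g. raw environment rewards standing in for returns/value estimates.
    src_values = list(buffer[src_key])
    for dst_key in dst_keys:
        buffer[dst_key].set(src_values)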
def test_poca_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = poca_dummy_config()
    optimizer = create_test_poca_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )
    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_poca_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    optimizer = create_test_poca_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
        num_other_agents_in_group=NUM_AGENTS,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        BufferKey.ENVIRONMENT_REWARDS,
        [
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(
        update_buffer,
        BufferKey.MEMORY,
        [BufferKey.CRITIC_MEMORY, BufferKey.BASELINE_MEMORY],
    )
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes the policy trained by the wrapped PPO trainer.
    # This queue should remain empty after the snapshot swap.
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # The ghost trainer should use this queue for the ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # Check that the ghost trainer swap pushes to the ghost queue, not the trainer queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # Clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    copy_buffer_fields(
        buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.rewards_key("extrinsic"),
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.rewards_key("curiosity"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Performs update on model.
    :param batch: Batch of experiences.
    :param num_sequences: Number of sequences to process.
    :return: Results of update.
    """
    # Get decayed parameters
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
    decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
    returns = {}
    old_values = {}
    for name in self.reward_signals:
        old_values[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.value_estimates_key(name)]
        )
        returns[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.returns_key(name)]
        )

    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)

    memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    # Get value memories
    value_memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]
    if len(value_memories) > 0:
        value_memories = torch.stack(value_memories).unsqueeze(0)

    log_probs, entropy = self.policy.evaluate_actions(
        current_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=self.policy.sequence_length,
    )
    values, _ = self.critic.critic_pass(
        current_obs,
        memories=value_memories,
        sequence_length=self.policy.sequence_length,
    )
    old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
    log_probs = log_probs.flatten()
    loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    value_loss = self.ppo_value_loss(
        values, old_values, returns, decay_eps, loss_masks
    )
    policy_loss = self.ppo_policy_loss(
        ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
        log_probs,
        old_log_probs,
        loss_masks,
    )
    loss = (
        policy_loss
        + 0.5 * value_loss
        - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
    )

    # Set optimizer learning rate
    ModelUtils.update_learning_rate(self.optimizer, decay_lr)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    update_stats = {
        # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
        # TODO: After PyTorch is default, change to something more correct.
        "Losses/Policy Loss": torch.abs(policy_loss).item(),
        "Losses/Value Loss": value_loss.item(),
        "Policy/Learning Rate": decay_lr,
        "Policy/Epsilon": decay_eps,
        "Policy/Beta": decay_bet,
    }

    for reward_provider in self.reward_signals.values():
        update_stats.update(reward_provider.update(batch))

    return update_stats
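The policy_loss term above comes from self.ppo_policy_loss, PPO's clipped surrogate objective. A self-contained sketch of that objective on flattened 1-D tensors follows; it is an illustration of the math rather than the optimizer's exact method, and the clipping coefficient epsilon shown here is just an assumed default:

import torch


def clipped_surrogate_policy_loss(
    advantages: torch.Tensor,     # per-timestep advantages, shape [T]
    log_probs: torch.Tensor,      # new-policy log pi(a_t | s_t), shape [T]
    old_log_probs: torch.Tensor,  # behavior-policy log probs, shape [T]
    loss_masks: torch.Tensor,     # bool mask, False for padded timesteps
    epsilon: float = 0.2,
) -> torch.Tensor:
    # Probability ratio r_t = pi_new / pi_old, computed in log space for stability
    r_theta = torch.exp(log_probs - old_log_probs)
    unclipped = r_theta * advantages
    clipped = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantages
    surrogate = torch.min(unclipped, clipped)
    # Masked mean ignores padding introduced by sequence batching; the result is
    # negated because the optimizer minimizes while the surrogate is maximized.
    masks = loss_masks.float()
    return -(surrogate * masks).sum() / masks.sum().clamp(min=1.0)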
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Get all value estimates
    (
        value_estimates,
        baseline_estimates,
        value_next,
        value_memories,
        baseline_memories,
    ) = self.optimizer.get_trajectory_and_baseline_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.next_group_obs,
        trajectory.all_group_dones_reached
        and trajectory.done_reached
        and not trajectory.interrupted,
    )

    if value_memories is not None and baseline_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
        agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories)

    for name, v in value_estimates.items():
        agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(v)
        agent_buffer_trajectory[RewardSignalUtil.baseline_estimates_key(name)].extend(
            baseline_estimates[name]
        )
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
            np.mean(baseline_estimates[name]),
        )
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
            np.mean(value_estimates[name]),
        )

    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    self.collected_group_rewards[agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.GROUP_REWARD]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
            evaluate_result
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute lambda returns and advantage
    tmp_advantages = []
    for name in self.optimizer.reward_signals:
        local_rewards = np.array(
            agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(),
            dtype=np.float32,
        )
        baseline_estimate = agent_buffer_trajectory[
            RewardSignalUtil.baseline_estimates_key(name)
        ].get_batch()
        v_estimates = agent_buffer_trajectory[
            RewardSignalUtil.value_estimates_key(name)
        ].get_batch()

        lambd_returns = lambda_return(
            r=local_rewards,
            value_estimates=v_estimates,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
            value_next=value_next[name],
        )
        local_advantage = np.array(lambd_returns) - np.array(baseline_estimate)

        agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(lambd_returns)
        agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
            local_advantage
        )
        tmp_advantages.append(local_advantage)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
        # Remove dead agents from group reward recording
        if not trajectory.all_group_dones_reached:
            self.collected_group_rewards.pop(agent_id)

    # If the whole team is done, average the remaining group rewards.
    if trajectory.all_group_dones_reached and trajectory.done_reached:
        self.stats_reporter.add_stat(
            "Environment/Group Cumulative Reward",
            self.collected_group_rewards.get(agent_id, 0),
            aggregation=StatsAggregationMethod.HISTOGRAM,
        )
        self.collected_group_rewards.pop(agent_id)
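For context, lambda_return above computes the recursive TD(lambda) return G_t = r_t + gamma * ((1 - lambd) * V(s_{t+1}) + lambd * G_{t+1}), bootstrapped from value_next at the end of the trajectory; the POCA advantage is then that return minus the counterfactual baseline estimate. A minimal NumPy sketch of the recursion, assuming equal-length 1-D float arrays (illustrative, not necessarily the exact library helper):

import numpy as np


def lambda_return_sketch(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Backward recursion: the final step bootstraps from value_next; earlier
    # steps blend the one-step bootstrapped value with the recursive return.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns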
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Check if we used group rewards, warn if so.
    self._warn_if_group_reward(agent_buffer_trajectory)

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Get all value estimates
    (
        value_estimates,
        value_next,
        value_memories,
    ) = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.done_reached and not trajectory.interrupted,
    )
    if value_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)

    for name, v in value_estimates.items():
        agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(v)
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
            np.mean(v),
        )

    # Evaluate all reward functions
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
            evaluate_result
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute GAE and returns
    tmp_advantages = []
    tmp_returns = []
    for name in self.optimizer.reward_signals:
        bootstrap_value = value_next[name]

        local_rewards = agent_buffer_trajectory[
            RewardSignalUtil.rewards_key(name)
        ].get_batch()
        local_value_estimates = agent_buffer_trajectory[
            RewardSignalUtil.value_estimates_key(name)
        ].get_batch()

        local_advantage = get_gae(
            rewards=local_rewards,
            value_estimates=local_value_estimates,
            value_next=bootstrap_value,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
        )
        local_return = local_advantage + local_value_estimates
        # This is later used as the target for the different value estimates
        agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(local_return)
        agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
            local_advantage
        )
        tmp_advantages.append(local_advantage)
        tmp_returns.append(local_return)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
    agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
    agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
    self._append_to_update_buffer(agent_buffer_trajectory)

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
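The get_gae call above performs generalized advantage estimation: a discounted (gamma * lambd) sum of the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), bootstrapped with value_next; adding the value estimates back then yields the returns used as value targets. A minimal NumPy sketch of that computation, assuming 1-D arrays (illustrative, not necessarily the exact library helper):

import numpy as np


def discount_rewards_sketch(r, gamma=0.99, value_next=0.0):
    # Discounted sum of future values of r, seeded with the bootstrap value.
    discounted_r = np.zeros_like(r)
    running_add = value_next
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


def get_gae_sketch(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    # GAE = discounted (gamma * lambd) sum of the TD residuals
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    rewards = np.asarray(rewards, dtype=np.float32)
    value_estimates = np.asarray(value_estimates, dtype=np.float32)
    values_with_bootstrap = np.append(value_estimates, value_next)
    delta_t = rewards + gamma * values_with_bootstrap[1:] - values_with_bootstrap[:-1]
    return discount_rewards_sketch(r=delta_t, gamma=gamma * lambd, value_next=0.0)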