def test_poca_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811 # Test evaluate dummy_config.reward_signals = gail_dummy_config config = poca_dummy_config() optimizer = create_test_poca_optimizer(config, use_rnn=False, use_discrete=False, use_visual=False) # Test update update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec) # Mock out reward signal eval copy_buffer_fields( update_buffer, src_key=BufferKey.ENVIRONMENT_REWARDS, dst_keys=[ BufferKey.ADVANTAGES, RewardSignalUtil.returns_key("extrinsic"), RewardSignalUtil.value_estimates_key("extrinsic"), RewardSignalUtil.baseline_estimates_key("extrinsic"), RewardSignalUtil.returns_key("gail"), RewardSignalUtil.value_estimates_key("gail"), RewardSignalUtil.baseline_estimates_key("gail"), ], ) update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like( update_buffer[BufferKey.CONTINUOUS_ACTION]) optimizer.update( update_buffer, num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length, ) # Check if buffer size is too big update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec) # Mock out reward signal eval copy_buffer_fields( update_buffer, src_key=BufferKey.ENVIRONMENT_REWARDS, dst_keys=[ BufferKey.ADVANTAGES, RewardSignalUtil.returns_key("extrinsic"), RewardSignalUtil.value_estimates_key("extrinsic"), RewardSignalUtil.baseline_estimates_key("extrinsic"), RewardSignalUtil.returns_key("gail"), RewardSignalUtil.value_estimates_key("gail"), RewardSignalUtil.baseline_estimates_key("gail"), ], ) optimizer.update( update_buffer, num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length, )
def test_poca_optimizer_update_curiosity( dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811 ): # Test evaluate dummy_config.reward_signals = curiosity_dummy_config optimizer = create_test_poca_optimizer(dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual) # Test update update_buffer = mb.simulate_rollout( BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=optimizer.policy.m_size, ) # Mock out reward signal eval copy_buffer_fields( update_buffer, src_key=BufferKey.ENVIRONMENT_REWARDS, dst_keys=[ BufferKey.ADVANTAGES, RewardSignalUtil.returns_key("extrinsic"), RewardSignalUtil.value_estimates_key("extrinsic"), RewardSignalUtil.baseline_estimates_key("extrinsic"), RewardSignalUtil.returns_key("curiosity"), RewardSignalUtil.value_estimates_key("curiosity"), RewardSignalUtil.baseline_estimates_key("curiosity"), ], ) # Copy memories to critic memories copy_buffer_fields( update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY, BufferKey.BASELINE_MEMORY], ) optimizer.update( update_buffer, num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length, )
def test_poca_optimizer_update(dummy_config, rnn, visual, discrete): # Test evaluate optimizer = create_test_poca_optimizer(dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual) # Test update update_buffer = mb.simulate_rollout( BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=optimizer.policy.m_size, num_other_agents_in_group=NUM_AGENTS, ) # Mock out reward signal eval copy_buffer_fields( update_buffer, BufferKey.ENVIRONMENT_REWARDS, [ BufferKey.ADVANTAGES, RewardSignalUtil.returns_key("extrinsic"), RewardSignalUtil.value_estimates_key("extrinsic"), RewardSignalUtil.baseline_estimates_key("extrinsic"), ], ) # Copy memories to critic memories copy_buffer_fields( update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY, BufferKey.BASELINE_MEMORY], ) return_stats = optimizer.update( update_buffer, num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length, ) # Make sure we have the right stats required_stats = [ "Losses/Policy Loss", "Losses/Value Loss", "Policy/Learning Rate", "Policy/Epsilon", "Policy/Beta", ] for stat in required_stats: assert stat in return_stats.keys()
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ Performs update on model. :param batch: Batch of experiences. :param num_sequences: Number of sequences to process. :return: Results of update. """ # Get decayed parameters decay_lr = self.decay_learning_rate.get_value( self.policy.get_current_step()) decay_eps = self.decay_epsilon.get_value( self.policy.get_current_step()) decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) returns = {} old_values = {} old_baseline_values = {} for name in self.reward_signals: old_values[name] = ModelUtils.list_to_tensor( batch[RewardSignalUtil.value_estimates_key(name)]) returns[name] = ModelUtils.list_to_tensor( batch[RewardSignalUtil.returns_key(name)]) old_baseline_values[name] = ModelUtils.list_to_tensor( batch[RewardSignalUtil.baseline_estimates_key(name)]) n_obs = len(self.policy.behavior_spec.observation_specs) current_obs = ObsUtil.from_buffer(batch, n_obs) # Convert to tensors current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs) groupmate_obs = [[ ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs ] for _groupmate_obs in groupmate_obs] act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) actions = AgentAction.from_buffer(batch) groupmate_actions = AgentAction.group_from_buffer(batch) memories = [ ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) ] if len(memories) > 0: memories = torch.stack(memories).unsqueeze(0) value_memories = [ ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) for i in range(0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length) ] baseline_memories = [ ModelUtils.list_to_tensor(batch[BufferKey.BASELINE_MEMORY][i]) for i in range(0, len(batch[BufferKey.BASELINE_MEMORY]), self.policy.sequence_length) ] if len(value_memories) > 0: value_memories = torch.stack(value_memories).unsqueeze(0) baseline_memories = torch.stack(baseline_memories).unsqueeze(0) log_probs, entropy = self.policy.evaluate_actions( current_obs, masks=act_masks, actions=actions, memories=memories, seq_len=self.policy.sequence_length, ) all_obs = [current_obs] + groupmate_obs values, _ = self.critic.critic_pass( all_obs, memories=value_memories, sequence_length=self.policy.sequence_length, ) groupmate_obs_and_actions = (groupmate_obs, groupmate_actions) baselines, _ = self.critic.baseline( current_obs, groupmate_obs_and_actions, memories=baseline_memories, sequence_length=self.policy.sequence_length, ) old_log_probs = ActionLogProbs.from_buffer(batch).flatten() log_probs = log_probs.flatten() loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) baseline_loss = ModelUtils.trust_region_value_loss( baselines, old_baseline_values, returns, decay_eps, loss_masks) value_loss = ModelUtils.trust_region_value_loss( values, old_values, returns, decay_eps, loss_masks) policy_loss = ModelUtils.trust_region_policy_loss( ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]), log_probs, old_log_probs, loss_masks, decay_eps, ) loss = (policy_loss + 0.5 * (value_loss + 0.5 * baseline_loss) - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)) # Set optimizer learning rate ModelUtils.update_learning_rate(self.optimizer, decay_lr) self.optimizer.zero_grad() loss.backward() self.optimizer.step() update_stats = { # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow. # TODO: After PyTorch is default, change to something more correct. "Losses/Policy Loss": torch.abs(policy_loss).item(), "Losses/Value Loss": value_loss.item(), "Losses/Baseline Loss": baseline_loss.item(), "Policy/Learning Rate": decay_lr, "Policy/Epsilon": decay_eps, "Policy/Beta": decay_bet, } for reward_provider in self.reward_signals.values(): update_stats.update(reward_provider.update(batch)) return update_stats
def _process_trajectory(self, trajectory: Trajectory) -> None: """ Takes a trajectory and processes it, putting it into the update buffer. Processing involves calculating value and advantage targets for model updating step. :param trajectory: The Trajectory tuple containing the steps to be processed. """ super()._process_trajectory(trajectory) agent_id = trajectory.agent_id # All the agents should have the same ID agent_buffer_trajectory = trajectory.to_agentbuffer() # Update the normalization if self.is_training: self.policy.update_normalization(agent_buffer_trajectory) # Get all value estimates ( value_estimates, baseline_estimates, value_next, value_memories, baseline_memories, ) = self.optimizer.get_trajectory_and_baseline_value_estimates( agent_buffer_trajectory, trajectory.next_obs, trajectory.next_group_obs, trajectory.all_group_dones_reached and trajectory.done_reached and not trajectory.interrupted, ) if value_memories is not None and baseline_memories is not None: agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set( value_memories) agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set( baseline_memories) for name, v in value_estimates.items(): agent_buffer_trajectory[RewardSignalUtil.value_estimates_key( name)].extend(v) agent_buffer_trajectory[RewardSignalUtil.baseline_estimates_key( name)].extend(baseline_estimates[name]) self._stats_reporter.add_stat( f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate", np.mean(baseline_estimates[name]), ) self._stats_reporter.add_stat( f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", np.mean(value_estimates[name]), ) self.collected_rewards["environment"][agent_id] += np.sum( agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]) self.collected_group_rewards[agent_id] += np.sum( agent_buffer_trajectory[BufferKey.GROUP_REWARD]) for name, reward_signal in self.optimizer.reward_signals.items(): evaluate_result = ( reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength) agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend( evaluate_result) # Report the reward signals self.collected_rewards[name][agent_id] += np.sum(evaluate_result) # Compute lambda returns and advantage tmp_advantages = [] for name in self.optimizer.reward_signals: local_rewards = np.array( agent_buffer_trajectory[RewardSignalUtil.rewards_key( name)].get_batch(), dtype=np.float32, ) baseline_estimate = agent_buffer_trajectory[ RewardSignalUtil.baseline_estimates_key(name)].get_batch() v_estimates = agent_buffer_trajectory[ RewardSignalUtil.value_estimates_key(name)].get_batch() lambd_returns = lambda_return( r=local_rewards, value_estimates=v_estimates, gamma=self.optimizer.reward_signals[name].gamma, lambd=self.hyperparameters.lambd, value_next=value_next[name], ) local_advantage = np.array(lambd_returns) - np.array( baseline_estimate) agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set( lambd_returns) agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set( local_advantage) tmp_advantages.append(local_advantage) # Get global advantages global_advantages = list( np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)) agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages) # Append to update buffer agent_buffer_trajectory.resequence_and_append( self.update_buffer, training_length=self.policy.sequence_length) # If this was a terminal trajectory, append stats and reset reward collection if trajectory.done_reached: self._update_end_episode_stats(agent_id, self.optimizer) # Remove dead agents from group reward recording if not trajectory.all_group_dones_reached: self.collected_group_rewards.pop(agent_id) # If the whole team is done, average the remaining group rewards. if trajectory.all_group_dones_reached and trajectory.done_reached: self.stats_reporter.add_stat( "Environment/Group Cumulative Reward", self.collected_group_rewards.get(agent_id, 0), aggregation=StatsAggregationMethod.HISTOGRAM, ) self.collected_group_rewards.pop(agent_id)