def test_agent_action_group_from_buffer():
    buff = AgentBuffer()
    # Create three timesteps of actions for a group of three agents
    for _ in range(3):
        buff[BufferKey.GROUP_CONTINUOUS_ACTION].append(
            3 * [np.ones((5,), dtype=np.float32)]
        )
        buff[BufferKey.GROUP_DISCRETE_ACTION].append(
            3 * [np.ones((4,), dtype=np.float32)]
        )
    # Some agents have died: only one groupmate remains for the last two steps
    for _ in range(2):
        buff[BufferKey.GROUP_CONTINUOUS_ACTION].append(
            1 * [np.ones((5,), dtype=np.float32)]
        )
        buff[BufferKey.GROUP_DISCRETE_ACTION].append(
            1 * [np.ones((4,), dtype=np.float32)]
        )

    # Get the group actions, which will be a List of AgentAction, where each element is the
    # same length as the AgentBuffer but contains only one agent's actions. Steps where an
    # agent is dead are padded with zeros.
    gact = AgentAction.group_from_buffer(buff)
    # Agent 0 is present for the full trajectory
    agent_0_act = gact[0]
    assert agent_0_act.continuous_tensor.shape == (buff.num_experiences, 5)
    assert agent_0_act.discrete_tensor.shape == (buff.num_experiences, 4)

    # Agent 1 is only present for the first three steps; the rest is padding
    agent_1_act = gact[1]
    assert agent_1_act.continuous_tensor.shape == (buff.num_experiences, 5)
    assert agent_1_act.discrete_tensor.shape == (buff.num_experiences, 4)
    assert (agent_1_act.continuous_tensor[0:3] > 0).all()
    assert (agent_1_act.continuous_tensor[3:] == 0).all()

    assert (agent_1_act.discrete_tensor[0:3] > 0).all()
    assert (agent_1_act.discrete_tensor[3:] == 0).all()
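
# For reference, a minimal standalone sketch of the padding behavior the test above asserts:
# per-step groupmate actions are stacked into one array per agent slot, and steps where an
# agent is absent are filled with zeros. `pad_group_actions` is a hypothetical helper written
# only for illustration; it is NOT the `AgentAction.group_from_buffer` implementation.
import numpy as np
from typing import List


def pad_group_actions(steps: List[List[np.ndarray]], max_agents: int) -> List[np.ndarray]:
    """Stack per-step groupmate actions into one (num_steps, action_size) array per agent
    slot, filling steps where an agent is absent with zeros."""
    action_size = steps[0][0].shape[0]
    per_agent = []
    for agent_idx in range(max_agents):
        rows = [
            step[agent_idx]
            if agent_idx < len(step)
            else np.zeros(action_size, dtype=np.float32)
            for step in steps
        ]
        per_agent.append(np.stack(rows))
    return per_agent


# Mirrors the buffer contents in the test: 3 full steps with 3 agents, then 2 steps with 1 agent.
_steps = [3 * [np.ones(5, dtype=np.float32)]] * 3 + [1 * [np.ones(5, dtype=np.float32)]] * 2
_padded = pad_group_actions(_steps, max_agents=3)
assert _padded[1].shape == (5, 5)
assert (_padded[1][:3] > 0).all() and (_padded[1][3:] == 0).all()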
def get_trajectory_and_baseline_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    next_groupmate_obs: List[List[np.ndarray]],
    done: bool,
    agent_id: str = "",
) -> Tuple[
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
    Dict[str, float],
    Optional[AgentBufferField],
    Optional[AgentBufferField],
]:
    """
    Get value estimates, baseline estimates, and memories for a trajectory, in batch form.
    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for bootstrapping
        if this is not a terminal trajectory.
    :param next_groupmate_obs: the next observations from other members of the group.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the value estimates as a Dict of [name, np.ndarray(trajectory_len)],
        the baseline estimates as a Dict, the final (next-step) value estimate as a Dict of
        [name, float], and optionally (if using memories) AgentBufferFields of initial critic
        and baseline memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    current_obs = ObsUtil.from_buffer(batch, n_obs)
    groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)

    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    groupmate_obs = [
        [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
        for _groupmate_obs in groupmate_obs
    ]

    groupmate_actions = AgentAction.group_from_buffer(batch)

    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    next_groupmate_obs = [
        ModelUtils.list_to_tensor_list(_list_obs)
        for _list_obs in next_groupmate_obs
    ]
    # Expand dimensions of next critic obs
    next_groupmate_obs = [
        [_obs.unsqueeze(0) for _obs in _list_obs]
        for _list_obs in next_groupmate_obs
    ]

    if agent_id in self.value_memory_dict:
        # The agent_id should always be in both dicts since they are added together
        _init_value_mem = self.value_memory_dict[agent_id]
        _init_baseline_mem = self.baseline_memory_dict[agent_id]
    else:
        _init_value_mem = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )
        _init_baseline_mem = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )

    all_obs = (
        [current_obs] + groupmate_obs
        if groupmate_obs is not None
        else [current_obs]
    )
    all_next_value_mem: Optional[AgentBufferField] = None
    all_next_baseline_mem: Optional[AgentBufferField] = None
    with torch.no_grad():
        if self.policy.use_recurrent:
            (
                value_estimates,
                baseline_estimates,
                all_next_value_mem,
                all_next_baseline_mem,
                next_value_mem,
                next_baseline_mem,
            ) = self._evaluate_by_sequence_team(
                current_obs,
                groupmate_obs,
                groupmate_actions,
                _init_value_mem,
                _init_baseline_mem,
            )
        else:
            value_estimates, next_value_mem = self.critic.critic_pass(
                all_obs, _init_value_mem, sequence_length=batch.num_experiences
            )
            groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
            baseline_estimates, next_baseline_mem = self.critic.baseline(
                current_obs,
                groupmate_obs_and_actions,
                _init_baseline_mem,
                sequence_length=batch.num_experiences,
            )
        # Store the memory for the next trajectory
        self.value_memory_dict[agent_id] = next_value_mem
        self.baseline_memory_dict[agent_id] = next_baseline_mem

        all_next_obs = (
            [next_obs] + next_groupmate_obs
            if next_groupmate_obs is not None
            else [next_obs]
        )

        next_value_estimates, _ = self.critic.critic_pass(
            all_next_obs, next_value_mem, sequence_length=1
        )

    for name, estimate in baseline_estimates.items():
        baseline_estimates[name] = ModelUtils.to_numpy(estimate)

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)

    # The baseline and V should not be conditioned on the same done flag
    for name, estimate in next_value_estimates.items():
        next_value_estimates[name] = ModelUtils.to_numpy(estimate)

    if done:
        for k in next_value_estimates:
            if not self.reward_signals[k].ignore_done:
                next_value_estimates[k][-1] = 0.0

    return (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
        all_next_value_mem,
        all_next_baseline_mem,
    )
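
# Downstream, the trainer (not shown in this excerpt) combines these outputs into returns and
# advantages. The sketch below is a rough, self-contained illustration under the assumption that
# TD(lambda) returns are bootstrapped from the next-step value estimate and that advantages are
# taken against the baseline estimates; `discounted_lambda_returns` is an illustrative helper,
# not the trainer's actual function.
import numpy as np


def discounted_lambda_returns(
    rewards: np.ndarray,
    value_estimates: np.ndarray,
    next_value: float,
    gamma: float = 0.99,
    lambd: float = 0.95,
) -> np.ndarray:
    """TD(lambda) returns computed backwards through a trajectory, bootstrapping from next_value."""
    returns = np.zeros_like(rewards)
    returns[-1] = rewards[-1] + gamma * next_value
    for t in reversed(range(len(rewards) - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + rewards[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns


# Per reward signal: the value head is regressed toward the lambda returns, while the policy
# advantage is the lambda return minus the *baseline* estimate (the counterfactual term).
_rewards = np.array([0.1, 0.0, 1.0], dtype=np.float32)
_values = np.array([0.5, 0.6, 0.7], dtype=np.float32)
_baselines = np.array([0.4, 0.5, 0.6], dtype=np.float32)
_lam_returns = discounted_lambda_returns(_rewards, _values, next_value=0.0)
_advantages = _lam_returns - _baselines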
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Performs update on model.
    :param batch: Batch of experiences.
    :param num_sequences: Number of sequences to process.
    :return: Results of update.
    """
    # Get decayed parameters
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
    decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
    returns = {}
    old_values = {}
    old_baseline_values = {}
    for name in self.reward_signals:
        old_values[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.value_estimates_key(name)]
        )
        returns[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.returns_key(name)]
        )
        old_baseline_values[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.baseline_estimates_key(name)]
        )

    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)
    groupmate_obs = [
        [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
        for _groupmate_obs in groupmate_obs
    ]
    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)
    groupmate_actions = AgentAction.group_from_buffer(batch)

    memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    value_memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]
    baseline_memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.BASELINE_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.BASELINE_MEMORY]), self.policy.sequence_length
        )
    ]
    if len(value_memories) > 0:
        value_memories = torch.stack(value_memories).unsqueeze(0)
        baseline_memories = torch.stack(baseline_memories).unsqueeze(0)

    log_probs, entropy = self.policy.evaluate_actions(
        current_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=self.policy.sequence_length,
    )
    all_obs = [current_obs] + groupmate_obs
    values, _ = self.critic.critic_pass(
        all_obs,
        memories=value_memories,
        sequence_length=self.policy.sequence_length,
    )
    groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
    baselines, _ = self.critic.baseline(
        current_obs,
        groupmate_obs_and_actions,
        memories=baseline_memories,
        sequence_length=self.policy.sequence_length,
    )
    old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
    log_probs = log_probs.flatten()
    loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)

    baseline_loss = ModelUtils.trust_region_value_loss(
        baselines, old_baseline_values, returns, decay_eps, loss_masks
    )
    value_loss = ModelUtils.trust_region_value_loss(
        values, old_values, returns, decay_eps, loss_masks
    )
    policy_loss = ModelUtils.trust_region_policy_loss(
        ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
        log_probs,
        old_log_probs,
        loss_masks,
        decay_eps,
    )
    loss = (
        policy_loss
        + 0.5 * (value_loss + 0.5 * baseline_loss)
        - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
    )

    # Set optimizer learning rate
    ModelUtils.update_learning_rate(self.optimizer, decay_lr)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    update_stats = {
        # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
        # TODO: After PyTorch is default, change to something more correct.
        "Losses/Policy Loss": torch.abs(policy_loss).item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Baseline Loss": baseline_loss.item(),
        "Policy/Learning Rate": decay_lr,
        "Policy/Epsilon": decay_eps,
        "Policy/Beta": decay_bet,
    }

    for reward_provider in self.reward_signals.values():
        update_stats.update(reward_provider.update(batch))

    return update_stats
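
# For context on the loss terms above: the names suggest that `ModelUtils.trust_region_value_loss`
# clips the new estimates against the old ones, PPO-style. The sketch below is an illustrative
# re-implementation under that assumption, for a single reward signal and a single head; it is
# NOT the library's actual helper. Note also the weighting visible in the total loss above:
# loss = policy_loss + 0.5 * (value_loss + 0.5 * baseline_loss) - beta * entropy.
import torch


def clipped_value_loss(
    values: torch.Tensor,
    old_values: torch.Tensor,
    returns: torch.Tensor,
    epsilon: float,
    masks: torch.Tensor,
) -> torch.Tensor:
    """PPO-style clipped value loss: penalize the worse of the raw and clipped squared errors,
    averaging only over unmasked (non-padding) timesteps."""
    m = masks.float()
    # Keep the new estimate within +/- epsilon of the old estimate
    clipped = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
    unclipped_err = (returns - values) ** 2
    clipped_err = (returns - clipped) ** 2
    per_step = torch.max(unclipped_err, clipped_err)
    return (per_step * m).sum() / m.sum()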