def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the DecisionSteps came from. Used to construct a
        globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))

    # For compatibility with buffer changes for hybrid action support
    if "log_probs" in run_out:
        log_probs_tuple = LogProbsTuple()
        if self.behavior_spec.action_spec.is_continuous():
            log_probs_tuple.add_continuous(run_out["log_probs"])
        else:
            log_probs_tuple.add_discrete(run_out["log_probs"])
        run_out["log_probs"] = log_probs_tuple
    if "action" in run_out:
        action_tuple = ActionTuple()
        env_action_tuple = ActionTuple()
        if self.behavior_spec.action_spec.is_continuous():
            action_tuple.add_continuous(run_out["pre_action"])
            env_action_tuple.add_continuous(run_out["action"])
        else:
            action_tuple.add_discrete(run_out["action"])
            env_action_tuple.add_discrete(run_out["action"])
        run_out["action"] = action_tuple
        run_out["env_action"] = env_action_tuple
    self.check_nan_action(run_out.get("action"))
    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )
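# --- Hedged usage sketch (not part of the source files) ---
# Illustrates how the ActionInfo returned by get_action() is threaded back into
# AgentProcessor.add_experiences(), mirroring the flow exercised by the tests
# above. `policy` and `processor` are assumed to be a (real or mocked) Policy
# and AgentProcessor wired to the same behavior; only calls that appear in the
# surrounding snippets are used.
def _example_policy_to_processor_flow(policy, processor, worker_id=0):
    decision_steps, terminal_steps = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    # Ask the policy for actions for the agents that requested a decision.
    action_info = policy.get_action(decision_steps, worker_id=worker_id)
    # Hand the steps and the ActionInfo that produced them to the processor so
    # it can assemble AgentExperiences and, eventually, Trajectories.
    processor.add_experiences(decision_steps, terminal_steps, worker_id, action_info)
    return action_info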
def _create_action_info(num_agents: int, agent_ids: List[str]) -> ActionInfo:
    # Build a fake ActionInfo (plus the matching raw policy outputs) for
    # num_agents agents with a single continuous action each.
    fake_action_outputs = {
        "action": ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
    }
    fake_action_info = ActionInfo(
        action=ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        env_action=ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )
    return fake_action_info
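# --- Hedged usage sketch (not part of the source files) ---
# Shows how _create_action_info is typically paired with mocked steps so that
# the fake actions line up with the agent ids in the DecisionSteps, as the
# tests above do implicitly. Only helpers visible in the surrounding snippets
# are used.
def _example_action_info_for_two_agents():
    decision_steps, _terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    return _create_action_info(2, decision_steps.agent_id)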
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
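# --- Hedged usage sketch (not part of the source files) ---
# Builds a max-step-terminated fake trajectory and checks the properties that
# make_fake_trajectory (above) guarantees: the requested length and a final
# step that is marked interrupted rather than done. `observation_specs` is
# assumed to be a valid List[ObservationSpec] supplied by the caller.
def _example_fake_trajectory(observation_specs):
    traj = make_fake_trajectory(
        length=15,
        observation_specs=observation_specs,
        action_spec=ActionSpec.create_continuous(2),
        max_step_complete=True,
    )
    assert len(traj.steps) == 15
    assert traj.steps[-1].interrupted
    assert not traj.steps[-1].done
    return traj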
def _process_step(
    self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
) -> None:
    terminated = isinstance(step, TerminalStep)
    global_agent_id = get_global_agent_id(worker_id, step.agent_id)
    global_group_id = get_global_group_id(worker_id, step.group_id)
    stored_decision_step, idx = self._last_step_result.get(
        global_agent_id, (None, None)
    )
    stored_take_action_outputs = self._last_take_action_outputs.get(
        global_agent_id, None
    )
    if not terminated:
        # Index is needed to grab from last_take_action_outputs
        self._last_step_result[global_agent_id] = (step, index)

    # This state is the consequence of a past action
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        obs = stored_decision_step.obs
        if self.policy.use_recurrent:
            memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
        else:
            memory = None
        done = terminated  # Since this is an ongoing step
        interrupted = step.interrupted if terminated else False
        # Add the outputs of the last eval
        stored_actions = stored_take_action_outputs["action"]
        action_tuple = ActionTuple(
            continuous=stored_actions.continuous[idx],
            discrete=stored_actions.discrete[idx],
        )
        stored_action_probs = stored_take_action_outputs["log_probs"]
        log_probs_tuple = LogProbsTuple(
            continuous=stored_action_probs.continuous[idx],
            discrete=stored_action_probs.discrete[idx],
        )
        action_mask = stored_decision_step.action_mask
        prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

        # Assemble teammate_obs. If none saved, then it will be an empty list.
        group_statuses = []
        for _id, _mate_status in self._group_status[global_group_id].items():
            if _id != global_agent_id:
                group_statuses.append(_mate_status)

        experience = AgentExperience(
            obs=obs,
            reward=step.reward,
            done=done,
            action=action_tuple,
            action_probs=log_probs_tuple,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=interrupted,
            memory=memory,
            group_status=group_statuses,
            group_reward=step.group_reward,
        )
        # Add the value outputs if needed
        self._experience_buffers[global_agent_id].append(experience)
        self._episode_rewards[global_agent_id] += step.reward
        if not terminated:
            self._episode_steps[global_agent_id] += 1

        # Add a trajectory segment to the buffer if terminal or the length has
        # reached the time horizon
        if (
            len(self._experience_buffers[global_agent_id])
            >= self._max_trajectory_length
            or terminated
        ):
            next_obs = step.obs
            next_group_obs = []
            for _id, _obs in self._current_group_obs[global_group_id].items():
                if _id != global_agent_id:
                    next_group_obs.append(_obs)

            trajectory = Trajectory(
                steps=self._experience_buffers[global_agent_id],
                agent_id=global_agent_id,
                next_obs=next_obs,
                next_group_obs=next_group_obs,
                behavior_id=self._behavior_id,
            )
            for traj_queue in self._trajectory_queues:
                traj_queue.put(trajectory)
            self._experience_buffers[global_agent_id] = []
        if terminated:
            # Record episode length.
            self._stats_reporter.add_stat(
                "Environment/Episode Length",
                self._episode_steps.get(global_agent_id, 0),
            )
            self._clean_agent_data(global_agent_id)