def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0

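# `create_mock_policy` is used throughout these tests but is not defined in this
# excerpt. Below is a minimal sketch of what it plausibly looks like, assuming the
# test module's usual imports (`from unittest import mock`, `import numpy as np`);
# the real helper may configure additional attributes on the mock.
def create_mock_policy():
    mock_policy = mock.Mock()
    mock_policy.reward_signals = {}
    # Return dummy memories/previous actions so AgentProcessor can query them.
    mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
    mock_policy.retrieve_previous_action.return_value = np.zeros(
        (1, 1), dtype=np.float32
    )
    return mock_policy
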
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = _create_action_info(2, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Make sure ungrouped agents don't have team obs
    for step in trajectory.steps:
        assert len(step.group_status) == 0

    # Assert that the AgentProcessor is empty
    assert len(processor._experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor._experience_buffers[0]) == 0

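# `_create_action_info` is called in the test above (and again in
# test_group_statuses below) but is not defined in this excerpt. A plausible
# reconstruction, inferred from the inline fake_action_outputs/ActionInfo
# construction in the first test_agentprocessor version; the actual helper may
# differ in detail.
def _create_action_info(num_agents, agent_ids):
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]] * num_agents)),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]] * num_agents)),
    }
    return ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]] * num_agents)),
        env_action=ActionTuple(continuous=np.array([[0.1]] * num_agents)),
        value=[0.1] * num_agents,
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )
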
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the environment worker
        that the DecisionSteps came from. Used to construct a globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )

    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    return ActionInfo(
        action=run_out.get("action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )

def get_action(
    self, batched_step_result: BatchedStepResult, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param batched_step_result: A dictionary of brain names and BatchedStepResult from environment.
    :param worker_id: In parallel environment training, the unique id of the environment worker
        that the BatchedStepResult came from. Used to construct a globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if batched_step_result.n_agents() == 0:
        return ActionInfo.empty()

    agents_done = [
        agent
        for agent, done in zip(
            batched_step_result.agent_id, batched_step_result.done
        )
        if done
    ]

    self.remove_memories(agents_done)
    self.remove_previous_action(agents_done)

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in batched_step_result.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        batched_step_result, global_agent_ids
    )

    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    action_info = ActionInfo(
        action=run_out.get("action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=batched_step_result.agent_id,
    )
    return action_info

def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of behavior names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the environment worker
        that the DecisionSteps came from. Used to construct a globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    self.check_nan_action(run_out.get("action"))
    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=list(decision_requests.agent_id),
    )

def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
    )
    mock_done_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
        done=True,
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(mock_step, _ep, fake_action_info)
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(mock_done_step, _ep, fake_action_info)
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)

    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0

def _process_step_infos(self, step_infos: List[EnvironmentStep]) -> int:
    for step_info in step_infos:
        for name_behavior_id in step_info.name_behavior_ids:
            if name_behavior_id not in self.agent_managers:
                logger.warning(
                    "Agent manager was not created for behavior id {}.".format(
                        name_behavior_id
                    )
                )
                continue
            decision_steps, terminal_steps = step_info.current_all_step_result[
                name_behavior_id
            ]
            self.agent_managers[name_behavior_id].add_experiences(
                decision_steps,
                terminal_steps,
                step_info.worker_id,
                step_info.brain_name_to_action_info.get(
                    name_behavior_id, ActionInfo.empty()
                ),
            )
            self.agent_managers[name_behavior_id].record_environment_stats(
                step_info.environment_stats, step_info.worker_id
            )
    return len(step_infos)

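# For reference, a sketch of the `EnvironmentStep` consumed above, inferred from
# the attribute accesses in `_process_step_infos` (worker_id,
# current_all_step_result, brain_name_to_action_info, environment_stats,
# name_behavior_ids). The field types shown are assumptions; the real NamedTuple
# lives in the trainer's env_manager module and may carry more structure.
from typing import Any, Dict, Iterable, NamedTuple, Tuple


class EnvironmentStep(NamedTuple):
    current_all_step_result: Dict[str, Tuple["DecisionSteps", "TerminalSteps"]]
    worker_id: int
    brain_name_to_action_info: Dict[str, ActionInfo]
    environment_stats: Dict[str, Any]

    @property
    def name_behavior_ids(self) -> Iterable[str]:
        return self.current_all_step_result.keys()
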
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    behavior_spec = basic_behavior_spec()
    policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
    no_agent_step = DecisionSteps.empty(behavior_spec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()

def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
    no_agent_step = BatchedStepResult.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()

def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings(), "output")
    # Doesn't really matter what this is
    dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
    no_agent_step = DecisionSteps.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()

def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
    )
    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )

    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4
    # Last trajectory should be the longest
    trajectory = tqueue.put.call_args_list[0][0][-1]

    # Make sure trajectory has the right Groupmate Experiences
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1

def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=2,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    fake_action_info = ActionInfo(
        action=[0.1, 0.1],
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())
    for _ in range(5):
        processor.add_experiences(mock_step, 0, fake_action_info)

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty BatchedStepResult
    mock_step = mb.create_mock_batchedstep(
        num_agents=0,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    processor.add_experiences(mock_step, 0, ActionInfo([], [], {}, []))
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0

def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )

    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
        # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0

def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the environment worker
        that the DecisionSteps came from. Used to construct a globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )

    self.save_memories(global_agent_ids, run_out.get("memory_out"))

    # For compatibility with buffer changes for hybrid action support
    if "log_probs" in run_out:
        log_probs_tuple = LogProbsTuple()
        if self.behavior_spec.action_spec.is_continuous():
            log_probs_tuple.add_continuous(run_out["log_probs"])
        else:
            log_probs_tuple.add_discrete(run_out["log_probs"])
        run_out["log_probs"] = log_probs_tuple
    if "action" in run_out:
        action_tuple = ActionTuple()
        env_action_tuple = ActionTuple()
        if self.behavior_spec.action_spec.is_continuous():
            action_tuple.add_continuous(run_out["pre_action"])
            env_action_tuple.add_continuous(run_out["action"])
        else:
            action_tuple.add_discrete(run_out["action"])
            env_action_tuple.add_discrete(run_out["action"])
        run_out["action"] = action_tuple
        run_out["env_action"] = env_action_tuple

    self.check_nan_action(run_out.get("action"))
    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )

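# Illustrative only: what the hybrid-action compatibility shim above produces
# for a continuous-only policy. The sample values are hypothetical; ActionTuple
# and its add_continuous method are the same container used in the shim above.
import numpy as np

continuous_actions = np.array([[0.2], [0.3]], dtype=np.float32)  # 2 agents, 1 dim
action_tuple = ActionTuple()
action_tuple.add_continuous(continuous_actions)
# The tuple now carries the continuous branch; a discrete-only policy would use
# add_discrete instead, and the shim routes on action_spec.is_continuous().
assert action_tuple.continuous.shape == (2, 1)
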
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    _, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
        agent_ids=[2, 3],
    )
    # Make decision steps continue for other agents
    mock_decision_steps_2, _ = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=False,
        grouped=True,
        agent_ids=[0, 1],
    )
    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    # Continue to add for remaining live agents
    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4

    # Get the first trajectory, which should have been agent 2 (one of the killed agents)
    trajectory = tqueue.put.call_args_list[0][0][-1]
    assert len(trajectory.steps) == 3
    # Make sure trajectory has the right Groupmate Experiences.
    # All three steps should contain all agents
    for step in trajectory.steps:
        assert len(step.group_status) == 3

    # Last trajectory should be the longest. It should be that of agent 1, one of the surviving agents.
    trajectory = tqueue.put.call_args_list[-1][0][-1]
    assert len(trajectory.steps) == 5

    # Make sure trajectory has the right Groupmate Experiences.
    # The first 3 steps should contain all of the obs (the 3rd step is also the terminal step of 2 of the agents)
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died, there should only be 1 group status.
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1