def test_batched_step_result_from_proto():
    n_agents = 10
    shapes = [(3,), (4,)]
    spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, spec)
    for agent_id in range(n_agents):
        if agent_id in decision_steps:
            # we set the reward equal to the agent id in generate_list_agent_proto
            assert decision_steps[agent_id].reward == agent_id
        elif agent_id in terminal_steps:
            assert terminal_steps[agent_id].reward == agent_id
        else:
            raise Exception("Missing agent from the steps")
    # We sort the AgentId since they are split between DecisionSteps and TerminalSteps
    combined_agent_id = list(decision_steps.agent_id) + list(terminal_steps.agent_id)
    combined_agent_id.sort()
    assert combined_agent_id == list(range(n_agents))
    for agent_id in range(n_agents):
        assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
        if agent_id in terminal_steps:
            assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0)
    assert decision_steps.obs[0].shape[1] == shapes[0][0]
    assert decision_steps.obs[1].shape[1] == shapes[1][0]
    assert terminal_steps.obs[0].shape[1] == shapes[0][0]
    assert terminal_steps.obs[1].shape[1] == shapes[1][0]

def test_action_masking_continuous():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(10)
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    # Continuous actions are never masked
    assert masks is None

def test_action_masking_discrete_1():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_discrete((10,))
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    assert masks[0].shape == (n_agents / 2, 10)  # half agents are done
    assert masks[0][0, 0]

def test_action_masking_discrete_2():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes),
        ActionSpec.create_discrete((2, 2, 6)),
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 3
    assert masks[0].shape == (n_agents / 2, 2)
    assert masks[1].shape == (n_agents / 2, 2)
    assert masks[2].shape == (n_agents / 2, 6)
    assert masks[0][0, 0]

def test_action_masking_discrete():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_discrete((7, 3))
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 2
    assert masks[0].shape == (n_agents / 2, 7)  # half agents are done
    assert masks[1].shape == (n_agents / 2, 3)  # half agents are done
    assert masks[0][0, 0]
    assert not masks[1][0, 0]
    assert masks[1][0, 1]

def _update_state(self, output: UnityRLOutputProto) -> None:
    """
    Collects experience information from all external brains in environment at current step.
    """
    for brain_name in self._env_specs.keys():
        if brain_name in output.agentInfos:
            agent_info_list = output.agentInfos[brain_name].value
            self._env_state[brain_name] = steps_from_proto(
                agent_info_list, self._env_specs[brain_name]
            )
        else:
            self._env_state[brain_name] = (
                DecisionSteps.empty(self._env_specs[brain_name]),
                TerminalSteps.empty(self._env_specs[brain_name]),
            )
    self._side_channel_manager.process_side_channel_message(output.side_channel)

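# A minimal consumer sketch for the state cached by _update_state, assuming the
# standard mlagents_envs client API: after env.step(), get_steps() hands back the
# (DecisionSteps, TerminalSteps) tuple stored in self._env_state for that behavior.
# Connecting with file_name=None (a running Editor instance) is only an
# illustrative choice, not required by _update_state itself.
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None)
env.reset()
behavior_name = list(env.behavior_specs)[0]
decision_steps, terminal_steps = env.get_steps(behavior_name)
for agent_id in decision_steps:
    # Each DecisionSteps entry carries the obs/reward decoded by steps_from_proto
    print(agent_id, decision_steps[agent_id].reward)
env.close()
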
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        previous_action = (
            np.array(
                pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32
            )
            * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )

        # The next step's done flag and reward describe the outcome of the current action
        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        # Observations come from whichever step (terminal or decision) the agent appears in
        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        for i, obs in enumerate(current_obs):
            demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
        # Older demo files only populate vector_actions_deprecated; route it to the right key
        if (
            len(current_pair_info.action_info.continuous_actions) == 0
            and len(current_pair_info.action_info.discrete_actions) == 0
        ):
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
            else:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
        else:
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.continuous_actions
                )
            if behavior_spec.action_spec.discrete_size > 0:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.discrete_actions
                )
        demo_raw_buffer["prev_action"].append(previous_action)
        # Flush each completed episode into the processed buffer
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer

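# A short usage sketch, assuming load_demonstration() from the same demo_loader
# module returns (behavior_spec, pair_infos, total_expected) for a .demo file;
# the demo path and sequence_length below are illustrative placeholders only.
behavior_spec, pair_infos, _ = load_demonstration("Project/demos/Expert.demo")
demo_buffer = make_demo_buffer(pair_infos, behavior_spec, sequence_length=64)
print(f"Loaded {len(demo_buffer['rewards'])} demonstration steps")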