def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    """
    Build a processed AgentBuffer from recorded demonstration (agent_info, action) pairs.

    Consecutive pairs are treated as (current, next) transitions; the last pair has
    no successor and therefore contributes no transition. The raw buffer is flushed
    into the processed buffer at every episode boundary (local_done) and once more
    at the end.

    :param pair_infos: Ordered demonstration steps from the .demo file.
    :param brain_params: Brain parameters used to decode the agent-info protos.
    :param sequence_length: Training sequence length used when resequencing.
    :return: The processed AgentBuffer ready for training.
    """
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    # Pair each step with its successor; this naturally skips the final step,
    # which was previously handled with an explicit early `break`.
    for idx, (current_pair_info, next_pair_info) in enumerate(
        zip(pair_infos, pair_infos[1:])
    ):
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params
        )
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params
        )
        if idx == 0:
            # No previous action at the start of the demo: use a zero vector
            # shaped like the recorded action.
            previous_action = np.zeros_like(
                np.array(
                    current_pair_info.action_info.vector_actions, dtype=np.float32
                )
            )
        else:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        # Reward/done come from the *next* step; observations/actions from the current.
        demo_raw_buffer["done"].append(next_brain_info.local_done[0])
        demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_raw_buffer["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            # Episode boundary: resequence what we have and start a fresh episode.
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    # Flush any trailing partial episode.
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
def test_take_action_returns_nones_on_missing_values():
    """get_action falls back to ActionInfo(None, None, {}) when evaluate yields no outputs."""
    seed = 3
    policy = TFPolicy(seed, basic_mock_brain(), basic_params())
    # Evaluate returns an empty dict, so no action/value/memories are available.
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    brain_info = BrainInfo([], [], [], agents=["an-agent-id"], local_done=[False])
    assert policy.get_action(brain_info) == ActionInfo(None, None, {})
def test_take_action_returns_action_info_when_available():
    """get_action propagates action, value, and the full outputs dict from evaluate."""
    seed = 3
    policy = TFPolicy(seed, basic_mock_brain(), basic_params())
    eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=eval_out)
    brain_info = BrainInfo([], [], [], agents=["an-agent-id"], local_done=[False])
    assert policy.get_action(brain_info) == ActionInfo(
        eval_out["action"], eval_out["value"], eval_out
    )
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    """
    Convert a BatchedStepResult (new API) into the legacy BrainInfo format.

    Observations of rank 2 are treated as vector observations and concatenated;
    observations of rank 4 are treated as visual observations. Anything else is
    rejected.

    :param step_result: Batched step data for all agents in the group.
    :param group_spec: Spec describing the group's action space.
    :param agent_id_prefix: Optional prefix baked into the string agent ids.
    :return: The equivalent BrainInfo.
    """
    num_agents = step_result.n_agents()
    # Partition observation indices by rank: 2 -> vector, 4 -> visual.
    vector_idx = []
    visual_idx = []
    for obs_index, obs in enumerate(step_result.obs):
        rank = len(obs.shape)
        if rank == 2:
            vector_idx.append(obs_index)
        elif rank == 4:
            visual_idx.append(obs_index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image"
            )
    if vector_idx:
        vec_obs = np.concatenate([step_result.obs[i] for i in vector_idx], axis=1)
    else:
        vec_obs = np.zeros((num_agents, 0), dtype=np.float32)
    vis_obs = [step_result.obs[i] for i in visual_idx]
    if group_spec.is_action_discrete():
        mask = np.ones(
            (num_agents, np.sum(group_spec.discrete_action_branches)),
            dtype=np.float32,
        )
        if step_result.action_mask is not None:
            # The incoming mask marks disallowed actions; BrainInfo expects allowed.
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    else:
        mask = np.ones((num_agents, np.sum(group_spec.action_size)), dtype=np.float32)
    if agent_id_prefix is None:
        agent_ids = [str(raw_id) for raw_id in step_result.agent_id]
    else:
        agent_ids = [f"${agent_id_prefix}-{raw_id}" for raw_id in step_result.agent_id]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
    """
    Constructs a BrainInfo which contains the most recent previous experiences for all
    agents which correspond to the agents in a provided next_info.
    :BrainInfo next_info: A t+1 BrainInfo.
    :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
    """
    num_vis_obs = len(next_info.visual_observations)
    visual_observations: List[List[Any]] = [
        [] for _ in range(num_vis_obs)
    ]  # TODO add types to brain.py methods
    vector_observations = []
    rewards = []
    local_dones = []
    max_reacheds = []
    agents = []
    action_masks = []
    for agent_id in next_info.agents:
        # Use the last stored BrainInfo for this agent; fall back to next_info
        # if nothing has been recorded yet.
        source_info = self.processing_buffer[agent_id].last_brain_info
        if source_info is None:
            source_info = next_info
        row = source_info.agents.index(agent_id)
        for vis_index in range(num_vis_obs):
            visual_observations[vis_index].append(
                source_info.visual_observations[vis_index][row]
            )
        vector_observations.append(source_info.vector_observations[row])
        rewards.append(source_info.rewards[row])
        local_dones.append(source_info.local_done[row])
        max_reacheds.append(source_info.max_reached[row])
        agents.append(source_info.agents[row])
        action_masks.append(source_info.action_masks[row])
    return BrainInfo(
        visual_observations,
        vector_observations,
        rewards,
        agents,
        local_dones,
        max_reacheds,
        action_masks,
    )
def test_take_action_returns_empty_with_no_agents():
    """get_action yields an empty ActionInfo when the BrainInfo contains no agents."""
    seed = 3
    policy = TFPolicy(seed, basic_mock_brain(), basic_params())
    empty_brain_info = BrainInfo([], [], [], agents=[])
    assert policy.get_action(empty_brain_info) == ActionInfo([], [], None)