def test_agent_deletion():
    """Check that AgentProcessor keeps no per-agent state once agents are done.

    Three episodes are played, each under a different worker id so that every
    episode looks like it belongs to a different agent.
    """
    policy = create_mock_policy()
    trajectory_queue = mock.Mock()
    behavior_name = "test_brain_name"
    processor = AgentProcessor(
        policy,
        behavior_name,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }

    def make_step(**extra):
        # Both mock steps share the same layout; only `done` differs.
        return mb.create_mock_batchedstep(
            num_agents=1,
            num_vector_observations=8,
            action_shape=[2],
            num_vis_observations=0,
            **extra,
        )

    mock_step = make_step()
    mock_done_step = make_step(done=True)
    action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=action_outputs,
        agent_ids=mock_step.agent_id,
    )

    processor.publish_trajectory_queue(trajectory_queue)
    # Equivalent to the very first step received after an environment reset.
    processor.add_experiences(mock_step, 0, ActionInfo.empty())

    # One episode per worker id: five regular steps followed by a done step.
    expected_saves = []
    expected_removals = []
    for worker_id in range(3):
        for _ in range(5):
            processor.add_experiences(mock_step, worker_id, action_info)
            expected_saves.append(
                mock.call([get_global_agent_id(worker_id, 0)], [0.1])
            )
        processor.add_experiences(mock_done_step, worker_id, action_info)
        # The finished agent's previous action is expected to be removed.
        expected_removals.append(mock.call([get_global_agent_id(worker_id, 0)]))

    policy.save_previous_action.assert_has_calls(expected_saves)
    policy.remove_previous_action.assert_has_calls(expected_removals)

    # Every per-agent container must be empty again.
    for per_agent_state in (
        processor.experience_buffers,
        processor.last_take_action_outputs,
        processor.episode_steps,
        processor.episode_rewards,
    ):
        assert len(per_agent_state.keys()) == 0
def add_experiences(
    self,
    decision_steps: DecisionSteps,
    terminal_steps: TerminalSteps,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Feeds the latest terminal and decision steps into each agent's history.
    :param decision_steps: current DecisionSteps.
    :param terminal_steps: current TerminalSteps.
    :param worker_id: id of the environment worker the steps came from; it is
        combined with each local agent id to form a globally unique agent id.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    policy_outputs = previous_action.outputs
    if policy_outputs:
        for entropy_value in policy_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", entropy_value)

    # Local agent ids are qualified with the worker id so they are unique
    # across workers.
    acting_agent_ids = [
        get_global_agent_id(worker_id, local_id)
        for local_id in previous_action.agent_ids
    ]
    for agent_gid in acting_agent_ids:
        if agent_gid not in self.last_step_result:
            # Don't store if agent just reset
            continue
        self.last_take_action_outputs[agent_gid] = policy_outputs

    # Terminal steps are processed first, then the ongoing decision steps.
    for step_batch in (terminal_steps, decision_steps):
        for agent_step in step_batch.values():
            local_id = agent_step.agent_id
            self._process_step(
                agent_step,
                get_global_agent_id(worker_id, local_id),
                step_batch.agent_id_to_index[local_id],
            )

    for agent_gid in acting_agent_ids:
        # An id without a last step result belongs to an agent that just
        # reset, so its action is not stored.
        # NOTE(review): the whole "action" batch is handed over together with
        # a single id - confirm save_previous_action picks that agent's row.
        if agent_gid in self.last_step_result and "action" in policy_outputs:
            self.policy.save_previous_action(
                [agent_gid], policy_outputs["action"]
            )
def get_action(
    self, batched_step_result: BatchedStepResult, worker_id: int = 0
) -> ActionInfo:
    """
    Evaluates the policy on a batched step and returns the chosen actions.
    :param batched_step_result: BatchedStepResult from the environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the BatchedStepResult came from. Used to
        construct a globally unique id for each agent.
    :return: an ActionInfo containing action, values, the raw evaluation
        outputs and the (local) agent ids; ActionInfo.empty() when the step
        contains no agents.
    """
    if batched_step_result.n_agents() == 0:
        return ActionInfo.empty()

    # For 1-D array, the iterator order is correct.
    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in batched_step_result.agent_id
    ]

    # Memories are saved under the global ids (see save_memories below), so
    # the state of finished agents must be dropped under those same ids;
    # raw local ids would not match what was stored.
    # NOTE(review): previous actions are presumably keyed the same way -
    # confirm against save_previous_action's callers.
    agents_done = [
        global_id
        for global_id, done in zip(global_agent_ids, batched_step_result.done)
        if done
    ]
    self.remove_memories(agents_done)
    self.remove_previous_action(agents_done)

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        batched_step_result, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    return ActionInfo(
        action=run_out.get("action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=batched_step_result.agent_id,
    )
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Evaluates the policy for every agent that requested a decision.
    :param decision_requests: DecisionSteps from the environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the DecisionSteps came from. Used to construct
        a globally unique id for each agent.
    :return: an ActionInfo holding the action, the value, the full evaluation
        output and the agent ids of the request; ActionInfo.empty() when
        there are no decision requests.
    """
    if not len(decision_requests):
        return ActionInfo.empty()

    # For 1-D array, the iterator order is correct.
    global_agent_ids = []
    for agent_id in decision_requests.agent_id:
        global_agent_ids.append(get_global_agent_id(worker_id, int(agent_id)))

    eval_result = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    # Memories are stored under the globally unique ids.
    self.save_memories(global_agent_ids, eval_result.get("memory_out"))

    chosen_action = eval_result.get("action")
    value_estimate = eval_result.get("value")
    return ActionInfo(
        action=chosen_action,
        value=value_estimate,
        outputs=eval_result,
        agent_ids=decision_requests.agent_id,
    )
def add_experiences(
    self,
    batched_step_result: BatchedStepResult,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.
    :param batched_step_result: current BatchedStepResult.
    :param worker_id: id of the environment worker that produced the step. It
        is combined with each local agent id to build an id that is unique
        across workers.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)
        self.stats_reporter.add_stat(
            "Policy/Learning Rate", take_action_outputs["learning_rate"]
        )

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        self.last_take_action_outputs[global_id] = take_action_outputs

    for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
        # Needed for mypy to pass since ndarray has no content type
        local_id = int(_id)
        curr_agent_step = batched_step_result.get_agent_step_result(local_id)
        global_id = get_global_agent_id(worker_id, local_id)
        stored_step = self.last_step_result.get(global_id, None)
        stored_take_action_outputs = self.last_take_action_outputs.get(
            global_id, None
        )
        if stored_step is not None and stored_take_action_outputs is not None:
            # We know the step is from the same worker, so use the local agent id.
            stored_agent_step = stored_step.get_agent_step_result(local_id)
            idx = stored_step.agent_id_to_index[local_id]
            obs = stored_agent_step.obs
            if not stored_agent_step.done:
                if self.policy.use_recurrent:
                    memory = self.policy.retrieve_memories([global_id])[0, :]
                else:
                    memory = None

                done = curr_agent_step.done
                max_step = curr_agent_step.max_step

                # Add the outputs of the last eval
                action = stored_take_action_outputs["action"][idx]
                if self.policy.use_continuous_act:
                    action_pre = stored_take_action_outputs["pre_action"][idx]
                else:
                    action_pre = None
                action_probs = stored_take_action_outputs["log_probs"][idx]
                action_mask = stored_agent_step.action_mask
                prev_action = self.policy.retrieve_previous_action([global_id])[
                    0, :
                ]

                experience = AgentExperience(
                    obs=obs,
                    reward=curr_agent_step.reward,
                    done=done,
                    action=action,
                    action_probs=action_probs,
                    action_pre=action_pre,
                    action_mask=action_mask,
                    prev_action=prev_action,
                    max_step=max_step,
                    memory=memory,
                )
                # Add the value outputs if needed
                self.experience_buffers[global_id].append(experience)
                self.episode_rewards[global_id] += curr_agent_step.reward

            # Flush the buffered steps as a trajectory when the agent is done
            # or the buffer reached max_trajectory_length (and is non-empty).
            if (
                curr_agent_step.done
                or (
                    len(self.experience_buffers[global_id])
                    >= self.max_trajectory_length
                )
            ) and len(self.experience_buffers[global_id]) > 0:
                # Make next AgentExperience
                next_obs = curr_agent_step.obs
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
                if curr_agent_step.done:
                    self.stats_reporter.add_stat(
                        "Environment/Cumulative Reward",
                        self.episode_rewards.get(global_id, 0),
                    )
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(global_id, 0),
                    )
                    # pop() instead of del: within this method episode_steps
                    # is only written by the increment below, which is skipped
                    # whenever a trajectory is flushed. An agent that finishes
                    # on its first recorded step therefore has no entry, and
                    # the .get() above does not create one.
                    self.episode_steps.pop(global_id, None)
                    self.episode_rewards.pop(global_id, None)
            elif not curr_agent_step.done:
                self.episode_steps[global_id] += 1

        self.last_step_result[global_id] = batched_step_result

    if "action" in take_action_outputs:
        # Previous actions are read back with the global id
        # (retrieve_previous_action above), so they are saved under the
        # global ids too; local ids would never be found again.
        self.policy.save_previous_action(
            action_global_agent_ids, take_action_outputs["action"]
        )
def add_experiences(
    self,
    batched_step_result: BatchedStepResult,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Folds the newest batched step into every agent's experience history.
    :param batched_step_result: current BatchedStepResult.
    :param worker_id: id of the environment worker the step came from; it is
        combined with each local agent id to form a globally unique agent id.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    policy_outputs = previous_action.outputs
    if policy_outputs:
        for entropy_value in policy_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", entropy_value)

    # Local agent ids are qualified with the worker id so they are unique
    # across workers.
    acting_agent_ids = [
        get_global_agent_id(worker_id, local_agent_id)
        for local_agent_id in previous_action.agent_ids
    ]
    for agent_gid in acting_agent_ids:
        if agent_gid not in self.last_step_result:
            # Don't store if agent just reset
            continue
        self.last_take_action_outputs[agent_gid] = policy_outputs

    for raw_id in batched_step_result.agent_id:  # Assume agent_id is 1-D
        # Needed for mypy to pass since ndarray has no content type
        local_id = int(raw_id)
        current_step = batched_step_result.get_agent_step_result(local_id)
        global_id = get_global_agent_id(worker_id, local_id)
        previous_step, row = self.last_step_result.get(global_id, (None, None))
        previous_outputs = self.last_take_action_outputs.get(global_id, None)
        is_done = current_step.done

        if previous_step is not None and previous_outputs is not None:
            if not previous_step.done:
                memory = (
                    self.policy.retrieve_memories([global_id])[0, :]
                    if self.policy.use_recurrent
                    else None
                )
                # Outputs of the last evaluation, for this agent's row.
                action = previous_outputs["action"][row]
                action_pre = (
                    previous_outputs["pre_action"][row]
                    if self.policy.use_continuous_act
                    else None
                )
                action_probs = previous_outputs["log_probs"][row]
                prev_action = self.policy.retrieve_previous_action([global_id])[
                    0, :
                ]
                self.experience_buffers[global_id].append(
                    AgentExperience(
                        obs=previous_step.obs,
                        reward=current_step.reward,
                        done=is_done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=previous_step.action_mask,
                        prev_action=prev_action,
                        max_step=current_step.max_step,
                        memory=memory,
                    )
                )
                self.episode_rewards[global_id] += current_step.reward

            buffered = len(self.experience_buffers[global_id])
            if buffered > 0 and (
                is_done or buffered >= self.max_trajectory_length
            ):
                # Emit the buffered steps as one trajectory; the current
                # observation becomes its next_obs.
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=current_step.obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
                if is_done:
                    # Record episode length for agents which have had at least
                    # 1 step. Done after reset ignored.
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(global_id, 0),
                    )
            elif not is_done:
                self.episode_steps[global_id] += 1

        # Index is needed to grab from last_take_action_outputs
        self.last_step_result[global_id] = (
            current_step,
            batched_step_result.agent_id_to_index[raw_id],
        )
        # Delete all done agents, regardless of if they had a 0-length episode.
        if is_done:
            self._clean_agent_data(global_id)

    for agent_gid in acting_agent_ids:
        # An id without a last step result belongs to an agent that just
        # reset, so its action is not stored.
        # NOTE(review): the whole "action" batch is handed over together with
        # a single id - confirm save_previous_action picks that agent's row.
        if agent_gid in self.last_step_result and "action" in policy_outputs:
            self.policy.save_previous_action(
                [agent_gid], policy_outputs["action"]
            )