def on_episode_end(
    self,
    *,
    worker: "RolloutWorker",
    base_env: BaseEnv,
    policies: Dict[PolicyID, Policy],
    episode: Episode,
    env_index: Optional[int] = None,
    **kwargs,
) -> None:
    # Take a tracemalloc snapshot and log the ten largest allocation
    # sites as custom episode metrics.
    snapshot = tracemalloc.take_snapshot()
    top_stats = snapshot.statistics("lineno")
    for stat in top_stats[:10]:
        count = stat.count
        size = stat.size
        trace = str(stat.traceback)
        episode.custom_metrics[f"tracemalloc/{trace}/size"] = size
        episode.custom_metrics[f"tracemalloc/{trace}/count"] = count

    # Also record the worker process' overall memory usage via psutil.
    # Note: `data` (data segment size) is only available on Linux/BSD.
    process = psutil.Process(os.getpid())
    worker_rss = process.memory_info().rss
    worker_data = process.memory_info().data
    worker_vms = process.memory_info().vms
    episode.custom_metrics["tracemalloc/worker/rss"] = worker_rss
    episode.custom_metrics["tracemalloc/worker/data"] = worker_data
    episode.custom_metrics["tracemalloc/worker/vms"] = worker_vms
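
# Usage sketch (illustrative, not part of the snippet above): the method
# belongs in a DefaultCallbacks subclass, and tracemalloc tracing must be
# started before take_snapshot() returns anything useful. The exact import
# path for DefaultCallbacks depends on your Ray version.
import os
import tracemalloc

import psutil
from ray.rllib.agents.callbacks import DefaultCallbacks


class MemoryTrackingCallbacks(DefaultCallbacks):
    def __init__(self):
        super().__init__()
        # Trace up to 10 frames per allocation so the per-line
        # tracebacks logged in on_episode_end() are informative.
        tracemalloc.start(10)

    # ... on_episode_end() as defined above ...


# Pass the class (not an instance) via the trainer config:
# config = {"callbacks": MemoryTrackingCallbacks, ...}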
def add_init_obs(
    self,
    episode: Episode,
    agent_id: AgentID,
    env_id: EnvID,
    policy_id: PolicyID,
    t: int,
    init_obs: TensorType,
) -> None:
    # Make sure our mappings are up to date.
    agent_key = (episode.episode_id, agent_id)
    self.agent_key_to_policy_id[agent_key] = policy_id
    policy = self.policy_map[policy_id]

    # Add initial obs to Trajectory.
    assert agent_key not in self.agent_collectors
    # TODO: determine exact shift-before based on the view-req shifts.
    self.agent_collectors[agent_key] = _AgentCollector(
        policy.view_requirements, policy)
    self.agent_collectors[agent_key].add_init_obs(
        episode_id=episode.episode_id,
        agent_index=episode._agent_index(agent_id),
        env_id=env_id,
        t=t,
        init_obs=init_obs,
    )

    self.episodes[episode.episode_id] = episode
    if episode.batch_builder is None:
        episode.batch_builder = (self.policy_collector_groups.pop()
                                 if self.policy_collector_groups else
                                 _PolicyCollectorGroup(self.policy_map))

    self._add_to_next_inference_call(agent_key)
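
# Illustrative sketch of the batch_builder pooling above (toy stand-in
# types, not RLlib's): finished episodes return their collector group to
# a free list, and new episodes pop() from that list instead of
# allocating a fresh _PolicyCollectorGroup.
pool = []


def get_group():
    return pool.pop() if pool else {"env_steps": 0}  # stand-in group


g = get_group()          # pool empty -> a fresh group is created
pool.append(g)           # episode done -> group is returned to the pool
assert get_group() is g  # the next episode reuses the same group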
def compute_actions_from_input_dict(self,
                                    input_dict,
                                    explore=None,
                                    timestep=None,
                                    episodes=None,
                                    **kwargs):
    obs_batch = input_dict["obs"]
    # In the policy-loss initialization phase, no episodes are passed in.
    if episodes is not None:
        # Pretend we did a model-based rollout and want to return
        # the extra trajectory.
        env_id = episodes[0].env_id
        fake_eps = Episode(
            episodes[0].policy_map,
            episodes[0].policy_mapping_fn,
            lambda: None,
            lambda x: None,
            env_id,
        )
        builder = get_global_worker().sampler.sample_collector
        agent_id = "extra_0"
        policy_id = "p1"  # use p1 so we can easily check it
        builder.add_init_obs(fake_eps, agent_id, env_id, policy_id, -1,
                             obs_batch[0])
        for t in range(4):
            builder.add_action_reward_next_obs(
                episode_id=fake_eps.episode_id,
                agent_id=agent_id,
                env_id=env_id,
                policy_id=policy_id,
                agent_done=t == 3,
                values=dict(
                    t=t,
                    actions=0,
                    rewards=0,
                    dones=t == 3,
                    infos={},
                    new_obs=obs_batch[0],
                ),
            )
        batch = builder.postprocess_episode(episode=fake_eps, build=True)
        episodes[0].add_extra_batch(batch)

    # Just return zeros for actions.
    return [0] * len(obs_batch), [], {}
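
# A toy check of the trajectory bookkeeping above (illustrative, toy
# values only): one add_init_obs() call at t=-1 followed by N
# add_action_reward_next_obs() calls for t=0..N-1 yields an N-step extra
# batch whose final step is flagged done. With N=4, as in the snippet:
N = 4
dones = [t == N - 1 for t in range(N)]
assert dones == [False, False, False, True]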
def postprocess_episode(
    self,
    episode: Episode,
    is_done: bool = False,
    check_dones: bool = False,
    build: bool = False,
) -> Union[None, SampleBatch, MultiAgentBatch]:
    episode_id = episode.episode_id
    policy_collector_group = episode.batch_builder

    # TODO: (sven) Once we implement multi-agent communication channels,
    #  we have to resolve the restriction of only sending other agent
    #  batches from the same policy to the postprocess methods.

    # Build SampleBatches for the given episode.
    pre_batches = {}
    for (eps_id, agent_id), collector in self.agent_collectors.items():
        # Build only if there is data and the agent is part of the
        # given episode.
        if collector.agent_steps == 0 or eps_id != episode_id:
            continue
        pid = self.agent_key_to_policy_id[(eps_id, agent_id)]
        policy = self.policy_map[pid]
        pre_batch = collector.build(policy.view_requirements)
        pre_batches[agent_id] = (policy, pre_batch)

    # Apply reward clipping before calling postprocessing functions.
    if self.clip_rewards is True:
        for _, (_, pre_batch) in pre_batches.items():
            pre_batch["rewards"] = np.sign(pre_batch["rewards"])
    elif self.clip_rewards:
        for _, (_, pre_batch) in pre_batches.items():
            pre_batch["rewards"] = np.clip(
                pre_batch["rewards"],
                a_min=-self.clip_rewards,
                a_max=self.clip_rewards,
            )

    post_batches = {}
    for agent_id, (_, pre_batch) in pre_batches.items():
        # The entire episode is said to be done. Error if there is no
        # DONE at the end of this agent's trajectory.
        if is_done and check_dones and not pre_batch[
                SampleBatch.DONES][-1]:
            raise ValueError(
                "Episode {} terminated for all agents, but we still "
                "don't have a last observation for agent {} (policy "
                "{}). ".format(
                    episode_id, agent_id,
                    self.agent_key_to_policy_id[(episode_id, agent_id)]) +
                "Please ensure that you include the last observations "
                "of all live agents when setting done[__all__] to "
                "True. Alternatively, set no_done_at_end=True to "
                "allow this.")

        # Skip a trajectory's postprocessing (and thus skip using it for
        # training) if the agent's last info dict exists and contains the
        # training_enabled=False setting (used by our PolicyClients).
        last_info = episode.last_info_for(agent_id)
        if last_info and not last_info.get("training_enabled", True):
            continue

        if len(pre_batches) > 1:
            other_batches = pre_batches.copy()
            del other_batches[agent_id]
        else:
            other_batches = {}

        pid = self.agent_key_to_policy_id[(episode_id, agent_id)]
        policy = self.policy_map[pid]
        if (any(pre_batch[SampleBatch.DONES][:-1])
                or len(set(pre_batch[SampleBatch.EPS_ID])) > 1):
            raise ValueError(
                "Batches sent to postprocessing must only contain steps "
                "from a single trajectory.", pre_batch)

        # Call the Policy's Exploration's postprocess method.
        post_batches[agent_id] = pre_batch
        if getattr(policy, "exploration", None) is not None:
            policy.exploration.postprocess_trajectory(
                policy, post_batches[agent_id], policy.get_session())
        post_batches[agent_id].set_get_interceptor(None)
        post_batches[agent_id] = policy.postprocess_trajectory(
            post_batches[agent_id], other_batches, episode)

    if log_once("after_post"):
        logger.info(
            "Trajectory fragment after postprocess_trajectory():\n\n{}\n"
            .format(summarize(post_batches)))

    # Append into policy batches and reset.
    from ray.rllib.evaluation.rollout_worker import get_global_worker

    for agent_id, post_batch in sorted(post_batches.items()):
        agent_key = (episode_id, agent_id)
        pid = self.agent_key_to_policy_id[agent_key]
        policy = self.policy_map[pid]

        self.callbacks.on_postprocess_trajectory(
            worker=get_global_worker(),
            episode=episode,
            agent_id=agent_id,
            policy_id=pid,
            policies=self.policy_map,
            postprocessed_batch=post_batch,
            original_batches=pre_batches,
        )

        # Add the postprocessed SampleBatch to the policy collectors for
        # training. PID may be a newly added policy; just confirm we have
        # it in our policy map before adding a new _PolicyCollector() to
        # the group.
        if pid not in policy_collector_group.policy_collectors:
            assert pid in self.policy_map
            policy_collector_group.policy_collectors[pid] = \
                _PolicyCollector(policy)
        policy_collector_group.policy_collectors[
            pid].add_postprocessed_batch_for_training(
                post_batch, policy.view_requirements)

        if is_done:
            del self.agent_key_to_policy_id[agent_key]
            del self.agent_collectors[agent_key]

    if policy_collector_group:
        env_steps = self.episode_steps[episode_id]
        policy_collector_group.env_steps += env_steps
        agent_steps = self.agent_steps[episode_id]
        policy_collector_group.agent_steps += agent_steps

    if is_done:
        del self.episode_steps[episode_id]
        del self.agent_steps[episode_id]
        del self.episodes[episode_id]
        # Make the PolicyCollectorGroup available for more agent batches
        # in other episodes. Do not reset its counts to 0.
        if policy_collector_group:
            self.policy_collector_groups.append(policy_collector_group)
    else:
        self.episode_steps[episode_id] = self.agent_steps[episode_id] = 0

    # Build a MultiAgentBatch from the episode and return it.
    if build:
        return self._build_multi_agent_batch(episode)
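
# Illustrative numeric check of the reward-clipping branch in
# postprocess_episode() above: clip_rewards=True maps each reward to its
# sign, while a float value clips rewards into [-value, +value].
import numpy as np

rewards = np.array([-3.0, -0.2, 0.0, 0.7, 12.5])
assert (np.sign(rewards) == np.array([-1.0, -1.0, 0.0, 1.0, 1.0])).all()
assert (np.clip(rewards, a_min=-1.0, a_max=1.0)
        == np.array([-1.0, -0.2, 0.0, 0.7, 1.0])).all()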