def _env_runner(
        worker: "RolloutWorker",
        base_env: BaseEnv,
        extra_batch_callback: Callable[[SampleBatchType], None],
        policies: Dict[PolicyID, Policy],
        policy_mapping_fn: Callable[[AgentID], PolicyID],
        rollout_fragment_length: int,
        horizon: int,
        preprocessors: Dict[PolicyID, Preprocessor],
        obs_filters: Dict[PolicyID, Filter],
        clip_rewards: bool,
        clip_actions: bool,
        multiple_episodes_in_batch: bool,
        callbacks: "DefaultCallbacks",
        tf_sess: Optional["tf.Session"],
        perf_stats: _PerfStats,
        soft_horizon: bool,
        no_done_at_end: bool,
        observation_fn: "ObservationFunction",
        sample_collector: Optional[SampleCollector] = None,
        render: Optional[bool] = None,
) -> Iterable[SampleBatchType]:
    """This implements the common experience collection logic.

    The function is a generator that loops forever: it polls `base_env`,
    processes the returned observations, runs batched policy evaluation,
    sends the resulting actions back to the env and (optionally) renders.
    Everything returned in the `outputs` list of `_process_observations`
    is yielded to the caller.

    Args:
        worker (RolloutWorker): Reference to the current rollout worker.
        base_env (BaseEnv): Env implementing BaseEnv.
        extra_batch_callback (fn): function to send extra batch data to.
        policies (Dict[PolicyID, Policy]): Map of policy ids to Policy
            instances.
        policy_mapping_fn (func): Function that maps agent ids to policy ids.
            This is called when an agent first enters the environment. The
            agent is then "bound" to the returned policy for the episode.
        rollout_fragment_length (int): Number of episode steps before
            `SampleBatch` is yielded. Set to infinity to yield complete
            episodes. Not read directly inside this function.
        horizon (int): Horizon of the episode. If falsy, the env's own
            `spec.max_episode_steps` is used when available, else infinity.
        preprocessors (dict): Map of policy id to preprocessor for the
            observations prior to filtering.
        obs_filters (dict): Map of policy id to filter used to process
            observations for the policy.
        clip_rewards (bool): Whether to clip rewards before postprocessing.
            Not read directly inside this function.
        clip_actions (bool): Whether to clip actions to the space range.
        multiple_episodes_in_batch (bool): Whether to pack multiple
            episodes into each batch. This guarantees batches will be exactly
            `rollout_fragment_length` in size.
        callbacks (DefaultCallbacks): User callbacks to run on episode events.
        tf_sess (Session|None): Optional tensorflow session to use for batching
            TF policy evaluations.
        perf_stats (_PerfStats): Record perf stats into this object.
        soft_horizon (bool): Calculate rewards but don't reset the
            environment when the horizon is hit.
        no_done_at_end (bool): Ignore the done=True at the end of the episode
            and instead record done=False.
        observation_fn (ObservationFunction): Optional multi-agent
            observation func to use for preprocessing observations.
        sample_collector (Optional[SampleCollector]): An optional
            SampleCollector object to use.
        render (Optional[bool]): Whether to try to render the environment
            after each step. `None` (the default) behaves like `False`.

    Yields:
        rollout (SampleBatch): Object containing state, action, reward,
            terminal condition, and other fields as dictated by `policy`.

    Raises:
        ValueError: If `horizon` exceeds the env's own step limit and that
            limit cannot be overridden. The underlying error is attached as
            the exception's cause.
    """

    # May be populated with a viewer used for image rendering.
    simple_image_viewer: Optional["SimpleImageViewer"] = None

    # Try to get Env's `max_episode_steps` prop. If it doesn't exist, ignore
    # error and continue with max_episode_steps=None.
    max_episode_steps = None
    try:
        max_episode_steps = base_env.get_unwrapped()[0].spec.max_episode_steps
    except Exception:
        # Deliberate best-effort: envs without a `spec` simply have no limit.
        pass

    # Trainer has a given `horizon` setting.
    if horizon:
        # `horizon` is larger than env's limit.
        if max_episode_steps and horizon > max_episode_steps:
            # Try to override the env's own max-step setting with our horizon.
            # If this won't work, throw an error.
            try:
                base_env.get_unwrapped()[0].spec.max_episode_steps = horizon
                base_env.get_unwrapped()[0]._max_episode_steps = horizon
            except Exception as err:
                # Chain explicitly so the real reason the limit could not be
                # set shows up as the direct cause in the traceback.
                raise ValueError(
                    "Your `horizon` setting ({}) is larger than the Env's own "
                    "timestep limit ({}), which seems to be unsettable! Try "
                    "to increase the Env's built-in limit to be at least as "
                    "large as your wanted `horizon`.".format(
                        horizon, max_episode_steps)) from err
    # Otherwise, set Trainer's horizon to env's max-steps.
    elif max_episode_steps:
        horizon = max_episode_steps
        logger.debug(
            "No episode horizon specified, setting it to Env's limit ({}).".
            format(max_episode_steps))
    # No horizon/max_episode_steps -> Episodes may be infinitely long.
    else:
        horizon = float("inf")
        logger.debug("No episode horizon specified, assuming inf.")

    # Pool of batch builders, which can be shared across episodes to pack
    # trajectory data.
    batch_builder_pool: List[MultiAgentSampleBatchBuilder] = []

    def get_batch_builder():
        # Hands out a pooled builder if one is available, else None.
        if batch_builder_pool:
            return batch_builder_pool.pop()
        else:
            return None

    def new_episode(env_id):
        # Factory used by `active_episodes` to create the episode object
        # for `env_id` and fire the episode-start hooks.
        episode = MultiAgentEpisode(
            policies,
            policy_mapping_fn,
            get_batch_builder,
            extra_batch_callback,
            env_id=env_id)
        # Call each policy's Exploration.on_episode_start method.
        # type: Policy
        for p in policies.values():
            if getattr(p, "exploration", None) is not None:
                p.exploration.on_episode_start(
                    policy=p,
                    environment=base_env,
                    episode=episode,
                    tf_sess=getattr(p, "_sess", None))
        callbacks.on_episode_start(
            worker=worker,
            base_env=base_env,
            policies=policies,
            episode=episode,
            env_index=env_id,
        )
        return episode

    active_episodes: Dict[str, MultiAgentEpisode] = \
        NewEpisodeDefaultDict(new_episode)

    while True:
        perf_stats.iters += 1
        t0 = time.time()
        # Get observations from all ready agents.
        # type: MultiEnvDict, MultiEnvDict, MultiEnvDict, MultiEnvDict, ...
        unfiltered_obs, rewards, dones, infos, off_policy_actions = \
            base_env.poll()
        perf_stats.env_wait_time += time.time() - t0

        if log_once("env_returns"):
            logger.info("Raw obs from env: {}".format(
                summarize(unfiltered_obs)))
            logger.info("Info return from env: {}".format(summarize(infos)))

        # Process observations and prepare for policy evaluation.
        t1 = time.time()
        # type: Set[EnvID], Dict[PolicyID, List[PolicyEvalData]],
        #       List[Union[RolloutMetrics, SampleBatchType]]
        active_envs, to_eval, outputs = \
            _process_observations(
                worker=worker,
                base_env=base_env,
                policies=policies,
                active_episodes=active_episodes,
                unfiltered_obs=unfiltered_obs,
                rewards=rewards,
                dones=dones,
                infos=infos,
                horizon=horizon,
                preprocessors=preprocessors,
                obs_filters=obs_filters,
                multiple_episodes_in_batch=multiple_episodes_in_batch,
                callbacks=callbacks,
                soft_horizon=soft_horizon,
                no_done_at_end=no_done_at_end,
                observation_fn=observation_fn,
                sample_collector=sample_collector,
            )
        perf_stats.raw_obs_processing_time += time.time() - t1
        for o in outputs:
            yield o

        # Do batched policy eval (across vectorized envs).
        t2 = time.time()
        # type: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]]
        eval_results = _do_policy_eval(
            to_eval=to_eval,
            policies=policies,
            sample_collector=sample_collector,
            active_episodes=active_episodes,
            tf_sess=tf_sess,
        )
        perf_stats.inference_time += time.time() - t2

        # Process results and update episode state.
        t3 = time.time()
        actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
            _process_policy_eval_results(
                to_eval=to_eval,
                eval_results=eval_results,
                active_episodes=active_episodes,
                active_envs=active_envs,
                off_policy_actions=off_policy_actions,
                policies=policies,
                clip_actions=clip_actions,
            )
        perf_stats.action_processing_time += time.time() - t3

        # Return computed actions to ready envs. We also send to envs that have
        # taken off-policy actions; those envs are free to ignore the action.
        t4 = time.time()
        base_env.send_actions(actions_to_send)
        perf_stats.env_wait_time += time.time() - t4

        # Try to render the env, if required.
        if render:
            t5 = time.time()
            # Render can either return an RGB image (uint8 [w x h x 3] numpy
            # array) or take care of rendering itself (returning True).
            rendered = base_env.try_render()
            # Rendering returned an image -> Display it in a SimpleImageViewer.
            if isinstance(rendered, np.ndarray) and len(rendered.shape) == 3:
                # ImageViewer not defined yet, try to create one.
                if simple_image_viewer is None:
                    try:
                        from gym.envs.classic_control.rendering import \
                            SimpleImageViewer
                        simple_image_viewer = SimpleImageViewer()
                    # `ModuleNotFoundError` is a subclass of `ImportError`,
                    # so catching `ImportError` covers both.
                    except ImportError:
                        render = False  # disable rendering
                        logger.warning(
                            "Could not import gym.envs.classic_control."
                            "rendering! Try `pip install gym[all]`.")
                if simple_image_viewer is not None:
                    simple_image_viewer.imshow(rendered)
            perf_stats.env_render_time += time.time() - t5
def _env_runner(worker: "RolloutWorker",
                base_env: BaseEnv,
                extra_batch_callback: Callable[[SampleBatchType], None],
                policies: Dict[PolicyID, Policy],
                policy_mapping_fn: Callable[[AgentID], PolicyID],
                rollout_fragment_length: int,
                horizon: int,
                preprocessors: Dict[PolicyID, Preprocessor],
                obs_filters: Dict[PolicyID, Filter],
                clip_rewards: bool,
                clip_actions: bool,
                pack_multiple_episodes_in_batch: bool,
                callbacks: "DefaultCallbacks",
                tf_sess: Optional["tf.Session"],
                perf_stats: _PerfStats,
                soft_horizon: bool,
                no_done_at_end: bool,
                observation_fn: "ObservationFunction",
                _use_trajectory_view_api: bool = False
                ) -> Iterable[SampleBatchType]:
    """Run the shared experience-collection loop as a generator.

    Each iteration polls `base_env`, hands the results to
    `_process_observations`, yields every item of the returned `outputs`,
    evaluates the policies in one batch, post-processes the results into
    actions and sends those actions back to the env. The loop never
    terminates on its own.

    Args:
        worker (RolloutWorker): The rollout worker this runner belongs to.
        base_env (BaseEnv): The BaseEnv that is polled and stepped.
        extra_batch_callback (fn): Callable receiving extra batch data;
            passed on to every new `MultiAgentEpisode`.
        policies (Dict[PolicyID, Policy]): Policy instances keyed by
            policy id.
        policy_mapping_fn (func): Maps an agent id to a policy id. It is
            invoked when an agent first enters the environment; the agent
            stays "bound" to that policy for the rest of the episode.
        rollout_fragment_length (int): Number of episode steps collected
            before a `SampleBatch` is yielded. Use infinity to yield
            complete episodes.
        horizon (int): Episode horizon. When falsy, the env's
            `spec.max_episode_steps` is used if available, else infinity.
        preprocessors (dict): Per-policy preprocessors applied to
            observations before filtering.
        obs_filters (dict): Per-policy filters applied to observations.
        clip_rewards (bool): Whether rewards are clipped before
            postprocessing; passed to newly created batch builders.
        clip_actions (bool): Whether actions are clipped to the space
            range.
        pack_multiple_episodes_in_batch (bool): Whether several episodes
            are packed into each batch, which guarantees batches of
            exactly `rollout_fragment_length` steps.
        callbacks (DefaultCallbacks): User callbacks fired on episode
            events.
        tf_sess (Session|None): Optional tensorflow session used to batch
            TF policy evaluations.
        perf_stats (_PerfStats): Object the timing counters are written to.
        soft_horizon (bool): Compute rewards but do not reset the
            environment when the horizon is reached.
        no_done_at_end (bool): Record done=False instead of the done=True
            at the end of an episode.
        observation_fn (ObservationFunction): Optional multi-agent
            observation function used to preprocess observations.
        _use_trajectory_view_api (bool): Whether to use the (experimental)
            `_use_trajectory_view_api` to make generic trajectory views
            available to Models. Default: False.

    Yields:
        rollout (SampleBatch): Object containing state, action, reward,
            terminal condition, and other fields as dictated by `policy`.

    Raises:
        ValueError: If `horizon` is larger than the env's own
            `spec.max_episode_steps`.
    """

    # Best-effort lookup of the env's built-in step limit; any failure
    # (e.g. no `spec` on the env) just means "no known limit".
    try:
        max_episode_steps = base_env.get_unwrapped()[0].spec.max_episode_steps
    except Exception:
        max_episode_steps = None

    if not horizon:
        # No Trainer horizon given: fall back to the env's limit, or to an
        # unbounded episode length if the env has none either.
        if max_episode_steps:
            horizon = max_episode_steps
            logger.debug(
                "No episode horizon specified, setting it to Env's limit ({}).".
                format(max_episode_steps))
        else:
            horizon = float("inf")
            logger.debug("No episode horizon specified, assuming inf.")
    elif max_episode_steps and horizon > max_episode_steps:
        # The Trainer's horizon exceeds the env's limit -> error out and
        # explain how to raise the env's own episode limit.
        raise ValueError(
            "Your `horizon` setting ({}) is larger than the Env's own "
            "timestep limit ({})! Try to increase the Env's limit via "
            "setting its `spec.max_episode_steps` property.".format(
                horizon, max_episode_steps))

    # Batch builders are recycled through this pool so that trajectory data
    # of several episodes can be packed together.
    batch_builder_pool: List[MultiAgentSampleBatchBuilder] = []

    def get_batch_builder():
        # Reuse a pooled builder when possible, otherwise create a new one.
        if not batch_builder_pool:
            return MultiAgentSampleBatchBuilder(policies, clip_rewards,
                                                callbacks)
        return batch_builder_pool.pop()

    def new_episode():
        # Default factory for `active_episodes`: builds the episode and
        # fires the exploration and user episode-start hooks.
        episode = MultiAgentEpisode(policies, policy_mapping_fn,
                                    get_batch_builder, extra_batch_callback)
        for policy in policies.values():
            if getattr(policy, "exploration", None) is None:
                continue
            policy.exploration.on_episode_start(
                policy=policy,
                environment=base_env,
                episode=episode,
                tf_sess=getattr(policy, "_sess", None))
        callbacks.on_episode_start(
            worker=worker,
            base_env=base_env,
            policies=policies,
            episode=episode)
        return episode

    active_episodes: Dict[str, MultiAgentEpisode] = defaultdict(new_episode)

    while True:
        perf_stats.iters += 1

        # 1) Collect observations from all ready agents
        #    (five MultiEnvDicts are returned by the poll).
        poll_started = time.time()
        (unfiltered_obs, rewards, dones, infos,
         off_policy_actions) = base_env.poll()
        perf_stats.env_wait_time += time.time() - poll_started

        if log_once("env_returns"):
            logger.info("Raw obs from env: {}".format(
                summarize(unfiltered_obs)))
            logger.info("Info return from env: {}".format(summarize(infos)))

        # 2) Turn the raw env returns into policy-eval inputs. The result is
        #    (Set[EnvID], Dict[PolicyID, List[PolicyEvalData]],
        #     List[Union[RolloutMetrics, SampleBatchType]]).
        obs_started = time.time()
        processed = _process_observations(
            worker=worker,
            base_env=base_env,
            policies=policies,
            batch_builder_pool=batch_builder_pool,
            active_episodes=active_episodes,
            unfiltered_obs=unfiltered_obs,
            rewards=rewards,
            dones=dones,
            infos=infos,
            horizon=horizon,
            preprocessors=preprocessors,
            obs_filters=obs_filters,
            rollout_fragment_length=rollout_fragment_length,
            pack_multiple_episodes_in_batch=pack_multiple_episodes_in_batch,
            callbacks=callbacks,
            soft_horizon=soft_horizon,
            no_done_at_end=no_done_at_end,
            observation_fn=observation_fn,
            _use_trajectory_view_api=_use_trajectory_view_api)
        active_envs, to_eval, outputs = processed
        perf_stats.processing_time += time.time() - obs_started

        # Hand every finished item over to the consumer of this generator.
        for item in outputs:
            yield item

        # 3) Batched policy evaluation (across vectorized envs), giving a
        #    Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]].
        inference_started = time.time()
        eval_results = _do_policy_eval(
            to_eval=to_eval,
            policies=policies,
            active_episodes=active_episodes,
            tf_sess=tf_sess,
            _use_trajectory_view_api=_use_trajectory_view_api)
        perf_stats.inference_time += time.time() - inference_started

        # 4) Convert the eval results into per-env actions and update the
        #    episode state.
        actions_started = time.time()
        actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
            _process_policy_eval_results(
                to_eval=to_eval,
                eval_results=eval_results,
                active_episodes=active_episodes,
                active_envs=active_envs,
                off_policy_actions=off_policy_actions,
                policies=policies,
                clip_actions=clip_actions,
                _use_trajectory_view_api=_use_trajectory_view_api)
        perf_stats.processing_time += time.time() - actions_started

        # 5) Send the computed actions to the ready envs. Envs that took
        #    off-policy actions also get them and are free to ignore them.
        send_started = time.time()
        base_env.send_actions(actions_to_send)
        perf_stats.env_wait_time += time.time() - send_started