def __init__(self, env, policies, policy_mapping_fn, preprocessors,
             obs_filters, clip_rewards, unroll_length, callbacks,
             horizon=None, pack=False, tf_sess=None, clip_actions=True):
    self.async_vector_env = AsyncVectorEnv.wrap_async(env)
    self.unroll_length = unroll_length
    self.horizon = horizon
    self.policies = policies
    self.policy_mapping_fn = policy_mapping_fn
    self.preprocessors = preprocessors
    self.obs_filters = obs_filters
    self.extra_batches = queue.Queue()
    self.rollout_provider = _env_runner(
        self.async_vector_env, self.extra_batches.put, self.policies,
        self.policy_mapping_fn, self.unroll_length, self.horizon,
        self.preprocessors, self.obs_filters, clip_rewards, clip_actions,
        pack, callbacks, tf_sess)
    self.metrics_queue = queue.Queue()
def __init__(self, env, policies, policy_mapping_fn, preprocessors,
             obs_filters, clip_rewards, unroll_length, callbacks,
             horizon=None, pack=False, tf_sess=None, clip_actions=True):
    for _, f in obs_filters.items():
        assert getattr(f, "is_concurrent", False), \
            "Observation Filter must support concurrent updates."
    self.async_vector_env = AsyncVectorEnv.wrap_async(env)
    threading.Thread.__init__(self)
    self.queue = queue.Queue(5)
    self.extra_batches = queue.Queue()
    self.metrics_queue = queue.Queue()
    self.unroll_length = unroll_length
    self.horizon = horizon
    self.policies = policies
    self.policy_mapping_fn = policy_mapping_fn
    self.preprocessors = preprocessors
    self.obs_filters = obs_filters
    self.clip_rewards = clip_rewards
    self.daemon = True
    self.pack = pack
    self.tf_sess = tf_sess
    self.callbacks = callbacks
    self.clip_actions = clip_actions
def __init__(self, env, policies, policy_mapping_fn, obs_filters,
             num_local_steps, horizon=None, pack=False, tf_sess=None):
    for _, f in obs_filters.items():
        assert getattr(f, "is_concurrent", False), \
            "Observation Filter must support concurrent updates."
    self.async_vector_env = AsyncVectorEnv.wrap_async(env)
    threading.Thread.__init__(self)
    self.queue = queue.Queue(5)
    self.extra_batches = queue.Queue()
    self.metrics_queue = queue.Queue()
    self.num_local_steps = num_local_steps
    self.horizon = horizon
    self.policies = policies
    self.policy_mapping_fn = policy_mapping_fn
    self._obs_filters = obs_filters
    self.daemon = True
    self.pack = pack
    self.tf_sess = tf_sess
def __init__(self, env, policies, policy_mapping_fn, obs_filters,
             num_local_steps, horizon=None, pack=False, tf_sess=None):
    self.async_vector_env = AsyncVectorEnv.wrap_async(env)
    self.num_local_steps = num_local_steps
    self.horizon = horizon
    self.policies = policies
    self.policy_mapping_fn = policy_mapping_fn
    self._obs_filters = obs_filters
    self.rollout_provider = _env_runner(
        self.async_vector_env, self.policies, self.policy_mapping_fn,
        self.num_local_steps, self.horizon, self._obs_filters, pack,
        tf_sess)
    self.metrics_queue = queue.Queue()
def __init__(self, env, policies, policy_mapping_fn, preprocessors,
             obs_filters, clip_rewards, unroll_length, callbacks,
             horizon=None, pack=False, tf_sess=None, clip_actions=True,
             blackhole_outputs=False):
    for _, f in obs_filters.items():
        assert getattr(f, "is_concurrent", False), \
            "Observation Filter must support concurrent updates."
    self.async_vector_env = AsyncVectorEnv.wrap_async(env)
    threading.Thread.__init__(self)
    self.queue = queue.Queue(5)
    self.extra_batches = queue.Queue()
    self.metrics_queue = queue.Queue()
    self.unroll_length = unroll_length
    self.horizon = horizon
    self.policies = policies
    self.policy_mapping_fn = policy_mapping_fn
    self.preprocessors = preprocessors
    self.obs_filters = obs_filters
    self.clip_rewards = clip_rewards
    self.daemon = True
    self.pack = pack
    self.tf_sess = tf_sess
    self.callbacks = callbacks
    self.clip_actions = clip_actions
    self.blackhole_outputs = blackhole_outputs
    self.shutdown = False
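# Hedged sketch, not part of the original source: a minimal observation
# filter that would satisfy the `is_concurrent` assertion in the async
# sampler constructors above. The assertion only checks for a truthy
# `is_concurrent` attribute via getattr(), so this hypothetical no-op
# filter is enough for illustration.
class _NoopConcurrentFilter(object):
    is_concurrent = True  # attribute the assertion looks for

    def __call__(self, obs, update=True):
        return obs  # pass observations through unchanged


# e.g. obs_filters = {"default_policy": _NoopConcurrentFilter()} would pass
# the "Observation Filter must support concurrent updates." check.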
def __init__(self, env_creator, policy_graph, policy_mapping_fn=None,
             policies_to_train=None, tf_session_creator=None, batch_steps=100,
             batch_mode="truncate_episodes", episode_horizon=None,
             preprocessor_pref="deepmind", sample_async=False,
             compress_observations=False, num_envs=1,
             observation_filter="NoFilter", clip_rewards=False,
             env_config=None, model_config=None, policy_config=None,
             worker_index=0, monitor_path=None):
    """Initialize a policy evaluator.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy_graph (class|dict): Either a class implementing PolicyGraph,
            or a dictionary of policy id strings to (PolicyGraph, obs_space,
            action_space, config) tuples. If a dict is specified, then we are
            in multi-agent mode and a policy_mapping_fn should also be set.
        policy_mapping_fn (func): A function that maps agent ids to policy
            ids in multi-agent mode. This function will be called each time
            a new agent appears in an episode, to bind that agent to a policy
            for the duration of the episode.
        policies_to_train (list): Optional whitelist of policies to train,
            or None for all policies.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicyGraph.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this evaluator.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of at most `batch_steps` in size. The batch will be exactly
                `batch_steps` in size if postprocessing does not change
                batch sizes. Episodes may be truncated in order to meet this
                size requirement. When `num_envs > 1`, episodes will be
                truncated to sequences of `batch_size / num_envs` in length.
            "complete_episodes": Each call to sample() will return a batch
                of at least `batch_steps` in size. Episodes will not be
                truncated, but multiple episodes may be packed within one
                batch to meet the batch size. Note that when `num_envs > 1`,
                episode steps will be buffered until the episode completes,
                and hence batches may contain significant amounts of
                off-policy data.
        episode_horizon (int): Whether to stop episodes at this horizon.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations.
            They can be decompressed with rllib/utils/compression.
        num_envs (int): If more than one, will create multiple envs and
            vectorize the computation of actions. This has no effect if the
            env already implements VectorEnv.
        observation_filter (str): Name of observation filter to use.
        clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
            experience postprocessing.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy. In the
            multi-agent case, this config will be merged with the per-policy
            configs specified by `policy_graph`.
        worker_index (int): For remote evaluators, this should be set to a
            non-zero and unique value. This index is passed to created envs
            through EnvContext so that envs can be configured per worker.
        monitor_path (str): Write out episode stats and videos to this
            directory if specified.
    """

    env_context = EnvContext(env_config or {}, worker_index)
    policy_config = policy_config or {}
    self.policy_config = policy_config
    model_config = model_config or {}
    policy_mapping_fn = (policy_mapping_fn
                         or (lambda agent_id: DEFAULT_POLICY_ID))
    self.env_creator = env_creator
    self.batch_steps = batch_steps
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations

    self.env = env_creator(env_context)
    if isinstance(self.env, VectorEnv) or \
            isinstance(self.env, ServingEnv) or \
            isinstance(self.env, MultiAgentEnv) or \
            isinstance(self.env, AsyncVectorEnv):

        def wrap(env):
            return env  # we can't auto-wrap these env types

    elif is_atari(self.env) and \
            "custom_preprocessor" not in model_config and \
            preprocessor_pref == "deepmind":

        def wrap(env):
            env = wrap_deepmind(
                env,
                dim=model_config.get("dim", 84),
                framestack=not model_config.get("use_lstm")
                and not model_config.get("no_framestack"))
            if monitor_path:
                env = _monitor(env, monitor_path)
            return env

    else:

        def wrap(env):
            env = ModelCatalog.get_preprocessor_as_wrapper(
                env, model_config)
            if monitor_path:
                env = _monitor(env, monitor_path)
            return env

    self.env = wrap(self.env)

    def make_env(vector_index):
        return wrap(
            env_creator(env_context.with_vector_index(vector_index)))

    self.tf_sess = None
    policy_dict = _validate_and_canonicalize(policy_graph, self.env)
    self.policies_to_train = policies_to_train or list(policy_dict.keys())
    if _has_tensorflow_graph(policy_dict):
        with tf.Graph().as_default():
            if tf_session_creator:
                self.tf_sess = tf_session_creator()
            else:
                self.tf_sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.tf_sess.as_default():
                self.policy_map = self._build_policy_map(
                    policy_dict, policy_config)
    else:
        self.policy_map = self._build_policy_map(policy_dict, policy_config)

    self.multiagent = self.policy_map.keys() != {DEFAULT_POLICY_ID}

    self.filters = {
        policy_id: get_filter(observation_filter,
                              policy.observation_space.shape)
        for (policy_id, policy) in self.policy_map.items()
    }

    # Always use vector env for consistency even if num_envs = 1
    self.async_env = AsyncVectorEnv.wrap_async(
        self.env, make_env=make_env, num_envs=num_envs)
    self.num_envs = num_envs

    if self.batch_mode == "truncate_episodes":
        if batch_steps % num_envs != 0:
            raise ValueError(
                "In 'truncate_episodes' batch mode, `batch_steps` must be "
                "evenly divisible by `num_envs`. Got {} and {}.".format(
                    batch_steps, num_envs))
        batch_steps = batch_steps // num_envs
        pack_episodes = True
    elif self.batch_mode == "complete_episodes":
        batch_steps = float("inf")  # never cut episodes
        pack_episodes = False  # sampler will return 1 episode per poll
    else:
        raise ValueError("Unsupported batch mode: {}".format(
            self.batch_mode))

    if sample_async:
        self.sampler = AsyncSampler(
            self.async_env, self.policy_map, policy_mapping_fn,
            self.filters, clip_rewards, batch_steps,
            horizon=episode_horizon, pack=pack_episodes,
            tf_sess=self.tf_sess)
        self.sampler.start()
    else:
        self.sampler = SyncSampler(
            self.async_env, self.policy_map, policy_mapping_fn,
            self.filters, clip_rewards, batch_steps,
            horizon=episode_horizon, pack=pack_episodes,
            tf_sess=self.tf_sess)
def __init__(self, env_creator, policy_graph, policy_mapping_fn=None,
             policies_to_train=None, tf_session_creator=None, batch_steps=100,
             batch_mode="truncate_episodes", episode_horizon=None,
             preprocessor_pref="deepmind", sample_async=False,
             compress_observations=False, num_envs=1,
             observation_filter="NoFilter", clip_rewards=None,
             clip_actions=True, env_config=None, model_config=None,
             policy_config=None, worker_index=0, monitor_path=None,
             log_dir=None, log_level=None, callbacks=None,
             input_creator=lambda ioctx: ioctx.default_sampler_input(),
             input_evaluation_method=None,
             output_creator=lambda ioctx: NoopOutput()):
    """Initialize a policy evaluator.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy_graph (class|dict): Either a class implementing PolicyGraph,
            or a dictionary of policy id strings to (PolicyGraph, obs_space,
            action_space, config) tuples. If a dict is specified, then we are
            in multi-agent mode and a policy_mapping_fn should also be set.
        policy_mapping_fn (func): A function that maps agent ids to policy
            ids in multi-agent mode. This function will be called each time
            a new agent appears in an episode, to bind that agent to a policy
            for the duration of the episode.
        policies_to_train (list): Optional whitelist of policies to train,
            or None for all policies.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicyGraph.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this evaluator.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of at most `batch_steps * num_envs` in size. The batch will
                be exactly `batch_steps * num_envs` in size if postprocessing
                does not change batch sizes. Episodes may be truncated in
                order to meet this size requirement.
            "complete_episodes": Each call to sample() will return a batch
                of at least `batch_steps * num_envs` in size. Episodes will
                not be truncated, but multiple episodes may be packed within
                one batch to meet the batch size. Note that when
                `num_envs > 1`, episode steps will be buffered until the
                episode completes, and hence batches may contain significant
                amounts of off-policy data.
        episode_horizon (int): Whether to stop episodes at this horizon.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations.
            They can be decompressed with rllib/utils/compression.
        num_envs (int): If more than one, will create multiple envs and
            vectorize the computation of actions. This has no effect if the
            env already implements VectorEnv.
        observation_filter (str): Name of observation filter to use.
        clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
            experience postprocessing. Setting to None means clip for Atari
            only.
        clip_actions (bool): Whether to clip action values to the range
            specified by the policy action space.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy. In the
            multi-agent case, this config will be merged with the per-policy
            configs specified by `policy_graph`.
        worker_index (int): For remote evaluators, this should be set to a
            non-zero and unique value. This index is passed to created envs
            through EnvContext so that envs can be configured per worker.
        monitor_path (str): Write out episode stats and videos to this
            directory if specified.
        log_dir (str): Directory where logs can be placed.
        log_level (str): Set the root log level on creation.
        callbacks (dict): Dict of custom debug callbacks.
        input_creator (func): Function that returns an InputReader object
            for loading previously generated experiences.
        input_evaluation_method (str): How to evaluate the current policy.
            This only applies when the input is reading offline data.
            Options are:
            - None: don't evaluate the policy. The episode reward and other
              metrics will be NaN.
            - "simulation": run the environment in the background, but use
              this data for evaluation only and never for learning.
            - "counterfactual": use counterfactual policy evaluation to
              estimate performance.
        output_creator (func): Function that returns an OutputWriter object
            for saving generated experiences.
    """

    if log_level:
        logging.getLogger("ray.rllib").setLevel(log_level)

    env_context = EnvContext(env_config or {}, worker_index)
    policy_config = policy_config or {}
    self.policy_config = policy_config
    self.callbacks = callbacks or {}
    model_config = model_config or {}
    policy_mapping_fn = (policy_mapping_fn
                         or (lambda agent_id: DEFAULT_POLICY_ID))
    if not callable(policy_mapping_fn):
        raise ValueError(
            "Policy mapping function not callable. If you're using Tune, "
            "make sure to escape the function with tune.function() "
            "to prevent it from being evaluated as an expression.")
    self.env_creator = env_creator
    self.sample_batch_size = batch_steps * num_envs
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations
    self.preprocessing_enabled = True

    self.env = env_creator(env_context)
    if isinstance(self.env, MultiAgentEnv) or \
            isinstance(self.env, AsyncVectorEnv):

        def wrap(env):
            return env  # we can't auto-wrap these env types

    elif is_atari(self.env) and \
            not model_config.get("custom_preprocessor") and \
            preprocessor_pref == "deepmind":
        # Deepmind wrappers already handle all preprocessing
        self.preprocessing_enabled = False

        if clip_rewards is None:
            clip_rewards = True

        def wrap(env):
            env = wrap_deepmind(
                env,
                dim=model_config.get("dim"),
                framestack=model_config.get("framestack"))
            if monitor_path:
                env = _monitor(env, monitor_path)
            return env

    else:

        def wrap(env):
            if monitor_path:
                env = _monitor(env, monitor_path)
            return env

    self.env = wrap(self.env)

    def make_env(vector_index):
        return wrap(
            env_creator(env_context.with_vector_index(vector_index)))

    self.tf_sess = None
    policy_dict = _validate_and_canonicalize(policy_graph, self.env)
    self.policies_to_train = policies_to_train or list(policy_dict.keys())
    if _has_tensorflow_graph(policy_dict):
        if (ray.worker._mode() != ray.worker.LOCAL_MODE
                and not ray.get_gpu_ids()):
            logger.info("Creating policy evaluation worker {}".format(
                worker_index) +
                " on CPU (please ignore any CUDA init errors)")
        with tf.Graph().as_default():
            if tf_session_creator:
                self.tf_sess = tf_session_creator()
            else:
                self.tf_sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.tf_sess.as_default():
                self.policy_map, self.preprocessors = \
                    self._build_policy_map(policy_dict, policy_config)
    else:
        self.policy_map, self.preprocessors = self._build_policy_map(
            policy_dict, policy_config)

    self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
    if self.multiagent:
        if not (isinstance(self.env, MultiAgentEnv)
                or isinstance(self.env, AsyncVectorEnv)):
            raise ValueError(
                "Have multiple policy graphs {}, but the env ".format(
                    self.policy_map) +
                "{} is not a subclass of MultiAgentEnv?".format(self.env))

    self.filters = {
        policy_id: get_filter(observation_filter,
                              policy.observation_space.shape)
        for (policy_id, policy) in self.policy_map.items()
    }

    # Always use vector env for consistency even if num_envs = 1
    self.async_env = AsyncVectorEnv.wrap_async(
        self.env, make_env=make_env, num_envs=num_envs)
    self.num_envs = num_envs

    if self.batch_mode == "truncate_episodes":
        unroll_length = batch_steps
        pack_episodes = True
    elif self.batch_mode == "complete_episodes":
        unroll_length = float("inf")  # never cut episodes
        pack_episodes = False  # sampler will return 1 episode per poll
    else:
        raise ValueError("Unsupported batch mode: {}".format(
            self.batch_mode))

    if input_evaluation_method == "simulation":
        logger.warning(
            "Requested 'simulation' input evaluation method: "
            "will discard all sampler outputs and keep only metrics.")
        sample_async = True
    elif input_evaluation_method == "counterfactual":
        raise NotImplementedError
    elif input_evaluation_method is None:
        pass
    else:
        raise ValueError("Unknown evaluation method: {}".format(
            input_evaluation_method))

    if sample_async:
        self.sampler = AsyncSampler(
            self.async_env,
            self.policy_map,
            policy_mapping_fn,
            self.preprocessors,
            self.filters,
            clip_rewards,
            unroll_length,
            self.callbacks,
            horizon=episode_horizon,
            pack=pack_episodes,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions,
            blackhole_outputs=input_evaluation_method == "simulation")
        self.sampler.start()
    else:
        self.sampler = SyncSampler(
            self.async_env,
            self.policy_map,
            policy_mapping_fn,
            self.preprocessors,
            self.filters,
            clip_rewards,
            unroll_length,
            self.callbacks,
            horizon=episode_horizon,
            pack=pack_episodes,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions)

    self.io_context = IOContext(log_dir, policy_config, worker_index, self)
    self.input_reader = input_creator(self.io_context)
    assert isinstance(self.input_reader, InputReader), self.input_reader
    self.output_writer = output_creator(self.io_context)
    assert isinstance(self.output_writer, OutputWriter), self.output_writer

    logger.debug("Created evaluator with env {} ({}), policies {}".format(
        self.async_env, self.env, self.policy_map))
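# Hedged sketch, not part of the original source: a policy_mapping_fn of the
# kind described in the docstring above. It must be a plain callable from
# agent id to policy id; the agent-id prefixes and policy ids used here are
# made up for illustration only. When configured through Tune, the
# constructor above expects the callable to be escaped with tune.function()
# so it is not evaluated as an expression.
def example_policy_mapping_fn(agent_id):
    # bind all "pursuer_*" agents to one shared policy and everything else
    # to another; called once per new agent per episode
    if agent_id.startswith("pursuer"):
        return "pursuer_policy"
    return "evader_policy"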
def testNestedTupleAsync(self):
    self.doTestNestedTuple(
        lambda _: AsyncVectorEnv.wrap_async(NestedTupleEnv()))
def testNestedDictAsync(self):
    self.doTestNestedDict(
        lambda _: AsyncVectorEnv.wrap_async(NestedDictEnv()))
def testNestedTupleAsync(self):
    self.assertRaisesRegexp(
        ValueError, "Found raw Tuple space.*",
        lambda: self.doTestNestedTuple(
            lambda _: AsyncVectorEnv.wrap_async(NestedTupleEnv())))
def testNestedDictAsync(self):
    self.assertRaisesRegexp(
        ValueError, "Found raw Dict space.*",
        lambda: self.doTestNestedDict(
            lambda _: AsyncVectorEnv.wrap_async(NestedDictEnv())))
def __init__(self, env_creator, policy_graph, policy_mapping_fn=None,
             policies_to_train=None, tf_session_creator=None, batch_steps=100,
             batch_mode="truncate_episodes", episode_horizon=None,
             preprocessor_pref="deepmind", sample_async=False,
             compress_observations=False, num_envs=1,
             observation_filter="NoFilter", clip_rewards=None,
             clip_actions=True, env_config=None, model_config=None,
             policy_config=None, worker_index=0, monitor_path=None,
             log_dir=None, log_level=None, callbacks=None,
             input_creator=lambda ioctx: ioctx.default_sampler_input(),
             input_evaluation_method=None,
             output_creator=lambda ioctx: NoopOutput()):
    """Initialize a policy evaluator.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy_graph (class|dict): Either a class implementing PolicyGraph,
            or a dictionary of policy id strings to (PolicyGraph, obs_space,
            action_space, config) tuples. If a dict is specified, then we are
            in multi-agent mode and a policy_mapping_fn should also be set.
        policy_mapping_fn (func): A function that maps agent ids to policy
            ids in multi-agent mode. This function will be called each time
            a new agent appears in an episode, to bind that agent to a policy
            for the duration of the episode.
        policies_to_train (list): Optional whitelist of policies to train,
            or None for all policies.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicyGraph.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this evaluator.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of at most `batch_steps * num_envs` in size. The batch will
                be exactly `batch_steps * num_envs` in size if postprocessing
                does not change batch sizes. Episodes may be truncated in
                order to meet this size requirement.
            "complete_episodes": Each call to sample() will return a batch
                of at least `batch_steps * num_envs` in size. Episodes will
                not be truncated, but multiple episodes may be packed within
                one batch to meet the batch size. Note that when
                `num_envs > 1`, episode steps will be buffered until the
                episode completes, and hence batches may contain significant
                amounts of off-policy data.
        episode_horizon (int): Whether to stop episodes at this horizon.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations.
            They can be decompressed with rllib/utils/compression.
        num_envs (int): If more than one, will create multiple envs and
            vectorize the computation of actions. This has no effect if the
            env already implements VectorEnv.
        observation_filter (str): Name of observation filter to use.
        clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
            experience postprocessing. Setting to None means clip for Atari
            only.
        clip_actions (bool): Whether to clip action values to the range
            specified by the policy action space.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy. In the
            multi-agent case, this config will be merged with the per-policy
            configs specified by `policy_graph`.
        worker_index (int): For remote evaluators, this should be set to a
            non-zero and unique value. This index is passed to created envs
            through EnvContext so that envs can be configured per worker.
        monitor_path (str): Write out episode stats and videos to this
            directory if specified.
        log_dir (str): Directory where logs can be placed.
        log_level (str): Set the root log level on creation.
        callbacks (dict): Dict of custom debug callbacks.
        input_creator (func): Function that returns an InputReader object
            for loading previously generated experiences.
        input_evaluation_method (str): How to evaluate the current policy.
            This only applies when the input is reading offline data.
            Options are:
            - None: don't evaluate the policy. The episode reward and other
              metrics will be NaN.
            - "simulation": run the environment in the background, but use
              this data for evaluation only and never for learning.
        output_creator (func): Function that returns an OutputWriter object
            for saving generated experiences.
    """

    if log_level:
        logging.getLogger("ray.rllib").setLevel(log_level)

    env_context = EnvContext(env_config or {}, worker_index)
    policy_config = policy_config or {}
    self.policy_config = policy_config
    self.callbacks = callbacks or {}
    model_config = model_config or {}
    policy_mapping_fn = (policy_mapping_fn
                         or (lambda agent_id: DEFAULT_POLICY_ID))
    if not callable(policy_mapping_fn):
        raise ValueError(
            "Policy mapping function not callable. If you're using Tune, "
            "make sure to escape the function with tune.function() "
            "to prevent it from being evaluated as an expression.")
    self.env_creator = env_creator
    self.sample_batch_size = batch_steps * num_envs
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations
    self.preprocessing_enabled = True

    self.env = env_creator(env_context)
    if isinstance(self.env, MultiAgentEnv) or \
            isinstance(self.env, AsyncVectorEnv):

        def wrap(env):
            return env  # we can't auto-wrap these env types

    elif is_atari(self.env) and \
            not model_config.get("custom_preprocessor") and \
            preprocessor_pref == "deepmind":
        # Deepmind wrappers already handle all preprocessing
        self.preprocessing_enabled = False

        if clip_rewards is None:
            clip_rewards = True

        def wrap(env):
            env = wrap_deepmind(
                env,
                dim=model_config.get("dim"),
                framestack=model_config.get("framestack"))
            if monitor_path:
                env = _monitor(env, monitor_path)
            return env

    else:

        def wrap(env):
            if monitor_path:
                env = _monitor(env, monitor_path)
            return env

    self.env = wrap(self.env)

    def make_env(vector_index):
        return wrap(
            env_creator(env_context.with_vector_index(vector_index)))

    self.tf_sess = None
    policy_dict = _validate_and_canonicalize(policy_graph, self.env)
    self.policies_to_train = policies_to_train or list(policy_dict.keys())
    if _has_tensorflow_graph(policy_dict):
        if (ray.worker._mode() != ray.worker.LOCAL_MODE
                and not ray.get_gpu_ids()):
            logger.info("Creating policy evaluation worker {}".format(
                worker_index) +
                " on CPU (please ignore any CUDA init errors)")
        with tf.Graph().as_default():
            if tf_session_creator:
                self.tf_sess = tf_session_creator()
            else:
                self.tf_sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.tf_sess.as_default():
                self.policy_map, self.preprocessors = \
                    self._build_policy_map(policy_dict, policy_config)
    else:
        self.policy_map, self.preprocessors = self._build_policy_map(
            policy_dict, policy_config)

    self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
    if self.multiagent:
        if not (isinstance(self.env, MultiAgentEnv)
                or isinstance(self.env, AsyncVectorEnv)):
            raise ValueError(
                "Have multiple policy graphs {}, but the env ".format(
                    self.policy_map) +
                "{} is not a subclass of MultiAgentEnv?".format(self.env))

    self.filters = {
        policy_id: get_filter(observation_filter,
                              policy.observation_space.shape)
        for (policy_id, policy) in self.policy_map.items()
    }

    # Always use vector env for consistency even if num_envs = 1
    self.async_env = AsyncVectorEnv.wrap_async(
        self.env, make_env=make_env, num_envs=num_envs)
    self.num_envs = num_envs

    if self.batch_mode == "truncate_episodes":
        unroll_length = batch_steps
        pack_episodes = True
    elif self.batch_mode == "complete_episodes":
        unroll_length = float("inf")  # never cut episodes
        pack_episodes = False  # sampler will return 1 episode per poll
    else:
        raise ValueError("Unsupported batch mode: {}".format(
            self.batch_mode))

    if input_evaluation_method == "simulation":
        logger.warning(
            "Requested 'simulation' input evaluation method: "
            "will discard all sampler outputs and keep only metrics.")
        sample_async = True
    elif input_evaluation_method is None:
        pass
    else:
        raise ValueError("Unknown evaluation method: {}".format(
            input_evaluation_method))

    if sample_async:
        self.sampler = AsyncSampler(
            self.async_env,
            self.policy_map,
            policy_mapping_fn,
            self.preprocessors,
            self.filters,
            clip_rewards,
            unroll_length,
            self.callbacks,
            horizon=episode_horizon,
            pack=pack_episodes,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions,
            blackhole_outputs=input_evaluation_method == "simulation")
        self.sampler.start()
    else:
        self.sampler = SyncSampler(
            self.async_env,
            self.policy_map,
            policy_mapping_fn,
            self.preprocessors,
            self.filters,
            clip_rewards,
            unroll_length,
            self.callbacks,
            horizon=episode_horizon,
            pack=pack_episodes,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions)

    self.io_context = IOContext(log_dir, policy_config, worker_index, self)
    self.input_reader = input_creator(self.io_context)
    assert isinstance(self.input_reader, InputReader), self.input_reader
    self.output_writer = output_creator(self.io_context)
    assert isinstance(self.output_writer, OutputWriter), self.output_writer

    logger.debug("Created evaluator with env {} ({}), policies {}".format(
        self.async_env, self.env, self.policy_map))
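# Illustrative sketch, not part of the original source: how the newer
# constructors above derive their sampling parameters. The target batch size
# is batch_steps * num_envs, and unroll_length is batch_steps for
# "truncate_episodes" or unbounded for "complete_episodes".
# `_sampling_params` is a hypothetical helper name.
def _sampling_params(batch_steps, num_envs, batch_mode):
    sample_batch_size = batch_steps * num_envs
    if batch_mode == "truncate_episodes":
        unroll_length, pack_episodes = batch_steps, True
    elif batch_mode == "complete_episodes":
        unroll_length, pack_episodes = float("inf"), False
    else:
        raise ValueError("Unsupported batch mode: {}".format(batch_mode))
    return sample_batch_size, unroll_length, pack_episodes


# _sampling_params(100, 4, "truncate_episodes") -> (400, 100, True)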