def _build_value_model(self, model_config: ModelConfigDict):
    """Build value model with given model configuration.

    model_config = {'activation': str, 'hiddens': Sequence}
    """
    activation = get_activation_fn(model_config.get("activation"))
    hiddens = model_config.get("hiddens", [])

    inputs = tf.keras.layers.Input(
        shape=(np.product(self.critic_preprocessor.shape), ),
        name="value-inputs")
    last_layer = inputs
    for i, size in enumerate(hiddens):
        last_layer = tf.keras.layers.Dense(
            size,
            name="fc_{}".format(i),
            activation=activation,
            kernel_initializer=normc_initializer(1.0),
        )(last_layer)
    value_out = tf.keras.layers.Dense(
        1,
        name="value_out",
        activation=None,
        kernel_initializer=normc_initializer(0.01),
    )(last_layer)
    return tf.keras.Model(inputs, [value_out])
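

# Illustrative usage sketch (added for clarity; not part of the original
# source). `_example_value_model` is a hypothetical helper showing how a
# policy object exposing `_build_value_model` and `critic_preprocessor`
# might build and query the value network.
def _example_value_model(policy, obs_batch):
    # Two tanh hidden layers; the returned tf.keras.Model maps flattened
    # observations to one value estimate per batch row.
    value_model = policy._build_value_model({
        "activation": "tanh",
        "hiddens": [256, 256],
    })
    return value_model(obs_batch)  # shape: [batch_size, 1]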
def __init__(
        self,
        env_creator: Callable[[EnvContext], EnvType],
        policy: type,
        policy_mapping_fn: Callable[[AgentID], PolicyID] = None,
        policies_to_train: List[PolicyID] = None,
        tf_session_creator: Callable[[], Any] = None,
        rollout_fragment_length: int = 100,
        batch_mode: str = "truncate_episodes",
        episode_horizon: int = None,
        preprocessor_pref: str = "deepmind",
        sample_async: bool = False,
        compress_observations: bool = False,
        num_envs: int = 1,
        observation_fn: "ObservationFunction" = None,
        observation_filter: str = "NoFilter",
        clip_rewards: bool = None,
        clip_actions: bool = True,
        env_config: EnvConfigDict = None,
        model_config: ModelConfigDict = None,
        policy_config: TrainerConfigDict = None,
        worker_index: int = 0,
        num_workers: int = 0,
        monitor_path: str = None,
        log_dir: str = None,
        log_level: str = None,
        callbacks: "DefaultCallbacks" = None,
        input_creator: Callable[
            [IOContext],
            InputReader] = lambda ioctx: ioctx.default_sampler_input(),
        input_evaluation: List[str] = frozenset([]),
        output_creator: Callable[
            [IOContext], OutputWriter] = lambda ioctx: NoopOutput(),
        remote_worker_envs: bool = False,
        remote_env_batch_wait_ms: int = 0,
        soft_horizon: bool = False,
        no_done_at_end: bool = False,
        seed: int = None,
        extra_python_environs: dict = None,
        fake_sampler: bool = False):
    """Initialize a rollout worker.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy (class|dict): Either a class implementing Policy, or a
            dictionary of policy id strings to
            (Policy, obs_space, action_space, config) tuples. If a dict is
            specified, then we are in multi-agent mode and a
            policy_mapping_fn should also be set.
        policy_mapping_fn (func): A function that maps agent ids to policy
            ids in multi-agent mode. This function will be called each time
            a new agent appears in an episode, to bind that agent to a
            policy for the duration of the episode.
        policies_to_train (list): Optional list of policies to train, or
            None for all policies.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicy.
        rollout_fragment_length (int): The target number of env transitions
            to include in each sample batch returned from this worker.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of at most `rollout_fragment_length * num_envs` in size.
                The batch will be exactly
                `rollout_fragment_length * num_envs` in size if
                postprocessing does not change batch sizes. Episodes may be
                truncated in order to meet this size requirement.
            "complete_episodes": Each call to sample() will return a batch
                of at least `rollout_fragment_length * num_envs` in size.
                Episodes will not be truncated, but multiple episodes may
                be packed within one batch to meet the batch size. Note
                that when `num_envs > 1`, episode steps will be buffered
                until the episode completes, and hence batches may contain
                significant amounts of off-policy data.
        episode_horizon (int): Horizon at which to stop episodes, even if
            the episode is not done yet.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations.
            They can be decompressed with rllib/utils/compression.
        num_envs (int): If more than one, will create multiple envs
            and vectorize the computation of actions.
            This has no effect if the env already implements VectorEnv.
        observation_fn (ObservationFunction): Optional multi-agent
            observation function.
        observation_filter (str): Name of observation filter to use.
        clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
            experience postprocessing. Setting to None means clip for Atari
            only.
        clip_actions (bool): Whether to clip action values to the range
            specified by the policy action space.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy. In the
            multi-agent case, this config will be merged with the
            per-policy configs specified by `policy`.
        worker_index (int): For remote workers, this should be set to a
            non-zero and unique value. This index is passed to created envs
            through EnvContext so that envs can be configured per worker.
        num_workers (int): For remote workers, how many workers altogether
            have been created?
        monitor_path (str): Write out episode stats and videos to this
            directory if specified.
        log_dir (str): Directory where logs can be placed.
        log_level (str): Set the root log level on creation.
        callbacks (DefaultCallbacks): Custom training callbacks.
        input_creator (func): Function that returns an InputReader object
            for loading previously generated experiences.
        input_evaluation (list): How to evaluate the policy performance.
            This only makes sense to set when the input is reading offline
            data. The possible values include:
            - "is": the step-wise importance sampling estimator.
            - "wis": the weighted step-wise is estimator.
            - "simulation": run the environment in the background, but use
                this data for evaluation only and never for learning.
        output_creator (func): Function that returns an OutputWriter object
            for saving generated experiences.
        remote_worker_envs (bool): If using num_envs > 1, whether to create
            those new envs in remote processes instead of in the current
            process. This adds overheads, but can make sense if your envs
            are expensive to step or reset.
        remote_env_batch_wait_ms (float): Timeout that remote workers are
            waiting when polling environments. 0 (continue when at least
            one env is ready) is a reasonable default, but the optimal
            value could be obtained by measuring your environment
            step / reset and model inference perf.
        soft_horizon (bool): Calculate rewards but don't reset the
            environment when the horizon is hit.
        no_done_at_end (bool): Ignore the done=True at the end of the
            episode and instead record done=False.
        seed (int): Set the seed of both np and tf to this value to ensure
            each remote worker has unique exploration behavior.
        extra_python_environs (dict): Extra python environment variables to
            be set for this worker.
        fake_sampler (bool): Use a fake (inf speed) sampler for testing.
    """
    self._original_kwargs: dict = locals().copy()
    del self._original_kwargs["self"]

    global _global_worker
    _global_worker = self

    # set extra environs first
    if extra_python_environs:
        for key, value in extra_python_environs.items():
            os.environ[key] = str(value)

    def gen_rollouts():
        while True:
            yield self.sample()

    ParallelIteratorWorker.__init__(self, gen_rollouts, False)

    policy_config: TrainerConfigDict = policy_config or {}
    if (tf1 and policy_config.get("framework") in ["tf2", "tfe"]
            and not policy_config.get("no_eager_on_workers")
            # This eager check is necessary for certain all-framework tests
            # that use tf's eager_mode() context generator.
            and not tf1.executing_eagerly()):
        tf1.enable_eager_execution()

    if log_level:
        logging.getLogger("ray.rllib").setLevel(log_level)

    if worker_index > 1:
        disable_log_once_globally()  # only need 1 worker to log
    elif log_level == "DEBUG":
        enable_periodic_logging()

    env_context = EnvContext(env_config or {}, worker_index)
    self.env_context = env_context
    self.policy_config: TrainerConfigDict = policy_config
    if callbacks:
        self.callbacks: "DefaultCallbacks" = callbacks()
    else:
        from ray.rllib.agents.callbacks import DefaultCallbacks
        self.callbacks: "DefaultCallbacks" = DefaultCallbacks()
    self.worker_index: int = worker_index
    self.num_workers: int = num_workers
    model_config: ModelConfigDict = model_config or {}
    policy_mapping_fn = (policy_mapping_fn
                         or (lambda agent_id: DEFAULT_POLICY_ID))
    if not callable(policy_mapping_fn):
        raise ValueError("Policy mapping function not callable?")
    self.env_creator: Callable[[EnvContext], EnvType] = env_creator
    self.rollout_fragment_length: int = rollout_fragment_length * num_envs
    self.batch_mode: str = batch_mode
    self.compress_observations: bool = compress_observations
    self.preprocessing_enabled: bool = True
    self.last_batch: SampleBatchType = None
    self.global_vars: dict = None
    self.fake_sampler: bool = fake_sampler

    self.env = _validate_env(env_creator(env_context))
    if isinstance(self.env, (BaseEnv, MultiAgentEnv)):

        def wrap(env):
            return env  # we can't auto-wrap these env types
    elif is_atari(self.env) and \
            not model_config.get("custom_preprocessor") and \
            preprocessor_pref == "deepmind":
        # Deepmind wrappers already handle all preprocessing
        self.preprocessing_enabled = False

        # If clip_rewards not explicitly set to False, switch it
        # on here (clip between -1.0 and 1.0).
        if clip_rewards is None:
            clip_rewards = True

        def wrap(env):
            env = wrap_deepmind(
                env,
                dim=model_config.get("dim"),
                framestack=model_config.get("framestack"))
            if monitor_path:
                from gym import wrappers
                env = wrappers.Monitor(env, monitor_path, resume=True)
            return env
    else:

        def wrap(env):
            if monitor_path:
                from gym import wrappers
                env = wrappers.Monitor(env, monitor_path, resume=True)
            return env

    self.env: EnvType = wrap(self.env)

    def make_env(vector_index):
        return wrap(
            env_creator(
                env_context.copy_with_overrides(
                    worker_index=worker_index,
                    vector_index=vector_index,
                    remote=remote_worker_envs)))

    self.make_env_fn = make_env

    self.tf_sess = None
    policy_dict = _validate_and_canonicalize(policy, self.env)
    self.policies_to_train: List[PolicyID] = policies_to_train or list(
        policy_dict.keys())
    self.policy_map: Dict[PolicyID, Policy] = None
    self.preprocessors: Dict[PolicyID, Preprocessor] = None

    # set numpy and python seed
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
        if not hasattr(self.env, "seed"):
            raise ValueError("Env doesn't support env.seed(): {}".format(
                self.env))
        self.env.seed(seed)
        try:
            assert torch is not None
            torch.manual_seed(seed)
        except AssertionError:
            logger.info("Could not seed torch")
    if _has_tensorflow_graph(policy_dict) and not (
            tf1 and tf1.executing_eagerly()):
        if not tf1:
            raise ImportError("Could not import tensorflow")
        with tf1.Graph().as_default():
            if tf_session_creator:
                self.tf_sess = tf_session_creator()
            else:
                self.tf_sess = tf1.Session(
                    config=tf1.ConfigProto(
                        gpu_options=tf1.GPUOptions(allow_growth=True)))
            with self.tf_sess.as_default():
                # set graph-level seed
                if seed is not None:
                    tf1.set_random_seed(seed)
                self.policy_map, self.preprocessors = \
                    self._build_policy_map(policy_dict, policy_config)
    else:
        self.policy_map, self.preprocessors = self._build_policy_map(
            policy_dict,
            policy_config)

    if (ray.is_initialized()
            and ray.worker._mode() != ray.worker.LOCAL_MODE):
        # Check available number of GPUs
        if not ray.get_gpu_ids(as_str=True):
            logger.debug(
                "Creating policy evaluation worker {}".format(worker_index)
                + " on CPU (please ignore any CUDA init errors)")
        elif (policy_config["framework"] in ["tf2", "tf", "tfe"] and
              not tf.config.experimental.list_physical_devices("GPU")) or \
                (policy_config["framework"] == "torch"
                 and not torch.cuda.is_available()):
            raise RuntimeError(
                "GPUs were assigned to this worker by Ray, but "
                "your DL framework ({}) reports GPU acceleration is "
                "disabled. This could be due to a bad CUDA- or {} "
                "installation.".format(policy_config["framework"],
                                       policy_config["framework"]))

    self.multiagent: bool = set(
        self.policy_map.keys()) != {DEFAULT_POLICY_ID}
    if self.multiagent:
        if not ((isinstance(self.env, MultiAgentEnv)
                 or isinstance(self.env, ExternalMultiAgentEnv))
                or isinstance(self.env, BaseEnv)):
            raise ValueError(
                "Have multiple policies {}, but the env ".format(
                    self.policy_map) +
                "{} is not a subclass of BaseEnv, MultiAgentEnv or "
                "ExternalMultiAgentEnv?".format(self.env))

    self.filters: Dict[PolicyID, Filter] = {
        policy_id: get_filter(observation_filter,
                              policy.observation_space.shape)
        for (policy_id, policy) in self.policy_map.items()
    }
    if self.worker_index == 0:
        logger.info("Built filter map: {}".format(self.filters))

    self.num_envs: int = num_envs

    if "custom_vector_env" in policy_config:
        custom_vec_wrapper = policy_config["custom_vector_env"]
        self.async_env = custom_vec_wrapper(self.env)
    else:
        # Always use vector env for consistency even if num_envs = 1.
        self.async_env: BaseEnv = BaseEnv.to_base_env(
            self.env,
            make_env=make_env,
            num_envs=num_envs,
            remote_envs=remote_worker_envs,
            remote_env_batch_wait_ms=remote_env_batch_wait_ms)

    # `truncate_episodes`: Allow a batch to contain more than one episode
    # (fragments) and always make the batch `rollout_fragment_length`
    # long.
    if self.batch_mode == "truncate_episodes":
        pack = True
    # `complete_episodes`: Never cut episodes and sampler will return
    # exactly one (complete) episode per poll.
elif self.batch_mode == "complete_episodes": rollout_fragment_length = float("inf") pack = False else: raise ValueError("Unsupported batch mode: {}".format( self.batch_mode)) self.io_context: IOContext = IOContext(log_dir, policy_config, worker_index, self) self.reward_estimators: OffPolicyEstimator = [] for method in input_evaluation: if method == "simulation": logger.warning( "Requested 'simulation' input evaluation method: " "will discard all sampler outputs and keep only metrics.") sample_async = True elif method == "is": ise = ImportanceSamplingEstimator.create(self.io_context) self.reward_estimators.append(ise) elif method == "wis": wise = WeightedImportanceSamplingEstimator.create( self.io_context) self.reward_estimators.append(wise) else: raise ValueError( "Unknown evaluation method: {}".format(method)) if sample_async: self.sampler = AsyncSampler( worker=self, env=self.async_env, policies=self.policy_map, policy_mapping_fn=policy_mapping_fn, preprocessors=self.preprocessors, obs_filters=self.filters, clip_rewards=clip_rewards, rollout_fragment_length=rollout_fragment_length, callbacks=self.callbacks, horizon=episode_horizon, pack_multiple_episodes_in_batch=pack, tf_sess=self.tf_sess, clip_actions=clip_actions, blackhole_outputs="simulation" in input_evaluation, soft_horizon=soft_horizon, no_done_at_end=no_done_at_end, observation_fn=observation_fn, _use_trajectory_view_api=policy_config.get( "_use_trajectory_view_api", False)) # Start the Sampler thread. self.sampler.start() else: self.sampler = SyncSampler( worker=self, env=self.async_env, policies=self.policy_map, policy_mapping_fn=policy_mapping_fn, preprocessors=self.preprocessors, obs_filters=self.filters, clip_rewards=clip_rewards, rollout_fragment_length=rollout_fragment_length, callbacks=self.callbacks, horizon=episode_horizon, pack_multiple_episodes_in_batch=pack, tf_sess=self.tf_sess, clip_actions=clip_actions, soft_horizon=soft_horizon, no_done_at_end=no_done_at_end, observation_fn=observation_fn, _use_trajectory_view_api=policy_config.get( "_use_trajectory_view_api", False)) self.input_reader: InputReader = input_creator(self.io_context) self.output_writer: OutputWriter = output_creator(self.io_context) logger.debug( "Created rollout worker with env {} ({}), policies {}".format( self.async_env, self.env, self.policy_map))
def get_model_v2(obs_space: gym.Space,
                 action_space: gym.Space,
                 num_outputs: int,
                 model_config: ModelConfigDict,
                 framework: str = "tf",
                 name: str = "default_model",
                 model_interface: type = None,
                 default_model: type = None,
                 **model_kwargs) -> ModelV2:
    """Returns a suitable model compatible with given spaces and output.

    Args:
        obs_space (Space): Observation space of the target gym env. This
            may have an `original_space` attribute that specifies how to
            unflatten the tensor into a ragged tensor.
        action_space (Space): Action space of the target gym env.
        num_outputs (int): The size of the output vector of the model.
        framework (str): One of "tf", "tfe", or "torch".
        name (str): Name (scope) for the model.
        model_interface (cls): Interface required for the model.
        default_model (cls): Override the default class for the model. This
            only has an effect when not using a custom model.
        model_kwargs (dict): Args to pass to the ModelV2 constructor.

    Returns:
        model (ModelV2): Model to use for the policy.
    """
    if model_config.get("custom_model"):
        if "custom_options" in model_config and \
                model_config["custom_options"] != DEPRECATED_VALUE:
            deprecation_warning(
                "model.custom_options",
                "model.custom_model_config",
                error=False)
            model_config["custom_model_config"] = \
                model_config.pop("custom_options")

        if isinstance(model_config["custom_model"], type):
            model_cls = model_config["custom_model"]
        else:
            model_cls = _global_registry.get(RLLIB_MODEL,
                                             model_config["custom_model"])

        # TODO(sven): Hard-deprecate Model(V1).
        if issubclass(model_cls, ModelV2):
            logger.info("Wrapping {} as {}".format(model_cls,
                                                   model_interface))
            model_cls = ModelCatalog._wrap_if_needed(
                model_cls, model_interface)

            if framework in ["tf", "tfe"]:
                # Track and warn if vars were created but not registered.
                created = set()

                def track_var_creation(next_creator, **kw):
                    v = next_creator(**kw)
                    created.add(v)
                    return v

                with tf.variable_creator_scope(track_var_creation):
                    # Try calling with kwargs first (custom ModelV2 should
                    # accept these as kwargs, not get them from
                    # config["custom_model_config"] anymore).
                    try:
                        instance = model_cls(obs_space, action_space,
                                             num_outputs, model_config,
                                             name, **model_kwargs)
                    except TypeError as e:
                        # Keyword error: Try old way w/o kwargs.
                        if "__init__() got an unexpected " in e.args[0]:
                            logger.warning(
                                "Custom ModelV2 should accept all custom "
                                "options as **kwargs, instead of expecting"
                                " them in config['custom_model_config']!")
                            instance = model_cls(obs_space, action_space,
                                                 num_outputs, model_config,
                                                 name)
                        # Other error -> re-raise.
                        else:
                            raise e
                registered = set(instance.variables())
                not_registered = set()
                for var in created:
                    if var not in registered:
                        not_registered.add(var)
                if not_registered:
                    raise ValueError(
                        "It looks like variables {} were created as part "
                        "of {} but do not appear in model.variables() "
                        "({}). Did you forget to call "
                        "model.register_variables() on the variables in "
                        "question?".format(not_registered, instance,
                                           registered))
            else:
                # PyTorch automatically tracks nn.Modules inside the parent
                # nn.Module's constructor.
                # TODO(sven): Do this for TF as well.
                instance = model_cls(obs_space, action_space, num_outputs,
                                     model_config, name, **model_kwargs)
            return instance
        # TODO(sven): Hard-deprecate Model(V1). This check will be
        # superfluous then.
        elif tf.executing_eagerly():
            raise ValueError(
                "Eager execution requires a TFModelV2 model to be "
                "used, however you specified a custom model {}".format(
                    model_cls))

    if framework in ["tf", "tfe", "tf2"]:
        v2_class = None
        # Try to get a default v2 model.
if not model_config.get("custom_model"): v2_class = default_model or ModelCatalog._get_v2_model_class( obs_space, model_config, framework=framework) if model_config.get("use_lstm"): wrapped_cls = v2_class forward = wrapped_cls.forward v2_class = ModelCatalog._wrap_if_needed( wrapped_cls, LSTMWrapper) v2_class._wrapped_forward = forward # fallback to a default v1 model if v2_class is None: if tf.executing_eagerly(): raise ValueError( "Eager execution requires a TFModelV2 model to be " "used, however there is no default V2 model for this " "observation space: {}, use_lstm={}".format( obs_space, model_config.get("use_lstm"))) v2_class = make_v1_wrapper(ModelCatalog.get_model) # Wrap in the requested interface. wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) return wrapper(obs_space, action_space, num_outputs, model_config, name, **model_kwargs) elif framework == "torch": v2_class = \ default_model or ModelCatalog._get_v2_model_class( obs_space, model_config, framework=framework) if model_config.get("use_lstm"): from ray.rllib.models.torch.recurrent_net import LSTMWrapper \ as TorchLSTMWrapper wrapped_cls = v2_class forward = wrapped_cls.forward v2_class = ModelCatalog._wrap_if_needed( wrapped_cls, TorchLSTMWrapper) v2_class._wrapped_forward = forward # Wrap in the requested interface. wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) return wrapper(obs_space, action_space, num_outputs, model_config, name, **model_kwargs) else: raise NotImplementedError( "`framework` must be 'tf|tfe|torch', but is " "{}!".format(framework))
def get_action_dist(action_space: gym.Space,
                    config: ModelConfigDict,
                    dist_type: str = None,
                    framework: str = "tf",
                    **kwargs) -> (type, int):
    """Returns a distribution class and size for the given action space.

    Args:
        action_space (Space): Action space of the target gym env.
        config (Optional[dict]): Optional model config.
        dist_type (Optional[str]): Identifier of the action distribution
            interpreted as a hint.
        framework (str): One of "tf", "tfe", or "torch".
        kwargs (dict): Optional kwargs to pass on to the Distribution's
            constructor.

    Returns:
        Tuple:
            - dist_class (ActionDistribution): Python class of the
                distribution.
            - dist_dim (int): The size of the input vector to the
                distribution.
    """
    dist = None
    config = config or MODEL_DEFAULTS

    # Custom distribution given.
    if config.get("custom_action_dist"):
        action_dist_name = config["custom_action_dist"]
        logger.debug(
            "Using custom action distribution {}".format(action_dist_name))
        dist = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name)
    # Dist_type is given directly as a class.
    elif type(dist_type) is type and \
            issubclass(dist_type, ActionDistribution) and \
            dist_type not in (
                MultiActionDistribution, TorchMultiActionDistribution):
        dist = dist_type
    # Box space -> DiagGaussian OR Deterministic.
    elif isinstance(action_space, gym.spaces.Box):
        if len(action_space.shape) > 1:
            raise UnsupportedSpaceException(
                "Action space has multiple dimensions "
                "{}. ".format(action_space.shape) +
                "Consider reshaping this into a single dimension, "
                "using a custom action distribution, "
                "using a Tuple action space, or the multi-agent API.")
        # TODO(sven): Check for bounds and return SquashedNormal, etc..
        if dist_type is None:
            dist = TorchDiagGaussian if framework == "torch" \
                else DiagGaussian
        elif dist_type == "deterministic":
            dist = TorchDeterministic if framework == "torch" \
                else Deterministic
    # Discrete Space -> Categorical.
    elif isinstance(action_space, gym.spaces.Discrete):
        dist = TorchCategorical if framework == "torch" else Categorical
    # Tuple/Dict Spaces -> MultiAction.
    elif dist_type in (MultiActionDistribution,
                       TorchMultiActionDistribution) or \
            isinstance(action_space, (gym.spaces.Tuple, gym.spaces.Dict)):
        flat_action_space = flatten_space(action_space)
        child_dists_and_in_lens = tree.map_structure(
            lambda s: ModelCatalog.get_action_dist(
                s, config, framework=framework), flat_action_space)
        child_dists = [e[0] for e in child_dists_and_in_lens]
        input_lens = [int(e[1]) for e in child_dists_and_in_lens]
        return partial(
            (TorchMultiActionDistribution
             if framework == "torch" else MultiActionDistribution),
            action_space=action_space,
            child_distributions=child_dists,
            input_lens=input_lens), int(sum(input_lens))
    # Simplex -> Dirichlet.
    elif isinstance(action_space, Simplex):
        if framework == "torch":
            # TODO(sven): implement
            raise NotImplementedError(
                "Simplex action spaces not supported for torch.")
        dist = Dirichlet
    # MultiDiscrete -> MultiCategorical.
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        dist = TorchMultiCategorical if framework == "torch" else \
            MultiCategorical
        return partial(dist, input_lens=action_space.nvec), \
            int(sum(action_space.nvec))
    # Unknown type -> Error.
    else:
        raise NotImplementedError("Unsupported args: {} {}".format(
            action_space, dist_type))

    return dist, dist.required_model_output_shape(action_space, config)
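

# Illustrative usage sketch (added for clarity; not from the original RLlib
# source). `_example_action_dist` is a hypothetical helper: for Discrete(2)
# actions and the default model config, get_action_dist() should resolve to
# Categorical (TorchCategorical for framework="torch") with an input size
# equal to the number of discrete actions.
def _example_action_dist():
    import gym.spaces
    from ray.rllib.models import MODEL_DEFAULTS, ModelCatalog
    dist_cls, dist_dim = ModelCatalog.get_action_dist(
        gym.spaces.Discrete(2), MODEL_DEFAULTS, framework="tf")
    return dist_cls, dist_dim  # e.g. (Categorical, 2)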