def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup placeholders
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

    # Create the model network and action outputs
    self.model = ModelCatalog.get_model({
        "obs": obs,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }, obs_space, action_space, self.logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Mapping from sample batch keys to placeholders. These keys will be
    # read from postprocessed sample batches and fed into the specified
    # placeholders during loss computation.
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),  # added during postprocessing
    ]

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=loss,
        loss_inputs=loss_in,
        model=self.model,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])
    sess.run(tf.global_variables_initializer())
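# `PGLoss` is used above but not defined in this section. A minimal sketch
# of what such a class could compute, assuming the vanilla REINFORCE
# objective -E[logp(a) * advantage]; the class name and `.loss` attribute
# follow the usage above, while the body is an assumption, not necessarily
# the actual implementation.
import tensorflow as tf


class PGLoss(object):
    def __init__(self, action_dist, actions, advantages):
        # Maximizing E[logp(a) * advantage] == minimizing its negation.
        self.loss = -tf.reduce_mean(
            action_dist.logp(actions) * advantages)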
def _setup_graph(self, ob_space, ac_space):
    self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
    self._model = ModelCatalog.get_model(
        self.registry, self.x, self.logit_dim, self.config["model"])
    self.logits = self._model.outputs
    self.curr_dist = dist_class(self.logits)
    self.sample = self.curr_dist.sample()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)
def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    _, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_torch_model(obs_space, self.logit_dim,
                                              self.config["model"])
    loss = A3CLoss(self.model, self.config["vf_loss_coeff"],
                   self.config["entropy_coeff"])
    TorchPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        self.model,
        loss,
        loss_inputs=["obs", "actions", "advantages", "value_targets"])
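# `A3CLoss` is referenced above but not shown. A hypothetical sketch of the
# standard A3C objective it presumably implements (policy-gradient term,
# weighted value loss, minus a weighted entropy bonus). The flat-tensor
# signature below is illustrative, not the class's actual interface.
import torch


def a3c_loss_example(logits, values, actions, advantages, value_targets,
                     vf_loss_coeff, entropy_coeff):
    dist = torch.distributions.Categorical(logits=logits)
    pi_loss = -(dist.log_prob(actions) * advantages).mean()
    vf_loss = 0.5 * (values.squeeze(-1) - value_targets).pow(2).mean()
    entropy = dist.entropy().mean()
    return pi_loss + vf_loss_coeff * vf_loss - entropy_coeff * entropy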
def _setup_graph(self, ob_space, ac_space):
    self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
    self._model = ModelCatalog.get_model(
        self.registry, self.x, self.logit_dim, self.config["model"])
    self.logits = self._model.outputs
    self.curr_dist = dist_class(self.logits)
    self.vf = tf.reshape(
        linear(self._model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.sample = self.curr_dist.sample()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)
    self.global_step = tf.get_variable(
        "global_step", [],
        tf.int32,
        initializer=tf.constant_initializer(0, dtype=tf.int32),
        trainable=False)
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = self.model.value_function()
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            if vf_config["use_lstm"]:
                vf_config["use_lstm"] = False
                logger.warning(
                    "It is not recommended to use an LSTM model with "
                    "vf_share_layers=False (consider setting it to True). "
                    "If you want to not share layers, you can implement "
                    "a custom LSTM model that overrides the "
                    "value_function() method.")
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model({
                    "obs": obs_ph,
                    "prev_actions": prev_actions_ph,
                    "prev_rewards": prev_rewards_ph,
                    "is_training": self._get_is_training_placeholder(),
                }, observation_space, action_space, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=curr_action_dist.sampled_action_prob(),
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
def __init__(self, obs_space, action_space, config):
    # Set up the config from possible default-config fn and given
    # config arg.
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    # Set the DL framework for this Policy.
    self.framework = self.config["framework"] = framework

    # Validate observation- and action-spaces.
    if validate_spaces:
        validate_spaces(self, obs_space, action_space, self.config)

    # Do some pre-initialization steps.
    if before_init:
        before_init(self, obs_space, action_space, self.config)

    # Model is customized (use default action dist class).
    if make_model:
        assert make_model_and_action_dist is None, \
            "Either `make_model` or `make_model_and_action_dist`" \
            " must be None!"
        self.model = make_model(self, obs_space, action_space, config)
        dist_class, _ = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework=framework)
    # Model and action dist class are customized.
    elif make_model_and_action_dist:
        self.model, dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
    # Use default model and default action dist.
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework=framework)
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework=framework)

    # Make sure we passed in a correct Model factory.
    model_cls = TorchModelV2 if framework == "torch" else JAXModelV2
    assert isinstance(self.model, model_cls), \
        "ERROR: Generated Model must be a {} object!".format(
            model_cls.__name__)

    # Call the framework-specific Policy constructor.
    self.parent_cls = parent_cls
    self.parent_cls.__init__(
        self,
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        model=self.model,
        loss=None if self.config["in_evaluation"] else loss_fn,
        action_distribution_class=dist_class,
        action_sampler_fn=action_sampler_fn,
        action_distribution_fn=action_distribution_fn,
        max_seq_len=config["model"]["max_seq_len"],
        get_batch_divisibility_req=get_batch_divisibility_req,
    )

    # Merge Model's view requirements into Policy's.
    self.view_requirements.update(self.model.view_requirements)

    _before_loss_init = before_loss_init or after_init
    if _before_loss_init:
        _before_loss_init(self, self.observation_space, self.action_space,
                          config)

    # Perform test runs through postprocessing- and loss functions.
    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=None if self.config["in_evaluation"] else stats_fn,
    )

    if _after_loss_init:
        _after_loss_init(self, obs_space, action_space, config)

    # Got to reset global_timestep again after this fake run-through.
    self.global_timestep = 0
def _setup_graph(self, ob_space, ac_space):
    _, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
    self._model = ModelCatalog.get_torch_model(
        self.registry, ob_space, self.logit_dim, self.config["model"])
    self.optimizer = torch.optim.Adam(
        self._model.parameters(), lr=self.config["lr"])
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()

    # Create input placeholders
    if existing_inputs:
        actions, dones, behaviour_logits, rewards, observations, \
            prev_actions, prev_rewards = existing_inputs[:7]
        existing_state_in = existing_inputs[7:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        if isinstance(action_space, gym.spaces.Discrete):
            ac_size = action_space.n
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for IMPALA.".format(
                    action_space))
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, ac_size], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

    # Setup the policy
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    action_dist = dist_class(self.model.outputs)
    values = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def to_batches(tensor):
        if self.config["model"]["use_lstm"]:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
        # swap B and T axes
        return tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=to_batches(actions)[:-1],
        actions_logp=to_batches(action_dist.logp(actions))[:-1],
        actions_entropy=to_batches(action_dist.entropy())[:-1],
        dones=to_batches(dones)[:-1],
        behaviour_logits=to_batches(behaviour_logits)[:-1],
        target_logits=to_batches(self.model.outputs)[:-1],
        discount=config["gamma"],
        rewards=to_batches(rewards)[:-1],
        values=to_batches(values)[:-1],
        bootstrap_value=to_batches(values)[-1],
        valid_mask=to_batches(mask)[:-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # KL divergence between worker and learner logits for debugging
    model_dist = Categorical(self.model.outputs)
    behaviour_dist = Categorical(behaviour_logits)
    self.KLs = model_dist.kl(behaviour_dist)
    self.mean_KL = tf.reduce_mean(self.KLs)
    self.max_KL = tf.reduce_max(self.KLs)
    self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0)

    # Initialize TFPolicyGraph
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", observations),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        loss=self.model.loss() + self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(to_batches(values)[:-1], [-1])),
            "mean_KL": self.mean_KL,
            "max_KL": self.max_KL,
            "median_KL": self.median_KL,
        },
    }
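# The [B * T] -> [T, B] reshape performed by `to_batches` above is easiest
# to see on a concrete array. A NumPy illustration of the same
# reshape-then-transpose (assuming use_lstm=False and sample_batch_size=T):
import numpy as np

flat = np.array([0, 1, 2, 10, 11, 12])  # two episodes (B=2) of length T=3
B, T = 2, 3
batched = flat.reshape(B, T).transpose(1, 0)  # -> time-major [T, B]
print(batched)
# [[ 0 10]
#  [ 1 11]
#  [ 2 12]]
# `[:-1]` then drops the final timestep row (yielding the [T - 1, B]
# V-trace inputs) and `[-1]` of the value tensor supplies the bootstrap
# value.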
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        observation_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as __init__
        make_model (func): optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config).
            All policy variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (func): optional function that returns a
            tuple of action and action logp tensors given
            (policy, model, input_dict, obs_space, action_space, config).
            If not specified, a default action distribution will be used.
        existing_inputs (OrderedDict): when copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        existing_model (ModelV2): when copying a policy, this specifies
            an existing model to clone and share weights with
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input

    Attributes:
        config: config of the policy
        model: model instance, if any
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self._input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    self._seq_lens = tf.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")

    # Setup model
    if action_sampler_fn:
        if not make_model:
            raise ValueError(
                "make_model is required if action_sampler_fn is given")
        self._dist_class = None
    else:
        self._dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if existing_model:
        self.model = existing_model
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            framework="tf")

    if existing_inputs:
        self._state_in = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        if self._state_in:
            self._seq_lens = existing_inputs["seq_lens"]
    else:
        self._state_in = [
            tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
            for s in self.model.get_initial_state()
        ]

    model_out, self._state_out = self.model(self._input_dict,
                                            self._state_in,
                                            self._seq_lens)

    # Setup action sampler
    if action_sampler_fn:
        action_sampler, action_logp = action_sampler_fn(
            self, self.model, self._input_dict, obs_space, action_space,
            config)
    else:
        action_dist = self._dist_class(model_out, self.model)
        action_sampler = action_dist.sample()
        action_logp = action_dist.sampled_action_logp()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_logp=action_logp,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self._state_in,
        state_outputs=self._state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self._seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    before_loss_init(self, obs_space, action_space, config)
    if not existing_inputs:
        self._initialize_loss()
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(self.observations, logit_dim,
                                        self.config["model"])
    action_dist = dist_class(self.model.outputs)
    values = tf.reshape(
        linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Discrete):
        ac_size = action_space.n
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for IMPALA.".format(
                action_space))
    dones = tf.placeholder(tf.bool, [None], name="dones")
    rewards = tf.placeholder(tf.float32, [None], name="rewards")
    behaviour_logits = tf.placeholder(
        tf.float32, [None, ac_size], name="behaviour_logits")

    def to_batches(tensor):
        if self.config["model"]["use_lstm"]:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
        # swap B and T axes
        return tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=to_batches(actions)[:-1],
        actions_logp=to_batches(action_dist.logp(actions))[:-1],
        actions_entropy=to_batches(action_dist.entropy())[:-1],
        dones=to_batches(dones)[:-1],
        behaviour_logits=to_batches(behaviour_logits)[:-1],
        target_logits=to_batches(self.model.outputs)[:-1],
        discount=config["gamma"],
        rewards=to_batches(rewards)[:-1],
        values=to_batches(values)[:-1],
        bootstrap_value=to_batches(values)[-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # Initialize TFPolicyGraph
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", self.observations),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(to_batches(values)[:-1], [-1])),
        },
    }
def __init__(self,
             action_space,
             value_targets,
             advantages,
             actions,
             logprobs,
             vf_preds,
             curr_action_dist,
             value_fn,
             cur_kl_coeff,
             entropy_coeff=0,
             clip_param=0.1,
             vf_loss_coeff=1.0,
             use_gae=True):
    """Constructs the loss for Proximal Policy Objective.

    Arguments:
        action_space: Environment action space specification.
        value_targets (Placeholder): Placeholder for target values; used
            for GAE.
        advantages (Placeholder): Placeholder for calculated advantages
            from previous model evaluation.
        actions (Placeholder): Placeholder for actions taken from previous
            model evaluation.
        logprobs (Placeholder): Placeholder for logits output from
            previous model evaluation.
        vf_preds (Placeholder): Placeholder for value function output from
            previous model evaluation.
        curr_action_dist (ActionDistribution): ActionDistribution of the
            current model.
        value_fn (Tensor): Current value function output Tensor.
        cur_kl_coeff (Variable): Variable holding the current PPO KL
            coefficient.
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip parameter.
        vf_loss_coeff (float): Coefficient of the value function loss.
        use_gae (bool): If true, use the Generalized Advantage Estimator.
    """
    dist_cls, _ = ModelCatalog.get_action_dist(action_space)
    prev_dist = dist_cls(logprobs)
    # Make loss functions.
    logp_ratio = tf.exp(
        curr_action_dist.logp(actions) - prev_dist.logp(actions))
    action_kl = prev_dist.kl(curr_action_dist)
    self.mean_kl = tf.reduce_mean(action_kl)

    curr_entropy = curr_action_dist.entropy()
    self.mean_entropy = tf.reduce_mean(curr_entropy)

    surrogate_loss = tf.minimum(
        advantages * logp_ratio,
        advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
                                      1 + clip_param))
    self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)

    if use_gae:
        vf_loss1 = tf.square(value_fn - value_targets)
        vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
                                                 -clip_param, clip_param)
        vf_loss2 = tf.square(vf_clipped - value_targets)
        # Take the pessimistic (elementwise max) of the unclipped and
        # clipped value losses, per PPO's clipped value objective.
        vf_loss = tf.maximum(vf_loss1, vf_loss2)
        self.mean_vf_loss = tf.reduce_mean(vf_loss)
        loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
                              vf_loss_coeff * vf_loss -
                              entropy_coeff * curr_entropy)
    else:
        self.mean_vf_loss = tf.constant(0.0)
        loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
                              entropy_coeff * curr_entropy)
    self.loss = loss
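# The clipped surrogate objective above is easier to see with concrete
# numbers. A small NumPy check, independent of the class; all values are
# illustrative.
import numpy as np

clip_param = 0.1
ratio = np.array([0.5, 1.0, 1.5])  # pi_new(a|s) / pi_old(a|s)
adv = np.array([1.0, 1.0, 1.0])

clipped = np.clip(ratio, 1 - clip_param, 1 + clip_param)
surrogate = np.minimum(adv * ratio, adv * clipped)
print(surrogate)  # [0.5 1.  1.1]
# For positive advantages the objective is capped at (1 + clip_param), so
# ratios beyond 1.1 contribute no extra gradient.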
def __init__(self, observation_space, action_space, config):
    assert tf.executing_eagerly()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    self._loss = loss_fn
    self.batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)
    self._max_seq_len = config["model"]["max_seq_len"]

    if get_default_config:
        config = dict(get_default_config(), **config)

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )
    self.exploration = self._create_exploration()
    self._state_in = [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ]
    input_dict = {
        SampleBatch.CUR_OBS: tf.convert_to_tensor(
            np.array([observation_space.sample()])),
        SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
            [flatten_to_single_ndarray(action_space.sample())]),
        SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
    }

    if action_distribution_fn:
        dist_inputs, self.dist_class, _ = action_distribution_fn(
            self, self.model, input_dict[SampleBatch.CUR_OBS])
    else:
        self.model(input_dict, self._state_in, tf.convert_to_tensor([1]))

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_with_dummy_batch()
    self._loss_initialized = True

    if optimizer_fn:
        self._optimizer = optimizer_fn(self, config)
    else:
        self._optimizer = tf.keras.optimizers.Adam(config["lr"])

    if after_init:
        after_init(self, observation_space, action_space, config)
def __init__(self,
             action_space,
             value_targets,
             advantages_ext,
             advantages_int,
             actions,
             logits,
             vf_preds,
             curr_action_dist,
             value_fn,
             cur_kl_coeff,
             rnd_target,
             rnd_predictor,
             entropy_coeff=0,
             clip_param=0.1,
             vf_clip_param=0.1,
             vf_loss_coeff=1.0,
             use_gae=True,
             rnd_pred_update_prop=0.25):
    """Constructs the loss for Proximal Policy Objective with Random
    Network Distillation.

    Arguments:
        action_space: Environment action space specification.
        value_targets (Placeholder): Placeholder for target values; used
            for GAE.
        advantages_ext (Placeholder): Placeholder for calculated extrinsic
            advantages from previous model evaluation.
        advantages_int (Placeholder): Placeholder for calculated intrinsic
            advantages from previous model evaluation.
        actions (Placeholder): Placeholder for actions taken from previous
            model evaluation.
        logits (Placeholder): Placeholder for logits output from previous
            model evaluation.
        vf_preds (Placeholder): Placeholder for value function output from
            previous model evaluation.
        curr_action_dist (ActionDistribution): ActionDistribution of the
            current model.
        value_fn (Tensor): Current value function output Tensor.
        cur_kl_coeff (Variable): Variable holding the current PPO KL
            coefficient.
        rnd_target (Tensor): Current RND target network output Tensor.
        rnd_predictor (Tensor): Current RND predictor network output
            Tensor.
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip parameter.
        vf_clip_param (float): Clip parameter for the value function.
        vf_loss_coeff (float): Coefficient of the value function loss.
        use_gae (bool): If true, use the Generalized Advantage Estimator.
        rnd_pred_update_prop (float): Proportion of experience used for
            the RND predictor update.
    """
    dist_cls, _ = ModelCatalog.get_action_dist(action_space)
    prev_dist = dist_cls(logits)
    # Make loss functions.
    logp_ratio = tf.exp(
        curr_action_dist.logp(actions) - prev_dist.logp(actions))
    action_kl = prev_dist.kl(curr_action_dist)
    self.mean_kl = tf.reduce_mean(action_kl)

    curr_entropy = curr_action_dist.entropy()
    self.mean_entropy = tf.reduce_mean(curr_entropy)

    surrogate_loss = tf.minimum(
        advantages_ext * logp_ratio,
        advantages_ext * tf.clip_by_value(logp_ratio, 1 - clip_param,
                                          1 + clip_param))
    self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)

    if use_gae:
        vf_loss1 = tf.square(value_fn - value_targets)
        vf_clipped = vf_preds + tf.clip_by_value(
            value_fn - vf_preds, -vf_clip_param, vf_clip_param)
        vf_loss2 = tf.square(vf_clipped - value_targets)
        vf_loss = tf.maximum(vf_loss1, vf_loss2)
        self.mean_vf_loss = tf.reduce_mean(vf_loss)
        loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
                              vf_loss_coeff * vf_loss -
                              entropy_coeff * curr_entropy)
    else:
        self.mean_vf_loss = tf.constant(0.0)
        loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
                              entropy_coeff * curr_entropy)

    # TODO: add value loss for intrinsic rewards

    # Add RND loss terms to vf_loss.
    # feat_var = tf.reduce_mean(tf.nn.moments(rnd_target, axes=[0])[1])
    # max_feat = tf.reduce_max(tf.abs(rnd_target))
    targets = tf.stop_gradient(rnd_target)
    self.int_rew = tf.reduce_mean(
        tf.square(targets - rnd_predictor), axis=-1, keepdims=True)
    self.aux_loss = tf.reduce_mean(tf.square(targets - rnd_predictor), -1)
    # Update the predictor on a random subsample of experience only.
    mask = tf.random_uniform(
        shape=tf.shape(self.aux_loss), minval=0., maxval=1.,
        dtype=tf.float32)
    mask = tf.cast(mask < rnd_pred_update_prop, tf.float32)
    self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
        tf.reduce_sum(mask), 1.)
    loss = loss + self.aux_loss

    self.loss = loss
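# The random mask above trains the RND predictor on roughly
# rnd_pred_update_prop of the samples, which keeps the predictor from
# catching up to the target network too quickly. A quick NumPy sanity
# check of the masked mean (values illustrative):
import numpy as np

rng = np.random.default_rng(0)
aux_loss = rng.random(1000)        # per-sample predictor errors
mask = rng.random(1000) < 0.25     # keep ~25% of samples

masked_mean = (mask * aux_loss).sum() / max(mask.sum(), 1)
print(mask.mean(), masked_mean)
# mask.mean() is ~0.25 and masked_mean equals aux_loss[mask].mean().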
def __init__(self, observation_space, action_space, config):
    # If this class runs as a @ray.remote actor, eager mode may not
    # have been activated yet.
    if not tf1.executing_eagerly():
        tf1.enable_eager_execution()
    self.framework = config.get("framework", "tfe")
    EagerTFPolicy.__init__(self, observation_space, action_space, config)

    # Global timestep should be a tensor.
    self.global_timestep = tf.Variable(0, trainable=False, dtype=tf.int64)
    self.explore = tf.Variable(
        self.config["explore"], trainable=False, dtype=tf.bool)

    # Log device and worker index.
    from ray.rllib.evaluation.rollout_worker import get_global_worker
    worker = get_global_worker()
    worker_idx = worker.worker_index if worker else 0
    if get_gpu_devices():
        logger.info("TF-eager Policy (worker={}) running on GPU.".format(
            worker_idx if worker_idx > 0 else "local"))
    else:
        logger.info("TF-eager Policy (worker={}) running on CPU.".format(
            worker_idx if worker_idx > 0 else "local"))

    self._is_training = False

    # Only for `config.eager_tracing=True`: A counter to keep track of
    # how many times an eager-traced method (e.g.
    # `self._compute_actions_helper`) has been re-traced by tensorflow.
    # We will raise an error if more than n re-tracings have been
    # detected, since this would considerably slow down execution.
    # The variable below should only get incremented during the
    # tf.function trace operations, never when calling the already
    # traced function after that.
    self._re_trace_counter = 0

    self._loss_initialized = False
    # To ensure backward compatibility:
    # Old way: If `loss` provided here, use as-is (as a function).
    if loss_fn is not None:
        self._loss = loss_fn
    # New way: Convert the overridden `self.loss` into a plain
    # function, so it can be called the same way as `loss` would
    # be, ensuring backward compatibility.
    elif self.loss.__func__.__qualname__ != "Policy.loss":
        self._loss = self.loss.__func__
    # `loss` not provided nor overridden from Policy -> Set to None.
    else:
        self._loss = None

    self.batch_divisibility_req = (
        get_batch_divisibility_req(self)
        if callable(get_batch_divisibility_req) else
        (get_batch_divisibility_req or 1))
    self._max_seq_len = config["model"]["max_seq_len"]

    if get_default_config:
        config = dict(get_default_config(), **config)

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )
    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    if optimizer_fn:
        optimizers = optimizer_fn(self, config)
    else:
        optimizers = tf.keras.optimizers.Adam(config["lr"])
    optimizers = force_list(optimizers)
    if getattr(self, "exploration", None):
        optimizers = self.exploration.get_exploration_optimizer(optimizers)

    # The list of local (tf) optimizers (one per loss term).
    self._optimizers: List[LocalOptimizer] = optimizers
    # Backward compatibility: A user's policy may only support a single
    # loss term and optimizer (no lists).
    self._optimizer: LocalOptimizer = \
        optimizers[0] if optimizers else None

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep.assign(0)
def _setup_graph(self, ob_space, ac_space):
    _, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
    self._model = ModelCatalog.get_torch_model(ob_space, self.logit_dim)
    self.optimizer = torch.optim.Adam(
        self._model.parameters(), lr=0.0001)
def __init__(self, observation_space, action_space, config):
    assert tf.executing_eagerly()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    self._loss = loss_fn
    self.batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)
    self._max_seq_len = config["model"]["max_seq_len"]

    if get_default_config:
        config = dict(get_default_config(), **config)

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )
    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_inference_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    self._state_in = [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ]
    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.inference_view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if optimizer_fn:
        optimizers = optimizer_fn(self, config)
    else:
        optimizers = tf.keras.optimizers.Adam(config["lr"])
    optimizers = force_list(optimizers)
    if getattr(self, "exploration", None):
        optimizers = self.exploration.get_exploration_optimizer(optimizers)
    # TODO: (sven) Allow tf policy to have more than 1 optimizer.
    #  Just like torch Policy does.
    self._optimizer = optimizers[0] if optimizers else None

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Got to reset global_timestep again after this fake run-through.
    self.global_timestep = 0
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        vf_config["use_lstm"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = self.model.value_function()
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    {
                        "obs": obs_ph,
                        "prev_actions": prev_actions_ph,
                        "prev_rewards": prev_rewards_ph,
                        "is_training": self._get_is_training_placeholder(),
                    }, observation_space, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.model.loss() + self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             update_ops_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        observation_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        update_ops_fn (func): optional function that returns a list
            overriding the update ops to run when applying gradients
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as __init__
        make_model (func): optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config).
            All policy variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (func): optional function that returns a
            tuple of action and action prob tensors given
            (policy, model, input_dict, obs_space, action_space, config).
            If not specified, a default action distribution will be used.
        existing_inputs (OrderedDict): when copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        existing_model (ModelV2): when copying a policy, this specifies
            an existing model to clone and share weights with
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input

    Attributes:
        config: config of the policy
        model: model instance, if any
        model_out: output tensors of the model
        action_dist: action distribution of the model, if any
        state_in: state input tensors, if any
        state_out: state output tensors, if any
        seq_lens: tensor of sequence lengths
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._update_ops_fn = update_ops_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    self.seq_lens = tf.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")

    # Setup model
    self.dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    if existing_model:
        self.model = existing_model
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            framework="tf")
    if existing_inputs:
        self.state_in = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        if self.state_in:
            self.seq_lens = existing_inputs["seq_lens"]
    else:
        self.state_in = [
            tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
            for s in self.model.get_initial_state()
        ]
    self.model_out, self.state_out = self.model(
        self.input_dict, self.state_in, self.seq_lens)

    # Setup action sampler
    if action_sampler_fn:
        self.action_dist = None
        self.dist_class = None
        action_sampler, action_prob = action_sampler_fn(
            self, self.model, self.input_dict, obs_space, action_space,
            config)
    else:
        self.action_dist = self.dist_class(self.model_out)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self.state_in,
        state_outputs=self.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    self._needs_eager_conversion = set()
    self._eager_tensors = {}
    before_loss_init(self, obs_space, action_space, config)
    if not existing_inputs:
        self._initialize_loss()
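# A minimal example of the `loss_fn` contract documented above: a function
# mapping (policy graph, dict of experience tensor placeholders) to a
# scalar loss tensor. The REINFORCE-style body is illustrative only; it
# relies on the `self.action_dist` attribute set up in __init__ above and
# on "actions"/"advantages" keys being present in the batch.
def example_loss_fn(policy, batch_tensors):
    logp = policy.action_dist.logp(batch_tensors["actions"])
    return -tf.reduce_mean(logp * batch_tensors["advantages"])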
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        self.loss_in = existing_inputs
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        # Targets of the value function.
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        # Advantage values in the policy gradient estimator.
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        # Log probabilities from the policy before the policy update.
        logprobs_ph = tf.placeholder(
            tf.float32, name="logprobs", shape=(None, logit_dim))
        # Value function predictions before the policy update.
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        self.loss_in = [("obs", obs_ph),
                        ("value_targets", value_targets_ph),
                        ("advantages", adv_ph),
                        ("actions", act_ph),
                        ("logprobs", logprobs_ph),
                        ("vf_preds", vf_preds_ph)]
        # TODO(ekl) feed RNN states in here

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = ModelCatalog.get_model(obs_ph, logit_dim,
                                         self.config["model"]).outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()

    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.constant("NA")

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logprobs_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    self.is_training = tf.placeholder_with_default(True, ())

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        is_training=self.is_training)
def __init__( self, observation_space: gym.spaces.Space, action_space: gym.spaces.Space, config: TrainerConfigDict, *, model: Optional[TorchModelV2] = None, loss: Optional[Callable[ [Policy, ModelV2, Type[TorchDistributionWrapper], SampleBatch], Union[TensorType, List[TensorType]]]] = None, action_distribution_class: Optional[ Type[TorchDistributionWrapper]] = None, action_sampler_fn: Optional[Callable[[TensorType, List[TensorType]], Tuple[TensorType, TensorType]]] = None, action_distribution_fn: Optional[ Callable[[Policy, ModelV2, TensorType, TensorType, TensorType], Tuple[TensorType, Type[TorchDistributionWrapper], List[TensorType]]]] = None, max_seq_len: int = 20, get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None, ): """Initializes a TorchPolicy instance. Args: observation_space: Observation space of the policy. action_space: Action space of the policy. config: The Policy's config dict. model: PyTorch policy module. Given observations as input, this module must return a list of outputs where the first item is action logits, and the rest can be any value. loss: Callable that returns one or more (a list of) scalar loss terms. action_distribution_class: Class for a torch action distribution. action_sampler_fn: A callable returning a sampled action and its log-likelihood given Policy, ModelV2, input_dict, state batches (optional), explore, and timestep. Provide `action_sampler_fn` if you would like to have full control over the action computation step, including the model forward pass, possible sampling from a distribution, and exploration logic. Note: If `action_sampler_fn` is given, `action_distribution_fn` must be None. If both `action_sampler_fn` and `action_distribution_fn` are None, RLlib will simply pass inputs through `self.model` to get distribution inputs, create the distribution object, sample from it, and apply some exploration logic to the results. The callable takes as inputs: Policy, ModelV2, input_dict (SampleBatch), state_batches (optional), explore, and timestep. action_distribution_fn: A callable returning distribution inputs (parameters), a dist-class to generate an action distribution object from, and internal-state outputs (or an empty list if not applicable). Provide `action_distribution_fn` if you would like to only customize the model forward pass call. The resulting distribution parameters are then used by RLlib to create a distribution object, sample from it, and execute any exploration logic. Note: If `action_distribution_fn` is given, `action_sampler_fn` must be None. If both `action_sampler_fn` and `action_distribution_fn` are None, RLlib will simply pass inputs through `self.model` to get distribution inputs, create the distribution object, sample from it, and apply some exploration logic to the results. The callable takes as inputs: Policy, ModelV2, ModelInputDict, explore, timestep, is_training. max_seq_len: Max sequence length for LSTM training. get_batch_divisibility_req: Optional callable that returns the divisibility requirement for sample batches given the Policy. """ self.framework = config["framework"] = "torch" super().__init__(observation_space, action_space, config) # Create multi-GPU model towers, if necessary. # - The central main model will be stored under self.model, residing # on self.device (normally, a CPU). # - Each GPU will have a copy of that model under # self.model_gpu_towers, matching the devices in self.devices. 
# - Parallelization is done by splitting the train batch and passing # it through the model copies in parallel, then averaging over the # resulting gradients, applying these averages on the main model and # updating all towers' weights from the main model. # - In case of just one device (1 (fake or real) GPU or 1 CPU), no # parallelization will be done. # If no Model is provided, build a default one here. if model is None: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"], framework=self.framework) model = ModelCatalog.get_model_v2( obs_space=self.observation_space, action_space=self.action_space, num_outputs=logit_dim, model_config=self.config["model"], framework=self.framework) if action_distribution_class is None: action_distribution_class = dist_class # Get devices to build the graph on. worker_idx = self.config.get("worker_index", 0) if not config["_fake_gpus"] and \ ray.worker._mode() == ray.worker.LOCAL_MODE: num_gpus = 0 elif worker_idx == 0: num_gpus = config["num_gpus"] else: num_gpus = config["num_gpus_per_worker"] gpu_ids = list(range(torch.cuda.device_count())) # Place on one or more CPU(s) when either: # - Fake GPU mode. # - num_gpus=0 (either set by user or we are in local_mode=True). # - No GPUs available. if config["_fake_gpus"] or num_gpus == 0 or not gpu_ids: logger.info("TorchPolicy (worker={}) running on {}.".format( worker_idx if worker_idx > 0 else "local", "{} fake-GPUs".format(num_gpus) if config["_fake_gpus"] else "CPU")) self.device = torch.device("cpu") self.devices = [ self.device for _ in range(int(math.ceil(num_gpus)) or 1) ] self.model_gpu_towers = [ model if i == 0 else copy.deepcopy(model) for i in range(int(math.ceil(num_gpus)) or 1) ] if hasattr(self, "target_model"): self.target_models = { m: self.target_model for m in self.model_gpu_towers } self.model = model # Place on one or more actual GPU(s), when: # - num_gpus > 0 (set by user) AND # - local_mode=False AND # - actual GPUs available AND # - non-fake GPU mode. else: logger.info("TorchPolicy (worker={}) running on {} GPU(s).".format( worker_idx if worker_idx > 0 else "local", num_gpus)) # We are a remote worker (WORKER_MODE=1): # GPUs should be assigned to us by ray. if ray.worker._mode() == ray.worker.WORKER_MODE: gpu_ids = ray.get_gpu_ids() if len(gpu_ids) < num_gpus: raise ValueError( "TorchPolicy was not able to find enough GPU IDs! Found " f"{gpu_ids}, but num_gpus={num_gpus}.") self.devices = [ torch.device("cuda:{}".format(i)) for i, id_ in enumerate(gpu_ids) if i < num_gpus ] self.device = self.devices[0] ids = [id_ for i, id_ in enumerate(gpu_ids) if i < num_gpus] self.model_gpu_towers = [] for i, _ in enumerate(ids): model_copy = copy.deepcopy(model) self.model_gpu_towers.append(model_copy.to(self.devices[i])) if hasattr(self, "target_model"): self.target_models = { m: copy.deepcopy(self.target_model).to(self.devices[i]) for i, m in enumerate(self.model_gpu_towers) } self.model = self.model_gpu_towers[0] # Lock used for locking some methods on the object-level. # This prevents possible race conditions when calling the model # first, then its value function (e.g. in a loss function), in # between of which another model call is made (e.g. to compute an # action). self._lock = threading.RLock() self._state_inputs = self.model.get_initial_state() self._is_recurrent = len(self._state_inputs) > 0 # Auto-update model's inference view requirements, if recurrent. 
self._update_model_view_requirements_from_init_state() # Combine view_requirements for Model and Policy. self.view_requirements.update(self.model.view_requirements) self.exploration = self._create_exploration() self.unwrapped_model = model # used to support DistributedDataParallel # To ensure backward compatibility: # Old way: If `loss` provided here, use as-is (as a function). if loss is not None: self._loss = loss # New way: Convert the overridden `self.loss` into a plain function, # so it can be called the same way as `loss` would be, ensuring # backward compatibility. elif self.loss.__func__.__qualname__ != "Policy.loss": self._loss = self.loss.__func__ # `loss` not provided nor overridden from Policy -> Set to None. else: self._loss = None self._optimizers = force_list(self.optimizer()) # Store, which params (by index within the model's list of # parameters) should be updated per optimizer. # Maps optimizer idx to set or param indices. self.multi_gpu_param_groups: List[Set[int]] = [] main_params = {p: i for i, p in enumerate(self.model.parameters())} for o in self._optimizers: param_indices = [] for pg_idx, pg in enumerate(o.param_groups): for p in pg["params"]: param_indices.append(main_params[p]) self.multi_gpu_param_groups.append(set(param_indices)) # Create n sample-batch buffers (num_multi_gpu_tower_stacks), each # one with m towers (num_gpus). num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1) self._loaded_batches = [[] for _ in range(num_buffers)] self.dist_class = action_distribution_class self.action_sampler_fn = action_sampler_fn self.action_distribution_fn = action_distribution_fn # If set, means we are using distributed allreduce during learning. self.distributed_world_size = None self.max_seq_len = max_seq_len self.batch_divisibility_req = get_batch_divisibility_req(self) if \ callable(get_batch_divisibility_req) else \ (get_batch_divisibility_req or 1)
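# Minimal sketch of the tower-averaging scheme described in the comments
# above, under simplifying assumptions (a single optimizer and towers that
# already hold synchronized copies of the main model). `towers`,
# `sub_batches`, and `loss_fn` are hypothetical names; this is not RLlib's
# actual multi-GPU code path.
import torch

def apply_averaged_tower_gradients(model, towers, sub_batches, loss_fn,
                                   optimizer):
    tower_grads = []
    for tower, sub_batch in zip(towers, sub_batches):
        tower.zero_grad()
        loss_fn(tower, sub_batch).backward()
        tower_grads.append(
            [p.grad.detach().clone() for p in tower.parameters()])
    # Average the gradients over all towers and apply them to the main
    # model, from which the towers would then be re-synchronized.
    optimizer.zero_grad()
    for i, p in enumerate(model.parameters()):
        p.grad = torch.stack([g[i] for g in tower_grads]).mean(dim=0)
    optimizer.step()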
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": self.observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards }, observation_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.vf), }, } self.sess.run(tf.global_variables_initializer())
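# Sketch of the `vf_explained_var` diagnostic reported above, using one
# common definition: 1 - Var[y - y_pred] / Var[y]. A value near 1 means the
# value function explains most of the variance in the targets; RLlib's TF
# helper may differ in details (e.g., clipping the result).
import numpy as np

def explained_variance_np(y, y_pred):
    var_y = np.var(y)
    if var_y == 0:
        return float("nan")
    return 1.0 - np.var(y - y_pred) / var_y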
def __init__(self, observation_space, action_space, config): assert tf.executing_eagerly() self.framework = "tf" Policy.__init__(self, observation_space, action_space, config) self._is_training = False self._loss_initialized = False self._sess = None if get_default_config: config = dict(get_default_config(), **config) if before_init: before_init(self, observation_space, action_space, config) self.config = config self.dist_class = None if action_sampler_fn: if not make_model: raise ValueError("`make_model` is required if " "`action_sampler_fn` is given") else: self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if make_model: self.model = make_model(self, observation_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( observation_space, action_space, logit_dim, config["model"], framework="tf", ) self._state_in = [ tf.convert_to_tensor(np.array([s])) for s in self.model.get_initial_state() ] input_dict = { SampleBatch.CUR_OBS: tf.convert_to_tensor(np.array([observation_space.sample()])), SampleBatch.PREV_ACTIONS: tf.convert_to_tensor([_flatten_action(action_space.sample())]), SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]), } self.model(input_dict, self._state_in, tf.convert_to_tensor([1])) if before_loss_init: before_loss_init(self, observation_space, action_space, config) self._initialize_loss_with_dummy_batch() self._loss_initialized = True if optimizer_fn: self._optimizer = optimizer_fn(self, config) else: self._optimizer = tf.train.AdamOptimizer(config["lr"]) if after_init: after_init(self, observation_space, action_space, config)
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False actions_shape = [None] output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True actions_shape = [None, len(action_space.nvec)] output_hidden_shape = action_space.nvec.astype(np.int32) else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) # Create input placeholders if existing_inputs: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: actions = tf.placeholder(tf.int64, actions_shape, name="ac") dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( tf.float32, [None, sum(output_hidden_shape)], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Unpack behaviour logits unpacked_behaviour_logits = tf.split(behaviour_logits, output_hidden_shape, axis=1) # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) unpacked_outputs = tf.split(self.model.outputs, output_hidden_shape, axis=1) dist_inputs = unpacked_outputs if is_multidiscrete else \ self.model.outputs action_dist = dist_class(dist_inputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def make_time_major(tensor, drop_last=False): """Swaps batch and trajectory axis. Args: tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. """ if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. 
TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards, dtype=tf.bool) # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims(actions, axis=1) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. self.loss = VTraceLoss( actions=make_time_major(loss_actions, drop_last=True), actions_logp=make_time_major(action_dist.logp(actions), drop_last=True), actions_entropy=make_time_major(action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major(unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major(unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions.percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ (SampleBatch.ACTIONS, actions), (SampleBatch.DONES, dones), (BEHAVIOUR_LOGITS, behaviour_logits), (SampleBatch.REWARDS, rewards), (SampleBatch.CUR_OBS, observations), (SampleBatch.PREV_ACTIONS, prev_actions), (SampleBatch.PREV_REWARDS, prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { LEARNER_STATS_KEY: dict( { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(make_time_major(values, drop_last=True), [-1])), }, 
**self.KL_stats), }
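# Standalone numeric sketch of the axis swap performed by `make_time_major`
# above: a flat [B * T] batch is reshaped to [B, T] and transposed to
# [T, B], since the V-trace computation expects time-major inputs. numpy
# stands in for the TF ops purely for illustration.
import numpy as np

B, T = 2, 3
flat = np.arange(B * T)                          # shape [B * T] = [6]
time_major = flat.reshape(B, T).swapaxes(0, 1)   # shape [T, B] = [3, 2]
assert time_major[:, 0].tolist() == [0, 1, 2]    # trajectory of batch item 0
dropped = time_major[:-1]                        # mirrors drop_last=True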
def make_model_and_action_dist(policy, obs_space, action_space, config): """Create the model network and action distribution for the policy.""" policy.device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) policy.log_stats = config["log_stats"] # flag to log statistics if policy.log_stats: policy.stats_dict = {} policy.stats_fn = config["stats_fn"] # Keys of the observation space that must be used at train and test time ('signal' and 'action_mask' will be excluded # from the model's obs space) policy.train_obs_keys = config["train_obs_keys"] policy.test_obs_keys = config["test_obs_keys"] # Check whether the policy action space is wrapped inside a Tuple space policy.requires_tupling = False if isinstance(action_space, Tuple) and len(action_space.spaces) == 1: policy.action_space = action_space.spaces[0] action_space = action_space.spaces[0] policy.requires_tupling = True if not isinstance(action_space, Discrete): raise UnsupportedSpaceException( "Action space {} is not supported for DQN.".format(action_space)) # Get real observation space if isinstance(obs_space, Box): assert hasattr(obs_space, "original_space"), "Invalid observation space" obs_space = obs_space.original_space if isinstance(obs_space, Tuple): obs_space = obs_space.spaces[0] assert isinstance(obs_space, Dict), "Invalid observation space" policy.has_action_mask = "action_mask" in obs_space.spaces assert all([k in obs_space.spaces for k in policy.train_obs_keys]), "Invalid train keys specification" assert all([k in obs_space.spaces for k in policy.test_obs_keys]), "Invalid test keys specification" # Get observation space used for training if config["train_obs_space"] is None: train_obs_space = obs_space else: train_obs_space = config["train_obs_space"] if isinstance(train_obs_space, Box): assert hasattr(train_obs_space, "original_space"), "Invalid observation space" train_obs_space = train_obs_space.original_space if isinstance(train_obs_space, Tuple): train_obs_space = train_obs_space.spaces[0] # Obs spaces used for training and testing sp = Dict({ k: obs_space.spaces[k] for k in policy.test_obs_keys }) policy.real_test_obs_space = flatten_space(sp) policy.real_test_obs_space.original_space = sp model_space = Dict({ k: obs_space.spaces[k] for k in policy.test_obs_keys if k != "signal" and k != "action_mask" }) sp = Dict({ k: train_obs_space.spaces[k] for k in policy.train_obs_keys }) policy.real_train_obs_space = flatten_space(sp) policy.real_train_obs_space.original_space = sp policy.n_actions = action_space.n def update_target(): pass policy.update_target = update_target model = FullyConnectedNetwork(flatten_space(model_space), action_space, action_space.n, name="FcNet", model_config=config['model']).to(policy.device) return model, ModelCatalog.get_action_dist(action_space, config, framework='torch')
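# Brief sketch of what `flatten_space` does to the Dict spaces assembled
# above, assuming it is gym.spaces.utils.flatten_space (an assumption; the
# import is not visible in this snippet): it returns an equivalent flat Box
# whose size is the sum of the flattened component sizes.
from gym.spaces import Box, Dict
from gym.spaces.utils import flatten_space

sp = Dict({
    "obs": Box(low=0.0, high=1.0, shape=(4, )),
    "action_mask": Box(low=0.0, high=1.0, shape=(3, )),
})
assert flatten_space(sp).shape == (7, )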
def __init__(self, obs_space, action_space, num_outputs, model_config, name): model_config = with_base_config( base_config=DEFAULT_STRATEGO_MODEL_CONFIG, extra_config=model_config) TFModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name) print(model_config) observation_mode = model_config['custom_options']['observation_mode'] if observation_mode == PARTIALLY_OBSERVABLE: self._obs_key = 'partial_observation' elif observation_mode == FULLY_OBSERVABLE: self._obs_key = 'full_observation' elif observation_mode == BOTH_OBSERVATIONS: raise NotImplementedError else: assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]" self._action_dist_class, self._logit_dim = ModelCatalog.get_action_dist( self.action_space, model_config) self.use_lstm = model_config['use_lstm'] self.fake_lstm = model_config['custom_options'].get('fake_lstm', False) self.mask_invalid_actions = model_config['custom_options'][ 'mask_invalid_actions'] conv_activation = get_activation_fn( model_config.get("conv_activation")) base_lstm_filters = model_config["custom_options"]['base_lstm_filters'] base_cnn_filters = model_config["custom_options"]['base_cnn_filters'] pi_cnn_filters = model_config["custom_options"]['pi_cnn_filters'] q_cnn_filters = model_config["custom_options"]['q_cnn_filters'] rows = obs_space.original_space[self._obs_key].shape[0] columns = obs_space.original_space[self._obs_key].shape[1] if self.use_lstm: self._lstm_state_shape = (rows, columns, base_lstm_filters[0][0]) if self.use_lstm and not self.fake_lstm: self._base_model_out_shape = (rows, columns, base_lstm_filters[0][0]) else: self._base_model_out_shape = (rows, columns, base_cnn_filters[-1][0]) if self.use_lstm: state_in = [ tf.keras.layers.Input(shape=self._lstm_state_shape, name="base_lstm_h"), tf.keras.layers.Input(shape=self._lstm_state_shape, name="base_lstm_c") ] seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in") self._obs_inputs = tf.keras.layers.Input( shape=(None, *obs_space.original_space[self._obs_key].shape), name="observation") self._base_model_out = tf.keras.layers.Input( shape=self._base_model_out_shape, name="model_out") else: state_in, seq_lens_in = None, None self._obs_inputs = tf.keras.layers.Input( shape=obs_space.original_space[self._obs_key].shape, name="observation") self._base_model_out = tf.keras.layers.Input( shape=self._base_model_out_shape, name="model_out") def maybe_td(layer): if self.use_lstm: return tf.keras.layers.TimeDistributed(layer=layer, name=f"td_{layer.name}") else: return layer def build_shared_base_layers(prefix: str, obs_in: tf.Tensor, state_in: tf.Tensor): # obs_in = tf.debugging.check_numerics( # obs_in, f"nan found in obs_in", name=None) _last_layer = obs_in for i, (out_size, kernel, stride) in enumerate(base_cnn_filters): _last_layer = maybe_td( tf.keras.layers.Conv2D(filters=out_size, kernel_size=kernel, strides=stride, activation=conv_activation, padding="same", name="{}_conv_{}".format( prefix, i)))(_last_layer) # _last_layer = tf.debugging.check_numerics( # _last_layer, f"nan found in _last_layer {i}", name=None) base_state_out = state_in if self.use_lstm and not self.fake_lstm: for i, (out_size, kernel, stride) in enumerate(base_lstm_filters): if i > 0: raise NotImplementedError( "Only single lstm layers are implemented right now" ) _last_layer, *base_state_out = tf.keras.layers.ConvLSTM2D( filters=out_size, kernel_size=kernel, strides=stride, activation=conv_activation, padding="same", data_format='channels_last',
return_sequences=True, return_state=True, name="{}_convlstm".format(prefix))( inputs=_last_layer, initial_state=state_in, mask=tf.sequence_mask(seq_lens_in)) return _last_layer, base_state_out def build_pi_layers(input_layer): _last_layer = input_layer for i, (out_size, kernel, stride) in enumerate(pi_cnn_filters): _last_layer = tf.keras.layers.Conv2D( filters=out_size, kernel_size=kernel, strides=stride, activation=conv_activation, padding="same", name="{}_conv_{}".format('pi', i))(_last_layer) print( f"action space n: {action_space.n}, rows: {rows}, columns: {columns}, filters: {int(action_space.n / (rows * columns))}" ) unmasked_logits = tf.keras.layers.Conv2D( filters=int(action_space.n / (rows * columns)), kernel_size=[3, 3], strides=1, activation=None, padding="same", name="{}_conv_{}".format('pi', "unmasked_logits"))(_last_layer) return unmasked_logits def build_q_layers(input_layer, prefix): _last_layer = input_layer for i, (out_size, kernel, stride) in enumerate(q_cnn_filters): _last_layer = tf.keras.layers.Conv2D( filters=out_size, kernel_size=kernel, strides=stride, activation=conv_activation, padding="same", name="{}_conv_{}".format(prefix, i))(_last_layer) q_val = tf.keras.layers.Conv2D( filters=int(action_space.n / (rows * columns)), kernel_size=[3, 3], strides=1, activation=None, padding="same", name="{}_conv_{}".format(prefix, "q_out"))(_last_layer) return q_val base_model_out, state_out = build_shared_base_layers( prefix="shared_base", obs_in=self._obs_inputs, state_in=state_in) pi_unmasked_logits_out = build_pi_layers( input_layer=self._base_model_out) q1_out = build_q_layers(input_layer=self._base_model_out, prefix="q1") q2_out = build_q_layers(input_layer=self._base_model_out, prefix="q2") base_inputs = [self._obs_inputs] base_outputs = [base_model_out] if self.use_lstm: base_inputs += [seq_lens_in, *state_in] base_outputs += [*state_out] self._base_model = tf.keras.Model(name=f"{name}_base", inputs=base_inputs, outputs=base_outputs) self.pi_model = tf.keras.Model(name=f"{name}_pi_head", inputs=[self._base_model_out], outputs=[pi_unmasked_logits_out]) self.q1_model = tf.keras.Model(name=f"{name}_q1_head", inputs=[self._base_model_out], outputs=[q1_out]) self.q2_model = tf.keras.Model(name=f"{name}_q2_head", inputs=[self._base_model_out], outputs=[q2_out]) print(self._base_model.summary()) print(self.pi_model.summary()) print(self.q1_model.summary()) print(self.q2_model.summary()) self.register_variables(self._base_model.variables) self.register_variables(self.pi_model.variables) self.register_variables(self.q1_model.variables) self.register_variables(self.q2_model.variables) self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha") self.alpha = tf.exp(self.log_alpha) self.register_variables([self.log_alpha])
def __init__(self, obs_space, action_space, config): if get_default_config: config = dict(get_default_config(), **config) self.config = config if validate_spaces: validate_spaces(self, obs_space, action_space, self.config) if before_init: before_init(self, obs_space, action_space, self.config) # Model is customized (use default action dist class). if make_model: assert make_model_and_action_dist is None, \ "Either `make_model` or `make_model_and_action_dist`" \ " must be None!" self.model = make_model(self, obs_space, action_space, config) dist_class, _ = ModelCatalog.get_action_dist( action_space, self.config["model"], framework="torch") # Model and action dist class are customized. elif make_model_and_action_dist: self.model, dist_class = make_model_and_action_dist( self, obs_space, action_space, config) # Use default model and default action dist. else: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"], framework="torch") self.model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=logit_dim, model_config=self.config["model"], framework="torch") # Make sure we passed in a correct Model factory. assert isinstance(self.model, TorchModelV2), \ "ERROR: Generated Model must be a TorchModelV2 object!" TorchPolicy.__init__( self, observation_space=obs_space, action_space=action_space, config=config, model=self.model, loss=loss_fn, action_distribution_class=dist_class, action_sampler_fn=action_sampler_fn, action_distribution_fn=action_distribution_fn, max_seq_len=config["model"]["max_seq_len"], get_batch_divisibility_req=get_batch_divisibility_req, ) self.view_requirements.update( self.model.inference_view_requirements) _before_loss_init = before_loss_init or after_init if _before_loss_init: _before_loss_init(self, self.observation_space, self.action_space, config) self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True, stats_fn=stats_fn, ) if _after_loss_init: _after_loss_init(self, obs_space, action_space, config) # Got to reset global_timestep again after this fake run-through. self.global_timestep = 0
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True output_hidden_shape = action_space.nvec.astype(np.int32) elif self.config["vtrace"]: raise UnsupportedSpaceException( "Action space {} is not supported for APPO + VTrace.".format(action_space)) else: is_multidiscrete = False output_hidden_shape = 1 # Policy network model dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Create input placeholders if existing_inputs: if self.config["vtrace"]: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards, adv_ph, value_targets = \ existing_inputs[:9] existing_state_in = existing_inputs[9:-1] existing_seq_lens = existing_inputs[-1] else: actions = ModelCatalog.get_action_placeholder(action_space) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( tf.float32, [None, logit_dim], name="behaviour_logits") observations = tf.placeholder( tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None if not self.config["vtrace"]: adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None, )) value_targets = tf.placeholder( tf.float32, name="value_targets", shape=(None, )) self.observations = observations # Unpack behaviour logits unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) unpacked_outputs = tf.split( self.model.outputs, output_hidden_shape, axis=1) dist_inputs = unpacked_outputs if is_multidiscrete else \ self.model.outputs prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ behaviour_logits action_dist = dist_class(dist_inputs) prev_action_dist = dist_class(prev_dist_inputs) values = self.model.value_function() self.value_function = values self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def make_time_major(tensor, drop_last=False): """Swaps batch and trajectory axis. Args: tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes.
""" if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. if self.config["vtrace"]: logger.info("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims( actions, axis=1) self.loss = VTraceSurrogateLoss( actions=make_time_major(loss_actions, drop_last=True), prev_actions_logp=make_time_major( prev_action_dist.logp(actions), drop_last=True), actions_logp=make_time_major( action_dist.logp(actions), drop_last=True), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major( action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major( unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major( unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config[ "vtrace_clip_pg_rho_threshold"], clip_param=self.config["clip_param"]) else: logger.info("Using PPO surrogate loss (vtrace=False)") self.loss = PPOSurrogateLoss( prev_actions_logp=make_time_major( prev_action_dist.logp(actions)), actions_logp=make_time_major(action_dist.logp(actions)), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy()), values=make_time_major(values), valid_mask=make_time_major(mask), advantages=make_time_major(adv_ph), value_targets=make_time_major(value_targets), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions. 
percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] if not self.config["vtrace"]: loss_in.append(("advantages", adv_ph)) loss_in.append(("value_targets", value_targets)) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) values_batched = make_time_major( values, drop_last=self.config["vtrace"]) self.stats_fetches = { "stats": dict({ "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.value_targets, [-1]), tf.reshape(values_batched, [-1])), }, **self.KL_stats), }
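# Numeric sketch of the logit unpacking used above: for a MultiDiscrete
# action space, tf.split(behaviour_logits, output_hidden_shape, axis=1)
# slices the concatenated per-sub-action logits back apart. numpy
# equivalent for illustration only:
import numpy as np

output_hidden_shape = [2, 3]              # e.g. MultiDiscrete([2, 3])
logits = np.arange(10.0).reshape(2, 5)    # batch of 2, with 2 + 3 logits each
splits = np.split(logits, np.cumsum(output_hidden_shape)[:-1], axis=1)
assert [s.shape[1] for s in splits] == output_hidden_shape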
def __init__( self, obs_space: gym.spaces.Space, action_space: gym.spaces.Space, config: TrainerConfigDict, loss_fn: Callable[[Policy, ModelV2, type, SampleBatch], TensorType], *, stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[str, TensorType]]] = None, grad_stats_fn: Optional[ Callable[[Policy, SampleBatch, ModelGradients], Dict[str, TensorType]]] = None, before_loss_init: Optional[Callable[[ Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict ], None]] = None, make_model: Optional[Callable[[ Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict ], ModelV2]] = None, action_sampler_fn: Optional[Callable[ [TensorType, List[TensorType]], Tuple[TensorType, TensorType]]] = None, action_distribution_fn: Optional[ Callable[[Policy, ModelV2, TensorType, TensorType, TensorType], Tuple[TensorType, type, List[TensorType]]]] = None, existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None, existing_model: Optional[ModelV2] = None, get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None, obs_include_prev_action_reward: bool = True): """Initialize a dynamic TF policy. Arguments: obs_space (gym.spaces.Space): Observation space of the policy. action_space (gym.spaces.Space): Action space of the policy. config (TrainerConfigDict): Policy-specific configuration data. loss_fn (Callable[[Policy, ModelV2, type, SampleBatch], TensorType]): Function that returns a loss tensor for the policy graph. stats_fn (Optional[Callable[[Policy, SampleBatch], Dict[str, TensorType]]]): Optional function that returns a dict of TF fetches given the policy and batch input tensors. grad_stats_fn (Optional[Callable[[Policy, SampleBatch, ModelGradients], Dict[str, TensorType]]]): Optional function that returns a dict of TF fetches given the policy, sample batch, and loss gradient tensors. before_loss_init (Optional[Callable[ [Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], None]]): Optional function to run prior to loss init that takes the same arguments as __init__. make_model (Optional[Callable[[Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional function that returns a ModelV2 object given policy, obs_space, action_space, and policy config. All policy variables should be created in this function. If not specified, a default model will be created. action_sampler_fn (Optional[Callable[[Policy, ModelV2, Dict[ str, TensorType], TensorType, TensorType], Tuple[TensorType, TensorType]]]): A callable returning a sampled action and its log-likelihood given Policy, ModelV2, input_dict, explore, timestep, and is_training. action_distribution_fn (Optional[Callable[[Policy, ModelV2, Dict[str, TensorType], TensorType, TensorType], Tuple[TensorType, type, List[TensorType]]]]): A callable returning distribution inputs (parameters), a dist-class to generate an action distribution object from, and internal-state outputs (or an empty list if not applicable). Note: No Exploration hooks need to be called from within `action_distribution_fn`. It should only perform a simple forward pass through some model. If None, pass inputs through `self.model()` to get distribution inputs. The callable takes as inputs: Policy, ModelV2, input_dict, explore, timestep, is_training. existing_inputs (Optional[Dict[str, tf1.placeholder]]): When copying a policy, this specifies an existing dict of placeholders to use instead of defining new ones.
existing_model (Optional[ModelV2]): When copying a policy, this specifies an existing model to clone and share weights with. get_batch_divisibility_req (Optional[Callable[[Policy], int]]): Optional callable that returns the divisibility requirement for sample batches given the Policy. obs_include_prev_action_reward (bool): Whether to include the previous action and reward in the model input (default: True). """ self.observation_space = obs_space self.action_space = action_space self.config = config self.framework = "tf" self._loss_fn = loss_fn self._stats_fn = stats_fn self._grad_stats_fn = grad_stats_fn self._obs_include_prev_action_reward = obs_include_prev_action_reward # Setup standard placeholders prev_actions = None prev_rewards = None if existing_inputs is not None: obs = existing_inputs[SampleBatch.CUR_OBS] if self._obs_include_prev_action_reward: prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS] prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS] action_input = existing_inputs[SampleBatch.ACTIONS] explore = existing_inputs["is_exploring"] timestep = existing_inputs["timestep"] else: obs = tf1.placeholder(tf.float32, shape=[None] + list(obs_space.shape), name="observation") action_input = ModelCatalog.get_action_placeholder(action_space) if self._obs_include_prev_action_reward: prev_actions = ModelCatalog.get_action_placeholder( action_space, "prev_action") prev_rewards = tf1.placeholder(tf.float32, [None], name="prev_reward") explore = tf1.placeholder_with_default(True, (), name="is_exploring") timestep = tf1.placeholder(tf.int32, (), name="timestep") self._input_dict = { SampleBatch.CUR_OBS: obs, SampleBatch.PREV_ACTIONS: prev_actions, SampleBatch.PREV_REWARDS: prev_rewards, "is_training": self._get_is_training_placeholder(), } # Placeholder for RNN time-chunk valid lengths. self._seq_lens = tf1.placeholder(dtype=tf.int32, shape=[None], name="seq_lens") dist_class = dist_inputs = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Setup self.model. if existing_model: self.model = existing_model elif make_model: self.model = make_model(self, obs_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=logit_dim, model_config=self.config["model"], framework="tf") # Create the Exploration object to use for this Policy. self.exploration = self._create_exploration() if existing_inputs: self._state_in = [ v for k, v in existing_inputs.items() if k.startswith("state_in_") ] if self._state_in: self._seq_lens = existing_inputs["seq_lens"] else: self._state_in = [ tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype) for s in self.model.get_initial_state() ] # Fully customized action generation (e.g., custom policy). if action_sampler_fn: sampled_action, sampled_action_logp = action_sampler_fn( self, self.model, obs_batch=self._input_dict[SampleBatch.CUR_OBS], state_batches=self._state_in, seq_lens=self._seq_lens, prev_action_batch=self._input_dict[SampleBatch.PREV_ACTIONS], prev_reward_batch=self._input_dict[SampleBatch.PREV_REWARDS], explore=explore, is_training=self._input_dict["is_training"]) else: # Distribution generation is customized, e.g., DQN, DDPG.
if action_distribution_fn: dist_inputs, dist_class, self._state_out = \ action_distribution_fn( self, self.model, obs_batch=self._input_dict[SampleBatch.CUR_OBS], state_batches=self._state_in, seq_lens=self._seq_lens, prev_action_batch=self._input_dict[ SampleBatch.PREV_ACTIONS], prev_reward_batch=self._input_dict[ SampleBatch.PREV_REWARDS], explore=explore, is_training=self._input_dict["is_training"]) # Default distribution generation behavior: # Pass through model. E.g., PG, PPO. else: dist_inputs, self._state_out = self.model( self._input_dict, self._state_in, self._seq_lens) action_dist = dist_class(dist_inputs, self.model) # Using exploration to get final action (e.g. via sampling). sampled_action, sampled_action_logp = \ self.exploration.get_exploration_action( action_distribution=action_dist, timestep=timestep, explore=explore) # Phase 1 init. sess = tf1.get_default_session() or tf1.Session() if get_batch_divisibility_req: batch_divisibility_req = get_batch_divisibility_req(self) else: batch_divisibility_req = 1 super().__init__( observation_space=obs_space, action_space=action_space, config=config, sess=sess, obs_input=obs, action_input=action_input, # for logp calculations sampled_action=sampled_action, sampled_action_logp=sampled_action_logp, dist_inputs=dist_inputs, dist_class=dist_class, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, state_inputs=self._state_in, state_outputs=self._state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self._seq_lens, max_seq_len=config["model"]["max_seq_len"], batch_divisibility_req=batch_divisibility_req, explore=explore, timestep=timestep) # Phase 2 init. if before_loss_init is not None: before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss_dynamically()
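# Hedged sketch of the "phase 2" (deferred) loss build used above: the loss
# tensor is only constructed once every input placeholder exists, by calling
# the user-supplied `loss_fn` on a dict of those placeholders. All names
# here are hypothetical stand-ins for the internals of
# `_initialize_loss_dynamically`, not the actual implementation.
def initialize_loss_sketch(policy, loss_fn, loss_placeholders):
    train_batch = dict(loss_placeholders)  # column name -> tf placeholder
    loss = loss_fn(policy, policy.model, policy.dist_class, train_batch)
    return loss, list(loss_placeholders.items())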
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder( tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model({ "obs": self.observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.vf), }, } self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config): assert tf.executing_eagerly() self.framework = config.get("framework", "tfe") Policy.__init__(self, observation_space, action_space, config) # Log device and worker index. from ray.rllib.evaluation.rollout_worker import get_global_worker worker = get_global_worker() worker_idx = worker.worker_index if worker else 0 if get_gpu_devices(): logger.info( "TF-eager Policy (worker={}) running on GPU.".format( worker_idx if worker_idx > 0 else "local")) else: logger.info( "TF-eager Policy (worker={}) running on CPU.".format( worker_idx if worker_idx > 0 else "local")) self._is_training = False self._loss_initialized = False self._loss = loss_fn self.batch_divisibility_req = get_batch_divisibility_req(self) if \ callable(get_batch_divisibility_req) else \ (get_batch_divisibility_req or 1) self._max_seq_len = config["model"]["max_seq_len"] if get_default_config: config = dict(get_default_config(), **config) if validate_spaces: validate_spaces(self, observation_space, action_space, config) if before_init: before_init(self, observation_space, action_space, config) self.config = config self.dist_class = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if make_model: self.model = make_model(self, observation_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( observation_space, action_space, logit_dim, config["model"], framework=self.framework, ) # Lock used for locking some methods on the object-level. # This prevents possible race conditions when calling the model # first, then its value function (e.g. in a loss function), in # between of which another model call is made (e.g. to compute an # action). self._lock = threading.RLock() # Auto-update model's inference view requirements, if recurrent. self._update_model_view_requirements_from_init_state() self.exploration = self._create_exploration() self._state_inputs = self.model.get_initial_state() self._is_recurrent = len(self._state_inputs) > 0 # Combine view_requirements for Model and Policy. self.view_requirements.update(self.model.view_requirements) if before_loss_init: before_loss_init(self, observation_space, action_space, config) if optimizer_fn: optimizers = optimizer_fn(self, config) else: optimizers = tf.keras.optimizers.Adam(config["lr"]) optimizers = force_list(optimizers) if getattr(self, "exploration", None): optimizers = self.exploration.get_exploration_optimizer( optimizers) # The list of local (tf) optimizers (one per loss term). self._optimizers: List[LocalOptimizer] = optimizers # Backward compatibility: A user's policy may only support a single # loss term and optimizer (no lists). self._optimizer: LocalOptimizer = \ optimizers[0] if optimizers else None self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True, stats_fn=stats_fn, ) self._loss_initialized = True if after_init: after_init(self, observation_space, action_space, config) # Got to reset global_timestep again after fake run-throughs. self.global_timestep = 0
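# Sketch of the `force_list` helper semantics relied on above: a single
# optimizer is wrapped into a one-element list, an existing list/tuple
# passes through as a list, and None becomes an empty list. This is an
# approximation of ray.rllib.utils.force_list for illustration only.
def force_list_sketch(elements):
    if elements is None:
        return []
    if isinstance(elements, (list, tuple)):
        return list(elements)
    return [elements]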
def __init__(self, action_space, value_targets, advantages, actions, logits, vf_preds, curr_action_dist, value_fn, cur_kl_coeff, valid_mask, entropy_coeff=0, clip_param=0.1, vf_clip_param=0.1, vf_loss_coeff=1.0, use_gae=True): """Constructs the loss for Proximal Policy Objective. Arguments: action_space: Environment action space specification. value_targets (Placeholder): Placeholder for target values; used for GAE. advantages (Placeholder): Placeholder for calculated advantages from previous model evaluation. actions (Placeholder): Placeholder for actions taken from previous model evaluation. logits (Placeholder): Placeholder for logits output from previous model evaluation. vf_preds (Placeholder): Placeholder for value function output from previous model evaluation. curr_action_dist (ActionDistribution): ActionDistribution of the current model. value_fn (Tensor): Current value function output Tensor. cur_kl_coeff (Variable): Variable holding the current PPO KL coefficient. valid_mask (Tensor): A bool mask of valid input elements (#2992). entropy_coeff (float): Coefficient of the entropy regularizer. clip_param (float): Clip parameter for the surrogate objective. vf_clip_param (float): Clip parameter for the value function loss. vf_loss_coeff (float): Coefficient of the value function loss. use_gae (bool): If true, use the Generalized Advantage Estimator. """ def reduce_mean_valid(t): return tf.reduce_mean(tf.boolean_mask(t, valid_mask)) dist_cls, _ = ModelCatalog.get_action_dist(action_space, {}) prev_dist = dist_cls(logits) # Make loss functions. logp_ratio = tf.exp( curr_action_dist.logp(actions) - prev_dist.logp(actions)) action_kl = prev_dist.kl(curr_action_dist) self.mean_kl = reduce_mean_valid(action_kl) curr_entropy = curr_action_dist.entropy() self.mean_entropy = reduce_mean_valid(curr_entropy) surrogate_loss = tf.minimum( advantages * logp_ratio, advantages * tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param)) self.mean_policy_loss = reduce_mean_valid(-surrogate_loss) if use_gae: vf_loss1 = tf.square(value_fn - value_targets) vf_clipped = vf_preds + tf.clip_by_value( value_fn - vf_preds, -vf_clip_param, vf_clip_param) vf_loss2 = tf.square(vf_clipped - value_targets) vf_loss = tf.maximum(vf_loss1, vf_loss2) self.mean_vf_loss = reduce_mean_valid(vf_loss) loss = reduce_mean_valid( -surrogate_loss + cur_kl_coeff * action_kl + vf_loss_coeff * vf_loss - entropy_coeff * curr_entropy) else: self.mean_vf_loss = tf.constant(0.0) loss = reduce_mean_valid(-surrogate_loss + cur_kl_coeff * action_kl - entropy_coeff * curr_entropy) self.loss = loss
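# Worked numeric sketch of the clipped surrogate objective constructed in
# PPOLoss above (numpy stands in for the TF ops; the values are arbitrary).
# The objective maximizes the clipped surrogate, hence the leading minus
# sign when it is turned into a loss.
import numpy as np

clip_param = 0.1
logp_ratio = np.exp(np.array([0.3, -0.2]))   # exp(logp_new - logp_old)
advantages = np.array([1.0, -1.0])
surrogate = np.minimum(
    advantages * logp_ratio,
    advantages * np.clip(logp_ratio, 1 - clip_param, 1 + clip_param))
mean_policy_loss = -surrogate.mean()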
def __init__( self, obs_space: gym.spaces.Space, action_space: gym.spaces.Space, config: TrainerConfigDict, loss_fn: Callable[ [Policy, ModelV2, Type[TFActionDistribution], SampleBatch], TensorType], *, stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[str, TensorType]]] = None, grad_stats_fn: Optional[ Callable[[Policy, SampleBatch, ModelGradients], Dict[str, TensorType]]] = None, before_loss_init: Optional[Callable[[ Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict ], None]] = None, make_model: Optional[Callable[[ Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict ], ModelV2]] = None, action_sampler_fn: Optional[Callable[ [TensorType, List[TensorType]], Tuple[TensorType, TensorType]]] = None, action_distribution_fn: Optional[ Callable[[Policy, ModelV2, TensorType, TensorType, TensorType], Tuple[TensorType, type, List[TensorType]]]] = None, existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None, existing_model: Optional[ModelV2] = None, get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None, obs_include_prev_action_reward: bool = True): """Initialize a dynamic TF policy. Args: obs_space (gym.spaces.Space): Observation space of the policy. action_space (gym.spaces.Space): Action space of the policy. config (TrainerConfigDict): Policy-specific configuration data. loss_fn (Callable[[Policy, ModelV2, Type[TFActionDistribution], SampleBatch], TensorType]): Function that returns a loss tensor for the policy graph. stats_fn (Optional[Callable[[Policy, SampleBatch], Dict[str, TensorType]]]): Optional function that returns a dict of TF fetches given the policy and batch input tensors. grad_stats_fn (Optional[Callable[[Policy, SampleBatch, ModelGradients], Dict[str, TensorType]]]): Optional function that returns a dict of TF fetches given the policy, sample batch, and loss gradient tensors. before_loss_init (Optional[Callable[ [Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], None]]): Optional function to run prior to loss init that takes the same arguments as __init__. make_model (Optional[Callable[[Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional function that returns a ModelV2 object given policy, obs_space, action_space, and policy config. All policy variables should be created in this function. If not specified, a default model will be created. action_sampler_fn (Optional[Callable[[Policy, ModelV2, Dict[ str, TensorType], TensorType, TensorType], Tuple[TensorType, TensorType]]]): A callable returning a sampled action and its log-likelihood given Policy, ModelV2, input_dict, explore, timestep, and is_training. action_distribution_fn (Optional[Callable[[Policy, ModelV2, Dict[str, TensorType], TensorType, TensorType], Tuple[TensorType, type, List[TensorType]]]]): A callable returning distribution inputs (parameters), a dist-class to generate an action distribution object from, and internal-state outputs (or an empty list if not applicable). Note: No Exploration hooks need to be called from within `action_distribution_fn`. It should only perform a simple forward pass through some model. If None, pass inputs through `self.model()` to get distribution inputs. The callable takes as inputs: Policy, ModelV2, input_dict, explore, timestep, is_training. existing_inputs (Optional[Dict[str, tf1.placeholder]]): When copying a policy, this specifies an existing dict of placeholders to use instead of defining new ones.
existing_model (Optional[ModelV2]): When copying a policy, this specifies an existing model to clone and share weights with. get_batch_divisibility_req (Optional[Callable[[Policy], int]]): Optional callable that returns the divisibility requirement for sample batches. If None, will assume a value of 1. obs_include_prev_action_reward (bool): Whether to include the previous action and reward in the model input (default: True). """ self.observation_space = obs_space self.action_space = action_space self.config = config self.framework = "tf" self._loss_fn = loss_fn self._stats_fn = stats_fn self._grad_stats_fn = grad_stats_fn self._obs_include_prev_action_reward = obs_include_prev_action_reward dist_class = dist_inputs = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Setup self.model. if existing_model: self.model = existing_model elif make_model: self.model = make_model(self, obs_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=logit_dim, model_config=self.config["model"], framework="tf") # Auto-update model's inference view requirements, if recurrent. self._update_model_inference_view_requirements_from_init_state() if existing_inputs: self._state_inputs = [ v for k, v in existing_inputs.items() if k.startswith("state_in_") ] if self._state_inputs: self._seq_lens = existing_inputs["seq_lens"] else: if self.config["_use_trajectory_view_api"]: self._state_inputs = [ get_placeholder( space=vr.space, time_axis=not isinstance(vr.shift, int), ) for k, vr in self.model.inference_view_requirements.items() if k.startswith("state_in_") ] else: self._state_inputs = [ tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype) for s in self.model.get_initial_state() ] # Use default settings. # Add NEXT_OBS, STATE_IN_0.., and others. self.view_requirements = self._get_default_view_requirements() # Combine view_requirements for Model and Policy. self.view_requirements.update(self.model.inference_view_requirements) # Setup standard placeholders. if existing_inputs is not None: timestep = existing_inputs["timestep"] explore = existing_inputs["is_exploring"] self._input_dict, self._dummy_batch = \ self._get_input_dict_and_dummy_batch( self.view_requirements, existing_inputs) else: action_ph = ModelCatalog.get_action_placeholder(action_space) prev_action_ph = ModelCatalog.get_action_placeholder( action_space, "prev_action") if self.config["_use_trajectory_view_api"]: self._input_dict, self._dummy_batch = \ self._get_input_dict_and_dummy_batch( self.view_requirements, {SampleBatch.ACTIONS: action_ph, SampleBatch.PREV_ACTIONS: prev_action_ph}) else: self._input_dict = { SampleBatch.CUR_OBS: tf1.placeholder(tf.float32, shape=[None] + list(obs_space.shape), name="observation") } self._input_dict[SampleBatch.ACTIONS] = action_ph if self._obs_include_prev_action_reward: self._input_dict.update({ SampleBatch.PREV_ACTIONS: prev_action_ph, SampleBatch.PREV_REWARDS: tf1.placeholder(tf.float32, [None], name="prev_reward"), }) # Placeholder for (sampling steps) timestep (int). timestep = tf1.placeholder_with_default(tf.zeros((), dtype=tf.int64), (), name="timestep") # Placeholder for `is_exploring` flag. 
explore = tf1.placeholder_with_default(True, (), name="is_exploring") # Placeholder for RNN time-chunk valid lengths. self._seq_lens = tf1.placeholder(dtype=tf.int32, shape=[None], name="seq_lens") # Placeholder for `is_training` flag. self._input_dict["is_training"] = self._get_is_training_placeholder() # Create the Exploration object to use for this Policy. self.exploration = self._create_exploration() # Fully customized action generation (e.g., custom policy). if action_sampler_fn: sampled_action, sampled_action_logp = action_sampler_fn( self, self.model, obs_batch=self._input_dict[SampleBatch.CUR_OBS], state_batches=self._state_inputs, seq_lens=self._seq_lens, prev_action_batch=self._input_dict.get( SampleBatch.PREV_ACTIONS), prev_reward_batch=self._input_dict.get( SampleBatch.PREV_REWARDS), explore=explore, is_training=self._input_dict["is_training"]) else: # Distribution generation is customized, e.g., DQN, DDPG. if action_distribution_fn: dist_inputs, dist_class, self._state_out = \ action_distribution_fn( self, self.model, obs_batch=self._input_dict[SampleBatch.CUR_OBS], state_batches=self._state_inputs, seq_lens=self._seq_lens, prev_action_batch=self._input_dict.get( SampleBatch.PREV_ACTIONS), prev_reward_batch=self._input_dict.get( SampleBatch.PREV_REWARDS), explore=explore, is_training=self._input_dict["is_training"]) # Default distribution generation behavior: # Pass through model. E.g., PG, PPO. else: dist_inputs, self._state_out = self.model( self._input_dict, self._state_inputs, self._seq_lens) action_dist = dist_class(dist_inputs, self.model) # Using exploration to get final action (e.g. via sampling). sampled_action, sampled_action_logp = \ self.exploration.get_exploration_action( action_distribution=action_dist, timestep=timestep, explore=explore) # Phase 1 init. sess = tf1.get_default_session() or tf1.Session() batch_divisibility_req = get_batch_divisibility_req(self) if \ callable(get_batch_divisibility_req) else \ (get_batch_divisibility_req or 1) super().__init__( observation_space=obs_space, action_space=action_space, config=config, sess=sess, obs_input=self._input_dict[SampleBatch.OBS], action_input=self._input_dict[SampleBatch.ACTIONS], sampled_action=sampled_action, sampled_action_logp=sampled_action_logp, dist_inputs=dist_inputs, dist_class=dist_class, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, state_inputs=self._state_inputs, state_outputs=self._state_out, prev_action_input=self._input_dict.get(SampleBatch.PREV_ACTIONS), prev_reward_input=self._input_dict.get(SampleBatch.PREV_REWARDS), seq_lens=self._seq_lens, max_seq_len=config["model"]["max_seq_len"], batch_divisibility_req=batch_divisibility_req, explore=explore, timestep=timestep) # Phase 2 init. if before_loss_init is not None: before_loss_init(self, obs_space, action_space, config) # Loss initialization and model/postprocessing test calls. if not existing_inputs: self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True)
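# --- Illustrative sketch, not part of the file above. ---
# DynamicTFPolicy is rarely constructed by hand; RLlib's
# `build_tf_policy` template (from the same era as the code above)
# forwards `loss_fn` and the other hooks into this constructor. A
# minimal, hedged example of defining a policy that way -- the loss
# and policy names here are made up for illustration:
import tensorflow as tf
from ray.rllib.policy.tf_policy_template import build_tf_policy

def my_pg_loss(policy, model, dist_class, train_batch):
    # Plain policy-gradient loss: -E[logp(action) * advantage].
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    return -tf.reduce_mean(
        action_dist.logp(train_batch["actions"]) *
        train_batch["advantages"])

MyPGTFPolicy = build_tf_policy(name="MyPGTFPolicy", loss_fn=my_pg_loss)

# The template handles everything the constructor above sets up --
# placeholders, model creation, exploration, and deferred loss init.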
def __init__(self, observation_space, action_space, config):
    # Note: loss_fn, stats_fn, make_model, action_sampler_fn,
    # action_distribution_fn, optimizer_fn, and the *_init hooks are
    # captured from the enclosing eager-policy builder closure.
    assert tf.executing_eagerly()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    self._loss = loss_fn
    self.batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)
    self._max_seq_len = config["model"]["max_seq_len"]

    if get_default_config:
        config = dict(get_default_config(), **config)

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config
    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )
    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    if optimizer_fn:
        optimizers = optimizer_fn(self, config)
    else:
        optimizers = tf.keras.optimizers.Adam(config["lr"])
    optimizers = force_list(optimizers)
    if getattr(self, "exploration", None):
        optimizers = self.exploration.get_exploration_optimizer(
            optimizers)
    # TODO: (sven) Allow tf policy to have more than 1 optimizer.
    #  Just like torch Policy does.
    self._optimizer = optimizers[0] if optimizers else None

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Have to reset global_timestep again after the fake run-throughs.
    self.global_timestep = 0
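# --- Illustrative sketch, not part of the file above. ---
# `optimizer_fn` lets the eager policy replace the default Adam built
# above. A hypothetical override that adds gradient-norm clipping (the
# function name is made up for this sketch):
import tensorflow as tf

def clipped_adam(policy, config):
    return tf.keras.optimizers.Adam(
        learning_rate=config["lr"], clipnorm=40.0)

# Passed as `optimizer_fn=clipped_adam` when building the eager policy.
# As the TODO above notes, only the first optimizer returned is used.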
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for the PPORND graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        # TODO: add adv_int_ph to the unpacking above (and to
        # self.loss_in below), otherwise copied graphs have no
        # intrinsic-advantage placeholder to feed.
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        adv_int_ph = tf.placeholder(
            tf.float32, name="advantages_int", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None

    self.observations = obs_ph
    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL coefficient.
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()

    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = tf.reshape(
                linear(self.model.last_layer, 1, "value",
                       normc_initializer(1.0)), [-1])
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function,
                                                 [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
    # TODO: add another head in the policy network for estimating the
    # value of the intrinsic reward.

    # RND target network.
    with tf.variable_scope("rnd_target"):
        modelconfig = self.config["model"].copy()
        modelconfig["free_log_std"] = False
        modelconfig["use_lstm"] = False
        self.rnd_target = ModelCatalog.get_model(
            obs_ph, self.config["embedding_size"], modelconfig).outputs
        # self.rnd_target = tf.reshape(self.rnd_target, [-1])  # TODO: necessary?

    # RND predictor network.
    with tf.variable_scope("rnd_predictor"):
        modelconfig = self.config["model"].copy()
        modelconfig["free_log_std"] = False
        modelconfig["use_lstm"] = False
        self.rnd_predictor = ModelCatalog.get_model(
            obs_ph, self.config["embedding_size"], modelconfig).outputs

    self.loss_obj = PPORNDLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        adv_int_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        self.rnd_target,
        self.rnd_predictor,
        # TODO: valid_mask??
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
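# --- Illustrative sketch, not part of the file above. ---
# The idea behind the rnd_target/rnd_predictor pair (Random Network
# Distillation, Burda et al., 2018): a frozen, randomly initialized
# target network embeds observations, a predictor network is trained to
# match it, and the prediction error serves as the intrinsic reward --
# large on rarely visited states, shrinking where the predictor has seen
# data. A minimal NumPy sketch with names local to this sketch:
import numpy as np

def rnd_intrinsic_reward(target_embed, predictor_embed):
    # Per-observation squared error between the two embeddings,
    # shape: (batch,) given embeddings of shape (batch, embedding_size).
    return np.square(target_embed - predictor_embed).sum(axis=-1)

# Only the predictor receives gradients; the target stays fixed, so
# novelty is measured relative to what the predictor has already fit.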