def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup placeholders
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

    # Create the model network and action outputs
    self.model = ModelCatalog.get_model({
        "obs": obs,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }, obs_space, action_space, self.logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Mapping from sample batch keys to placeholders. These keys will be
    # read from postprocessed sample batches and fed into the specified
    # placeholders during loss computation.
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),  # added during postprocessing
    ]

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=loss,
        loss_inputs=loss_in,
        model=self.model,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])
    sess.run(tf.global_variables_initializer())
def _setup_loss(self, action_space):
    self.ac = ModelCatalog.get_action_placeholder(action_space)
    self.adv = tf.placeholder(tf.float32, [None], name="adv")
    log_prob = self.dist.logp(self.ac)

    # policy loss
    self.loss = -tf.reduce_mean(log_prob * self.adv)
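# A minimal NumPy sketch (not part of the file above) of what `_setup_loss`
# computes: the REINFORCE policy-gradient objective
# -mean(log pi(a|s) * advantage). All names here are illustrative only.
import numpy as np

def pg_loss(log_probs, advantages):
    # log_probs: log pi(a_t | s_t) for the sampled actions, shape [T]
    # advantages: advantage estimates for those steps, shape [T]
    return -np.mean(log_probs * advantages)

# Actions with positive advantage get their log-probability pushed up.
print(pg_loss(np.log([0.5, 0.2, 0.9]), np.array([1.0, -0.5, 2.0])))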
def __init__(self, obs_space, action_space, config): config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config) self.config = config # Setup policy obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) self.model = ModelCatalog.get_model(obs, self.logit_dim, options=self.config["model"]) action_dist = dist_class(self.model.outputs) # logit for each action # Setup policy loss actions = ModelCatalog.get_action_placeholder(action_space) advantages = tf.placeholder(tf.float32, [None], name="adv") loss = PGLoss(action_dist, actions, advantages).loss # Initialize TFPolicyGraph sess = tf.get_default_session() loss_in = [ ("obs", obs), ("actions", actions), ("advantages", advantages), ] TFPolicyGraph.__init__(self, obs_space, action_space, sess, obs_input=obs, action_sampler=action_dist.sample(), loss=loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        self.loss_in = existing_inputs
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        # Targets of the value function.
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        # Advantage values in the policy gradient estimator.
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        # Log probabilities from the policy before the policy update.
        logprobs_ph = tf.placeholder(
            tf.float32, name="logprobs", shape=(None, logit_dim))
        # Value function predictions before the policy update.
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        self.loss_in = [("obs", obs_ph),
                        ("value_targets", value_targets_ph),
                        ("advantages", adv_ph),
                        ("actions", act_ph),
                        ("logprobs", logprobs_ph),
                        ("vf_preds", vf_preds_ph)]
    # TODO(ekl) feed RNN states in here

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = ModelCatalog.get_model(obs_ph, logit_dim,
                                         self.config["model"]).outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()

    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])
    else:
        # Sentinel; the value function is unused when use_gae=False.
        self.value_function = tf.constant("NA")

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logprobs_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    self.is_training = tf.placeholder_with_default(True, ())

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        is_training=self.is_training)
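# A rough NumPy sketch (illustrative; not the PPOLoss implementation used
# above) of the clipped surrogate objective that a PPOLoss of this shape
# computes. `clip_param` corresponds to config["clip_param"].
import numpy as np

def ppo_surrogate(logp_new, logp_old, advantages, clip_param=0.3):
    ratio = np.exp(logp_new - logp_old)  # pi_new(a|s) / pi_old(a|s)
    clipped = np.clip(ratio, 1 - clip_param, 1 + clip_param)
    # Pessimistic bound: take the minimum of the clipped and unclipped
    # objectives, then negate so it can be minimized.
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))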
def _initialize_loss_from_dummy_batch(
        self, auto_remove_unneeded_view_reqs: bool = True,
        stats_fn=None) -> None:

    # Create the optimizer/exploration optimizer here. Some initialization
    # steps (e.g. exploration postprocessing) may need this.
    self._optimizer = self.optimizer()

    # Test calls depend on variable init, so initialize model first.
    self._sess.run(tf1.global_variables_initializer())

    if self.config["_use_trajectory_view_api"]:
        logger.info("Testing `compute_actions` w/ dummy batch.")
        actions, state_outs, extra_fetches = \
            self.compute_actions_from_input_dict(
                self._dummy_batch, explore=False, timestep=0)
        for key, value in extra_fetches.items():
            self._dummy_batch[key] = np.zeros_like(value)
            self._input_dict[key] = get_placeholder(value=value, name=key)
            if key not in self.view_requirements:
                logger.info("Adding extra-action-fetch `{}` to "
                            "view-reqs.".format(key))
                self.view_requirements[key] = \
                    ViewRequirement(space=gym.spaces.Box(
                        -1.0, 1.0, shape=value.shape[1:],
                        dtype=value.dtype))
        dummy_batch = self._dummy_batch
    else:

        def fake_array(tensor):
            shape = tensor.shape.as_list()
            shape = [s if s is not None else 1 for s in shape]
            return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

        dummy_batch = {
            SampleBatch.CUR_OBS: fake_array(self._obs_input),
            SampleBatch.NEXT_OBS: fake_array(self._obs_input),
            SampleBatch.DONES: np.array([False], dtype=np.bool),
            SampleBatch.ACTIONS: fake_array(
                ModelCatalog.get_action_placeholder(self.action_space)),
            SampleBatch.REWARDS: np.array([0], dtype=np.float32),
        }
        if self._obs_include_prev_action_reward:
            dummy_batch.update({
                SampleBatch.PREV_ACTIONS: fake_array(
                    self._prev_action_input),
                SampleBatch.PREV_REWARDS: fake_array(
                    self._prev_reward_input),
            })
        state_init = self.get_initial_state()
        state_batches = []
        for i, h in enumerate(state_init):
            dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
            dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
            state_batches.append(np.expand_dims(h, 0))
        if state_init:
            dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
        for k, v in self.extra_compute_action_fetches().items():
            dummy_batch[k] = fake_array(v)
        dummy_batch = SampleBatch(dummy_batch)

    batch_for_postproc = UsageTrackingDict(dummy_batch)
    batch_for_postproc.count = dummy_batch.count
    logger.info("Testing `postprocess_trajectory` w/ dummy batch.")
    self.exploration.postprocess_trajectory(self, batch_for_postproc,
                                            self._sess)
    postprocessed_batch = self.postprocess_trajectory(batch_for_postproc)
    # Add new columns automatically to (loss) input_dict.
    if self.config["_use_trajectory_view_api"]:
        for key in batch_for_postproc.added_keys:
            if key not in self._input_dict:
                self._input_dict[key] = get_placeholder(
                    value=batch_for_postproc[key], name=key)
            if key not in self.view_requirements:
                self.view_requirements[key] = \
                    ViewRequirement(space=gym.spaces.Box(
                        -1.0, 1.0,
                        shape=batch_for_postproc[key].shape[1:],
                        dtype=batch_for_postproc[key].dtype))

    if not self.config["_use_trajectory_view_api"]:
        train_batch = UsageTrackingDict(
            dict({
                SampleBatch.CUR_OBS: self._obs_input,
            }, **self._loss_input_dict))
        if self._obs_include_prev_action_reward:
            train_batch.update({
                SampleBatch.PREV_ACTIONS: self._prev_action_input,
                SampleBatch.PREV_REWARDS: self._prev_reward_input,
                SampleBatch.CUR_OBS: self._obs_input,
            })

        for k, v in postprocessed_batch.items():
            if k in train_batch:
                continue
            elif v.dtype == np.object:
                continue  # can't handle arbitrary objects in TF
            elif k == "seq_lens" or k.startswith("state_in_"):
                continue
            shape = (None, ) + v.shape[1:]
            dtype = np.float32 if v.dtype == np.float64 else v.dtype
            placeholder = tf1.placeholder(dtype, shape=shape, name=k)
            train_batch[k] = placeholder

        for i, si in enumerate(self._state_inputs):
            train_batch["state_in_{}".format(i)] = si
    else:
        train_batch = UsageTrackingDict(
            dict(self._input_dict, **self._loss_input_dict))

    if self._state_inputs:
        train_batch["seq_lens"] = self._seq_lens

    if log_once("loss_init"):
        logger.debug(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(train_batch)))

    self._loss_input_dict.update({k: v for k, v in train_batch.items()})
    loss = self._do_loss_init(train_batch)

    all_accessed_keys = \
        train_batch.accessed_keys | batch_for_postproc.accessed_keys | \
        batch_for_postproc.added_keys | set(
            self.model.view_requirements.keys())

    TFPolicy._initialize_loss(
        self, loss,
        [(k, v) for k, v in train_batch.items() if k in all_accessed_keys])

    if "is_training" in self._loss_input_dict:
        del self._loss_input_dict["is_training"]

    # Call the grads stats fn.
    # TODO: (sven) rename to simply stats_fn to match eager and torch.
    if self._grad_stats_fn:
        self._stats_fetches.update(
            self._grad_stats_fn(self, train_batch, self._grads))

    # Add new columns automatically to view-reqs.
    if self.config["_use_trajectory_view_api"] and \
            auto_remove_unneeded_view_reqs:
        # Add those needed for postprocessing and training.
        all_accessed_keys = train_batch.accessed_keys | \
            batch_for_postproc.accessed_keys
        # Tag those only needed for post-processing (with some
        # exceptions).
        for key in batch_for_postproc.accessed_keys:
            if key not in train_batch.accessed_keys and \
                    key not in self.model.view_requirements and \
                    key not in [
                        SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
                        SampleBatch.UNROLL_ID, SampleBatch.DONES,
                        SampleBatch.REWARDS, SampleBatch.INFOS]:
                if key in self.view_requirements:
                    self.view_requirements[key].used_for_training = False
                if key in self._loss_input_dict:
                    del self._loss_input_dict[key]
        # Remove those not needed at all (leave those that are needed
        # by Sampler to properly execute sample collection).
        # Also always leave DONES, REWARDS, and INFOS, no matter what.
        for key in list(self.view_requirements.keys()):
            if key not in all_accessed_keys and key not in [
                    SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
                    SampleBatch.UNROLL_ID, SampleBatch.DONES,
                    SampleBatch.REWARDS, SampleBatch.INFOS] and \
                    key not in self.model.view_requirements:
                # If user deleted this key manually in postprocessing
                # fn, warn about it and do not remove from
                # view-requirements.
                if key in batch_for_postproc.deleted_keys:
                    logger.warning(
                        "SampleBatch key '{}' was deleted manually in "
                        "postprocessing function! RLlib will "
                        "automatically remove non-used items from the "
                        "data stream. Remove the `del` from your "
                        "postprocessing function.".format(key))
                else:
                    del self.view_requirements[key]
                if key in self._loss_input_dict:
                    del self._loss_input_dict[key]
        # Add those data_cols (again) that are missing and have
        # dependencies by view_cols.
        for key in list(self.view_requirements.keys()):
            vr = self.view_requirements[key]
            if (vr.data_col is not None
                    and vr.data_col not in self.view_requirements):
                used_for_training = \
                    vr.data_col in train_batch.accessed_keys
                self.view_requirements[vr.data_col] = ViewRequirement(
                    space=vr.space, used_for_training=used_for_training)

    self._loss_input_dict_no_rnn = {
        k: v
        for k, v in self._loss_input_dict.items()
        if (v not in self._state_inputs and v != self._seq_lens)
    }

    # Initialize again after loss init.
    self._sess.run(tf1.global_variables_initializer())
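# Toy re-implementation (illustrative only) of the key-tracking behavior
# the method above relies on. RLlib's UsageTrackingDict also records added
# and deleted keys; this sketch only shows accessed_keys, which is what
# lets unused columns be pruned from the loss inputs and view-reqs.
class TrackingDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.accessed_keys = set()

    def __getitem__(self, key):
        self.accessed_keys.add(key)
        return super().__getitem__(key)

d = TrackingDict(obs=1, rewards=2, infos=3)
_ = d["obs"] + d["rewards"]
print(d.accessed_keys)  # {'obs', 'rewards'} -> 'infos' could be dropped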
def _initialize_loss(self):
    def fake_array(tensor):
        shape = tensor.shape.as_list()
        shape[0] = 1
        return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

    dummy_batch = {
        SampleBatch.CUR_OBS: fake_array(self._obs_input),
        SampleBatch.NEXT_OBS: fake_array(self._obs_input),
        SampleBatch.DONES: np.array([False], dtype=np.bool),
        SampleBatch.ACTIONS: fake_array(
            ModelCatalog.get_action_placeholder(self.action_space)),
        SampleBatch.REWARDS: np.array([0], dtype=np.float32),
    }
    if self._obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input),
            SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input),
        })
    state_init = self.get_initial_state()
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
        dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
    if state_init:
        dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
    for k, v in self.extra_compute_action_fetches().items():
        dummy_batch[k] = fake_array(v)

    # postprocessing might depend on variable init, so run it first here
    self._sess.run(tf.global_variables_initializer())
    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    if self._obs_include_prev_action_reward:
        batch_tensors = UsageTrackingDict({
            SampleBatch.PREV_ACTIONS: self._prev_action_input,
            SampleBatch.PREV_REWARDS: self._prev_reward_input,
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.PREV_ACTIONS, self._prev_action_input),
            (SampleBatch.PREV_REWARDS, self._prev_reward_input),
            (SampleBatch.CUR_OBS, self._obs_input),
        ]
    else:
        batch_tensors = UsageTrackingDict({
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.CUR_OBS, self._obs_input),
        ]

    for k, v in postprocessed_batch.items():
        if k in batch_tensors:
            continue
        elif v.dtype == np.object:
            continue  # can't handle arbitrary objects in TF
        shape = (None, ) + v.shape[1:]
        dtype = np.float32 if v.dtype == np.float64 else v.dtype
        placeholder = tf.placeholder(dtype, shape=shape, name=k)
        batch_tensors[k] = placeholder

    if log_once("loss_init"):
        logger.info(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(batch_tensors)))

    loss = self._do_loss_init(batch_tensors)
    for k in sorted(batch_tensors.accessed_keys):
        loss_inputs.append((k, batch_tensors[k]))

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
    self._sess.run(tf.global_variables_initializer())
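# Standalone sketch (not from the file above) of the fake_array idea:
# replace unknown placeholder dims (None) with 1 and build a zero-filled
# array, so a dummy SampleBatch can be assembled and traced through
# postprocessing and loss construction without real environment data.
import numpy as np

def fake_array_from_shape(shape, dtype=np.float32):
    concrete = [s if s is not None else 1 for s in shape]
    return np.zeros(concrete, dtype=dtype)

obs = fake_array_from_shape([None, 84, 84, 3])
print(obs.shape)  # (1, 84, 84, 3)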
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = tf.reshape(
                linear(self.model.last_layer, 1, "value",
                       normc_initializer(1.0)), [-1])
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    self.loss_in = [
        (SampleBatch.CUR_OBS, obs_ph),
        (Postprocessing.VALUE_TARGETS, value_targets_ph),
        (Postprocessing.ADVANTAGES, adv_ph),
        (SampleBatch.ACTIONS, act_ph),
        (BEHAVIOUR_LOGITS, logits_ph),
        (SampleBatch.VF_PREDS, vf_preds_ph),
        (SampleBatch.PREV_ACTIONS, prev_actions_ph),
        (SampleBatch.PREV_REWARDS, prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = self.model.value_function()
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            if vf_config["use_lstm"]:
                vf_config["use_lstm"] = False
                logger.warning(
                    "It is not recommended to use a LSTM model with "
                    "vf_share_layers=False (consider setting it to True). "
                    "If you want to not share layers, you can implement "
                    "a custom LSTM model that overrides the "
                    "value_function() method.")
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    {
                        "obs": obs_ph,
                        "prev_actions": prev_actions_ph,
                        "prev_rewards": prev_rewards_ph,
                        "is_training": self._get_is_training_placeholder(),
                    }, observation_space, action_space, 1,
                    vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=curr_action_dist.sampled_action_prob(),
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = self.model.value_function()
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            if vf_config["use_lstm"]:
                vf_config["use_lstm"] = False
                logger.warning(
                    "It is not recommended to use a LSTM model with "
                    "vf_share_layers=False (consider setting it to True). "
                    "If you want to not share layers, you can implement "
                    "a custom LSTM model that overrides the "
                    "value_function() method.")
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    {
                        "obs": obs_ph,
                        "prev_actions": prev_actions_ph,
                        "prev_rewards": prev_rewards_ph,
                        "is_training": self._get_is_training_placeholder(),
                    }, observation_space, action_space, 1,
                    vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=curr_action_dist.sampled_action_prob(),
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Create input placeholders
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    if existing_inputs:
        actions, dones, behaviour_logits, rewards, observations, \
            prev_actions, prev_rewards = existing_inputs[:7]
        existing_state_in = existing_inputs[7:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    action_dist = dist_class(self.model.outputs)
    values = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode
            # cut boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards, dtype=tf.bool)

    # Prepare actions for loss
    loss_actions = actions if is_multidiscrete else tf.expand_dims(
        actions, axis=1)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=make_time_major(loss_actions, drop_last=True),
        actions_logp=make_time_major(
            action_dist.logp(actions), drop_last=True),
        actions_entropy=make_time_major(
            action_dist.entropy(), drop_last=True),
        dones=make_time_major(dones, drop_last=True),
        behaviour_logits=make_time_major(
            unpacked_behaviour_logits, drop_last=True),
        target_logits=make_time_major(unpacked_outputs, drop_last=True),
        discount=config["gamma"],
        rewards=make_time_major(rewards, drop_last=True),
        values=make_time_major(values, drop_last=True),
        bootstrap_value=make_time_major(values)[-1],
        dist_class=Categorical if is_multidiscrete else dist_class,
        valid_mask=make_time_major(mask, drop_last=True),
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # Initialize TFPolicy
    loss_in = [
        (SampleBatch.ACTIONS, actions),
        (SampleBatch.DONES, dones),
        (BEHAVIOUR_LOGITS, behaviour_logits),
        (SampleBatch.REWARDS, rewards),
        (SampleBatch.CUR_OBS, observations),
        (SampleBatch.PREV_ACTIONS, prev_actions),
        (SampleBatch.PREV_REWARDS, prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        LEARNER_STATS_KEY: {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(make_time_major(values, drop_last=True), [-1])),
        },
    }
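# NumPy sketch of the make_time_major helper above: a flat [B * T] batch
# is reshaped to [B, T], axes are swapped to [T, B], and drop_last removes
# the trailing timestep (V-trace bootstraps from it instead).
import numpy as np

def make_time_major(flat, B, T, drop_last=False):
    res = flat.reshape([B, T] + list(flat.shape[1:])).swapaxes(0, 1)
    return res[:-1] if drop_last else res

x = np.arange(6)  # B=2 trajectories of T=3 steps each, concatenated
print(make_time_major(x, B=2, T=3))                  # [[0 3] [1 4] [2 5]]
print(make_time_major(x, B=2, T=3, drop_last=True))  # [[0 3] [1 4]]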
def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
        loss_fn: Callable[[
            Policy, ModelV2, Type[TFActionDistribution], SampleBatch
        ], TensorType],
        *,
        stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[
            str, TensorType]]] = None,
        grad_stats_fn: Optional[Callable[[
            Policy, SampleBatch, ModelGradients
        ], Dict[str, TensorType]]] = None,
        before_loss_init: Optional[Callable[[
            Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
        ], None]] = None,
        make_model: Optional[Callable[[
            Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
        ], ModelV2]] = None,
        action_sampler_fn: Optional[Callable[[
            TensorType, List[TensorType]
        ], Tuple[TensorType, TensorType]]] = None,
        action_distribution_fn: Optional[Callable[[
            Policy, ModelV2, TensorType, TensorType, TensorType
        ], Tuple[TensorType, type, List[TensorType]]]] = None,
        existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
        existing_model: Optional[ModelV2] = None,
        get_batch_divisibility_req: Optional[Callable[[Policy],
                                                      int]] = None,
        obs_include_prev_action_reward=DEPRECATED_VALUE):
    """Initializes a DynamicTFPolicy instance.

    Args:
        obs_space (gym.spaces.Space): Observation space of the policy.
        action_space (gym.spaces.Space): Action space of the policy.
        config (TrainerConfigDict): Policy-specific configuration data.
        loss_fn (Callable[[Policy, ModelV2, Type[TFActionDistribution],
            SampleBatch], TensorType]): Function that returns a loss
            tensor for the policy graph.
        stats_fn (Optional[Callable[[Policy, SampleBatch],
            Dict[str, TensorType]]]): Optional function that returns a
            dict of TF fetches given the policy and batch input tensors.
        grad_stats_fn (Optional[Callable[[Policy, SampleBatch,
            ModelGradients], Dict[str, TensorType]]]): Optional function
            that returns a dict of TF fetches given the policy, sample
            batch, and loss gradient tensors.
        before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
            gym.spaces.Space, TrainerConfigDict], None]]): Optional
            function to run prior to loss init that takes the same
            arguments as __init__.
        make_model (Optional[Callable[[Policy, gym.spaces.Space,
            gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional
            function that returns a ModelV2 object given policy,
            obs_space, action_space, and policy config. All policy
            variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[Callable[[Policy, ModelV2,
            Dict[str, TensorType], TensorType, TensorType],
            Tuple[TensorType, TensorType]]]): A callable returning a
            sampled action and its log-likelihood given Policy, ModelV2,
            input_dict, explore, timestep, and is_training.
        action_distribution_fn (Optional[Callable[[Policy, ModelV2,
            Dict[str, TensorType], TensorType, TensorType],
            Tuple[TensorType, type, List[TensorType]]]]): A callable
            returning distribution inputs (parameters), a dist-class to
            generate an action distribution object from, and
            internal-state outputs (or an empty list if not applicable).
            Note: No Exploration hooks have to be called from within
            `action_distribution_fn`. It should only perform a simple
            forward pass through some model. If None, pass inputs through
            `self.model()` to get distribution inputs. The callable takes
            as inputs: Policy, ModelV2, input_dict, explore, timestep,
            is_training.
        existing_inputs (Optional[Dict[str, tf1.placeholder]]): When
            copying a policy, this specifies an existing dict of
            placeholders to use instead of defining new ones.
        existing_model (Optional[ModelV2]): When copying a policy, this
            specifies an existing model to clone and share weights with.
        get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
            Optional callable that returns the divisibility requirement
            for sample batches. If None, will assume a value of 1.
    """
    if obs_include_prev_action_reward != DEPRECATED_VALUE:
        deprecation_warning(
            old="obs_include_prev_action_reward", error=False)
    self.observation_space = obs_space
    self.action_space = action_space
    self.config = config
    self.framework = "tf"
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._seq_lens = None
    self._is_tower = existing_inputs is not None

    dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    # Setup self.model.
    if existing_model:
        if isinstance(existing_model, list):
            self.model = existing_model[0]
            # TODO: (sven) hack, but works for `target_[q_]?model`.
            for i in range(1, len(existing_model)):
                setattr(self, existing_model[i][0], existing_model[i][1])
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="tf")
    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    # Input placeholders already given -> Use these.
    if existing_inputs:
        self._state_inputs = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        # Placeholder for RNN time-chunk valid lengths.
        if self._state_inputs:
            self._seq_lens = existing_inputs[SampleBatch.SEQ_LENS]
    # Create new input placeholders.
    else:
        self._state_inputs = [
            get_placeholder(
                space=vr.space,
                time_axis=not isinstance(vr.shift, int),
            ) for k, vr in self.model.view_requirements.items()
            if k.startswith("state_in_")
        ]
        # Placeholder for RNN time-chunk valid lengths.
        if self._state_inputs:
            self._seq_lens = tf1.placeholder(
                dtype=tf.int32, shape=[None], name="seq_lens")

    # Use default settings.
    # Add NEXT_OBS, STATE_IN_0.., and others.
    self.view_requirements = self._get_default_view_requirements()
    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)
    # Disable env-info placeholder.
    if SampleBatch.INFOS in self.view_requirements:
        self.view_requirements[SampleBatch.INFOS].used_for_training = False

    # Setup standard placeholders.
    if self._is_tower:
        timestep = existing_inputs["timestep"]
        explore = False
        self._input_dict, self._dummy_batch = \
            self._get_input_dict_and_dummy_batch(
                self.view_requirements, existing_inputs)
    else:
        action_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_action_ph = {}
        if SampleBatch.PREV_ACTIONS not in self.view_requirements:
            prev_action_ph = {
                SampleBatch.PREV_ACTIONS: ModelCatalog.
                get_action_placeholder(action_space, "prev_action")
            }
        self._input_dict, self._dummy_batch = \
            self._get_input_dict_and_dummy_batch(
                self.view_requirements,
                dict({SampleBatch.ACTIONS: action_ph}, **prev_action_ph))
        # Placeholder for (sampling steps) timestep (int).
        timestep = tf1.placeholder_with_default(
            tf.zeros((), dtype=tf.int64), (), name="timestep")
        # Placeholder for `is_exploring` flag.
        explore = tf1.placeholder_with_default(
            True, (), name="is_exploring")

    # Placeholder for `is_training` flag.
    self._input_dict.is_training = self._get_is_training_placeholder()

    # Multi-GPU towers do not need any action computing/exploration
    # graphs.
    sampled_action = None
    sampled_action_logp = None
    dist_inputs = None
    self._state_out = None
    if not self._is_tower:
        # Create the Exploration object to use for this Policy.
        self.exploration = self._create_exploration()

        # Fully customized action generation (e.g., custom policy).
        if action_sampler_fn:
            sampled_action, sampled_action_logp = action_sampler_fn(
                self,
                self.model,
                obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                state_batches=self._state_inputs,
                seq_lens=self._seq_lens,
                prev_action_batch=self._input_dict.get(
                    SampleBatch.PREV_ACTIONS),
                prev_reward_batch=self._input_dict.get(
                    SampleBatch.PREV_REWARDS),
                explore=explore,
                is_training=self._input_dict.is_training)
        # Distribution generation is customized, e.g., DQN, DDPG.
        else:
            if action_distribution_fn:
                # Try new action_distribution_fn signature, supporting
                # state_batches and seq_lens.
                in_dict = self._input_dict
                try:
                    dist_inputs, dist_class, self._state_out = \
                        action_distribution_fn(
                            self,
                            self.model,
                            input_dict=in_dict,
                            state_batches=self._state_inputs,
                            seq_lens=self._seq_lens,
                            explore=explore,
                            timestep=timestep,
                            is_training=in_dict.is_training)
                # Trying the old way (to stay backward compatible).
                # TODO: Remove in future.
                except TypeError as e:
                    if "positional argument" in e.args[0] or \
                            "unexpected keyword argument" in e.args[0]:
                        dist_inputs, dist_class, self._state_out = \
                            action_distribution_fn(
                                self,
                                self.model,
                                obs_batch=in_dict[SampleBatch.CUR_OBS],
                                state_batches=self._state_inputs,
                                seq_lens=self._seq_lens,
                                prev_action_batch=in_dict.get(
                                    SampleBatch.PREV_ACTIONS),
                                prev_reward_batch=in_dict.get(
                                    SampleBatch.PREV_REWARDS),
                                explore=explore,
                                is_training=in_dict.is_training)
                    else:
                        raise e
            # Default distribution generation behavior:
            # Pass through model. E.g., PG, PPO.
            else:
                if isinstance(self.model, tf.keras.Model):
                    dist_inputs, self._state_out, \
                        self._extra_action_fetches = \
                        self.model(self._input_dict)
                else:
                    dist_inputs, self._state_out = self.model(
                        self._input_dict, self._state_inputs,
                        self._seq_lens)

            action_dist = dist_class(dist_inputs, self.model)

            # Using exploration to get final action (e.g. via sampling).
            sampled_action, sampled_action_logp = \
                self.exploration.get_exploration_action(
                    action_distribution=action_dist,
                    timestep=timestep,
                    explore=explore)

    # Phase 1 init.
    sess = tf1.get_default_session() or tf1.Session(
        config=tf1.ConfigProto(**self.config["tf_session_args"]))

    batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)

    super().__init__(
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        sess=sess,
        obs_input=self._input_dict[SampleBatch.OBS],
        action_input=self._input_dict[SampleBatch.ACTIONS],
        sampled_action=sampled_action,
        sampled_action_logp=sampled_action_logp,
        dist_inputs=dist_inputs,
        dist_class=dist_class,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self._state_inputs,
        state_outputs=self._state_out,
        prev_action_input=self._input_dict.get(SampleBatch.PREV_ACTIONS),
        prev_reward_input=self._input_dict.get(SampleBatch.PREV_REWARDS),
        seq_lens=self._seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req,
        explore=explore,
        timestep=timestep)

    # Phase 2 init.
    if before_loss_init is not None:
        before_loss_init(self, obs_space, action_space, config)

    # Loss initialization and model/postprocessing test calls.
    if not self._is_tower:
        self._initialize_loss_from_dummy_batch(
            auto_remove_unneeded_view_reqs=True)

        # Create MultiGPUTowerStacks, if we have at least one actual
        # GPU or >1 CPUs (fake GPUs).
        if len(self.devices) > 1 or any("gpu" in d for d in self.devices):
            # Per-GPU graph copies created here must share vars with the
            # policy. Therefore, `reuse` is set to tf1.AUTO_REUSE because
            # Adam nodes are created after all of the device copies are
            # created.
            with tf1.variable_scope("", reuse=tf1.AUTO_REUSE):
                self.multi_gpu_tower_stacks = [
                    TFMultiGPUTowerStack(policy=self) for i in range(
                        self.config.get("num_multi_gpu_tower_stacks", 1))
                ]

        # Initialize again after loss and tower init.
        self.get_session().run(tf1.global_variables_initializer())
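# A minimal sketch of a `loss_fn` matching the signature documented above:
# (policy, model, dist_class, train_batch) -> loss tensor. It mirrors a
# plain policy-gradient loss (close in spirit to RLlib's PG loss) and is
# shown for illustration only; `Postprocessing.ADVANTAGES` is assumed to
# have been added to the batch by the postprocessing step.
def example_pg_loss_fn(policy, model, dist_class, train_batch):
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    logp = action_dist.logp(train_batch[SampleBatch.ACTIONS])
    advantages = tf.cast(train_batch[Postprocessing.ADVANTAGES], tf.float32)
    return -tf.reduce_mean(logp * advantages)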
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None,
             unsupType='action',
             designHead='universe'):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.unsup = unsupType is not None
    predictor = None
    numaction = action_space.n

    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    obs_ph = tf.placeholder(
        tf.float32, name="obs", shape=(None, ) + observation_space.shape)
    act_ph = ModelCatalog.get_action_placeholder(action_space)
    self.observations = obs_ph
    self.actions = act_ph

    # Debugging output left in by the author:
    # print("Observations space:", observation_space)
    print("Observations shape:", tf.shape(self.observations)[0])
    print("Observations shape:", self.observations.shape[1:])
    print("obs_ph shape:", tf.shape(obs_ph)[:1])

    with tf.variable_scope("predictor"):
        if 'state' in unsupType:
            self.local_ap_network = predictor = StatePredictor(
                tf.shape(self.observations)[0], numaction, designHead,
                unsupType)
        else:
            self.local_ap_network = predictor = StateActionPredictor(
                tf.shape(self.observations)[0], numaction, designHead)

    self.logits = self.local_ap_network.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()

    self.loss_obj = ICMLoss(action_space, unsupType, predictor)
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])

    self.loss_in = [
        ("obs", obs_ph),
        ("actions", act_ph),
        ("phi1", self.local_ap_network.s1),
        ("phi2", self.local_ap_network.s2),
        ("asample", self.local_ap_network.asample),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.predloss,
        loss_inputs=self.loss_in,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.predloss,
        # "policy_loss": self.loss_obj.mean_policy_loss,
        # "vf_loss": self.loss_obj.mean_vf_loss,
        # "vf_explained_var": self.explained_variance,
        # "kl": self.loss_obj.mean_kl,
        # "entropy": self.loss_obj.mean_entropy,
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None,
             unsupType='action',
             designHead='universe'):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.unsup = unsupType is not None
    # self.cur_batch = None
    # self.cur_sample_batch = {}
    predictor = None
    numaction = action_space.n

    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, phi1, phi2, asample = \
            existing_inputs[:9]
        # TODO: updates to account for s1, s2 and asample
        existing_state_in = existing_inputs[9:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        phi1 = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="phi1")
        phi2 = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="phi2")
        asample = tf.placeholder(
            tf.float32, shape=(None, numaction), name="asample")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph

    if self.unsup:
        with tf.variable_scope("predictor"):
            if 'state' in unsupType:
                self.local_ap_network = predictor = StatePredictor(
                    phi1, phi2, asample, observation_space, numaction,
                    designHead, unsupType)
            else:
                self.local_ap_network = predictor = StateActionPredictor(
                    phi1, phi2, asample, observation_space, numaction,
                    designHead)

    self.model = pi = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = tf.reshape(
                linear(self.model.last_layer, 1, "value",
                       normc_initializer(1.0)), [-1])
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        unsupType,
        predictor,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("s1", phi1),
        ("s2", phi2),
        ("asample", asample),
    ]
    self.extra_inputs = ["s1", "s2", "asample"]
    # TODO: testing to see if this lets me pass inputs to ICM
    # self.variables = ray.experimental.TensorFlowVariables(
    #     self.loss_in, self.sess)

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
def _initialize_loss(self): def fake_array(tensor): shape = tensor.shape.as_list() shape = [s if s is not None else 1 for s in shape] return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype) dummy_batch = { SampleBatch.CUR_OBS: fake_array(self._obs_input), SampleBatch.NEXT_OBS: fake_array(self._obs_input), SampleBatch.DONES: np.array([False], dtype=np.bool), SampleBatch.ACTIONS: fake_array(ModelCatalog.get_action_placeholder(self.action_space)), SampleBatch.REWARDS: np.array([0], dtype=np.float32), } # Add dummy things PENGZHENGHAO # for name, val in self.model.mask_placeholder_dict.items(): # shape = val.shape.as_list() # shape = [1] + [s if s is not None else 1 for s in shape] # dummy_batch[name] = \ # np.zeros(shape, dtype=val.dtype.as_numpy_dtype) if self._obs_include_prev_action_reward: dummy_batch.update({ SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), }) state_init = self.get_initial_state() state_batches = [] for i, h in enumerate(state_init): dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0) dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0) state_batches.append(np.expand_dims(h, 0)) if state_init: dummy_batch["seq_lens"] = np.array([1], dtype=np.int32) for k, v in self.extra_compute_action_fetches().items(): dummy_batch[k] = fake_array(v) # postprocessing might depend on variable init, so run it first here self._sess.run(tf.global_variables_initializer()) postprocessed_batch = self.postprocess_trajectory( SampleBatch(dummy_batch)) # model forward pass for the loss (needed after postprocess to # overwrite any tensor state from that call) self.model(self._input_dict, self._state_in, self._seq_lens) if self._obs_include_prev_action_reward: train_batch = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, }) loss_inputs = [ (SampleBatch.PREV_ACTIONS, self._prev_action_input), (SampleBatch.PREV_REWARDS, self._prev_reward_input), (SampleBatch.CUR_OBS, self._obs_input), ] else: train_batch = UsageTrackingDict({ SampleBatch.CUR_OBS: self._obs_input, }) loss_inputs = [ (SampleBatch.CUR_OBS, self._obs_input), ] # When using the mask, the key of postprocessed_batch is : # dict_keys(['obs', 'new_obs', 'dones', 'actions', 'rewards', # 'fc_1_mask', 'fc_2_mask', 'prev_actions', 'prev_rewards', # 'action_prob', 'action_logp', 'vf_preds', 'behaviour_logits', # 'layer0', 'layer1', 'advantages', 'value_targets']) # When not using the mask, the keys is: # dict_keys(['obs', 'new_obs', 'dones', 'actions', 'rewards', # 'fc_1_mask', 'fc_2_mask', 'prev_actions', 'prev_rewards', # 'action_prob', 'action_logp', 'vf_preds', 'behaviour_logits', # 'layer0', 'layer1', 'advantages', 'value_targets']) for k, v in postprocessed_batch.items(): if k in train_batch: continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF elif k == "seq_lens" or k.startswith("state_in_"): continue shape = (None, ) + v.shape[1:] dtype = np.float32 if v.dtype == np.float64 else v.dtype placeholder = tf.placeholder(dtype, shape=shape, name=k) train_batch[k] = placeholder # When using the mask. At this time, the train_batch contain 17 # element. 
    # ['prev_actions', 'prev_rewards', 'obs', 'new_obs',
    # 'dones', 'actions', 'rewards', 'fc_1_mask', 'fc_2_mask',
    # 'action_prob', 'action_logp', 'vf_preds', 'behaviour_logits',
    # 'layer0', 'layer1', 'advantages', 'value_targets']
    for i, si in enumerate(self._state_in):
        train_batch["state_in_{}".format(i)] = si
    train_batch["seq_lens"] = self._seq_lens

    if log_once("loss_init"):
        logger.debug(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(train_batch)))

    self._loss_input_dict = train_batch
    # At this point, accessed_keys in the mask experiment is:
    # {'obs', 'prev_rewards', 'value_targets', 'behaviour_logits',
    # 'prev_actions', 'advantages', 'action_logp', 'actions',
    # 'vf_preds', 'accessed_keys', 'intercepted_values'}
    # In the no-mask experiment, however, accessed_keys is only:
    # {'intercepted_values', 'accessed_keys'}
    loss = self._do_loss_init(train_batch)
    # After the line above, accessed_keys is:
    # {'advantages', 'action_logp', 'behaviour_logits', 'prev_rewards',
    # 'prev_actions', 'vf_preds', 'actions', 'value_targets', 'obs'}
    # The no-mask experiment yields the same set, just in a different
    # iteration order.
    # At this point loss_inputs already contains: prev_actions,
    # prev_rewards, obs.
    for k in sorted(train_batch.accessed_keys):
        # sorted(train_batch.accessed_keys) is:
        # ['action_logp', 'actions', 'advantages', 'behaviour_logits',
        # 'obs', 'prev_actions', 'prev_rewards', 'value_targets',
        # 'vf_preds']
        if k != "seq_lens" and not k.startswith("state_in_"):
            loss_inputs.append((k, train_batch[k]))
    # Optionally expose the mask placeholders as loss inputs:
    # for name, ph in self.model.mask_placeholder_dict.items():
    #     loss_inputs.append((name, ph))

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(
            self._grad_stats_fn(self, train_batch, self._grads))
    self._sess.run(tf.global_variables_initializer())
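# The loss-input discovery above hinges on UsageTrackingDict recording
# which keys _do_loss_init actually reads. A minimal sketch of that idea
# (the real class also intercepts and converts values; this toy version,
# with a hypothetical name, only tracks key access):

class AccessTrackingDict(dict):
    """Dict that remembers every key read through __getitem__."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.accessed_keys = set()

    def __getitem__(self, key):
        self.accessed_keys.add(key)
        return super().__getitem__(key)

# Usage: feed placeholders through the dict while building the loss,
# then only the accessed keys become loss inputs.
d = AccessTrackingDict(obs=1, advantages=2, rewards=3)
_ = d["obs"] + d["advantages"]
assert d.accessed_keys == {"obs", "advantages"}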
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             action_distribution_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        observation_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): Function that returns a loss tensor given the
            policy and a dict of experience tensor placeholders.
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        before_loss_init (Optional[callable]): Optional function to run
            prior to loss init that takes the same arguments as __init__.
        make_model (func): optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config).
            All policy variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[callable]): An optional callable
            returning a tuple of action and action prob tensors given
            (policy, model, input_dict, obs_space, action_space, config).
            If None, a default action distribution will be used.
        action_distribution_fn (Optional[callable]): A callable returning
            distribution inputs (parameters), a dist-class to generate an
            action distribution object from, and internal-state outputs
            (or an empty list if not applicable).
            Note: No Exploration hooks have to be called from within
            `action_distribution_fn`. It should only perform a simple
            forward pass through some model. If None, pass inputs through
            `self.model()` to get the distribution inputs.
existing_inputs (OrderedDict): When copying a policy, this specifies an existing dict of placeholders to use instead of defining new ones existing_model (ModelV2): when copying a policy, this specifies an existing model to clone and share weights with get_batch_divisibility_req (func): optional function that returns the divisibility requirement for sample batches obs_include_prev_action_reward (bool): whether to include the previous action and reward in the model input """ self.observation_space = obs_space self.action_space = action_space self.config = config self.framework = "tf" self._loss_fn = loss_fn self._stats_fn = stats_fn self._grad_stats_fn = grad_stats_fn self._obs_include_prev_action_reward = obs_include_prev_action_reward # Setup standard placeholders prev_actions = None prev_rewards = None if existing_inputs is not None: obs = existing_inputs[SampleBatch.CUR_OBS] if self._obs_include_prev_action_reward: prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS] prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS] action_input = existing_inputs[SampleBatch.ACTIONS] explore = existing_inputs["is_exploring"] timestep = existing_inputs["timestep"] else: obs = tf1.placeholder(tf.float32, shape=[None] + list(obs_space.shape), name="observation") action_input = ModelCatalog.get_action_placeholder(action_space) if self._obs_include_prev_action_reward: prev_actions = ModelCatalog.get_action_placeholder( action_space, "prev_action") prev_rewards = tf1.placeholder(tf.float32, [None], name="prev_reward") explore = tf1.placeholder_with_default(True, (), name="is_exploring") timestep = tf1.placeholder(tf.int32, (), name="timestep") self._input_dict = { SampleBatch.CUR_OBS: obs, SampleBatch.PREV_ACTIONS: prev_actions, SampleBatch.PREV_REWARDS: prev_rewards, "is_training": self._get_is_training_placeholder(), } # Placeholder for RNN time-chunk valid lengths. self._seq_lens = tf1.placeholder(dtype=tf.int32, shape=[None], name="seq_lens") dist_class = dist_inputs = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Setup self.model. if existing_model: self.model = existing_model elif make_model: self.model = make_model(self, obs_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=logit_dim, model_config=self.config["model"], framework="tf", **self.config["model"].get("custom_model_config", {})) # Create the Exploration object to use for this Policy. self.exploration = self._create_exploration() if existing_inputs: self._state_in = [ v for k, v in existing_inputs.items() if k.startswith("state_in_") ] if self._state_in: self._seq_lens = existing_inputs["seq_lens"] else: self._state_in = [ tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype) for s in self.model.get_initial_state() ] # Fully customized action generation (e.g., custom policy). if action_sampler_fn: sampled_action, sampled_action_logp = action_sampler_fn( self, self.model, obs_batch=self._input_dict[SampleBatch.CUR_OBS], state_batches=self._state_in, seq_lens=self._seq_lens, prev_action_batch=self._input_dict[SampleBatch.PREV_ACTIONS], prev_reward_batch=self._input_dict[SampleBatch.PREV_REWARDS], explore=explore, is_training=self._input_dict["is_training"]) else: # Distribution generation is customized, e.g., DQN, DDPG. 
if action_distribution_fn: dist_inputs, dist_class, self._state_out = \ action_distribution_fn( self, self.model, obs_batch=self._input_dict[SampleBatch.CUR_OBS], state_batches=self._state_in, seq_lens=self._seq_lens, prev_action_batch=self._input_dict[ SampleBatch.PREV_ACTIONS], prev_reward_batch=self._input_dict[ SampleBatch.PREV_REWARDS], explore=explore, is_training=self._input_dict["is_training"]) # Default distribution generation behavior: # Pass through model. E.g., PG, PPO. else: dist_inputs, self._state_out = self.model( self._input_dict, self._state_in, self._seq_lens) action_dist = dist_class(dist_inputs, self.model) # Using exploration to get final action (e.g. via sampling). sampled_action, sampled_action_logp = \ self.exploration.get_exploration_action( action_distribution=action_dist, timestep=timestep, explore=explore) # Phase 1 init. sess = tf1.get_default_session() or tf1.Session() if get_batch_divisibility_req: batch_divisibility_req = get_batch_divisibility_req(self) else: batch_divisibility_req = 1 super().__init__( observation_space=obs_space, action_space=action_space, config=config, sess=sess, obs_input=obs, action_input=action_input, # for logp calculations sampled_action=sampled_action, sampled_action_logp=sampled_action_logp, dist_inputs=dist_inputs, dist_class=dist_class, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, state_inputs=self._state_in, state_outputs=self._state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self._seq_lens, max_seq_len=config["model"]["max_seq_len"], batch_divisibility_req=batch_divisibility_req, explore=explore, timestep=timestep) # Phase 2 init. if before_loss_init is not None: before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss()
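# For context, a hedged sketch of the kind of loss_fn this dynamic policy
# expects (the exact signature varies across RLlib versions; the name
# below is illustrative): it reads placeholders through the train batch,
# returns a scalar loss tensor, and only the keys it actually touches
# become loss inputs. Assumes the tf / SampleBatch imports used
# throughout this file.

def example_pg_loss_fn(policy, model, dist_class, train_batch):
    # forward pass through the model to parameterize the action dist
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    # vanilla policy gradient: -E[logp(a) * advantage]
    logp = action_dist.logp(train_batch[SampleBatch.ACTIONS])
    return -tf.reduce_mean(logp * train_batch["advantages"])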
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             log_likelihood_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        observation_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): Function that returns a loss tensor given the
            policy and a dict of experience tensor placeholders.
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        before_loss_init (Optional[callable]): Optional function to run
            prior to loss init that takes the same arguments as __init__.
        make_model (func): optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config).
            All policy variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[callable]): An optional callable
            returning a tuple of action and action prob tensors given
            (policy, model, input_dict, obs_space, action_space, config).
            If None, a default action distribution will be used.
        log_likelihood_fn (Optional[callable]): A callable, returning a
            log-likelihood op. If None, a default class is used and
            distribution inputs (for parameterization) will be generated
            by a model call.
        existing_inputs (OrderedDict): When copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        existing_model (ModelV2): when copying a policy, this specifies
            an existing model to clone and share weights with
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input
    """
    self.config = config
    self.framework = "tf"
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
        action_input = existing_inputs[SampleBatch.ACTIONS]
    else:
        obs = tf.placeholder(tf.float32,
                             shape=[None] + list(obs_space.shape),
                             name="observation")
        action_input = ModelCatalog.get_action_placeholder(action_space)
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space, "prev_action")
            prev_rewards = tf.placeholder(tf.float32, [None],
                                          name="prev_reward")

    explore = tf.placeholder_with_default(False, (), name="is_exploring")

    self._input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    # Placeholder for RNN time-chunk valid lengths.
self._seq_lens = tf.placeholder(dtype=tf.int32, shape=[None], name="seq_lens") if action_sampler_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` is given") self.dist_class = None else: self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Setup model if existing_model: self.model = existing_model elif make_model: self.model = make_model(self, obs_space, action_space, config) else: self.model = ModelCatalog.get_model_v2(obs_space, action_space, logit_dim, self.config["model"], framework="tf") if existing_inputs: self._state_in = [ v for k, v in existing_inputs.items() if k.startswith("state_in_") ] if self._state_in: self._seq_lens = existing_inputs["seq_lens"] else: self._state_in = [ tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype) for s in self.model.get_initial_state() ] model_out, self._state_out = self.model(self._input_dict, self._state_in, self._seq_lens) # Create the Exploration object to use for this Policy. self.exploration = self._create_exploration(action_space, config) timestep = tf.placeholder(tf.int32, (), name="timestep") # Setup custom action sampler. if action_sampler_fn: sampled_action, sampled_action_logp = action_sampler_fn( self, self.model, self._input_dict, obs_space, action_space, explore, config, timestep) # Create a default action sampler. else: # Using an exploration setup. sampled_action, sampled_action_logp = \ self.exploration.get_exploration_action( model_out, self.dist_class, self.model, explore=explore, timestep=timestep) # Phase 1 init. sess = tf.get_default_session() or tf.Session() if get_batch_divisibility_req: batch_divisibility_req = get_batch_divisibility_req(self) else: batch_divisibility_req = 1 # Generate the log-likelihood op. log_likelihood = None # From a given function. if log_likelihood_fn: log_likelihood = log_likelihood_fn(self, self.model, action_input, self._input_dict, obs_space, action_space, config) # Create default, iff we have a distribution class. elif self.dist_class is not None: log_likelihood = self.dist_class(model_out, self.model).logp(action_input) super().__init__( obs_space, action_space, config, sess, obs_input=obs, action_input=action_input, # for logp calculations sampled_action=sampled_action, sampled_action_logp=sampled_action_logp, log_likelihood=log_likelihood, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, state_inputs=self._state_in, state_outputs=self._state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self._seq_lens, max_seq_len=config["model"]["max_seq_len"], batch_divisibility_req=batch_divisibility_req, explore=explore, timestep=timestep) # Phase 2 init. if before_loss_init is not None: before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss()
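# A hedged sketch of a custom action_sampler_fn matching the call made
# above: a deterministic argmax sampler with a zero log-prob (there is no
# sampling noise to account for). The function name is hypothetical and
# purely illustrative; signatures differ slightly across RLlib versions.

def greedy_action_sampler_fn(policy, model, input_dict, obs_space,
                             action_space, explore, config, timestep):
    # forward pass through the model (stateless here for simplicity)
    logits, _ = model(input_dict, [], None)
    actions = tf.argmax(logits, axis=-1)
    logp = tf.zeros_like(tf.cast(actions, tf.float32))  # deterministic
    return actions, logp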
def __init__(self, observation_space, action_space, config):
    # config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    self.inner_lr = self.config["inner_lr"]
    self.outer_lr = self.config["outer_lr"]
    self.mode = self.config["mode"]
    assert self.mode in ["local", "remote"]
    assert self.kl_coeff_val == 0.0

    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    with tf.name_scope("inputs"):
        obs_ph = tf.placeholder(tf.float32,
                                shape=(None, ) + observation_space.shape,
                                name="obs")
        adv_ph = tf.placeholder(tf.float32,
                                shape=(None, ),
                                name="advantages")
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(tf.float32,
                                   shape=(None, logit_dim),
                                   name="logits")
        vf_preds_ph = tf.placeholder(tf.float32,
                                     shape=(None, ),
                                     name="vf_preds")
        value_targets_ph = tf.placeholder(tf.float32,
                                          shape=(None, ),
                                          name="value_targets")
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name="prev_rewards")
    existing_state_in = None
    existing_seq_lens = None
    self.observations = obs_ph

    self.a3c_loss_in = [("obs", obs_ph), ("advantages", adv_ph),
                        ("actions", act_ph),
                        ("value_targets", value_targets_ph),
                        ("vf_preds", vf_preds_ph),
                        ("prev_actions", prev_actions_ph),
                        ("prev_rewards", prev_rewards_ph)]
    # self.a3c_loss_in = [
    #     ("obs", obs_ph),
    #     ("advantages", adv_ph),
    #     ("actions", act_ph),
    #     ("prev_actions", prev_actions_ph),
    #     ("prev_rewards", prev_rewards_ph)]
    self.ppo_loss_in = list(self.a3c_loss_in) \
        + [("logits", logits_ph)]

    assert self.config["model"]["custom_model"]
    logger.info(
        f'Using custom model {self.config["model"]["custom_model"]}')
    model_cls = _global_registry.get(RLLIB_MODEL,
                                     self.config["model"]["custom_model"])
    new_variables, grad_placeholders, custom_variables, dummy_variables = \
        model_cls.prepare(observation_space,
                          (logit_dim // 2
                           if self.config["model"]["free_log_std"] else
                           logit_dim),
                          self.config["model"],
                          func=lambda x, y: x - self.inner_lr * y)
    self._new_variables = new_variables
    self._grad_placeholders = grad_placeholders
    self._custom_variables = custom_variables
    self._dummy_variables = dummy_variables
    self._inner_variables = nest.flatten(custom_variables)
    # for Meta-SGD, `custom_variables` and `adaptive learning rates`
    self._outer_variables = nest.flatten(custom_variables)
    self._variables = {var.op.name: var for var in self._outer_variables}
    self._grad_phs_loss_inputs = []
    for i in range(len(grad_placeholders)):
        for key, ph in grad_placeholders[i].items():
            self._grad_phs_loss_inputs.append(
                (custom_variables[i][key].op.name, ph))
    self._grad_phs_loss_input_dict = dict(self._grad_phs_loss_inputs)

    self.model = model_cls(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens,
        custom_params=new_variables)

    self.logits = self.model.outputs
    with tf.name_scope("sampler"):
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()

    assert self.config["use_gae"] and self.config["vf_share_layers"]
    self.value_function = self.model.value_function()

    if self.model.state_in:
        raise NotImplementedError
    else:
        mask = None

    with tf.name_scope("a3c_loss"):
        self.a3c_loss_obj = A3CLoss(
            action_dist=curr_action_dist,
            actions=act_ph,
            advantages=adv_ph,
            value_targets=value_targets_ph,
            vf_preds=vf_preds_ph,
            value_function=self.value_function,
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            vf_clip_param=self.config["vf_clip_param"])
        # self.a3c_loss_obj = PGLoss(
        #     curr_action_dist, act_ph, adv_ph)

    with tf.name_scope(
            "ppo_loss"
    ):  # write own PPO loss, boolean_mask -> dynamic_partition
        self.ppo_loss_obj = PPOLoss(
            action_dist=curr_action_dist,
            action_space=action_space,
            logits=logits_ph,
            actions=act_ph,
            advantages=adv_ph,
            value_targets=value_targets_ph,
            vf_preds=vf_preds_ph,
            value_function=self.value_function,
            valid_mask=mask,
            kl_coeff=self.kl_coeff_val,
            clip_param=self.config["clip_param"],
            vf_clip_param=self.config["vf_clip_param"],
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            use_gae=self.config["use_gae"])

    BaseMAMLPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        inner_loss=self.a3c_loss_obj.loss,
        inner_loss_inputs=self.a3c_loss_in,
        outer_loss=self.ppo_loss_obj.loss,
        outer_loss_inputs=self.ppo_loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.a3c_stats_fetches = {
        "total_loss": self.a3c_loss_obj.loss,
        "policy_loss": self.a3c_loss_obj.mean_policy_loss,
        "vf_loss": self.a3c_loss_obj.mean_vf_loss,
        "entropy": self.a3c_loss_obj.mean_entropy
    }
    self.ppo_stats_fetches = {
        "total_loss": self.ppo_loss_obj.loss,
        "policy_loss": self.ppo_loss_obj.mean_policy_loss,
        "vf_loss": self.ppo_loss_obj.mean_vf_loss,
        "entropy": self.ppo_loss_obj.mean_entropy,
        "kl": self.ppo_loss_obj.mean_kl
    }
    self.sess.run(tf.global_variables_initializer())
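# The `func=lambda x, y: x - self.inner_lr * y` passed to
# model_cls.prepare above encodes the MAML inner step: each adapted
# parameter is the original variable minus inner_lr times a gradient
# placeholder, so the outer (PPO) loss can be differentiated through the
# inner (A3C) update. A hedged, self-contained sketch of that
# construction, using the same [group][key] nesting as the code above
# (the function name is hypothetical):

def build_adapted_params(custom_variables, grad_placeholders, inner_lr):
    adapted = []
    for var_group, grad_group in zip(custom_variables, grad_placeholders):
        adapted.append({
            key: var_group[key] - inner_lr * grad_group[key]
            for key in var_group
        })
    return adapted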
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    elif self.config["vtrace"]:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for APPO + VTrace.".format(
                action_space))
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Policy network model
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Create input placeholders
    if existing_inputs:
        if self.config["vtrace"]:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards, adv_ph, value_targets = \
                existing_inputs[:9]
            existing_state_in = existing_inputs[9:-1]
            existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(tf.float32, [None, logit_dim],
                                          name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

        if not self.config["vtrace"]:
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            value_targets = tf.placeholder(tf.float32,
                                           name="value_targets",
                                           shape=(None, ))
    self.observations = observations

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(behaviour_logits,
                                         output_hidden_shape,
                                         axis=1)

    # Setup the policy
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(self.model.outputs,
                                output_hidden_shape,
                                axis=1)

    dist_inputs = unpacked_outputs if is_multidiscrete else \
        self.model.outputs
    prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \
        behaviour_logits

    action_dist = dist_class(dist_inputs)
    prev_action_dist = dist_class(prev_dist_inputs)

    values = self.model.value_function()
    self.value_function = values
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
""" if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. if self.config["vtrace"]: logger.info("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims( actions, axis=1) self.loss = VTraceSurrogateLoss( actions=make_time_major(loss_actions, drop_last=True), prev_actions_logp=make_time_major( prev_action_dist.logp(actions), drop_last=True), actions_logp=make_time_major(action_dist.logp(actions), drop_last=True), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major(unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major(unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self. 
config["vtrace_clip_pg_rho_threshold"], clip_param=self.config["clip_param"]) else: logger.info("Using PPO surrogate loss (vtrace=False)") self.loss = PPOSurrogateLoss( prev_actions_logp=make_time_major( prev_action_dist.logp(actions)), actions_logp=make_time_major(action_dist.logp(actions)), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy()), values=make_time_major(values), valid_mask=make_time_major(mask), advantages=make_time_major(adv_ph), value_targets=make_time_major(value_targets), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions.percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] if not self.config["vtrace"]: loss_in.append(("advantages", adv_ph)) loss_in.append(("value_targets", value_targets)) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) values_batched = make_time_major(values, drop_last=self.config["vtrace"]) self.stats_fetches = { "stats": dict( { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.value_targets, [-1]), tf.reshape(values_batched, [-1])), }, **self.KL_stats), }
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built upon.
    """
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32, name="obs",
            shape=(None,) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None,))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None,))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None,))
        existing_state_in = None
        existing_seq_lens = None
    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph, logit_dim, self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        vf_config["use_lstm"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    self.loss_obj = PPOLoss(
        action_space, value_targets_ph, adv_ph, act_ph,
        logits_ph, vf_preds_ph, curr_action_dist, self.value_function,
        self.kl_coeff, entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=obs_ph, action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
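# For reference, a hedged sketch of the clipped surrogate objective that
# a PPOLoss like the one above is built around (the textbook PPO term,
# not RLlib's exact implementation, which also adds KL and value terms).
# The function name is illustrative; assumes the module-level `tf` import
# used throughout this file.

def clipped_surrogate(logp_new, logp_old, advantages, clip_param):
    ratio = tf.exp(logp_new - logp_old)
    surr1 = ratio * advantages
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * advantages
    # pessimistic (min) bound, negated because we minimize
    return -tf.reduce_mean(tf.minimum(surr1, surr2))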
def _initialize_loss(self): def fake_array(tensor): shape = tensor.shape.as_list() shape = [s if s is not None else 1 for s in shape] return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype) dummy_batch = { SampleBatch.CUR_OBS: fake_array(self._obs_input), SampleBatch.NEXT_OBS: fake_array(self._obs_input), SampleBatch.DONES: np.array([False], dtype=np.bool), SampleBatch.ACTIONS: fake_array( ModelCatalog.get_action_placeholder(self.action_space)), SampleBatch.REWARDS: np.array([0], dtype=np.float32), } if self._obs_include_prev_action_reward: dummy_batch.update({ SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), }) state_init = self.get_initial_state() state_batches = [] for i, h in enumerate(state_init): dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0) dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0) state_batches.append(np.expand_dims(h, 0)) if state_init: dummy_batch["seq_lens"] = np.array([1], dtype=np.int32) for k, v in self.extra_compute_action_fetches().items(): dummy_batch[k] = fake_array(v) # postprocessing might depend on variable init, so run it first here self._sess.run(tf.global_variables_initializer()) postprocessed_batch = self.postprocess_trajectory( SampleBatch(dummy_batch)) # model forward pass for the loss (needed after postprocess to # overwrite any tensor state from that call) self.model(self._input_dict, self._state_in, self._seq_lens) if self._obs_include_prev_action_reward: train_batch = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, }) loss_inputs = [ (SampleBatch.PREV_ACTIONS, self._prev_action_input), (SampleBatch.PREV_REWARDS, self._prev_reward_input), (SampleBatch.CUR_OBS, self._obs_input), ] else: train_batch = UsageTrackingDict({ SampleBatch.CUR_OBS: self._obs_input, }) loss_inputs = [ (SampleBatch.CUR_OBS, self._obs_input), ] for k, v in postprocessed_batch.items(): if k in train_batch: continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF elif k == "seq_lens" or k.startswith("state_in_"): continue shape = (None, ) + v.shape[1:] dtype = np.float32 if v.dtype == np.float64 else v.dtype placeholder = tf.placeholder(dtype, shape=shape, name=k) train_batch[k] = placeholder for i, si in enumerate(self._state_in): train_batch["state_in_{}".format(i)] = si train_batch["seq_lens"] = self._seq_lens if log_once("loss_init"): logger.debug( "Initializing loss function with dummy input:\n\n{}\n".format( summarize(train_batch))) self._loss_input_dict = train_batch loss = self._do_loss_init(train_batch) for k in sorted(train_batch.accessed_keys): if k != "seq_lens" and not k.startswith("state_in_"): loss_inputs.append((k, train_batch[k])) TFPolicy._initialize_loss(self, loss, loss_inputs) if self._grad_stats_fn: self._stats_fetches.update( self._grad_stats_fn(self, train_batch, self._grads)) self._sess.run(tf.global_variables_initializer())
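# A quick illustration of the fake_array helper above: unknown (None)
# dims are pinned to 1, so any placeholder yields a single-row zero array
# that postprocess_trajectory can consume before the real loss exists.
# Assumes the TF1-style `tf` and `np` imports used throughout this file.

ph = tf.placeholder(tf.float32, shape=[None, 4])
shape = [s if s is not None else 1 for s in ph.shape.as_list()]
dummy = np.zeros(shape, dtype=ph.dtype.as_numpy_dtype)
assert dummy.shape == (1, 4) and dummy.dtype == np.float32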
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    elif self.config["vtrace"]:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for APPO + VTrace.".format(
                action_space))
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Policy network model
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Create input placeholders
    if existing_inputs:
        if self.config["vtrace"]:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards, adv_ph, value_targets = \
                existing_inputs[:9]
            existing_state_in = existing_inputs[9:-1]
            existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

        if not self.config["vtrace"]:
            adv_ph = tf.placeholder(
                tf.float32, name="advantages", shape=(None, ))
            value_targets = tf.placeholder(
                tf.float32, name="value_targets", shape=(None, ))
    self.observations = observations

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    dist_inputs = unpacked_outputs if is_multidiscrete else \
        self.model.outputs
    prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \
        behaviour_logits

    action_dist = dist_class(dist_inputs)
    prev_action_dist = dist_class(prev_dist_inputs)

    values = self.model.value_function()
    self.value_function = values
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
""" if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. if self.config["vtrace"]: logger.info("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims( actions, axis=1) self.loss = VTraceSurrogateLoss( actions=make_time_major(loss_actions, drop_last=True), prev_actions_logp=make_time_major( prev_action_dist.logp(actions), drop_last=True), actions_logp=make_time_major( action_dist.logp(actions), drop_last=True), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major( action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major( unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major( unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config[ "vtrace_clip_pg_rho_threshold"], clip_param=self.config["clip_param"]) else: logger.info("Using PPO surrogate loss (vtrace=False)") self.loss = PPOSurrogateLoss( prev_actions_logp=make_time_major( prev_action_dist.logp(actions)), actions_logp=make_time_major(action_dist.logp(actions)), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy()), values=make_time_major(values), valid_mask=make_time_major(mask), advantages=make_time_major(adv_ph), value_targets=make_time_major(value_targets), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions. 
percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] if not self.config["vtrace"]: loss_in.append(("advantages", adv_ph)) loss_in.append(("value_targets", value_targets)) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) values_batched = make_time_major( values, drop_last=self.config["vtrace"]) self.stats_fetches = { "stats": dict({ "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.value_targets, [-1]), tf.reshape(values_batched, [-1])), }, **self.KL_stats), }
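# The KL debug stats above compare worker (behaviour) and learner logits.
# A hedged sketch of the per-sample categorical KL they are built from,
# KL(p || q) = sum_a p(a) * (log p(a) - log q(a)); the function name is
# illustrative, and the `tf` import used throughout this file is assumed.

def categorical_kl(p_logits, q_logits):
    p = tf.nn.softmax(p_logits)
    return tf.reduce_sum(
        p * (tf.nn.log_softmax(p_logits) - tf.nn.log_softmax(q_logits)),
        axis=-1)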
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) self.prev_actions = ModelCatalog.get_action_placeholder(action_space) self.prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": self.observations, "prev_actions": self.prev_actions, "prev_rewards": self.prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("prev_actions", self.prev_actions), ("prev_rewards", self.prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=self.prev_actions, prev_reward_input=self.prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { LEARNER_STATS_KEY: { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.vf), }, } self.sess.run(tf.global_variables_initializer())
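# A hedged sketch of how an A3CLoss like the one above typically combines
# its terms (sum-reduced policy gradient, squared value error, entropy
# bonus); the coefficients mirror the config keys used above, but this is
# an illustration of the standard A3C objective, not RLlib's exact code.

def a3c_total_loss(logp, advantages, vf, v_target, entropy,
                   vf_loss_coeff, entropy_coeff):
    pi_loss = -tf.reduce_sum(logp * advantages)
    vf_loss = 0.5 * tf.reduce_sum(tf.square(vf - v_target))
    return pi_loss + vf_loss_coeff * vf_loss - entropy_coeff * entropy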
def _initialize_loss(self):
    def fake_array(tensor):
        shape = tensor.shape.as_list()
        shape[0] = 1
        return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

    dummy_batch = {
        SampleBatch.CUR_OBS: fake_array(self._obs_input),
        SampleBatch.NEXT_OBS: fake_array(self._obs_input),
        SampleBatch.DONES: np.array([False], dtype=np.bool),
        SampleBatch.ACTIONS: fake_array(
            ModelCatalog.get_action_placeholder(self.action_space)),
        SampleBatch.REWARDS: np.array([0], dtype=np.float32),
    }
    if self._obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input),
            SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input),
        })
    state_init = self.get_initial_state()
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
        dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
    if state_init:
        dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
    for k, v in self.extra_compute_action_fetches().items():
        dummy_batch[k] = fake_array(v)

    # postprocessing might depend on variable init, so run it first here
    self._sess.run(tf.global_variables_initializer())
    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    if self._obs_include_prev_action_reward:
        batch_tensors = UsageTrackingDict({
            SampleBatch.PREV_ACTIONS: self._prev_action_input,
            SampleBatch.PREV_REWARDS: self._prev_reward_input,
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.PREV_ACTIONS, self._prev_action_input),
            (SampleBatch.PREV_REWARDS, self._prev_reward_input),
            (SampleBatch.CUR_OBS, self._obs_input),
        ]
    else:
        batch_tensors = UsageTrackingDict({
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.CUR_OBS, self._obs_input),
        ]

    for k, v in postprocessed_batch.items():
        if k in batch_tensors:
            continue
        elif v.dtype == np.object:
            continue  # can't handle arbitrary objects in TF
        shape = (None, ) + v.shape[1:]
        dtype = np.float32 if v.dtype == np.float64 else v.dtype
        placeholder = tf.placeholder(dtype, shape=shape, name=k)
        batch_tensors[k] = placeholder

    if log_once("loss_init"):
        logger.info(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(batch_tensors)))

    loss = self._do_loss_init(batch_tensors)
    for k in sorted(batch_tensors.accessed_keys):
        loss_inputs.append((k, batch_tensors[k]))

    # XXX experimental support for automatically eagerifying the loss.
    # The main limitation right now is that TF doesn't support mixing eager
    # and non-eager tensors, so losses that read non-eager tensors through
    # `policy` need to use `policy.convert_to_eager(tensor)`.
    if self.config["use_eager"]:
        if not self.model:
            raise ValueError("eager not implemented in this case")
        graph_tensors = list(self._needs_eager_conversion)

        def gen_loss(model_outputs, *args):
            # fill in the batch tensor dict with eager tensors
            eager_inputs = dict(
                zip([k for (k, v) in loss_inputs],
                    args[:len(loss_inputs)]))
            # fill in the eager versions of all accessed graph tensors
            self._eager_tensors = dict(
                zip(graph_tensors, args[len(loss_inputs):]))
            # patch the action dist to use eager mode tensors
            self.action_dist.inputs = model_outputs
            return self._loss_fn(self, eager_inputs)

        # TODO(ekl) also handle the stats funcs
        loss = tf.py_function(
            gen_loss,
            # cast works around TypeError: Cannot convert provided value
            # to EagerTensor. Provided value: 0.0 Requested dtype: int64
            [self.model.outputs] +
            [tf.cast(v, tf.float32) for (k, v) in loss_inputs] +
            [tf.cast(t, tf.float32) for t in graph_tensors],
            tf.float32)

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
    self._sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() # Create input placeholders if existing_inputs: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: if isinstance(action_space, gym.spaces.Discrete): ac_size = action_space.n actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder(tf.float32, [None, ac_size], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) action_dist = dist_class(self.model.outputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def to_batches(tensor): if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards, dtype=tf.bool) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
self.loss = VTraceLoss( actions=to_batches(actions)[:-1], actions_logp=to_batches(action_dist.logp(actions))[:-1], actions_entropy=to_batches(action_dist.entropy())[:-1], dones=to_batches(dones)[:-1], behaviour_logits=to_batches(behaviour_logits)[:-1], target_logits=to_batches(self.model.outputs)[:-1], discount=config["gamma"], rewards=to_batches(rewards)[:-1], values=to_batches(values)[:-1], bootstrap_value=to_batches(values)[-1], valid_mask=to_batches(mask)[:-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging model_dist = Categorical(self.model.outputs) behaviour_dist = Categorical(behaviour_logits) self.KLs = model_dist.kl(behaviour_dist) self.mean_KL = tf.reduce_mean(self.KLs) self.max_KL = tf.reduce_max(self.KLs) self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(to_batches(values)[:-1], [-1])), "mean_KL": self.mean_KL, "max_KL": self.max_KL, "median_KL": self.median_KL, }, }
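# A hedged illustration of the [:-1] / [-1] split used when feeding
# VTraceLoss above: with time-major values of shape [T, B], the loss
# consumes the first T - 1 steps and the final step serves as the
# bootstrap value. Assumes the `tf` import used throughout this file.

values_tb = tf.reshape(tf.range(6, dtype=tf.float32), [3, 2])  # T=3, B=2
loss_values = values_tb[:-1]      # shape [2, 2]: steps 0 .. T-2
bootstrap_value = values_tb[-1]   # shape [2]: value at step T-1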
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             update_ops_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_action_sampler=None,
             existing_inputs=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        update_ops_fn (func): optional function that returns a list
            overriding the update ops to run when applying gradients
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as __init__
        make_action_sampler (func): optional function that returns a
            tuple of action and action prob tensors. The function takes
            (policy, input_dict, obs_space, action_space, config) as
            its arguments
        existing_inputs (OrderedDict): when copying a policy, this
            specifies an existing dict of placeholders to use instead
            of defining new ones
        get_batch_divisibility_req (func): optional function that
            returns the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._update_ops_fn = update_ops_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }

    # Create the model network and action outputs
    if make_action_sampler:
        assert not existing_inputs, \
            "Cloning not supported with custom action sampler"
        self.model = None
        self.dist_class = None
        self.action_dist = None
        action_sampler, action_prob = make_action_sampler(
            self, self.input_dict, obs_space, action_space, config)
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        if existing_inputs:
            existing_state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if existing_state_in:
                existing_seq_lens = existing_inputs["seq_lens"]
            else:
                existing_seq_lens = None
        else:
            existing_state_in = []
            existing_seq_lens = None
        self.model = ModelCatalog.get_model(
            self.input_dict,
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        self.action_dist = self.dist_class(self.model.outputs)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self.model and self.model.state_in,
        state_outputs=self.model and self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model and self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    self._needs_eager_conversion = set()
    self._eager_tensors = {}
    before_loss_init(self, obs_space, action_space, config)

    if not existing_inputs:
        self._initialize_loss()
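The loss is deliberately not built here: phase 1 registers the policy with an empty loss, and `_initialize_loss` later calls the user-supplied `loss_fn` on a dict of batch tensors. A minimal sketch of a compatible `loss_fn`, assuming the `(policy, batch_tensors)` signature implied by the docstring and an "advantages" key added during postprocessing (both assumptions, mirroring the PG examples earlier in this document):

def policy_gradient_loss(policy, batch_tensors):
    # Vanilla policy-gradient objective over the postprocessed batch.
    logp = policy.action_dist.logp(batch_tensors[SampleBatch.ACTIONS])
    return -tf.reduce_mean(logp * batch_tensors["advantages"])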
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as __init__
        make_model (func): optional function that returns a ModelV2
            object given (policy, obs_space, action_space, config).
            All policy variables should be created in this function.
            If not specified, a default model will be created.
        action_sampler_fn (func): optional function that returns a
            tuple of action and action prob tensors given
            (policy, model, input_dict, obs_space, action_space,
            config). If not specified, a default action distribution
            will be used.
        existing_inputs (OrderedDict): when copying a policy, this
            specifies an existing dict of placeholders to use instead
            of defining new ones
        existing_model (ModelV2): when copying a policy, this specifies
            an existing model to clone and share weights with
        get_batch_divisibility_req (func): optional function that
            returns the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input

    Attributes:
        config: config of the policy
        model: model instance, if any
        model_out: output tensors of the model
        action_dist: action distribution of the model, if any
        state_in: state input tensors, if any
        state_out: state output tensors, if any
        seq_lens: tensor of sequence lengths
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    self.seq_lens = tf.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")

    # Setup model
    if action_sampler_fn:
        if not make_model:
            raise ValueError(
                "make_model is required if action_sampler_fn is given")
        self.dist_class = None
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if existing_model:
        self.model = existing_model
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            framework="tf")

    if existing_inputs:
        self.state_in = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        if self.state_in:
            self.seq_lens = existing_inputs["seq_lens"]
    else:
        self.state_in = [
            tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
            for s in self.model.get_initial_state()
        ]
    self.model_out, self.state_out = self.model(
        self.input_dict, self.state_in, self.seq_lens)

    # Setup action sampler
    if action_sampler_fn:
        self.action_dist = None
        action_sampler, action_prob = action_sampler_fn(
            self, self.model, self.input_dict, obs_space, action_space,
            config)
    else:
        self.action_dist = self.dist_class(self.model_out)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self.state_in,
        state_outputs=self.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    before_loss_init(self, obs_space, action_space, config)

    if not existing_inputs:
        self._initialize_loss()
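A minimal sketch of a `make_model` callable compatible with the hook above. It simply mirrors the default catalog path; a real override would construct a custom ModelV2 subclass here instead, keeping in mind that all policy variables must be created inside this call (per the docstring). The function name is hypothetical:

def make_default_model(policy, obs_space, action_space, config):
    # Derive the output layer size from the action distribution, then
    # defer to the catalog for a default ModelV2 instance.
    _, logit_dim = ModelCatalog.get_action_dist(action_space,
                                                config["model"])
    return ModelCatalog.get_model_v2(obs_space, action_space, logit_dim,
                                     config["model"], framework="tf")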
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built upon.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    self.value_function = self.model.value_function

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
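The `existing_inputs` branch above exists so a copy of the graph can be stitched over another instance's placeholders while sharing variables, e.g. for multi-GPU towers. A sketch of how such a clone might be assembled; the `PPOPolicyGraph` class name and helper are hypothetical, and the input ordering mirrors the unpacking above (six loss placeholders, then RNN state, then `seq_lens` last):

def clone_ppo_graph(original, observation_space, action_space, config):
    # Flatten the original's placeholders in the exact order the
    # existing_inputs branch expects to unpack them.
    copy_inputs = [ph for _, ph in original.loss_in]
    copy_inputs += list(original.model.state_in)
    copy_inputs += [original.model.seq_lens]
    return PPOPolicyGraph(
        observation_space, action_space, config,
        existing_inputs=copy_inputs)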
def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
        loss_fn: Callable[
            [Policy, ModelV2, Type[TFActionDistribution], SampleBatch],
            TensorType],
        *,
        stats_fn: Optional[Callable[[Policy, SampleBatch],
                                    Dict[str, TensorType]]] = None,
        grad_stats_fn: Optional[Callable[
            [Policy, SampleBatch, ModelGradients],
            Dict[str, TensorType]]] = None,
        before_loss_init: Optional[Callable[
            [Policy, gym.spaces.Space, gym.spaces.Space,
             TrainerConfigDict], None]] = None,
        make_model: Optional[Callable[
            [Policy, gym.spaces.Space, gym.spaces.Space,
             TrainerConfigDict], ModelV2]] = None,
        action_sampler_fn: Optional[Callable[
            [TensorType, List[TensorType]],
            Tuple[TensorType, TensorType]]] = None,
        action_distribution_fn: Optional[Callable[
            [Policy, ModelV2, TensorType, TensorType, TensorType],
            Tuple[TensorType, type, List[TensorType]]]] = None,
        existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
        existing_model: Optional[ModelV2] = None,
        get_batch_divisibility_req: Optional[Callable[[Policy],
                                                      int]] = None,
        obs_include_prev_action_reward: bool = True):
    """Initialize a dynamic TF policy.

    Args:
        obs_space (gym.spaces.Space): Observation space of the policy.
        action_space (gym.spaces.Space): Action space of the policy.
        config (TrainerConfigDict): Policy-specific configuration data.
        loss_fn (Callable[[Policy, ModelV2, Type[TFActionDistribution],
            SampleBatch], TensorType]): Function that returns a loss
            tensor for the policy graph.
        stats_fn (Optional[Callable[[Policy, SampleBatch],
            Dict[str, TensorType]]]): Optional function that returns a
            dict of TF fetches given the policy and batch input
            tensors.
        grad_stats_fn (Optional[Callable[[Policy, SampleBatch,
            ModelGradients], Dict[str, TensorType]]]): Optional
            function that returns a dict of TF fetches given the
            policy, sample batch, and loss gradient tensors.
        before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
            gym.spaces.Space, TrainerConfigDict], None]]): Optional
            function to run prior to loss init that takes the same
            arguments as __init__.
        make_model (Optional[Callable[[Policy, gym.spaces.Space,
            gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional
            function that returns a ModelV2 object given policy,
            obs_space, action_space, and policy config. All policy
            variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[Callable[[Policy, ModelV2,
            Dict[str, TensorType], TensorType, TensorType],
            Tuple[TensorType, TensorType]]]): A callable returning a
            sampled action and its log-likelihood given Policy,
            ModelV2, input_dict, explore, timestep, and is_training.
        action_distribution_fn (Optional[Callable[[Policy, ModelV2,
            Dict[str, TensorType], TensorType, TensorType],
            Tuple[TensorType, type, List[TensorType]]]]): A callable
            returning distribution inputs (parameters), a dist-class to
            generate an action distribution object from, and
            internal-state outputs (or an empty list if not
            applicable). Note: No Exploration hooks have to be called
            from within `action_distribution_fn`. It should only
            perform a simple forward pass through some model. If None,
            pass inputs through `self.model()` to get distribution
            inputs. The callable takes as inputs: Policy, ModelV2,
            input_dict, explore, timestep, is_training.
        existing_inputs (Optional[Dict[str, tf1.placeholder]]): When
            copying a policy, this specifies an existing dict of
            placeholders to use instead of defining new ones.
        existing_model (Optional[ModelV2]): When copying a policy, this
            specifies an existing model to clone and share weights
            with.
        get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
            Optional callable that returns the divisibility requirement
            for sample batches. If None, will assume a value of 1.
        obs_include_prev_action_reward (bool): Whether to include the
            previous action and reward in the model input (default:
            True).
    """
    self.observation_space = obs_space
    self.action_space = action_space
    self.config = config
    self.framework = "tf"
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    dist_class = dist_inputs = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    # Setup self.model.
    if existing_model:
        if isinstance(existing_model, list):
            self.model = existing_model[0]
            # TODO: (sven) hack, but works for `target_[q_]?model`.
            for i in range(1, len(existing_model)):
                setattr(self, existing_model[i][0],
                        existing_model[i][1])
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="tf")

    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    if existing_inputs:
        self._state_inputs = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        if self._state_inputs:
            self._seq_lens = existing_inputs["seq_lens"]
    else:
        if self.config["_use_trajectory_view_api"]:
            self._state_inputs = [
                get_placeholder(
                    space=vr.space,
                    time_axis=not isinstance(vr.shift, int),
                ) for k, vr in self.model.view_requirements.items()
                if k.startswith("state_in_")
            ]
        else:
            self._state_inputs = [
                tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
                for s in self.model.get_initial_state()
            ]

    # Use default settings.
    # Add NEXT_OBS, STATE_IN_0.., and others.
    self.view_requirements = self._get_default_view_requirements()
    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)

    # Setup standard placeholders.
    if existing_inputs is not None:
        timestep = existing_inputs["timestep"]
        explore = existing_inputs["is_exploring"]
        self._input_dict, self._dummy_batch = \
            self._get_input_dict_and_dummy_batch(self.view_requirements,
                                                 existing_inputs)
    else:
        action_ph = ModelCatalog.get_action_placeholder(action_space)
        if self.config["_use_trajectory_view_api"]:
            prev_action_ph = {}
            if SampleBatch.PREV_ACTIONS not in self.view_requirements:
                prev_action_ph = {
                    SampleBatch.PREV_ACTIONS: ModelCatalog.
                    get_action_placeholder(action_space, "prev_action")
                }
            self._input_dict, self._dummy_batch = \
                self._get_input_dict_and_dummy_batch(
                    self.view_requirements,
                    dict({SampleBatch.ACTIONS: action_ph},
                         **prev_action_ph))
        else:
            prev_action_ph = ModelCatalog.get_action_placeholder(
                action_space, "prev_action")
            self._input_dict = {
                SampleBatch.CUR_OBS: tf1.placeholder(
                    tf.float32,
                    shape=[None] + list(obs_space.shape),
                    name="observation")
            }
            self._input_dict[SampleBatch.ACTIONS] = action_ph
            if self._obs_include_prev_action_reward:
                self._input_dict.update({
                    SampleBatch.PREV_ACTIONS: prev_action_ph,
                    SampleBatch.PREV_REWARDS: tf1.placeholder(
                        tf.float32, [None], name="prev_reward"),
                })
        # Placeholder for (sampling steps) timestep (int).
        timestep = tf1.placeholder_with_default(
            tf.zeros((), dtype=tf.int64), (), name="timestep")
        # Placeholder for `is_exploring` flag.
        explore = tf1.placeholder_with_default(
            True, (), name="is_exploring")

    # Placeholder for RNN time-chunk valid lengths.
    self._seq_lens = tf1.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")
    # Placeholder for `is_training` flag.
    self._input_dict["is_training"] = self._get_is_training_placeholder()

    # Create the Exploration object to use for this Policy.
    self.exploration = self._create_exploration()

    # Fully customized action generation (e.g., custom policy).
    if action_sampler_fn:
        sampled_action, sampled_action_logp = action_sampler_fn(
            self,
            self.model,
            obs_batch=self._input_dict[SampleBatch.CUR_OBS],
            state_batches=self._state_inputs,
            seq_lens=self._seq_lens,
            prev_action_batch=self._input_dict.get(
                SampleBatch.PREV_ACTIONS),
            prev_reward_batch=self._input_dict.get(
                SampleBatch.PREV_REWARDS),
            explore=explore,
            is_training=self._input_dict["is_training"])
    # Distribution generation is customized, e.g., DQN, DDPG.
    else:
        if action_distribution_fn:
            # Try new action_distribution_fn signature, supporting
            # state_batches and seq_lens.
            in_dict = self._input_dict
            try:
                dist_inputs, dist_class, self._state_out = \
                    action_distribution_fn(
                        self,
                        self.model,
                        input_dict=in_dict,
                        state_batches=self._state_inputs,
                        seq_lens=self._seq_lens,
                        explore=explore,
                        timestep=timestep,
                        is_training=in_dict["is_training"])
            # Trying the old way (to stay backward compatible).
            # TODO: Remove in future.
            except TypeError as e:
                if "positional argument" in e.args[0] or \
                        "unexpected keyword argument" in e.args[0]:
                    dist_inputs, dist_class, self._state_out = \
                        action_distribution_fn(
                            self,
                            self.model,
                            obs_batch=in_dict[SampleBatch.CUR_OBS],
                            state_batches=self._state_inputs,
                            seq_lens=self._seq_lens,
                            prev_action_batch=in_dict.get(
                                SampleBatch.PREV_ACTIONS),
                            prev_reward_batch=in_dict.get(
                                SampleBatch.PREV_REWARDS),
                            explore=explore,
                            is_training=in_dict["is_training"])
                else:
                    raise e
        # Default distribution generation behavior:
        # Pass through model. E.g., PG, PPO.
        else:
            dist_inputs, self._state_out = self.model(
                self._input_dict, self._state_inputs, self._seq_lens)

        action_dist = dist_class(dist_inputs, self.model)

        # Using exploration to get final action (e.g. via sampling).
        sampled_action, sampled_action_logp = \
            self.exploration.get_exploration_action(
                action_distribution=action_dist,
                timestep=timestep,
                explore=explore)

    # Phase 1 init.
    sess = tf1.get_default_session() or tf1.Session()

    batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)

    super().__init__(
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        sess=sess,
        obs_input=self._input_dict[SampleBatch.OBS],
        action_input=self._input_dict[SampleBatch.ACTIONS],
        sampled_action=sampled_action,
        sampled_action_logp=sampled_action_logp,
        dist_inputs=dist_inputs,
        dist_class=dist_class,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self._state_inputs,
        state_outputs=self._state_out,
        prev_action_input=self._input_dict.get(SampleBatch.PREV_ACTIONS),
        prev_reward_input=self._input_dict.get(SampleBatch.PREV_REWARDS),
        seq_lens=self._seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req,
        explore=explore,
        timestep=timestep)

    # Phase 2 init.
    if before_loss_init is not None:
        before_loss_init(self, obs_space, action_space, config)

    # Loss initialization and model/postprocessing test calls.
    if not existing_inputs:
        self._initialize_loss_from_dummy_batch(
            auto_remove_unneeded_view_reqs=True)
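A minimal sketch of an `action_distribution_fn` using the newer keyword signature the try-branch above probes for: it runs the model forward and returns its outputs as distribution inputs. The function name is hypothetical, and returning `Categorical` is an assumption for illustration; a DQN/DDPG-style policy would compute its own distribution inputs (e.g., Q-values) and return the matching dist-class:

def simple_action_distribution_fn(policy, model, *, input_dict,
                                  state_batches, seq_lens, explore,
                                  timestep, is_training):
    # Plain forward pass; exploration is applied by the caller via
    # policy.exploration.get_exploration_action().
    dist_inputs, state_out = model(input_dict, state_batches, seq_lens)
    return dist_inputs, Categorical, state_out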
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model({
        "obs": self.observations,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }, observation_space, logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                        self.vf, self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Initialize TFPolicyGraph
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),
        ("value_targets", self.v_target),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        loss=self.model.loss() + self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "policy_entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(self.v_target,
                                                   self.vf),
        },
    }

    self.sess.run(tf.global_variables_initializer())
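The A3CLoss object referenced above is not shown here. A sketch of what it presumably computes, reconstructed from the attributes the stats dict reads out of it (pi_loss, vf_loss, entropy, total_loss); the exact reductions and coefficient signs are assumptions:

class A3CLossSketch(object):
    def __init__(self, action_dist, actions, advantages, v_target, vf,
                 vf_loss_coeff=0.5, entropy_coeff=0.01):
        log_prob = action_dist.logp(actions)
        # Policy-gradient term, weighted by the empirical advantages.
        self.pi_loss = -tf.reduce_sum(log_prob * advantages)
        # Squared-error term for the value-function baseline.
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(vf - v_target))
        # Entropy bonus to discourage premature convergence.
        self.entropy = tf.reduce_sum(action_dist.entropy())
        self.total_loss = (self.pi_loss +
                           self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)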