def __init__(self, obs_space, action_space, config):
    """Attach a value-function tensor to this object as ``self.value_function``.

    The tensor is selected by ``config``:

    * ``use_gae`` falsy: a zeros tensor whose length is the first dimension
      of ``self._obs_input``.
    * ``use_gae`` and ``vf_share_layers``: ``self.model.value_function()``.
    * otherwise: the flattened outputs of a separate single-output model
      built under the ``"value_function"`` variable scope.

    Args:
        obs_space: Observation space forwarded to ``ModelCatalog.get_model``.
        action_space: Action space forwarded to ``ModelCatalog.get_model``.
        config (dict): Must provide ``use_gae``; ``vf_share_layers`` and
            ``model`` are read when ``use_gae`` is truthy.
    """
    if not config["use_gae"]:
        self.value_function = tf.zeros(shape=tf.shape(self._obs_input)[:1])
        return

    if config["vf_share_layers"]:
        self.value_function = self.model.value_function()
        return

    # Shallow copy: the key overrides below do not touch config["model"].
    critic_options = config["model"].copy()
    # The critic's last layer must not be split into mean / standard
    # deviation parameters, and the standard deviations must not become
    # free variables.
    critic_options["free_log_std"] = False
    if critic_options["use_lstm"]:
        critic_options["use_lstm"] = False
        logger.warning(
            "It is not recommended to use a LSTM model with "
            "vf_share_layers=False (consider setting it to True). "
            "If you want to not share layers, you can implement "
            "a custom LSTM model that overrides the "
            "value_function() method.")

    with tf.variable_scope("value_function"):
        critic_inputs = {
            "obs": self._obs_input,
            "prev_actions": self._prev_action_input,
            "prev_rewards": self._prev_reward_input,
            "is_training": self._get_is_training_placeholder(),
        }
        critic = ModelCatalog.get_model(
            critic_inputs, obs_space, action_space, 1, critic_options)
        self.value_function = tf.reshape(critic.outputs, [-1])
def __init__(self, obs_space, action_space, registry, config):
    """Build a policy-gradient graph and initialise ``TFPolicyGraph`` with it.

    Args:
        obs_space: Observation space; its ``shape`` sizes the input
            placeholder.
        action_space: Action space used to pick the action distribution and
            the action placeholder.
        registry: Passed through to ``ModelCatalog.get_model``.
        config (dict): Stored as ``self.config``; ``config["model"]`` holds
            the model options.
    """
    self.config = config
    model_options = config["model"]

    # Policy network.
    obs_shape = [None] + list(obs_space.shape)
    self.x = tf.placeholder(tf.float32, shape=obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, model_options)
    self.logit_dim = logit_dim
    self.model = ModelCatalog.get_model(
        registry, self.x, logit_dim, options=model_options)
    self.dist = dist_class(self.model.outputs)  # logit for each action

    # Policy loss: negative mean of log-prob weighted by advantage.
    self.ac = ModelCatalog.get_action_placeholder(action_space)
    self.adv = tf.placeholder(tf.float32, [None], name="adv")
    log_probs = self.dist.logp(self.ac)
    self.loss = -tf.reduce_mean(log_probs * self.adv)

    # Hand everything to the TFPolicyGraph base class.
    self.sess = tf.get_default_session()
    self.loss_in = [
        ("obs", self.x),
        ("actions", self.ac),
        ("advantages", self.adv),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    graph_kwargs = {
        "obs_input": self.x,
        "action_sampler": self.dist.sample(),
        "loss": self.loss,
        "loss_inputs": self.loss_in,
        "is_training": self.is_training,
    }
    TFPolicyGraph.__init__(self, self.sess, **graph_kwargs)

    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config):
    """Build the A3C policy/value graph and initialise ``TFPolicyGraph``.

    Args:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: ``gym.spaces.Box`` or ``gym.spaces.Discrete``.
        config (dict): Overrides applied on top of the A3C ``DEFAULT_CONFIG``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither ``Box`` nor
            ``Discrete``.
    """
    self.config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    model_options = self.config["model"]
    self.sess = tf.get_default_session()

    # Policy network and value head.
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, model_options)
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, model_options)
    policy_dist = dist_class(self.model.outputs)
    value_head = linear(
        self.model.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_head, [-1])
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Action placeholder: reject unsupported spaces before creating it.
    is_box = isinstance(action_space, gym.spaces.Box)
    if not is_box and not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    if is_box:
        act_ph = tf.placeholder(
            tf.float32, [None, action_space.shape[0]], name="ac")
    else:
        act_ph = tf.placeholder(tf.int64, [None], name="ac")

    adv_ph = tf.placeholder(tf.float32, [None], name="advantages")
    v_target_ph = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(
        policy_dist, act_ph, adv_ph, v_target_ph, self.vf,
        self.config["vf_loss_coeff"], self.config["entropy_coeff"])

    # Sample-batch key -> placeholder mapping used to feed the loss.
    feed_spec = [
        ("obs", self.observations),
        ("actions", act_ph),
        ("advantages", adv_ph),
        ("value_targets", v_target_ph),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=policy_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=feed_spec,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_options["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
def __init__(self, obs_space, action_space, config):
    """Build the policy-gradient graph and initialise ``TFPolicyGraph``.

    Args:
        obs_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        action_space: Action space used for the action distribution and the
            action placeholders.
        config (dict): Overrides applied on top of the PG ``DEFAULT_CONFIG``.
    """
    self.config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    model_options = self.config["model"]

    # Input placeholders.
    obs_ph = tf.placeholder(
        tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, model_options)
    self.logit_dim = logit_dim
    prev_act_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rew_ph = tf.placeholder(tf.float32, [None], name="prev_reward")

    # Model network and the action distribution over its outputs.
    model_inputs = {
        "obs": obs_ph,
        "prev_actions": prev_act_ph,
        "prev_rewards": prev_rew_ph,
        "is_training": self._get_is_training_placeholder(),
    }
    self.model = ModelCatalog.get_model(
        model_inputs, obs_space, logit_dim, model_options)
    policy_dist = dist_class(self.model.outputs)  # logit for each action

    # Policy loss.
    act_ph = ModelCatalog.get_action_placeholder(action_space)
    adv_ph = tf.placeholder(tf.float32, [None], name="adv")
    pg_loss = PGLoss(policy_dist, act_ph, adv_ph).loss

    # Sample-batch keys mapped to the placeholders they are fed into during
    # loss computation; the keys are read from postprocessed sample batches.
    feed_spec = [
        ("obs", obs_ph),
        ("actions", act_ph),
        ("prev_actions", prev_act_ph),
        ("prev_rewards", prev_rew_ph),
        ("advantages", adv_ph),  # added during postprocessing
    ]

    session = tf.get_default_session()
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        session,
        obs_input=obs_ph,
        action_sampler=policy_dist.sample(),
        action_prob=policy_dist.sampled_action_prob(),
        loss=pg_loss,
        loss_inputs=feed_spec,
        model=self.model,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_act_ph,
        prev_reward_input=prev_rew_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_options["max_seq_len"])
    session.run(tf.global_variables_initializer())
def __init__(self, obs_space, action_space, config):
    """Construct the PG graph (model, loss, feeds) and set up the base class.

    Args:
        obs_space: Observation space; its ``shape`` sizes the observation
            placeholder and it is also passed to the model factory.
        action_space: Action space passed to the distribution lookup, the
            placeholder factory and the model factory.
        config (dict): Overrides applied on top of the PG ``DEFAULT_CONFIG``.
    """
    merged = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = merged
    options = merged["model"]

    # -- placeholders ------------------------------------------------------
    observation_in = tf.placeholder(
        tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, options)
    last_action_in = ModelCatalog.get_action_placeholder(action_space)
    last_reward_in = tf.placeholder(tf.float32, [None], name="prev_reward")

    # -- model and action distribution ------------------------------------
    self.model = ModelCatalog.get_model(
        {
            "obs": observation_in,
            "prev_actions": last_action_in,
            "prev_rewards": last_reward_in,
            "is_training": self._get_is_training_placeholder(),
        },
        obs_space,
        action_space,
        self.logit_dim,
        options)
    dist = dist_class(self.model.outputs)  # logit for each action

    # -- loss ----------------------------------------------------------------
    action_in = ModelCatalog.get_action_placeholder(action_space)
    advantage_in = tf.placeholder(tf.float32, [None], name="adv")
    loss_tensor = PGLoss(dist, action_in, advantage_in).loss

    # Keys read from postprocessed sample batches, paired with the
    # placeholder each one feeds during loss computation.
    batch_feeds = [
        ("obs", observation_in),
        ("actions", action_in),
        ("prev_actions", last_action_in),
        ("prev_rewards", last_reward_in),
        ("advantages", advantage_in),  # added during postprocessing
    ]

    # -- base-class initialisation -----------------------------------------
    tf_session = tf.get_default_session()
    base_kwargs = {
        "obs_input": observation_in,
        "action_sampler": dist.sample(),
        "action_prob": dist.sampled_action_prob(),
        "loss": loss_tensor,
        "loss_inputs": batch_feeds,
        "model": self.model,
        "state_inputs": self.model.state_in,
        "state_outputs": self.model.state_out,
        "prev_action_input": last_action_in,
        "prev_reward_input": last_reward_in,
        "seq_lens": self.model.seq_lens,
        "max_seq_len": options["max_seq_len"],
    }
    TFPolicyGraph.__init__(
        self, obs_space, action_space, tf_session, **base_kwargs)
    tf_session.run(tf.global_variables_initializer())
def _setup_graph(self, ob_space, ac_space):
    """Create the input placeholder, model, action distribution and sampler.

    Sets ``self.x``, ``self.logit_dim``, ``self._model``, ``self.logits``,
    ``self.curr_dist``, ``self.sample`` and ``self.var_list``.

    Args:
        ob_space: Iterable of observation dimensions (a leading ``None``
            dimension is prepended).
        ac_space: Action space passed to ``ModelCatalog.get_action_dist``.
    """
    input_shape = [None] + list(ob_space)
    self.x = tf.placeholder(tf.float32, input_shape)

    dist_class, logit_dim = ModelCatalog.get_action_dist(ac_space)
    self.logit_dim = logit_dim

    model = ModelCatalog.get_model(
        self.registry, self.x, logit_dim, self.config["model"])
    self._model = model
    self.logits = model.outputs

    self.curr_dist = dist_class(self.logits)
    self.sample = self.curr_dist.sample()

    # Trainable variables registered under the current variable scope name.
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
def __init__(self, obs_space, action_space, config):
    """Assemble the PG policy graph and initialise ``TFPolicyGraph``.

    Args:
        obs_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        action_space: Action space used for the distribution and the action
            placeholders.
        config (dict): Overrides applied on top of the PG ``DEFAULT_CONFIG``.
    """
    self.config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    model_cfg = self.config["model"]

    # Policy inputs and network.
    obs_in = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, model_cfg)
    self.logit_dim = logit_dim
    prev_action_in = ModelCatalog.get_action_placeholder(action_space)
    prev_reward_in = tf.placeholder(tf.float32, [None], name="prev_reward")
    network_inputs = {
        "obs": obs_in,
        "prev_actions": prev_action_in,
        "prev_rewards": prev_reward_in
    }
    self.model = ModelCatalog.get_model(
        network_inputs, obs_space, logit_dim, model_cfg)
    pi = dist_class(self.model.outputs)  # logit for each action

    # Policy loss.
    action_in = ModelCatalog.get_action_placeholder(action_space)
    advantage_in = tf.placeholder(tf.float32, [None], name="adv")
    policy_loss = PGLoss(pi, action_in, advantage_in).loss

    session = tf.get_default_session()

    # Sample-batch key -> placeholder pairs.
    feeds = [
        ("obs", obs_in),
        ("actions", action_in),
        ("prev_actions", prev_action_in),
        ("prev_rewards", prev_reward_in),
        ("advantages", advantage_in),
    ]

    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        session,
        obs_input=obs_in,
        action_sampler=pi.sample(),
        loss=policy_loss,
        loss_inputs=feeds,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_action_in,
        prev_reward_input=prev_reward_in,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_cfg["max_seq_len"])
    session.run(tf.global_variables_initializer())
def _setup_graph(self, ob_space, ac_space):
    """Create the policy network, a scalar value head and a step counter.

    Sets ``self.x``, ``self.logit_dim``, ``self._model``, ``self.logits``,
    ``self.curr_dist``, ``self.vf``, ``self.sample``, ``self.var_list`` and
    ``self.global_step``.

    Args:
        ob_space: Iterable of observation dimensions (a leading ``None``
            dimension is prepended).
        ac_space: Action space passed to ``ModelCatalog.get_action_dist``.
    """
    input_shape = [None] + list(ob_space)
    self.x = tf.placeholder(tf.float32, input_shape)

    dist_class, logit_dim = ModelCatalog.get_action_dist(ac_space)
    self.logit_dim = logit_dim

    net = ModelCatalog.get_model(
        self.registry, self.x, logit_dim, self.config["model"])
    self._model = net
    self.logits = net.outputs
    self.curr_dist = dist_class(self.logits)

    # One-unit linear layer on the model's last layer, flattened to 1-D.
    value_head = linear(net.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_head, [-1])

    self.sample = self.curr_dist.sample()

    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Non-trainable int32 scalar initialised to zero.
    zero_init = tf.constant_initializer(0, dtype=tf.int32)
    self.global_step = tf.get_variable(
        "global_step", [], tf.int32,
        initializer=zero_init,
        trainable=False)
def __init__(self, obs_space, action_space, config):
    """Build the PG graph, including per-state loss inputs, and init the base.

    Args:
        obs_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        action_space: Action space used for the distribution and the action
            placeholder.
        config (dict): Overrides applied on top of the PG ``DEFAULT_CONFIG``.
    """
    self.config = dict(ray.rllib.pg.pg.DEFAULT_CONFIG, **config)
    model_options = self.config["model"]

    # Policy network.
    obs_ph = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, model_options)
    self.logit_dim = logit_dim
    self.model = ModelCatalog.get_model(
        obs_ph, logit_dim, options=model_options)
    policy_dist = dist_class(self.model.outputs)  # logit for each action

    # Policy loss.
    act_ph = ModelCatalog.get_action_placeholder(action_space)
    adv_ph = tf.placeholder(tf.float32, [None], name="adv")
    pg_loss = PGLoss(policy_dist, act_ph, adv_ph).loss

    session = tf.get_default_session()

    feed_spec = [
        ("obs", obs_ph),
        ("actions", act_ph),
        ("advantages", adv_ph),
    ]
    # LSTM support: one extra loss input per model state placeholder.
    feed_spec.extend(
        ("state_in_{}".format(idx), state_ph)
        for idx, state_ph in enumerate(self.model.state_in))

    training_flag = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        session,
        obs_input=obs_ph,
        action_sampler=policy_dist.sample(),
        loss=pg_loss,
        loss_inputs=feed_spec,
        is_training=training_flag,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_options["max_seq_len"])
    session.run(tf.global_variables_initializer())
def _setup_graph(self, ob_space, ac_space):
    """Create the policy network, a scalar value head and a step counter.

    The model is requested with only the input tensor and the logit
    dimension (no options). Sets ``self.x``, ``self.logit_dim``,
    ``self._model``, ``self.logits``, ``self.curr_dist``, ``self.vf``,
    ``self.sample``, ``self.var_list`` and ``self.global_step``.

    Args:
        ob_space: Iterable of observation dimensions (a leading ``None``
            dimension is prepended).
        ac_space: Action space passed to ``ModelCatalog.get_action_dist``.
    """
    self.x = tf.placeholder(tf.float32, [None] + list(ob_space))

    dist_class, logit_dim = ModelCatalog.get_action_dist(ac_space)
    self.logit_dim = logit_dim

    net = ModelCatalog.get_model(self.x, logit_dim)
    self._model = net
    self.logits = net.outputs
    self.curr_dist = dist_class(self.logits)

    # The value estimate shares the policy model: a one-unit linear layer on
    # its last layer, flattened to 1-D.
    value_head = linear(net.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_head, [-1])

    self.sample = self.curr_dist.sample()

    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Non-trainable int32 scalar initialised to zero.
    zero_init = tf.constant_initializer(0, dtype=tf.int32)
    self.global_step = tf.get_variable(
        "global_step", [], tf.int32,
        initializer=zero_init,
        trainable=False)
def __init__(self, observation_space, action_space, config):
    """Build the A3C graph, optional summaries, and initialise the base class.

    Args:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: ``gym.spaces.Box`` or ``gym.spaces.Discrete``.
        config (dict): Overrides applied on top of the A3C ``DEFAULT_CONFIG``.
            A truthy ``"summarize"`` entry adds scalar summaries and sets
            ``self.summary_op``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither ``Box`` nor
            ``Discrete``.
    """
    self.config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    model_options = self.config["model"]
    self.sess = tf.get_default_session()

    # Policy network and value head.
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, model_options)
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, model_options)
    policy_dist = dist_class(self.model.outputs)
    value_head = linear(
        self.model.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_head, [-1])
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
    training_flag = tf.placeholder_with_default(True, ())

    # Action placeholder: reject unsupported spaces before creating it.
    is_box = isinstance(action_space, gym.spaces.Box)
    if not is_box and not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    if is_box:
        act_ph = tf.placeholder(
            tf.float32, [None, action_space.shape[0]], name="ac")
    else:
        act_ph = tf.placeholder(tf.int64, [None], name="ac")

    adv_ph = tf.placeholder(tf.float32, [None], name="advantages")
    v_target_ph = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(
        policy_dist, act_ph, adv_ph, v_target_ph, self.vf,
        self.config["vf_loss_coeff"], self.config["entropy_coeff"])

    # Sample-batch key -> placeholder pairs, plus one entry per model state.
    feed_spec = [
        ("obs", self.observations),
        ("actions", act_ph),
        ("advantages", adv_ph),
        ("value_targets", v_target_ph),
    ]
    feed_spec.extend(
        ("state_in_{}".format(idx), state_ph)
        for idx, state_ph in enumerate(self.model.state_in))

    self.state_in = self.model.state_in
    self.state_out = self.model.state_out
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=policy_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=feed_spec,
        is_training=training_flag,
        state_inputs=self.state_in,
        state_outputs=self.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_options["max_seq_len"])

    if self.config.get("summarize"):
        # Loss terms are divided by the batch size before being logged.
        batch_size = tf.to_float(tf.shape(self.observations)[0])
        per_sample_terms = (
            ("model/policy_graph", self.loss.pi_loss),
            ("model/value_loss", self.loss.vf_loss),
            ("model/entropy", self.loss.entropy),
        )
        for tag, term in per_sample_terms:
            tf.summary.scalar(tag, term / batch_size)
        tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
        tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
        self.summary_op = tf.summary.merge_all()

    self.sess.run(tf.global_variables_initializer())
def _build_actor_network(self, registry, inputs, ac_space, config):
    """Return the actor output scaled by the action-space upper bound.

    A single-output model is requested from ``ModelCatalog`` and its outputs
    are multiplied by ``ac_space.high`` in an op named ``scaled_a``.

    Args:
        registry: Passed through to ``ModelCatalog.get_model``.
        inputs: Input tensor for the model.
        ac_space: Action space; only its ``high`` attribute is read.
        config (dict): ``config["model"]`` holds the model options.

    Returns:
        The scaled action tensor.
    """
    actor_model = ModelCatalog.get_model(registry, inputs, 1, config["model"])
    return tf.multiply(actor_model.outputs, ac_space.high, name='scaled_a')
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the PPO policy graph and initialise the base classes.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph; merged on top
            of the PPO ``DEFAULT_CONFIG``.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built upon. When
            given, the first eight entries are unpacked in the same order
            as ``self.loss_in`` below, the last entry is used as the
            sequence lengths and everything in between as the state
            inputs.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        # Reuse the caller's tensors instead of creating placeholders.
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        # Fresh placeholders, all with an unspecified leading dimension.
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    # Sample-batch key -> input tensor pairs. The order matches the
    # unpacking of ``existing_inputs[:8]`` above.
    self.loss_in = [
        (SampleBatch.CUR_OBS, obs_ph),
        (Postprocessing.VALUE_TARGETS, value_targets_ph),
        (Postprocessing.ADVANTAGES, adv_ph),
        (SampleBatch.ACTIONS, act_ph),
        (BEHAVIOUR_LOGITS, logits_ph),
        (SampleBatch.VF_PREDS, vf_preds_ph),
        (SampleBatch.PREV_ACTIONS, prev_actions_ph),
        (SampleBatch.PREV_REWARDS, prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient: a non-trainable scalar variable initialised from
    # config["kl_coeff"].
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            # Value estimate comes from the policy model itself.
            self.value_function = self.model.value_function()
        else:
            # Separate value network built from a copy of the model options.
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            if vf_config["use_lstm"]:
                # The separate value network is built without an LSTM.
                vf_config["use_lstm"] = False
                logger.warning(
                    "It is not recommended to use a LSTM model with "
                    "vf_share_layers=False (consider setting it to True). "
                    "If you want to not share layers, you can implement "
                    "a custom LSTM model that overrides the "
                    "value_function() method.")
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model({
                    "obs": obs_ph,
                    "prev_actions": prev_actions_ph,
                    "prev_rewards": prev_rewards_ph,
                    "is_training": self._get_is_training_placeholder(),
                }, observation_space, action_space, 1, vf_config).outputs
                # Flatten the single-output model result to 1-D.
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        # No value function: zeros with one entry per row of obs_ph.
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        # Flattened boolean mask derived from the model's sequence lengths.
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        # No state inputs: every entry is valid.
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=curr_action_dist.sampled_action_prob(),
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    # Tensors exposed as training statistics.
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the APPO policy graph and initialise the base classes.

    Depending on ``config["vtrace"]`` the loss is either a V-trace surrogate
    loss or a PPO surrogate loss.

    Args:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Overrides applied on top of the IMPALA
            ``DEFAULT_CONFIG``. ``batch_mode`` must be
            ``"truncate_episodes"``.
        existing_inputs (list): Optional input tensors to build the graph
            on instead of creating placeholders. The leading entries follow
            the order of ``loss_in`` below (7 entries with V-trace, 9
            without), the last entry is the sequence lengths and everything
            in between is the state inputs.

    Raises:
        UnsupportedSpaceException: If V-trace is enabled and
            ``action_space`` is neither ``Discrete`` nor ``MultiDiscrete``.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    # output_hidden_shape is the split specification used with tf.split on
    # the logits below.
    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    elif self.config["vtrace"]:
        # Fixed: the message was previously passed unformatted because
        # `format` was a separate argument rather than a method call.
        raise UnsupportedSpaceException(
            "Action space {} is not supported for APPO + VTrace.".format(
                action_space))
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Policy network model
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Create input placeholders
    if existing_inputs:
        if self.config["vtrace"]:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards, adv_ph, value_targets = \
                existing_inputs[:9]
            existing_state_in = existing_inputs[9:-1]
            existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

        if not self.config["vtrace"]:
            adv_ph = tf.placeholder(
                tf.float32, name="advantages", shape=(None, ))
            value_targets = tf.placeholder(
                tf.float32, name="value_targets", shape=(None, ))
    self.observations = observations

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy
    # NOTE(review): this repeats the get_action_dist lookup made above with
    # the same arguments.
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    if not existing_inputs:
        # Fixed: only create these placeholders when the caller did not
        # supply inputs; previously the prev_actions / prev_rewards taken
        # from existing_inputs were overwritten here.
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    # The multi-discrete case feeds the distribution the split logits; every
    # other case feeds it the unsplit tensor.
    dist_inputs = unpacked_outputs if is_multidiscrete else \
        self.model.outputs
    prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \
        behaviour_logits

    action_dist = dist_class(dist_inputs)
    prev_action_dist = dist_class(prev_dist_inputs)

    values = self.model.value_function()
    self.value_function = values
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    if self.config["vtrace"]:
        logger.info("Using V-Trace surrogate loss (vtrace=True)")

        # Prepare actions for loss
        loss_actions = actions if is_multidiscrete else tf.expand_dims(
            actions, axis=1)

        self.loss = VTraceSurrogateLoss(
            actions=make_time_major(loss_actions, drop_last=True),
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions), drop_last=True),
            actions_logp=make_time_major(
                action_dist.logp(actions), drop_last=True),
            action_kl=prev_action_dist.kl(action_dist),
            actions_entropy=make_time_major(
                action_dist.entropy(), drop_last=True),
            dones=make_time_major(dones, drop_last=True),
            behaviour_logits=make_time_major(
                unpacked_behaviour_logits, drop_last=True),
            target_logits=make_time_major(
                unpacked_outputs, drop_last=True),
            discount=config["gamma"],
            rewards=make_time_major(rewards, drop_last=True),
            values=make_time_major(values, drop_last=True),
            bootstrap_value=make_time_major(values)[-1],
            valid_mask=make_time_major(mask, drop_last=True),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.
            config["vtrace_clip_pg_rho_threshold"],
            clip_param=self.config["clip_param"])
    else:
        logger.info("Using PPO surrogate loss (vtrace=False)")
        self.loss = PPOSurrogateLoss(
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions)),
            actions_logp=make_time_major(action_dist.logp(actions)),
            action_kl=prev_action_dist.kl(action_dist),
            actions_entropy=make_time_major(action_dist.entropy()),
            values=make_time_major(values),
            valid_mask=make_time_major(mask),
            advantages=make_time_major(adv_ph),
            value_targets=make_time_major(value_targets),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"])

    # KL divergence between worker and learner logits for debugging
    model_dist = MultiCategorical(unpacked_outputs)
    behaviour_dist = MultiCategorical(unpacked_behaviour_logits)

    kls = model_dist.kl(behaviour_dist)
    if len(kls) > 1:
        # One set of mean / max / median stats per KL entry.
        self.KL_stats = {}

        for i, kl in enumerate(kls):
            self.KL_stats.update({
                "mean_KL_{}".format(i): tf.reduce_mean(kl),
                "max_KL_{}".format(i): tf.reduce_max(kl),
                "median_KL_{}".format(i): tf.contrib.distributions.
                percentile(kl, 50.0),
            })
    else:
        self.KL_stats = {
            "mean_KL": tf.reduce_mean(kls[0]),
            "max_KL": tf.reduce_max(kls[0]),
            "median_KL": tf.contrib.distributions.percentile(
                kls[0], 50.0),
        }

    # Initialize TFPolicyGraph. The order of loss_in matches the unpacking
    # of existing_inputs above.
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", observations),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
    ]
    if not self.config["vtrace"]:
        loss_in.append(("advantages", adv_ph))
        loss_in.append(("value_targets", value_targets))
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    # The last time step is dropped only when V-trace is enabled, mirroring
    # how `values` is passed to each loss above.
    values_batched = make_time_major(
        values, drop_last=self.config["vtrace"])
    self.stats_fetches = {
        "stats": dict(
            {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.loss.value_targets, [-1]),
                    tf.reshape(values_batched, [-1])),
            }, **self.KL_stats),
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the V-trace policy graph: inputs, model, loss and stats.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Overrides merged on top of the IMPALA
            DEFAULT_CONFIG.
        existing_inputs (list): Optional list of tensors to build the
            graph on instead of creating new placeholders. The first
            seven entries are read as (actions, dones, behaviour_logits,
            rewards, observations, prev_actions, prev_rewards), the last
            entry as the sequence lengths and everything in between as
            the model's state inputs.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    # Sizes used below to split the flat logits along axis 1.
    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space,
                    gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Create input placeholders
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    if existing_inputs:
        actions, dones, behaviour_logits, rewards, observations, \
            prev_actions, prev_rewards = existing_inputs[:7]
        existing_state_in = existing_inputs[7:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy
    if not existing_inputs:
        # Only create fresh previous-action/reward placeholders when the
        # caller supplied none. Creating them unconditionally would
        # discard the tensors unpacked from existing_inputs above, so the
        # model and loss_in would be wired to unrelated placeholders.
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    action_dist = dist_class(self.model.outputs)

    values = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards, dtype=tf.bool)

    # Prepare actions for loss
    loss_actions = actions if is_multidiscrete else tf.expand_dims(
        actions, axis=1)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=make_time_major(loss_actions, drop_last=True),
        actions_logp=make_time_major(
            action_dist.logp(actions), drop_last=True),
        actions_entropy=make_time_major(
            action_dist.entropy(), drop_last=True),
        dones=make_time_major(dones, drop_last=True),
        behaviour_logits=make_time_major(
            unpacked_behaviour_logits, drop_last=True),
        target_logits=make_time_major(unpacked_outputs, drop_last=True),
        discount=config["gamma"],
        rewards=make_time_major(rewards, drop_last=True),
        values=make_time_major(values, drop_last=True),
        bootstrap_value=make_time_major(values)[-1],
        dist_class=Categorical if is_multidiscrete else dist_class,
        valid_mask=make_time_major(mask, drop_last=True),
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # Initialize TFPolicy
    loss_in = [
        (SampleBatch.ACTIONS, actions),
        (SampleBatch.DONES, dones),
        (BEHAVIOUR_LOGITS, behaviour_logits),
        (SampleBatch.REWARDS, rewards),
        (SampleBatch.CUR_OBS, observations),
        (SampleBatch.PREV_ACTIONS, prev_actions),
        (SampleBatch.PREV_REWARDS, prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        LEARNER_STATS_KEY: {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(make_time_major(values, drop_last=True), [-1])),
        },
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Construct the PPO policy graph: inputs, model, loss and stats.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph; merged on top
            of the PPO DEFAULT_CONFIG.
        existing_inputs (list): Optional list of tensors to build the
            graph on. The first eight entries are read as the loss
            inputs, the last entry as the sequence lengths and the
            entries in between as the model's state inputs.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]

    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if not existing_inputs:
        # No tensors supplied: create one placeholder per loss input.
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    else:
        (obs_ph, value_targets_ph, adv_ph, act_ph, logits_ph, vf_preds_ph,
         prev_actions_ph, prev_rewards_ph) = existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]

    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
    ]

    policy_inputs = {
        "obs": obs_ph,
        "prev_actions": prev_actions_ph,
        "prev_rewards": prev_rewards_ph,
        "is_training": self._get_is_training_placeholder(),
    }
    self.model = ModelCatalog.get_model(
        policy_inputs,
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # Non-trainable scalar variable holding the KL coefficient.
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    action_dist = dist_cls(self.logits)
    self.sampler = action_dist.sample()

    if not self.config["use_gae"]:
        # Without GAE the value estimate is a zero vector with one entry
        # per row of the observation input.
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
    elif self.config["vf_share_layers"]:
        self.value_function = self.model.value_function()
    else:
        value_model_config = self.config["model"].copy()
        # The separate value network must not split its last layer into
        # mean and standard deviation parameters, and must not make the
        # standard deviations free variables.
        value_model_config["free_log_std"] = False
        if value_model_config["use_lstm"]:
            value_model_config["use_lstm"] = False
            logger.warning(
                "It is not recommended to use a LSTM model with "
                "vf_share_layers=False (consider setting it to True). "
                "If you want to not share layers, you can implement "
                "a custom LSTM model that overrides the "
                "value_function() method.")
        with tf.variable_scope("value_function"):
            value_inputs = {
                "obs": obs_ph,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            }
            self.value_function = ModelCatalog.get_model(
                value_inputs, observation_space, action_space, 1,
                value_model_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])

    if self.model.state_in:
        longest_seq = tf.reduce_max(self.model.seq_lens)
        valid_mask = tf.sequence_mask(self.model.seq_lens, longest_seq)
        valid_mask = tf.reshape(valid_mask, [-1])
    else:
        valid_mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        action_dist,
        self.value_function,
        self.kl_coeff,
        valid_mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    sampled_action_prob = action_dist.sampled_action_prob()
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=sampled_action_prob,
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy
    }
def __init__(self, observation_space, action_space, config,
             unsupType='action', envWrap=False, designHead='universe',
             noReward=False):
    """Assemble the A3C policy graph with an optional predictor network.

    Builds the policy model, a state or state-action predictor (only when
    ``unsupType`` is not None), the A3C loss and the stats fetches, then
    runs the global variable initializer.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification; its ``n``
            attribute is read, and only Box and Discrete spaces are
            accepted for the action placeholder.
        config (dict): Overrides merged on top of the A3C DEFAULT_CONFIG.
        unsupType: Selects the predictor; a value containing 'state'
            picks StatePredictor, any other non-None value picks
            StateActionPredictor, None disables the predictor.
        envWrap: Not read by this constructor.
        designHead: Forwarded to the predictor constructor.
        noReward: Not read by this constructor.

    Raises:
        UnsupportedSpaceException: If the action space is neither Box nor
            Discrete.
    """
    self.unsup = unsupType is not None
    self.cur_batch = None
    predictor = None
    numaction = action_space.n

    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # ---------------------------------------------------------------------
    # Policy network
    # ---------------------------------------------------------------------
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # NOTE: value function and trainable variables are defined in
    # self.model
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)

    # S / S+A predictor network
    if self.unsup:
        with tf.variable_scope("predictor"):
            if 'state' in unsupType:
                predictor = StatePredictor(
                    observation_space.shape, numaction, designHead,
                    unsupType)
            else:
                predictor = StateActionPredictor(
                    observation_space.shape, numaction, designHead)
            self.local_ap_network = predictor

    # ---------------------------------------------------------------------
    # Policy loss
    # ---------------------------------------------------------------------
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")

    # Policy loss and predictor loss are both computed by A3CLoss.
    self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                        self.model.vf, unsupType, predictor,
                        self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Hand everything over to the TFPolicyGraph base class.
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("advantages", advantages),
        ("value_targets", self.v_target),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    sampled_action = action_dist.sample()
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=sampled_action,
        loss=self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "policy_entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.model.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(self.v_target,
                                                   self.model.vf),
        },
    }

    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """Build the PPO policy graph: inputs, model, value function and loss.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tensors to build the
            graph on. The first six entries are read as the loss inputs,
            the last entry as the sequence lengths and the entries in
            between as the model's state inputs.
    """
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32, name="obs",
            shape=(None,)+observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None,))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None,))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None,))
        existing_state_in = None
        existing_seq_lens = None

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph, logit_dim, self.config["model"],
        state_in=existing_state_in, seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        vf_config["use_lstm"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])
    else:
        # Without GAE the value estimate is a zero vector with one entry
        # per row of the observation input.
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    self.loss_obj = PPOLoss(
        action_space, value_targets_ph, adv_ph, act_ph,
        logits_ph, vf_preds_ph,
        curr_action_dist, self.value_function, self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        # The value-loss weight comes from its own config key; this was
        # previously wired to "kl_target", which silently scaled the
        # value loss by the KL target instead.
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    TFPolicyGraph.__init__(
        self, observation_space, action_space,
        self.sess, obs_input=obs_ph,
        action_sampler=self.sampler, loss=self.loss_obj.loss,
        loss_inputs=self.loss_in, state_inputs=self.model.state_in,
        state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config):
    """Assemble the A3C policy graph: model, loss and stats fetches.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification; only Box
            and Discrete spaces are accepted.
        config (dict): Overrides merged on top of the A3C DEFAULT_CONFIG.

    Raises:
        UnsupportedSpaceException: If the action space is neither Box nor
            Discrete.
    """
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Policy inputs and model
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    model_inputs = {
        "obs": self.observations,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    self.model = ModelCatalog.get_model(
        model_inputs, observation_space, logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = self.model.value_function()
    current_scope = tf.get_variable_scope().name
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      current_scope)

    # Action placeholder depends on the kind of action space
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                        self.vf, self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Hand everything over to the TFPolicyGraph base class
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),
        ("value_targets", self.v_target),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    sampled_action = action_dist.sample()
    # The optimized loss is the model's own loss plus the A3C loss.
    combined_loss = self.model.loss() + self.loss.total_loss
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=sampled_action,
        loss=combined_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "policy_entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(self.v_target, self.vf),
        },
    }

    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """Build the PPO policy graph: inputs, policy, value function, loss.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of (name, tensor) tuples
            that specify the placeholders upon which the graph should
            be built upon. When given it is stored as ``loss_in``
            unchanged and must hold exactly six entries.
    """
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        self.loss_in = existing_inputs
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
    else:
        obs_ph = tf.placeholder(
            tf.float32, name="obs",
            shape=(None, ) + observation_space.shape)
        # Targets of the value function.
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        # Advantage values in the policy gradient estimator.
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        # Log probabilities from the policy before the policy update.
        logprobs_ph = tf.placeholder(
            tf.float32, name="logprobs", shape=(None, logit_dim))
        # Value function predictions before the policy update.
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))

        self.loss_in = [("obs", obs_ph),
                        ("value_targets", value_targets_ph),
                        ("advantages", adv_ph),
                        ("actions", act_ph),
                        ("logprobs", logprobs_ph),
                        ("vf_preds", vf_preds_ph)]

    # TODO(ekl) feed RNN states in here

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)

    self.logits = ModelCatalog.get_model(
        obs_ph, logit_dim, self.config["model"]).outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])
    else:
        # Placeholder string constant used when no value function is
        # built.
        self.value_function = tf.constant("NA")

    self.loss_obj = PPOLoss(
        action_space, value_targets_ph, adv_ph, act_ph,
        logprobs_ph, vf_preds_ph,
        curr_action_dist, self.value_function, self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        # The value-loss weight comes from its own config key; this was
        # previously wired to "kl_target", which silently scaled the
        # value loss by the KL target instead.
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])
    self.is_training = tf.placeholder_with_default(True, ())

    TFPolicyGraph.__init__(
        self, observation_space, action_space,
        self.sess, obs_input=obs_ph,
        action_sampler=self.sampler, loss=self.loss_obj.loss,
        loss_inputs=self.loss_in, is_training=self.is_training)
def __init__(self, observation_space, action_space, config,
             existing_inputs=None, unsupType='action',
             designHead='universe'):
    """Construct the PPO policy graph with extra predictor inputs.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification; its ``n``
            attribute is read.
        config (dict): Configuration values for PPO graph; merged on top
            of the PPO DEFAULT_CONFIG.
        existing_inputs (list): Optional list of tensors to build the
            graph on. The first nine entries are read as the loss inputs
            (including phi1, phi2 and asample), the last entry as the
            sequence lengths and the entries in between as the model's
            state inputs.
        unsupType: Selects the predictor; a value containing 'state'
            picks StatePredictor, any other non-None value picks
            StateActionPredictor, None disables the predictor.
        designHead: Forwarded to the predictor constructor.
    """
    self.unsup = unsupType is not None
    predictor = None
    numaction = action_space.n

    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if not existing_inputs:
        # No tensors supplied: create one placeholder per loss input.
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        phi1 = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="phi1")
        phi2 = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="phi2")
        asample = tf.placeholder(
            tf.float32, shape=(None, numaction), name="asample")
        existing_state_in = None
        existing_seq_lens = None
    else:
        # TODO: updates to account for s1, s2 and asample
        (obs_ph, value_targets_ph, adv_ph, act_ph, logits_ph, vf_preds_ph,
         phi1, phi2, asample) = existing_inputs[:9]
        existing_state_in = existing_inputs[9:-1]
        existing_seq_lens = existing_inputs[-1]

    self.observations = obs_ph

    # The predictor network is built before the policy model.
    if self.unsup:
        with tf.variable_scope("predictor"):
            if 'state' in unsupType:
                predictor = StatePredictor(
                    phi1, phi2, asample, observation_space, numaction,
                    designHead, unsupType)
            else:
                predictor = StateActionPredictor(
                    phi1, phi2, asample, observation_space, numaction,
                    designHead)
            self.local_ap_network = predictor

    self.model = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # Non-trainable scalar variable holding the KL coefficient.
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    action_dist = dist_cls(self.logits)
    self.sampler = action_dist.sample()

    if not self.config["use_gae"]:
        # Without GAE the value estimate is a zero vector with one entry
        # per row of the observation input.
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
    elif self.config["vf_share_layers"]:
        # Shared layers: a single linear output on the model's last layer.
        value_head = linear(self.model.last_layer, 1, "value",
                            normc_initializer(1.0))
        self.value_function = tf.reshape(value_head, [-1])
    else:
        value_model_config = self.config["model"].copy()
        # The separate value network must not split its last layer into
        # mean and standard deviation parameters, and must not make the
        # standard deviations free variables.
        value_model_config["free_log_std"] = False
        value_model_config["use_lstm"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, value_model_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        action_dist,
        self.value_function,
        self.kl_coeff,
        unsupType,
        predictor,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("s1", phi1),
        ("s2", phi2),
        ("asample", asample),
    ]
    # TODO: testing to see if this lets me pass inputs to ICM
    self.extra_inputs = ["s1", "s2", "asample"]

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy
    }
def __init__(self, observation_space, action_space, config):
    """Assemble the A3C policy graph: model, loss and learner stats.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification; only Box
            and Discrete spaces are accepted.
        config (dict): Overrides merged on top of the A3C DEFAULT_CONFIG.

    Raises:
        UnsupportedSpaceException: If the action space is neither Box nor
            Discrete.
    """
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Policy inputs and model
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    self.prev_actions = ModelCatalog.get_action_placeholder(action_space)
    self.prev_rewards = tf.placeholder(
        tf.float32, [None], name="prev_reward")
    model_inputs = {
        "obs": self.observations,
        "prev_actions": self.prev_actions,
        "prev_rewards": self.prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    self.model = ModelCatalog.get_model(
        model_inputs, observation_space, action_space, logit_dim,
        self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = self.model.value_function()
    current_scope = tf.get_variable_scope().name
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      current_scope)

    # Action placeholder depends on the kind of action space
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                        self.vf, self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Hand everything over to the TFPolicyGraph base class
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("prev_actions", self.prev_actions),
        ("prev_rewards", self.prev_rewards),
        ("advantages", advantages),
        ("value_targets", self.v_target),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    sampled_action = action_dist.sample()
    sampled_action_prob = action_dist.sampled_action_prob()
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=sampled_action,
        action_prob=sampled_action_prob,
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=self.prev_actions,
        prev_reward_input=self.prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.stats_fetches = {
        LEARNER_STATS_KEY: {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "policy_entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(self.v_target, self.vf),
        },
    }

    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config):
    """Build the IMPALA V-trace policy graph.

    Arguments:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: Action space; must be a ``gym.spaces.Box`` or a
            ``gym.spaces.Discrete``.
        config (dict): Overrides merged on top of the IMPALA
            ``DEFAULT_CONFIG``; ``batch_mode`` must be
            ``"truncate_episodes"``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither Box nor
            Discrete.
    """
    merged_config = dict(
        ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert merged_config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = merged_config
    self.sess = tf.get_default_session()

    # Policy network and a linear value head on the model's last layer.
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_cls, num_outputs = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        self.observations, num_outputs, self.config["model"])
    pi = dist_cls(self.model.outputs)
    value_head = linear(
        self.model.last_layer, 1, "value", normc_initializer(1.0))
    values = tf.reshape(value_head, [-1])
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Loss input placeholders.
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions_ph = tf.placeholder(
            tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        ac_size = action_space.n
        actions_ph = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for IMPALA.".format(
                action_space))
    dones_ph = tf.placeholder(tf.bool, [None], name="dones")
    rewards_ph = tf.placeholder(tf.float32, [None], name="rewards")
    behaviour_logits_ph = tf.placeholder(
        tf.float32, [None, ac_size], name="behaviour_logits")

    def to_time_major(tensor):
        """Reshape a flat [B * T, ...] tensor to [T, B, ...]."""
        if self.config["model"]["use_lstm"]:
            num_seqs = tf.shape(self.model.seq_lens)[0]
            seq_len = tf.shape(tensor)[0] // num_seqs
        else:
            # Important: chop the tensor into batches at known episode
            # cut boundaries. TODO(ekl) this is kind of a hack
            seq_len = (self.config["sample_batch_size"] //
                       self.config["num_envs_per_worker"])
            num_seqs = tf.shape(tensor)[0] // seq_len
        trailing_dims = tf.shape(tensor)[1:]
        batched_shape = tf.concat(
            [[num_seqs, seq_len], trailing_dims], axis=0)
        batched = tf.reshape(tensor, batched_shape)

        # Swap the batch and time axes; trailing axes keep their order.
        rank = int(tf.shape(tensor).shape[0])
        perm = [1, 0] + list(range(2, 1 + rank))
        return tf.transpose(batched, perm)

    def all_but_last(tensor):
        """Time-major view of ``tensor`` without its final time step."""
        return to_time_major(tensor)[:-1]

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=all_but_last(actions_ph),
        actions_logp=all_but_last(pi.logp(actions_ph)),
        actions_entropy=all_but_last(pi.entropy()),
        dones=all_but_last(dones_ph),
        behaviour_logits=all_but_last(behaviour_logits_ph),
        target_logits=all_but_last(self.model.outputs),
        discount=merged_config["gamma"],
        rewards=all_but_last(rewards_ph),
        values=all_but_last(values),
        bootstrap_value=to_time_major(values)[-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # Base-class initialization.
    loss_inputs = [
        ("actions", actions_ph),
        ("dones", dones_ph),
        ("behaviour_logits", behaviour_logits_ph),
        ("rewards", rewards_ph),
        ("obs", self.observations),
    ]
    LearningRateSchedule.__init__(
        self, self.config["lr"], self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=pi.sample(),
        loss=self.loss.total_loss,
        loss_inputs=loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    # Tensors fetched alongside each learning step.
    learner_stats = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "policy_loss": self.loss.pi_loss,
        "entropy": self.loss.entropy,
        "grad_gnorm": tf.global_norm(self._grads),
        "var_gnorm": tf.global_norm(self.var_list),
        "vf_loss": self.loss.vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(self.loss.vtrace_returns.vs, [-1]),
            tf.reshape(all_but_last(values), [-1])),
    }
    self.stats_fetches = {"stats": learner_stats}
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the APPO policy graph (V-trace or PPO surrogate loss).

    Arguments:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: Action space. With ``config["vtrace"]`` enabled it
            must be ``Discrete`` or ``MultiDiscrete``.
        config (dict): Overrides merged on top of the IMPALA
            ``DEFAULT_CONFIG``; ``batch_mode`` must be
            ``"truncate_episodes"``.
        existing_inputs (list): Optional input tensors to build on
            instead of creating new placeholders. The expected order is
            the ``loss_in`` order below, followed by the state inputs
            and finally the sequence lengths.

    Raises:
        UnsupportedSpaceException: If V-trace is enabled and the action
            space is neither Discrete nor MultiDiscrete.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    # output_hidden_shape tells tf.split how to slice the logits into
    # one chunk per action component.
    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    elif self.config["vtrace"]:
        # Fixed: this used to read `", format(action_space)`, which left
        # the "{}" unfilled and passed a second exception argument.
        raise UnsupportedSpaceException(
            "Action space {} is not supported for APPO + VTrace.".format(
                action_space))
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Policy network model
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Create input placeholders
    if existing_inputs:
        if self.config["vtrace"]:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards, adv_ph, value_targets = \
                existing_inputs[:9]
            existing_state_in = existing_inputs[9:-1]
            existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

        if not self.config["vtrace"]:
            adv_ph = tf.placeholder(
                tf.float32, name="advantages", shape=(None, ))
            value_targets = tf.placeholder(
                tf.float32, name="value_targets", shape=(None, ))
    self.observations = observations

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy. Only create previous-step placeholders when no
    # existing inputs were supplied; creating them unconditionally would
    # discard the prev_actions / prev_rewards unpacked above.
    if not existing_inputs:
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    dist_inputs = unpacked_outputs if is_multidiscrete else \
        self.model.outputs
    prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \
        behaviour_logits

    action_dist = dist_class(dist_inputs)
    prev_action_dist = dist_class(prev_dist_inputs)

    values = self.model.value_function()
    self.value_function = values
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    if self.config["vtrace"]:
        logger.info("Using V-Trace surrogate loss (vtrace=True)")

        # Prepare actions for loss
        loss_actions = actions if is_multidiscrete else tf.expand_dims(
            actions, axis=1)

        self.loss = VTraceSurrogateLoss(
            actions=make_time_major(loss_actions, drop_last=True),
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions), drop_last=True),
            actions_logp=make_time_major(
                action_dist.logp(actions), drop_last=True),
            action_kl=prev_action_dist.kl(action_dist),
            actions_entropy=make_time_major(
                action_dist.entropy(), drop_last=True),
            dones=make_time_major(dones, drop_last=True),
            behaviour_logits=make_time_major(
                unpacked_behaviour_logits, drop_last=True),
            target_logits=make_time_major(
                unpacked_outputs, drop_last=True),
            discount=config["gamma"],
            rewards=make_time_major(rewards, drop_last=True),
            values=make_time_major(values, drop_last=True),
            bootstrap_value=make_time_major(values)[-1],
            valid_mask=make_time_major(mask, drop_last=True),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config[
                "vtrace_clip_pg_rho_threshold"],
            clip_param=self.config["clip_param"])
    else:
        logger.info("Using PPO surrogate loss (vtrace=False)")
        self.loss = PPOSurrogateLoss(
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions)),
            actions_logp=make_time_major(action_dist.logp(actions)),
            action_kl=prev_action_dist.kl(action_dist),
            actions_entropy=make_time_major(action_dist.entropy()),
            values=make_time_major(values),
            valid_mask=make_time_major(mask),
            advantages=make_time_major(adv_ph),
            value_targets=make_time_major(value_targets),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"])

    # KL divergence between worker and learner logits for debugging
    model_dist = MultiCategorical(unpacked_outputs)
    behaviour_dist = MultiCategorical(unpacked_behaviour_logits)

    kls = model_dist.kl(behaviour_dist)
    if len(kls) > 1:
        # One set of stats per action component, suffixed by its index.
        self.KL_stats = {}

        for i, kl in enumerate(kls):
            self.KL_stats.update({
                "mean_KL_{}".format(i): tf.reduce_mean(kl),
                "max_KL_{}".format(i): tf.reduce_max(kl),
                "median_KL_{}".format(i): tf.contrib.distributions.
                percentile(kl, 50.0),
            })
    else:
        self.KL_stats = {
            "mean_KL": tf.reduce_mean(kls[0]),
            "max_KL": tf.reduce_max(kls[0]),
            "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0),
        }

    # Initialize TFPolicyGraph
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", observations),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
    ]
    if not self.config["vtrace"]:
        loss_in.append(("advantages", adv_ph))
        loss_in.append(("value_targets", value_targets))
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    # The V-trace loss drops the final time step, so the values compared
    # against its targets must drop it too.
    values_batched = make_time_major(
        values, drop_last=self.config["vtrace"])
    self.stats_fetches = {
        "stats": dict({
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.value_targets, [-1]),
                tf.reshape(values_batched, [-1])),
        }, **self.KL_stats),
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the PPO policy graph: model, value function, loss and stats.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built upon.
    """
    merged_config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = merged_config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]

    dist_class, num_outputs = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if not existing_inputs:
        # Fresh graph: create one placeholder per loss input.
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, num_outputs))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None
    else:
        # Reuse supplied tensors: six loss inputs, then the state inputs,
        # then the sequence lengths as the last element.
        (obs_ph, value_targets_ph, adv_ph, act_ph, logits_ph,
         vf_preds_ph) = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    self.observations = obs_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph,
        num_outputs,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    kl_init = tf.constant_initializer(self.kl_coeff_val)
    self.kl_coeff = tf.get_variable(
        initializer=kl_init,
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    pi = dist_class(self.logits)
    self.sampler = pi.sample()

    # Value function: zeros without GAE, a head on the shared model when
    # layers are shared, otherwise a separate network.
    if not self.config["use_gae"]:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
    elif self.config["vf_share_layers"]:
        shared_head = linear(
            self.model.last_layer, 1, "value", normc_initializer(1.0))
        self.value_function = tf.reshape(shared_head, [-1])
    else:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        vf_config["use_lstm"] = False
        with tf.variable_scope("value_function"):
            vf_outputs = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
            self.value_function = tf.reshape(vf_outputs, [-1])

    # Mask built from sequence lengths when the model has state inputs;
    # otherwise an all-ones mask.
    if self.model.state_in:
        longest = tf.reduce_max(self.model.seq_lens)
        seq_mask = tf.sequence_mask(self.model.seq_lens, longest)
        mask = tf.reshape(seq_mask, [-1])
    else:
        mask = tf.ones_like(adv_ph)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        pi,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(
        self, self.config["lr"], self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=merged_config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    self.explained_variance = explained_variance(
        value_targets_ph, self.value_function)
    # Tensors fetched alongside each learning step.
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy
    }
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             update_ops_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_action_sampler=None,
             existing_inputs=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor the policy
            graph, and dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        update_ops_fn (func): optional function that returns a list
            overriding the update ops to run when applying gradients
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as __init__
        make_action_sampler (func): optional function that returns a
            tuple of action and action prob tensors. The function takes
            (policy, input_dict, obs_space, action_space, config) as its
            arguments
        existing_inputs (OrderedDict): when copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._update_ops_fn = update_ops_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }

    # Create the model network and action outputs
    if make_action_sampler:
        assert not existing_inputs, \
            "Cloning not supported with custom action sampler"
        self.model = None
        self.dist_class = None
        self.action_dist = None
        action_sampler, action_prob = make_action_sampler(
            self, self.input_dict, obs_space, action_space, config)
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        if existing_inputs:
            # State inputs are identified by their "state_in_" key prefix.
            existing_state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if existing_state_in:
                existing_seq_lens = existing_inputs["seq_lens"]
            else:
                existing_seq_lens = None
        else:
            existing_state_in = []
            existing_seq_lens = None
        self.model = ModelCatalog.get_model(
            self.input_dict,
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        self.action_dist = self.dist_class(self.model.outputs)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        # self.model is None with a custom action sampler; `and` then
        # passes None through instead of raising AttributeError.
        state_inputs=self.model and self.model.state_in,
        state_outputs=self.model and self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model and self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    self._needs_eager_conversion = set()
    self._eager_tensors = {}
    # before_loss_init is optional (defaults to None); calling it
    # unconditionally raised TypeError whenever it was omitted.
    if before_loss_init is not None:
        before_loss_init(self, obs_space, action_space, config)
    if not existing_inputs:
        self._initialize_loss()
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the IMPALA V-trace policy graph.

    Arguments:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: Action space; must be ``gym.spaces.Discrete`` when
            new placeholders are created.
        config (dict): Overrides merged on top of the IMPALA
            ``DEFAULT_CONFIG``; ``batch_mode`` must be
            ``"truncate_episodes"``.
        existing_inputs (list): Optional input tensors to build on
            instead of creating new placeholders. The expected order is
            the ``loss_in`` order below, followed by the state inputs
            and finally the sequence lengths.

    Raises:
        UnsupportedSpaceException: If new placeholders are needed and
            ``action_space`` is not Discrete.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()

    # Create input placeholders
    if existing_inputs:
        actions, dones, behaviour_logits, rewards, observations, \
            prev_actions, prev_rewards = existing_inputs[:7]
        existing_state_in = existing_inputs[7:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        if isinstance(action_space, gym.spaces.Discrete):
            ac_size = action_space.n
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for IMPALA.".format(
                    action_space))
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, ac_size], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

    # Setup the policy
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    # Only create previous-step placeholders when no existing inputs were
    # supplied; creating them unconditionally would discard the
    # prev_actions / prev_rewards unpacked above.
    if not existing_inputs:
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    action_dist = dist_class(self.model.outputs)
    values = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def to_batches(tensor):
        """Reshape a flat [B * T, ...] tensor to time-major [T, B, ...]."""
        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
        # swap B and T axes
        return tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards, dtype=tf.bool)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=to_batches(actions)[:-1],
        actions_logp=to_batches(action_dist.logp(actions))[:-1],
        actions_entropy=to_batches(action_dist.entropy())[:-1],
        dones=to_batches(dones)[:-1],
        behaviour_logits=to_batches(behaviour_logits)[:-1],
        target_logits=to_batches(self.model.outputs)[:-1],
        discount=config["gamma"],
        rewards=to_batches(rewards)[:-1],
        values=to_batches(values)[:-1],
        bootstrap_value=to_batches(values)[-1],
        valid_mask=to_batches(mask)[:-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # KL divergence between worker and learner logits for debugging
    model_dist = Categorical(self.model.outputs)
    behaviour_dist = Categorical(behaviour_logits)
    self.KLs = model_dist.kl(behaviour_dist)
    self.mean_KL = tf.reduce_mean(self.KLs)
    self.max_KL = tf.reduce_max(self.KLs)
    self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0)

    # Initialize TFPolicyGraph
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", observations),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        # The model's own loss term is added to the V-trace loss.
        loss=self.model.loss() + self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(to_batches(values)[:-1], [-1])),
            "mean_KL": self.mean_KL,
            "max_KL": self.max_KL,
            "median_KL": self.median_KL,
        },
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the PPO policy graph: model, loss and base-class state.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built upon.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        # Reuse supplied tensors: six loss inputs, then the state inputs,
        # then the sequence lengths as the last element.
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    self.value_function = self.model.value_function

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        # Fixed: this previously passed config["kl_target"], so the KL
        # target was silently used as the value-loss weight.
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())