def __init__(self, obs_space, action_space, registry, config):
    self.config = config

    # setup policy
    self.x = tf.placeholder(
        tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        registry, self.x, self.logit_dim, options=self.config["model"])
    self.dist = dist_class(self.model.outputs)  # logit for each action

    # setup policy loss
    self.ac = ModelCatalog.get_action_placeholder(action_space)
    self.adv = tf.placeholder(tf.float32, [None], name="adv")
    self.loss = -tf.reduce_mean(self.dist.logp(self.ac) * self.adv)

    # initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_in = [
        ("obs", self.x),
        ("actions", self.ac),
        ("advantages", self.adv),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, self.sess, obs_input=self.x,
        action_sampler=self.dist.sample(), loss=self.loss,
        loss_inputs=self.loss_in, is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())
def __init__(self, ob_space, action_space, config):
    self.local_steps = 0
    self.config = config
    self.summarize = config.get("summarize")
    self._setup_graph(ob_space, action_space)
    assert all(
        hasattr(self, attr)
        for attr in ["vf", "logits", "x", "var_list"])
    print("Setting up loss")
    self.setup_loss(action_space)
    self.is_training = tf.placeholder_with_default(True, ())
    self.sess = tf.get_default_session()

    TFPolicyGraph.__init__(
        self, self.sess, obs_input=self.x,
        action_sampler=self.action_dist.sample(), loss=self.loss,
        loss_inputs=self.loss_in, is_training=self.is_training,
        state_inputs=self.state_in, state_outputs=self.state_out)

    self.sess.run(tf.global_variables_initializer())

    if self.summarize:
        bs = tf.to_float(tf.shape(self.x)[0])
        tf.summary.scalar("model/policy_graph", self.pi_loss / bs)
        tf.summary.scalar("model/value_loss", self.vf_loss / bs)
        tf.summary.scalar("model/entropy", self.entropy / bs)
        tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
        tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
        self.summary_op = tf.summary.merge_all()
def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup policy
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        obs, self.logit_dim, options=self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("advantages", advantages),
    ]

    # LSTM support
    for i, ph in enumerate(self.model.state_in):
        loss_in.append(("state_in_{}".format(i), ph))

    is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, obs_space, action_space, sess, obs_input=obs,
        action_sampler=action_dist.sample(), loss=loss,
        loss_inputs=loss_in, is_training=is_training,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    sess.run(tf.global_variables_initializer())
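# The PGLoss used above reduces to the same expression written out inline in
# the first policy graph: the negative log-likelihood of the sampled actions
# weighted by their advantages. The NumPy sketch below only illustrates that
# objective; it is not the PGLoss implementation, and the toy log-probs and
# advantages are made up.

import numpy as np


def pg_loss(logp_actions, advantages):
    """Vanilla policy gradient surrogate: -E[log pi(a|s) * A]."""
    return -np.mean(logp_actions * advantages)


# Example: two sampled actions with their log-probs and advantage estimates.
print(pg_loss(np.array([-0.7, -1.2]), np.array([1.5, -0.3])))  # ~0.345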
def __init__(self, observation_space, action_space, config):
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    dim_actions = action_space.shape[0]
    low_action = action_space.low
    high_action = action_space.high
    self.actor_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["actor_lr"])
    self.critic_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["critic_lr"])

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)

    # Actor: P (policy) network
    with tf.variable_scope(P_SCOPE) as scope:
        p_values = _build_p_network(
            self.cur_observations, dim_actions, config)
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE):
        self.output_actions = _build_action_network(
            p_values, low_action, high_action, self.stochastic, self.eps,
            config["exploration_theta"], config["exploration_sigma"])

    with tf.variable_scope(A_SCOPE, reuse=True):
        exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
        self.reset_noise_op = tf.assign(
            exploration_sample, dim_actions * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape,
        name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # p network evaluation
    with tf.variable_scope(P_SCOPE, reuse=True) as scope:
        self.p_t = _build_p_network(self.obs_t, dim_actions, config)

    # target p network evaluation
    with tf.variable_scope(P_TARGET_SCOPE) as scope:
        p_tp1 = _build_p_network(self.obs_tp1, dim_actions, config)
        target_p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE, reuse=True):
        deterministic_flag = tf.constant(value=False, dtype=tf.bool)
        zero_eps = tf.constant(value=.0, dtype=tf.float32)
        output_actions = _build_action_network(
            self.p_t, low_action, high_action, deterministic_flag,
            zero_eps, config["exploration_theta"],
            config["exploration_sigma"])

        output_actions_estimated = _build_action_network(
            p_tp1, low_action, high_action, deterministic_flag, zero_eps,
            config["exploration_theta"], config["exploration_sigma"])

    # q network evaluation
    with tf.variable_scope(Q_SCOPE) as scope:
        q_t = _build_q_network(self.obs_t, self.act_t, config)
        self.q_func_vars = _scope_vars(scope.name)
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_tp0 = _build_q_network(self.obs_t, output_actions, config)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1 = _build_q_network(
            self.obs_tp1, output_actions_estimated, config)
        target_q_func_vars = _scope_vars(scope.name)

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (
        self.rew_t +
        config["gamma"] ** config["n_step"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    if config.get("use_huber"):
        errors = _huber_loss(self.td_error, config.get("huber_threshold"))
    else:
        errors = 0.5 * tf.square(self.td_error)
    self.loss = tf.reduce_mean(self.importance_weights * errors)

    # for policy gradient
    self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)

    if config["l2_reg"] is not None:
        for var in self.p_func_vars:
            if "bias" not in var.name:
                self.actor_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        for var in self.q_func_vars:
            if "bias" not in var.name:
                self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(
                self.tau * var + (1.0 - self.tau) * var_target))
    for var, var_target in zip(
            sorted(self.p_func_vars, key=lambda v: v.name),
            sorted(target_p_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(
                self.tau * var + (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*update_target_expr)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, self.sess, obs_input=self.cur_observations,
        action_sampler=self.output_actions, loss=self.loss,
        loss_inputs=self.loss_inputs, is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(q_tp0, q_tp1), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)
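# The update_target_expr built above applies a soft ("Polyak") update: each
# target variable moves a fraction tau toward its online counterpart, and
# update_target(tau=1.0) degenerates to a hard copy. The NumPy sketch below
# only illustrates that rule; the toy weight values are made up.

import numpy as np


def soft_update(online_weights, target_weights, tau):
    """target <- tau * online + (1 - tau) * target, elementwise."""
    return tau * online_weights + (1.0 - tau) * target_weights


online = np.array([1.0, 2.0])
target = np.array([0.0, 0.0])
print(soft_update(online, target, tau=0.001))  # slow tracking: [0.001 0.002]
print(soft_update(online, target, tau=1.0))    # hard copy:     [1. 2.]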
def set_state(self, state):
    TFPolicyGraph.set_state(self, state[0])
    self.set_epsilon(state[1])
def get_state(self):
    return [TFPolicyGraph.get_state(self), self.cur_epsilon]
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = tf.reshape(
        linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    is_training = tf.placeholder_with_default(True, ())

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(
        action_dist, actions, advantages, v_target, self.vf,
        self.config["vf_loss_coeff"], self.config["entropy_coeff"])

    # Initialize TFPolicyGraph
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("advantages", advantages),
        ("value_targets", v_target),
    ]
    for i, ph in enumerate(self.model.state_in):
        loss_in.append(("state_in_{}".format(i), ph))
    self.state_in = self.model.state_in
    self.state_out = self.model.state_out
    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=self.observations, action_sampler=action_dist.sample(),
        loss=self.loss.total_loss, loss_inputs=loss_in,
        is_training=is_training, state_inputs=self.state_in,
        state_outputs=self.state_out, seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    if self.config.get("summarize"):
        bs = tf.to_float(tf.shape(self.observations)[0])
        tf.summary.scalar("model/policy_graph", self.loss.pi_loss / bs)
        tf.summary.scalar("model/value_loss", self.loss.vf_loss / bs)
        tf.summary.scalar("model/entropy", self.loss.entropy / bs)
        tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
        tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
        self.summary_op = tf.summary.merge_all()

    self.sess.run(tf.global_variables_initializer())
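# A3CLoss combines the three terms that the summaries above report
# separately: a policy-gradient term (pi_loss), a value-function regression
# term (vf_loss), and an entropy bonus. A3CLoss itself is not shown in this
# section, so the sketch below is only an illustration of the usual
# composition; the 0.5 factor, the use of sums rather than means, and the toy
# inputs are assumptions.

import numpy as np


def a3c_total_loss(logp, adv, vf, v_target, entropy,
                   vf_loss_coeff=0.5, entropy_coeff=0.01):
    pi_loss = -np.sum(logp * adv)                      # policy gradient term
    vf_loss = 0.5 * np.sum(np.square(vf - v_target))   # value regression term
    # Entropy is subtracted so that maximizing entropy lowers the loss.
    return pi_loss + vf_loss_coeff * vf_loss - entropy_coeff * entropy


print(a3c_total_loss(
    logp=np.array([-0.5, -1.0]), adv=np.array([0.2, -0.1]),
    vf=np.array([1.0, 0.5]), v_target=np.array([1.2, 0.4]),
    entropy=1.3))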
def __init__(self, observation_space, action_space, registry, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    num_actions = action_space.n

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None,) + observation_space.shape)

    # Action Q network
    with tf.variable_scope(Q_SCOPE) as scope:
        q_values = _build_q_network(
            registry, self.cur_observations, num_actions, config)
        self.q_func_vars = _scope_vars(scope.name)

    # Action outputs
    self.output_actions = _build_action_network(
        q_values, self.cur_observations, num_actions, self.stochastic,
        self.eps)

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None,) + observation_space.shape)
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None,) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # q network evaluation
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_t = _build_q_network(registry, self.obs_t, num_actions, config)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1 = _build_q_network(
            registry, self.obs_tp1, num_actions, config)
        self.target_q_func_vars = _scope_vars(scope.name)

    # q scores for actions which we know were selected in the given state.
    q_t_selected = tf.reduce_sum(
        q_t * tf.one_hot(self.act_t, num_actions), 1)

    # compute estimate of best possible value starting from state at t + 1
    if config["double_q"]:
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp1_using_online_net = _build_q_network(
                registry, self.obs_tp1, num_actions, config)
        q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
        q_tp1_best = tf.reduce_sum(
            q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
            1)
    else:
        q_tp1_best = tf.reduce_max(q_tp1, 1)
    q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (
        self.rew_t +
        config["gamma"] ** config["n_step"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    self.loss = tf.reduce_mean(
        self.importance_weights * _huber_loss(self.td_error))

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(self.target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(var_target.assign(var))
    self.update_target_expr = tf.group(*update_target_expr)

    # initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, self.sess, obs_input=self.cur_observations,
        action_sampler=self.output_actions, loss=self.loss,
        loss_inputs=self.loss_inputs, is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.dqn.dqn.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    num_actions = action_space.n

    def _build_q_network(obs):
        return QNetwork(
            ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
            config["dueling"], config["hiddens"]).value

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)

    # Action Q network
    with tf.variable_scope(Q_SCOPE) as scope:
        q_values = _build_q_network(self.cur_observations)
        self.q_func_vars = _scope_vars(scope.name)

    # Action outputs
    self.output_actions = QValuePolicy(
        q_values, self.cur_observations, num_actions, self.stochastic,
        self.eps).action

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # q network evaluation
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_t = _build_q_network(self.obs_t)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1 = _build_q_network(self.obs_tp1)
        self.target_q_func_vars = _scope_vars(scope.name)

    # q scores for actions which we know were selected in the given state.
    q_t_selected = tf.reduce_sum(
        q_t * tf.one_hot(self.act_t, num_actions), 1)

    # compute estimate of best possible value starting from state at t + 1
    if config["double_q"]:
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp1_using_online_net = _build_q_network(self.obs_tp1)
        q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
        q_tp1_best = tf.reduce_sum(
            q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
            1)
    else:
        q_tp1_best = tf.reduce_max(q_tp1, 1)

    self.loss = QLoss(
        q_t_selected, q_tp1_best, self.importance_weights, self.rew_t,
        self.done_mask, config["gamma"], config["n_step"])

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(self.target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(var_target.assign(var))
    self.update_target_expr = tf.group(*update_target_expr)

    # initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions, loss=self.loss.loss,
        loss_inputs=self.loss_inputs, is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())
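# Both DQN graphs build the same n-step Bellman target: the reward plus
# gamma**n_step times the best next-state Q-value, masked to zero at episode
# ends; with double_q enabled, the action is selected by the online network
# but evaluated by the target network. The NumPy sketch below only
# illustrates that target; the toy Q-values, rewards, and done flags are
# made up.

import numpy as np


def td_target(rew, done, q_tp1_online, q_tp1_target, gamma=0.99, n_step=1):
    # Select the greedy action with the online network...
    best_actions = np.argmax(q_tp1_online, axis=1)
    # ...but evaluate it with the target network (double Q-learning).
    q_tp1_best = q_tp1_target[np.arange(len(rew)), best_actions]
    return rew + gamma ** n_step * (1.0 - done) * q_tp1_best


print(td_target(
    rew=np.array([1.0, 0.0]),
    done=np.array([0.0, 1.0]),  # second transition ends an episode
    q_tp1_online=np.array([[0.2, 0.8], [0.5, 0.1]]),
    q_tp1_target=np.array([[0.3, 0.6], [0.4, 0.2]])))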
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.ddpg.ddpg.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    dim_actions = action_space.shape[0]
    low_action = action_space.low
    high_action = action_space.high
    self.actor_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["actor_lr"])
    self.critic_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["critic_lr"])

    def _build_q_network(obs, actions):
        return QNetwork(
            ModelCatalog.get_model(obs, 1, config["model"]), actions,
            config["critic_hiddens"]).value

    def _build_p_network(obs):
        return PNetwork(
            ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
            config["actor_hiddens"]).action_scores

    def _build_action_network(p_values, stochastic, eps):
        return ActionNetwork(
            p_values, low_action, high_action, stochastic, eps,
            config["exploration_theta"],
            config["exploration_sigma"]).actions

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)

    # Actor: P (policy) network
    with tf.variable_scope(P_SCOPE) as scope:
        p_values = _build_p_network(self.cur_observations)
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE):
        self.output_actions = _build_action_network(
            p_values, self.stochastic, self.eps)

    with tf.variable_scope(A_SCOPE, reuse=True):
        exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
        self.reset_noise_op = tf.assign(
            exploration_sample, dim_actions * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape,
        name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # p network evaluation
    with tf.variable_scope(P_SCOPE, reuse=True) as scope:
        self.p_t = _build_p_network(self.obs_t)

    # target p network evaluation
    with tf.variable_scope(P_TARGET_SCOPE) as scope:
        p_tp1 = _build_p_network(self.obs_tp1)
        target_p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE, reuse=True):
        deterministic_flag = tf.constant(value=False, dtype=tf.bool)
        zero_eps = tf.constant(value=.0, dtype=tf.float32)
        output_actions = _build_action_network(
            self.p_t, deterministic_flag, zero_eps)
        output_actions_estimated = _build_action_network(
            p_tp1, deterministic_flag, zero_eps)

    # q network evaluation
    with tf.variable_scope(Q_SCOPE) as scope:
        q_t = _build_q_network(self.obs_t, self.act_t)
        self.q_func_vars = _scope_vars(scope.name)
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_tp0 = _build_q_network(self.obs_t, output_actions)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
        target_q_func_vars = _scope_vars(scope.name)

    self.loss = ActorCriticLoss(
        q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
        self.done_mask, config["gamma"], config["n_step"],
        config["use_huber"], config["huber_threshold"])

    if config["l2_reg"] is not None:
        for var in self.p_func_vars:
            if "bias" not in var.name:
                self.loss.actor_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        for var in self.q_func_vars:
            if "bias" not in var.name:
                self.loss.critic_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(
                self.tau * var + (1.0 - self.tau) * var_target))
    for var, var_target in zip(
            sorted(self.p_func_vars, key=lambda v: v.name),
            sorted(target_p_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(
                self.tau * var + (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*update_target_expr)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions, loss=self.loss.total_loss,
        loss_inputs=self.loss_inputs, is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(q_tp0, q_tp1), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

    if existing_inputs:
        self.loss_in = existing_inputs
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
    else:
        obs_ph = tf.placeholder(
            tf.float32, name="obs",
            shape=(None, ) + observation_space.shape)
        # Targets of the value function.
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        # Advantage values in the policy gradient estimator.
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        # Log probabilities from the policy before the policy update.
        logprobs_ph = tf.placeholder(
            tf.float32, name="logprobs", shape=(None, logit_dim))
        # Value function predictions before the policy update.
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        self.loss_in = [
            ("obs", obs_ph),
            ("value_targets", value_targets_ph),
            ("advantages", adv_ph),
            ("actions", act_ph),
            ("logprobs", logprobs_ph),
            ("vf_preds", vf_preds_ph),
        ]

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)

    self.logits = ModelCatalog.get_model(
        obs_ph, logit_dim, self.config["model"]).outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                obs_ph, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.constant("NA")

    self.loss_obj = PPOLoss(
        action_space, value_targets_ph, adv_ph, act_ph, logprobs_ph,
        vf_preds_ph, curr_action_dist, self.value_function, self.kl_coeff,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_loss_coeff=self.config["kl_target"],
        use_gae=self.config["use_gae"])

    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=obs_ph, action_sampler=self.sampler,
        loss=self.loss_obj.loss, loss_inputs=self.loss_in,
        is_training=self.is_training)
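# PPOLoss is constructed above from the old log-probs, the current action
# distribution, a clip_param, and an adaptive kl_coeff, which matches the
# standard PPO clipped-surrogate objective with a KL penalty. PPOLoss itself
# is not shown in this section, so the sketch below only illustrates the
# clipped surrogate term; the toy log-probs and advantages are made up.

import numpy as np


def ppo_clip_surrogate(logp_new, logp_old, advantages, clip_param=0.3):
    ratio = np.exp(logp_new - logp_old)  # pi_new(a|s) / pi_old(a|s)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # Maximizing the surrogate is expressed as minimizing its negative mean.
    return -np.mean(np.minimum(unclipped, clipped))


print(ppo_clip_surrogate(
    logp_new=np.array([-0.4, -1.5]),
    logp_old=np.array([-0.6, -1.0]),
    advantages=np.array([2.0, -1.0])))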