def _build_networks(self):
    # Define input placeholders
    self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
    self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
    self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state')
    self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
    self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag')

    # Actor: action probabilities
    self.actor = dense_nn(self.s, self.layer_sizes + [self.act_size], name='actor')
    self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1))
    self.actor_proba = tf.nn.softmax(self.actor)
    self.actor_vars = self.scope_vars('actor')

    # Critic: state value, V(s)
    self.critic = dense_nn(self.s, self.layer_sizes + [1], name='critic')
    self.critic_next = dense_nn(self.s_next, self.layer_sizes + [1], name='critic', reuse=True)
    self.critic_vars = self.scope_vars('critic')

    # TD target and TD error
    self.td_target = self.r + self.gamma * tf.squeeze(self.critic_next) * (1.0 - self.done)
    self.td_error = self.td_target - tf.squeeze(self.critic)
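# A minimal sketch (not the original training code) of how the tensors above are
# typically turned into losses: the critic regresses onto the TD target, and the
# actor is pushed along log pi(a|s) weighted by the TD error. The function name and
# the `actor_lr` / `critic_lr` arguments are illustrative assumptions.
import tensorflow as tf

def actor_critic_train_ops(actor_logits, a, td_target, td_error, critic,
                           actor_lr=0.001, critic_lr=0.01):
    # Critic loss: mean squared TD error (gradient is stopped through the target).
    loss_c = tf.reduce_mean(tf.squared_difference(
        tf.squeeze(critic), tf.stop_gradient(td_target)))
    # Actor loss: sparse_softmax_cross_entropy_with_logits gives -log pi(a|s),
    # so minimizing this increases the log-probability of actions with positive TD error.
    loss_a = tf.reduce_mean(
        tf.stop_gradient(td_error) *
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=actor_logits, labels=a))
    train_c = tf.train.AdamOptimizer(critic_lr).minimize(loss_c)
    train_a = tf.train.AdamOptimizer(actor_lr).minimize(loss_a)
    return loss_a, loss_c, train_a, train_c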
def _build_networks(self): """For continuous action space. """ # Define input placeholders self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state') self.a = tf.placeholder(tf.float32, shape=[None] + self.act_dim, name='action') self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state') self.r = tf.placeholder(tf.float32, shape=[ None, ], name='reward') with tf.variable_scope('primary'): # Actor: deterministic policy mu(s) outputs one action vector. self.mu = dense_nn(self.s, self.actor_layers + self.act_dim, output_fn=tf.nn.tanh, name='mu') # Critic: action value, Q(s, a) self.Q = dense_nn(tf.concat([self.s, self.a], axis=1), self.critic_layers + [1], name='Q') # We want to train mu network to maximize Q value that is estimated by our critic; # this is only used for training. self.Q_mu = dense_nn(tf.concat([self.s, self.mu], axis=1), self.critic_layers + [1], name='Q', reuse=True) with tf.variable_scope('target'): # Clone target networks. self.mu_target = dense_nn(self.s_next, self.actor_layers + self.act_dim, output_fn=tf.nn.tanh, name='mu') self.Q_target = dense_nn(tf.concat([self.s_next, self.mu_target], axis=1), self.critic_layers + [1], name='Q') self.Q_vars = self.scope_vars('primary/Q') self.mu_vars = self.scope_vars('primary/mu') # sanity check self.primary_vars = self.Q_vars + self.mu_vars self.target_vars = self.scope_vars('target/Q') + self.scope_vars( 'target/mu') assert len(self.primary_vars) == len(self.target_vars)
def build(self):
    self.lr = tf.placeholder(tf.float32, shape=None, name='learning_rate')

    # Inputs
    self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
    self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
    self.returns = tf.placeholder(tf.float32, shape=(None,), name='return')

    # Build network
    self.pi = dense_nn(self.s, self.layer_sizes + [self.act_size], name='pi_network')
    self.sampled_actions = tf.squeeze(tf.multinomial(self.pi, 1))
    self.pi_vars = self.scope_vars('pi_network')

    if self.baseline:
        # State value estimation as the baseline
        self.v = dense_nn(self.s, self.layer_sizes + [1], name='v_network')
        self.target = self.returns - self.v  # advantage

        with tf.variable_scope('v_optimize'):
            self.loss_v = tf.reduce_mean(tf.squared_difference(self.v, self.returns))
            self.optim_v = tf.train.AdamOptimizer(self.lr).minimize(self.loss_v, name='adam_optim_v')
    else:
        self.target = tf.identity(self.returns)

    with tf.variable_scope('pi_optimize'):
        self.loss_pi = tf.reduce_mean(
            tf.stop_gradient(self.target) * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=self.a), name='loss_pi')
        # self.optim_pi = tf.train.AdamOptimizer(self.lr)
        # self.grads_pi = self.optim_pi.compute_gradients(self.loss_pi, self.pi_vars)
        # self.train_pi_op = self.optim_pi.apply_gradients(self.grads_pi)
        self.optim_pi = tf.train.AdamOptimizer(self.lr).minimize(self.loss_pi, name='adam_optim_pi')

    with tf.variable_scope('summary'):
        self.loss_pi_summ = tf.summary.scalar('loss_pi', self.loss_pi)
        self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
        self.ep_reward_summ = tf.summary.scalar('episode_reward', self.ep_reward)
        summ_list = [self.loss_pi_summ, self.ep_reward_summ]

        if self.baseline:
            self.loss_v_summ = tf.summary.scalar('loss_v', self.loss_v)
            summ_list.append(self.loss_v_summ)

        self.merged_summary = tf.summary.merge(summ_list)

    if self.baseline:
        self.train_ops = [self.optim_pi, self.optim_v]
    else:
        self.train_ops = [self.optim_pi]

    self.sess.run(tf.global_variables_initializer())
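# The 'return' placeholder above is fed with discounted Monte-Carlo returns computed
# at the end of each rollout. A small helper along these lines could produce them;
# the function name and `gamma` default are illustrative, not part of the class.
def discounted_returns(rewards, gamma=0.99):
    """G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ..."""
    returns = [0.0] * len(rewards)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns

# Example: discounted_returns([1.0, 1.0, 1.0], gamma=0.5) == [1.75, 1.5, 1.0]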
def _build_networks(self):
    # Define input placeholders
    self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
    self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
    self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state')
    self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
    self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag')
    self.old_logp_a = tf.placeholder(tf.float32, shape=(None,), name='old_logp_actor')
    self.v_target = tf.placeholder(tf.float32, shape=(None,), name='v_target')
    self.adv = tf.placeholder(tf.float32, shape=(None,), name='advantage')

    with tf.variable_scope('actor'):
        # Actor: action probabilities
        self.actor = dense_nn(self.s, self.actor_layers + [self.act_size], name='actor')
        self.actor_proba = tf.nn.softmax(self.actor)
        a_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_ohe')
        self.logp_a = tf.reduce_sum(tf.log(self.actor_proba) * a_ohe,
                                    reduction_indices=-1, name='new_logp_actor')
        self.actor_vars = self.scope_vars('actor')

    with tf.variable_scope('critic'):
        # Critic: state value, V(s)
        self.critic = tf.squeeze(
            dense_nn(self.s, self.critic_layers + [1], name='critic'))
        self.critic_next = tf.squeeze(
            dense_nn(self.s_next, self.critic_layers + [1], name='critic', reuse=True))
        self.critic_vars = self.scope_vars('critic')
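# A minimal sketch (not the original training code) of the PPO clipped surrogate
# objective built from logp_a / old_logp_a / adv above, plus the usual value loss
# against v_target. `clip_range`, `actor_lr` and `critic_lr` are assumed hyperparameters.
import tensorflow as tf

def ppo_train_ops(logp_a, old_logp_a, adv, critic, v_target,
                  actor_vars, critic_vars,
                  clip_range=0.2, actor_lr=0.0003, critic_lr=0.001):
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = tf.exp(logp_a - old_logp_a)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range)
    # Clipped surrogate objective; negated because the optimizer minimizes.
    loss_actor = -tf.reduce_mean(tf.minimum(ratio * adv, clipped_ratio * adv))
    # Critic regresses onto the externally computed value target.
    loss_critic = tf.reduce_mean(tf.squared_difference(critic, v_target))
    train_actor = tf.train.AdamOptimizer(actor_lr).minimize(loss_actor, var_list=actor_vars)
    train_critic = tf.train.AdamOptimizer(critic_lr).minimize(loss_critic, var_list=critic_vars)
    return loss_actor, loss_critic, train_actor, train_critic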
def create_q_networks(self):
    # The first dimension should have batch_size * step_size
    self.states = tf.placeholder(tf.float32, shape=(None, *self.state_dim), name='state')
    self.states_next = tf.placeholder(tf.float32, shape=(None, *self.state_dim), name='state_next')
    self.actions = tf.placeholder(tf.int32, shape=(None,), name='action')
    self.actions_next = tf.placeholder(tf.int32, shape=(None,), name='action_next')
    self.rewards = tf.placeholder(tf.float32, shape=(None,), name='reward')
    self.done_flags = tf.placeholder(tf.float32, shape=(None,), name='done')

    # The output is one Q value per action.
    net_class, net_params = self._extract_network_params()

    if self.dueling:
        self.q_hidden = net_class(self.states, self.layer_sizes[:-1],
                                  name='Q_primary', **net_params)
        self.adv = dense_nn(self.q_hidden, self.layer_sizes[-1:] + [self.act_size],
                            name='Q_primary_adv')
        self.v = dense_nn(self.q_hidden, self.layer_sizes[-1:] + [1], name='Q_primary_v')

        # Average Dueling
        self.q = self.v + (self.adv - tf.reduce_mean(
            self.adv, reduction_indices=1, keep_dims=True))

        self.q_target_hidden = net_class(self.states_next, self.layer_sizes[:-1],
                                         name='Q_target', **net_params)
        self.adv_target = dense_nn(self.q_target_hidden,
                                   self.layer_sizes[-1:] + [self.act_size],
                                   name='Q_target_adv')
        self.v_target = dense_nn(self.q_target_hidden, self.layer_sizes[-1:] + [1],
                                 name='Q_target_v')

        # Average Dueling
        self.q_target = self.v_target + (self.adv_target - tf.reduce_mean(
            self.adv_target, reduction_indices=1, keep_dims=True))
    else:
        self.q = net_class(self.states, self.layer_sizes + [self.act_size],
                           name='Q_primary', **net_params)
        self.q_target = net_class(self.states_next, self.layer_sizes + [self.act_size],
                                  name='Q_target', **net_params)

    # The primary and target Q networks should match.
    self.q_vars = self.scope_vars('Q_primary')
    self.q_target_vars = self.scope_vars('Q_target')
    assert len(self.q_vars) == len(self.q_target_vars), "The two Q-networks do not match."
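# A minimal sketch (not the original training code) of the Q-learning loss and
# target-network sync implied by the graph above. `gamma` and `lr` are assumed
# hyperparameters; a double-DQN target would instead gather q_target at the
# primary network's argmax actions.
import tensorflow as tf

def dqn_train_ops(q, q_target, actions, rewards, done_flags, act_size,
                  q_vars, q_target_vars, gamma=0.99, lr=0.001):
    # Q(s, a) for the actions actually taken.
    action_ohe = tf.one_hot(actions, act_size, 1.0, 0.0)
    q_sa = tf.reduce_sum(q * action_ohe, axis=-1)
    # Bellman target with the (frozen) target network; zero out terminal states.
    y = rewards + gamma * tf.reduce_max(q_target, axis=-1) * (1.0 - done_flags)
    loss = tf.reduce_mean(tf.squared_difference(q_sa, tf.stop_gradient(y)))
    train_op = tf.train.AdamOptimizer(lr).minimize(loss, var_list=q_vars)
    # Hard copy: target network <- primary network, run every N training steps.
    sync_target = tf.group(*[v_t.assign(v) for v_t, v in zip(q_target_vars, q_vars)])
    return loss, train_op, sync_target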