def build_train(actor, critic, obs_dim, num_actions, gamma=1.0, scope='ddpg',
                tau=0.001, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # input placeholders
        obs_t_input = tf.placeholder(tf.float32, [None, obs_dim], name='obs_t')
        act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name='action')
        rew_t_ph = tf.placeholder(tf.float32, [None], name='reward')
        obs_tp1_input = tf.placeholder(tf.float32, [None, obs_dim], name='obs_tp1')
        done_mask_ph = tf.placeholder(tf.float32, [None], name='done')

        # actor network
        policy_t = actor(obs_t_input, num_actions, scope='actor')
        actor_func_vars = util.scope_vars(
            util.absolute_scope_name('actor'), trainable_only=True)

        # target actor network
        policy_tp1 = actor(obs_tp1_input, num_actions, scope='target_actor')
        target_actor_func_vars = util.scope_vars(
            util.absolute_scope_name('target_actor'), trainable_only=True)

        # critic network
        q_t = critic(obs_t_input, act_t_ph, num_actions, scope='critic')
        q_t_with_actor = critic(
            obs_t_input, policy_t, num_actions, scope='critic', reuse=True)
        critic_func_vars = util.scope_vars(
            util.absolute_scope_name('critic'), trainable_only=True)

        # target critic network
        q_tp1 = critic(
            obs_tp1_input, policy_tp1, num_actions, scope='target_critic')
        target_critic_func_vars = util.scope_vars(
            util.absolute_scope_name('target_critic'), trainable_only=True)

        # loss
        with tf.variable_scope('target_q'):
            v = (1 - done_mask_ph) * gamma * tf.stop_gradient(q_tp1)
            target_q = rew_t_ph + v
        critic_loss = tf.reduce_mean(
            tf.square(target_q - q_t), name='critic_loss')
        actor_loss = -tf.reduce_mean(q_t_with_actor, name='actor_loss')

        # optimize operations
        critic_optimizer = tf.train.AdamOptimizer(0.001)
        critic_optimize_expr = critic_optimizer.minimize(
            critic_loss, var_list=critic_func_vars)
        actor_optimizer = tf.train.AdamOptimizer(0.0001)
        actor_optimize_expr = actor_optimizer.minimize(
            actor_loss, var_list=actor_func_vars)

        # update critic target operations
        with tf.variable_scope('update_critic_target'):
            update_critic_target_expr = []
            sorted_vars = sorted(critic_func_vars, key=lambda v: v.name)
            sorted_target_vars = sorted(
                target_critic_func_vars, key=lambda v: v.name)
            # soft-update critic variables into target critic variables
            for var, var_target in zip(sorted_vars, sorted_target_vars):
                new_var = tau * var + (1 - tau) * var_target
                update_critic_target_expr.append(var_target.assign(new_var))
            update_critic_target_expr = tf.group(*update_critic_target_expr)

        # update actor target operations
        with tf.variable_scope('update_actor_target'):
            update_actor_target_expr = []
            sorted_vars = sorted(actor_func_vars, key=lambda v: v.name)
            sorted_target_vars = sorted(
                target_actor_func_vars, key=lambda v: v.name)
            # soft-update actor variables into target actor variables
            for var, var_target in zip(sorted_vars, sorted_target_vars):
                new_var = tau * var + (1 - tau) * var_target
                update_actor_target_expr.append(var_target.assign(new_var))
            update_actor_target_expr = tf.group(*update_actor_target_expr)

        # action theano-style function
        act = util.function(inputs=[obs_t_input], outputs=policy_t)

        # train theano-style functions
        train_actor = util.function(
            inputs=[obs_t_input],
            outputs=[actor_loss],
            updates=[actor_optimize_expr])
        train_critic = util.function(
            inputs=[obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph],
            outputs=[critic_loss],
            updates=[critic_optimize_expr])

        # update target theano-style functions
        update_actor_target = util.function(
            [], [], updates=[update_actor_target_expr])
        update_critic_target = util.function(
            [], [], updates=[update_critic_target_expr])

        return act, train_actor, train_critic, \
            update_actor_target, update_critic_target
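# --- Usage sketch (not part of the original source) ----------------------
# A minimal example of wiring the DDPG build_train above to toy actor and
# critic builders. The layer sizes, the critic output shape, and the
# assumption that util.function behaves like the Baselines-style helper
# (running ops against the default session) are illustrative assumptions;
# the repo's real networks live elsewhere.
import numpy as np
import tensorflow as tf


def toy_actor(obs, num_actions, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(obs, 64, activation=tf.nn.relu)
        # bounded continuous action in [-1, 1]
        return tf.layers.dense(hidden, num_actions, activation=tf.nn.tanh)


def toy_critic(obs, action, num_actions, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(
            tf.concat([obs, action], axis=1), 64, activation=tf.nn.relu)
        # squeeze to shape [batch] so it broadcasts against rewards/dones
        return tf.squeeze(tf.layers.dense(hidden, 1), axis=1)


act, train_actor, train_critic, update_actor_target, update_critic_target = \
    build_train(toy_actor, toy_critic, obs_dim=3, num_actions=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    action = act(np.random.rand(1, 3).astype(np.float32))
    update_actor_target()
    update_critic_target()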
step_size_ph = tf.placeholder(tf.int32, [], name='step_size')
mask_ph = tf.placeholder(tf.float32, [None], name='mask')
if continuous:
    act_t_ph = tf.placeholder(
        tf.float32, [None, num_actions], name='action')
else:
    act_t_ph = tf.placeholder(tf.int32, [None], name='action')

# rnn state in tuple
rnn_state_tuple = tf.contrib.rnn.LSTMStateTuple(
    rnn_state_ph0, rnn_state_ph1)

policy, value, dist = network(
    obs_t_input, rnn_state_tuple, num_actions, lstm_unit, nenvs,
    step_size_ph, continuous, scope='network', reuse=reuse)
network_func_vars = util.scope_vars(
    util.absolute_scope_name('network'), trainable_only=True)

old_policy, old_value, old_dist = network(
    obs_t_input, num_actions, scope='old_network', reuse=reuse)
old_network_func_vars = util.scope_vars(
    util.absolute_scope_name('old_network'), trainable_only=True)

tmp_policy, tmp_value, tmp_dist = network(
    obs_t_input, num_actions, scope='tmp_network', reuse=reuse)
tmp_network_func_vars = util.scope_vars(
    util.absolute_scope_name('tmp_network'), trainable_only=True)

# reshape inputs
advantages = tf.reshape(advantage_t_ph, [-1, 1])
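# --- Sketch of the recurrent-state inputs assumed above (illustrative) ---
# rnn_state_ph0 and rnn_state_ph1 are referenced but not defined in this
# fragment. A typical definition is one placeholder each for the LSTM cell
# and hidden state, fed with zeros at the start of a rollout. The shapes
# and the lstm_unit/nenvs values below are assumptions.
import numpy as np
import tensorflow as tf

lstm_unit = 256   # assumed LSTM hidden size
nenvs = 8         # assumed number of parallel environments

rnn_state_ph0 = tf.placeholder(
    tf.float32, [nenvs, lstm_unit], name='rnn_state_0')
rnn_state_ph1 = tf.placeholder(
    tf.float32, [nenvs, lstm_unit], name='rnn_state_1')

# zero initial state, fed as {rnn_state_ph0: ..., rnn_state_ph1: ...}
initial_rnn_state = (np.zeros((nenvs, lstm_unit), dtype=np.float32),
                     np.zeros((nenvs, lstm_unit), dtype=np.float32))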
def build_train(network, obs_dim, num_actions, gamma=1.0, epsilon=0.2,
                beta=0.01, scope='ppo', reuse=None):
    with tf.device('/gpu:0'):
        with tf.variable_scope(scope, reuse=reuse):
            # input placeholders
            obs_t_input = tf.placeholder(
                tf.float32, [None, obs_dim], name='obs_t')
            act_t_ph = tf.placeholder(
                tf.float32, [None, num_actions], name='action')
            return_t_ph = tf.placeholder(tf.float32, [None, 1], name='return')
            advantage_t_ph = tf.placeholder(
                tf.float32, [None, 1], name='advantage')

            policy, value, dist = network(
                obs_t_input, num_actions, scope='network', reuse=reuse)
            network_func_vars = util.scope_vars(
                util.absolute_scope_name('network'), trainable_only=True)

            old_policy, old_value, old_dist = network(
                obs_t_input, num_actions, scope='old_network', reuse=reuse)
            old_network_func_vars = util.scope_vars(
                util.absolute_scope_name('old_network'), trainable_only=True)

            tmp_policy, tmp_value, tmp_dist = network(
                obs_t_input, num_actions, scope='tmp_network', reuse=reuse)
            tmp_network_func_vars = util.scope_vars(
                util.absolute_scope_name('tmp_network'), trainable_only=True)

            # clipped surrogate objective
            # log probabilities under the current and old policies
            cur_log_prob = dist.log_prob(act_t_ph + 1e-5)
            old_log_prob = old_dist.log_prob(act_t_ph + 1e-5)
            ratio = tf.exp(cur_log_prob - old_log_prob)
            clipped_ratio = tf.clip_by_value(
                ratio, 1.0 - epsilon, 1.0 + epsilon)
            surrogate = -tf.reduce_mean(
                tf.minimum(ratio, clipped_ratio) * advantage_t_ph,
                name='surrogate')

            with tf.variable_scope('loss'):
                # value network loss
                value_loss = tf.reduce_mean(tf.square(value - return_t_ph))
                # entropy penalty for exploration
                entropy = tf.reduce_mean(dist.entropy())
                penalty = -beta * entropy
                # total loss
                loss = surrogate + value_loss + penalty

            # optimize operations
            optimizer = tf.train.AdamOptimizer(3e-4)
            optimize_expr = optimizer.minimize(
                loss, var_list=network_func_vars)

            # update old network operations
            with tf.variable_scope('update_old_network'):
                update_old_expr = []
                sorted_tmp_vars = sorted(
                    tmp_network_func_vars, key=lambda v: v.name)
                sorted_old_vars = sorted(
                    old_network_func_vars, key=lambda v: v.name)
                for var_tmp, var_old in zip(sorted_tmp_vars, sorted_old_vars):
                    update_old_expr.append(var_old.assign(var_tmp))
                update_old_expr = tf.group(*update_old_expr)

            # update tmp network operations
            with tf.variable_scope('update_tmp_network'):
                update_tmp_expr = []
                sorted_vars = sorted(network_func_vars, key=lambda v: v.name)
                sorted_tmp_vars = sorted(
                    tmp_network_func_vars, key=lambda v: v.name)
                for var, var_tmp in zip(sorted_vars, sorted_tmp_vars):
                    update_tmp_expr.append(var_tmp.assign(var))
                update_tmp_expr = tf.group(*update_tmp_expr)

            # action theano-style function
            act = util.function(
                inputs=[obs_t_input], outputs=[policy, value])

            # train theano-style function
            train = util.function(
                inputs=[obs_t_input, act_t_ph, return_t_ph, advantage_t_ph],
                outputs=[loss, value_loss, tf.reduce_mean(ratio)],
                updates=[optimize_expr])

            # update target theano-style functions
            update_old = util.function([], [], updates=[update_old_expr])
            backup_current = util.function([], [], updates=[update_tmp_expr])

            return act, train, update_old, backup_current
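# --- Usage sketch (not part of the original source) ----------------------
# A toy network builder compatible with the PPO build_train above: a
# diagonal Gaussian policy plus a value head read from a shared hidden
# layer. The layer sizes, the Gaussian parameterization, and the
# Baselines-style default-session behaviour of util.function are
# assumptions for illustration.
import numpy as np
import tensorflow as tf


def toy_network(obs, num_actions, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(obs, 64, activation=tf.nn.tanh)
        mean = tf.layers.dense(hidden, num_actions)
        logstd = tf.get_variable(
            'logstd', [1, num_actions], initializer=tf.zeros_initializer())
        dist = tf.distributions.Normal(loc=mean, scale=tf.exp(logstd))
        policy = dist.sample()
        value = tf.layers.dense(hidden, 1)
        return policy, value, dist


act, train, update_old, backup_current = build_train(
    toy_network, obs_dim=3, num_actions=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    backup_current()   # snapshot current weights into tmp_network
    update_old()       # copy the snapshot into old_network before updating
    policy_out, value_out = act(np.random.rand(1, 3).astype(np.float32))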