def test_success(self):
    """Run the op from ``build_optim`` once and compare two variables.

    Two variables of the same random 2-D shape are created, the op
    returned by ``build_optim(var1, 1e-4, 'var1')`` is executed a single
    time, and the values fetched before and after are handed to
    ``assert_variable_mismatch`` (for ``var1``) and
    ``assert_variable_match`` (for ``var2``).

    NOTE(review): judging by the helper names, the intent is that only the
    variable selected by the ``'var1'`` argument is updated -- confirm
    against ``build_optim``.
    """
    # randint(10) yields 0..9, so each side of the shape lies in 1..10.
    rows = np.random.randint(10) + 1
    cols = np.random.randint(10) + 1
    shape = (rows, cols)

    first = tf.Variable(np.random.random(shape), name='var1')
    second = tf.Variable(np.random.random(shape), name='var2')
    train_op = build_optim(first, 1e-4, 'var1')

    watched = [first, second]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        # Snapshot both variables around a single execution of the op.
        before = sess.run(watched)
        sess.run(train_op)
        after = sess.run(watched)

        assert_variable_mismatch(before[0], after[0])
        assert_variable_match(before[1], after[1])
def _build(self, params):
    """Assemble the SAC graph inside the ``'sac'`` variable scope.

    Creates the five input placeholders, the policy, the value network,
    the target value network and two Q networks (each Q network is called
    once with the policy's squashed action and once with the action
    placeholder), then the losses, the target-update op and one optimizer
    op per network. The resulting handles are stored on ``self``.

    Graph-building calls are issued in a fixed order; the helpers'
    internals are not visible here, so avoid reordering them casually.

    Args:
        params: configuration object. This method reads ``state_shape``,
            ``num_actions``, ``fcs``, ``concat_index``, ``gamma``,
            ``reg``, ``tau``, ``v_lr``, ``q_lr`` and ``pi_lr`` from it.
            ``state_shape`` has to be a tuple because it is concatenated
            with ``(None,)``.
    """
    with tf.variable_scope('sac'):

        def float_input(shape, name):
            # Every network input is a float32 placeholder.
            return tf.placeholder(tf.float32, shape, name=name)

        def state_value(obs, scope):
            # Value networks differ only in their input and scope.
            return value_function(params.fcs, obs, tf.nn.relu,
                                  XAVIER_INIT, XAVIER_INIT, ZEROS_INIT,
                                  scope=scope)

        def action_value(actions, scope):
            # Q networks always read obs_t; only the action source and
            # the scope vary.
            return q_function(params.fcs, self.obs_t_ph, actions,
                              params.concat_index, tf.nn.relu,
                              XAVIER_INIT, XAVIER_INIT, ZEROS_INIT,
                              scope=scope)

        def soft_q_loss(q_value):
            # Both Q losses share the same reward/value/done inputs.
            return build_q_loss(q_value, reward_col, next_value,
                                done_col, params.gamma)

        # inputs (leading None is the batch dimension)
        obs_shape = (None,) + params.state_shape
        self.obs_t_ph = float_input(obs_shape, 'obs_t')
        self.actions_t_ph = float_input(
            (None, params.num_actions), 'actions_t')
        self.rewards_tp1_ph = float_input((None,), 'rewards_tp1')
        self.obs_tp1_ph = float_input(obs_shape, 'obs_tp1')
        self.dones_tp1_ph = float_input((None,), 'dones_tp1')

        # policy and its squashed action / log-probability
        policy_dist = stochastic_policy_function(
            params.fcs, self.obs_t_ph, params.num_actions, tf.nn.relu,
            share=True, w_init=XAVIER_INIT, last_w_init=XAVIER_INIT,
            last_b_init=XAVIER_INIT, scope='pi')
        pi_action, pi_log_prob = squash_action(policy_dist)

        # value network on obs_t, target value network on obs_tp1
        value = state_value(self.obs_t_ph, 'v')
        next_value = state_value(self.obs_tp1_ph, 'target_v')

        # two Q networks, each evaluated on policy and placeholder actions
        q1_pi = action_value(pi_action, 'q1')
        q1_data = action_value(self.actions_t_ph, 'q1')
        q2_pi = action_value(pi_action, 'q2')
        q2_data = action_value(self.actions_t_ph, 'q2')

        # rewards and done flags as column vectors for the losses
        reward_col = tf.reshape(self.rewards_tp1_ph, [-1, 1])
        done_col = tf.reshape(self.dones_tp1_ph, [-1, 1])

        # losses
        self.v_loss = build_v_loss(value, q1_pi, q2_pi, pi_log_prob)
        self.q1_loss = soft_q_loss(q1_data)
        self.q2_loss = soft_q_loss(q2_data)
        self.pi_loss = build_pi_loss(pi_log_prob, q1_pi, q2_pi)

        # policy regularization term, added to the policy loss below
        policy_reg = build_policy_reg(policy_dist, params.reg)

        # op moving 'sac/target_v' toward 'sac/v'
        self.target_update = build_target_update(
            'sac/v', 'sac/target_v', params.tau)

        # one optimizer op per network
        self.v_optimize_expr = build_optim(
            self.v_loss, params.v_lr, 'sac/v')
        self.q1_optimize_expr = build_optim(
            self.q1_loss, params.q_lr, 'sac/q1')
        self.q2_optimize_expr = build_optim(
            self.q2_loss, params.q_lr, 'sac/q2')
        self.pi_optimize_expr = build_optim(
            self.pi_loss + policy_reg, params.pi_lr, 'sac/pi')

        # inference handles: first entry of each batched output
        self.action = pi_action[0]
        self.value = tf.reshape(value, [-1])[0]
        self.log_prob = tf.reshape(pi_log_prob, [-1])[0]
def _build(self, params):
    """Assemble the TD3 graph inside the ``'td3'`` variable scope.

    Creates the input placeholders, an actor and a target actor (both
    passed through ``tanh``), a smoothed target action, two critics with
    their target critics, the critic and actor losses, the target-update
    ops and the optimizer ops. The resulting handles are stored on
    ``self``.

    The scope is opened with ``reuse=tf.AUTO_REUSE``; each critic scope is
    used for two ``_q_function`` calls (placeholder actions and actor
    actions), presumably so that both calls share one set of weights.

    Args:
        params: configuration object. It is forwarded to
            ``_policy_function`` and ``_q_function``; this method itself
            reads ``state_shape``, ``num_actions``, ``target_noise_sigma``,
            ``target_noise_clip``, ``gamma``, ``tau``, ``critic_lr`` and
            ``actor_lr`` from it.
    """
    with tf.variable_scope('td3', reuse=tf.AUTO_REUSE):

        def float_input(shape, name):
            # Every network input is a float32 placeholder.
            return tf.placeholder(tf.float32, shape, name=name)

        def critic_heads(scope, target_scope):
            # Per critic: Q on placeholder actions, Q on the actor's
            # actions, then the target Q on the smoothed target action.
            q_data = _q_function(
                params, self.obs_t_ph, self.actions_t_ph, scope)
            q_actor = _q_function(params, self.obs_t_ph, actor_out, scope)
            q_next = _q_function(
                params, self.obs_tp1_ph, smoothed_next_action,
                target_scope)
            return q_data, q_actor, q_next

        # inputs (leading None is the batch dimension)
        obs_shape = [None] + list(params.state_shape)
        self.obs_t_ph = float_input(obs_shape, 'obs_t')
        self.actions_t_ph = float_input(
            [None, params.num_actions], 'actions_t')
        self.rewards_tp1_ph = float_input([None], 'rewards_tp1')
        self.obs_tp1_ph = float_input(obs_shape, 'obs_tp1')
        self.dones_tp1_ph = float_input([None], 'dones_tp1')

        # actor and target actor, both squashed with tanh
        actor_out = tf.nn.tanh(
            _policy_function(params, self.obs_t_ph, 'actor'))
        target_actor_out = tf.nn.tanh(
            _policy_function(params, self.obs_tp1_ph, 'target_actor'))

        # target policy smoothing regularization
        smoothed_next_action = build_smoothed_target(
            target_actor_out, params.target_noise_sigma,
            params.target_noise_clip)

        # first and second critic, each with its target critic
        q1_data, q1_actor, q1_next = critic_heads(
            'critic/1', 'target_critic/1')
        q2_data, q2_actor, q2_next = critic_heads(
            'critic/2', 'target_critic/2')

        # rewards and done flags as column vectors for the target
        reward_col = tf.reshape(self.rewards_tp1_ph, [-1, 1])
        done_col = tf.reshape(self.dones_tp1_ph, [-1, 1])

        # critic loss against the target built from both target critics
        td_target = build_target(
            reward_col, q1_next, q2_next, done_col, params.gamma)
        self.critic_loss = build_critic_loss(q1_data, q2_data, td_target)

        # actor loss is the negated output of build_actor_loss
        self.actor_loss = -build_actor_loss(q1_actor, q2_actor)

        # target-update ops for critics and actor
        self.update_target_critic = build_target_update(
            'td3/critic', 'td3/target_critic', params.tau)
        self.update_target_actor = build_target_update(
            'td3/actor', 'td3/target_actor', params.tau)

        # optimizer ops
        self.critic_optimize_expr = build_optim(
            self.critic_loss, params.critic_lr, 'td3/critic')
        self.actor_optimize_expr = build_optim(
            self.actor_loss, params.actor_lr, 'td3/actor')

        # inference handles: batched action and flattened first-critic Q
        self.action = actor_out
        self.value = tf.reshape(q1_actor, [-1])
def _build(self, fcs, concat_index, state_shape, num_actions, gamma, tau,
           pi_lr, q_lr, v_lr, reg):
    """Assemble the SAC graph inside the ``'sac'`` variable scope.

    Creates the five input placeholders, the policy distribution with a
    tanh-squashed sample and its corrected log-probability, the value
    network, the target value network and two Q networks, then the
    losses, the target-update op, a policy regularization term and one
    optimizer op per network. The resulting handles are stored on
    ``self``.

    Graph-building calls are issued in a fixed order; the helpers'
    internals are not visible here, so avoid reordering them casually.

    Args:
        fcs: forwarded as the first argument to the policy, value and Q
            network builders (presumably the hidden layer sizes -- TODO
            confirm).
        concat_index: forwarded to ``q_function``.
        state_shape: tuple appended to ``(None,)`` to form the shape of
            the observation placeholders.
        num_actions: width of the action placeholder; also forwarded to
            the policy builder.
        gamma: forwarded to ``build_q_loss``.
        tau: forwarded to ``build_target_update``.
        pi_lr: rate passed to ``build_optim`` for the ``'sac/pi'`` scope.
        q_lr: rate passed to ``build_optim`` for ``'sac/q1'``/``'sac/q2'``.
        v_lr: rate passed to ``build_optim`` for the ``'sac/v'`` scope.
        reg: multiplier applied to the policy regularization term.
    """
    with tf.variable_scope('sac'):

        def float_input(shape, name):
            # Every network input is a float32 placeholder.
            return tf.placeholder(tf.float32, shape, name=name)

        def state_value(obs, scope):
            # Value networks differ only in their input and scope.
            return value_function(
                fcs, obs, tf.nn.relu, hidden_init, out_w_init, bias_zeros,
                scope=scope)

        def action_value(actions, scope):
            # Q networks always read obs_t; only the action source and
            # the scope vary.
            return q_function(
                fcs, self.obs_t_ph, actions, concat_index, tf.nn.relu,
                hidden_init, out_w_init, bias_zeros, scope=scope)

        def soft_q_loss(q_value):
            # Both Q losses share the same reward/value/done inputs.
            return build_q_loss(
                q_value, reward_col, next_value, done_col, gamma)

        # inputs (leading None is the batch dimension)
        obs_shape = (None,) + state_shape
        self.obs_t_ph = float_input(obs_shape, 'obs_t')
        self.actions_t_ph = float_input((None, num_actions), 'actions_t')
        self.rewards_tp1_ph = float_input((None,), 'rewards_tp1')
        self.obs_tp1_ph = float_input(obs_shape, 'obs_tp1')
        self.dones_tp1_ph = float_input((None,), 'dones_tp1')

        # initializers
        bias_zeros = tf.zeros_initializer()
        hidden_init = tf.contrib.layers.xavier_initializer()
        out_w_init = tf.contrib.layers.xavier_initializer()
        out_b_init = tf.contrib.layers.xavier_initializer()

        # policy distribution
        policy_dist = stochastic_policy_function(
            fcs, self.obs_t_ph, num_actions, tf.nn.relu, share=True,
            w_init=hidden_init, last_w_init=out_w_init,
            last_b_init=out_b_init, scope='pi')

        # draw one sample per row and squash it with tanh
        raw_action = policy_dist.sample(1)[0]
        pi_action = tf.nn.tanh(raw_action)

        # subtract sum(log(1 - tanh^2 + eps)) from the raw log-prob
        squash_term = tf.reduce_sum(
            tf.log(1 - pi_action ** 2 + 1e-6), axis=1, keepdims=True)
        raw_log_prob = tf.reshape(policy_dist.log_prob(raw_action), [-1, 1])
        pi_log_prob = raw_log_prob - squash_term

        # value network on obs_t, target value network on obs_tp1
        value = state_value(self.obs_t_ph, 'v')
        next_value = state_value(self.obs_tp1_ph, 'target_v')

        # two Q networks, each evaluated on policy and placeholder actions
        q1_pi = action_value(pi_action, 'q1')
        q1_data = action_value(self.actions_t_ph, 'q1')
        q2_pi = action_value(pi_action, 'q2')
        q2_data = action_value(self.actions_t_ph, 'q2')

        # rewards and done flags as column vectors for the losses
        reward_col = tf.reshape(self.rewards_tp1_ph, [-1, 1])
        done_col = tf.reshape(self.dones_tp1_ph, [-1, 1])

        # losses
        self.v_loss = build_v_loss(value, q1_pi, q2_pi, pi_log_prob)
        self.q1_loss = soft_q_loss(q1_data)
        self.q2_loss = soft_q_loss(q2_data)
        self.pi_loss = build_pi_loss(pi_log_prob, q1_pi, q2_pi)

        # op moving 'sac/target_v' toward 'sac/v'
        self.target_update = build_target_update(
            'sac/v', 'sac/target_v', tau)

        # policy regularization: squared mean and squared log-stddev
        mean_penalty = 0.5 * tf.reduce_mean(policy_dist.mean() ** 2)
        logstd_penalty = 0.5 * tf.reduce_mean(
            tf.log(policy_dist.stddev()) ** 2)
        policy_reg = reg * (mean_penalty + logstd_penalty)

        # one optimizer op per network
        self.v_optimize_expr = build_optim(self.v_loss, v_lr, 'sac/v')
        self.q1_optimize_expr = build_optim(self.q1_loss, q_lr, 'sac/q1')
        self.q2_optimize_expr = build_optim(self.q2_loss, q_lr, 'sac/q2')
        self.pi_optimize_expr = build_optim(
            self.pi_loss + policy_reg, pi_lr, 'sac/pi')

        # inference handles: first entry of each batched output
        self.action = pi_action[0]
        self.value = tf.reshape(value, [-1])[0]
        self.log_prob = tf.reshape(pi_log_prob, [-1])[0]