Example #1
 def test_target_var_init(self):
     """ test target_var_init op, sets target and main variables equal
     """
     with tf.variable_scope(TARGET):
         target_val = tf_utils.mlp(self.obs_ph, (4, ), activation=tf.tanh)
     with tf.variable_scope(MAIN):
         main_val = tf_utils.mlp(self.obs_ph, (4, ), activation=tf.tanh)
     with self.agent.sess as sess:
         sess.run(tf.global_variables_initializer())
         target_vars = tf_utils.var_list(TARGET)
         main_vars = tf_utils.var_list(MAIN)
         target_nps, main_nps = sess.run((target_vars, main_vars))
         for targ, upd in zip(target_nps, main_nps):
             assert targ.shape == upd.shape
             # the biases should actually be the same, all zeros
             if len(targ.shape) > 1:
                 assert not (targ == upd).all()
         # now set target and main equal
         init_op = self.agent.target_var_init()
         sess.run(init_op)
         # now make sure all target and main parameters are equal
         target_vars = tf_utils.var_list(TARGET)
         main_vars = tf_utils.var_list(MAIN)
         target_nps, main_nps = sess.run((target_vars, main_vars))
         for targ, upd in zip(target_nps, main_nps):
             assert targ.shape == upd.shape
             np.testing.assert_allclose(targ, upd)
 def target_var_init(self):
     """ returns tensorflow op to initialize target variables to be equal
     to the updated variables """
     op_list = [
         tf.assign(target_var, updated_var)
         for target_var, updated_var in zip(tf_utils.var_list(TARGET),
                                            tf_utils.var_list(MAIN))
     ]
     return tf.group(op_list)
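
Every snippet here calls a `tf_utils.var_list` helper that is not shown on this page. A minimal sketch of such a helper, assuming it simply collects the trainable variables created under a given variable scope (an assumption based on how it is used above, not code from the repo), could be:

 import tensorflow as tf

 def var_list(scope):
     """ hypothetical sketch: all trainable variables created under `scope` """
     return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
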
Example #3
 def test_build_policy_and_qval(self):
     """ smoke test, make sure the number of parameters is right """
     pi, qval, qval_pi = self.agent.build_policy_and_qval(
         self.obs_ph, self.act_ph, self.env.action_space)
     with self.cached_session() as sess:
         sess.run(tf.global_variables_initializer())
         pi_vars = tf_utils.var_list(POLICY)
         assert len(pi_vars) == 4  # 2 kernels and 2 biases
         qval_vars = tf_utils.var_list(QVAL)
         assert len(qval_vars) == 4  # 2 kernels and 2 biases
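 # `build_policy_and_qval` itself does not appear on this page. A hedged sketch,
 # assuming one hidden layer per network (matching the 2 kernels and 2 biases
 # the test expects) and a gym-style Box action space, might look like this; the
 # layer sizes and `tf.layers.dense` usage are assumptions, not repo code:
 def build_policy_and_qval(self, obs_ph, act_ph, action_space):
     """ hypothetical sketch of the policy and Q-value graph """
     act_dim = action_space.shape[0]
     act_limit = action_space.high[0]
     with tf.variable_scope(POLICY):
         # one hidden layer + one output layer -> 2 kernels and 2 biases
         hid = tf.layers.dense(obs_ph, 64, activation=tf.tanh)
         pi = act_limit * tf.layers.dense(hid, act_dim, activation=tf.tanh)
     with tf.variable_scope(QVAL):
         hid = tf.layers.dense(tf.concat([obs_ph, act_ph], axis=-1), 64,
                               activation=tf.tanh)
         qval = tf.squeeze(tf.layers.dense(hid, 1), axis=-1)
     with tf.variable_scope(QVAL, reuse=True):
         # same Q-network weights, re-evaluated at the policy's own actions
         hid = tf.layers.dense(tf.concat([obs_ph, pi], axis=-1), 64,
                               activation=tf.tanh)
         qval_pi = tf.squeeze(tf.layers.dense(hid, 1), axis=-1)
     return pi, qval, qval_pi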
 def build_target_update_op(self):
     """ returns tensorflow operation to update target parameters
     based on updated parameters and polyak """
     op_list = [
         tf.assign(
             target_var,
             self.polyak * target_var + (1 - self.polyak) * updated_var)
         for target_var, updated_var in zip(tf_utils.var_list(TARGET),
                                            tf_utils.var_list(MAIN))
     ]
     return tf.group(op_list)
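 # A quick numeric illustration (not from this repo) of the Polyak update built
 # above: with polyak close to 1 the target parameters move only a small
 # fraction of the way toward the main parameters each time the op is run.
 polyak = 0.995  # assumed value; the real self.polyak is configured elsewhere
 target, main = 0.0, 1.0
 for _ in range(3):
     target = polyak * target + (1 - polyak) * main
 # target is now ~0.0149, so the target network trails the main network slowly,
 # which keeps the Q-learning targets stable between updates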
 def build_policy_loss(self, qval_pi):
     """ build loss function and train op for deterministic policy """
     loss = -1 * tf.reduce_mean(qval_pi)
     train_op = tf.train.AdamOptimizer(learning_rate=self.pi_lr).minimize(
         loss, var_list=tf_utils.var_list(MAIN + '/' + POLICY))
     return loss, train_op
 def build_qval_loss(self, qval, qval_target):
     """ build loss for action-value function """
     loss = tf.losses.mean_squared_error(labels=qval_target,
                                         predictions=qval)
     train_op = tf.train.AdamOptimizer(learning_rate=self.q_lr).minimize(
         loss, var_list=tf_utils.var_list(MAIN + '/' + QVAL))
     return loss, train_op
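
The `qval_target` passed into `build_qval_loss` is not constructed anywhere in these examples. A hedged sketch of the usual DDPG-style one-step Bellman backup, assuming reward and done placeholders (`rew_ph`, `done_ph`), a discount factor `gamma`, and a `qval_pi_targ` tensor built from the TARGET-scope networks, would be:

 # one-step Bellman target; tf.stop_gradient keeps it fixed during the Q update
 # (rew_ph, done_ph, gamma and qval_pi_targ are assumptions, not repo code)
 qval_target = tf.stop_gradient(rew_ph + gamma * (1 - done_ph) * qval_pi_targ)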