Beispiel #1
0
 def setup_critic_optimizer(self):
     """Build the critic loss, its flattened gradient and its optimizer.

     The loss is the mean squared difference between
     ``self.normalized_critic_tf`` and the critic target after it has been
     passed through ``normalize`` (with ``self.ret_rms``) and clipped to
     ``self.return_range``. When ``self.critic_l2_reg`` is positive, an L2
     penalty over the selected critic weights is added to the loss.

     Sets ``self.critic_loss``, ``self.critic_grads`` and
     ``self.critic_optimizer``.
     """
     logger.info('setting up critic optimizer')
     normalized_critic_target_tf = tf.clip_by_value(
         normalize(self.critic_target, self.ret_rms), self.return_range[0],
         self.return_range[1])
     self.critic_loss = tf.reduce_mean(
         tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
     if self.critic_l2_reg > 0.:
         # Only variables whose name contains 'kernel' and does not contain
         # 'output' are regularized.
         critic_reg_vars = [
             var for var in self.critic.trainable_vars
             if 'kernel' in var.name and 'output' not in var.name
         ]
         for var in critic_reg_vars:
             logger.info('  regularizing: {}'.format(var.name))
         logger.info('  applying l2 regularization with {}'.format(
             self.critic_l2_reg))
         critic_reg = tc.layers.apply_regularization(
             tc.layers.l2_regularizer(self.critic_l2_reg),
             weights_list=critic_reg_vars)
         self.critic_loss += critic_reg
     critic_shapes = [
         var.get_shape().as_list() for var in self.critic.trainable_vars
     ]
     # The initializer 1 makes a scalar variable (shape []) count as one
     # parameter instead of raising on an empty sequence.
     critic_nb_params = sum(
         reduce(lambda x, y: x * y, shape, 1) for shape in critic_shapes)
     logger.info('  critic shapes: {}'.format(critic_shapes))
     logger.info('  critic params: {}'.format(critic_nb_params))
     self.critic_grads = U.flatgrad(self.critic_loss,
                                    self.critic.trainable_vars,
                                    clip_norm=self.clip_norm)
     self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                     beta1=0.9,
                                     beta2=0.999,
                                     epsilon=1e-08,
                                     single_train=self.single_train)
    def setup_actor_optimizer(self):
        """Build the actor loss, its flattened gradient and its optimizer.

        The base loss is the negated mean of ``self.critic_with_actor_tf``.
        When ``self.actor_reg`` is truthy, the mean squared value of the
        graph tensor named ``'actor/preactivation:0'`` is added as a penalty.

        Sets ``self.actor_loss``, ``self.actor_grads`` and
        ``self.actor_optimizer``.
        """
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.info('setting up actor optimizer')

        # Penalty on the pre-activation, as used in Hindsight Experience
        # Replay to stop saturation in tanh.
        if self.actor_reg:
            preactivation = tf.get_default_graph().get_tensor_by_name(
                'actor/preactivation:0')
            self.actor_loss = -tf.reduce_mean(
                self.critic_with_actor_tf) + tf.reduce_mean(
                    tf.square(preactivation))

        else:
            self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        # The initializer 1 makes a scalar variable (shape []) count as one
        # parameter instead of raising on an empty sequence.
        actor_nb_params = sum(
            reduce(lambda x, y: x * y, shape, 1) for shape in actor_shapes)
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))

        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as TensorFlow Adam.

    Two variables are optimized for ten steps with
    ``tf.train.AdamOptimizer``; the variables are then re-initialized and
    optimized for ten steps with ``MpiAdam`` fed by the flattened gradient.
    The per-step losses of both runs are printed and must agree.

    Requires a default TensorFlow session to be active.

    Raises:
        AssertionError: if the two loss trajectories differ by more than
            the tolerance.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference run: TensorFlow's own Adam applies the updates.
    tf.get_default_session().run(tf.global_variables_initializer())
    losses_ref = []
    for i in range(10):
        ref_loss = do_update()
        print(i, ref_loss)
        losses_ref.append(ref_loss)

    # Re-initialize so the second run starts from the same variable values.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # No `updates` here: running update_op as well would apply the
    # TensorFlow Adam step on top of MpiAdam's and update the variables
    # twice per iteration.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losses_test = []
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
        losses_test.append(l)

    np.testing.assert_allclose(
        np.array(losses_ref), np.array(losses_test), atol=1e-4)