Example #1
    # Assumes legacy Keras module-level imports, e.g.:
    #   from keras import backend as K
    #   from keras.optimizers import Adam
    def actor_optimizer(self):
        # placeholders for the one-hot actions and the advantage estimates
        action = K.placeholder(shape=[None, self.action_size])
        advantages = K.placeholder(shape=(None,))
        # advantages are computed from multi-step (n-step) returns

        policy = self.actor.output

        # probability of the taken action (action is expected to be one-hot)
        action_prob = K.sum(action * policy, axis=1)
        # policy-gradient loss: -mean(log pi(a|s) * advantage)
        cross_entropy = K.log(action_prob + 1e-10) * advantages
        cross_entropy = -K.mean(cross_entropy)

        # add (-entropy) to the loss to encourage exploration
        minus_entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
        minus_entropy = K.mean(minus_entropy)

        # minimizing the loss minimizes cross_entropy and, when the
        # (commented-out) entropy term is enabled, maximizes entropy
        loss = cross_entropy  # + 0.01 * minus_entropy

        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(loss, self.actor.trainable_weights)
        train = K.function([self.actor.input, action, advantages], [loss],
                           updates=updates)
        return train
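
A minimal sketch of how the returned backend function might be called from a training step; `agent`, `states`, `actions`, and `advantages` are illustrative names, not part of the original snippet:

    # Hypothetical call site (all names below are assumptions):
    train_actor = agent.actor_optimizer()
    # states:     batch of observations,   shape (batch, state_size)
    # actions:    one-hot encoded actions, shape (batch, action_size)
    # advantages: n-step advantage values, shape (batch,)
    actor_loss = train_actor([states, actions, advantages])[0]
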
Example #2
    def critic_optimizer(self):
        # placeholder for the multi-step discounted return targets
        discounted_prediction = K.placeholder(shape=(None, ))

        value = self.critic.output

        # loss = MSE(discounted_prediction, value)
        loss = K.mean(K.square(discounted_prediction - value))

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(loss, self.critic.trainable_weights)
        train = K.function([self.critic.input, discounted_prediction], [loss],
                           updates=updates)
        return train
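
Likewise for the critic, a sketch of one way the multi-step discounted targets could be built and fed to the returned function; `agent`, `states`, `rewards`, and the `discounted_returns` helper are assumptions for illustration, not part of the original class:

    import numpy as np

    # Illustrative helper (assumption): discounted returns over one rollout,
    # optionally bootstrapped with the critic's value of the final state.
    def discounted_returns(rewards, gamma=0.99, bootstrap=0.0):
        out = np.zeros(len(rewards), dtype=np.float32)
        running = bootstrap
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            out[t] = running
        return out

    train_critic = agent.critic_optimizer()
    targets = discounted_returns(rewards)            # shape (batch,)
    critic_loss = train_critic([states, targets])[0]
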