Example no. 1
def rrm_loss(regrets, action_utilities, ignore_negative_regrets=True):
    # RRM loss evaluated at the regret-matching policy induced by `regrets`.
    regrets = tf.convert_to_tensor(regrets)
    return rrm_loss_given_policy(
        regrets,
        rm_policy(regrets),
        action_utilities,
        ignore_negative_regrets=ignore_negative_regrets)
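For reference, a minimal sketch of the regret-matching transform that `rm_policy` is assumed to implement throughout this listing: keep the positive part of each regret, normalize per row, and fall back to a uniform policy when no regret is positive. The name `rm_policy_sketch` is hypothetical.

import tensorflow as tf

def rm_policy_sketch(regrets):
    # regrets: [batch, num_actions] of floats; returns a policy of the same shape.
    regrets = tf.convert_to_tensor(regrets)
    positive = tf.nn.relu(regrets)
    total = tf.reduce_sum(positive, axis=-1, keepdims=True)
    num_actions = tf.cast(tf.shape(regrets)[-1], regrets.dtype)
    # Rows with no positive regret get a uniform policy; all others are normalized.
    no_positive = tf.cast(tf.equal(total, 0), regrets.dtype)
    return (positive + no_positive) / (total + no_positive * num_actions)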
Example no. 2
    def __init__(self,
                 *args,
                 softmax_temperatures=[],
                 use_cumulative_values=False,
                 **kwargs):
        def f(temp):
            def g(z):
                # Softmax policy over the action columns (all but the last,
                # value column), scaled by the adjusted temperature.
                return tf.nn.softmax(z[:, :-1] /
                                     self._adjusted_temperature(temp))
            return g

        # Regret-matching policy on q - v, plus one softmax policy per temperature.
        policies = ([lambda z: cpea.rm_policy(z[:, :-1] - z[:, -1:])] +
                    list(map(f, softmax_temperatures)))
        super(SplitRrm, self).__init__(policies, *args, **kwargs)
Example no. 3
    def loss(self, predictions, policy, cfv):
        # Split predictions into per-action values q and a scalar baseline v.
        q, v = predictions[:, :-1], predictions[:, -1:]
        r = q - v

        # Regret-matching policy induced by the predicted regrets.
        pi_rm = cpea.rm_policy(r)

        # Squared error between predicted and observed counterfactual values.
        q_diffs = tf.square(q - cfv)
        q_loss = tf.reduce_mean(tf.reduce_sum(q_diffs, axis=1)) / 2.0

        # Baseline target: expected counterfactual value under pi_rm (gradient stopped).
        ev = tf.stop_gradient(tf.reduce_sum(cfv * pi_rm, axis=1,
                                            keepdims=True))

        v_loss = tf.reduce_mean(tf.square(v - ev)) / 2.0
        return q_loss + v_loss
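A hypothetical shape check for the split convention used above (and in Example no. 2): the network emits one column per action plus a trailing value column, so with 3 actions `predictions` has 4 columns.

import tensorflow as tf

# Hypothetical batch of one information state with 3 actions + 1 value column.
predictions = tf.constant([[1.0, -0.5, 2.0, 0.25]])
q, v = predictions[:, :-1], predictions[:, -1:]   # q: [1, 3], v: [1, 1]
r = q - v                                         # per-action regret estimates: [1, 3]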
Example no. 4
def rrm_utilities(model, contexts, action_utilities):
    return utility(rm_policy(model(contexts)), action_utilities)
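`utility` is not shown in this listing; based on how `cpea.utility` is used in Example no. 9 (and the explicit `tf.reduce_sum(cfv * policy, ...)` in Examples no. 7 and 8), it is assumed to be the row-wise expected utility under a policy. A hypothetical sketch:

import tensorflow as tf

def utility_sketch(policy, action_utilities):
    # policy, action_utilities: [batch, num_actions]; returns [batch, 1].
    return tf.reduce_sum(policy * action_utilities, axis=1, keepdims=True)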
Example no. 5
    def policy_activation(self, pre_activations):
        return rm_policy(pre_activations)
Example no. 6
    def meta_policy(self):
        return cpea.rm_policy(self.meta_qregrets)
Example no. 7
    def loss(self, predictions, policy, cfv):
        # Target: regret-matching distribution over the instantaneous regrets
        # (counterfactual values minus their expectation under the current policy),
        # with the gradient stopped.
        r = tf.stop_gradient(
            cpea.rm_policy(cfv -
                           tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
        error = tf.square(r - predictions) / 2.0
        return tf.reduce_mean(tf.reduce_sum(error, axis=1))
Example no. 8
    def loss(self, predictions, policy, cfv):
        # Same regret-matching target as Example no. 7, gradient stopped.
        r = tf.stop_gradient(
            cpea.rm_policy(cfv -
                           tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
        # Cross-entropy between the target distribution and the current policy.
        log_policy = tf.math.log(tf.clip_by_value(policy, 1e-15, 1 - 1e-15))
        return -tf.reduce_mean(tf.reduce_sum(r * log_policy, axis=1))
Example no. 9
    def loss(self, predictions, policy, cfv):
        pi = cpea.rm_policy(predictions)
        # Instantaneous regrets relative to the expected value under pi.
        inst_r = cfv - cpea.utility(pi, cfv)
        # Regret target floored at -relu(predictions), gradient stopped.
        inst_q = tf.stop_gradient(tf.maximum(inst_r, -tf.nn.relu(predictions)))
        return tf.reduce_mean(
            tf.reduce_sum(tf.square(predictions - inst_q), axis=1)) / 2.0
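A hypothetical standalone smoke test of the clipped-regret loss above, with made-up shapes (a batch of 2 states, 3 actions each); the regret-matching policy is computed inline, as in the sketch after Example no. 1, and no uniform fallback is needed because the random inputs are positive.

import tensorflow as tf

batch, num_actions = 2, 3
predictions = tf.random.uniform([batch, num_actions])  # predicted cumulative regrets
cfv = tf.random.uniform([batch, num_actions])          # counterfactual action values

# Regret-matching policy from the (positive) predictions.
positive = tf.nn.relu(predictions)
pi = positive / tf.reduce_sum(positive, axis=1, keepdims=True)

baseline = tf.reduce_sum(cfv * pi, axis=1, keepdims=True)   # expected cfv under pi
inst_r = cfv - baseline                                     # instantaneous regrets
inst_q = tf.stop_gradient(tf.maximum(inst_r, -tf.nn.relu(predictions)))
loss = tf.reduce_mean(tf.reduce_sum(tf.square(predictions - inst_q), axis=1)) / 2.0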