Example #1
def make_loss_ops(a_logits, graph_v, entropy_bonus, value_loss_coef, debug):
    actions = tf.placeholder(tf.int64, [None])
    returns = tf.placeholder(tf.float32, [None])

    # For the policy loss, we want to calculate log π(action_t | state_t).
    # That means we want log(action_prob_0 | state_t) if action_t = 0,
    #                    log(action_prob_1 | state_t) if action_t = 1, etc.
    # It turns out that's exactly what a cross-entropy loss gives us!
    # The cross-entropy of a distribution p wrt a distribution q is:
    #   - sum over x: p(x) * log(q(x))    (natural log, which is what TF uses)
    # Note that for a categorical distribution, considering the
    # cross-entropy of the ground truth distribution wrt the
    # distribution of predicted class probabilities, p(x) is 1 if the
    # ground truth label is x and 0 otherwise. We therefore have:
    #   - log(q(0)) if ground truth label = 0,
    #   - log(q(1)) if ground truth label = 1, etc.
    # So here, by taking the cross-entropy of the distribution of
    # action 'labels' wrt the produced action probabilities, we can get
    # exactly what we want :)
    _neglogprob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=a_logits, labels=actions)
    with tf.control_dependencies([tf.assert_rank(_neglogprob, 1)]):
        neglogprob = _neglogprob

    if debug:
        neglogprob = tf.Print(neglogprob, [actions],
                              message='\ndebug actions:',
                              summarize=2147483647)

    _advantage = returns - graph_v
    with tf.control_dependencies([tf.assert_rank(_advantage, 1)]):
        advantage = _advantage

    if debug:
        advantage = tf.Print(advantage, [returns],
                             message='\ndebug returns:',
                             summarize=2147483647)

    policy_entropy = tf.reduce_mean(logit_entropy(a_logits))

    # Note that the advantage is treated as a constant for the
    # policy network update step.
    # Note also that we're calculating advantages on-the-fly using
    # the value approximator. This might make us worry: what if we're
    # using the loss for training, and the advantages are calculated
    # /after/ training has changed the network? But for A3C, we don't
    # need to worry, because we compute the gradients separately from
    # applying them.
    # We want to maximise entropy, which is the same as
    # minimising negative entropy.
    policy_loss = neglogprob * tf.stop_gradient(advantage)
    policy_loss = tf.reduce_mean(policy_loss) - entropy_bonus * policy_entropy
    value_loss = value_loss_coef * tf.reduce_mean(0.5 * advantage ** 2)
    loss = policy_loss + value_loss

    return actions, returns, advantage, policy_entropy, \
           policy_loss, value_loss, loss
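A quick way to see the cross-entropy trick used above in isolation: for a single state, sparse softmax cross-entropy is just the negative log of the probability the softmax assigns to the chosen action. The check below is a NumPy-only sketch with made-up values; the helper name is illustrative and not part of the source.

import numpy as np

def neglogprob_manual(logits, action):
    # Softmax (shifted by the max logit for numerical stability),
    # then negative log of the chosen action's probability.
    z = np.exp(logits - np.max(logits))
    probs = z / np.sum(z)
    return -np.log(probs[action])

logits = np.array([0.5, 1.5, -0.3])
print(neglogprob_manual(logits, action=1))
# Matches what tf.nn.sparse_softmax_cross_entropy_with_logits returns for
# logits=[[0.5, 1.5, -0.3]], labels=[1].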
Example #2
 def test_stability(self):
     """
     Test an example which would normally break numerical stability.
     """
     logits = np.array([0., 1000.])
     expected_entropy = 0.
     actual_entropy = self.sess.run(logit_entropy(logits))
     np.testing.assert_approx_equal(actual_entropy,
                                    expected_entropy,
                                    significant=5)
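The logit_entropy function under test is not shown in this listing. A minimal sketch of the stability trick this test exercises might look like the following (TF 1.x style, with an assumed name); tf.nn.log_softmax subtracts the max logit before exponentiating, so logits like [0., 1000.] don't overflow.

import tensorflow as tf

def logit_entropy_sketch(logits):
    # Work entirely from log-probabilities to stay numerically stable.
    log_probs = tf.nn.log_softmax(logits)
    probs = tf.exp(log_probs)
    # One entropy value per row of logits (the last axis is the action axis);
    # keepdims gives the (batch, 1) shape the batched test below expects.
    return -tf.reduce_sum(probs * log_probs, axis=-1, keepdims=True)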
Example #3
 def test_basic(self):
     """
     Manually calculate entropy, and check the result matches.
     """
     logits = np.array([1., 2., 3., 4.])
     probs = np.exp(logits) / np.sum(np.exp(logits))
     expected_entropy = -np.sum(probs * np.log(probs))
     actual_entropy = self.sess.run(logit_entropy(logits))
     np.testing.assert_approx_equal(actual_entropy,
                                    expected_entropy,
                                    significant=5)
Example #4
 def test_batch(self):
     """
     Make sure we get the right result when calculating entropies on a
     batch of logits.
     """
     # shape is 2 (batch size) x 4
     logits = np.array([[1., 2., 3., 4.], [1., 2., 2., 1.]])
     probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
     expected_entropy = -np.sum(
         probs * np.log(probs), axis=1, keepdims=True)
     actual_entropy = self.sess.run(logit_entropy(logits))
     np.testing.assert_allclose(actual_entropy, expected_entropy, atol=1e-4)
Example #5
 def test_gradient_descent(self):
     """
     Check that if we start with a distribution and use gradient descent
     to maximise entropy, we end up with a maximum-entropy distribution.
     """
     logits = tf.Variable([1., 2., 3., 4., 5.])
     neg_ent = -logit_entropy(logits)
     train_op = tf.train.AdamOptimizer().minimize(neg_ent)
     self.sess.run(tf.global_variables_initializer())
     for i in range(10000):
         self.sess.run(train_op)
     expected = [0.2, 0.2, 0.2, 0.2, 0.2]  # maximum entropy distribution
     actual = self.sess.run(tf.nn.softmax(logits))
     np.testing.assert_allclose(actual, expected, atol=1e-4)
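The expected result above is the uniform distribution, which is what maximum entropy means for a categorical over 5 actions; its entropy is ln(5) ≈ 1.609. A one-line NumPy check:

import numpy as np

uniform = np.full(5, 0.2)
print(-np.sum(uniform * np.log(uniform)))  # ~1.6094 = ln(5)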
Example #6
def create_network(scope, debug=False):
    with tf.variable_scope(scope):
        graph_s = tf.placeholder(tf.float32, [None, 84, 84, 4])
        graph_action = tf.placeholder(tf.int64, [None])
        graph_r = tf.placeholder(tf.float32, [None])

        x = tf.layers.conv2d(inputs=graph_s,
                             filters=32,
                             kernel_size=8,
                             strides=4,
                             activation=tf.nn.relu)

        if debug:
            # Dump observations as fed into the network to stderr,
            # for viewing with show_observations.py.
            x = tf.Print(
                x,
                [graph_s],
                message='\ndebug observations:',
                # max no. of values to display; max int32
                summarize=2147483647)

        x = tf.layers.conv2d(inputs=x,
                             filters=64,
                             kernel_size=4,
                             strides=2,
                             activation=tf.nn.relu)

        x = tf.layers.conv2d(inputs=x,
                             filters=64,
                             kernel_size=3,
                             strides=1,
                             activation=tf.nn.relu)

        w, h, f = x.get_shape()[1:]
        x = tf.reshape(x, [-1, int(w * h * f)])

        x = tf.layers.dense(inputs=x, units=512, activation=tf.nn.relu)

        a_logits = tf.layers.dense(inputs=x, units=N_ACTIONS, activation=None)

        a_softmax = tf.nn.softmax(a_logits)

        graph_v = tf.layers.dense(inputs=x, units=1, activation=None)
        # Shape is currently (?, 1)
        # Convert to just (?)
        graph_v = graph_v[:, 0]

        advantage = graph_r - graph_v

        if debug:
            advantage = tf.Print(advantage, [graph_r],
                                 message='\ndebug returns:',
                                 summarize=2147483647)

        p = 0
        for i in range(N_ACTIONS):
            p += (tf.cast(tf.equal(graph_action, i), tf.float32)
                  * a_softmax[:, i])

        if debug:
            p = tf.Print(p, [graph_action],
                         message='\ndebug actions:',
                         summarize=2147483647)

        # Log probability: higher is better for actions we want to encourage
        # Negative log probability: lower is better for actions we want to
        #                           encourage
        # 1e-7: prevent log(0)
        nlp = -1 * tf.log(p + 1e-7)

        check_nlp = tf.assert_rank(nlp, 1)
        check_advantage = tf.assert_rank(advantage, 1)
        with tf.control_dependencies([check_nlp, check_advantage]):
            # Note that the advantage is treated as a constant for the
            # policy network update step.
            # Note also that we're calculating advantages on-the-fly using
            # the value approximator. This might make us worry: what if we're
            # using the loss for training, and the advantages are calculated
            # /after/ training has changed the network? But for A3C, we don't
            # need to worry, because we compute the gradients separately from
            # applying them.
            policy_loss = nlp * tf.stop_gradient(advantage)
            policy_loss = tf.reduce_sum(policy_loss)

            policy_entropy = logit_entropy(a_logits)
            # We want to maximise entropy, which is the same as
            # minimising negative entropy
            policy_loss -= tf.reduce_sum(BETA * policy_entropy)

            value_loss = advantage**2
            value_loss = tf.reduce_sum(value_loss)

        network = Network(s=graph_s,
                          a=graph_action,
                          r=graph_r,
                          a_softmax=a_softmax,
                          graph_v=graph_v,
                          policy_loss=policy_loss,
                          value_loss=value_loss,
                          policy_entropy=policy_entropy)

        return network
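create_network leans on several module-level names that don't appear in this listing: N_ACTIONS, BETA, logit_entropy, and a Network container. A hypothetical usage sketch with those pieces stubbed in might look like the following. Note that with the conv layers' default 'valid' padding, the stack maps 84x84x4 -> 20x20x32 -> 9x9x64 -> 7x7x64, so the flatten feeds 3136 features into the dense layer.

import collections
import numpy as np
import tensorflow as tf

# Stubs for names the listing assumes are defined elsewhere in the module.
# logit_entropy is also required; see the stability sketch after Example #2.
N_ACTIONS = 4
BETA = 0.01
Network = collections.namedtuple(
    'Network',
    ['s', 'a', 'r', 'a_softmax', 'graph_v',
     'policy_loss', 'value_loss', 'policy_entropy'])

network = create_network('worker_0')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    dummy_obs = np.zeros((1, 84, 84, 4), dtype=np.float32)
    probs = sess.run(network.a_softmax, feed_dict={network.s: dummy_obs})
    print(probs.shape)  # (1, N_ACTIONS)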