  def testEntropy(self, is_multi_actions):
    with self.test_session() as sess:
      # Large values check numerical stability through the logs
      policy_logits_np = np.array([[0, 1], [1, 2], [0, 2], [1, 1], [0, -1000],
                                   [0, 1000]])
      if is_multi_actions:
        num_action_components = 3
        policy_logits_nest = [tf.constant(policy_logits_np, dtype=tf.float32)
                              for _ in xrange(num_action_components)]
      else:
        num_action_components = 1
        policy_logits_nest = tf.constant(policy_logits_np, dtype=tf.float32)

      entropy_op = pg_ops.discrete_policy_entropy_loss(policy_logits_nest)
      entropy = entropy_op.extra.entropy
      self.assertEqual(entropy.get_shape(), tf.TensorShape(6))
      # Get these reference values in Torch with:
      #   c = nnd.EntropyCriterion()
      #   s = nn.LogSoftMax()
      #   result = c:forward(s:forward(logits))
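      # Equivalently, these are just the per-row softmax entropies
      # H = -sum_a p_a * log(p_a) with p = softmax(logits); e.g.
      # softmax([0, 1]) = [0.2689, 0.7311] gives H ~= 0.5822, and the
      # near-deterministic rows give H ~= 0.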
      expected_entropy = num_action_components * np.array(
          [0.58220309, 0.58220309, 0.36533386, 0.69314718, 0, 0])
      self.assertAllClose(sess.run(entropy),
                          expected_entropy,
                          atol=1e-4)
  def testGradient(self, is_multi_actions):
    with self.test_session() as sess:
      policy_logits_np = np.array([[0, 1], [1, 2], [0, 2], [1, 1], [0, -1000],
                                   [0, 1000]])
      if is_multi_actions:
        num_action_components = 3
        policy_logits_nest = [tf.constant(policy_logits_np, dtype=tf.float32)
                              for _ in xrange(num_action_components)]
      else:
        num_action_components = 1
        policy_logits_nest = tf.constant(policy_logits_np, dtype=tf.float32)

      entropy_op = pg_ops.discrete_policy_entropy_loss(policy_logits_nest)
      entropy = entropy_op.extra.entropy
      # Counterintuitively, the gradient goes to 0 as the policy becomes
      # deterministic, which is why the gradients for the large-logit cases are
      # `[0, 0]`. Strictly they should be non-zero, but they underflow to zero
      # once we run out of float32 precision.
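      # Analytically, with p = softmax(z) and H = -sum_a p_a * log(p_a), the
      # gradient is dH/dz_a = -p_a * (log(p_a) + H); e.g. logits [0, 1] give
      # [0.1966, -0.1966].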
      expected_gradients = np.array([[0.1966119, -0.1966119],
                                     [0.1966119, -0.1966119],
                                     [0.2099872, -0.2099872],
                                     [0, 0],
                                     [0, 0],
                                     [0, 0]])
      for policy_logits in nest.flatten(policy_logits_nest):
        gradients = tf.gradients(entropy, policy_logits)
        grad_policy_logits = sess.run(gradients[0])
        self.assertAllClose(grad_policy_logits,
                            expected_gradients,
                            atol=1e-4)
  def testShapeInference3D(self, sequence_length, batch_size, num_actions,
                           normalise):
    T, B, A = sequence_length, batch_size, num_actions  # pylint: disable=invalid-name
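    # The entropy and loss are computed per timestep and per batch element,
    # reducing only over the action dimension, so both should have shape
    # [T, B].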
    op = pg_ops.discrete_policy_entropy_loss(
        policy_logits=tf.placeholder(tf.float32, shape=[T, B, A]),
        normalise=normalise)
    op.extra.entropy.get_shape().assert_is_compatible_with([T, B])
    op.loss.get_shape().assert_is_compatible_with([T, B])
  def testNormalisation(self, num_actions):
    with self.test_session() as sess:
      if isinstance(num_actions, list):
        policy_logits = [tf.constant([[1.0] * n], dtype=tf.float32)
                         for n in num_actions]
      else:
        policy_logits = tf.constant(
            [[1.0] * num_actions], dtype=tf.float32)
      entropy_op = pg_ops.discrete_policy_entropy_loss(
          policy_logits, normalise=True)
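      # Uniform logits give maximum entropy, so with `normalise=True` the
      # (negative-entropy) loss should be exactly -1, whatever the number of
      # actions or action components.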
      self.assertAllClose(sess.run(entropy_op.loss), [-1.0])
  def testShapeInference2D(self, batch_size, num_actions, normalise):
    policy_logits = tf.placeholder(tf.float32, shape=[batch_size, num_actions])
    op = pg_ops.discrete_policy_entropy_loss(policy_logits, normalise=normalise)
    op.extra.entropy.get_shape().assert_is_compatible_with([batch_size])
    op.loss.get_shape().assert_is_compatible_with([batch_size])