コード例 #1
0
 def testInconsistentSupportsAndWeightsParameters(self):
   """Expects a ValueError when supports and weights disagree in width.

   Each supports row holds 5 atoms while each weights row holds only 4; the
   projection call is expected to raise a ValueError whose message matches
   'are incompatible'.
   """
   support_rows = [[0, 2, 4, 6, 8], [3, 4, 5, 6, 7]]
   weight_rows = [[0.1, 0.2, 0.3, 0.2], [0.1, 0.2, 0.3, 0.2]]
   source_supports = tf.constant(support_rows, dtype=tf.float32)
   source_weights = tf.constant(weight_rows, dtype=tf.float32)
   destination = tf.constant([4, 5, 6, 7, 8], dtype=tf.float32)
   with self.assertRaisesRegexp(ValueError, 'are incompatible'):
     rainbow_agent.project_distribution(
         source_supports, source_weights, destination)
コード例 #2
0
 def testZeroDimensionalTargetSupport(self):
   """Expects a ValueError when the target support is a scalar tensor.

   The target support is the rank-0 constant 3; the projection call is
   expected to raise a ValueError whose message matches 'Index out of range'.
   """
   support_rows = [[0, 2, 4, 6, 8], [3, 4, 5, 6, 7]]
   weight_rows = [[0.1, 0.2, 0.3, 0.2, 0.2], [0.1, 0.2, 0.3, 0.2, 0.2]]
   source_supports = tf.constant(support_rows, dtype=tf.float32)
   source_weights = tf.constant(weight_rows, dtype=tf.float32)
   scalar_target = tf.constant(3, dtype=tf.float32)
   with self.assertRaisesRegexp(ValueError, 'Index out of range'):
     rainbow_agent.project_distribution(
         source_supports, source_weights, scalar_target)
コード例 #3
0
    def _build_target_distribution(self):
        """Builds the C51 target distribution as per Bellemare et al. (2017).

        First, we compute the support of the Bellman target, r + gamma Z',
        where Z' is the support of the next state distribution:

          * Evenly spaced in [-vmax, vmax] if the current state is nonterminal;
          * 0 otherwise (duplicated num_atoms times).

        Second, we compute the next-state probabilities, corresponding to the
        action with highest expected value.

        Finally we project the Bellman target (support + probabilities) onto
        the original support.

        Returns:
          target_distribution: tf.tensor, the target distribution from the
            replay.
        """
        batch_size = self._replay.batch_size

        # size of rewards: batch_size x 1
        rewards = self._replay.rewards[:, None]

        # size of tiled_support: batch_size x num_atoms
        tiled_support = tf.tile(self._support, [batch_size])
        tiled_support = tf.reshape(tiled_support,
                                   [batch_size, self._num_atoms])

        # 1.0 for non-terminal transitions, 0.0 for terminal ones.
        is_terminal_multiplier = 1. - tf.cast(self._replay.terminals,
                                              tf.float32)
        # Incorporate terminal state to discount factor, so that terminal
        # transitions contribute only their reward to the target support.
        # size of gamma_with_terminal: batch_size x 1
        gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
        gamma_with_terminal = gamma_with_terminal[:, None]

        # size of target_support: batch_size x num_atoms
        target_support = rewards + gamma_with_terminal * tiled_support

        # size of next_qt_argmax: batch_size x 1
        next_qt_argmax = tf.argmax(
            self._replay_next_target_net_outputs.q_values, axis=1)[:, None]
        # tf.cast replaces the deprecated tf.to_int64; int64 is kept so the
        # indices can be concatenated with the tf.argmax result.
        batch_indices = tf.range(tf.cast(batch_size, tf.int64))[:, None]
        # size of batch_indexed_next_qt_argmax: batch_size x 2
        # Each row is (batch index, greedy action index) for tf.gather_nd.
        batch_indexed_next_qt_argmax = tf.concat(
            [batch_indices, next_qt_argmax], axis=1)

        # size of next_probabilities: batch_size x num_atoms
        next_probabilities = tf.gather_nd(
            self._replay_next_target_net_outputs.probabilities,
            batch_indexed_next_qt_argmax)

        return rainbow_agent.project_distribution(target_support,
                                                  next_probabilities,
                                                  self._support)
コード例 #4
0
 def testProjectFromNonMonotonicSupport(self):
     """Projects one distribution whose support is listed in decreasing order.

     The source atoms run 4, 3, 2, 1, 0 and are projected onto the target
     support [3, 4, 5, 6, 7]; the result must be close to
     [[0.9, 0.1, 0.0, 0.0, 0.0]].
     """
     expected = [[0.9, 0.1, 0.0, 0.0, 0.0]]
     source_support = tf.constant([[4, 3, 2, 1, 0]], dtype=tf.float32)
     source_weights = tf.constant(
         [[0.1, 0.2, 0.1, 0.3, 0.3]], dtype=tf.float32)
     destination = tf.constant([3, 4, 5, 6, 7], dtype=tf.float32)
     projected = rainbow_agent.project_distribution(
         source_support, source_weights, destination)
     with self.test_session() as session:
         tf.global_variables_initializer().run()
         actual = session.run(projected)
         self.assertAllClose(expected, actual)
コード例 #5
0
 def testProjectSingleIdenticalDistribution(self):
     """Projecting onto an identical support must return the same weights.

     Source and target supports are both 0..4, so the projected weights are
     compared against the input weights.
     """
     atom_weights = [0.1, 0.2, 0.1, 0.3, 0.3]
     source_support = tf.constant([[0, 1, 2, 3, 4]], dtype=tf.float32)
     source_weights = tf.constant([atom_weights], dtype=tf.float32)
     destination = tf.constant([0, 1, 2, 3, 4], dtype=tf.float32)
     projected = rainbow_agent.project_distribution(
         source_support, source_weights, destination)
     with self.test_session() as session:
         tf.global_variables_initializer().run()
         actual = session.run(projected)
         self.assertAllClose([atom_weights], actual)
コード例 #6
0
 def testProjectFromNonMonotonicSupport(self):
   """Checks projection of a distribution given on a decreasing support.

   Input atoms are [4, 3, 2, 1, 0]; the target support is [3, 4, 5, 6, 7].
   The evaluated projection must be close to [[0.9, 0.1, 0.0, 0.0, 0.0]].
   """
   want = [[0.9, 0.1, 0.0, 0.0, 0.0]]
   atoms = tf.constant([[4, 3, 2, 1, 0]], dtype=tf.float32)
   probs = tf.constant([[0.1, 0.2, 0.1, 0.3, 0.3]], dtype=tf.float32)
   new_atoms = tf.constant([3, 4, 5, 6, 7], dtype=tf.float32)
   projection_op = rainbow_agent.project_distribution(atoms, probs, new_atoms)
   with self.test_session() as sess:
     tf.global_variables_initializer().run()
     got = sess.run(projection_op)
     self.assertAllClose(want, got)
コード例 #7
0
 def testProjectSingleIdenticalDistribution(self):
   """Checks that projecting onto the same support leaves weights unchanged.

   Both the source and the target support are [0, 1, 2, 3, 4], so the
   evaluated projection is compared with the original weights.
   """
   want_row = [0.1, 0.2, 0.1, 0.3, 0.3]
   atoms = tf.constant([[0, 1, 2, 3, 4]], dtype=tf.float32)
   probs = tf.constant([want_row], dtype=tf.float32)
   new_atoms = tf.constant([0, 1, 2, 3, 4], dtype=tf.float32)
   projection_op = rainbow_agent.project_distribution(atoms, probs, new_atoms)
   with self.test_session() as sess:
     tf.global_variables_initializer().run()
     got = sess.run(projection_op)
     self.assertAllClose([want_row], got)
コード例 #8
0
 def testProjectNewSupportHasInconsistentDeltask(self):
   """Expects a runtime assertion for an unevenly spaced target support.

   The target support [3, 4, 6, 7, 8] has gaps of 1, 2, 1, 1. With
   validate_args=True the graph is built without error, but evaluating it is
   expected to raise tf.errors.InvalidArgumentError matching
   'assertion failed'.
   """
   support_rows = [[0, 2, 4, 6, 8], [3, 4, 5, 6, 7]]
   weight_rows = [[0.1, 0.2, 0.3, 0.2, 0.2], [0.1, 0.2, 0.3, 0.2, 0.2]]
   source_supports = tf.constant(support_rows, dtype=tf.float32)
   source_weights = tf.constant(weight_rows, dtype=tf.float32)
   uneven_target = tf.constant([3, 4, 6, 7, 8], dtype=tf.float32)
   projected = rainbow_agent.project_distribution(
       source_supports, source_weights, uneven_target, validate_args=True)
   with self.test_session() as session:
     tf.global_variables_initializer().run()
     with self.assertRaisesRegexp(
         tf.errors.InvalidArgumentError, 'assertion failed'):
       session.run(projected)
コード例 #9
0
 def testExampleFromCodeComments(self):
   """Projects a batch of two distributions onto the support [4, ..., 8].

   The evaluated projection must be close to
   [[0.8, 0.0, 0.1, 0.0, 0.1], [0.8, 0.1, 0.1, 0.0, 0.0]].
   """
   expected = [
       [0.8, 0.0, 0.1, 0.0, 0.1],
       [0.8, 0.1, 0.1, 0.0, 0.0],
   ]
   support_rows = [[0, 2, 4, 6, 8], [1, 3, 4, 5, 6]]
   weight_rows = [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.2, 0.5, 0.1, 0.1]]
   source_supports = tf.constant(support_rows, dtype=tf.float32)
   source_weights = tf.constant(weight_rows, dtype=tf.float32)
   destination = tf.constant([4, 5, 6, 7, 8], dtype=tf.float32)
   projected = rainbow_agent.project_distribution(
       source_supports, source_weights, destination)
   with self.test_session() as session:
     tf.global_variables_initializer().run()
     actual = session.run(projected)
     self.assertAllClose(expected, actual)
コード例 #10
0
 def testProjectBatchOfDifferentDistributionsWithLargerDelta(self):
     """Projects two distributions onto a target support with spacing 4.

     The target support is [0, 4, 8, 12, 16]; the evaluated projection must
     be close to [[0.2, 0.4, 0.4, 0.0, 0.0], [0.0, 0.0, 0.45, 0.45, 0.1]].
     """
     expected = [
         [0.2, 0.4, 0.4, 0.0, 0.0],
         [0.0, 0.0, 0.45, 0.45, 0.1],
     ]
     support_rows = [[0, 2, 4, 6, 8], [8, 9, 10, 12, 14]]
     weight_rows = [[0.1, 0.2, 0.2, 0.2, 0.3], [0.1, 0.2, 0.4, 0.1, 0.2]]
     source_supports = tf.constant(support_rows, dtype=tf.float32)
     source_weights = tf.constant(weight_rows, dtype=tf.float32)
     destination = tf.constant([0, 4, 8, 12, 16], dtype=tf.float32)
     projected = rainbow_agent.project_distribution(
         source_supports, source_weights, destination)
     with self.test_session() as session:
         tf.global_variables_initializer().run()
         actual = session.run(projected)
         self.assertAllClose(expected, actual)
コード例 #11
0
 def testProjectBatchOfDifferentDistributionsWithLargerDelta(self):
   """Checks batch projection onto a coarser support, [0, 4, 8, 12, 16].

   Two distributions with different supports are projected together; the
   evaluated result must be close to
   [[0.2, 0.4, 0.4, 0.0, 0.0], [0.0, 0.0, 0.45, 0.45, 0.1]].
   """
   want = [[0.2, 0.4, 0.4, 0.0, 0.0], [0.0, 0.0, 0.45, 0.45, 0.1]]
   atoms = tf.constant(
       [[0, 2, 4, 6, 8], [8, 9, 10, 12, 14]], dtype=tf.float32)
   probs = tf.constant(
       [[0.1, 0.2, 0.2, 0.2, 0.3], [0.1, 0.2, 0.4, 0.1, 0.2]],
       dtype=tf.float32)
   new_atoms = tf.constant([0, 4, 8, 12, 16], dtype=tf.float32)
   projection_op = rainbow_agent.project_distribution(atoms, probs, new_atoms)
   with self.test_session() as sess:
     tf.global_variables_initializer().run()
     got = sess.run(projection_op)
     self.assertAllClose(want, got)
コード例 #12
0
 def testProjectBatchOfDifferentDistributions(self):
   """Projects a batch of three distributions onto the support [3, ..., 7].

   The third distribution already lives on the target support, so its
   expected projection equals its input weights. The evaluated result must
   be close to the three expected rows listed below.
   """
   expected = [
       [0.3, 0.3, 0.0, 0.2, 0.2],
       [0.7, 0.3, 0.0, 0.0, 0.0],
       [0.1, 0.2, 0.3, 0.2, 0.2],
   ]
   support_rows = [[0, 2, 4, 6, 8], [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]]
   weight_rows = [
       [0.1, 0.2, 0.3, 0.2, 0.2],
       [0.1, 0.2, 0.1, 0.3, 0.3],
       [0.1, 0.2, 0.3, 0.2, 0.2],
   ]
   source_supports = tf.constant(support_rows, dtype=tf.float32)
   source_weights = tf.constant(weight_rows, dtype=tf.float32)
   destination = tf.constant([3, 4, 5, 6, 7], dtype=tf.float32)
   projected = rainbow_agent.project_distribution(
       source_supports, source_weights, destination)
   with self.test_session() as session:
     tf.global_variables_initializer().run()
     actual = session.run(projected)
     self.assertAllClose(expected, actual)
コード例 #13
0
 def testMultiDimensionalTargetSupportWithPlaceholders(self):
   """Expects a runtime error when a rank-2 target support is fed.

   All inputs are placeholders with unspecified shape, so the graph is built
   with validate_args=True and the nested list [[3]] is only supplied at run
   time. Evaluating the projection is expected to raise
   tf.errors.InvalidArgumentError.
   """
   support_rows = [[0, 2, 4, 6, 8], [3, 4, 5, 6, 7]]
   weight_rows = [[0.1, 0.2, 0.3, 0.2, 0.2], [0.1, 0.2, 0.3, 0.2, 0.2]]
   nested_target = [[3]]
   supports_ph = tf.placeholder(tf.float32, None)
   weights_ph = tf.placeholder(tf.float32, None)
   target_support_ph = tf.placeholder(tf.float32, None)
   projected = rainbow_agent.project_distribution(
       supports_ph, weights_ph, target_support_ph, validate_args=True)
   with self.test_session() as session:
     tf.global_variables_initializer().run()
     with self.assertRaises(tf.errors.InvalidArgumentError):
       feeds = {
           supports_ph: support_rows,
           weights_ph: weight_rows,
           target_support_ph: nested_target,
       }
       session.run(projected, feed_dict=feeds)
コード例 #14
0
 def testUsingPlaceholders(self):
     """Projects a batch of three distributions fed through placeholders.

     Supports, weights and the target support are all placeholders with
     unspecified shape; concrete values are supplied via feed_dict and the
     evaluated projection must be close to the expected rows below.
     """
     expected = [
         [0.3, 0.3, 0.0, 0.2, 0.2],
         [0.7, 0.3, 0.0, 0.0, 0.0],
         [0.1, 0.2, 0.3, 0.2, 0.2],
     ]
     support_rows = [[0, 2, 4, 6, 8], [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]]
     weight_rows = [
         [0.1, 0.2, 0.3, 0.2, 0.2],
         [0.1, 0.2, 0.1, 0.3, 0.3],
         [0.1, 0.2, 0.3, 0.2, 0.2],
     ]
     target_atoms = [3, 4, 5, 6, 7]
     supports_ph = tf.placeholder(tf.float32, None)
     weights_ph = tf.placeholder(tf.float32, None)
     target_support_ph = tf.placeholder(tf.float32, None)
     projected = rainbow_agent.project_distribution(
         supports_ph, weights_ph, target_support_ph)
     with self.test_session() as session:
         tf.global_variables_initializer().run()
         feeds = {
             supports_ph: support_rows,
             weights_ph: weight_rows,
             target_support_ph: target_atoms,
         }
         actual = session.run(projected, feed_dict=feeds)
         self.assertAllClose(expected, actual)
コード例 #15
0
 def testUsingPlaceholders(self):
   """Checks projection when every input is supplied through a placeholder.

   Three distributions are fed at run time and projected onto the support
   [3, 4, 5, 6, 7]; the evaluated result is compared with the expected rows.
   """
   want = [[0.3, 0.3, 0.0, 0.2, 0.2],
           [0.7, 0.3, 0.0, 0.0, 0.0],
           [0.1, 0.2, 0.3, 0.2, 0.2]]
   atoms_value = [[0, 2, 4, 6, 8], [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]]
   probs_value = [[0.1, 0.2, 0.3, 0.2, 0.2],
                  [0.1, 0.2, 0.1, 0.3, 0.3],
                  [0.1, 0.2, 0.3, 0.2, 0.2]]
   new_atoms_value = [3, 4, 5, 6, 7]
   atoms_ph = tf.placeholder(tf.float32, None)
   probs_ph = tf.placeholder(tf.float32, None)
   new_atoms_ph = tf.placeholder(tf.float32, None)
   projection_op = rainbow_agent.project_distribution(atoms_ph, probs_ph,
                                                      new_atoms_ph)
   with self.test_session() as sess:
     tf.global_variables_initializer().run()
     fed_values = {
         atoms_ph: atoms_value,
         probs_ph: probs_value,
         new_atoms_ph: new_atoms_value,
     }
     got = sess.run(projection_op, feed_dict=fed_values)
     self.assertAllClose(want, got)