Example #1
 def test_rewards_to_go(self):
     time_steps = 4
     # [1., 1., 1., 1.]
     rewards = np.ones((time_steps, ))
     # No discounting.
     self.assertAllEqual(ppo.rewards_to_go(rewards, gamma=1.0),
                         np.array([4., 3., 2., 1.]))
     # Discounting.
     self.assertAllEqual(ppo.rewards_to_go(rewards, gamma=0.5),
                         np.array([1.875, 1.75, 1.5, 1.]))
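
For reference, the recurrence this first test exercises is r2g[t] = r[t] + gamma * r2g[t+1], i.e. a reverse discounted cumulative sum. A minimal 1-D sketch (not the `ppo` module's actual implementation; the helper name `rewards_to_go_1d` is made up here) that reproduces the expected values above:

    import numpy as np

    def rewards_to_go_1d(rewards, gamma):
      # r2g[t] = sum_{j >= t} gamma**(j - t) * rewards[j], accumulated right-to-left.
      r2g = np.zeros_like(rewards, dtype=np.float64)
      acc = 0.0
      for t in reversed(range(len(rewards))):
        acc = rewards[t] + gamma * acc
        r2g[t] = acc
      return r2g

    # rewards_to_go_1d(np.ones(4), gamma=1.0) -> [4., 3., 2., 1.]
    # rewards_to_go_1d(np.ones(4), gamma=0.5) -> [1.875, 1.75, 1.5, 1.]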
Example #2
    def test_rewards_to_go(self):
        rewards = np.array([
            [1, 2, 4, 8, 16, 32, 64, 128],
            [1, 1, 1, 1, 1, 1, 1, 1],
        ])

        rewards_mask = np.array([
            [1, 1, 1, 1, 1, 0, 0, 0],
            [1, 1, 1, 1, 1, 1, 1, 0],
        ])

        gamma = 0.5

        rewards_to_go = ppo.rewards_to_go(rewards, rewards_mask, gamma)

        self.assertAllEqual(
            np.array([
                [5, 8, 12, 16, 16, 0, 0, 0],
                [1.984375, 1.96875, 1.9375, 1.875, 1.75, 1.5, 1.0, 0],
            ]), rewards_to_go)
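
Example #2 adds a `rewards_mask` argument: padded time-steps are zeroed out before the backward scan, so a padded position's rewards-to-go is 0 and earlier positions accumulate only unmasked rewards. A batched sketch under that assumption (again a hypothetical reference, not the library code):

    import numpy as np

    def masked_rewards_to_go(rewards, rewards_mask, gamma):
      # Zero out padded steps, then do the same right-to-left discounted sum per row.
      masked = rewards * rewards_mask
      r2g = np.zeros(masked.shape, dtype=np.float64)
      acc = np.zeros(masked.shape[0])
      for t in reversed(range(masked.shape[1])):
        acc = masked[:, t] + gamma * acc
        r2g[:, t] = acc
      return r2g

With gamma = 0.5 this reproduces the expected array in the test above, e.g. row 0 gives [5, 8, 12, 16, 16, 0, 0, 0].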
Example #3
  def test_rewards_to_go_really_long_sequences(self):
    T = 1200  # pylint: disable=invalid-name

    rewards = np.random.uniform(1e-3, 1e-2, (1, T))

    # Make a mask that keeps only the first `L` steps and zeros out the rest.
    L = 36  # pylint: disable=invalid-name
    assert L < T
    rewards_mask = np.ones_like(rewards)
    rewards_mask[0, L:] = 0

    gamma = 0.94

    actual_r2g = ppo.rewards_to_go(rewards, rewards_mask, gamma).reshape(-1)

    # Let's compute r2g the slow way.
    masked_rewards = (rewards_mask * rewards).reshape(-1)
    expected_r2g = np.zeros_like(masked_rewards)
    for t in range(T):
      for j in range(t, T):
        expected_r2g[t] += (gamma**(j - t)) * masked_rewards[j]

    self.assertAllClose(expected_r2g, actual_r2g)
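
The double loop above is the O(T**2) definition written out directly; the point of the long-sequence test is that `ppo.rewards_to_go` matches it while (presumably) computing the result with a single O(T) backward scan. If a vectorized brute-force reference is preferred over the nested loops, one alternative sketch (hypothetical helper, same definitions as the test) is:

    import numpy as np

    def brute_force_rewards_to_go(masked_rewards, gamma):
      # discounts[t, j] = gamma**(j - t) for j >= t, else 0.
      T = masked_rewards.shape[-1]
      delta = np.arange(T)[None, :] - np.arange(T)[:, None]
      discounts = np.where(delta >= 0, float(gamma) ** np.clip(delta, 0, None), 0.0)
      return masked_rewards @ discounts.T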