Esempio n. 1
0
  def test_clipped_objective(self):
    """Compares `ppo.clipped_objective` with a hand-computed objective.

    The fixture mixes positive and negative advantages. The expected value
    is built step by step as

      min(ratio * advantage, clipped_ratio * advantage) * mask

    where the fixture takes `clipped_ratio` to be the ratio clipped to
    [1 - epsilon, 1 + epsilon]. Every hand-written table is re-derived with
    NumPy before the function under test is called, so a typo in the fixture
    fails on its own assertion rather than looking like a bug in `ppo`.
    """
    probab_ratios = np.array([
        [1.5, 2.0, 0.5, 0.7],
        [2.5, 2.0, 0.1, 1.0],
    ])

    advantages = np.array([
        [0.1, -0.1, 0.5, 0.7],
        [2.0, -2.0, 2.0, 2.0],
    ])

    # Entries with a 0 are multiplied out of the expected objective.
    mask = np.array([[1, 1, 0, 0], [1, 1, 1, 0]])

    epsilon = 0.1

    # probab_ratios clipped to [1 - epsilon, 1 + epsilon] = [0.9, 1.1].
    clipped_probab_ratios = np.array([
        [1.1, 1.1, 0.9, 0.9],
        [1.1, 1.1, 0.9, 1.0],
    ])

    # advantages * probab_ratios
    advantages_x_probab_ratios = np.array([
        [0.15, -0.2, 0.25, 0.49],
        [5.00, -4.0, 0.20, 2.00],
    ])

    # advantages * clipped_probab_ratios
    advantages_x_clipped_probab_ratios = np.array([
        [0.11, -0.11, 0.45, 0.63],
        [2.20, -2.20, 1.80, 2.00],
    ])

    # Element-wise minimum of the two products above.
    minimums = np.array([
        [0.11, -0.2, 0.25, 0.49],
        [2.20, -4.0, 0.20, 2.00],
    ])

    # minimums * mask
    objective = np.array([
        [0.11, -0.2, 0.0, 0.],
        [2.20, -4.0, 0.2, 0.],
    ])

    # Assert that we computed things correctly in this test, one table at a
    # time so a failure points at the exact hand-written value that is off.
    self.assertAllClose(
        np.clip(probab_ratios, 1 - epsilon, 1 + epsilon),
        clipped_probab_ratios)
    self.assertAllClose(probab_ratios * advantages,
                        advantages_x_probab_ratios)
    self.assertAllClose(clipped_probab_ratios * advantages,
                        advantages_x_clipped_probab_ratios)
    self.assertAllClose(
        np.minimum(advantages_x_probab_ratios,
                   advantages_x_clipped_probab_ratios),
        minimums)
    self.assertAllClose(minimums * mask, objective)

    # The actual check of the function under test.
    self.assertAllClose(
        objective,
        ppo.clipped_objective(probab_ratios, advantages, mask, epsilon))
Esempio n. 2
0
    def test_clipped_objective(self):
        """Compares `ppo.clipped_objective` with a hand-computed objective.

        All advantages in this fixture are positive, so the expected value
        is written as

            advantages * min(ratio, clipped_ratio) * mask

        where the fixture takes `clipped_ratio` to be the ratio clipped to
        [1 - epsilon, 1 + epsilon]. Each hand-written table is re-derived
        with NumPy before the function under test is called, so a typo in
        the fixture fails on its own assertion.
        """
        probab_ratios = np.array([
            [1.5, 2.0, 0.5, 0.7],
            [2.5, 2.0, 0.1, 1.0],
        ])

        advantages = np.array([
            [0.1, 0.1, 0.5, 0.7],
            [2.0, 2.0, 2.0, 2.0],
        ])

        # Entries with a 0 are multiplied out of the expected objective.
        mask = np.array([[1, 1, 0, 0], [1, 1, 1, 0]])

        epsilon = 0.1

        # probab_ratios clipped to [1 - epsilon, 1 + epsilon] = [0.9, 1.1].
        clipped_probab_ratios = np.array([
            [1.1, 1.1, 0.9, 0.9],
            [1.1, 1.1, 0.9, 1.0],
        ])

        # Element-wise minimum of probab_ratios and clipped_probab_ratios.
        minimums = np.array([
            [1.1, 1.1, 0.5, 0.7],
            [1.1, 1.1, 0.1, 1.0],
        ])

        # advantages * minimums * mask
        objective = np.array([
            [0.11, 0.11, 0.0, 0.0],
            [2.2, 2.2, 0.2, 0.0],
        ])

        # Assert that we computed things correctly in this test, tying every
        # hand-written table back to probab_ratios and epsilon.
        self.assertAllClose(
            np.clip(probab_ratios, 1 - epsilon, 1 + epsilon),
            clipped_probab_ratios)
        self.assertAllClose(
            np.minimum(probab_ratios, clipped_probab_ratios), minimums)
        self.assertAllClose(advantages * mask * minimums, objective)

        # The actual check of the function under test.
        self.assertAllClose(
            objective,
            ppo.clipped_objective(probab_ratios, advantages, mask, epsilon))