Exemple #1
0
  def test_compute_probab_ratios(self):
    p_old = np.array([[
        [np.log(0.1), np.log(0.2), np.log(0.6), np.log(0.1)],
        [np.log(0.4), np.log(0.1), np.log(0.4), np.log(0.1)],
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
        [np.log(0.1), np.log(0.2), np.log(0.6), np.log(0.1)],
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
    ], [
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
        [np.log(0.1), np.log(0.1), np.log(0.4), np.log(0.4)],
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
        [np.log(0.1), np.log(0.2), np.log(0.6), np.log(0.1)],
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
    ]])

    p_new = np.array([[
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
        [np.log(0.4), np.log(0.1), np.log(0.1), np.log(0.3)],
        [np.log(0.1), np.log(0.2), np.log(0.1), np.log(0.6)],
        [np.log(0.3), np.log(0.1), np.log(0.5), np.log(0.1)],
        [np.log(0.1), np.log(0.2), np.log(0.1), np.log(0.6)],
    ], [
        [np.log(0.1), np.log(0.2), np.log(0.1), np.log(0.6)],
        [np.log(0.1), np.log(0.1), np.log(0.2), np.log(0.6)],
        [np.log(0.3), np.log(0.1), np.log(0.3), np.log(0.3)],
        [np.log(0.1), np.log(0.2), np.log(0.1), np.log(0.6)],
        [np.log(0.1), np.log(0.2), np.log(0.1), np.log(0.6)],
    ]])

    actions = np.array([[1, 2, 0, 1], [0, 3, 3, 0]])

    mask = np.array([[1, 1, 0, 0], [1, 1, 1, 0]])

    probab_ratios = ppo.compute_probab_ratios(p_new, p_old, actions, mask)

    self.assertAllClose(
        np.array([
            [0.1 / 0.2, 0.1 / 0.4, 0.0, 0.0],
            [0.1 / 0.3, 0.6 / 0.4, 0.3 / 0.1, 0.0],
        ]), probab_ratios)
Exemple #2
0
    def test_compute_probab_ratios(self):
        p_old = np.array([[
            [0.1, 0.2, 0.6, 0.1],
            [0.4, 0.1, 0.4, 0.1],
            [0.3, 0.1, 0.5, 0.1],
            [0.1, 0.2, 0.6, 0.1],
        ],
                          [
                              [0.3, 0.1, 0.5, 0.1],
                              [0.1, 0.1, 0.4, 0.4],
                              [0.3, 0.1, 0.5, 0.1],
                              [0.1, 0.2, 0.6, 0.1],
                          ]])

        p_new = np.array([[
            [0.3, 0.1, 0.5, 0.1],
            [0.4, 0.1, 0.1, 0.3],
            [0.1, 0.2, 0.1, 0.6],
            [0.3, 0.1, 0.5, 0.1],
        ],
                          [
                              [0.1, 0.2, 0.1, 0.6],
                              [0.1, 0.1, 0.2, 0.6],
                              [0.3, 0.1, 0.3, 0.3],
                              [0.1, 0.2, 0.1, 0.6],
                          ]])

        actions = np.array([[1, 2, 0, 1], [0, 3, 3, 0]])

        mask = np.array([[1, 1, 0, 0], [1, 1, 1, 0]])

        probab_ratios = ppo.compute_probab_ratios(p_old, p_new, actions, mask)

        self.assertAllClose(
            np.array([
                [0.1 / 0.2, 0.1 / 0.4, 0.0, 0.0],
                [0.1 / 0.3, 0.6 / 0.4, 0.3 / 0.1, 0.0],
            ]), probab_ratios)