Example #1
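(All snippets below appear to be TF1-style unit tests: self.test_session() suggests a tf.test.TestCase subclass. They presuppose import numpy as np and a helper to_tf that presumably wraps a NumPy array in a float32 tensor; the build_* functions under test come from the surrounding repository.)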
    def test_success(self):
        nd_q1_t = np.random.random((4, 1))
        nd_q2_t = np.random.random((4, 1))

        q1_t = to_tf(nd_q1_t)
        q2_t = to_tf(nd_q2_t)

        loss = build_actor_loss(q1_t, q2_t)

        with self.test_session() as sess:
            nd_loss = np.mean(np.min(np.hstack([nd_q1_t, nd_q2_t]), axis=1))
            assert np.allclose(sess.run(loss), nd_loss)
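The assertion pins down the expected math: the loss is the batch mean of the elementwise minimum of the two critic outputs. A minimal sketch of a build_actor_loss consistent with this test (not necessarily the repository's actual implementation):

    import tensorflow as tf

    def build_actor_loss(q1_t, q2_t):
        # Elementwise minimum of the twin critics, averaged over the batch.
        return tf.reduce_mean(tf.minimum(q1_t, q2_t))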
Example #2
    def test_success(self):
        nd_q1_t = np.random.random((4, 1))
        nd_q2_t = np.random.random((4, 1))
        nd_log_prob_t = np.random.random((4, 1))
        q1_t = to_tf(nd_q1_t)
        q2_t = to_tf(nd_q2_t)
        log_prob_t = to_tf(nd_log_prob_t)

        loss = build_pi_loss(log_prob_t, q1_t, q2_t)

        with self.test_session() as sess:
            q_t = np.minimum(nd_q1_t, nd_q2_t)
            answer = np.mean(nd_log_prob_t - q_t)
            assert np.allclose(sess.run(loss), answer)
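The expected value mean(log_prob - min(q1, q2)) is the SAC-style policy objective (with the entropy temperature folded in or set to 1). A sketch consistent with the assertion, again using the twin-critic minimum:

    import tensorflow as tf

    def build_pi_loss(log_prob_t, q1_t, q2_t):
        # SAC-style policy loss: log-probability minus the pessimistic Q estimate.
        return tf.reduce_mean(log_prob_t - tf.minimum(q1_t, q2_t))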
Example #3
    def test_success_with_negative_clipped(self):
        epsilon = np.random.random()
        nd_old_values = np.random.random((4, 1))
        nd_values = nd_old_values - epsilon * 1.1
        nd_returns = -np.random.random((4, 1))
        values = to_tf(nd_values)
        returns = to_tf(nd_returns)
        old_values = to_tf(nd_old_values)

        loss = build_value_loss(values, returns, old_values, epsilon, 0.5)

        with self.test_session() as sess:
            answer = 0.5 * np.mean((nd_returns - (nd_old_values - epsilon)) ** 2)
            assert np.allclose(sess.run(loss), answer)
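Here nd_values sits more than epsilon below nd_old_values, so the expected loss is measured against the clipped prediction old_values - epsilon. Together with Example #5 (which exercises the unclipped branch), the simplest formulation consistent with both tests clips the value prediction into an epsilon-band around the old values and scores only the clipped prediction; note this differs from the PPO2 variant that takes the elementwise maximum of clipped and unclipped squared errors, which would not match this test for all random draws. A sketch:

    import tensorflow as tf

    def build_value_loss(values, returns, old_values, epsilon, value_factor):
        # Clip the new value prediction into [old - epsilon, old + epsilon],
        # then score the clipped prediction against the returns.
        clipped = old_values + tf.clip_by_value(
            values - old_values, -epsilon, epsilon)
        return value_factor * tf.reduce_mean(tf.square(returns - clipped))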
Example #4
    def test_success(self):
        nd_q1_t = np.random.random((4, 1))
        nd_q2_t = np.random.random((4, 1))
        nd_target = np.random.random((4, 1))

        q1_t = to_tf(nd_q1_t)
        q2_t = to_tf(nd_q2_t)
        target = to_tf(nd_target)

        loss = build_critic_loss(q1_t, q2_t, target)

        with self.test_session() as sess:
            nd_loss = np.mean((nd_target - nd_q1_t)**2) + np.mean(
                (nd_target - nd_q2_t)**2)
            assert np.allclose(sess.run(loss), nd_loss)
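Note this build_critic_loss has a different signature from the one in Example #7, so the two presumably come from different modules. The assertion implies the sum of the two critics' mean squared errors against a shared target; a sketch:

    import tensorflow as tf

    def build_critic_loss(q1_t, q2_t, target):
        # Sum of the two critics' mean squared errors against one target.
        return (tf.reduce_mean(tf.square(target - q1_t))
                + tf.reduce_mean(tf.square(target - q2_t)))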
Example #5
    def test_success_with_unclipped(self):
        epsilon = np.random.random()
        nd_returns = np.random.random((4, 1))
        nd_old_values = np.random.random((4, 1))
        nd_values = nd_old_values + epsilon * 0.9
        values = to_tf(nd_values)
        returns = to_tf(nd_returns)
        old_values = to_tf(nd_old_values)

        loss = build_value_loss(values, returns, old_values, epsilon, 0.5)
        assert len(loss.shape) == 0

        with self.test_session() as sess:
            answer = 0.5 * np.mean((nd_returns - nd_values) ** 2)
            assert np.allclose(sess.run(loss), answer)
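Since nd_values stays within epsilon of nd_old_values here, clipping is a no-op and the clip-then-score sketch after Example #3 reduces to the plain squared error, matching the expected value. The extra shape assertion additionally requires the loss to be a scalar, which tf.reduce_mean guarantees.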
Example #6
    def test_success(self):
        nd_v_t = np.random.random((4, 1))
        nd_q1_t = np.random.random((4, 1))
        nd_q2_t = np.random.random((4, 1))
        nd_log_prob_t = np.random.random((4, 1))
        v_t = to_tf(nd_v_t)
        q1_t = to_tf(nd_q1_t)
        q2_t = to_tf(nd_q2_t)
        log_prob_t = to_tf(nd_log_prob_t)

        loss = build_v_loss(v_t, q1_t, q2_t, log_prob_t)

        with self.test_session() as sess:
            nd_q_t = np.minimum(nd_q1_t, nd_q2_t)
            answer = 0.5 * np.mean((nd_v_t - (nd_q_t - nd_log_prob_t))**2)
            assert np.allclose(sess.run(loss), answer)
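The expected value is the squared error between the state value and the soft target min(q1, q2) - log_prob, scaled by 0.5: the SAC V-function loss. A sketch consistent with the assertion (a real implementation would likely also stop gradients through the target):

    import tensorflow as tf

    def build_v_loss(v_t, q1_t, q2_t, log_prob_t):
        # Soft state-value target: pessimistic Q estimate minus log-probability.
        target = tf.minimum(q1_t, q2_t) - log_prob_t
        return 0.5 * tf.reduce_mean(tf.square(v_t - target))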
Example #7
    def test_success(self):
        nd_q_t = np.random.random((4, 1))
        nd_rewards_tp1 = np.random.random((4, 1))
        nd_q_tp1 = np.random.random((4, 1))
        nd_dones_tp1 = np.random.randint(2, size=(4, 1))
        q_t = to_tf(nd_q_t)
        rewards_tp1 = to_tf(nd_rewards_tp1)
        q_tp1 = to_tf(nd_q_tp1)
        dones_tp1 = to_tf(nd_dones_tp1)
        gamma = np.random.random()

        loss = build_critic_loss(q_t, rewards_tp1, q_tp1, dones_tp1, gamma)

        with self.test_session() as sess:
            target = nd_rewards_tp1 + gamma * nd_q_tp1 * (1.0 - nd_dones_tp1)
            answer = np.mean((target - nd_q_t)**2)
            assert np.allclose(sess.run(loss), answer)
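This variant builds the one-step TD target r + gamma * q' * (1 - done) internally and returns its mean squared error against q_t. A sketch, assuming dones_tp1 arrives as a float tensor of 0.0/1.0 values (as to_tf presumably guarantees):

    import tensorflow as tf

    def build_critic_loss(q_t, rewards_tp1, q_tp1, dones_tp1, gamma):
        # One-step TD target; (1 - done) zeroes the bootstrap on terminal steps.
        target = rewards_tp1 + gamma * q_tp1 * (1.0 - dones_tp1)
        return tf.reduce_mean(tf.square(target - q_t))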
Example #8
    def test_success_with_positive_not_clipped(self):
        # ratio = new/old < 0.4 < 1 - epsilon, so the surrogate is not clipped
        nd_log_probs = np.log(np.random.random((4, 1)) * 0.2)
        nd_old_log_probs = np.log(np.random.random((4, 1)) * 0.5 + 0.5)
        nd_advantages = np.random.random((4, 1))

        log_probs = to_tf(nd_log_probs)
        old_log_probs = to_tf(nd_old_log_probs)
        advantages = to_tf(nd_advantages)

        loss = build_policy_loss(log_probs, old_log_probs, advantages, 0.2)

        ratio = np.exp(nd_log_probs - nd_old_log_probs)
        answer = -np.mean(ratio * nd_advantages)

        with self.test_session() as sess:
            assert np.allclose(sess.run(loss), answer)
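With positive advantages and a ratio well below 1 - epsilon, the PPO minimum selects the unclipped surrogate, which is exactly what the expected value computes. A sketch of the clipped-surrogate objective the test implies:

    import tensorflow as tf

    def build_policy_loss(log_probs, old_log_probs, advantages, epsilon):
        # PPO clipped surrogate, negated because we minimize.
        ratio = tf.exp(log_probs - old_log_probs)
        clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
        return -tf.reduce_mean(tf.minimum(ratio * advantages,
                                          clipped * advantages))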
Example #9
    def test_success(self):
        nd_rewards_tp1 = np.random.random((4, 1))
        nd_q1_tp1 = np.random.random((4, 1))
        nd_q2_tp1 = np.random.random((4, 1))
        nd_dones_tp1 = np.random.randint(2, size=(4, 1))
        gamma = np.random.random()

        rewards_tp1 = to_tf(nd_rewards_tp1)
        q1_tp1 = to_tf(nd_q1_tp1)
        q2_tp1 = to_tf(nd_q2_tp1)
        dones_tp1 = to_tf(nd_dones_tp1)

        target = build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1, gamma)

        with self.test_session() as sess:
            q_tp1 = np.reshape(
                np.min(np.hstack([nd_q1_tp1, nd_q2_tp1]), axis=1), (-1, 1))
            nd_target = nd_rewards_tp1 + gamma * q_tp1 * (1.0 - nd_dones_tp1)
            assert np.allclose(sess.run(target), nd_target)
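Unlike the losses above, build_target returns the per-sample TD target itself, again bootstrapping from the minimum of the twin critics (the clipped double-Q target). A sketch consistent with the assertion:

    import tensorflow as tf

    def build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1, gamma):
        # Bootstrap from the smaller critic estimate; mask terminal transitions.
        q_tp1 = tf.minimum(q1_tp1, q2_tp1)
        return rewards_tp1 + gamma * q_tp1 * (1.0 - dones_tp1)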