Example no. 1
# imports below assume rlberry's public package layout
from rlberry.seeding import Seeder
from rlberry.envs.finite import FiniteMDP
from rlberry.wrappers import RescaleRewardWrapper


def test_rescale_reward():
    # numerical tolerance for the reward-bound checks
    tol = 1e-14

    rng = Seeder(123).rng

    for _ in range(10):
        # generate random MDP
        S, A = 5, 2
        R = rng.uniform(0.0, 1.0, (S, A))
        P = rng.uniform(0.0, 1.0, (S, A, S))
        for ss in range(S):
            for aa in range(A):
                P[ss, aa, :] /= P[ss, aa, :].sum()
        env = FiniteMDP(R, P)

        # test
        wrapped = RescaleRewardWrapper(env, (-10, 10))
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(),
                wrapped.action_space.sample())
            assert -10 - tol <= reward <= 10 + tol

        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert -10 - tol <= reward <= 10 + tol
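
The test only verifies that rescaled rewards stay within the target interval (-10, 10). For intuition, the wrapper can be thought of as applying a monotone map from the environment's original reward range onto the target one; a minimal sketch of an affine version of such a mapping (an illustration only, not RescaleRewardWrapper's actual implementation) is:

def affine_rescale(reward, old_min, old_max, new_min=-10.0, new_max=10.0):
    # map a reward from [old_min, old_max] onto [new_min, new_max]
    ratio = (reward - old_min) / (old_max - old_min)
    return new_min + ratio * (new_max - new_min)

For the MDP above, rewards lie in [0, 1], so affine_rescale(0.5, 0.0, 1.0) lands at 0.0, the midpoint of the target interval.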
Example no. 2
import numpy as np

# imports below assume rlberry's public package layout
from rlberry.seeding import Seeder
from rlberry.envs.finite import FiniteMDP
from rlberry.agents.mbqvi import MBQVIAgent


def test_mbqvi(S, A):
    # S: number of states, A: number of actions (expected to be supplied
    # by the test runner, e.g. via pytest parametrization)
    rng = Seeder(123).rng

    for sim in range(5):
        # generate random MDP with deterministic transitions
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for ss in range(S):
            for aa in range(A):
                ns = rng.integers(0, S)
                P[ss, aa, ns] = 1

        # run MBQVI; with deterministic transitions and n_samples=1,
        # the empirical estimators must match the true model exactly
        env = FiniteMDP(R, P)
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
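
For context, a model-based Q-value iteration agent of this kind builds empirical estimates R_hat and P_hat by sampling each state-action pair a fixed number of times; because the MDP above is deterministic, a single sample per pair already recovers the true model, which is why the test can require exactness up to machine precision. A sketch of such estimators, assuming only the env.sample(state, action) interface used in Example no. 1 (hypothetical helper, not MBQVIAgent's internal code):

import numpy as np

def estimate_model(env, S, A, n_samples=1):
    # hypothetical helper: average rewards and transition frequencies
    # over n_samples calls to env.sample(state, action)
    R_hat = np.zeros((S, A))
    P_hat = np.zeros((S, A, S))
    for s in range(S):
        for a in range(A):
            for _ in range(n_samples):
                next_s, reward, _, _ = env.sample(s, a)
                R_hat[s, a] += reward / n_samples
                P_hat[s, a, next_s] += 1.0 / n_samples
    return R_hat, P_hat

With deterministic transitions and n_samples=1, R_hat equals R and P_hat equals P exactly, matching the assertions above.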