import numpy as np
import pytest

# Import paths below assume the rlberry package layout; adjust if the
# project structure differs.
from rlberry.agents.mbqvi import MBQVIAgent
from rlberry.envs.finite import FiniteMDP
from rlberry.seeding import Seeder
from rlberry.wrappers import RescaleRewardWrapper


def test_rescale_reward():
    # tolerance
    tol = 1e-14
    rng = Seeder(123).rng
    for _ in range(10):
        # generate a random MDP with a row-normalized transition kernel
        S, A = 5, 2
        R = rng.uniform(0.0, 1.0, (S, A))
        P = rng.uniform(0.0, 1.0, (S, A, S))
        for ss in range(S):
            for aa in range(A):
                P[ss, aa, :] /= P[ss, aa, :].sum()
        env = FiniteMDP(R, P)

        # test: wrapped rewards must stay in [-10, 10] up to tolerance
        # (see the rescaling sketch below)
        wrapped = RescaleRewardWrapper(env, (-10, 10))
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(),
                wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol
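
# The assertions above only check that rescaled rewards land in the target
# interval. A minimal sketch of the affine map such a wrapper can apply when
# the raw reward range is known; `_affine_rescale` is a hypothetical helper
# for illustration, not rlberry's implementation:
def _affine_rescale(r, r_min, r_max, a, b):
    """Map r linearly from [r_min, r_max] onto [a, b]."""
    return a + (b - a) * (r - r_min) / (r_max - r_min)

# e.g. _affine_rescale(0.5, 0.0, 1.0, -10.0, 10.0) == 0.0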

# The parametrization values here are an assumption; the original test takes
# (S, A) as arguments without specifying where they come from.
@pytest.mark.parametrize("S, A", [(5, 2), (10, 4)])
def test_mbqvi(S, A):
    rng = Seeder(123).rng
    for _ in range(5):
        # generate a random MDP with deterministic transitions
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for ss in range(S):
            for aa in range(A):
                ns = rng.integers(0, S)
                P[ss, aa, ns] = 1
        # run MBQVI and check exactness of the estimators: with
        # deterministic transitions, a single sample per (state, action)
        # pair recovers R and P exactly (see the estimator sketch below)
        env = FiniteMDP(R, P)
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
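
# Why n_samples=1 suffices: model-based Q-value iteration builds R_hat and
# P_hat from samples of each (state, action) pair, and deterministic
# transitions make a single sample exact. A minimal sketch of such
# sample-mean model estimation (an assumption about the estimator, not
# rlberry's actual code):
def _estimate_model(env, n_samples):
    S, A = env.observation_space.n, env.action_space.n
    R_hat = np.zeros((S, A))
    P_hat = np.zeros((S, A, S))
    for ss in range(S):
        for aa in range(A):
            for _ in range(n_samples):
                ns, reward, _, _ = env.sample(ss, aa)
                R_hat[ss, aa] += reward / n_samples
                P_hat[ss, aa, ns] += 1.0 / n_samples
    return R_hat, P_hat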