import random

import matplotlib.pyplot as plt
import numpy as np
import torch

import emdp.emdp_gym
from emdp import actions, examples
from emdp.examples import build_SB_example35
from emdp.gridworld import GridWorldPlotter
from emdp.analytic import calculate_V_pi  # import path for calculate_V_pi assumed


def test_plotting_integration():
    """Smoke test: the plotting utilities run end-to-end on sampled trajectories."""
    mdp = examples.build_SB_example35()

    # collect a few short trajectories under a uniformly random policy
    trajectories = []
    for _ in range(3):  # 3 trajectories
        trajectory = [mdp.reset()]
        for _ in range(10):  # 10 steps maximum
            action = random.choice(
                [actions.LEFT, actions.RIGHT, actions.UP, actions.DOWN])
            state, reward, done, info = mdp.step(action)
            trajectory.append(state)
        trajectories.append(trajectory)

    gwp = GridWorldPlotter(mdp.size, mdp.has_absorbing_state)
    # alternatively you can use GridWorldPlotter.from_mdp(mdp)

    fig = plt.figure(figsize=(10, 4))

    # trajectory plot
    ax = fig.add_subplot(121)
    gwp.plot_trajectories(ax, trajectories)
    gwp.plot_grid(ax)

    # heatmap of state visitation
    ax = fig.add_subplot(122)
    gwp.plot_heatmap(ax, trajectories)
    gwp.plot_grid(ax)

def test_differentiable():
    """V_pi should be differentiable with respect to the policy."""
    mdp = build_SB_example35()

    # uniformly random policy, as a torch tensor that tracks gradients
    policy = np.ones((mdp.P.shape[0], mdp.P.shape[1])) / mdp.P.shape[1]
    policy = torch.tensor(policy, dtype=torch.float32, requires_grad=True)

    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy, mdp.discount)
    grads = torch.autograd.grad(V_pi.mean(), [policy])

    assert grads is not None
    for grad in grads:
        assert torch.isfinite(grad).all()
        # the gradient must not vanish everywhere
        assert not torch.all(grad == 0)

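# The differentiability above is what enables gradient-based policy
# optimization directly through the analytic value. A minimal, hypothetical
# sketch (not part of the original suite; the logits parameterization, the
# SGD settings, and the step count are illustrative assumptions):
def _policy_gradient_sketch():
    mdp = build_SB_example35()
    logits = torch.zeros(mdp.P.shape[0], mdp.P.shape[1], requires_grad=True)
    optimizer = torch.optim.SGD([logits], lr=0.1)
    for _ in range(10):
        policy = torch.softmax(logits, dim=-1)  # keep each row a distribution
        # maximize the mean state value by minimizing its negation
        loss = -calculate_V_pi(mdp.P, mdp.reward, policy, mdp.discount).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return torch.softmax(logits, dim=-1)
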
def test_V_pi():
    """V_pi under the uniform random policy matches Sutton & Barto's
    Example 3.5 gridworld values."""
    mdp = build_SB_example35()

    # uniformly random policy
    policy = np.ones((mdp.P.shape[0], mdp.P.shape[1])) / mdp.P.shape[1]
    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy, mdp.discount)

    expected = np.array([
        3.3, 8.8, 4.4, 5.3, 1.5,
        1.5, 3.0, 2.3, 1.9, 0.5,
        0.1, 0.7, 0.7, 0.4, -0.4,
        -1.0, -0.4, -0.4, -0.6, -1.2,
        -1.9, -1.3, -1.2, -1.4, -2.0,
    ])
    assert np.allclose(np.round(V_pi, 1), expected)

def test_V_pi_torch():
    """Same value check as test_V_pi, with the policy passed as a torch tensor."""
    mdp = build_SB_example35()

    # uniformly random policy
    policy = np.ones((mdp.P.shape[0], mdp.P.shape[1])) / mdp.P.shape[1]
    policy = torch.from_numpy(policy).float()

    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy,
                          mdp.discount).detach().numpy()

    expected = np.array([
        3.3, 8.8, 4.4, 5.3, 1.5,
        1.5, 3.0, 2.3, 1.9, 0.5,
        0.1, 0.7, 0.7, 0.4, -0.4,
        -1.0, -0.4, -0.4, -0.6, -1.2,
        -1.9, -1.3, -1.2, -1.4, -2.0,
    ])
    assert np.allclose(np.round(V_pi, 1), expected)

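# For reference, the value these two tests check has a closed form: for a
# fixed policy pi, the Bellman equation V = r_pi + gamma * P_pi V is linear,
# so V_pi = (I - gamma * P_pi)^{-1} r_pi. A minimal numpy sketch of that
# solve (an illustration, not the library's implementation; it assumes P has
# shape (S, A, S) and reward has shape (S, A), which the source does not
# confirm):
def _analytic_V_pi_sketch(P, reward, policy, discount):
    P_pi = np.einsum('sa,sat->st', policy, P)     # state-to-state dynamics under pi
    r_pi = np.einsum('sa,sa->s', policy, reward)  # expected one-step reward under pi
    n = P_pi.shape[0]
    return np.linalg.solve(np.eye(n) - discount * P_pi, r_pi)
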
def test_SB_example35():
    """Step through known transitions of the Sutton & Barto Example 3.5 gridworld."""
    mdp = build_SB_example35()
    mdp.set_current_state_to((0, 0))

    # moving UP from the top row bounces off the wall: -1 reward, no movement
    state, reward, done, _ = mdp.step(actions.UP)
    assert not done
    assert reward == -1
    assert mdp.unflatten_state(state) == (0, 0)

    # an ordinary move right into special state A at (0, 1): 0 reward
    state, reward, done, _ = mdp.step(actions.RIGHT)
    assert not done
    assert reward == 0
    assert mdp.unflatten_state(state) == (0, 1)

    # any action taken in A teleports the agent to A' = (4, 1) with +10 reward
    state, reward, done, _ = mdp.step(actions.RIGHT)
    assert not done
    assert reward == +10
    assert mdp.unflatten_state(state) == (4, 1)

def test_gym_int_observation():
    """Without one-hot encoding, the gym wrapper returns integer observations."""
    env = emdp.emdp_gym.gymify(
        examples.build_SB_example35(), observation_one_hot=False)
    state = env.reset()
    assert isinstance(state, int)

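# A short interaction sketch with the gymified MDP, assuming only the classic
# gym API already exercised above (reset() -> obs, step(a) -> (obs, reward,
# done, info)); the fixed action and the step budget are arbitrary choices:
def _gym_interaction_sketch():
    env = emdp.emdp_gym.gymify(
        examples.build_SB_example35(), observation_one_hot=False)
    obs = env.reset()
    for _ in range(5):
        obs, reward, done, info = env.step(actions.RIGHT)
        if done:  # the gridworld may be non-terminating, so bound the loop
            break
    return obs
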
# Fragment: the constructor of a test environment; the enclosing class (a
# subclass of the package's gym wrapper) is not shown in the source.
def __init__(self):
    super().__init__(examples.build_SB_example35())