Example #1
import random

import matplotlib.pyplot as plt

from emdp import actions, examples
from emdp.gridworld import GridWorldPlotter  # assumed import path


def test_plotting_integration():
    mdp = examples.build_SB_example35()

    # Collect a few short random-walk trajectories to plot.
    trajectories = []
    for _ in range(3):  # 3 trajectories
        trajectory = [mdp.reset()]
        for _ in range(10):  # 10 steps maximum (the done flag is ignored here)
            state, reward, done, info = mdp.step(
                random.choice(
                    [actions.LEFT, actions.RIGHT, actions.UP, actions.DOWN]))
            trajectory.append(state)
        trajectories.append(trajectory)

    gwp = GridWorldPlotter(
        mdp.size, mdp.has_absorbing_state
    )  # alternatively you can use GridWorldPlotter.from_mdp(mdp)
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(121)

    # trajectory
    gwp.plot_trajectories(ax, trajectories)
    gwp.plot_grid(ax)

    # heatmap
    ax = fig.add_subplot(122)
    gwp.plot_heatmap(ax, trajectories)
    gwp.plot_grid(ax)
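The test builds the figure but never renders it; finishing it off is plain matplotlib and independent of emdp (the filename below is just an example), appended to the test body:

    # Render or save the two-panel figure produced above.
    fig.tight_layout()
    fig.savefig('sb_example35_plots.png')  # or plt.show() interactively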
Example #2
import numpy as np
import torch

from emdp.examples import build_SB_example35
from emdp.analytic import calculate_V_pi  # assumed import path


def test_differentiable():
    mdp = build_SB_example35()

    print(mdp.reward)
    # Equiprobable random policy of shape [num_states, num_actions].
    policy = np.ones((mdp.P.shape[0], mdp.P.shape[1])) / mdp.P.shape[1]
    policy = torch.tensor(policy, dtype=torch.float32, requires_grad=True)
    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy, mdp.discount)
    grads = torch.autograd.grad(V_pi.mean(), [policy])
    assert grads is not None
    for grad in grads:
        assert torch.isfinite(grad).all()
        # The gradient should not be identically zero.
        assert not torch.equal(grad, torch.zeros_like(grad))
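For intuition, the quantity being differentiated has a closed form: V_pi = (I - discount * P_pi)^-1 r_pi, where P_pi and r_pi are the transition matrix and reward vector induced by the policy. The sketch below is a generic torch version of that formula, assuming P has shape [S, A, S'], reward has shape [S, A], and policy is a float32 torch tensor of shape [S, A]; it illustrates why gradients flow back into the policy, but it is not emdp's calculate_V_pi implementation.

import torch

def v_pi_closed_form(P, reward, policy, discount):
    """Analytic policy evaluation: V = (I - discount * P_pi)^-1 r_pi (sketch)."""
    P = torch.as_tensor(P, dtype=torch.float32)             # [S, A, S']
    reward = torch.as_tensor(reward, dtype=torch.float32)   # [S, A]
    P_pi = torch.einsum('sa,sat->st', policy, P)             # [S, S]
    r_pi = (policy * reward).sum(dim=1)                      # [S]
    eye = torch.eye(P_pi.shape[0])
    # Every operation above is differentiable, so autograd can push
    # gradients from V back into `policy`.
    return torch.linalg.solve(eye - discount * P_pi, r_pi)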
Example #3
import numpy as np

from emdp.examples import build_SB_example35
from emdp.analytic import calculate_V_pi  # assumed import path


def test_V_pi():
    mdp = build_SB_example35()

    print(mdp.reward)
    # Equiprobable random policy of shape [num_states, num_actions].
    policy = np.ones((mdp.P.shape[0], mdp.P.shape[1])) / mdp.P.shape[1]

    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy, mdp.discount)

    assert np.allclose(
        np.round(V_pi, 1),
        np.array([ 3.3,  8.8,  4.4,  5.3,  1.5,
                   1.5,  3.0,  2.3,  1.9,  0.5,
                   0.1,  0.7,  0.7,  0.4, -0.4,
                  -1.0, -0.4, -0.4, -0.6, -1.2,
                  -1.9, -1.3, -1.2, -1.4, -2.0]))
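For reference, the expected array is the state-value function of the equiprobable random policy in Sutton and Barto's Example 3.5 (discount 0.9), read row by row over the 5x5 grid.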
Example #4
import numpy as np
import torch

from emdp.examples import build_SB_example35
from emdp.analytic import calculate_V_pi  # assumed import path


def test_V_pi():
    """Check that the computation works with a torch policy."""
    mdp = build_SB_example35()

    print(mdp.reward)
    # Equiprobable random policy of shape [num_states, num_actions].
    policy = np.ones((mdp.P.shape[0], mdp.P.shape[1])) / mdp.P.shape[1]
    policy = torch.from_numpy(policy).float()
    V_pi = calculate_V_pi(mdp.P, mdp.reward, policy,
                          mdp.discount).detach().numpy()

    assert np.allclose(
        np.round(V_pi, 1),
        np.array([ 3.3,  8.8,  4.4,  5.3,  1.5,
                   1.5,  3.0,  2.3,  1.9,  0.5,
                   0.1,  0.7,  0.7,  0.4, -0.4,
                  -1.0, -0.4, -0.4, -0.6, -1.2,
                  -1.9, -1.3, -1.2, -1.4, -2.0]))
Example #5
from emdp import actions
from emdp.examples import build_SB_example35


def test_SB_example35():
    mdp = build_SB_example35()

    # Bumping into the top wall leaves the agent at (0, 0) and costs -1.
    mdp.set_current_state_to((0, 0))
    state, reward, done, _ = mdp.step(actions.UP)
    assert not done
    assert reward == -1
    assert mdp.unflatten_state(state) == (0, 0)

    # A normal move into an ordinary cell gives reward 0.
    state, reward, done, _ = mdp.step(actions.RIGHT)
    assert not done
    assert reward == 0
    assert mdp.unflatten_state(state) == (0, 1)

    # (0, 1) is special state A: stepping from it pays +10 and teleports to (4, 1).
    state, reward, done, _ = mdp.step(actions.RIGHT)
    assert not done
    assert reward == +10
    assert mdp.unflatten_state(state) == (4, 1)
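Sutton and Barto's Example 3.5 also has a second special state B at (0, 3) that pays +5 and teleports the agent to (2, 3). Assuming build_SB_example35 reproduces that layout, the analogous check appended to the test body above would be a sketch like:

    # Hypothetical continuation: special state B at (0, 3) should pay +5
    # and teleport to (2, 3), per the book's layout.
    mdp.set_current_state_to((0, 3))
    state, reward, done, _ = mdp.step(actions.DOWN)  # any action leaves B
    assert not done
    assert reward == +5
    assert mdp.unflatten_state(state) == (2, 3)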
Example #6
import emdp.emdp_gym
from emdp import examples


def test_gym_int_observation():
    env = emdp.emdp_gym.gymify(
        examples.build_SB_example35(),
        observation_one_hot=False)
    state = env.reset()
    # Without one-hot encoding, the observation is a plain integer state index.
    assert isinstance(state, int)
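For contrast, a hedged sketch of the other observation mode; it assumes that observation_one_hot=True makes reset() return a flat one-hot vector over states, which is inferred from the flag name rather than documented behaviour:

# Assumption: one-hot mode returns an indicator vector instead of an int.
env_onehot = emdp.emdp_gym.gymify(
    examples.build_SB_example35(),
    observation_one_hot=True)
obs = env_onehot.reset()
assert obs.ndim == 1 and obs.sum() == 1  # hypothetical one-hot check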
Example #7
    def __init__(self):
        super().__init__(examples.build_SB_example35())