Example #1
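# Imports assumed by this test snippet (module paths taken from the other examples in this listing):
import numpy as np
from rlplan.envs import GridWorld
from rlplan.agents.planning import DynProgAgent
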
def test_value_and_policy_iteration_gridworld(sx, sy, gamma):
    # Tolerance
    tol = 1e-8

    # Environment
    env = GridWorld(nrows=sx, ncols=sy)

    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')
    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()

    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e2 * tol)
Example #2
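# Imports assumed by this test snippet (the ToyEnv2 path is a guess based on ToyEnv1 in Example #4):
import numpy as np
from rlplan.envs.toy import ToyEnv2
from rlplan.agents.planning import DynProgAgent
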
def test_value_and_policy_iteration(gamma, seed, Ns, Na):
    # Tolerance
    tol = 1e-8

    # Environment
    env = ToyEnv2(Ns, Na, seed)

    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')
    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()

    assert dp_agent_val.policy == dp_agent_pol.policy
    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e1 * tol)
Example #3
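# Fragment of what appears to be the TwoRoomDense constructor (the enclosing
# def line is not part of the snippet): it validates the grid size, builds the
# dividing wall, and forwards the configuration to the base GridWorld __init__.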
        assert nrows >= 3
        assert ncols >= 3

        # Build a vertical wall on the middle column, leaving a gap at the
        # middle row so the two rooms stay connected
        middle_col = ncols // 2
        middle_row = nrows // 2
        walls = ()
        for row in range(nrows):
            if row != middle_row:
                walls += ((row, middle_col),)

        super().__init__(seed_val, nrows, ncols, start_coord, terminal_states,
                         success_probability, reward_at, walls, default_reward,
                         enable_render)


if __name__ == '__main__':
    gw = TwoRoomDense(9, 9, success_probability=1.0)

    from rlplan.agents.planning import DynProgAgent
    dynprog = DynProgAgent(gw, method='policy-iteration', gamma=0.9)
    V, _ = dynprog.train()
    gw.display_values(V)

    # render an episode that follows the computed policy
    gw.render(mode='auto', policy=dynprog.policy)

    # reset the environment afterwards
    gw.reset()
Example #4
from rlplan.envs.toy import ToyEnv1
from rlplan.envs.gridworld import GridWorld
from rlplan.envs import Chain
from rlplan.prediction import TabularTD
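# The two agents used below are not imported in the original snippet; the
# DynProgAgent path matches the other examples, the QLearningAgent path is an assumption.
from rlplan.agents.planning import DynProgAgent
from rlplan.agents import QLearningAgent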

# Discount factor
gamma = 0.9

# Create environment
# env = Chain(10)
# env = ToyEnv1()
env = GridWorld(success_probability=0.9, nrows=4, ncols=4, walls=((1, 1), ))

# Initialize and train dynamic programming agent
dp_agent = DynProgAgent(env, method='policy-iteration', gamma=gamma)
V_dp, _ = dp_agent.train()

# Initialize and train q-learning agent
ql_agent = QLearningAgent(env,
                          gamma=gamma,
                          learning_rate=None,
                          min_learning_rate=0.1,
                          epsilon=0.2)
training_info = ql_agent.train(n_steps=1e5)
V_ql = training_info['V']

# Evaluate the DP policy with tabular TD(lambda)
tab_td = TabularTD(env,
                   dp_agent.policy,
                   gamma,
                   lambd=0.9)  # any further keyword arguments were cut off in the original snippet
Example #5
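def get_action_frequencies(env):
    """Estimate empirical per-state action frequencies from env.history.

    (Hypothetical header: the original snippet starts mid-docstring, so the
    function name and this docstring opening are reconstructions.)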
    """

    action_freq = np.zeros((env.Ns, env.Na))
    # Count how often each (state, action) pair appears in the logged history
    for state, action, reward, next_state, done in env.history:
        action_freq[state, action] += 1.0

    # Normalize each row; states that were never visited keep all-zero frequencies
    for state in range(env.Ns):
        total_visits = action_freq[state, :].sum()
        if total_visits > 0:
            action_freq[state, :] /= total_visits

    return action_freq


if __name__ == '__main__':
    from rlplan.envs import GridWorld
    from rlplan.agents.planning import DynProgAgent

    env = GridWorld()
    dp_agent = DynProgAgent(env, method='policy-iteration', gamma=0.9)
    dp_agent.train()

    # record transitions in env.history while following the DP policy
    env.track = True
    for step in range(15):
        env.step(dp_agent.policy.sample(env.state))
    draw_gridworld_history(env)

    env.render('manual')