def main():
    grid_size = 10
    grid_world = GridWorld(grid_size,
                           num_obstacles=20,
                           stochastic_cell_ratio=0.1)

    params = {
        'type': 'value_iteration',
        'grid_size': grid_size,
        'rewards': grid_world.rewards,
        'transition_matrix': grid_world.transition_matrix,
        'step_func': GridWorld.deterministic_step,
        'discount': 0.9,
    }
    agent = AgentFactory.create_agent(params)

    episode_ended = False
    while True:
        grid_world.get_user_input()
        grid_world.draw_with_state_values(
            agent.v, policy=agent.pi if grid_world.render_policy else None)

        if not grid_world.pause:
            if episode_ended:
                grid_world.restart_episode()
                grid_world.draw_black_screen()
                episode_ended = False
            else:
                agent.do_job()
                if agent.ready_to_play():
                    action = agent.get_action(grid_world.pos)
                    episode_ended, _, _ = grid_world.step(action)

        grid_world.tick_tock()
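
# Assumed entry point (illustrative addition, not part of the snippet above):
# run the demo loop when this module is executed as a script.
if __name__ == '__main__':
    main()
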
Example 2
from typing import Tuple

import numpy as np


def Q_learning(env: GridWorld,
               epsilon: float,
               lr: float,
               initQ: np.ndarray,
               converge: bool = False) -> Tuple[np.ndarray, float]:
    """
    Performs Q-learning for a single episode in the environment and returns
    the updated Q table.

    :param env: GridWorld subclass (assumed importable from the surrounding
        project)
    :param epsilon: exploitation rate -- the probability of taking the greedy
        action; a random action is sampled with probability 1 - epsilon
    :param lr: learning rate
    :param initQ: Q table to update (modified in place and returned)
    :param converge: if True, track the largest change in any Q-value during
        the episode so the caller can test for convergence within a set bound
    :return: the updated Q table after a single episode of training and the
        maximum change in any Q-value
    """

    # keep track of maximum state-action value change
    delta = 0.0

    state = env.reset()

    done = False

    while not done:

        # explore: with probability 1 - epsilon, sample a random action
        if np.random.uniform(0, 1) > epsilon:
            action = env.sample()

        # exploit: with probability epsilon, take the greedy action
        else:
            action = np.argmax(initQ[state])

        # take step in env
        obs, r, done = env.step(action)

        # one-step Q-learning update:
        # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        prev_value = initQ[state, action]

        new_value = prev_value + lr * (r + env.gamma * np.max(initQ[obs]) -
                                       prev_value)

        initQ[state, action] = new_value

        # update state
        state = obs

        if converge:
            delta = max(delta, np.abs(new_value - prev_value))

    return initQ, delta
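
# Usage sketch (illustrative, not from the source): run episodes until the
# largest Q-value change in an episode drops below a tolerance, then read off
# the greedy policy. The attribute names `num_states` and `num_actions` on the
# GridWorld subclass and the hyperparameter values are assumptions; `epsilon`
# follows the convention above (probability of exploiting).
def train(env: GridWorld,
          epsilon: float = 0.9,
          lr: float = 0.1,
          tol: float = 1e-4,
          max_episodes: int = 10000) -> Tuple[np.ndarray, np.ndarray]:
    Q = np.zeros((env.num_states, env.num_actions))
    for _ in range(max_episodes):
        Q, delta = Q_learning(env, epsilon, lr, Q, converge=True)
        if delta < tol:
            break
    # greedy policy: best action in each state under the learned Q table
    policy = np.argmax(Q, axis=1)
    return Q, policy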