Example #1
def policy_iteration(S: np.ndarray,
                     A: np.ndarray,
                     P: np.ndarray,
                     T: np.ndarray,
                     gamma: float = 0.99,
                     theta: float = 1e-6) -> Tuple[np.ndarray, np.ndarray]:
    Pi = tabular_uniform_random_policy(S.shape[0], A.shape[0])
    V = np.random.random((S.shape[0], ))
    V[T] = 0.0
    while True:
        V = iterative_policy_evaluation(S, A, P, T, Pi, gamma, theta, V)
        policy_stable = True
        for s in S:
            old_action = np.argmax(Pi[s])
            best_action = 0
            best_action_score = -np.inf
            for a in A:
                tmp_sum = 0.0
                for s_p in S:
                    tmp_sum += P[s, a, s_p, 0] * (P[s, a, s_p, 1] + gamma * V[s_p])
                if tmp_sum > best_action_score:
                    best_action = a
                    best_action_score = tmp_sum
            Pi[s] = 0.0
            Pi[s, best_action] = 1.0
            if best_action != old_action:
                policy_stable = False
        if policy_stable:
            break
    return V, Pi
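The example above reads the model from a dense tensor: P[s, a, s_p, 0] holds the transition probability and P[s, a, s_p, 1] the reward. Below is a minimal sketch of driving policy_iteration on a toy three-state line world, with stand-in versions of the repo's tabular_uniform_random_policy and iterative_policy_evaluation helpers (the real ones may differ in signature and behavior):

import numpy as np

def tabular_uniform_random_policy(n_states: int, n_actions: int) -> np.ndarray:
    # Each row is a uniform distribution over actions
    return np.ones((n_states, n_actions)) / n_actions

def iterative_policy_evaluation(S, A, P, T, Pi, gamma=0.99, theta=1e-6, V=None):
    # Sweep the Bellman expectation backup until the value function stops moving.
    # T is kept for signature compatibility; terminal states carry no outgoing
    # probability mass in P, so their values stay at 0.
    if V is None:
        V = np.zeros((S.shape[0],))
    while True:
        delta = 0.0
        for s in S:
            v = V[s]
            V[s] = sum(Pi[s, a] * P[s, a, s_p, 0] * (P[s, a, s_p, 1] + gamma * V[s_p])
                       for a in A for s_p in S)
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            return V

# Toy line world: states 0..2, state 2 terminal, actions 0 = left, 1 = right
S = np.arange(3)
A = np.arange(2)
T = np.array([2])
P = np.zeros((3, 2, 3, 2))
for s in (0, 1):
    P[s, 0, max(s - 1, 0), 0] = 1.0                 # left moves are deterministic
    P[s, 1, s + 1, 0] = 1.0                         # right moves are deterministic
    P[s, 1, s + 1, 1] = 1.0 if s + 1 == 2 else 0.0  # +1 for reaching the goal

V, Pi = policy_iteration(S, A, P, T)
print(Pi)  # both non-terminal states should end up choosing action 1 (right)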
Example #2
def monte_carlo_with_exploring_starts_control_2(
    s_terminal,
    s_sp,
    player,
    states_count: int,
    actions_count: int,
    is_terminal_func: Callable,
    step_func: Callable,
    episodes_count: int = 10000,
    max_steps_per_episode: int = 10,
    gamma: float = 0.99,
) -> Tuple[np.ndarray, np.ndarray]:
    states = np.arange(states_count)
    actions = np.arange(actions_count)
    pi = tabular_uniform_random_policy(states_count, actions_count)
    q = np.random.random((states_count, actions_count))
    for s in states:
        if is_terminal_func(s, s_terminal):
            q[s, :] = 0.0
            pi[s, :] = 0.0

    returns = np.zeros((states_count, actions_count))
    returns_count = np.zeros((states_count, actions_count))
    for episode_id in range(episodes_count):

        s0 = np.random.choice(states)

        if is_terminal_func(s0, s_terminal):
            continue

        a0 = np.random.choice(actions)

        # step_func also returns the action actually taken, which may replace a0
        s1, r1, t1, a0 = step_func(s0, a0, s_terminal, s_sp, player)

        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_return_history_2(
            s_terminal, s_sp, player, s1, pi, is_terminal_func, step_func,
            max_steps_per_episode)
        s_list = [s0] + s_list
        a_list = [a0] + a_list
        r_list = [r1] + r_list

        G = 0
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            # First-visit check: only the earliest occurrence of (st, at) updates q
            if (st, at) in zip(s_list[0:t], a_list[0:t]):
                continue
            returns[st, at] += G
            returns_count[st, at] += 1
            q[st, at] = returns[st, at] / returns_count[st, at]
            pi[st, :] = 0.0
            pi[st, np.argmax(q[st, :])] = 1.0
    return q, pi
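Keeping both returns and returns_count reproduces the textbook sample average q(s, a) = sum of returns / count. A small sketch of an equivalent incremental form one could substitute, which drops the separate returns table:

import numpy as np

def incremental_mean_update(q: np.ndarray, counts: np.ndarray, st: int, at: int, G: float) -> None:
    # Same result as returns[st, at] += G; q[st, at] = returns[st, at] / counts[st, at],
    # without storing the running sum of returns
    counts[st, at] += 1
    q[st, at] += (G - q[st, at]) / counts[st, at]

q = np.zeros((1, 1))
counts = np.zeros((1, 1))
for G in (1.0, 0.0, 2.0):
    incremental_mean_update(q, counts, 0, 0, G)
print(q[0, 0])  # 1.0, the mean of the three returns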
Example #3
def off_policy_monte_carlo_control(
    s_terminal,
    s_sp,
    player,
    states_count: int,
    actions_count: int,
    is_terminal_func: Callable,
    step_func: Callable,
    episodes_count: int = 10000,
    max_steps_per_episode: int = 10,
    gamma: float = 0.99,
    epsilon: float = 0.2,  # unused here: the behavior policy b stays uniform random
) -> Tuple[np.ndarray, np.ndarray]:
    states = np.arange(states_count)
    b = tabular_uniform_random_policy(states_count, actions_count)
    pi = np.zeros((states_count, actions_count))
    C = np.zeros((states_count, actions_count))
    q = np.random.random((states_count, actions_count))
    for s in states:
        if is_terminal_func(s, s_terminal):
            q[s, :] = 0.0
            pi[s, :] = 0.0
        else:
            # Deterministic greedy target policy
            pi[s, :] = 0.0
            pi[s, np.argmax(q[s, :])] = 1.0

    for episode_id in range(episodes_count):
        s0 = reset_tic_tac(s_sp)

        # Episodes must be generated with the behavior policy b, not the target pi
        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_return_history_2(
            s_terminal, s_sp, player, s0, b, is_terminal_func, step_func,
            max_steps_per_episode)

        G = 0
        W = 1
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            C[st, at] += W

            q[st, at] += W / C[st, at] * (G - q[st, at])
            pi[st, :] = 0.0
            pi[st, np.argmax(q[st, :])] = 1.0

            # pi is deterministic: pi(at | st) is 0 for non-greedy actions,
            # so the importance weight (and every earlier update) would be 0
            if at != np.argmax(q[st, :]):
                break

            W = W / b[st, at]  # pi(at | st) = 1, so the ratio is 1 / b(at | st)

    return q, pi
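The weight update works because pi is kept deterministic and greedy: the per-step importance ratio pi(at | st) / b(at | st) is 1 / b(at | st) when at is the greedy action and 0 otherwise, which is exactly why the loop breaks as soon as the behavior action disagrees with the greedy one. A small sketch of that ratio in isolation:

import numpy as np

def importance_ratio(pi_row: np.ndarray, b_row: np.ndarray, a: int) -> float:
    # Per-step ratio pi(a | s) / b(a | s) used to build W
    return pi_row[a] / b_row[a]

pi_row = np.array([0.0, 1.0])    # deterministic greedy target policy
b_row = np.array([0.5, 0.5])     # uniform behavior policy
print(importance_ratio(pi_row, b_row, 1))  # 2.0 -> W *= 1 / b = 2.0
print(importance_ratio(pi_row, b_row, 0))  # 0.0 -> W would vanish, hence the break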
Example #4
def test_grid_iterative_policy_evaluation():
    pygame.init()

    w = 6
    h = 5
    rewards = ((24, 1),
               (2, -1),
               (11, -1))

    terminal = [2, 11, 24]
    new_pos = {"top": 2, "bot": 3, "left": 0, "right": 1}

    S, A, T, P = create_grid_world(w, h, rewards, terminal)
    Pi = tabular_uniform_random_policy(S.shape[0], A.shape[0])

    start_time = time()
    V = iterative_policy_evaluation(S, A, P, T, Pi)
    print("--- %s seconds ---" % (time() - start_time))


    win = pygame.display.set_mode((w * 100, h * 100))
    for i in range(w * h):
        if i % w == 0 and i != 0:
            print("")
        print(round(V[i], 7), end=" ")
    print("")
    st = reset_grid(w, h)

    while not is_terminal(st, T):
        display_grid(win, w, h)
        event_loop()
        display_reward_grid(win, rewards, w, h)
        display_mouse_grid(win, st, w, h)
        sleep(1)

        positions = {"top": st - w,
                "bot": st + w,
                "left": st - 1,
                "right": st + 1
                }

        positions = {key: V[value] for key, value in positions.items() if 0 <= value < w * h}
        action = max(positions, key=positions.get)

        a = new_pos[action]

        st, r, term = step(st, a, T, S, P)
    display_grid(win, w, h)
    display_reward_grid(win, rewards, w, h)
    display_mouse_grid(win, st, w, h)
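The demo picks actions by a one-step greedy lookahead over neighbor values. The same idea as a compact standalone helper (a sketch; the action ids follow the demo's new_pos mapping, and V can be any state-value array):

def greedy_grid_action(st: int, V, w: int, h: int) -> int:
    # Move toward the highest-valued neighbor; ids: 0=left, 1=right, 2=top, 3=bot
    candidates = {}
    if st >= w:
        candidates[2] = V[st - w]      # top
    if st < w * (h - 1):
        candidates[3] = V[st + w]      # bot
    if st % w != 0:
        candidates[0] = V[st - 1]      # left
    if st % w != w - 1:
        candidates[1] = V[st + 1]      # right
    return max(candidates, key=candidates.get)

V = [0.0] * 30
V[7] = 1.0
print(greedy_grid_action(13, V, 6, 5))  # 2: the cell above (13 - 6 = 7) scores highest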
Example #5
def on_policy_first_visit_monte_carlo_control(
    s_terminal,
    s_sp,
    player,
    states_count: int,
    actions_count: int,
    is_terminal_func: Callable,
    step_func: Callable,
    episodes_count: int = 10000,
    max_steps_per_episode: int = 10,
    gamma: float = 0.99,
    epsilon: float = 0.2,
) -> Tuple[np.ndarray, np.ndarray]:
    states = np.arange(states_count)
    pi = tabular_uniform_random_policy(states_count, actions_count)
    q = np.random.random((states_count, actions_count))
    for s in states:
        if is_terminal_func(s, s_terminal):
            q[s, :] = 0.0
            pi[s, :] = 0.0

    returns = np.zeros((states_count, actions_count))
    returns_count = np.zeros((states_count, actions_count))
    for episode_id in range(episodes_count):
        s0 = reset_tic_tac(s_sp)

        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_return_history_2(
            s_terminal, s_sp, player, s0, pi, is_terminal_func, step_func,
            max_steps_per_episode)
        G = 0
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            # First-visit check: only the earliest occurrence of (st, at) updates q
            if (st, at) in zip(s_list[0:t], a_list[0:t]):
                continue
            returns[st, at] += G
            returns_count[st, at] += 1
            q[st, at] = returns[st, at] / returns_count[st, at]
            # epsilon-soft greedy improvement: epsilon/|A| everywhere,
            # plus the remaining 1 - epsilon on the greedy action
            pi[st, :] = epsilon / actions_count
            pi[st, np.argmax(q[st, :])] = 1.0 - epsilon + epsilon / actions_count
    return q, pi
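The last two assignments build an epsilon-soft distribution: every action gets epsilon / |A|, and the greedy action receives the remaining 1 - epsilon on top, so each row still sums to 1. The same update as a small standalone helper (a sketch, not a repo function):

import numpy as np

def epsilon_greedy_row(q_row: np.ndarray, epsilon: float) -> np.ndarray:
    # epsilon / |A| everywhere, plus the leftover 1 - epsilon on the greedy action
    n = q_row.shape[0]
    row = np.full(n, epsilon / n)
    row[np.argmax(q_row)] += 1.0 - epsilon
    return row

print(epsilon_greedy_row(np.array([0.1, 0.9, 0.3]), 0.2))
# approx [0.0667 0.8667 0.0667]; the row sums to 1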
Example #6
def test_line_iterative_policy_evaluation():
    pygame.init()

    num_states = 15
    rewards = ((0, -1),
               (14, 1))

    terminal = [0, 14]

    S, A, T, P = create_line_world(num_states, rewards, terminal)
    Pi = tabular_uniform_random_policy(S.shape[0], A.shape[0])

    start_time = time()
    V = iterative_policy_evaluation(S, A, P, T, Pi)
    print("--- %s seconds ---" % (time() - start_time))

    print(V)

    win = pygame.display.set_mode((num_states * 100, 100))

    st = reset_line(num_states)

    while not is_terminal(st, T):
        display_line(win, num_states)
        event_loop()
        display_reward_line(win, rewards, num_states)
        display_mouse_line(win, st, num_states)
        sleep(1)

        # V is 0 at terminal states, so V[st + 1] == 0 means the right neighbor
        # is terminal; the else branch guarantees `a` is always assigned
        if V[st + 1] > V[st - 1] or V[st + 1] == 0:
            a = 1
        else:
            a = 0
        st, r, term = step(st, a, T, S, P)

    display_line(win, num_states)
    display_reward_line(win, rewards, num_states)
    display_mouse_line(win, st, num_states)
    sleep(1)