def policy_iteration(
        S: np.ndarray,
        A: np.ndarray,
        P: np.ndarray,
        T: np.ndarray,
        gamma: float = 0.99,
        theta: float = 0.000001
) -> (np.ndarray, np.ndarray):
    Pi = tabular_uniform_random_policy(S.shape[0], A.shape[0])
    V = np.random.random((S.shape[0],))
    V[T] = 0.0
    while True:
        V = iterative_policy_evaluation(S, A, P, T, Pi, gamma, theta, V)
        policy_stable = True
        for s in S:
            old_action = np.argmax(Pi[s])
            # Greedy policy improvement: pick the action with the best expected return.
            best_action = 0
            best_action_score = -float("inf")
            for a in A:
                tmp_sum = 0
                for s_p in S:
                    tmp_sum += P[s, a, s_p, 0] * (
                            P[s, a, s_p, 1] + gamma * V[s_p]
                    )
                if tmp_sum > best_action_score:
                    best_action = a
                    best_action_score = tmp_sum
            Pi[s] = 0.0
            Pi[s, best_action] = 1.0
            if best_action != old_action:
                policy_stable = False
        if policy_stable:
            break
    return V, Pi
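A minimal usage sketch for policy_iteration against the line_world environment imported at the bottom of this page; it assumes policy_iteration is exported from the same algorithms module as iterative_policy_evaluation:

import numpy as np

from algorithms import policy_iteration  # assumed export location
from line_world import S, A, P, T

V_star, Pi_star = policy_iteration(S, A, P, T, gamma=0.99)
print(V_star)                      # state values under the resulting greedy policy
print(np.argmax(Pi_star, axis=1))  # greedy action selected in each state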
def on_policy_first_visit_monte_carlo_control(
    states_count: int,
    actions_count: int,
    is_terminal_func: Callable,
    step_func: Callable,
    episodes_count: int = 10000,
    max_steps_per_episode: int = 10,
    epsilon: float = 0.2,
    gamma: float = 0.99,
) -> (np.ndarray, np.ndarray):
    states = np.arange(states_count)  # possible states
    pi = tabular_uniform_random_policy(states_count, actions_count)  # uniform random policy
    q = np.random.random((states_count, actions_count))  # random initial action values

    # Zero out impossible actions: playing in the cell the agent already occupies.
    for i in range(len(pi)):
        pi[i][i] = 0.0
        q[i][i] = 0.0

    returns = np.zeros((states_count, actions_count))
    returns_count = np.zeros((states_count, actions_count))

    for episode_id in range(episodes_count):
        #print("pi : ", pi)
        s0 = np.random.choice(states)  # état initial aléatoire
        board = [0 for i in range(9)]
        board[s0] = 1
        board[np.random.choice(len(availablePositions(board)))] = -1

        s_list, a_list, r_list = play_a_game(s0, board, pi,
                                             max_steps_per_episode)

        G = 0
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            if (st, at) in zip(s_list[0:t], a_list[0:t]):
                continue
            returns[st, at] += G
            returns_count[st, at] += 1
            q[st, at] = returns[st, at] / returns_count[st, at]
            pi[st, :] = epsilon / actions_count
            pi[st, np.argmax(q[st, :])] = 1.0 - epsilon + epsilon / actions_count
    return q, pi
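A hedged usage sketch for the tic-tac-toe variant above. It only runs if the project's play_a_game and availablePositions helpers are importable; note that is_terminal_func and step_func are accepted but never used by this variant, so placeholders are passed:

q, pi = on_policy_first_visit_monte_carlo_control(
    states_count=9,
    actions_count=9,
    is_terminal_func=lambda s: False,        # unused by this variant
    step_func=lambda s, a: (s, 0.0, False),  # unused by this variant
    episodes_count=5000,
    epsilon=0.1,
)
print(q.argmax(axis=1))  # greedy move estimated for each starting cell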
def monte_carlo_with_exploring_starts_control(
        states_count: int,
        actions_count: int,
        is_terminal_func: Callable,
        step_func: Callable,
        episodes_count: int = 10000,
        max_steps_per_episode: int = 10,
        gamma: float = 0.99,
) -> (np.ndarray, np.ndarray):
    states = np.arange(states_count)
    actions = np.arange(actions_count)
    pi = tabular_uniform_random_policy(states_count, actions_count)
    q = np.random.random((states_count, actions_count))
    for s in states:
        if is_terminal_func(s):
            q[s, :] = 0.0
            pi[s, :] = 0.0

    returns = np.zeros((states_count, actions_count))
    returns_count = np.zeros((states_count, actions_count))
    for episode_id in range(episodes_count):
        s0 = np.random.choice(states)

        if is_terminal_func(s0):
            continue

        a0 = np.random.choice(actions)
        s1, r1, t1 = step_func(s0, a0)

        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_return_history(
            s1, pi, is_terminal_func, step_func, max_steps_per_episode
        )
        s_list = [s0] + s_list
        a_list = [a0] + a_list
        r_list = [r1] + r_list

        G = 0
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            if (st, at) in zip(s_list[0:t], a_list[0:t]):
                continue
            returns[st, at] += G
            returns_count[st, at] += 1
            q[st, at] = returns[st, at] / returns_count[st, at]
            pi[st, :] = 0.0
            pi[st, np.argmax(q[st, :])] = 1.0
    return q, pi
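A usage sketch on a hypothetical 5-state line world (states 0 and 4 terminal, action 0 = left, action 1 = right); the tabular_uniform_random_policy and step_until_the_end_of_the_episode_and_return_history helpers from the project are still assumed to be available:

N = 5  # hypothetical line world

def line_is_terminal(s: int) -> bool:
    return s == 0 or s == N - 1

def line_step(s: int, a: int):
    s_p = max(0, s - 1) if a == 0 else min(N - 1, s + 1)
    r = 1.0 if s_p == N - 1 else (-1.0 if s_p == 0 else 0.0)
    return s_p, r, line_is_terminal(s_p)

q, pi = monte_carlo_with_exploring_starts_control(
    states_count=N,
    actions_count=2,
    is_terminal_func=line_is_terminal,
    step_func=line_step,
    episodes_count=20000,
)
print(pi.argmax(axis=1))  # non-terminal states should mostly favour action 1 (right)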
def off_policy_monte_carlo_control(
    states_count: int,
    actions_count: int,
    is_terminal_func: Callable,
    step_func: Callable,
    episodes_count: int = 10000,
    max_steps_per_episode: int = 10,
    epsilon: float = 0.2,
    gamma: float = 0.99,
) -> (np.ndarray, np.ndarray):
    states = np.arange(states_count)
    b = tabular_uniform_random_policy(states_count, actions_count)
    pi = np.zeros((states_count, actions_count))
    C = np.zeros((states_count, actions_count))
    q = np.random.random((states_count, actions_count))
    # Zero out impossible actions: playing in the cell the agent already occupies.
    for i in range(len(pi)):
        pi[i][i] = 0.0
        q[i][i] = 0.0

    for episode_id in range(episodes_count):
        # print("pi : ", pi)
        s0 = np.random.choice(states)  # état initial aléatoire
        board = [0 for i in range(9)]
        board[s0] = 1
        board[np.random.choice(len(availablePositions(board)))] = -1

        s_list, a_list, r_list = play_a_game(s0, board, pi,
                                             max_steps_per_episode)

        G = 0
        W = 1
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            C[st, at] += W

            q[st, at] += W / C[st, at] * (G - q[st, at])
            pi[st, :] = 0.0
            pi[st, np.argmax(q[st, :])] = 1.0

            if at != np.argmax(q[st, :]):
                break

            W = W / b[st, at]

    return q, pi
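A small self-contained check of the incremental weighted-importance-sampling update used above (q += W / C * (G - q)); starting from q = 0 it reproduces the explicit weighted average of the observed returns:

import numpy as np

rng = np.random.default_rng(0)
weights = rng.random(5) + 0.1  # importance-sampling weights W for one (s, a) pair
returns_g = rng.random(5)      # returns G observed for that pair

q_inc, c = 0.0, 0.0
for w, g in zip(weights, returns_g):
    c += w                        # C[st, at] += W
    q_inc += w / c * (g - q_inc)  # same update as in the control loop

q_direct = float(np.sum(weights * returns_g) / np.sum(weights))
print(np.isclose(q_inc, q_direct))  # True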
def off_policy_monte_carlo_control(
        states_count: int,
        actions_count: int,
        reset_func: Callable,
        is_terminal_func: Callable,
        step_func: Callable,
        episodes_count: int = 10000,
        max_steps_per_episode: int = 10,
        epsilon: float = 0.2,
        gamma: float = 0.99,
) -> (np.ndarray, np.ndarray):
    states = np.arange(states_count)
    b = tabular_uniform_random_policy(states_count, actions_count)
    pi = np.zeros((states_count, actions_count))
    C = np.zeros((states_count, actions_count))
    q = np.random.random((states_count, actions_count))
    for s in states:
        if is_terminal_func(s):
            q[s, :] = 0.0
            pi[s, :] = 0.0
        else:
            # Start pi greedy with respect to the (random) initial q.
            pi[s, :] = 0.0
            pi[s, np.argmax(q[s, :])] = 1.0

    for episode_id in range(episodes_count):
        s0 = reset_func()

        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_return_history(
            s0, b, is_terminal_func, step_func, max_steps_per_episode
        )

        G = 0
        W = 1
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            C[st, at] += W

            q[st, at] += W / C[st, at] * (G - q[st, at])
            pi[st, :] = 0.0
            pi[st, np.argmax(q[st, :])] = 1.0

            if at != np.argmax(q[st, :]):
                break

            W = W / b[st, at]

    return q, pi
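A usage sketch for this reset_func-based variant, reusing the hypothetical line_is_terminal and line_step callables from the exploring-starts sketch above; the tabular policy and episode-history helpers from the project are again assumed importable:

def line_reset() -> int:
    return 2  # hypothetical: always start from the middle of the 5-state line world

q, pi = off_policy_monte_carlo_control(
    states_count=5,
    actions_count=2,
    reset_func=line_reset,
    is_terminal_func=line_is_terminal,
    step_func=line_step,
    episodes_count=20000,
)
print(pi.argmax(axis=1))  # learned deterministic target policy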
def on_policy_first_visit_monte_carlo_control(
        states_count: int,
        actions_count: int,
        reset_func: Callable,
        is_terminal_func: Callable,
        step_func: Callable,
        episodes_count: int = 10000,
        max_steps_per_episode: int = 10,
        epsilon: float = 0.2,
        gamma: float = 0.99,
) -> (np.ndarray, np.ndarray):
    states = np.arange(states_count)
    pi = tabular_uniform_random_policy(states_count, actions_count)
    q = np.random.random((states_count, actions_count))
    for s in states:
        if is_terminal_func(s):
            q[s, :] = 0.0
            pi[s, :] = 0.0

    returns = np.zeros((states_count, actions_count))
    returns_count = np.zeros((states_count, actions_count))
    for episode_id in range(episodes_count):
        s0 = reset_func()

        s_list, a_list, _, r_list = step_until_the_end_of_the_episode_and_return_history(
            s0, pi, is_terminal_func, step_func, max_steps_per_episode
        )

        G = 0
        for t in reversed(range(len(s_list))):
            G = gamma * G + r_list[t]
            st = s_list[t]
            at = a_list[t]

            if (st, at) in zip(s_list[0:t], a_list[0:t]):
                continue
            returns[st, at] += G
            returns_count[st, at] += 1
            q[st, at] = returns[st, at] / returns_count[st, at]
            pi[st, :] = epsilon / actions_count
            pi[st, np.argmax(q[st, :])] = 1.0 - epsilon + epsilon / actions_count
    return q, pi
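A quick sanity check on the ε-soft policy produced by this reset_func-based variant (reusing the line-world callables from the sketches above): every visited row of pi is a proper distribution, with the greedy action at probability 1 - epsilon + epsilon / actions_count:

q, pi = on_policy_first_visit_monte_carlo_control(
    states_count=5,
    actions_count=2,
    reset_func=line_reset,
    is_terminal_func=line_is_terminal,
    step_func=line_step,
    episodes_count=20000,
    epsilon=0.2,
)
print(pi[1:4].sum(axis=1))  # visited rows sum to 1
print(pi[2])                # greedy action ≈ 0.9, the other ≈ 0.1 for epsilon = 0.2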
import numpy as np

from algorithms import iterative_policy_evaluation
from line_world import S, A, P, T
from policies import tabular_uniform_random_policy

if __name__ == "__main__":
    import time

    start_time = time.time()
    Pi = tabular_uniform_random_policy(S.shape[0], A.shape[0])
    V = iterative_policy_evaluation(S, A, P, T, Pi)
    print("--- %s seconds ---" % (time.time() - start_time))
    print(V)

    # Pi = np.zeros((S.shape[0], A.shape[0]))
    # Pi[:, 1] = 1.0
    # V = iterative_policy_evaluation(S, A, P, T, Pi)
    # print(V)
    #
    # Pi = np.zeros((S.shape[0], A.shape[0]))
    # Pi[:, 0] = 1.0
    # V = iterative_policy_evaluation(S, A, P, T, Pi)
    # print(V)