Example 1
def get_pi_policy():
    problem_size = 10
    seed = 20
    env, random_map = generate_frozen_lake(problem_size, p=0.80, seed=seed)

    # plotting.plot_frozen_lake(random_map)

    opt_V, opt_policy, delta = policy_iteration.policy_iteration(
        env, discount_factor=0.999, max_iteration=10000)
    # plotting.plot_lake_policy("Policy Iteration", opt_policy)

    wins, total_reward, average_reward = play_episodes(env, 1000, opt_policy,
                                                       seed)
    print(wins, total_reward, average_reward)
def policy_iteration_test(ENV_NAME):
    env = get_environment(ENV_NAME)
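    # thetas, gammas, n_trials, and FIGURES_DIRECTORY are assumed to be
    # module-level settings defined elsewhere in the original script.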
    n_iters = {k: [] for k in thetas}
    runtimes = {k: [] for k in thetas}

    for theta in thetas:
        print('theta=%s' % theta)
        for gamma in gammas:
            temp_n_iters = []
            temp_runtimes = []
            for t in range(n_trials):
                print('theta=%s gamma=%s trial=%s' % (theta, gamma, t))
                _, _, n_iter, runtime = policy_iteration(env,
                                                         discount_factor=gamma,
                                                         theta=theta,
                                                         max_iter=100)
                temp_n_iters.append(n_iter)
                temp_runtimes.append(runtime)
            n_iters[theta].append(np.mean(temp_n_iters))
            runtimes[theta].append(np.mean(temp_runtimes))

    for key, iterlist in n_iters.items():
        plt.plot(gammas, iterlist, label=('theta=%s' % key))
    plt.title('PI - Iterations until Convergence %s Problem' % ENV_NAME)
    plt.legend(loc='upper left')
    plt.xlabel('Gamma')
    plt.ylabel('Iterations')
    file_name = '{}/{}/{}_iterations.png'.format(FIGURES_DIRECTORY, ENV_NAME,
                                                 'pi')
    plt.savefig(file_name, format='png', dpi=150)
    plt.close()

    for key, rt in runtimes.items():
        plt.plot(gammas, [t * 1000 for t in rt], label=('theta=%s' % key))
    plt.title('PI - Time until Convergence %s Problem' % ENV_NAME)
    plt.legend(loc='upper left')
    plt.xlabel('Gamma')
    plt.ylabel('Total Milliseconds')
    file_name = '{}/{}/{}_time.png'.format(FIGURES_DIRECTORY, ENV_NAME, 'pi')
    plt.savefig(file_name, format='png', dpi=150)
    plt.close()
def generic_experiment(seed=1):
    # Model-based or model-free solution to the linear-quadratic game with stochastic parameters for generic systems
    npr.seed(seed)

    # problem_data_id = 1581199445 # 5-state random system
    problem_data_id = 1581378883  # 2-state example system
    # problem_data_id = 3 # 3-state example system
    # problem_data_id = 2 # 2-state example system
    problem_data = get_problem_data(problem_type='load',
                                    problem_data_id=problem_data_id)
    problem_data_keys = [
        'A', 'B', 'C', 'Ai', 'Bj', 'Ck', 'varAi', 'varBj', 'varCk', 'Q', 'R',
        'S'
    ]
    A, B, C, Ai, Bj, Ck, varAi, varBj, varCk, Q, R, S = [
        problem_data[key] for key in problem_data_keys
    ]
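    # n, m, p: state, defender-input, and attacker-input dimensions;
    # q, r, s: number of multiplicative noise terms on A, B, and C.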
    n, m, p = [M.shape[1] for M in [A, B, C]]
    q, r, s = [M.shape[0] for M in [Ai, Bj, Ck]]

    # Initial gains
    K0, L0 = get_initial_gains(problem_data, initial_gain_method='zero')

    # Simulation options
    # Std deviation for initial state, defender inputs, attacker inputs, and additive noise
    xstd, ustd, vstd, wstd = 1.0, 1.0, 1.0, 0.0

    # Rollout length
    nt = 4000

    # Number of rollouts
    nr = 1

    # Rollout computation type
    group_option = 'group'

    # Q-function estimation scheme
    # qfun_estimator = 'direct'
    # qfun_estimator = 'lsadp'
    qfun_estimator = 'lstdq'

    sim_options_keys = [
        'xstd', 'ustd', 'vstd', 'wstd', 'nt', 'nr', 'group_option',
        'qfun_estimator'
    ]
    sim_options_values = [
        xstd, ustd, vstd, wstd, nt, nr, group_option, qfun_estimator
    ]
    sim_options = dict(zip(sim_options_keys, sim_options_values))

    num_iterations = 20

    # Policy iteration
    problem_data_known = False
    P_pi, K_pi, L_pi, H_pi, P_history_pi, K_history_pi, L_history_pi, c_history_pi, H_history_pi = policy_iteration(
        problem_data, problem_data_known, K0, L0, sim_options, num_iterations)
    verify_gare(problem_data, P_pi, algo_str='Policy iteration')

    # Value iteration
    # Start value iteration at the same initial P as from policy iteration
    P0 = gdlyap(problem_data, K0, L0)
    P_vi, K_vi, L_vi, P_history_vi, c_history_vi = value_iteration(
        problem_data, P0, num_iterations)
    verify_gare(problem_data, P_vi, algo_str='Value iteration')

    # Plotting
    plt.close('all')
    t_history = np.arange(num_iterations) + 1

    # Cost-to-go matrix
    fig, ax = plt.subplots(ncols=2)
    plt.suptitle('Value matrix (P)')
    ax[0].imshow(P_pi)
    ax[1].imshow(P_vi)
    ax[0].set_title('Policy iteration')
    ax[1].set_title('Value iteration')

    # Cost over iterations
    fig, ax = plt.subplots()
    ax.plot(t_history, c_history_pi)
    ax.plot(t_history, c_history_vi)
    plt.legend(['Policy iteration', 'Value iteration'])
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    if num_iterations <= 20:
        plt.xticks(np.arange(num_iterations) + 1)

    plt.show()
def model_free_network_slq_game_experiment(seed=2):
    npr.seed(seed)

    problem_data = example_system_erdos_renyi(n=3, m=2, p=2, seed=seed)
    # from data_io import load_problem_data
    # data_files = load_problem_data(4)

    problem_data_keys = [
        'A', 'B', 'C', 'Ai', 'Bj', 'Ck', 'varAi', 'varBj', 'varCk', 'Q', 'R',
        'S'
    ]
    A, B, C, Ai, Bj, Ck, varAi, varBj, varCk, Q, R, S = [
        problem_data[key] for key in problem_data_keys
    ]
    n, m, p = [M.shape[1] for M in [A, B, C]]
    q, r, s = [M.shape[0] for M in [Ai, Bj, Ck]]

    # Initial gains
    K0, L0 = get_initial_gains(problem_data, initial_gain_method='zero')

    # Simulation options
    # Std deviation for initial state, defender inputs, attacker inputs, and additive noise
    xstd, ustd, vstd, wstd = 1.0, 1.0, 1.0, 0.0

    # Rollout length
    nt = 5000

    # Number of rollouts
    nr = 1

    # Rollout computation type
    group_option = 'group'

    # Q-function estimation scheme
    # qfun_estimator = 'direct'
    # qfun_estimator = 'lsadp'
    qfun_estimator = 'lstdq'

    sim_options_keys = [
        'xstd', 'ustd', 'vstd', 'wstd', 'nt', 'nr', 'group_option',
        'qfun_estimator'
    ]
    sim_options_values = [
        xstd, ustd, vstd, wstd, nt, nr, group_option, qfun_estimator
    ]
    sim_options = dict(zip(sim_options_keys, sim_options_values))

    num_iterations = 10

    # Policy iteration
    problem_data_known = True
    all_data = policy_iteration(problem_data, problem_data_known, K0, L0,
                                sim_options, num_iterations)
    P, K, L, H, P_history, K_history, L_history, c_history, H_history = all_data
    verify_gare(problem_data, P, algo_str='Policy iteration')

    all_data_list = []
    problem_data_known = False
    num_trials = 10
    for i in range(num_trials):
        all_data_list.append(
            policy_iteration(problem_data, problem_data_known, K0, L0,
                             sim_options, num_iterations))

    def norm_history_plotter(ax, M_history, M_history_list, ylabel_str):
        # Plot the history of the error norm from the single trial in the model-based case
        ax.plot(la.norm(M_history - M_history[-1], ord=2, axis=(1, 2)))

        # Plot the history of the error norm from each of the individual trials in the model-free case
        for i in range(num_trials):
            ax.plot(la.norm(M_history_list[i] - M_history[-1],
                            ord=2,
                            axis=(1, 2)),
                    color='k',
                    alpha=0.5)
        ax.set_ylabel(ylabel_str, rotation=0, labelpad=10)

    fig, ax = plt.subplots(nrows=4, figsize=(6, 8))
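    # Indices of the P, K, L, and H histories within the tuple returned by
    # policy_iteration (see the unpacking of all_data above).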
    data_idx_list = [4, 5, 6, 8]
    ylabel_list = ['P', 'K', 'L', 'H']
    for i in range(4):
        j = data_idx_list[i]
        norm_history_plotter(ax[i], all_data[j],
                             [all_data_list[t][j] for t in range(num_trials)],
                             ylabel_list[i])
        ax[i].set_yscale('log')
        ax[i].legend(['Model-based', 'Model-free'])
    # plt.tight_layout()
    ax[0].set_title('Relative norm of error vs iteration')
    plt.xlabel('Iteration')
    figure_out_path = os.path.join(
        '..', 'results', 'results_model_free_network_slq_game_experiment.png')
    plt.savefig(figure_out_path, dpi=300)
    plt.show()
def model_based_robust_stabilization_experiment():
    seed = 1
    npr.seed(seed)

    problem_data_true, problem_data = gen_double_spring_mass()

    problem_data_keys = [
        'A', 'B', 'C', 'Ai', 'Bj', 'Ck', 'varAi', 'varBj', 'varCk', 'Q', 'R',
        'S'
    ]
    A, B, C, Ai, Bj, Ck, varAi, varBj, varCk, Q, R, S = [
        problem_data[key] for key in problem_data_keys
    ]

    n, m, p = [M.shape[1] for M in [A, B, C]]
    q, r, s = [M.shape[0] for M in [Ai, Bj, Ck]]

    # Synthesize controllers using various uncertainty modeling terms

    # Modify problem data_files
    # LQR w/ game adversary
    # Setting varAi, varBj, varCk = 0 => no multiplicative noise on the game
    problem_data_model_n = copy.deepcopy(problem_data)
    problem_data_model_n['varAi'] *= 0
    problem_data_model_n['varBj'] *= 0
    problem_data_model_n['varCk'] *= 0

    # LQR w/ multiplicative noise
    # Setting C = 0 and varCk = 0 => no game adversary
    problem_data_model_m = copy.deepcopy(problem_data)
    problem_data_model_m['C'] *= 0
    problem_data_model_m['varCk'] *= 0

    # Simulation options
    sim_options = None
    num_iterations = 50
    problem_data_known = True

    # Policy iteration on LQR w/ game adversary and multiplicative noise
    K0, L0 = get_initial_gains(problem_data, initial_gain_method='dare')
    print("LQR w/ game adversary and multiplicative noise")
    P_pi, K_pi, L_pi, H_pi, P_history_pi, K_history_pi, L_history_pi, c_history_pi, H_history_pi = policy_iteration(
        problem_data, problem_data_known, K0, L0, sim_options, num_iterations)
    verify_gare(problem_data,
                P_pi,
                algo_str='Policy iteration - Game w/ Multiplicative noise')

    # Check concavity condition
    Qvv_pi = -S + mdot(C.T, P_pi, C) + np.sum(
        [varCk[k] * mdot(Ck[k].T, P_pi, Ck[k]) for k in range(s)], axis=0)
    if not is_pos_def(-Qvv_pi):
        raise Exception(
            'Problem fails the concavity condition, adjust adversary strength')

    # Check positive definiteness condition
    QKL_pi = Q + mdot(K_pi.T, R, K_pi) - mdot(L_pi.T, S, L_pi)
    if not is_pos_def(QKL_pi):
        raise Exception(
            'Problem fails the positive definiteness condition, adjust adversary strength'
        )
    print(QKL_pi)

    # Policy Iteration on LQR w/ game adversary
    K0n, L0n = get_initial_gains(problem_data_model_n,
                                 initial_gain_method='dare')
    print("LQR w/ game adversary")
    Pn_pi, Kn_pi, Ln_pi, Hn_pi, Pn_history_pi, Kn_history_pi, Ln_history_pi, cn_history_pi, Hn_history_pi = policy_iteration(
        problem_data_model_n, problem_data_known, K0n, L0n, sim_options,
        num_iterations)
    verify_gare(problem_data_model_n,
                Pn_pi,
                algo_str='Policy iteration - Game w/o Multiplicative noise')

    # Policy Iteration on LQR w/ multiplicative noise
    K0m, L0m = get_initial_gains(problem_data_model_m,
                                 initial_gain_method='dare')
    print("LQR w/ multiplicative noise")
    Pm_pi, Km_pi, Lm_pi, Hm_pi, Pm_history_pi, Km_history_pi, Lm_history_pi, cm_history_pi, Hm_history_pi = policy_iteration(
        problem_data_model_m, problem_data_known, K0m, L0m, sim_options,
        num_iterations)
    verify_gare(problem_data_model_m,
                Pm_pi,
                algo_str='Policy iteration - LQR w/ Multiplicative noise')

    # LQR on true system
    A_true, B_true, Q_true, R_true = [
        problem_data_true[key] for key in ['A', 'B', 'Q', 'R']
    ]
    n_true, m_true = [M.shape[1] for M in [A_true, B_true]]
    Pare_true, Kare_true = dare_gain(A_true, B_true, Q_true, R_true)

    # LQR on nominal system, no explicit robust control design
    Pce, Kce = dare_gain(A, B, Q, R)

    # Check if synthesized controllers stabilize the true system
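    # Each designed gain is padded with zeros so it can act on the
    # higher-dimensional true system; Kol_true is the zero-gain (open-loop) baseline.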
    K_pi_true = np.hstack([K_pi, np.zeros([m, n])])
    Kn_pi_true = np.hstack([Kn_pi, np.zeros([m, n])])
    Km_pi_true = np.hstack([Km_pi, np.zeros([m, n])])
    Kce_true = np.hstack([Kce, np.zeros([m, n])])
    Kol_true = np.zeros_like(Kce_true)

    control_method_strings = [
        'open-loop     ', 'cert equiv    ', 'noise         ', 'game          ',
        'noise + game  ', 'optimal       '
    ]
    K_list = [Kol_true, Kce_true, Km_pi_true, Kn_pi_true, K_pi_true, Kare_true]
    AK_list = [A_true + np.dot(B_true, K) for K in K_list]
    QK_list = [Q_true + mdot(K.T, R_true, K) for K in K_list]
    specrad_list = [specrad(AK) for AK in AK_list]
    cost_list = [
        np.trace(dlyap(AK.T, QK)) if sr < 1 else np.inf
        for AK, QK, sr in zip(AK_list, QK_list, specrad_list)
    ]

    set_numpy_decimal_places(1)

    output_text_filename = 'results_model_based_robust_stabilization_experiment.txt'
    output_text_path = os.path.join('..', 'results', output_text_filename)
    with open(output_text_path, 'w') as f:
        header_str = 'method       |  specrad  |   cost   |  gains'
        print(header_str, file=f)
        for control_method_string, sr, cost, K in zip(control_method_strings,
                                                      specrad_list, cost_list,
                                                      K_list):
            line_str = '%s  %.3f %8s      %s' % (control_method_string, sr,
                                                 '%10.0f' % cost, K)
            print(line_str, file=f)
def main():
    mode = 25
    in_file = open(
        "F:\\AI-1\\projects\\reinforcement learning\\RL_Project_python\\sample_%s.txt"
        % (mode))
    row_col = in_file.readline().split()
    row = int(row_col[0])
    column = int(row_col[1])

    Board = []
    k = 1
    for i in range(row):
        for j in range(column):
            Board.append(index(0, 0, 0, 0, 0, k, 0, ""))
            k += 1

    North = []
    East = []
    South = []
    West = []

    for i in range(0, row * column):
        coordinates = in_file.readline().split()
        North.append(eval(coordinates[0]))
        East.append(eval(coordinates[1]))

        South.append(eval(coordinates[2]))
        West.append(eval(coordinates[3]))
        Board[i].N = coordinates[0]
        Board[i].E = coordinates[1]
        Board[i].S = coordinates[2]
        Board[i].W = coordinates[3]

    temp = in_file.readline().split()

    Source = int(temp[0])
    goals = []
    for i in range(1, len(temp)):
        goals.append(int(temp[i]))
        Board[int(temp[i]) - 1].goal = True
    in_file.close()

    M = GUI.maze(Board, row, column, North, East, South, West, Source, goals,
                 mode)

    selection = int(
        input("1.value iteration\n2.policy iteration\n3.Q_learning\n"))
    if (selection == 1):
        Board = value_iteration.value_iteration(Board, 0.0001, row)
    elif (selection == 2):
        Board = policy_iteration.policy_iteration(Board, 0.0001, row)
    elif (selection == 3):
        Board = Q_learning.Q_learning(Board, row)

    for i in range(len(Board)):
        Board[i].print_index()

    # displaying the policy
    M = GUI.maze(Board, row, column, North, East, South, West, Source, goals,
                 mode)

    # playing the game starting with source point
    current_index = Source - 1
    final_points = []

    for i in range(len(goals)):
        final_points.append(goals[i] - 1)
        print(final_points[i])
    while (current_index not in final_points):

        if (Board[current_index].best_action == "N"):
            current_index -= 1
            print("N", current_index)

            M = GUI.maze(Board, row, column, North, East, South, West,
                         current_index + 1, goals, mode)

        elif (Board[current_index].best_action == "E"):
            current_index += row
            print("E", current_index)

            M = GUI.maze(Board, row, column, North, East, South, West,
                         current_index + 1, goals, mode)

        elif (Board[current_index].best_action == "S"):
            current_index += 1
            print("S", current_index)

            M = GUI.maze(Board, row, column, North, East, South, West,
                         current_index + 1, goals, mode)

        elif (Board[current_index].best_action == "W"):
            current_index -= row
            print("W", current_index)

            M = GUI.maze(Board, row, column, North, East, South, West,
                         current_index + 1, goals, mode)
Example 7
import numpy as np
import policy_iteration as pi
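
# T[a, s, s'] is the probability of moving from state s to s' under action a
# (2 actions, 6 states); each row of T[a] sums to 1. R[a, s] is the reward
# associated with taking action a in state s.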

T = np.array([[[0.1, 0.4, 0.3, 0.2, 0, 0], [0.1, 0, 0.4, 0.3, 0.1, 0.1],
               [0, 0.3, 0, 0.4, 0.1, 0.2], [0, 0.1, 0.3, 0.3, 0.2, 0.1],
               [0, 0, 0, 0, 0.7, 0.3], [0, 0, 0, 0, 0.3, 0.7]],
              [[0.4, 0.3, 0.2, 0.1, 0, 0], [0.1, 0, 0.2, 0.4, 0.2, 0.1],
               [0, 0.2, 0, 0.4, 0.2, 0.2], [0, 0.1, 0.4, 0.2, 0.2, 0.1],
               [0, 0, 0, 0.3, 0.7, 0], [0, 0, 0, 0, 0.7, 0.3]]])

R = np.array([[1, 3, 1, 2, 4, 4], [4, 2, 2, 1, 3, 5]])

for gamma in [0.5, 0.8]:
    print(f'For gamma = {gamma}')
    h, acplan, V, V_hat = pi.policy_iteration(T, R, gamma)
    print('Optimal Policy = ', acplan[h, :])
    print('Optimal Value  = ', V[h, :])
    print('')
Example 8
if __name__ == '__main__':
    
    for ENV_NAME in ENV_NAMES:
        gamma = 0.9
        theta = 0.0001        
        env_kwargs = {
            'map_name': ENV_NAME,
            'slip_rate': .2,
            'rewards': (-0.1, -1, 1)
        }
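        # 'rewards' is presumably the (step, hole, goal) reward triple for this
        # customized FrozenLakeEnv; 'slip_rate' is the chance of a random move.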
        print(ENV_NAME)
        pi_env = FrozenLakeEnv(**env_kwargs)
        pi_env = pi_env.unwrapped
        print('policy iteration begin')
        pi_policy, pi_V, pi_iter, pi_time = policy_iteration(pi_env, discount_factor=gamma, theta=theta)
        print('policy iteration end')
        visualize_policy(pi_policy, ENV_NAME, pi_env.desc.shape,'pi', 'Policy Iteration - Optimal Policy {} Iterations'.format(pi_iter))
        visualize_value(pi_V, ENV_NAME, pi_env.desc.shape,'pi', 'Policy Iteration - Estimated Value of each State')


    for ENV_NAME in ENV_NAMES:
        gamma = 0.85
        theta = 0.001        
        env_kwargs = {
            'map_name': ENV_NAME,
            'slip_rate': .2,
            'rewards': (-0.1, -1, 1)
        }
        vi_env = FrozenLakeEnv(**env_kwargs)
        vi_env = vi_env.unwrapped
Example 9
from os import path

import mdp_models as mm
import policy_iteration as polit

# Create MDPs for each problem
jcr = mm.JCR_MDP()
jcr2 = mm.JCR_MDP_2()


for mdp in [jcr, jcr2]:
    # Solve both problems with policy iteration
    # and plot results.
    pi_list, v_list = polit.policy_iteration(mdp)
    pi = pi_list[-1]
    v = v_list[-1]

    # Plot optimal policy
    polit.visualize_policy_plot(pi, 'Optimal Policy', 
        path.join('images', f'policy_{str(mdp)}.png'))
    # Plot optimal value function
    polit.visualize_values(v, 'Optimal Value Function', 
        path.join('images', f'values_{str(mdp)}.png'))
Example 10
def main():
    seed = 0
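    # Map symbols: '&' start, '.' frozen, '#' hole, '$' goal (inferred from the layouts below).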

    # Small lake
    small_lake = [['&', '.', '.', '.'], ['.', '#', '.', '#'],
                  ['.', '.', '.', '#'], ['#', '.', '.', '$']]

    big_lake = [['&', '.', '.', '.', '.', '.', '.', '.'],
                ['.', '.', '.', '.', '.', '.', '.', '.'],
                ['.', '.', '.', '#', '.', '.', '.', '.'],
                ['.', '.', '.', '.', '.', '#', '.', '.'],
                ['.', '.', '.', '#', '.', '.', '.', '.'],
                ['.', '#', '#', '.', '.', '.', '#', '.'],
                ['.', '#', '.', '.', '#', '.', '#', '.'],
                ['.', '.', '.', '#', '.', '.', '.', '$']]

    # lake = big_lake
    lake = small_lake
    size = len(lake) * len(lake[0])
    env = FrozenLake(lake, slip=0.1, max_steps=size, seed=seed)
    #env.play()

    print('# Model-based algorithms')
    gamma = 0.9
    theta = 0.001
    max_iterations = 10000

    print('')

    print('## Policy iteration')
    policy, value = policy_iteration(env, gamma, theta, max_iterations)
    env.render(policy, value)

    print('')

    print('## Value iteration')
    optimal_policy, value = value_iteration(env, gamma, theta, max_iterations)
    env.render(optimal_policy, value)

    max_episodes = 2000
    eta = 0.5
    epsilon = 0.5
    print('# Model-free algorithms')
    print('## sarsa')
    policy, value = sarsa(env, max_episodes, eta, gamma, epsilon, seed=seed)
    env.render(policy, value)
    print('## q_learning')
    policy, value = q_learning(env,
                               max_episodes,
                               eta,
                               gamma,
                               epsilon,
                               seed=seed)
    env.render(policy, value)

    print('# Model-free algorithms with linear function approximation')
    linear_env = LinearWrapper(env)

    print('## linear sarsa')
    parameters = linear_sarsa(linear_env,
                              max_episodes,
                              eta,
                              gamma,
                              epsilon,
                              seed=seed)
    policy, value = linear_env.decode_policy(parameters)
    linear_env.render(policy, value)

    print('## linear q_learning')
    parameters = linear_q_learning(linear_env,
                                   max_episodes,
                                   eta,
                                   gamma,
                                   epsilon,
                                   seed=seed)
    policy, value = linear_env.decode_policy(parameters)
    linear_env.render(policy, value)

    print('# finding number of episodes for optimal policy')
    for episodes in np.arange(500, 5000, 100):
        print(f'sarsa episodes = {episodes}')
        policy, value = sarsa(env, episodes, eta, gamma, epsilon, seed=seed)
        # set the policy for the goal state to 0 to compare with the optimal policy
        policy[15] = 0
        if np.array_equal(policy, optimal_policy):
            break
    env.render(policy, value)

    for episodes in np.arange(500, 5000, 100):
        print(f'q_learning episodes = {episodes}')
        policy, value = q_learning(env,
                                   episodes,
                                   eta,
                                   gamma,
                                   epsilon,
                                   seed=seed)
        # set the policy for the goal state to 0 to compare with the optimal policy
        policy[15] = 0
        if np.array_equal(policy, optimal_policy):
            break
    env.render(policy, value)

    print('find best policy on big map')

    lake = big_lake
    size = len(lake) * len(lake[0])
    env = FrozenLake(lake, slip=0.1, max_steps=size, seed=seed)

    print('## Value iteration')
    optimal_policy, value = value_iteration(env, gamma, theta, max_iterations)
    env.render(optimal_policy, value)

    max_episodes = 200000
    eta = 0.8
    epsilon = 0.99
    gamma = 0.91

    print('## sarsa')
    policy, value = sarsa(env, max_episodes, eta, gamma, epsilon, seed=seed)
    env.render(policy, value)
    correct = (policy == optimal_policy).sum()
    print(f'sarsa optimalness = {100 * correct / size}%')

    print('## q_learning')
    policy, value = q_learning(env,
                               max_episodes,
                               eta,
                               gamma,
                               epsilon,
                               seed=seed)
    env.render(policy, value)
    correct = (policy == optimal_policy).sum()
    print(f'q-learning optimalness = {100 * correct / size}%')
Example 11
def run_discrete(environment_name, mapping=None, shape=None):
    problem = gym.make(environment_name)
    print('== {} =='.format(environment_name))
    print('Actions:', problem.action_space.n)
    print('States:', problem.observation_space.n)
    print(problem.desc)
    print()

    if environment_name == 'TaxiEnv-v1':
        print('== Value Iteration ==')
        value_policy, iters = value_iteration_local(problem)
        print('Iterations:', iters)
        print()

        print('== Policy Iteration ==')
        policy, iters = policy_iteration_local(problem)
        print('Iterations:', iters)
        print()

        diff = sum([
            abs(x - y)
            for x, y in zip(policy.flatten(), value_policy.flatten())
        ])
        if diff > 0:
            print('Discrepancy:', diff)
            print()

        if shape is not None:
            print('== Policy ==')
            print_policy(value_policy, mapping, shape)
            print_policy(policy, mapping, shape)
            print()

        taxi_q_learning()
    else:
        print('== Value Iteration ==')
        value_policy_local, iters = value_iteration_local(problem)
        value_policy, Vi, iters, vi_time = value_iteration(problem)
        print('Iterations:', iters)
        print()

        print('== Policy Iteration ==')
        policy_local, iters = policy_iteration_local(problem)
        policy, V, iters, pi_time = policy_iteration(problem)
        print('Iterations:', iters)
        print()

        visualize_policy(value_policy, environment_name, problem.desc.shape,
                         'Optimal policy - Modified transition model')
        visualize_value(Vi, environment_name, problem.desc.shape,
                        'Value estimates - Modified transition model')

        diff = sum([
            abs(x - y)
            for x, y in zip(policy.flatten(), value_policy.flatten())
        ])
        if diff > 0:
            print('Discrepancy:', diff)
            print()

        if shape is not None:
            print('== Policy ==')
            print_policy(value_policy_local, mapping, shape)
            print_policy(policy_local, mapping, shape)
            print()

        frozenlake_q_learning()

        Q, stats, Nsa, final_policy = q_learning(problem, 'greedy', 1000)

        plotting.plot_episode_stats(stats)

    return policy
Example 12
def run_policy_iteration():

    # Run PI across various problem sizes (set this flag to True to re-run the sweep)
    if False:
        convergence_times = {}
        deltas = {}
        for problem_size in PROBLEM_SIZES:
            convergence_times[problem_size] = []
            deltas[problem_size] = []
            for seed in SEEDS:
                print(f"Problem Size: {problem_size}, Seed: {seed}")
                env = generate_frozen_lake(problem_size, p=0.8, seed=seed)
                tic = time.time()
                opt_V, opt_policy, delta = policy_iteration.policy_iteration(
                    env, discount_factor=0.999, max_iteration=10000)
                toc = time.time()
                elapsed_time = (toc - tic) * 1000
                convergence_times[problem_size].append(elapsed_time)
                print(f"Time to converge: {elapsed_time: 0.3} ms")
                deltas[problem_size].append(delta)

        # Save the values from running
        with open('params/policy_iteration/deltas', 'wb') as deltas_file_pi:
            pickle.dump(deltas, deltas_file_pi)
        with open('params/policy_iteration/times', 'wb') as times_file_pi:
            pickle.dump(convergence_times, times_file_pi)

    # Run policy iteration across various discount factors
    if True:
        convergence_times = {}
        deltas = {}
        policies = {}
        rewards = {}
        problem_size = MEDIUM_SIZE
        p = 0.8
        L = [0.60, 0.70, 0.80, 0.85, 0.90, 0.95, 0.99]
        # L = [0.70, 0.80, 0.90, 0.99]
        for discount in L:
            convergence_times[discount] = []
            deltas[discount] = []
            policies[discount] = []
            rewards[discount] = []
            for seed in SEEDS:
                print(f"p: {p}, Seed: {seed}")
                env = generate_frozen_lake(problem_size, p=p, seed=seed)
                tic = time.time()
                opt_V, opt_policy, delta = policy_iteration.policy_iteration(
                    env, discount_factor=discount, max_iteration=2000)
                toc = time.time()
                elapsed_time = (toc - tic) * 1000
                convergence_times[discount].append(elapsed_time)
                print(f"Time to converge: {elapsed_time: 0.3} ms")
                deltas[discount].append(delta)
                policies[discount].append(opt_policy)

                wins, total_reward, average_reward = play_episodes(
                    env, 300, opt_policy, False)
                print(f"Total reward: {total_reward}")
                rewards[discount].append(total_reward)

            # Save the values from running
            pickle.dump(deltas, open('params/policy_iteration/d_deltas', 'wb'))
            pickle.dump(convergence_times,
                        open('params/policy_iteration/d_times', 'wb'))
            pickle.dump(policies,
                        open('params/policy_iteration/d_policies', 'wb'))
            pickle.dump(rewards, open('params/policy_iteration/d_rewards',
                                      'wb'))

    # Plot things
    if True:
        deltas = pickle.load(open('params/policy_iteration/d_deltas', 'rb'))

        convergence_times = pickle.load(
            open('params/policy_iteration/times', 'rb'))

        rewards = pickle.load(
            open('params/policy_iteration/d_rewards', 'rb'))

        for size in PROBLEM_SIZES:
            print(
                f"Average time to converge: {np.mean(convergence_times[size])} ms, std: {np.std(convergence_times[size])} ms"
            )
        # plotting.plot_pi_convergence_size(deltas)
        # plotting.plot_pi_convergence_d(deltas)
        plotting.plot_pi_convergence_dr(rewards)
Example 13
def frozen_pi_experiment(env_name, new_lake):
    np.random.seed(0)
    min_r = -100.0
    max_r = 100.0
    env = MyWrapper.TransformReward(gym.make(env_name, desc=new_lake),
                                    lambda r: np.clip(r * 100.0, min_r, max_r))
    env.seed(0)
    env.reset()
    total_times = [0] * 10
    gammas = [0] * 10
    num_iterations = [0] * 10
    average_reward_list = [0] * 10
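    # Sweep the discount factor gamma over 0.05, 0.15, ..., 0.95.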

    for i in range(0, 10):
        start_time = time.time()
        policy_iter_instance = policy_iteration(env, (i + 0.5) / 10, 0.0001,
                                                100000)
        improved_policy, value_vector, iteration_counter = (
            policy_iter_instance.policy_improvement())
        average_improved_policy_reward = policy_iter_instance.policy_evaluation(
            improved_policy)  # average reward per iteration
        end_time = time.time()

        gammas[i] = (i + 0.5) / 10
        # print(iteration_counter)
        num_iterations[i] = iteration_counter
        total_times[i] = (end_time - start_time) * 1000  # in milliseconds
        average_reward_list[i] = average_improved_policy_reward

    # Plots: gamma vs reward, gamma vs time, gamma vs iterations,
    # iterations vs reward, and iterations vs computation time.
    """plot 1: gamma vs reward """
    plt.title("gamma_vs_reward")
    plt.plot(gammas, average_reward_list)
    plt.xlabel("gammas")
    plt.ylabel("average_reward_by_optimal_policy")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_PolicyIteration_gamma_vs_reward.png"
    )
    plt.close()
    plt.figure()
    #
    #
    """ plot 2: gamma vs iterations"""
    plt.title("gamma_vs_iteration")
    plt.plot(gammas, num_iterations)
    plt.xlabel("gammas")
    plt.ylabel("num_iterations")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_PolicyIteration_gamma_vs_iteration.png"
    )
    plt.close()
    plt.figure()
    #
    """ plot 3: gamma vs time"""
    plt.title("gamma_vs_time")
    plt.plot(gammas, total_times)  # times are in milliseconds
    plt.xlabel("gammas")
    plt.ylabel("computational time (milliseconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_PolicyIteration_gamma_vs_time.png"
    )
    plt.close()
    plt.figure()
    #
    #
    #
    #
    """ plot 4: iterations vs reward"""
    plt.title("iteration_vs_reward")
    # the iteration count here is the number of iterations at which policy improvement converged
    plt.plot(num_iterations, average_reward_list)
    plt.xlabel("num_iterations")
    plt.ylabel("average_reward_by_optimal_policy")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_PolicyIteration_iteration_vs_reward.png"
    )
    plt.close()
    plt.figure()
    #
    #
    """ plot 5: iterations vs computation time"""
    plt.title("iteration_vs_time")
    plt.plot(num_iterations, total_times)  # times are in milliseconds
    plt.xlabel("num_iterations")
    plt.ylabel("computational time (milliseconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_PolicyIteration_iteration_vs_time.png"
    )
    plt.close()
    plt.figure()