Example #1
    def value_iteration(self, discount=0.999, epsilon=0.001, save_policy=False, save_plot=False):
        vi = mdp.ValueIteration(transitions=self.prob, reward=self.rewards, gamma=discount, epsilon=epsilon)

        run_stats = vi.run()
        self.plot(run_stats, 'Fire Management - Value Iteration')

        expected_values = vi.V
        optimal_policy = vi.policy
        iterations = vi.iter
        time = vi.time

        return [expected_values, optimal_policy, iterations, time]
Example #2
def vi_experiment_extended(P, R):
    """
    compares regular VI vs Gauss-Seidel Value iteration
    :param P:
    :param R:
    :return:
    """
    vi_reg = mdp.ValueIteration(P, R, 0.65)
    vi_reg_stat = vi_reg.run()
    print(vi_reg.iter, vi_reg.time)

    vi_gs = mdp.ValueIterationGS(P, R, 0.65, max_iter=1000)
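    # Gauss-Seidel VI sweeps the states in place, reusing values already updated within the
    # current sweep, so it typically needs fewer sweeps than standard (Jacobi-style) VI.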
    vi_gs_stat = vi_gs.run()
    print(vi_gs.iter, vi_gs.time)

    return vi_reg_stat, vi_gs_stat
Example #3
def vi_experiment_gamma(P, R):
    """
    experiments on effectiveness of gamma on # of iterations and rewards
    :param P:
    :param R:
    :param problem:
    :return:
    """
    gammas = np.linspace(0.05, 0.95, 10)
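    # A larger gamma lengthens the effective planning horizon (roughly 1/(1-gamma)), so VI
    # usually needs more sweeps to drive the Bellman error below epsilon.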
    vi_stats = []
    for gamma in gammas:
        vi = mdp.ValueIteration(P, R, gamma, epsilon=0.001)
        vi_stat = vi.run()
        vi_stats.append(vi_stat)

    return vi_stats, gammas
Example #4
def vi_experiment_epsilon(P, R):
    """
    experiments on effectiveness of epsilon on # of iteration and time
    :param P:
    :param R:
    :param problem:
    :return:
    """

    epsilons = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8]
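    # epsilon is the stopping threshold on the change in the value function between sweeps;
    # tightening it grows the iteration count roughly in proportion to log(1/epsilon).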
    vi_stats = []
    for epsilon in epsilons:
        vi = mdp.ValueIteration(P, R, 0.65, epsilon=epsilon)
        vi_stat = vi.run()
        vi_stats.append(vi_stat)

    return vi_stats, epsilons
Example #5
def runValue(transitions, rewards, envName, maxIters, discountRange):
    stats = []
    for discount in discountRange:
        alg = mdp.ValueIteration(transitions,
                                 rewards,
                                 discount,
                                 max_iter=maxIters)
        result = alg.run()
        runStats = alg.run_stats[-1]
        stats.append([
            discount, alg.time, runStats['Iteration'], runStats['Error'],
            runStats['Reward'],
            np.mean(alg.V), alg.policy
        ])
        # stats.append([discount, alg.time, runStats['Iteration'], alg.error_mean, runStats['Reward'], alg.v_mean, alg.policy])

    # Transpose the per-discount rows: [discount, time, iteration, error, final reward, mean V, policy]
    statsT = list(zip(*stats))
    discounts = statsT[0]
    times = statsT[1]
    iterations = statsT[2]
    errors = statsT[3]
    wins = statsT[4]  # final-iteration reward (not plotted below)
    rewards = statsT[5]  # mean value function; plotted as "Rewards" below

    # Pick the policy whose mean value function is highest
    stats = sorted(stats, key=lambda x: x[5], reverse=True)
    topPolicy = stats[0][-1]

    title = '{0} Value Iteration - Time'.format(envName)
    fig, ax = plotAx(discounts, times, title, 'Discount Factor', 'Time')
    show(title, fig, ax)

    title = '{0} Value Iteration - Iterations'.format(envName)
    fig, ax = plotAx(discounts, iterations, title, 'Discount Factor',
                     'Iteration')
    show(title, fig, ax)

    title = '{0} Value Iteration - Error'.format(envName)
    fig, ax = plotAx(discounts, errors, title, 'Discount Factor', 'Error')
    show(title, fig, ax)

    title = '{0} Value Iteration - Reward'.format(envName)
    fig, ax = plotAx(discounts, rewards, title, 'Discount Factor', 'Rewards')
    show(title, fig, ax)

    return topPolicy
Example #6
def frozen_lake_vi(P, R, gamma_range, mapping, shape):
    print("== Value Iteration == ")
    print("gamma    # Iterations    time (ms)")
    prev_policy = []
    prev_gamma = 0
    no_diff_list = []
    standard_policy = []

    for i, gamma in enumerate(gamma_range):
        vi = mdp.ValueIteration(P, R, gamma, max_iter=10000, epsilon=0.001)
        vi.run()

        timestr = "%0.3f" % (vi.time * 1000)
        atab = " \t"
        if vi.iter <= 99:
            spacing = 4
        else:
            spacing = 3

        gamma_str = "%0.2f" % gamma
        msg = gamma_str + atab + str(vi.iter) + atab * spacing + timestr

        print(msg)
        if gamma == 0.95:
            standard_policy.append((vi.policy, mapping, shape))

        if list(vi.policy) == list(prev_policy):
            no_diff_list.append([prev_gamma, gamma])

        prev_policy = vi.policy
        prev_gamma = gamma

    print()
    print("Value Iteration Policy at Gamma = 0.95")
    contents = standard_policy.pop()
    print_policy(contents[0], contents[1], contents[2])
    print()
    no_diff_len = len(no_diff_list)
    str_list = ["No Policy Difference Between These Gammas: "] * no_diff_len
    policy_diffs = zip(str_list, no_diff_list)
    for diff in policy_diffs:
        print("%s %0.2f %0.2f" % (diff[0], diff[1][0], diff[1][1]))
    print()
Example #7
def run_VI_experiments(problem, name, load=True):
    data = []
    if load:
        with open(os.path.join(out, 'VI_results_' + name + '.pkl'), 'rb') as f:
            df = pickle.load(f)
        return df

    print("===========\nValue Iteration\n==========")
    for gamma in config['discount']:
        for param in problem.params:
            P, R = problem.p(**param)
            mdp_util.check(P, R)
            vi = MDP.ValueIteration(P,
                                    R,
                                    gamma,
                                    epsilon=0.01,
                                    max_iter=1000,
                                    skip_check=False)
            run_stats = vi.run()
            data.append([
                problem.name, vi.S, vi.A, vi.gamma,
                [i['Time'] for i in run_stats], vi.iter, vi.max_iter,
                [i['Mean V'] for i in run_stats],
                np.std(vi.V), [i['Max V'] for i in run_stats],
                [i['Reward'] for i in run_stats],
                [i['Error'] for i in run_stats], vi.policy
            ])
            print(problem.name, vi.S, vi.A, gamma)

    df = pd.DataFrame(data,
                      columns=[
                          'name', '#states', '#actions', 'discount', 'time',
                          'iter', 'max_iter', 'mean_V', 'std_V', 'max_V',
                          'reward', 'error_mean', 'policy'
                      ])

    with open(os.path.join(out, 'VI_results_' + name + '.pkl'), 'wb') as f:
        pickle.dump(df, f)

    return df
def run_forest():
    np.random.seed(0)
    P, R = example.forest(S=5, r1=3, r2=15, p=0.2)
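    # forest(): r1 = reward for 'Wait' in the oldest state, r2 = reward for 'Cut' in the
    # oldest state, p = probability that a fire resets the forest after a 'Wait' action.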
    print("Transition Array: ")
    print(P.shape)
    print(P)  # Transition array A x S x S
    print("Reward Array: ")
    print(R.shape)
    print(R)  # Reward array S x A

    # TODO
    gamma_range = np.array([0.1, 0.9, 0.99])
    alpha_range = np.array([0.01, 0.5, 0.99])
    epsilon_range = np.array([0.1, 0.5, 0.95])
    e_decay_range = np.array([0.1, 0.5, 0.999])

    # gamma_range = np.append(np.linspace(0.1, 0.9, 9), np.linspace(0.91, 0.99, 9))
    # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4))
    # epsilon_range = np.linspace(0.1, 1.0, 10)
    # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9))

    difference_list = np.zeros(gamma_range.shape)
    value_iteration_list = np.zeros(gamma_range.shape)
    value_time_list = np.zeros(gamma_range.shape)
    value_reward_list = np.zeros(gamma_range.shape)
    value_error_list = np.zeros(gamma_range.shape)

    policy_iteration_list = np.zeros(gamma_range.shape)
    policy_time_list = np.zeros(gamma_range.shape)
    policy_reward_list = np.zeros(gamma_range.shape)
    policy_error_list = np.zeros(gamma_range.shape)

    for i, gamma in enumerate(gamma_range):
        print('Gamma %0.2f' % gamma)

        vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=10000)
        vi.setVerbose()
        vi.run()
        vi_stats = vi.run_stats
        value_iteration_list[i] = vi_stats[-1]['Iteration']
        value_time_list[i] = vi_stats[-1]['Time']
        value_reward_list[i] = vi_stats[-1]['Reward']
        value_error_list[i] = vi_stats[-1]['Error']
        plot_stats(vi_stats, ('vi_forest_%0.2f' % gamma))

        pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=10000, eval_type=1)
        pi.setVerbose()
        pi.run()
        stats = pi.run_stats
        policy_iteration_list[i] = stats[-1]['Iteration']
        policy_time_list[i] = stats[-1]['Time']
        policy_reward_list[i] = stats[-1]['Reward']
        policy_error_list[i] = stats[-1]['Error']
        plot_stats(stats, ('pi_forest_%0.2f' % gamma))
        print('Policies Found')
        print('Value Iteration: ' + str(vi.policy))
        print('Policy Iteration: ' + str(pi.policy))

        difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
        difference_list[i] = difference1
        print("Discrepancy in Policy and Value Iteration: ", difference1)
        print()

    # Plotting
    # Error v Iteration
    plt.clf()
    plt.title('Value Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(value_iteration_list), list(value_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_error_v_iteration.png')

    # Reward v Gamma
    plt.clf()
    plt.title('Value Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Value Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(value_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Value Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(value_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_time.png')

    # Reward vs Iterations
    plt.clf()
    plt.title('Value Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(value_iteration_list), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_iterations.png')

    # Policy
    # Error v Iteration
    plt.clf()
    plt.title('Policy Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(policy_iteration_list), list(policy_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_error_v_iteration.png')

    # Gamma v Reward
    plt.clf()
    plt.title('Policy Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Policy Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(policy_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Policy Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(policy_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_time.png')

    # Reward vs Iterations
    plt.clf()
    plt.title('Policy Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(policy_iteration_list), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_iterations.png')

    # Gamma vs Policy Differences
    plt.clf()
    plt.title('Gamma v Policy Differences')
    plt.xlabel('Gamma')
    plt.ylabel('Policy Differences')
    plt.plot(list(gamma_range), list(difference_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/gamma_v_differences.png')
    plt.close('all')

    prev_Q = None
    thresh = 1e-4
    print('== Q Learning ==')
    for i, gamma in enumerate(gamma_range):
        for j, alpha in enumerate(alpha_range):
            for k, ep in enumerate(epsilon_range):
                for l, ed in enumerate(e_decay_range):
                    # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed))
                    ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001,
                                       epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=10e4)
                    stats = ql.run()
                    plot_stats(stats, ('ql_forest_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed)))

                    # print('Policy: ')
                    # print(ql.policy)
                    # print(ql.run_stats)

                    df = pd.DataFrame.from_records(ql.run_stats)
                    iteration_list = df['Iteration'][-100:]
                    windowed_reward = df['Reward'][-100:].mean()
                    error_list = df['Error'][-100:].mean()

                    if prev_Q is None:
                        prev_Q = ql.Q
                    else:
                        variation = np.absolute(np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max()
                        res = np.abs(np.subtract(np.asarray(prev_Q), np.asarray(ql.Q)))
                        print('Result: ')
                        print(res)
                        print('Variation: ')
                        print(variation)
                        print('Mean Reward for Last 100 Iterations:')
                        print(windowed_reward)
                        if np.all(res < thresh) or variation < thresh or windowed_reward > 1.0:
                            print('Breaking! Below Thresh')
                            print('Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(
                                gamma, alpha, ep, ed))
                            print('Optimal Policy: ')
                            print(ql.policy)
                            break
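                    # Note: this break only exits the innermost e_decay loop; the outer
                    # gamma/alpha/epsilon loops keep running. prev_Q is also only set once,
                    # so every later comparison is against the first run's Q table.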
Example #9
def run(verbose=False):
    # MDP Forest Problem
    # transitions, reward = example.forest()
    nS = 1000
    # transitions, reward = example.forest(S=nS, r1=250, r2=120, p=0.01, is_sparse=False)
    transitions, reward = example.forest(S=nS,
                                         r1=1045,
                                         r2=1025,
                                         p=0.01,
                                         is_sparse=False)

    # print(transitions)
    # print (reward)
    # return
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(transitions, reward, 0.75, max_iter=10000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    # print(pi.run_stats)
    # return

    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(transitions, reward, 0.75, max_iter=100000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)

    if (vi.policy == pi.policy):
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    # transitions, reward, gamma,
    #  alpha=0.1, alpha_decay=0.99, alpha_min=0.001,
    #  epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99,
    #  n_iter=10000, skip_check=False, iter_callback=None,
    #  run_stat_frequency=None):

    ql = mdp.QLearning(transitions,
                       reward,
                       0.75,
                       alpha=0.3,
                       epsilon_min=0.005,
                       n_iter=500000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    # Output
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    print(ql.policy)
    print('Q-Learning # of Iterations: %i' % q_counter)  # q_counter is assumed to be a module-level counter updated via an iter_callback (not shown in this snippet)
    print('Clock time')
    print(end_t - start_t)

    if (vi.policy == pi.policy):
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    if (vi.policy == ql.policy):
        print('Forest – QL and VI Policies are the same!')
    else:
        print('Forest – QL and VI Policies are NOT the same.')
    if (pi.policy == ql.policy):
        print('Forest – PI and QL Policies are the same!')
    else:
        print('Forest – PI and QL Policies are NOT the same.')

    # A Q-Learning Algorithm
    #
    # Source:
    #   https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
    """
Example #10
def vi_pi_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()

    return vi, pi
Example #11
def vi_pi_q_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    q = mdp.QLearning(P, R, 0.6, alpha=0.2).run()
    return vi, pi, q
Example #12
def run_vi(envs, gamma=0.96, max_iters=1000, verbose=True):
    all_rewards = []
    all_iters = []
    all_error_means = []
    all_error_dfs = []
    time_per_run = []

    num_episodes = len(envs)
    for env, episode in zip(envs, range(num_episodes)):
        P, R = env
        fm_vi = mdp.ValueIteration(
            transitions=P,
            reward=R,
            gamma=gamma,
            max_iter=max_iters,
        )
        # if verbose: fm_vi.setVerbose()
        t0 = time()
        fm_vi.run()
        time_elapsed = time() - t0
        time_per_run.append(time_elapsed)
        if verbose:
            print("Forest Management VI Episode", episode, "runtime (s):",
                  time_elapsed)

        # add error means for each episode
        error_m = np.sum(fm_vi.error_mean)
        all_error_means.append(error_m)
        if verbose:
            print("Forest Management VI Episode", episode, "error mean:",
                  error_m, '\n')

        error_over_iters = fm_vi.error_over_iters
        # print(error_over_iters)
        error_plot_df = pd.DataFrame(0,
                                     index=np.arange(1, max_iters + 1),
                                     columns=['error'])
        error_plot_df.iloc[0:len(error_over_iters), :] = error_over_iters
        all_error_dfs.append(error_plot_df)

        print_policy(fm_vi.policy)
        # print(fm_vi.policy, '\n', R, '\n')

        # rewards = calc_reward(fm_vi.policy, R)
        # total_reward = np.sum(rewards)
        # all_rewards.append(total_reward)
        # if verbose: print("Forest Management VI Episode", episode, "reward:", total_reward, '\n')

        all_iters.append(fm_vi.iter)
        if verbose:
            print("Forest Management VI Episode", episode, "last iter:",
                  fm_vi.iter, '\n')

    filename = "fm_vi_stats.csv"
    # note: all_rewards stays empty unless the reward calculation above is re-enabled
    output_to_csv(filename, all_iters, all_rewards)

    combined_error_df = pd.concat(all_error_dfs, axis=1)
    mean_error_per_iter = combined_error_df.mean(axis=1)
    mean_error_per_iter.to_csv("tmp/fm_vi_error.csv")

    # plot the error over iterations
    title = "FM VI: error vs. iter (mean over " + str(
        num_episodes) + " episodes)"
    path = "graphs/fm_vi_error_iter.png"
    plotting.plot_error_over_iters(mean_error_per_iter, title, path, xlim=200)

    # show avg time per run
    avg_time_per_run = np.mean(np.array(time_per_run))
    print("FM VI - avg seconds per run:", avg_time_per_run, '\n')
Example #13
import hiive.mdptoolbox.example as example
import hiive.mdptoolbox.mdp as mdp
states = 50

P, R = example.forest(S=states)
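# With only S given, example.forest() keeps its defaults: r1=4 (wait reward in the oldest
# state), r2=2 (cut reward in the oldest state) and p=0.1 (fire probability).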

#pi = mdp.QLearning(P, R, 0.99, n_iter=500000, alpha=0.3, alpha_min=0.1, epsilon_min=0.1, epsilon_decay=0.9999)
pi = mdp.ValueIteration(P, R, 0.99)

pi.run()
#print("deltas_" + str(gamma)[2:] + " = " + str(pi.deltas))

for x in pi.run_stats:
    print(x)

print(pi.policy)

last = pi.run_stats[-1]
print('Time: ', last['Time'], "Reward: ", last['Reward'])
Example #14
# region VI

# avg V, n_iter, time
ep_vals = [.1, .0001]
gamma_vals = [.2, .5, .8, .95, .999]

big_vs = []
big_n = []
big_t = []
for epsilon in ep_vals:
    avg_vs = []
    n_iters = []
    times = []
    for gamma in gamma_vals:
        vi = mdp.ValueIteration(P_small, R_small, gamma=gamma, epsilon=epsilon)
        stats = vi.run()

        avg_v = stats[-1]['Mean V']
        n_iter = len(stats)
        time = stats[-1]['Time']

        avg_vs.append(avg_v)
        n_iters.append(n_iter)
        times.append(time)

    big_vs.append(avg_vs)
    big_n.append(n_iters)
    big_t.append(times)

for i in range(len(ep_vals)):
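    # The original snippet is truncated inside this loop. A plausible completion (an
    # assumption, not the author's code) that reports the collected statistics:
    print('epsilon = %g' % ep_vals[i])
    for gamma, avg_v, n_iter, t in zip(gamma_vals, big_vs[i], big_n[i], big_t[i]):
        print('  gamma=%.3f  mean V=%.4f  iterations=%d  time=%.4fs' % (gamma, avg_v, n_iter, t))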
Example #15
def comparing_mdps(P, R, mapping, shape):
    print("Comparing the Two Policies")
    vi = mdp.ValueIteration(P, R, 0.9, max_iter=10000)
    vi.run()
    print("Value Function: ")
    print(vi.V)
    print("Policy: ")
    print(vi.policy)
    print_policy(vi.policy, mapping, shape)
    print("Iter: ")
    print(vi.iter)
    print("Time: ")
    print(vi.time)
    # print(vi.run_stats)
    print()
    pi = mdp.PolicyIteration(P, R, 0.9, max_iter=100000)
    pi.run()
    print("Policy Function: ")
    print(pi.V)
    print("Policy: ")
    print(pi.policy)
    print_policy(pi.policy, mapping, shape)
    print("Iter: ")
    print(pi.iter)
    print("Time: ")
    print(pi.time)
    # print(pi.run_stats)
    print()
    pim = mdp.PolicyIterationModified(P, R, 0.9, max_iter=100000, epsilon=0.05)
    pim.run()
    print("Policy Modified Function: ")
    print(pim.V)
    print("Policy: ")
    print(pim.policy)
    print_policy(pim.policy, mapping, shape)
    print("Iter: ")
    print(pim.iter)
    print("Time: ")
    print(pim.time)
    # print(pi.run_stats)
    print()
    ql = mdp.QLearning(
        P,
        R,
        0.9,
        n_iter=10e4,
        epsilon=0.1,
        epsilon_decay=0.1,
        epsilon_min=0.1,
    )
    ql.run()
    print("Q Learning Function: ")
    print(ql.V)
    print("Policy: ")
    print(ql.policy)
    print_policy(ql.policy, mapping, shape)
    print("Mean Discrepancy: ")
    print(ql.error_mean)
    # print(ql.v_mean)
    print("Epsilon: ")
    print(ql.epsilon)
    difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
    if difference1 > 0:
        print("Discrepancy in Policy and Value Iteration: ", difference1)
        print()
    difference2 = sum([abs(x - y) for x, y in zip(pim.policy, vi.policy)])
    if difference2 > 0:
        print("Discrepancy in Policy Modified and Value Iteration: ",
              difference2)
        print()
    difference3 = sum([abs(x - y) for x, y in zip(pim.policy, pi.policy)])
    if difference3 > 0:
        print("Discrepancy in Policy Modified and Policy Iteration: ",
              difference3)
        print()
    difference4 = sum([abs(x - y) for x, y in zip(vi.policy, ql.policy)])
    if difference4 > 0:
        print("Discrepancy in Q Learning and Value Iteration: ", difference4)
        print()
    difference5 = sum([abs(x - y) for x, y in zip(pi.policy, ql.policy)])
    if difference5 > 0:
        print("Discrepancy in Q Learning and Policy Iteration: ", difference5)
        print()
    difference6 = sum([abs(x - y) for x, y in zip(pim.policy, ql.policy)])
    if difference6 > 0:
        print("Discrepancy in Q Learning and Policy Iteration Modified: ",
              difference6)
        print()
Example #16
                rewards[state][action] += reward

    # tune PI/VI gamma values
    tune_gamma = False
    if tune_gamma:
        gamma_range = np.linspace(0.01, 0.99, 99)
        vi_iter = []
        pi_iter = []
        vi_time = []
        pi_time = []
        vi_max_v = []
        pi_max_v = []

        for gamma in gamma_range:
            vi = mdp.ValueIteration(transitions,
                                    rewards,
                                    gamma,
                                    max_iter=10000)
            vi.run()
            vi_time.append(vi.time)
            vi_max_v.append(np.max(vi.V))
            vi_iter.append(vi.iter)

            pi = mdp.PolicyIterationModified(transitions,
                                             rewards,
                                             gamma,
                                             max_iter=1000)
            pi.run()
            pi_time.append(pi.time)
            pi_max_v.append(np.max(pi.V))
            pi_iter.append(pi.iter)
    for w in WINS:
        S = sum(1 if (w[k] == 1 and state[k] == who) else 0
                for k in range(ACTIONS))
        if S == 3:
            # We have a win
            return True
    # There were no wins so return False
    return False


def isValid(state):
    """Return True if the counts of player and opponent marks form a valid board state."""
    # S1 is the sum of the player's cells
    S1 = sum(1 if x == PLAYER else 0 for x in state)
    # S2 is the sum of the opponent's cells
    S2 = sum(1 if x == OPPONENT else 0 for x in state)
    return (S1, S2) in OWNED_CELLS


if __name__ == "__main__":
    P, R = getTransitionAndRewardArrays()
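    # gamma=1 means no discounting; the toolbox accepts it (typically with a convergence
    # warning) and it is workable here only because tic-tac-toe games always terminate.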
    ttt = mdp.ValueIteration(P, R, 1)
    ttt.setVerbose()
    ttt.run()
    f = "tictactoe.pkl"
    pickle.dump(ttt.policy, open(f, "wb"))
    print("Optimal policy pickled as '%s' in current directory." % f)
Example #18
def main():

    print("Create a frozen lake of Size 10x10")
    p = generate_FrozenLake(size=10)
    num_states = len(p)
    num_actions = len(p[0])
    print("Num of States:", num_states)
    print("Num of Actions:", num_actions)
    P = np.zeros((num_actions, num_states, num_states))
    R = np.zeros((num_actions, num_states, num_states))

    for i in range(num_states):
        for j in range(num_actions):
            prob_sum = 0  # sanity accumulator: transition probabilities for this (state, action) should sum to 1
            for prob, next_state, rewards, done in p[i][j]:
                P[j][i][next_state] += prob
                R[j][i][next_state] = rewards
                prob_sum += prob

    # VI
    for gamma in [.9, 0.6]:
        vi = mdp.ValueIteration(transitions=P,
                                reward=R,
                                gamma=gamma,
                                epsilon=0.000001,
                                max_iter=5000)
        stats_data = vi.run()
        plot_mpd_graph(
            stats_data,
            'VI Frozen_Lake(10x10), Gamma={}, Reward plot'.format(gamma),
            'Reward', 'Reward')

        plot_mpd_graph(
            stats_data,
            'VI Frozen_Lake(10x10), Gamma={}, Time Plot'.format(gamma),
            'Time(seconds)', 'Time')

    # PI
    for gamma in [.9, 0.6]:
        print('PI {}'.format(gamma))
        pi = mdp.PolicyIteration(transitions=P,
                                 reward=R,
                                 gamma=gamma,
                                 max_iter=5000,
                                 eval_type=1)
        stats_data = pi.run()
        plot_mpd_graph(
            stats_data,
            'PI Frozen_Lake(10x10), Gamma={}, error plot'.format(gamma),
            'Error', 'Error')

        plot_mpd_graph(
            stats_data,
            'PI Frozen_Lake(10x10), Gamma={}, Time Plot'.format(gamma),
            'Time(seconds)', 'Time')

    # QLearning
    for alpha in [0.1, 0.4]:
        qlearn = mdp.QLearning(transitions=P,
                               reward=R,
                               gamma=0.6,
                               alpha=alpha,
                               alpha_decay=0.1,
                               alpha_min=0.0001,
                               epsilon=0.1,
                               epsilon_min=0.9,
                               epsilon_decay=0,
                               n_iter=10000)
        stats_data = qlearn.run()
        plot_mpd_graph(
            stats_data,
            'Qlearning Frozen_Lake(10x10),  alpha={}, Error plot'.format(
                alpha), 'Error', 'Error')

        plot_mpd_graph(
            stats_data,
            'Qlearning Frozen_Lake(10x10),  alpha={}, Reward plot'.format(
                alpha), 'Reward', 'Reward')

        plot_mpd_graph(
            stats_data,
            'Qlearning Frozen_Lake(10x10),  alpha={}, Time Plot'.format(alpha),
            'Time(seconds)', 'Time')
Example #19
def run_forest(size):
    seed_val = 42
    np.random.seed(seed_val)
    random.seed(seed_val)

    S = size
    r1 = 10  # The reward when the forest is in its oldest state and action ‘Wait’ is performed
    r2 = 50  # The reward when the forest is in its oldest state and action ‘Cut’ is performed
    p = 0.1

    P, R = mdptoolbox.example.forest(S=S, r1=r1, r2=r2,
                                     p=p)  # Defaults left the same

    epsilons = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
    gammas = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999]
    # Narrow the sweep to a single setting; restore the full lists above to regenerate the heatmaps
    gammas = [0.99]
    epsilons = [0.01]

    per_won_hm = np.zeros((len(gammas), len(epsilons)))
    iters_hm = np.zeros((len(gammas), len(epsilons)))
    time_hm = np.zeros((len(gammas), len(epsilons)))

    best_rew = -1
    best_pol_arr = []
    g_cnt = 0
    e_cnt = 0
    for g in gammas:
        e_cnt = 0
        print(g)
        best_pol = []
        best_rew = -1

        for e in epsilons:
            pi = mdp.ValueIteration(P, R, gamma=g, epsilon=e)
            pi.run()
            rew = run_episodes(pi.policy, S, R, p, 1000, 20)
            if rew > best_rew:
                best_rew = rew
                best_pol = pi.policy
            per_won_hm[g_cnt][e_cnt] = rew
            iters_hm[g_cnt][e_cnt] = pi.iter
            time_hm[g_cnt][e_cnt] = pi.time * 1000
            e_cnt += 1
        best_pol_arr.append(list(best_pol))
        g_cnt += 1

    iterations = [i["Iteration"] for i in pi.run_stats]
    mean_val = [i["Mean V"] for i in pi.run_stats]
    error = [i["Error"] for i in pi.run_stats]
    reward = [i["Reward"] for i in pi.run_stats]

    fig, ax = plt.subplots()
    ax.plot(mean_val, label='Mean V')
    ax.plot(error, label='Error')
    ax.plot(reward, label='Reward')
    ax.legend()
    plt.xlabel('Iterations', fontsize=15)
    plt.ylabel('V/Error/Reward', fontsize=15)
    plt.title("Mean V/Error/ Reward vs iterations")
    plt.show()

    op_list = [list(best_pol)]
    print(best_pol_arr)

    # Plot Percent Games Won Heatmap
    fig, ax = plt.subplots()

    im, cbar = heatmap(per_won_hm,
                       gammas,
                       epsilons,
                       ax=ax,
                       cmap="YlGn",
                       cbarlabel="Average Reward")
    texts = annotate_heatmap(im, valfmt="{x:.0f}")

    fig.tight_layout()
    plt.savefig('Images\\VI-Forest-Per_Heatmap-' + str(size) + '.png')
    plt.show()

    # Plot Iterations Heatmap
    fig, ax = plt.subplots()

    im, cbar = heatmap(iters_hm,
                       gammas,
                       epsilons,
                       ax=ax,
                       cmap="YlGn",
                       cbarlabel="# of Iterations to Convergence")
    texts = annotate_heatmap(im, valfmt="{x:.0f}")

    fig.tight_layout()
    plt.savefig('Images\\VI-Forest-Iter_Heatmap-' + str(size) + '.png')
    plt.show()

    # Plot Run time Heatmap
    fig, ax = plt.subplots()

    im, cbar = heatmap(time_hm,
                       gammas,
                       epsilons,
                       ax=ax,
                       cmap="YlGn",
                       cbarlabel="Runtime (ms)")
    texts = annotate_heatmap(im, valfmt="{x:.1}")

    fig.tight_layout()
    plt.savefig('Images\\VI-Forest-Time_Heatmap-' + str(size) + '.png')
    plt.show()

    # Plot out optimal policy
    # Citation: https://stackoverflow.com/questions/52566969/python-mapping-a-2d-array-to-a-grid-with-pyplot
    cmap = colors.ListedColormap(['blue', 'red'])
    fig, ax = plt.subplots(figsize=(12, 3.75))
    plt.title("Forest VI Policy - Red = Cut, Blue = Wait")
    gammas.reverse()
    ax.set_yticklabels(gammas, fontsize=15)
    plt.xticks(fontsize=15)
    ax.tick_params(left=False)  # remove the ticks
    plt.xlabel('State', fontsize=15)
    plt.ylabel('Gamma', fontsize=15)
    plt.pcolor(best_pol_arr[::-1], cmap=cmap, edgecolors='k', linewidths=0)
    plt.savefig('Images\\VI-Forest-Heatmap-' + str(size) + '.png')

    plt.show()
Example #20
    map = generate_random_map(size=mapsize, p=0.96)
    env = gym.make('FrozenLake-v0', desc=map)
    env._max_episode_steps = 1e6
elif problem == 'forest':
    env = gym.make('Forest-v0')
    env._max_episode_steps = 1e3

state = env.reset()
R, T = evaluateRT(env)

if algo == 'pi':
    solver = mdp.PolicyIteration(T, R, 0.9, max_iter=5000)
elif algo == 'vi':
    solver = mdp.ValueIteration(T,
                                R,
                                0.9,
                                epsilon=1e-6,
                                max_iter=5000,
                                initial_value=0)
elif algo == 'q':
    solver = mdp.QLearning(T,
                           R,
                           0.99,
                           alpha=1.0,
                           alpha_decay=0.9999993,
                           alpha_min=0.1,
                           epsilon=1.0,
                           epsilon_min=0.2,
                           epsilon_decay=0.999999,
                           n_iter=6e6,
                           run_stat_frequency=1e4)
Example #21
def getVIFrames(env, P, R):
    vi = mdp.ValueIteration(P, R, 0.9, epsilon=0.01)
    vi.run()
    run_stats = vi.run_stats
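    # Assumes a toolbox variant whose run_stats entries carry a per-iteration 'Value' array;
    # stock hiive builds may only record scalar summaries such as 'Mean V' and 'Max V'.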
    return [step['Value'].reshape(env.nrow, env.ncol) for step in run_stats]
Example #22
def getPlotsForGridWorldViPi(worlds, grid, starts, goals):
    iters = []
    iter = range(1, 21, 1)
    iters.append(iter)
    iter = range(1, 41, 1)
    iters.append(iter)
    qlearningIter = [100000, 100000000]
    worldCntr = 1

    for data in worlds:
        pi_rewards = []
        pi_error = []
        pi_time = []
        pi_iter = []
        vi_rewards = []
        vi_error = []
        vi_time = []
        vi_iter = []
        size = len(data)
        holesCoords = []
        for row in range(0, data.shape[0]):
            for col in range(0, data.shape[1]):
                if data[row, col] == 1:  # Obstacle
                    holesCoords.append((row, col))
                if data[row, col] == 2:  # El roboto
                    start = (row, col)
                if data[row, col] == 3:  # Goal
                    goal = (row, col)
        transitions, reward, discount, lake = get_environement(
            data, size, holesCoords, start, goal)

        for iter in iters[worldCntr - 1]:
            # Policy iteration
            policy_iteration = mdp.PolicyIteration(transitions,
                                                   reward,
                                                   discount,
                                                   policy0=None,
                                                   max_iter=iter,
                                                   eval_type=0)
            policy_iteration.run()
            print_as_grid(policy_iteration.policy, lake.lake, size)
            last_pi = policy_iteration.run_stats[-1]
            pi_rewards.append(last_pi['Reward'])
            pi_error.append(last_pi['Error'])
            pi_time.append(last_pi['Time'])
            pi_iter.append(last_pi['Iteration'])

            # Value iteration
            value_iteration = mdp.ValueIteration(transitions,
                                                 reward,
                                                 discount,
                                                 epsilon=0.001,
                                                 max_iter=iter,
                                                 initial_value=0)
            value_iteration.run()
            print_as_grid(value_iteration.policy, lake.lake, size)
            last_vi = value_iteration.run_stats[-1]
            vi_rewards.append(last_vi['Reward'])
            vi_error.append(last_vi['Error'])
            vi_time.append(last_vi['Time'])
            vi_iter.append(last_vi['Iteration'])

        plt.style.use('seaborn-whitegrid')
        plt.plot(iters[worldCntr - 1], pi_error, label='PI')
        plt.plot(iters[worldCntr - 1], vi_error, label='VI')
        plt.ylabel('Convergence', fontsize=12)
        plt.xlabel('Iter.', fontsize=12)
        plt.title('Convergence vs Iteration for Grid World no.' +
                  str(worldCntr),
                  fontsize=12,
                  y=1.03)
        plt.legend()
        plt.savefig(
            'Figures/Grid/Convergence vs Iteration for Grid World no.' +
            str(worldCntr) + '.png')
        plt.close()

        plt.style.use('seaborn-whitegrid')
        plt.plot(iters[worldCntr - 1], pi_rewards, label='PI')
        plt.plot(iters[worldCntr - 1], vi_rewards, label='VI')
        plt.ylabel('Reward', fontsize=12)
        plt.xlabel('Iter.', fontsize=12)
        plt.title('Reward vs Iteration for Grid World no.' + str(worldCntr),
                  fontsize=12,
                  y=1.03)
        plt.legend()
        plt.savefig('Figures/Grid/Reward vs Iteration for Grid World no.' +
                    str(worldCntr) + '.png')
        plt.close()

        plt.style.use('seaborn-whitegrid')
        plt.plot(iters[worldCntr - 1], pi_time, label='PI')
        plt.plot(iters[worldCntr - 1], vi_time, label='VI')
        plt.ylabel('Time', fontsize=12)
        plt.xlabel('Iter.', fontsize=12)
        plt.title('Time vs Iteration for Grid World no.' + str(worldCntr),
                  fontsize=12,
                  y=1.03)
        plt.legend()
        plt.savefig('Figures/Grid/Time vs Iteration for Grid World no.' +
                    str(worldCntr) + '.png')
        plt.close()

        worldCntr += 1
def run(verbose=False):
    # env = gym.make('FrozenLake-v0', is_slippery=True)
    env = gym.make('FrozenLake8x8-v0', is_slippery=True)
    # env = gym.make('FrozenLake-v0')

    # Debug
    # print('env.P')
    # pprint(env.P)
    # print('env.R')
    # print(env.R)

    

    P, R = transform_for_MDPToolbox(env)
    # print('Reward')
    # print(R)
    # return

    print('~~~~~~~~~~ FrozenLake8x8-v0 – Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(P, R, 0.6, max_iter=100000)

    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    total_r_pi = render_env_policy(env, pi.policy, display=verbose)


    print('~~~~~~~~~~ FrozenLake8x8-v0 – Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(P, R, 0.6, epsilon=0.005, max_iter=10000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)
    total_r_vi = render_env_policy(env, vi.policy, display=verbose)
    if(vi.policy == pi.policy):
        print('FrozenLake8x8-v0 - Value and Policy Iteration policies are the same! ')
    else:
        print('FrozenLake8x8-v0 - Value and Policy Iteration policies are NOT the same. ')


    print('~~~~~~~~~~ FrozenLake-v0 – Q-Learning ~~~~~~~~~~')
    ql = mdp.QLearning(P, R, 0.6, alpha=0.3, epsilon_min=0.005, n_iter=100000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    total_r_ql = render_env_policy(env, ql.policy, display=verbose)

# Output
    print('~~~~~~~~~~ FrozenLake-v0 - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('Total Reward: %f' %total_r_pi)
    print('~~~~~~~~~~ FrozenLake-v0 - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('Total Reward: %f' %total_r_vi)
    print('~~~~~~~~~~ FrozenLake-v0 - Q-Learning ~~~~~~~~~~')
    print('Clock time')
    print(end_t - start_t)
    print('Total Reward: %f' %total_r_ql)
    print(ql.policy)
    
    if(vi.policy == pi.policy):
        print('FrozenLake-v0 - Value and Policy Iteration policies are the same! ')
    else:
        print('FrozenLake-v0 - Value and Policy Iteration policies are NOT the same.')
    
    
    if(vi.policy == ql.policy):
        print('FrozenLake-v0 – QL and VI Policies are the same!')
    else:
        print('FrozenLake-v0 – QL and VI Policies are NOT the same.')
    if(pi.policy == ql.policy):
        print('FrozenLake-v0 – PI and QL Policies are the same!')
    else:
        print('FrozenLake-v0 – PI and QL Policies are NOT the same.')

    print('VI Policy')
    print_policy(vi.policy)
    # print('PI Policy')
    # print_policy(vi.policy)
    print('QL Policy')
    print_policy(ql.policy)


    # Source: 
    #   https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
    """
Example #24
def findBestPolicyForGridWorlds(worlds, grid, starts, goals):
    qlearningIter = [1000, 10000]
    worldCntr = 1

    for data in worlds:
        size = len(data)
        holesCoords = []
        for row in range(0, data.shape[0]):
            for col in range(0, data.shape[1]):
                if data[row, col] == 1:  # Obstacle
                    holesCoords.append((row, col))
                if data[row, col] == 2:  # El roboto
                    start = (row, col)
                if data[row, col] == 3:  # Goal
                    goal = (row, col)
        transitions, reward, discount, lake = get_environement(
            data, size, holesCoords, start, goal)

        #Policy iteration
        policy_iteration = mdp.PolicyIteration(transitions,
                                               reward,
                                               discount,
                                               policy0=None,
                                               max_iter=1000,
                                               eval_type=0)
        policy_iteration.run()
        print_as_grid(policy_iteration.policy, lake.lake, size)
        print(policy_iteration.time)
        print(policy_iteration.iter)

        actions = getActions(policy_iteration.policy, start, goal, size)
        svg = gv.gridworld(n=size,
                           tile2classes=lake.tile2classes,
                           actions=actions,
                           extra_css='goal',
                           start=start,
                           policyList=policy_iteration.policy)
        svg.saveas("Figures/Grid/PI-Final-Path for World " + str(worldCntr) +
                   ".svg",
                   pretty=True)

        #Value iteration
        value_iteration = mdp.ValueIteration(transitions,
                                             reward,
                                             discount,
                                             epsilon=0.001,
                                             max_iter=1000,
                                             initial_value=0)
        value_iteration.run()
        print_as_grid(value_iteration.policy, lake.lake, size)
        print(value_iteration.time)
        print(value_iteration.iter)

        actions = getActions(value_iteration.policy, start, goal, size)
        svg = gv.gridworld(n=size,
                           tile2classes=lake.tile2classes,
                           actions=actions,
                           extra_css='goal',
                           start=start,
                           policyList=value_iteration.policy)
        svg.saveas("Figures/Grid/VI-Final-Path for World " + str(worldCntr) +
                   ".svg",
                   pretty=True)

        #Q-Learning
        q_learning = QLearner.QLearningEx(transitions,
                                          reward,
                                          grid=grid[worldCntr - 1],
                                          start=starts[worldCntr - 1],
                                          goals=goals[worldCntr - 1],
                                          n_iter=qlearningIter[worldCntr - 1],
                                          n_restarts=1000,
                                          alpha=0.2,
                                          gamma=0.9,
                                          rar=0.1,
                                          radr=0.99)
        q_learning.run()
        print_as_grid(q_learning.policy, lake.lake, size)
        #print(q_learning.time)

        actions = getActions(q_learning.policy, start, goal, size)
        svg = gv.gridworld(n=size,
                           tile2classes=lake.tile2classes,
                           actions=actions,
                           extra_css='goal',
                           start=start,
                           policyList=q_learning.policy)
        svg.saveas("Figures/Grid/QL-Final-Path for World " + str(worldCntr) +
                   ".svg",
                   pretty=True)

        worldCntr += 1
def findBestPolicyForForest():
	cntr = 0
	pi_rewards = []
	pi_error = []
	pi_time = []
	pi_iter = []
	vi_rewards = []
	vi_error = []
	vi_time = []
	vi_iter = []
	for size in [1000]:
		forest = ForestMng(states=size, reward_wait=4, reward_cut=2, prob_fire=0.3)

		# Policy iteration
		policy_iteration = mdp.PolicyIteration(forest.P, forest.R, gamma=0.9, policy0=None, max_iter=1000, eval_type=0)
		policy_iteration.run()
		print(policy_iteration.time)
		print(policy_iteration.iter)
		print(policy_iteration.policy)
		pi_rewards.append([sub['Reward'] for sub in policy_iteration.run_stats])
		pi_error.append([ sub['Error'] for sub in policy_iteration.run_stats ])
		pi_time.append([ sub['Time'] for sub in policy_iteration.run_stats ])
		pi_iter.append([ sub['Iteration'] for sub in policy_iteration.run_stats ])

		# Value iteration
		value_iteration = mdp.ValueIteration(forest.P, forest.R, gamma=0.9, max_iter=1000)
		value_iteration.run()
		print(value_iteration.time)
		print(value_iteration.iter)
		print(value_iteration.policy)
		vi_rewards.append([sub['Reward'] for sub in value_iteration.run_stats])
		vi_error.append([sub['Error'] for sub in value_iteration.run_stats])
		vi_time.append([sub['Time'] for sub in value_iteration.run_stats])
		vi_iter.append([sub['Iteration'] for sub in value_iteration.run_stats])

		if max(pi_iter[cntr]) < max(vi_iter[cntr]):
			for i in range(max(vi_iter[cntr]) - max(pi_iter[cntr])):
				pi_error[cntr].append(pi_error[cntr][len(pi_error[cntr])-1])
				pi_rewards[cntr].append(pi_rewards[cntr][len(pi_rewards[cntr]) - 1])
				pi_time[cntr].append(pi_time[cntr][len(pi_time[cntr]) - 1])

		cntr += 1

	plt.style.use('seaborn-whitegrid')
	plt.plot(vi_iter[0], pi_error[0], label='PI')
	plt.plot(vi_iter[0], vi_error[0], label='VI')
	plt.ylabel('Convergence', fontsize=12)
	plt.xlabel('Iter.', fontsize=12)
	plt.title('Convergence of Error vs Iteration for Forest Mng State 1000 p03', fontsize=12, y=1.03)
	plt.legend()
	plt.savefig('Figures/Forest/Error Convergence vs Iteration for Forest Mng state 1000 p03.png')
	plt.close()

	plt.style.use('seaborn-whitegrid')
	plt.plot(vi_iter[0], pi_rewards[0], label='PI')
	plt.plot(vi_iter[0], vi_rewards[0], label='VI')
	plt.ylabel('Reward', fontsize=12)
	plt.xlabel('Iter.', fontsize=12)
	plt.title('Rewards vs Iteration for Forest Mng state 1000 p03', fontsize=12, y=1.03)
	plt.legend()
	plt.savefig('Figures/Forest/Rewards vs Iteration for Forest Mng state 1000 p03.png')
	plt.close()

	plt.style.use('seaborn-whitegrid')
	plt.plot(vi_iter[0], pi_time[0], label='PI')
	plt.plot(vi_iter[0], vi_time[0], label='VI')
	plt.ylabel('Time', fontsize=12)
	plt.xlabel('Iter.', fontsize=12)
	plt.title('Time vs Iteration for Forest Mng state 1000 p03', fontsize=12, y=1.03)
	plt.legend()
	plt.savefig('Figures/Forest/Time vs Iteration for Forest Mng  state 1000 p3.png')
	plt.close()
Example #26
def frozen_lake_all(P, R, gamma_range, mapping, shape):

    vi_iteration_list = np.zeros(gamma_range.shape)
    vi_time_list = np.zeros(gamma_range.shape)
    vi_reward_list = np.zeros(gamma_range.shape)
    vi_error_list = np.zeros(gamma_range.shape)

    pi_iteration_list = np.zeros(gamma_range.shape)
    pi_time_list = np.zeros(gamma_range.shape)
    pi_reward_list = np.zeros(gamma_range.shape)
    pi_error_list = np.zeros(gamma_range.shape)

    diff_list = np.zeros(gamma_range.shape)

    expected_policy = None

    for i, gamma in enumerate(gamma_range):
        print('Gamma %0.2f' % gamma)

        vi = mdp.ValueIteration(transitions=P,
                                reward=R,
                                gamma=gamma,
                                epsilon=0.0001,
                                max_iter=5000)
        # vi.setVerbose()
        vi.run()

        vi_iteration_list[i] = vi.run_stats[-1]['Iteration']
        vi_time_list[i] = vi.run_stats[-1]['Time']
        vi_reward_list[i] = vi.run_stats[-1]['Reward']
        vi_error_list[i] = vi.run_stats[-1]['Error']

        pi = mdp.PolicyIteration(transitions=P,
                                 reward=R,
                                 gamma=gamma,
                                 max_iter=5000,
                                 eval_type=1)
        # pi.setVerbose()
        pi.run()

        pi_iteration_list[i] = pi.run_stats[-1]['Iteration']
        pi_time_list[i] = pi.run_stats[-1]['Time']
        pi_reward_list[i] = pi.run_stats[-1]['Reward']
        pi_error_list[i] = pi.run_stats[-1]['Error']

        print('Value Iteration Policy Found: ' + str(vi.policy))
        print_policy(vi.policy, mapping, shape)
        print('Policy Iteration Policy Found: ' + str(pi.policy))
        print_policy(pi.policy, mapping, shape)

        difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
        diff_list[i] = difference1
        print('Discrepancy in Policy and Value Iteration: ', difference1)

        if difference1 == 0:
            expected_policy = vi.policy

        print()

        # Plotting
        # Error v Iteration
        plt.clf()
        plt.title('Value Iteration: Error v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Error')
        plt.plot(list(vi_iteration_list), list(vi_error_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_error_v_iteration.png')

        # Reward v Gamma
        plt.clf()
        plt.title('Value Iteration: Reward v Gamma')
        plt.xlabel('Gamma')
        plt.ylabel('Reward')
        plt.plot(list(gamma_range), list(vi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_reward_v_gamma.png')

        # Gamma v Iterations
        plt.clf()
        plt.title('Value Iteration: Gamma v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Gamma')
        plt.plot(list(vi_iteration_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_gamma_v_iterations.png')

        # Gamma v Time
        plt.clf()
        plt.title('Value Iteration: Gamma v Time')
        plt.xlabel('Time')
        plt.ylabel('Gamma')
        plt.plot(list(vi_time_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_gamma_v_time.png')

        # Reward vs Iterations
        plt.clf()
        plt.title('Value Iteration: Reward v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Reward')
        plt.plot(list(vi_iteration_list), list(vi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_reward_v_iterations.png')

        # Policy
        # Error v Iteration
        plt.clf()
        plt.title('Policy Iteration: Error v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Error')
        plt.scatter(list(pi_iteration_list), list(pi_error_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_error_v_iteration.png')

        # Gamma v Reward
        plt.clf()
        plt.title('Policy Iteration: Reward v Gamma')
        plt.xlabel('Gamma')
        plt.ylabel('Reward')
        plt.scatter(list(gamma_range), list(pi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_reward_v_gamma.png')

        # Gamma v Iterations
        plt.clf()
        plt.title('Policy Iteration: Gamma v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Gamma')
        plt.scatter(list(pi_iteration_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_gamma_v_iterations.png')

        # Gamma v Time
        plt.clf()
        plt.title('Policy Iteration: Gamma v Time')
        plt.xlabel('Time')
        plt.ylabel('Gamma')
        plt.scatter(list(pi_time_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_gamma_v_time.png')

        # Reward vs Iterations
        plt.clf()
        plt.title('Policy Iteration: Reward v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Reward')
        plt.scatter(list(pi_iteration_list), list(pi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_reward_v_iterations.png')

        # Gamma vs Policy Differences
        plt.clf()
        plt.title('Gamma v Policy Differences')
        plt.xlabel('Gamma')
        plt.ylabel('Policy Differences')
        plt.scatter(list(gamma_range), list(diff_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/gamma_v_differences.png')

    # TODO
    gamma_range = np.array([0.8, 0.9, 0.99])
    alpha_range = np.array([0.1, 0.9, 0.99])
    epsilon_range = np.array([0.1, 0.5, 0.9, 0.999])
    e_decay_range = np.array([0.1, 0.5, 0.9, 0.999])

    # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4))
    # epsilon_range = np.linspace(0.1, 1.0, 10)
    # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9))

    prev_Q = None
    thresh = 1e-4
    print('== Q Learning ==')
    for i, gamma in enumerate(gamma_range):
        for j, alpha in enumerate(alpha_range):
            for k, ep in enumerate(epsilon_range):
                for l, ed in enumerate(e_decay_range):
                    # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed))
                    ql = mdp.QLearning(transitions=P,
                                       reward=R,
                                       gamma=gamma,
                                       alpha=alpha,
                                       alpha_decay=1.0,
                                       alpha_min=0.001,
                                       epsilon=ep,
                                       epsilon_min=0.1,
                                       epsilon_decay=ed,
                                       n_iter=10e4)
                    stats = ql.run()
                    plot_stats(stats,
                               ('ql_frozen_lake_%0.2f_%0.2f_%0.2f_%0.2f' %
                                (gamma, alpha, ep, ed)))

                    # print('Policy: ')
                    # print(ql.policy)
                    # print(ql.run_stats)
                    df = pd.DataFrame.from_records(ql.run_stats)
                    iteration_list = df['Iteration'][-100:]
                    windowed_reward = df['Reward'][-100:].mean()
                    error_list = df['Error'][-100:].mean()

                    if prev_Q is None:
                        prev_Q = ql.Q
                    else:
                        variation = np.absolute(
                            np.subtract(np.asarray(ql.Q),
                                        np.asarray(prev_Q))).max()
                        res = np.abs(
                            np.subtract(np.asarray(prev_Q), np.asarray(ql.Q)))
                        print('Result: ')
                        print(res)
                        print('Variation: ')
                        print(variation)
                        print('Mean Reward for Last 100 Iterations:')
                        print(windowed_reward)
                        if np.all(
                                res < thresh
                        ) or variation < thresh or windowed_reward > 45.0:
                            print('Breaking! Below Thresh')
                            print(
                                'Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'
                                .format(gamma, alpha, ep, ed))
                            print('Optimal Policy: ')
                            print(ql.policy)
                            break
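                        # As in the forest experiment above, this break only exits the innermost
                        # e_decay loop, and prev_Q is never updated after its first assignment.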