Example #1
def __init__(self,
             transition,
             reward,
             gamma,
             epsilon,
             policy0=None,
             max_iter=1000,
             eval_type=0,
             skip_check=False,
             run_stat_frequency=None):
    """Init the original PI, adding the modified PI's epsilon handling."""
    # Call the PolicyIteration constructor
    PolicyIteration.__init__(
        self,
        transition,
        reward,
        gamma,
        policy0=policy0,
        max_iter=max_iter,
        eval_type=eval_type,
        skip_check=skip_check,
        run_stat_frequency=run_stat_frequency,
    )
    # Epsilon handling copied from mdp.py:
    # set the convergence threshold based on epsilon
    self.epsilon = epsilon
    self.gamma = gamma
    if self.gamma != 1:
        self.thresh = self.epsilon * (1 - self.gamma) / self.gamma
    else:
        self.thresh = self.epsilon
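This fragment overrides only `__init__`; the enclosing subclass is not shown. A minimal sketch of how such a wrapper might be defined and exercised on a forest MDP, where the class name `EpsilonPolicyIteration` and the problem parameters are illustrative assumptions rather than part of the original:

from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.mdp import PolicyIteration


class EpsilonPolicyIteration(PolicyIteration):
    """Hypothetical wrapper: PolicyIteration with an epsilon-based threshold."""

    def __init__(self, transition, reward, gamma, epsilon, **kwargs):
        PolicyIteration.__init__(self, transition, reward, gamma, **kwargs)
        self.epsilon = epsilon
        # Same threshold rule as in the __init__ above
        self.thresh = epsilon * (1 - gamma) / gamma if gamma != 1 else epsilon


P, R = forest(S=10, r1=4, r2=2, p=0.1)  # assumed problem setup
pi = EpsilonPolicyIteration(P, R, gamma=0.9, epsilon=1e-3, max_iter=1000)
pi.run()
print(pi.policy, pi.iter, pi.time)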
Example #2
def policy_iteration(P, R, discount=(0.9,)):
    df_pi = pd.DataFrame(columns=["Discount", "Iteration", "Time", "Reward", "Policy"])
    for disc in discount:
        pi = PolicyIteration(P, R, gamma=disc, max_iter=1e6)
        pi.run()
        reward = test_policy(P, R, pi.policy)
        info = [disc, pi.iter, pi.time, reward, pi.policy]
        df_length = len(df_pi)
        df_pi.loc[df_length] = info
    return df_pi
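The `test_policy` helper called above is defined elsewhere in the original script. A minimal sketch of one plausible implementation, assuming `P` has shape (A, S, S) and `R` has shape (S, A) as produced by `hiive.mdptoolbox.example.forest`, and that the score is an average simulated return; the rollout length, episode count, and start state are all assumptions:

import numpy as np


def test_policy(P, R, policy, n_episodes=100, horizon=100, seed=0):
    # Hypothetical stand-in for the helper used above: estimate the average
    # reward collected over fixed-length rollouts that follow `policy`.
    rng = np.random.default_rng(seed)
    n_states = P.shape[-1]
    total = 0.0
    for _ in range(n_episodes):
        s = 0  # start every rollout in state 0 (youngest forest state)
        for _ in range(horizon):
            a = policy[s]
            total += R[s, a]
            s = rng.choice(n_states, p=P[a, s])
    return total / n_episodes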
Example #3
def policy_iteration():
    deltas = {}
    rewards = {}
    for size in PROBLEM_SIZES:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        pi = PolicyIteration(P, R, 0.9, max_iter=10)
        pi.run()
        delta = [stat['Error'] for stat in pi.run_stats]
        reward = [stat['Reward'] for stat in pi.run_stats]
        deltas[size] = delta
        rewards[size] = reward
        print(pi.policy)
        print(pi.S)

    # forest_plot.plot_pi_forest_convergence_size(rewards)

    deltas = {}
    rewards = {}
    for p in [.2, .1, .05, .01]:
        P, R = forest(S=10, r1=1, r2=5, p=p)
        pi = PolicyIteration(P, R, 0.9, max_iter=10)
        pi.run()
        delta = [stat['Error'] for stat in pi.run_stats]
        reward = [stat['Reward'] for stat in pi.run_stats]
        deltas[p] = delta
        rewards[p] = reward
        print(pi.policy)
        print(pi.S)

    forest_plot.plot_pi_forest_convergence_p(rewards)
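The fragment above depends on module-level context that is not shown: `PROBLEM_SIZES`, the `forest` generator, and a local `forest_plot` plotting module. A plausible setup for the first two is sketched below; the size values are illustrative guesses, and `forest_plot` is left out because its contents are unknown:

from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.mdp import PolicyIteration

# Illustrative sweep of forest sizes; the original values are not shown.
PROBLEM_SIZES = [10, 100, 500, 1000]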
Example #4
def policy_iteration(P, R, id=None):
    gammas = [0.99, 0.9, 0.8, 0.7, 0.5]
    runtimes = []
    Rs = []
    iters = []
    tracker = ''
    policies = []
    for g in gammas:
        pi = PolicyIteration(P, R, g, max_iter=100000)
        pi.run()
        r = test_policy(P, R, pi.policy)
        Rs.append(r)
        policies.append(pi.policy)
        runtimes.append(pi.time)
        iters.append(pi.iter)
        tracker += 'gamma={}: reward was {}, iters was {}, time was {}\n'.format(
            g, r, pi.iter, pi.time)

    # write
    with open('figures/PI_variables_forest_{}.txt'.format(id), 'w') as f:
        f.write(tracker)

    with open('figures/PI_policies_forest_{}.txt'.format(id), 'w') as f:
        for i, g in enumerate(gammas):
            f.write('gamma={}: policy={}\n'.format(g, policies[i]))

    # plot
    plt.plot(gammas, Rs)
    plt.title('PI Avg Rewards')
    plt.xlabel('Gammas')
    plt.ylabel('Avg rewards')
    plt.savefig('figures/PI_rewards_forest_{}.png'.format(id))
    plt.clf()

    plt.plot(gammas, iters)
    plt.title('PI iterations')
    plt.xlabel('Gammas')
    plt.ylabel('Iterations')
    plt.savefig('figures/PI_iters_forest_{}.png'.format(id))
    plt.clf()
    print('done')
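The function writes its text files and plots under a `figures/` directory and will raise `FileNotFoundError` if that directory does not exist. One way to guard against that before calling it (an addition, not part of the original):

import os

os.makedirs('figures', exist_ok=True)  # create the output directory if needed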
Example #5
def gamma_iter_value_p():
    gamma = np.arange(0.1, 1.0, 0.1)
    v1_iter = []
    v2_iter = []
    v1_v_mean = []
    v2_v_mean = []
    for g in gamma:
        P, R = forest(num_states, r1, r2, p_fire)
        P2, R2 = forest(num_states, r1, r2, 0.9)
        pi = PolicyIteration(P, R, g)
        pi.run()

        pi2 = PolicyIteration(P2, R2, g)
        pi2.run()
        v1_iter.append(len(pi.run_stats))
        v2_iter.append(len(pi2.run_stats))
        v1_v_mean.append(pi.run_stats[-1]["Mean V"])
        v2_v_mean.append(pi2.run_stats[-1]["Mean V"])

    # plt.plot(gamma, v1_iter, linestyle='--', marker='o', color='b', label="fire probability = 0.1")
    # plt.plot(gamma, v2_iter, linestyle='--', marker='o', color='r', label="fire probability = 0.9")
    # plt.xlabel("Gamma")
    # plt.ylabel("Convergence iteration #")
    # plt.title("Convergence iteration vs gamma")
    # plt.legend(('fire probability = 0.1', 'fire probability = 0.9'), loc="upper left")
    # plt.show()

    plt.plot(gamma,
             v1_v_mean,
             linestyle='--',
             marker='o',
             color='b',
             label="fire probability = 0.1")
    plt.plot(gamma,
             v2_v_mean,
             linestyle='--',
             marker='o',
             color='r',
             label="fire probability = 0.9")
    plt.xlabel("Gamma")
    plt.ylabel("Converged Mean Value")
    plt.title("Converged Mean Value vs Gamma")
    plt.legend(('fire probability = 0.1', 'fire probability = 0.9'),
               loc="upper left")
    plt.show()
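`num_states`, `r1`, `r2`, and `p_fire` are constants defined elsewhere in the script. The values below are illustrative only, chosen to mirror the defaults of `hiive.mdptoolbox.example.forest`:

import matplotlib.pyplot as plt
import numpy as np
from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.mdp import PolicyIteration

num_states = 10  # S: number of forest states (assumed)
r1 = 4           # reward for waiting in the oldest state (forest default)
r2 = 2           # reward for cutting in the oldest state (forest default)
p_fire = 0.1     # probability of a fire each year (assumed)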
Example #6
        eps_schedule = make_schedules(n_epi)["exp_decay"]
        alpha_schedule = make_schedules(n_epi)["constant_0.01"]
    elif prob_key == "forest":
        n_epi = 100000
        eps_schedule = make_schedules(n_epi)["constant_0.5"]
        alpha_schedule = make_schedules(n_epi)["constant_0.5"]

    print("..Running value iteration...")
    vi = ValueIteration(P, R, gamma=0.99, epsilon=0.001, max_iter=1000)
    vi.run()
    vi_df = pd.DataFrame(vi.run_stats).set_index("Iteration")
    vi_df.columns = pd.MultiIndex.from_product([vi_df.columns, ["value_iter"]])
    print(f"Runtime per value iter: {vi.time/vi.iter} sec")

    print("..Running policy iteration...")
    pi = PolicyIteration(P, R, gamma=0.99, eval_type=1, max_iter=1000)
    pi.run()
    pi_df = pd.DataFrame(pi.run_stats).set_index("Iteration")
    pi_df.columns = pd.MultiIndex.from_product(
        [pi_df.columns, ["policy_iter"]])
    print(f"Runtime per policy iter: {pi.time/pi.iter} sec")

    print("..Running q-learning...")
    ql = QLearning(
        P,
        R,
        gamma=0.99,
        alpha_schedule=alpha_schedule,
        epsilon_schedule=eps_schedule,
        n_episode=n_epi,
    )
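`make_schedules` and the `QLearning` variant that accepts `alpha_schedule`, `epsilon_schedule`, and `n_episode` are project-specific; the stock `hiive.mdptoolbox.mdp.QLearning` instead exposes `alpha_decay`, `epsilon_decay`, and `n_iter`. A rough sketch of what `make_schedules` might return, under the assumption that a schedule is just a per-episode numpy array:

import numpy as np


def make_schedules(n_episodes):
    # Hypothetical reconstruction: map schedule names to per-episode arrays.
    episodes = np.arange(n_episodes)
    return {
        "constant_0.01": np.full(n_episodes, 0.01),
        "constant_0.5": np.full(n_episodes, 0.5),
        "exp_decay": np.maximum(0.01, np.exp(-10 * episodes / n_episodes)),
    }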
Example #7
    reward = np.zeros(len(vi.run_stats))
    i = 0
    for stat in vi.run_stats:
        iterations[i] = stat["Iteration"]
        reward[i] = stat["Reward"]
        i += 1

    fig, ax = plt.subplots()
    ax.plot(iterations, reward)

    ax.set(xlabel="Iterations", ylabel="Reward", title="Frozen Lake Value Iteration")
    ax.grid()

    fig.savefig("frozen-lake.vi.png")

    pi = PolicyIteration(P, R, Gamma, None, max_iter=20000)

    # run pi
    pi.setVerbose()
    pi.run()
    print("== Policy Iteration ==")
    print("Policy: ")
    print_policy(pi.policy, mapping, shape)
    print("Iterations: ")
    print(pi.iter)
    print("Time: ")
    print(pi.time)
    print(pi.run_stats[-1:])

    iterations = np.zeros(len(pi.run_stats))
    reward = np.zeros(len(pi.run_stats))
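`print_policy(pi.policy, mapping, shape)` is a helper from elsewhere in the script. A sketch of what it could look like for a FrozenLake grid, assuming `mapping` maps action indices to printable symbols and `shape` is the grid's (rows, cols); both names and the behaviour are assumptions:

import numpy as np


def print_policy(policy, mapping, shape):
    # Hypothetical helper: render the flat policy as a grid of action symbols,
    # e.g. mapping = {0: '<', 1: 'v', 2: '>', 3: '^'} for FrozenLake.
    grid = np.asarray([mapping[a] for a in policy]).reshape(shape)
    for row in grid:
        print(' '.join(row))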
Example #8
    P, R = forest(num_states, r1, r2, p_fire)
    vi = ValueIteration(P, R, 0.96, 1e-20)
    vi.run()

    P2, R2 = forest(num_states, r1, r2, 0.8)
    vi2 = ValueIteration(P2, R2, 0.96, 1e-20)
    vi2.run()

    # # calculate and plot the v_mean
    # iter_score(vi, vi2)

    # gamma_iter_value()
    # #
    #

    pi = PolicyIteration(P, R, 0.96)
    pi.run()

    pi2 = PolicyIteration(P2, R2, 0.96)
    pi2.run()
    # iter_score(pi, pi2)
    # #iter_policy(pi, pi2)
    # gamma_iter_value_p()

    q = QLearning(P, R, 0.4, alpha=0.9, n_iter=100000)
    q.run()

    q2 = QLearning(P2, R2, 0.4, alpha=0.9, n_iter=100000)
    q2.run()
    iter_score(q, q2)
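`iter_score` compares the run statistics of the two solvers but is not included in the fragment. One plausible version, assuming it plots the recorded 'Mean V' from each solver's `run_stats`; the labels and plot layout are guesses:

import matplotlib.pyplot as plt


def iter_score(solver_a, solver_b, labels=('p = p_fire', 'p = 0.8')):
    # Hypothetical comparison: mean value estimate per recorded iteration.
    plt.plot([stat['Mean V'] for stat in solver_a.run_stats], label=labels[0])
    plt.plot([stat['Mean V'] for stat in solver_b.run_stats], label=labels[1])
    plt.xlabel('Recorded iteration')
    plt.ylabel('Mean V')
    plt.legend()
    plt.show()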
Example #9
for stat in VI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
    i += 1

fig, ax = plt.subplots()
ax.plot(iterations, reward)

ax.set(xlabel='Iterations',
       ylabel='Reward',
       title='Frozen Lake Value Iteration')
ax.grid()

fig.savefig("frozen-lake.vi.png")

PI = PolicyIteration(P, R, Gamma, None, 20000)

# run PI
PI.setVerbose()
PI.run()
print('PI')
print(PI.iter)
print(PI.time)
print(PI.run_stats[-1:])

iterations = np.zeros(len(PI.run_stats))
reward = np.zeros(len(PI.run_stats))
i = 0
for stat in PI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
Example #10
from hiive.mdptoolbox.mdp import PolicyIterationModified, PolicyIteration
from helpers.open_ai_convert import OpenAI_MDPToolbox
from helpers.plot_graphs import plot_discount, plot_rewards

disc = [0.1, 0.3, 0.5, 0.7, 0.9]
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]

ex = OpenAI_MDPToolbox('FrozenLake-v0', False)
P = ex.P
R = ex.R
results = []
for d in disc:
    pi = PolicyIteration(
        # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        d,  # discount
        # epsilon=0.01,
        max_iter=1000,
    )
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(disc, results, 'Policy Iteration Discount/Rewards FrozenLake',
             'policy_iteration_discount_rewards_frozenlake', 'Discount')
results = []
for e in ep:
    pi = PolicyIteration(
Example #11
i = 0
for stat in VI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
    i += 1

fig, ax = plt.subplots()
ax.plot(iterations, reward)

ax.set(xlabel='Iterations', ylabel='Reward', title='Forest Value Iteration')
ax.grid()

fig.savefig("forest.vi.png")

Gamma = 0.99
PI = PolicyIteration(P, R, Gamma)

# run PI
PI.setVerbose()
PI.run()
print('PI')
print(PI.iter)
print(PI.time)
print(PI.run_stats[-1:])

iterations = np.zeros(len(PI.run_stats))
reward = np.zeros(len(PI.run_stats))
i = 0
for stat in PI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']