Example #1
def value_iteration(P, R, discount=(0.9,), eps=(1e-8,)):
    df_vi = pd.DataFrame(columns=["Discount", "Policy", "Iteration", 
                                  "Time", "Reward", "Value Function"])
    for disc in discount:
        vi = ValueIteration(P, R, gamma=disc, epsilon=eps[0], max_iter=int(1e15))
        vi.run()
        reward = test_policy(P, R, vi.policy)
        info = [float(disc), vi.policy, vi.iter, vi.time, reward, vi.V]
        df_length = len(df_vi)
        df_vi.loc[df_length] = info
    return df_vi
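Note: this snippet, along with Examples #8 and #11, calls a test_policy(P, R, policy) helper that is not shown. The following is a minimal sketch of such a helper, written as an assumption rather than the original code: it estimates a policy's mean discounted reward by simulating rollouts, and assumes P has shape (A, S, S) and R has shape (S, A), as returned by hiive.mdptoolbox.example.forest (Example #8 expects a (mean, std) pair instead).

# Hedged sketch (not from the original sources): one possible test_policy helper.
# Assumes P has shape (A, S, S) and R has shape (S, A), as returned by forest().
import numpy as np

def test_policy(P, R, policy, gamma=0.9, n_episodes=100, horizon=100):
    """Estimate the mean discounted reward of a policy by simulating rollouts."""
    rng = np.random.default_rng(0)
    n_states = R.shape[0]
    totals = []
    for _ in range(n_episodes):
        s, total, discount = 0, 0.0, 1.0
        for _ in range(horizon):
            a = policy[s]
            total += discount * R[s, a]
            discount *= gamma
            # Sample the next state from the transition row for action a in state s.
            s = rng.choice(n_states, p=P[a, s])
        totals.append(total)
    return float(np.mean(totals))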
Example #2
def value_iteration():
    deltas = {}
    rewards = {}
    for size in PROBLEM_SIZES:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        vi = ValueIteration(P, R, 0.9, max_iter=10)
        vi.run()
        delta = [vi.run_stats[i]['Error'] for i in range(len(vi.run_stats))]
        reward = [vi.run_stats[i]['Reward'] for i in range(len(vi.run_stats))]
        deltas[size] = delta
        rewards[size] = reward
        print(vi.policy)
        print(vi.S)
Example #3
    def __init__(self, name, transition, reward, config, outdir):
        """ Constructor for VI """
        self.name = name
        self.title = "VI"
        self.transition = transition
        self.reward = reward
        self.outdir = outdir
        self.config = config
        self.results = None
        self.dataframe = None
        self.policy = None
        self.instance = ValueIteration(transition,
                                       reward,
                                       gamma=config['gamma'],
                                       epsilon=config['epsilon'],
                                       max_iter=config['max_iter'])
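The constructor above simply stores its arguments and builds a ValueIteration instance from a config dict with 'gamma', 'epsilon', and 'max_iter' keys. Below is a minimal sketch of such a config on a small forest MDP, reproducing only the ValueIteration call the wrapper makes (the wrapper class itself is not shown above, so it is not instantiated here).

# Hedged sketch: the config keys mirror what the constructor above reads.
from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.mdp import ValueIteration

P, R = forest(S=10)
config = {'gamma': 0.9, 'epsilon': 1e-6, 'max_iter': 10000}

vi = ValueIteration(P, R,
                    gamma=config['gamma'],
                    epsilon=config['epsilon'],
                    max_iter=config['max_iter'])
vi.run()
print(vi.policy)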
Example #4
def gamma_iter_value():
    gamma = np.arange(0.1, 1.0, 0.1)
    v1_iter = []
    v2_iter = []
    v1_v_mean = []
    v2_v_mean = []
    for g in gamma:
        P, R = forest(num_states, r1, r2, p_fire)
        P2, R2 = forest(num_states, r1, r2, 0.9)
        vi = ValueIteration(P, R, g, 1e-20)
        vi.run()

        vi2 = ValueIteration(P2, R2, g, 1e-20)
        vi2.run()
        v1_iter.append(len(vi.run_stats))
        v2_iter.append(len(vi2.run_stats))
        v1_v_mean.append(vi.run_stats[-1]["Mean V"])
        v2_v_mean.append(vi2.run_stats[-1]["Mean V"])

    # plt.plot(gamma, v1_iter, linestyle='--', marker='o', color='b',label="fire possibility = 0.1")
    # plt.plot(gamma, v2_iter, linestyle='--', marker='o', color='r',label="fire possibility = 0.9")
    # plt.xlabel("Gamma")
    # plt.ylabel("Converged iteration #")
    # plt.title("converged happen at iteration # vs gamma")
    # plt.legend(('fire possibility = 0.1', 'fire possibility = 0.9'), loc="upper left")
    # plt.show()

    plt.plot(gamma,
             v1_v_mean,
             linestyle='--',
             marker='o',
             color='b',
             label="fire possibility = 0.1")
    plt.plot(gamma,
             v2_v_mean,
             linestyle='--',
             marker='o',
             color='r',
             label="fire possibility = 0.9")
    plt.xlabel("Gamma")
    plt.ylabel("Converged Mean Value")
    plt.title("converged Mean Value vs gamma")
    plt.legend(('fire possibility = 0.1', 'fire possibility = 0.9'),
               loc="upper left")
    plt.show()
Example #5
    def plot_convergence(self):

        title = "Convergence for Value Iteration \n" + self.problem_name
        title_dic = {'fontsize': 7, 'fontweight': 'bold'}
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(5, 2))
        ax1.set_xlabel("Iterations", title_dic)
        ax1.set_title(title, title_dic)
        ax1.set_ylabel("Delta Utility(Log Scale)", title_dic)
        ax1.tick_params(axis="x", labelsize=7)
        ax1.tick_params(axis="y", labelsize=7)
        ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        ax1.grid()
        ax1.set_yscale('log')

        title1 = "Cumulative Time (ms) vs Iterations \n" + self.problem_name
        ax2.set_xlabel("Iterations", title_dic)
        ax2.set_title(title1, title_dic)
        ax2.set_ylabel("Time(ms)", title_dic)
        ax2.tick_params(axis="x", labelsize=7)
        ax2.tick_params(axis="y", labelsize=7)
        ax2.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
        ax2.grid()
        plt.tight_layout()
        plt.grid()

        deltaTime = None
        for problem in self.problems:

            hyper_params = problem[self.HYPER_PARAMS_INDEX]
            vi = ValueIteration(problem[self.P_INDEX],
                                problem[self.R_INDEX],
                                gamma=hyper_params["gamma"],
                                max_iter=hyper_params["max_iter"],
                                epsilon=hyper_params["epsilon"])

            stats = vi.run()
            time = [stat["Time"] for stat in stats]
            time = np.array([x - time[i - 1] for i, x in enumerate(time)][1:])
            time = time * 1000

            self.policies[problem[self.PROBLEM_SIZE_INDEX]] = list(vi.policy)

            path = os.path.join(self.output_dir)
            filename = "valueiterationrunstats"
            filename = os.path.join(path, filename)

            outfile = open(filename, 'wb')
            pickle.dump(stats, outfile)
            outfile.close()

            delta = [stat["Error"] for stat in stats]
            ax1.plot(delta,
                     label="Size:" + str(problem[self.PROBLEM_SIZE_INDEX]))
            ax2.plot(time,
                     label="Size:" + str(problem[self.PROBLEM_SIZE_INDEX]))

            #rewards_list.append(rewards)

        ax1.legend(loc='best', fontsize=6)
        ax2.legend(loc='best', fontsize=6)
        path = os.path.join(self.output_dir)
        filename = title + ".png"
        filename = os.path.join(path, filename)
        plt.savefig(filename)
        plt.close()
Example #6
    def tune_hyper_parameter(self, hyper_param_name, values):

        for problem in self.problems:

            rewards_list = []
            hyper_params = problem[self.HYPER_PARAMS_INDEX]

            title = "Tuning " + hyper_param_name + " for Value Iteration \n" + self.problem_name + " Size " + problem[
                self.PROBLEM_SIZE_INDEX]
            title_dic = {'fontsize': 7, 'fontweight': 'bold'}
            fig, (ax1) = plt.subplots(1, 1, figsize=(3, 2))
            ax1.set_xlabel("Iterations (Log Scale)", title_dic)
            ax1.set_xscale('log')
            ax1.set_title(title, title_dic)
            ax1.set_ylabel("Max Utility", title_dic)
            ax1.tick_params(axis="x", labelsize=7)
            ax1.tick_params(axis="y", labelsize=7)
            ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
            ax1.grid()
            plt.tight_layout()
            plt.grid()

            if "max_iter" in hyper_params:
                iters = hyper_params["max_iter"]
            else:
                iters = 10000

            if "epsilon" in hyper_params:
                epsilon = hyper_params["epsilon"]
            else:
                epsilon = 0.0000001

            if 'gamma' in hyper_params:
                gamma = hyper_params["gamma"]
            else:
                gamma = 0.99

            for value in values:

                if hyper_param_name == 'gamma':
                    vi = ValueIteration(problem[self.P_INDEX],
                                        problem[self.R_INDEX],
                                        gamma=value,
                                        max_iter=iters,
                                        epsilon=epsilon)
                elif hyper_param_name == 'max_iter':
                    vi = ValueIteration(problem[self.P_INDEX],
                                        problem[self.R_INDEX],
                                        gamma=gamma,
                                        max_iter=value,
                                        epsilon=epsilon)
                elif hyper_param_name == 'epsilon':
                    vi = ValueIteration(problem[self.P_INDEX],
                                        problem[self.R_INDEX],
                                        gamma=gamma,
                                        max_iter=iters,
                                        epsilon=value)

                stats = vi.run()

                rewards = [stat["Max V"] for stat in stats]
                print("Max Reward with " + hyper_param_name + " value " +
                      str(value) + " " + str(rewards[-1]))

                ax1.plot(rewards, label=str(value))

                #rewards_list.append(rewards)

            ax1.legend(loc='lower right', fontsize=6, ncol=2)
            path = os.path.join(self.output_dir)
            filename = title + ".png"
            filename = os.path.join(path, filename)
            plt.savefig(filename)
            plt.close()
Example #7
    if prob_key not in to_solve:
        continue
    print(f"Running {prob_key}...")
    P, R = PROBS[prob_key]

    if prob_key == "frozen_lake":
        n_epi = 10000
        eps_schedule = make_schedules(n_epi)["exp_decay"]
        alpha_schedule = make_schedules(n_epi)["constant_0.01"]
    elif prob_key == "forest":
        n_epi = 100000
        eps_schedule = make_schedules(n_epi)["constant_0.5"]
        alpha_schedule = make_schedules(n_epi)["constant_0.5"]

    print("..Running value iteration...")
    vi = ValueIteration(P, R, gamma=0.99, epsilon=0.001, max_iter=1000)
    vi.run()
    vi_df = pd.DataFrame(vi.run_stats).set_index("Iteration")
    vi_df.columns = pd.MultiIndex.from_product([vi_df.columns, ["value_iter"]])
    print(f"Runtime per value iter: {vi.time/vi.iter} sec")

    print("..Running policy iteration...")
    pi = PolicyIteration(P, R, gamma=0.99, eval_type=1, max_iter=1000)
    pi.run()
    pi_df = pd.DataFrame(pi.run_stats).set_index("Iteration")
    pi_df.columns = pd.MultiIndex.from_product(
        [pi_df.columns, ["policy_iter"]])
    print(f"Runtime per policy iter: {pi.time/pi.iter} sec")

    print("..Running q-learning...")
    ql = QLearning(
Example #8
    P, R = PROBS[prob_key]

    print("..Running value iteration...")

    res_dict = {
        "Iteration to converge": [],
        "Max V": [],
        "Mean V": [],
        "Optimal policy reward": [],
    }
    V_all = []
    policy_all = []
    r_std_all = []
    for g in DISCOUNT_RATES:
        print(f"..discount rate = {g}...")
        vi = ValueIteration(P, R, gamma=g, epsilon=0.001)
        vi.run()
        res_dict["Iteration to converge"].append(vi.iter)
        res_dict["Max V"].append(vi.run_stats[-1]["Max V"])
        res_dict["Mean V"].append(vi.run_stats[-1]["Mean V"])
        print("...testing...")
        test_r_mean, test_r_std = test_policy(P, R, vi.policy)
        res_dict["Optimal policy reward"].append(test_r_mean)
        r_std_all.append(test_r_std)
        V_all.append(vi.V)
        policy_all.append(vi.policy)
    res_df = pd.DataFrame(res_dict)
    res_df.index = np.array(DISCOUNT_RATES).astype(str)
    res_df.plot(
        subplots=True,
        title=f"Value iteration vs. discount rate on {prob_key}",
Example #9
    env.reset()

    # Enumerate state and action space sizes
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    P = np.zeros((num_actions, num_states, num_states))
    R = np.zeros((num_states, num_actions))
    # prepare gym for mdptoolbox
    for state in env.env.P:
        for action in env.env.P[state]:
            for option in env.env.P[state][action]:
                P[action][state][option[1]] += option[0]
                R[state][action] += option[2]

    vi = ValueIteration(P, R, Gamma, epsilon=0.01, max_iter=20000)

    # run vi
    vi.setVerbose()
    vi.run()
    print("== Value Iteration ==")
    print("Policy: ")
    print_policy(vi.policy, mapping, shape)
    print(vi.policy)
    print("Iterations: ")
    print(vi.iter)
    print("Time: ")
    print(vi.time)
    print(vi.run_stats[-1:])

    iterations = np.zeros(len(vi.run_stats))
Example #10
    plt.xlabel("Gamma")
    plt.ylabel("Converged Mean Value")
    plt.title("converged Mean Value vs gamma")
    plt.legend(('fire possibility = 0.1', 'fire possibility = 0.9'),
               loc="upper left")
    plt.show()


if __name__ == '__main__':
    gamma = 0.9
    num_states = 20
    r1 = 4
    r2 = 2
    p_fire = 0.1
    P, R = forest(num_states, r1, r2, p_fire)
    vi = ValueIteration(P, R, 0.96, 1e-20)
    vi.run()

    P2, R2 = forest(num_states, r1, r2, 0.8)
    vi2 = ValueIteration(P2, R2, 0.96, 1e-20)
    vi2.run()

    # # calculate and plot the v_mean
    # iter_score(vi, vi2)

    # gamma_iter_value()
    # #
    #

    pi = PolicyIteration(P, R, 0.96)
    pi.run()
Example #11
def value_iteration(P, R, id=None):
    # gammas = [0.99, 0.9, 0.8, 0.7, 0.5]
    epsilons = [.1, .01, .001, .00001]
    runtimes = []
    Rs = []
    iters = []
    tracker = ''
    policies = []
    for e in epsilons:
        vi = ValueIteration(P, R, 0.9, epsilon=e, max_iter=100000)
        vi.run()
        r = test_policy(P, R, vi.policy)
        Rs.append(r)
        policies.append(vi.policy)
        runtimes.append(vi.time)
        iters.append(vi.iter)
        tracker += 'epsilon={}: reward was {}, iters was {}, time was {}\n'.format(
            e, r, vi.iter, vi.time)

    # write
    with open('figures/VI_variables_forest_{}.txt'.format(id), 'w') as f:
        f.write(tracker)

    with open('figures/VI_policies_forest_{}.txt'.format(id), 'w') as f:
        for i, e in enumerate(epsilons):
            f.write('epsilon={}: policy={}\n'.format(e, policies[i]))

    # plot
    plt.plot(epsilons, Rs)
    plt.title('VI Avg Rewards')
    plt.xlabel('Epsilons')
    plt.ylabel('Avg rewards')
    plt.savefig('figures/VI_rewards_forest_{}.png'.format(id))
    plt.clf()

    plt.plot(epsilons, iters)
    plt.title('VI iterations')
    plt.xlabel('Epsilons')
    plt.ylabel('Iterations')
    plt.savefig('figures/VI_iters_forest_{}.png'.format(id))
    plt.clf()
    # -----------------------------------
    gammas = [0.99, 0.9, 0.8, 0.7, 0.5]
    # epsilons = [.1, .01, .001, .00001]
    runtimes = []
    Rs = []
    iters = []
    tracker = ''
    policies = []
    for g in gammas:
        vi = ValueIteration(P, R, g, epsilon=0.01, max_iter=100000)
        vi.run()
        r = test_policy(P, R, vi.policy)
        Rs.append(r)
        policies.append(vi.policy)
        runtimes.append(vi.time)
        iters.append(vi.iter)
        tracker += 'gamma={}: reward was {}, iters was {}, time was {}\n'.format(
            g, r, vi.iter, vi.time)

    # write
    with open('figures/VI_variables_forest_{}_g.txt'.format(id), 'w') as f:
        f.write(tracker)

    with open('figures/VI_policies_forest_{}_g.txt'.format(id), 'w') as f:
        for i, g in enumerate(gammas):
            f.write('gamma={}: policy={}\n'.format(g, policies[i]))

    # plot
    plt.plot(gammas, Rs)
    plt.title('VI Avg Rewards')
    plt.xlabel('Gammas')
    plt.ylabel('Avg rewards')
    plt.savefig('figures/VI_rewards_forest_{}_g.png'.format(id))
    plt.clf()

    plt.plot(gammas, iters)
    plt.title('VI iterations')
    plt.xlabel('Gammas')
    plt.ylabel('Iterations')
    plt.savefig('figures/VI_iters_forest_{}_g.png'.format(id))
    plt.clf()
    print('done')
Example #12
env = gym.make('FrozenLake-v0')

Gamma = 0.99

env.reset()

P = np.zeros((4, 16, 16))
R = np.zeros((16, 4))
# prepare gym for mdptoolbox
for state in env.env.P:
    for action in env.env.P[state]:
        for option in env.env.P[state][action]:
            P[action][state][option[1]] += option[0]
            R[state][action] += option[2]

VI = ValueIteration(P, R, Gamma, 0.1, 20000)

# run VI
VI.setVerbose()
VI.run()
print('VI')
print(VI.iter)
print(VI.time)
print(VI.run_stats[-1:])

iterations = np.zeros(len(VI.run_stats))
reward = np.zeros(len(VI.run_stats))
i = 0
for stat in VI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
Example #13
import mdptoolbox.example
from helpers.open_ai_convert import OpenAI_MDPToolbox
from helpers.plot_graphs import plot_rewards
from hiive.mdptoolbox.mdp import ValueIteration

disc = [0.1, 0.3, 0.5, 0.7, 0.9]
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]

ex = OpenAI_MDPToolbox('FrozenLake-v0', False)
P = ex.P
R = ex.R
results = []
for d in disc:
    vi = ValueIteration(P, R, d, epsilon=0.001, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(disc, results, 'Value Iteration Discount/Rewards FrozenLake',
             'value_iteration_discount_rewards_frozenlake', 'Discount')

results = []
for e in ep:
    vi = ValueIteration(P, R, 0.9, epsilon=e, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
Example #14
from hiive.mdptoolbox.mdp import ValueIteration, QLearning, PolicyIteration
from hiive.mdptoolbox.example import forest
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

P, R = forest(2000)

compare_VI_QI_policy = []  # True or False
compare_VI_PI_policy = []

Gamma = 1
Epsilon = 0.0000000000000000000000000000000000000000000000000000000000000000000000000001
Max_Iterations = 200000

VI = ValueIteration(P, R, Gamma, Epsilon, Max_Iterations)

# run VI
VI.setVerbose()
VI.run()
print('VI')
print(VI.iter)
print(VI.time)
print(VI.run_stats[-1:])

iterations = np.zeros(len(VI.run_stats))
reward = np.zeros(len(VI.run_stats))
i = 0
for stat in VI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']