Example #1
def run_maze(maze, title=""):
    T = maze.get_transitions()
    R = maze.get_rewards()
    discount = 0.90

    value_iteration = ValueIteration(T, R, discount)
    value_iteration.run()
    print("VITER REWARD", maze.find_reward(value_iteration.policy))
    print("VITER TIME", value_iteration.time)
    print("VITER ITERS", value_iteration.iter)
    maze.draw_maze(value_iteration.policy, title=title+"v")

    policy_iteration = PolicyIteration(T, R, discount)
    policy_iteration.run()
    print("PITER REWARD", maze.find_reward(policy_iteration.policy))
    print("PITER TIME", policy_iteration.time)
    print("PITER ITERS", policy_iteration.iter)
    maze.draw_maze(policy_iteration.policy, title=title+'p')


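    # Q-learn on the maze, time the training, and extract a greedy policy from the learned Q-table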
    s = time.time()
    Q = maze.qlearn()
    n = time.time()
    q_policy = []
    for state in Q:
        q_policy.append(np.argmax(state))

    maze.draw_maze(q_policy, title=title+'q')
    print("Q LEARN", maze.find_reward(q_policy))
    print("Q LEARN TIME", (n - s))
    print("Q ITERS", maze.q_iters)
Example #2
def stocks_vs_state(n_states=None):
    """Compare performance on the Stocks MDP as a function of state size."""
    if n_states is None:
        n_states = [7, 9, 13, 15, 17, 23, 29, 35, 41, 53, 65, 77, 89]

    for N in n_states:
        mdp = Stocks(N)
        discount = 0.9
        T = mdp.transitions()
        R = mdp.rewards()

        viter = ValueIteration(T, R, discount)
        viter.run()
        rewards, _ = simulate_policy(viter, mdp)
        print("\nValue iteration: {}".format(viter.policy))
        print("# of iterations: {}".format(viter.iter))
        print("Execution time: {}".format(viter.time))
        print("Average reward: {}".format(np.mean(rewards)))

        piter = PolicyIteration(T, R, discount)
        piter.run()
        rewards, _ = simulate_policy(piter, mdp)
        print("\nPolicy iteration: {}".format(piter.policy))
        print("# of iterations: {}".format(piter.iter))
        print("Execution time: {}".format(piter.time))
        print("Average reward: {}".format(np.mean(rewards)))

        qlearn = QLearning(T, R, discount, n_iter=10000)
        qlearn.run()
        rewards, _ = simulate_policy(qlearn, mdp)
        print("\nQ-learning: {}".format(qlearn.policy))
        print("# of iterations: {}".format(qlearn.max_iter))
        print("Execution time: {}".format(qlearn.time))
        print("Average reward: {}".format(np.mean(rewards)))
Example #3
    def solve_MDP(self, dataset, curr_time):

        # only consider until curr_time
        eval_dataset = dataset[dataset[:, 0] <= curr_time, :]

        if eval_dataset.shape[0] > self.param.mdp_max_data:
            eval_dataset = eval_dataset[-self.param.mdp_max_data:, :]

        print('MDP eval_dataset.shape: ', eval_dataset.shape)

        P = self.get_MDP_P()  # in AxSxS

        self.P = P

        R = self.get_MDP_R(eval_dataset, curr_time)  # in SxA

        mdp = ValueIteration(P, R, self.param.mdp_gamma, self.param.mdp_eps,
                             self.param.mdp_max_iter)
        mdp.run()
        V = np.array(mdp.V)
        Q = self.get_MDP_Q(R, V, self.param.mdp_gamma)

        self.v = V
        self.q = Q
        self.r = R.flatten()
        return self.v, self.q, self.r
Example #4
def solve_mini_maze():
    """Solve miniature Maze MDP."""
    M = mini_maze()
    T = M.transitions()
    R = M.rewards()
    discount = 0.9

    viter = ValueIteration(T, R, discount)
    viter.run()
    print("\nValue iteration: {}".format(viter.policy))
    print("# of iterations: {}".format(viter.iter))
    print("Execution time: {}".format(viter.time))

    piter = PolicyIteration(T, R, discount, max_iter=2000)
    piter.run()
    print("\nPolicy iteration: {}".format(piter.policy))
    print("# of iterations: {}".format(piter.iter))
    print("Execution time: {}".format(piter.time))

    qlearn = QLearning(T, R, discount, n_iter=50000)
    qlearn.run()
    print("\nQ-learning: {}".format(qlearn.policy))
    print("# of iterations: {}".format(qlearn.max_iter))
    print("Execution time: {}".format(qlearn.time))

    return viter, piter, qlearn
Example #5
def solve_stocks(N=7):
    """Solve the Stocks MDP."""
    tmp = Stocks(N)
    discount = 0.9
    T = tmp.transitions()
    R = tmp.rewards()

    viter = ValueIteration(T, R, discount)
    viter.run()
    print("\nValue iteration: {}".format(viter.policy))
    print("# of iterations: {}".format(viter.iter))
    print("Execution time: {}".format(viter.time))

    piter = PolicyIteration(T, R, discount)
    piter.run()
    print("\nPolicy iteration: {}".format(piter.policy))
    print("# of iterations: {}".format(piter.iter))
    print("Execution time: {}".format(piter.time))

    qlearn = QLearning(T, R, discount, n_iter=200000)
    qlearn.run()
    print("\nQ-learning: {}".format(qlearn.policy))
    # print("\nQ: \n{}".format(qlearn.Q))
    print("# of iterations: {}".format(qlearn.max_iter))
    print("Execution time: {}".format(qlearn.time))

    return viter, piter, qlearn
Example #6
def run_gamma_sweep(mdp, vi_pi, prob_str, P, R, gammas, dim):
    if mdp not in ("forest", "grid"):
        print("ERROR: Need forest|grid for mdp.  Passed: ", mdp)
        exit(1)
    if vi_pi not in ("vi", "pi"):
        print("ERROR: Need vi|pi for vi_pi.  Passed: ", vi_pi)
        exit(1)
    base_path = './output/csv/' + mdp + '_' + prob_str + '_' + vi_pi + '_'
    base_sweep_path = './output/' + mdp + '_' + prob_str + '_'
    gamma_sweep_file = base_sweep_path + 'gamma_sweep.rpt'
    if mdp == "grid":
        gw = visualize_grid_world(R[:, 0], dim, dim)
        with open(gamma_sweep_file, 'a') as f:
            f.write("Grid World is:\n" + str(gw) + "\n\n")
    for gamma in gammas:
        gamma_stats_file = base_path + 'gamma_' + str(gamma) + '.csv'
        alg_name = "Value Iteration" if vi_pi == "vi" else "Policy Iteration"
        print("Running", alg_name, "with gamma", gamma)
        if vi_pi == "vi":
            alg = ValueIteration(P, R, gamma)
        else:
            alg = PolicyIteration(P, R, gamma)
        stats = alg.run()
        df = pd.DataFrame(stats)
        df.to_csv(gamma_stats_file, index_label="Iteration")
        print(alg_name, "complete.")
        print("Optimal value function: ", alg.V)
        print("Optimal policy: ", alg.policy)
        with open(gamma_sweep_file, 'a') as f:
            f.write("***" + vi_pi + " with Gamma=" + str(gamma) + "***\n")
            if mdp == "forest":
                # Just dump policy
                f.write("Policy is:\n" + str(alg.policy) + "\n")
            if mdp == "grid":
                # Dump reshaped policy and simulated rewards
                reshaped_policy = visualize_policy(alg.policy, dim)
                simulated_rewards = get_reward(P, R, alg.policy, 10)
                f.write("Policy is:\n" + str(reshaped_policy) + "\n")
                f.write("Simulated rewards are:" + str(simulated_rewards) +
                        "\n")
            f.write("***End of " + vi_pi + " with Gamma=" + str(gamma) +
                    "***\n\n")
Example #7
 def valueIteration(self):
     policy_filename = 'policy_npursuers_%d_seed_%d_nrows_%d_ncols_%d_empty_%s.pkl' % (
         self.num_pursuers, self.seed, self.nrows, self.ncols, self.empty)
     if os.path.exists(policy_filename):
         with open(policy_filename, 'rb') as policy_file:
             policy = pickle.load(policy_file)
         return policy
     transitions, rewards = self.compute_alltransitions_reward()
     valueIterationMDP = ValueIteration(transitions,
                                        rewards,
                                        0.99,
                                        skip_check=True)
     valueIterationMDP.run()
     with open(policy_filename, 'wb') as policy_file:
         pickle.dump(valueIterationMDP.policy, policy_file)
     return valueIterationMDP.policy
Example #8
 def __init__(self,
              true_transition_kernel,
              reward,
              discount,
              epsilon=0.01,
              max_iter=1000,
              initial_value=0,
              skip_check=False):
     # call parent constructor
     ValueIteration.__init__(self, true_transition_kernel, reward,
                             discount, epsilon, max_iter, initial_value,
                             skip_check)
     innerfunction.attachProblem(self)
     # bind context of inner function and make it accessible
     self.innerfunction = innerfunction
     self.v_next = full(self.V.shape, -inf)
     self.sigma = 0
     self.max_iter = max_iter  # is set to the wrong value by ValueIteration
Example #9
 def solve_mdp(self, algorithm='PolicyIteration', discount=0.999):
     """
     Run the algorithm over the Markov Decision Process built.
     Available algorithms: PolicyIteration (default), PolicyIterationModified, ValueIteration.
     """
     self.__print_msg('Solving MDP...')
     if algorithm == 'PolicyIteration':
         alg_setup = PolicyIteration(self.transitions, self.rewards, discount=discount)
     elif algorithm == 'PolicyIterationModified':
         alg_setup = PolicyIterationModified(self.transitions, self.rewards, discount=discount)
     else:
         alg_setup = ValueIteration(self.transitions, self.rewards, discount=discount)
     alg_setup.run()
     optimal_policy = [self.jg_actions[i] for i in alg_setup.policy]
     try:
         goal_index = optimal_policy.index(self.goal_state) + 1
     except ValueError:
         goal_index = None
     return optimal_policy[:goal_index]
Example #10
    # Build world
    Trans_Prob, Rewards = grid_world(X=dim,
                                     Y=dim,
                                     prob_desired_move=prob_desired,
                                     prob_bad_state=prob_bad_state,
                                     is_sparse=sparse)
    gw = visualize_grid_world(Rewards[:, 0], dim, dim)
    print("Grid world is: ")
    print(gw)
    with open(summary_file, out_type) as f:
        f.write("Grid world is:\n")
        f.write(str(gw) + "\n\n")

    if run_vi:
        vi = ValueIteration(Trans_Prob, Rewards, 0.9)
        vi_stats = vi.run()
        vi_df = pd.DataFrame(vi_stats)
        vi_df.to_csv(vi_stats_file, index_label="Iteration")
        reshaped_value_function = np.reshape(vi.V, (dim, dim))
        reshaped_policy = visualize_policy(vi.policy, dim)
        simulated_rewards = get_reward(Trans_Prob, Rewards, vi.policy, 10, dim,
                                       sparse)
        print("VI: Performed ", vi.iter, " iterations in ", vi.time,
              " and got rewards of: ", simulated_rewards)
        with open(summary_file, 'a') as f:
            f.write("***Value Iteration Section***\n")
            f.write("Iterations: " + str(vi.iter) + "\n")
            f.write("Runtime: " + str(vi.time) + "\n")
            f.write("Value function:\n")
            f.write(str(reshaped_value_function))
Example #11
from mdptoolbox.example import rand
from mdptoolbox.mdp import ValueIteration
from src.KronMDP import multiagent, multiagent_full, KronValueIteration
from timeit import default_timer as timer
from functools import reduce

RUNBIG = False
RUNKRON = True
RUNFULL = False

# large example with memory problems - python cannot create example
if RUNBIG:
    P, R = rand(10, 2)
    vi = ValueIteration(P, R, 0.95)
    vi.run()

# kron example (not as dense)
if RUNKRON:
    Ps, R = multiagent(S=10, N=5)
    start = timer()
    vi = KronValueIteration(Ps, R, 0.95, skip_check=True)
    vi.run()
    end = timer()
    print("kronecker method took", end - start, "seconds")

# compare with fully computed example
if RUNFULL:
    P, R = multiagent_full(S=10, N=2)
    start = timer()
    vi = ValueIteration(P, R, 0.95)
    vi.run()
Example #12
def forest_experiment():
    P, R = mdptoolbox.example.forest(S=1250, r1=500, r2=250)
    value = []
    policy = []
    iters = []
    time_ = []
    gamma = []

    rewards_p = []
    rewards_v = []
    time_p = []
    time_v = []
    iters_p = []
    iters_v = []
    rewards_q = []
    time_q = []
    iters_q = []

    mean_discrep = []

    env2 = gym.make('FrozenLake-v0')

    q_table = []
    value_q = []
    policy_q = []
    iters_q = []
    time_q_arr = []
    gamma_q = []
    q_vals = []
    q_rewards = []
    mean_discrep = []

    for i in range(0, 10):
        gamma_i = (i + 0.5) / 10
        q_policy = mdptoolbox.mdp.QLearning(P, R, gamma_i)
        start = time.time()
        q_policy.run()
        time_q = time.time() - start
        q_rewards.append(np.mean(q_policy.V))
        value_q.append(np.mean(q_policy.V))
        policy_q.append(q_policy.policy)
        gamma_q.append(gamma_i)
        q_vals.append(q_policy.Q)
        mean_discrep.append(q_policy.mean_discrepancy)
        # iters_q.append(q_policy.n_iters)
        time_q_arr.append(time_q)


    plt.plot(gamma_q, mean_discrep, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Q-Learning Mean Discrepancy')
    plt.ylabel('Mean Discrepancy')
    plt.grid()
    plt.show()

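    # Compare policy iteration, value iteration, and Q-learning on the forest MDP for each state size in the list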
    for size in [1250]:

        P, R = mdptoolbox.example.forest(S=size)
        forest_policy_p = PolicyIteration(P, R, 0.99)
        forest_policy_v = ValueIteration(P, R, 0.99)
        forest_policy_q = QLearning(P, R, 0.1)
        forest_policy_p.run()
        forest_policy_v.run()
        forest_policy_q.run()
        rewards_p.append(np.mean(forest_policy_p.V))
        rewards_v.append(np.mean(forest_policy_v.V))
        rewards_q.append(np.mean(forest_policy_q.V))
        time_p.append(forest_policy_p.time)
        time_v.append(forest_policy_v.time)
        #time_q.append(forest_policy_q.time)
        iters_p.append(forest_policy_p.iter)
        iters_v.append(forest_policy_v.iter)
        #iters_q.append(forest_policy_q.iter)


    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_p, label='Policy Iteration')
    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_v, label='Value Iteration')
    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_q, label='Q-Learning')
    #plt.grid()
    #plt.xlabel('State Size')
    #plt.title('Forest Management - Rewards vs State Size')
    #plt.ylabel('Average Rewards')
    #plt.legend()
    #plt.show()

    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_p, label='Policy Iteration')
    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_v, label='Value Iteration')
    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_q, label='Q-Learning')
    #plt.grid()
    #plt.xlabel('State Size')
    #plt.title('Forest Management - Computation Time vs State Size')
    #plt.ylabel('Computation Time')
    #plt.legend()
    #plt.show()

    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], iters_p, label='Policy Iteration')
    #plt.plot([1250, 1500, 1750, 2000, 2250, 2500], iters_v, label='Value Iteration')
    #plt.grid()
    #plt.xlabel('State Size')
    #plt.title('Forest Management - Convergence vs State Size')
    #plt.ylabel('Iterations')
    #plt.legend()
    #plt.show()

    value_vi = []
    policy_vi = []
    iters_vi = []
    time_vi = []
    gamma_vi = []
    mean_discrep_p = []

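    # Sweep the discount factor (gamma) for policy iteration, recording runtime, policy, iterations, and mean value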
    for i in range(0, 10):
        forest_policy = PolicyIteration(P, R, (i+0.5)/10)
        forest_policy.run()
        gamma.append((i+0.5)/10)
        plt.imshow(np.atleast_2d(forest_policy.policy))
        time_.append(forest_policy.time)
        policy.append(forest_policy.policy)
        iters.append(forest_policy.iter)
        value.append(np.mean(forest_policy.V))

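    # Repeat the gamma sweep for value iteration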
    for i in range(0, 10):
        forest_policy = ValueIteration(P, R, (i+0.5)/10)
        forest_policy.run()
        gamma_vi.append((i+0.5)/10)
        time_vi.append(forest_policy.time)
        policy_vi.append(forest_policy.policy)
        iters_vi.append(forest_policy.iter)
        value_vi.append(np.mean(forest_policy.V))

    #P, R = mdptoolbox.example.forest(S=1250, p=0.1)
    value_q = []
    policy_q = []
    iters_q = []
    time_q_arr = []
    gamma_q = []
    q_vals = []
    q_rewards = []
    mean_discrep = []


    env2 = gym.make('FrozenLake-v0')

    q_table = []

    for i in range(0, 10):
        gamma_i = (i + 0.5) / 10
        q_policy = mdptoolbox.mdp.QLearning(P, R, gamma_i)
        start = time.time()
        q_policy.run()
        time_q = time.time() - start
        q_rewards.append(np.mean(q_policy.V))
        value_q.append(np.mean(q_policy.V))
        policy_q.append(q_policy.policy)
        gamma_q.append(gamma_i)
        q_vals.append(q_policy.Q)
        mean_discrep.append(q_policy.mean_discrepancy)
        # iters_q.append(q_policy.n_iters)
        time_q_arr.append(time_q)

    plt.plot(gamma, time_, label='Policy Iteration')
    plt.plot(gamma_vi, time_vi, label='Value Iteration')
    plt.plot(gamma_q, time_q_arr, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Forest Management - Computation Time - Policy Iteration vs Value Iteration vs Q-Learning')
    plt.ylabel('Computation Time')
    plt.grid()
    plt.legend()
    plt.show()
    

    plt.plot(gamma, value, label='Policy Iteration')
    plt.plot(gamma_vi, value_vi, label='Value Iteration')
    plt.plot(gamma_q, q_rewards, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Average Rewards - Policy Iteration vs Value Iteration vs Q-Learning')
    plt.ylabel('Average Rewards')
    plt.grid()
    plt.legend()
    plt.show()

    plt.plot(gamma, iters, label="Policy Iteration")
    plt.plot(gamma_vi, iters_vi, label="Value Iteration")
    #plt.plot(gamma_q, iters_q, label="Q-Learning")
    plt.xlabel('Gammas')
    plt.title('Iterations to Converge - Policy Iteration vs Value Iteration')
    plt.ylabel('Iterations')
    plt.grid()
    plt.legend()
    plt.show()
Example #13
eiters = []
eghls = []
ets = []
bestgoal = 0
bestpolicy = None
bestpolicyV = None
bestpolicyparams = {}

print("Running ...")
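# Grid-search over epsilon and gamma: run value iteration for each pair, evaluate the resulting policy, and track the best-performing one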
for epsilon in epsilons:
    iters = []
    ghls = []
    ts = []
    for gamma in gammas:
        #print("gamma: %.1f, epsilon: %s" % (gamma, str(epsilon)))
        func = ValueIteration(P, R, gamma, max_iter=maxiter, epsilon=epsilon)
        func.run()
        #print("best policy:")
        #common.printPolicy(env, func.policy, actions)
        timesteps, gtimesteps, ghl = common.runPolicy(env, episodes,
                                                      func.policy)
        if ghl[0] > bestgoal:
            bestgoal = ghl[0]
            bestpolicy = func.policy
            bestpolicyV = func.V
            bestpolicyparams['gamma'] = gamma
            bestpolicyparams['epsilon'] = epsilon
            bestpolicyparams['iterations'] = func.iter
            bestpolicyparams['elapsedtime'] = func.time
            bestpolicyparams['meangtimesteps'] = np.mean(gtimesteps)
        iters.append(func.iter)
Example #14
def example():
    """Run the MDP Toolbox forest example."""
    transitions, rewards = mdptoolbox.example.forest()
    viter = ValueIteration(transitions, rewards, 0.9)
    viter.run()
    print(viter.policy)