def run_maze(maze, title=""):
    """Solve a Maze MDP with value iteration, policy iteration, and Q-learning."""
    T = maze.get_transitions()
    R = maze.get_rewards()
    discount = 0.90

    # Value iteration
    value_iteration = ValueIteration(T, R, discount)
    value_iteration.run()
    print("VITER REWARD", maze.find_reward(value_iteration.policy))
    print("VITER TIME", value_iteration.time)
    print("VITER ITERS", value_iteration.iter)
    maze.draw_maze(value_iteration.policy, title=title + "v")

    # Policy iteration
    policy_iteration = PolicyIteration(T, R, discount)
    policy_iteration.run()
    print("PITER REWARD", maze.find_reward(policy_iteration.policy))
    print("PITER TIME", policy_iteration.time)
    print("PITER ITERS", policy_iteration.iter)
    maze.draw_maze(policy_iteration.policy, title=title + "p")

    # Q-learning: the greedy policy takes the argmax over actions in each row of Q.
    s = time.time()
    Q = maze.qlearn()
    n = time.time()
    q_policy = [np.argmax(state) for state in Q]
    maze.draw_maze(q_policy, title=title + "q")
    print("Q LEARN", maze.find_reward(q_policy))
    print("Q LEARN TIME", n - s)
    print("Q ITERS", maze.q_iters)
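# Minimal, self-contained sketch of the greedy-policy extraction used in run_maze:
# each row of a Q table (one row per state, one column per action) maps to the
# action with the highest Q-value. The values below are illustrative only.
import numpy as np

Q_demo = np.array([[0.1, 0.5, 0.2],   # state 0 -> action 1
                   [0.7, 0.3, 0.0],   # state 1 -> action 0
                   [0.2, 0.2, 0.9]])  # state 2 -> action 2
greedy_policy = [int(np.argmax(row)) for row in Q_demo]
assert greedy_policy == [1, 0, 2]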
def __init__(self, transitions, reward, discount, epsilon=0.01,
             max_iter=10, skip_check=False):
    # Initialise a (modified) policy iteration MDP.
    # Maybe it is better not to subclass from PolicyIteration, because the
    # initialisation of the two is quite different: for example, policy0 is
    # calculated there but is not needed here. The only thing needed from the
    # PolicyIteration class is the _evalPolicyIterative function.
    # Perhaps there is a better way to do it?
    PolicyIteration.__init__(self, transitions, reward, discount, None,
                             max_iter, 1, skip_check=skip_check)

    # PolicyIteration doesn't pass epsilon to MDP.__init__(), so check it here.
    self.epsilon = float(epsilon)
    assert epsilon > 0, "'epsilon' must be greater than 0."

    # Threshold on the variation of V for an epsilon-optimal policy.
    if self.discount != 1:
        self.thresh = self.epsilon * (1 - self.discount) / self.discount
    else:
        self.thresh = self.epsilon

    # Initial value function: zeros in the undiscounted case, otherwise the
    # pessimistic bound Rmin / (1 - discount).
    if self.discount == 1:
        self.V = _np.zeros(self.S)
    else:
        Rmin = min(R.min() for R in self.R)
        self.V = 1 / (1 - self.discount) * Rmin * _np.ones((self.S,))
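# Usage sketch against the stock pymdptoolbox API that the __init__ above mirrors;
# the built-in forest example (3 states, 2 actions) is used purely for illustration.
import mdptoolbox.example
from mdptoolbox.mdp import PolicyIterationModified

P, R = mdptoolbox.example.forest()
pim = PolicyIterationModified(P, R, 0.9, epsilon=0.01, max_iter=10)
pim.run()
print(pim.policy, pim.iter, pim.time)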
def stocks_vs_state(n_states=None):
    """Compare performance on the Stocks MDP as a function of state size."""
    if n_states is None:
        n_states = [7, 9, 13, 15, 17, 23, 29, 35, 41, 53, 65, 77, 89]

    for N in n_states:
        mdp = Stocks(N)
        discount = 0.9
        T = mdp.transitions()
        R = mdp.rewards()

        viter = ValueIteration(T, R, discount)
        viter.run()
        rewards, _ = simulate_policy(viter, mdp)
        print("\nValue iteration: {}".format(viter.policy))
        print("# of iterations: {}".format(viter.iter))
        print("Execution time: {}".format(viter.time))
        print("Average reward: {}".format(np.mean(rewards)))

        piter = PolicyIteration(T, R, discount)
        piter.run()
        rewards, _ = simulate_policy(piter, mdp)
        print("\nPolicy iteration: {}".format(piter.policy))
        print("# of iterations: {}".format(piter.iter))
        print("Execution time: {}".format(piter.time))
        print("Average reward: {}".format(np.mean(rewards)))

        qlearn = QLearning(T, R, discount, n_iter=10000)
        qlearn.run()
        # Evaluate the Q-learning policy itself.
        rewards, _ = simulate_policy(qlearn, mdp)
        print("\nQ-learning: {}".format(qlearn.policy))
        print("# of iterations: {}".format(qlearn.max_iter))
        print("Execution time: {}".format(qlearn.time))
        print("Average reward: {}".format(np.mean(rewards)))
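# simulate_policy is defined elsewhere in this repo; a minimal stand-in could look
# like the sketch below. The name, episode count, and (rewards, states) return shape
# are assumptions, and P/R are expected in the pymdptoolbox layout: P is (A, S, S)
# row-stochastic, R is (S, A).
import numpy as np

def simulate_policy_sketch(solver, P, R, n_episodes=100, horizon=50, seed=0):
    """Monte-Carlo rollout of solver.policy, returning per-episode rewards and visited states."""
    rng = np.random.default_rng(seed)
    n_states = P.shape[1]
    episode_rewards, visited = [], []
    for _ in range(n_episodes):
        s = rng.integers(n_states)               # random start state
        total = 0.0
        for _ in range(horizon):
            a = solver.policy[s]                 # follow the computed policy
            total += R[s, a]
            s = rng.choice(n_states, p=P[a, s])  # sample the next state
            visited.append(s)
        episode_rewards.append(total)
    return episode_rewards, visited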
def solve_mini_maze():
    """Solve miniature Maze MDP."""
    M = mini_maze()
    T = M.transitions()
    R = M.rewards()
    discount = 0.9

    viter = ValueIteration(T, R, discount)
    viter.run()
    print("\nValue iteration: {}".format(viter.policy))
    print("# of iterations: {}".format(viter.iter))
    print("Execution time: {}".format(viter.time))

    piter = PolicyIteration(T, R, discount, max_iter=2000)
    piter.run()
    print("\nPolicy iteration: {}".format(piter.policy))
    print("# of iterations: {}".format(piter.iter))
    print("Execution time: {}".format(piter.time))

    qlearn = QLearning(T, R, discount, n_iter=50000)
    qlearn.run()
    print("\nQ-learning: {}".format(qlearn.policy))
    print("# of iterations: {}".format(qlearn.max_iter))
    print("Execution time: {}".format(qlearn.time))

    return viter, piter, qlearn
def solve_stocks(N=7):
    """Solve the Stocks MDP."""
    tmp = Stocks(N)
    discount = 0.9
    T = tmp.transitions()
    R = tmp.rewards()

    viter = ValueIteration(T, R, discount)
    viter.run()
    print("\nValue iteration: {}".format(viter.policy))
    print("# of iterations: {}".format(viter.iter))
    print("Execution time: {}".format(viter.time))

    piter = PolicyIteration(T, R, discount)
    piter.run()
    print("\nPolicy iteration: {}".format(piter.policy))
    print("# of iterations: {}".format(piter.iter))
    print("Execution time: {}".format(piter.time))

    qlearn = QLearning(T, R, discount, n_iter=200000)
    qlearn.run()
    print("\nQ-learning: {}".format(qlearn.policy))
    # print("\nQ: \n{}".format(qlearn.Q))
    print("# of iterations: {}".format(qlearn.max_iter))
    print("Execution time: {}".format(qlearn.time))

    return viter, piter, qlearn
def solve_mdp(self, algorithm='PolicyIteration', discount=0.999):
    """
    Run the chosen algorithm over the Markov Decision Process built.

    Available algorithms: PolicyIteration, PolicyIterationModified,
    ValueIteration (the fallback for any other value).
    """
    self.__print_msg('Solving MDP...')

    if algorithm == 'PolicyIteration':
        alg_setup = PolicyIteration(self.transitions, self.rewards, discount=discount)
    elif algorithm == 'PolicyIterationModified':
        alg_setup = PolicyIterationModified(self.transitions, self.rewards, discount=discount)
    else:
        alg_setup = ValueIteration(self.transitions, self.rewards, discount=discount)

    alg_setup.run()
    optimal_policy = [self.jg_actions[i] for i in alg_setup.policy]

    # Truncate the policy just after the goal state, if the goal appears at all.
    try:
        goal_index = optimal_policy.index(self.goal_state) + 1
    except ValueError:
        goal_index = None
    return optimal_policy[:goal_index]
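# Self-contained illustration of the truncate-at-goal idiom above: list.index raises
# ValueError when the goal never appears, in which case the slice [:None] keeps the
# whole policy. The action names are made up for the example.
plan = ['north', 'east', 'pick', 'goal', 'idle']
try:
    goal_index = plan.index('goal') + 1
except ValueError:
    goal_index = None
assert plan[:goal_index] == ['north', 'east', 'pick', 'goal']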
def run_gamma_sweep(mdp, vi_pi, prob_str, P, R, gammas, dim):
    # Validate arguments: mdp must be "forest" or "grid"; vi_pi must be "vi" or "pi".
    if mdp not in ("forest", "grid"):
        print("ERROR: Need forest|grid for mdp. Passed: ", mdp)
        exit(1)
    if vi_pi not in ("vi", "pi"):
        print("ERROR: Need vi|pi for vi_pi. Passed: ", vi_pi)
        exit(1)

    base_path = './output/csv/' + mdp + '_' + prob_str + '_' + vi_pi + '_'
    base_sweep_path = './output/' + mdp + '_' + prob_str + '_'
    gamma_sweep_file = base_sweep_path + 'gamma_sweep.rpt'

    if mdp == "grid":
        gw = visualize_grid_world(R[:, 0], dim, dim)
        with open(gamma_sweep_file, 'a') as f:
            f.write("Grid World is:\n" + str(gw) + "\n\n")

    for gamma in gammas:
        gamma_stats_file = base_path + 'gamma_' + str(gamma) + '.csv'
        print("Running", vi_pi, "with gamma", gamma)
        if vi_pi == "vi":
            alg = ValueIteration(P, R, gamma)
        else:
            alg = PolicyIteration(P, R, gamma)
        stats = alg.run()
        df = pd.DataFrame(stats)
        df.to_csv(gamma_stats_file, index_label="Iteration")
        print("Run complete.")
        print("Optimal value function: ", alg.V)
        print("Optimal policy: ", alg.policy)

        with open(gamma_sweep_file, 'a') as f:
            f.write("***" + vi_pi + " with Gamma=" + str(gamma) + "***\n")
            if mdp == "forest":
                # Just dump the policy.
                f.write("Policy is:\n" + str(alg.policy) + "\n")
            if mdp == "grid":
                # Dump the reshaped policy and simulated rewards.
                reshaped_policy = visualize_policy(alg.policy, dim)
                simulated_rewards = get_reward(P, R, alg.policy, 10)
                f.write("Policy is:\n" + str(reshaped_policy) + "\n")
                f.write("Simulated rewards are:" + str(simulated_rewards) + "\n")
            f.write("***End of " + vi_pi + " with Gamma=" + str(gamma) + "***\n\n")
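# visualize_policy (and visualize_grid_world) are helpers defined elsewhere in the
# repo; a plausible stand-in for the former is sketched below, assuming a 0-3 =
# up/right/down/left action coding, which is an assumption rather than the repo's
# actual mapping.
import numpy as np

def visualize_policy_sketch(policy, dim, arrows=('^', '>', 'v', '<')):
    """Reshape a flat policy of length dim*dim into a dim x dim grid of arrow symbols."""
    symbols = np.array([arrows[a] for a in policy])
    return symbols.reshape(dim, dim)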
def __init__(self, transitions, reward, discount, policy0=None,
             max_iter=1000, eval_type=0, skip_check=False):
    # Python MDP toolbox from https://github.com/sawcordwell/pymdptoolbox
    # In the Matlab MDP Toolbox, P = (S, S, A) and R = (S, A);
    # in the Python MDP Toolbox, P = (A, S, S) and R = (S, A).
    # Move the action axis first: (S, S, A) -> (A, S, S).
    transitions = np.transpose(transitions, (2, 0, 1)).copy()
    # Skip the stochasticity check to avoid
    # "PyMDPToolbox - The transition probability matrix is not stochastic."
    skip_check = True
    PolicyIteration.__init__(self, transitions, reward, discount,
                             policy0=policy0, max_iter=max_iter,
                             eval_type=eval_type, skip_check=skip_check)
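# Axis-order sanity check for the transpose above: a Matlab-style (S, S, A) tensor
# becomes the Python-toolbox (A, S, S) layout, i.e. P_matlab[s, s2, a] == P_python[a, s, s2].
# Toy random shapes only.
import numpy as np

P_matlab = np.random.rand(4, 4, 2)            # (S, S, A)
P_python = np.transpose(P_matlab, (2, 0, 1))  # (A, S, S)
assert P_python.shape == (2, 4, 4)
assert P_python[1, 3, 0] == P_matlab[3, 0, 1]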
print("VI: Performed ", vi.iter, " iterations in ", vi.time, " and got rewards of: ", simulated_rewards) with open(summary_file, 'a') as f: f.write("***Value Iteration Section***\n") f.write("Iterations: " + str(vi.iter) + "\n") f.write("Runtime: " + str(vi.time) + "\n") f.write("Value function:\n") f.write(str(reshaped_value_function)) f.write("\nPolicy:\n") f.write(str(reshaped_policy)) f.write("\nSimulated rewards:\n") f.write(str(simulated_rewards)) f.write("\n***End of Value Iteration Section***\n\n") if run_pi: pi = PolicyIteration(Trans_Prob, Rewards, 0.9) pi_stats = pi.run() pi_df = pd.DataFrame(pi_stats) pi_df.to_csv(pi_stats_file, index_label="Iteration") reshaped_value_function = np.reshape(pi.V, (dim, dim)) reshaped_policy = visualize_policy(pi.policy, dim) simulated_rewards = get_reward(Trans_Prob, Rewards, pi.policy, 10, dim, sparse) print("PI: Performed ", pi.iter, " iterations in ", pi.time, " and got rewards of: ", simulated_rewards) with open(summary_file, 'a') as f: f.write("***Policy Iteration Section***\n") f.write("Iterations: " + str(pi.iter) + "\n") f.write("Runtime: " + str(pi.time) + "\n") f.write("Value function:\n") f.write(str(reshaped_value_function))
def forest_experiment():
    P, R = mdptoolbox.example.forest(S=1250, r1=500, r2=250)

    value = []
    policy = []
    iters = []
    time_ = []
    gamma = []
    rewards_p = []
    rewards_v = []
    time_p = []
    time_v = []
    iters_p = []
    iters_v = []
    rewards_q = []
    time_q = []
    iters_q = []
    mean_discrep = []

    env2 = gym.make('FrozenLake-v0')
    q_table = []
    value_q = []
    policy_q = []
    iters_q = []
    time_q_arr = []
    gamma_q = []
    q_vals = []
    q_rewards = []
    mean_discrep = []

    # Q-learning sweep over the discount factor gamma.
    for i in range(0, 10):
        gamma_i = (i + 0.5) / 10
        start = time.time()
        q_policy = mdptoolbox.mdp.QLearning(P, R, gamma_i)
        q_policy.run()
        time_q = time.time() - start
        q_rewards.append(np.mean(q_policy.V))
        value_q.append(np.mean(q_policy.V))
        policy_q.append(q_policy.policy)
        gamma_q.append(gamma_i)
        q_vals.append(q_policy.Q)
        mean_discrep.append(q_policy.mean_discrepancy)
        # iters_q.append(q_policy.n_iters)
        time_q_arr.append(time_q)

    plt.plot(gamma_q, mean_discrep, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Q-Learning Mean Discrepancy')
    plt.ylabel('Mean Discrepancy')
    plt.grid()
    plt.show()

    # Single state-size run (the commented-out plots below cover a larger sweep).
    for size in [1250]:
        P, R = mdptoolbox.example.forest(S=size)
        forest_policy_p = PolicyIteration(P, R, 0.99)
        forest_policy_v = ValueIteration(P, R, 0.99)
        forest_policy_q = QLearning(P, R, 0.1)
        forest_policy_p.run()
        forest_policy_v.run()
        forest_policy_q.run()
        rewards_p.append(np.mean(forest_policy_p.V))
        rewards_v.append(np.mean(forest_policy_v.V))
        rewards_q.append(np.mean(forest_policy_q.V))
        time_p.append(forest_policy_p.time)
        time_v.append(forest_policy_v.time)
        # time_q.append(forest_policy_q.time)
        iters_p.append(forest_policy_p.iter)
        iters_v.append(forest_policy_v.iter)
        # iters_q.append(forest_policy_q.iter)

    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_p, label='Policy Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_v, label='Value Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_q, label='Q-Learning')
    # plt.grid()
    # plt.xlabel('State Size')
    # plt.title('Forest Management - Rewards vs State Size')
    # plt.ylabel('Average Rewards')
    # plt.legend()
    # plt.show()

    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_p, label='Policy Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_v, label='Value Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_q, label='Q-Learning')
    # plt.grid()
    # plt.xlabel('State Size')
    # plt.title('Forest Management - Computation Time vs State Size')
    # plt.ylabel('Computation Time')
    # plt.legend()
    # plt.show()

    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], iters_p, label='Policy Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], iters_v, label='Value Iteration')
    # plt.grid()
    # plt.xlabel('State Size')
    # plt.title('Forest Management - Convergence vs State Size')
    # plt.ylabel('Iterations')
    # plt.legend()
    # plt.show()

    value_vi = []
    policy_vi = []
    iters_vi = []
    time_vi = []
    gamma_vi = []
    mean_discrep_p = []

    # Policy iteration sweep over gamma.
    for i in range(0, 10):
        forest_policy = PolicyIteration(P, R, (i + 0.5) / 10)
        forest_policy.run()
        gamma.append((i + 0.5) / 10)
        plt.imshow(np.atleast_2d(forest_policy.policy))
        time_.append(forest_policy.time)
        policy.append(forest_policy.policy)
        iters.append(forest_policy.iter)
        value.append(np.mean(forest_policy.V))

    # Value iteration sweep over gamma.
    for i in range(0, 10):
        forest_policy = ValueIteration(P, R, (i + 0.5) / 10)
        forest_policy.run()
        gamma_vi.append((i + 0.5) / 10)
        time_vi.append(forest_policy.time)
        policy_vi.append(forest_policy.policy)
        iters_vi.append(forest_policy.iter)
        value_vi.append(np.mean(forest_policy.V))

    # P, R = mdptoolbox.example.forest(S=1250, p=0.1)
    value_q = []
    policy_q = []
    iters_q = []
    time_q_arr = []
    gamma_q = []
    q_vals = []
    q_rewards = []
    mean_discrep = []
    env2 = gym.make('FrozenLake-v0')
    q_table = []

    # Q-learning sweep over the discount factor gamma.
    for i in range(0, 10):
        gamma_i = (i + 0.5) / 10
        start = time.time()
        q_policy = mdptoolbox.mdp.QLearning(P, R, gamma_i)
        q_policy.run()
        time_q = time.time() - start
        q_rewards.append(np.mean(q_policy.V))
        value_q.append(np.mean(q_policy.V))
        policy_q.append(q_policy.policy)
        gamma_q.append(gamma_i)
        q_vals.append(q_policy.Q)
        mean_discrep.append(q_policy.mean_discrepancy)
        # iters_q.append(q_policy.n_iters)
        time_q_arr.append(time_q)

    plt.plot(gamma, time_, label='Policy Iteration')
    plt.plot(gamma_vi, time_vi, label='Value Iteration')
    plt.plot(gamma_q, time_q_arr, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Forest Management - Computation Time - Policy Iteration vs Value Iteration vs Q-Learning')
    plt.ylabel('Computation Time')
    plt.grid()
    plt.legend()
    plt.show()

    plt.plot(gamma, value, label='Policy Iteration')
    plt.plot(gamma_vi, value_vi, label='Value Iteration')
    plt.plot(gamma_q, q_rewards, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Average Rewards - Policy Iteration vs Value Iteration vs Q-Learning')
    plt.ylabel('Average Rewards')
    plt.grid()
    plt.legend()
    plt.show()

    plt.plot(gamma, iters, label="Policy Iteration")
    plt.plot(gamma_vi, iters_vi, label="Value Iteration")
    # plt.plot(gamma_q, iters_q, label="Q-Learning")
    plt.xlabel('Gammas')
    plt.title('Iterations to Converge - Policy Iteration vs Value Iteration')
    plt.ylabel('Iterations')
    plt.grid()
    plt.legend()
    plt.show()
from mdptoolbox.example import rand
from mdptoolbox.mdp import ValueIteration, PolicyIteration, _LP
from examples.MDP_models import multiagent, multiagent_full
from src.KronMDP import KronValueIteration, KronPolicyIteration
from timeit import default_timer as timer
from functools import reduce

RUNBIG = False
RUNKRON = True
RUNFULL = False

# Large example with memory problems - Python cannot even create the example.
if RUNBIG:
    start = timer()
    P, R = rand(1000000, 5)
    vi = PolicyIteration(P, R, 0.95)
    vi.run()
    end = timer()
    print("Full method took", end - start, "seconds")

# Kronecker-factored example (not as dense).
if RUNKRON:
    Ps, R = multiagent(S=10, N=5)
    start = timer()
    vi = KronPolicyIteration(Ps, R, 0.95, skip_check=True)
    vi.run()
    end = timer()
    print("kronecker method took", end - start, "seconds")

# Compare with the fully computed example.
if RUNFULL: