# Q-learning on FrozenLake: sweep over discount factors.
results = []
for d in disc:
    ql = QLearning(
        P,  # transitions
        R,  # rewards
        d,  # discount
        alpha=0.1,
        alpha_decay=0.99,
        alpha_min=0.001,
        epsilon=1.0,
        epsilon_min=0.1,
        epsilon_decay=0.99,
        n_iter=10000,
        skip_check=False,
        iter_callback=None,
        run_stat_frequency=None)
    ql.run()
    # print('q learning Q matrix:', ql.Q)
    print('q learning value function:', ql.V)
    # print('q learning mean discrepancy:', ql.mean_discrepancy)
    print('q learning best policy:', ql.policy)
    results.append(ql)

plot_rewards(disc, results, 'Q-Learning Discount/Rewards FrozenLake',
             'q_learning_discount_rewards_frozenlake', 'Discount')

# Sweep over exploration rates at a fixed discount of 0.55.
results = []
for e in ep:
    ql = QLearning(
        P,  # transitions
        R,  # rewards
        0.55,  # discount
        alpha=0.1,
        alpha_decay=0.99,
        alpha_min=0.001,
        epsilon=e,
        epsilon_min=0.000001,
        epsilon_decay=0.99,
        n_iter=10000,
        skip_check=False,
        iter_callback=None,
        run_stat_frequency=None)
    ql.run()
    print('q learning value function:', ql.V)
    print('q learning best policy:', ql.policy)
    results.append(ql)

plot_rewards(ep, results, 'Q-Learning Epsilon/Rewards FrozenLake',
             'q_learning_epsilon_rewards_frozenlake', 'Epsilon')
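# QLearning's V is estimated from sampled backups rather than computed
# exactly, so a more direct quality check is to roll the greedy policy out
# in the live environment. A minimal sketch, assuming the classic gym API
# used by FrozenLake-v0 (reset() -> state, step(a) -> (state, reward, done,
# info)); the helper name rollout_policy is illustrative, not mdptoolbox API.
import gym


def rollout_policy(policy, env_name='FrozenLake-v0', episodes=1000):
    """Estimate the mean undiscounted return of a deterministic policy."""
    env = gym.make(env_name)
    total = 0.0
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            state, reward, done, _ = env.step(policy[state])
            total += reward
    env.close()
    return total / episodes

# Example: print('QL mean return:', rollout_policy(ql.policy))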
# Policy iteration on FrozenLake: sweep over discount factors.
results = []
for d in disc:
    pi = PolicyIteration(
    # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        d,  # discount
        # epsilon=0.01,
        max_iter=1000,
    )
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(disc, results, 'Policy Iteration Discount/Rewards FrozenLake',
             'policy_iteration_discount_rewards_frozenlake', 'Discount')

# Sweep over epsilon at a fixed discount of 0.9. Note that plain
# PolicyIteration takes no epsilon parameter, so the attribute assignment
# below is a no-op unless the commented-out PolicyIterationModified variant
# is swapped in.
results = []
for e in ep:
    pi = PolicyIteration(
    # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        0.9,  # discount
        # epsilon=e,
        max_iter=1000,
    )
    pi.epsilon = e
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(ep, results, 'Policy Iteration Epsilon/Rewards FrozenLake',
             'policy_iteration_epsilon_rewards_frozenlake', 'Epsilon')
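# For the same P, R, and discount, policy iteration and value iteration
# should agree on the optimal policy (up to ties), which makes a cheap
# cross-check between the two solvers. A minimal sketch, assuming `pi` and
# `vi` are solved instances for matching settings; policy_agreement is an
# illustrative helper, not part of mdptoolbox.
def policy_agreement(policy_a, policy_b):
    """Fraction of states on which two deterministic policies pick the same action."""
    return sum(a == b for a, b in zip(policy_a, policy_b)) / len(policy_a)

# Example: print('PI/VI agreement:', policy_agreement(pi.policy, vi.policy))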
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]

# Build the FrozenLake transition and reward matrices from the Gym environment.
ex = OpenAI_MDPToolbox('FrozenLake-v0', False)
P = ex.P
R = ex.R

# Value iteration on FrozenLake: sweep over discount factors.
results = []
for d in disc:
    vi = ValueIteration(P, R, d, epsilon=0.001, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(disc, results, 'Value Iteration Discount/Rewards FrozenLake',
             'value_iteration_discount_rewards_frozenlake', 'Discount')

# Sweep over epsilon (convergence threshold) at a fixed discount of 0.9.
results = []
for e in ep:
    vi = ValueIteration(P, R, 0.9, epsilon=e, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(ep, results, 'Value Iteration Epsilon/Rewards FrozenLake',
             'value_iteration_epsilon_rewards_frozenlake', 'Epsilon')

print('----------------Best VI FrozenLake---------------')
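# One way to pick the "best" run out of a sweep like the ones above: keep
# the solved instance whose value function has the highest mean. A minimal
# sketch; the selection criterion (mean of V) is an assumption, since the
# banner above does not say how "best" is defined.
import numpy as np

best = max(results, key=lambda res: np.mean(res.V))
print('best VI mean value:', np.mean(best.V))
print('best VI policy:', best.policy)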
# Policy iteration on the forest MDP: sweep over discount factors.
results = []
for d in disc:
    pi = PolicyIteration(
    # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        d,  # discount
        max_iter=1000,
    )
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(
    disc, results,
    'Policy Iteration Discount/Rewards Forest',
    'policy_iteration_discount_rewards_forest',
    'Discount'
)

# Sweep over epsilon at a fixed discount of 0.9 (as with FrozenLake, epsilon
# only affects the commented-out PolicyIterationModified variant).
results = []
for e in ep:
    pi = PolicyIteration(
    # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        0.9,  # discount
        # epsilon=e,
        max_iter=1000,
    )
    pi.epsilon = e
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(ep, results, 'Policy Iteration Epsilon/Rewards Forest',
             'policy_iteration_epsilon_rewards_forest', 'Epsilon')
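# The forest dynamics are easier to see at the default tiny size before
# scaling to S=500: P has shape (A, S, S) with A=2 (action 0 = wait,
# action 1 = cut) and R has shape (S, A). A minimal inspection sketch.
import mdptoolbox.example

P_small, R_small = mdptoolbox.example.forest(S=3, r1=4, r2=2, p=0.1)
print('P shape:', P_small.shape)   # (2, 3, 3)
print('R shape:', R_small.shape)   # (3, 2)
print('wait dynamics:\n', P_small[0])  # age advances unless a fire resets to 0
print('cut dynamics:\n', P_small[1])   # cutting always returns to state 0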
disc = [0.1, 0.3, 0.5, 0.7, 0.9]
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]

# Forest management MDP: r1 is the reward for waiting in the oldest state,
# r2 the reward for cutting in the oldest state, p the yearly fire probability.
P, R = mdptoolbox.example.forest(S=500, r1=100, r2=2, p=0.1, is_sparse=False)

# Value iteration on the forest MDP: sweep over discount factors.
results = []
for d in disc:
    vi = ValueIteration(P, R, d, epsilon=0.001, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(disc, results, 'Value Iteration Discount/Rewards Forest',
             'value_iteration_discount_rewards_forest', 'Discount')

# Sweep over epsilon (convergence threshold) at a fixed discount of 0.9.
results = []
for e in ep:
    vi = ValueIteration(P, R, 0.9, epsilon=e, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(ep, results, 'Value Iteration Epsilon/Rewards Forest',
             'value_iteration_epsilon_rewards_forest', 'Epsilon')

print('----------------Best VI Forest---------------')
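# plot_rewards is a project-level helper rather than mdptoolbox API; a
# minimal sketch of what the calls above assume it does: plot one summary
# statistic of each solved run's value function against the swept parameter
# and save the figure under the given file name. The signature and the use
# of max(V) as the reward summary are assumptions.
import matplotlib.pyplot as plt


def plot_rewards(x_values, results, title, filename, xlabel):
    rewards = [max(res.V) for res in results]  # best state value per run
    plt.figure()
    plt.plot(x_values, rewards, marker='o')
    plt.xlabel(xlabel)
    plt.ylabel('Reward (max V)')
    plt.title(title)
    plt.grid(True)
    plt.savefig(filename + '.png')
    plt.close()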