def test_expected_sarsa(env):
    # obtain the estimated optimal policy and corresponding action-value function
    Q_expsarsa = TD.expected_sarsa(env, 500, .2)
    # print the estimated optimal policy
    policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1
                                for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_expsarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_expsarsa)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0
                 for key in np.arange(48)])
def test_q_learning(env):
    # visualize the estimated optimal policy and the corresponding state-value function
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsamax = TD.q_learning(env, 500, .2)
    # print the estimated optimal policy
    policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1
                                for key in np.arange(48)]).reshape((4, 12))
    check_test.run_check('td_control_check', policy_sarsamax)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsamax)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0
                 for key in np.arange(48)])
def test_sarsa():
    # obtain the estimated optimal policy and corresponding action-value function
    # (note: this relies on a module-level `env`)
    # eps=.1 finds the safe path, eps=.01 finds the optimal path
    Q_sarsa = sarsa(env, 5000, .01, eps=0.01)
    # print the estimated optimal policy
    policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1
                             for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_sarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsa)
    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
def evaluate_sarsa():
    env = gym.make("CliffWalking-v0")
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsa = sarsa(env, 5000, 0.01)
    # print the estimated optimal policy
    policy_sarsa = np.array(
        [np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]
    ).reshape(4, 12)
    check_test.run_check("td_control_check", policy_sarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsa)
    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
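
# The drivers above assume a `plot_values` helper that renders a flat 48-entry
# state-value list on the 4x12 CliffWalking grid.  A minimal sketch of one possible
# implementation (the function name comes from the calls above; the plotting details
# below are assumptions, not the notebook's reference helper):
import numpy as np
import matplotlib.pyplot as plt

def plot_values(V):
    # reshape the flat 48-entry value list into the 4x12 grid
    V_sq = np.reshape(V, (4, 12))
    # draw the grid as a heatmap with one annotated value per state
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.imshow(V_sq, cmap='cool')
    for (j, i), label in np.ndenumerate(V_sq):
        ax.text(i, j, np.round(label, 3), ha='center', va='center', fontsize=12)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title('State-Value Function')
    plt.show()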
print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") sys.stdout.flush() ## TODO: complete the function state = env.reset() while True: probabilities = epsilon_greedy(state, Q, epsilon, env.action_space.n) action = get_action(probabilities) next_state, reward, done, info = env.step(action) Qmax = np.max(Q[next_state]) Q[state][action] = Q[state][action] + alpha * ( reward + gamma * Qmax - Q[state][action]) state = next_state if done: break return Q # obtain the estimated optimal policy and corresponding action-value function Q_sarsamax = q_learning(env, 5000, .01) # print the estimated optimal policy policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4,12)) check_test.run_check('td_control_check', policy_sarsamax) print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") print(policy_sarsamax) # plot the estimated optimal state-value function plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])
                    Vs += action_prob * prob * (reward + gamma * V[next_state])
            delta = max(delta, np.abs(V[s] - Vs))
            V[s] = Vs
        if delta < theta:
            break
    return V


random_policy = np.ones([env.nS, env.nA]) / env.nA
# evaluate the equiprobable random policy
V = policy_evaluation(env, random_policy)

import check_test
check_test.run_check('policy_evaluation_check', policy_evaluation)


def q_from_v(env, V, s, gamma=1):
    # one-step lookahead: derive action values for state s from the state-value function V
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + gamma * V[next_state])
    return q


Q = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    Q[s] = q_from_v(env, V, s)
check_test.run_check('q_from_v_check', q_from_v)
    return V


# Run the code cell below to test your implementation and visualize the estimated
# state-value function.  If the code cell returns **PASSED**, then you have implemented
# the function correctly!  Feel free to change the `num_episodes` and `alpha` parameters
# that are supplied to the function.  However, if you'd like to ensure the accuracy of
# the unit test, please do not change the value of `gamma` from the default.

# In[17]:

import check_test

# evaluate the policy and reshape the state-value function
V_pred = td_prediction(env, 5000, policy, .01)

# please do not change the code below this line
V_pred_plot = np.reshape(
    [V_pred[key] if key in V_pred else 0 for key in np.arange(48)], (4, 12))
check_test.run_check('td_prediction_check', V_pred_plot)
plot_values(V_pred_plot)


# How close is your estimated state-value function to the true state-value function
# corresponding to the policy?
#
# You might notice that some of the state values are not estimated by the agent.  This
# is because under this policy, the agent will not visit all of the states.  In the TD
# prediction algorithm, the agent can only estimate the values corresponding to states
# that are visited.

# ### Part 2: TD Control: Sarsa
#
# In this section, you will write your own implementation of the Sarsa control algorithm.
#
# Your algorithm has four arguments:
# - `env`: This is an instance of an OpenAI Gym environment.
# - `num_episodes`: This is the number of episodes that are generated through
#   agent-environment interaction.
# - `alpha`: This is the step-size parameter for the update step.
# - `gamma`: This is the discount rate.  It must be a value between 0 and 1, inclusive
#   (default value: `1`).
#
# A minimal sketch of a function with this signature is given below.
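
# One possible Sarsa implementation matching the four-argument description above.
# This is a hedged sketch, not the notebook's reference solution: the optional `eps`
# keyword mirrors the call in `test_sarsa` above, and the 1/i_episode fallback
# epsilon schedule is an assumption.
from collections import defaultdict
import numpy as np

def sarsa(env, num_episodes, alpha, gamma=1.0, eps=None):
    Q = defaultdict(lambda: np.zeros(env.nA))
    for i_episode in range(1, num_episodes + 1):
        # fixed epsilon if supplied, otherwise a GLIE-style 1/i schedule (assumption)
        epsilon = eps if eps is not None else 1.0 / i_episode
        state = env.reset()
        # choose A from S using the epsilon-greedy policy derived from Q
        probs = np.ones(env.nA) * epsilon / env.nA
        probs[np.argmax(Q[state])] += 1 - epsilon
        action = np.random.choice(np.arange(env.nA), p=probs)
        while True:
            next_state, reward, done, info = env.step(action)
            if done:
                # terminal S': the TD target reduces to the reward
                Q[state][action] += alpha * (reward - Q[state][action])
                break
            # choose A' from S' using the epsilon-greedy policy derived from Q
            probs = np.ones(env.nA) * epsilon / env.nA
            probs[np.argmax(Q[next_state])] += 1 - epsilon
            next_action = np.random.choice(np.arange(env.nA), p=probs)
            # Sarsa update: bootstrap from the action actually taken in S'
            Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action]
                                         - Q[state][action])
            state, action = next_state, next_action
    return Q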
            if done:
                tem_scores.append(score)
                break
        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tem_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(avg_scores), endpoint=False),
             np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores))
    return Q


Q_learn = Q_learning(env, 5000, .01)
sarsamax_policy = np.array([np.argmax(Q_learn[key]) if key in Q_learn else -1
                            for key in np.arange(48)]).reshape(4, 12)
check_test.run_check('td_control_check', sarsamax_policy)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(sarsamax_policy)
V_sarsamax = [np.max(Q_learn[key]) if key in Q_learn else 0 for key in np.arange(48)]
plot_values(V_sarsamax)
V_opt[0:13][2] = -np.arange(3, 15)[::-1] + 2
V_opt[3][0] = -13
# plot_values(V_opt)

next_state, reward, done, info = env.step(env.action_space.sample())
print(next_state)

if __name__ == "__main__":
    env = gym.make('CliffWalking-v0')

    # obtain the estimated optimal policy and corresponding action-value function
    Q_expsarsa = expected_sarsa(env, 10000, 0.3)

    # print the estimated optimal policy
    policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1
                                for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_expsarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_expsarsa)

    # plot the estimated optimal state-value function
    # plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)])
    # plot_values(V_opt)
def q_learning(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode, observe S
        state = env.reset()
        while True:
            # get epsilon-greedy action probabilities
            policy_s = epsilon_greedy_probs(env, Q[state], i_episode)
            # pick next action A
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # update Q: bootstrap from the greedy action in S'
            Q[state][action] = update_Q(Q[state][action], np.max(Q[next_state]),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if (i_episode % plot_every == 0):
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False),
             np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q


# obtain the estimated optimal policy and corresponding action-value function
Q_sarsamax = q_learning(env, 5000, .01)
# print the estimated optimal policy
policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1
                            for key in np.arange(48)]).reshape((4, 12))
check_test.run_check('td_control_check', policy_sarsamax)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsamax)
# plot the estimated optimal state-value function
plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0
             for key in np.arange(48)])


# Part 3: TD Control: Expected Sarsa
#
# Input: policy π, positive integer num_episodes, small positive fraction α, GLIE {ε_i}
# Output: value function Q (≈ q_π if num_episodes is large enough)
#
# Initialize Q arbitrarily (e.g., Q(s, a) = 0 for all s ∈ S and a ∈ A(s),
#                           and Q(terminal-state, ·) = 0)
# for i ← 1 to num_episodes do
#     ε ← ε_i
#     Observe S_0
#     t ← 0
#     repeat
#         Choose action A_t using policy derived from Q (e.g., ε-greedy)
#         Take action A_t and observe R_{t+1}, S_{t+1}
#         Q(S_t, A_t) ← Q(S_t, A_t) + α (R_{t+1} + γ Σ_a π(a|S_{t+1}) Q(S_{t+1}, a) − Q(S_t, A_t))
#         t ← t + 1
#     until S_t is terminal;
# end
# return Q
def expected_sarsa(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode
        state = env.reset()
        # get epsilon-greedy action probabilities
        policy_s = epsilon_greedy_probs(env, Q[state], i_episode, 0.005)
        while True:
            # pick next action
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # get epsilon-greedy action probabilities (for S')
            policy_s = epsilon_greedy_probs(env, Q[next_state], i_episode, 0.005)
            # update Q: bootstrap from the expected value of Q[next_state] under the policy
            Q[state][action] = update_Q(Q[state][action], np.dot(Q[next_state], policy_s),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if (i_episode % plot_every == 0):
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False),
             np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q
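
# Both q_learning and expected_sarsa above rely on two helpers that are not defined
# in this section.  A minimal sketch of what they might look like, inferred from the
# call sites (the names and argument orders come from the calls above; the epsilon
# schedule inside epsilon_greedy_probs is an assumption):
def epsilon_greedy_probs(env, Q_s, i_episode, eps=None):
    # epsilon-greedy action probabilities for a single state's action values Q_s
    epsilon = eps if eps is not None else 1.0 / i_episode
    policy_s = np.ones(env.nA) * epsilon / env.nA
    policy_s[np.argmax(Q_s)] += 1 - epsilon
    return policy_s

def update_Q(Qsa, Qsa_next, reward, alpha, gamma):
    # one-step TD update toward the target R + gamma * Qsa_next
    return Qsa + alpha * (reward + gamma * Qsa_next - Qsa)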