def test_mc_control(env):
    # obtain the estimated optimal policy and action-value function
    policy, Q = MC_control.mc_control(env, 500000, 0.02)
    # obtain the corresponding state-value function
    V = dict((k, np.max(v)) for k, v in Q.items())
    # plot the state-value function
    plot_blackjack_values(V)
    # plot the policy
    plot_policy(policy)
from collections import defaultdict

import gym
import numpy as np

from plot_utils import plot_blackjack_values, plot_policy


def mc_control(env, num_episodes, alpha, gamma=1.0, eps=0.8, min_eps=0.05):
    nA = env.action_space.n
    # initialize the action-value function estimate and the (initially empty) greedy policy
    Q = defaultdict(lambda: np.zeros(nA))
    policy = {}
    for i_episode in range(1, num_episodes + 1):
        # decay epsilon (down to min_eps) and report progress every 1000 episodes
        if i_episode % 1000 == 0:
            eps *= 0.98
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
        # generate an episode by acting epsilon-greedily with respect to the current policy
        episode = episode_eps_greedy(env, max(eps, min_eps), policy)
        states, actions, rewards = zip(*episode)
        # constant-alpha update toward the discounted return that follows each time step
        for i in range(len(episode)):
            G = sum(rewards[i + k] * (gamma ** k) for k in range(len(rewards) - i))
            Q[states[i]][actions[i]] = (1 - alpha) * Q[states[i]][actions[i]] + alpha * G
        # make the policy greedy with respect to the updated action-value function
        for s in Q:
            policy[s] = np.argmax(Q[s])
    return policy, Q


env = gym.make('Blackjack-v0')
print(env.observation_space)
print(env.action_space)

# obtain the estimated optimal policy and action-value function
policy, Q = mc_control(env, 200000, 0.05)

# obtain the corresponding state-value function, then plot it along with the policy
V = dict((k, np.max(v)) for k, v in Q.items())
plot_blackjack_values(V)
plot_policy(policy)
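# The helper `episode_eps_greedy` is called above but not defined in this section. A minimal
# sketch of such a helper, assuming it plays one episode in `env` by choosing actions
# epsilon-greedily with respect to the current `policy` and returns a list of
# (state, action, reward) tuples, could look like the following.

def episode_eps_greedy(env, eps, policy):
    """Generate one episode, acting epsilon-greedily with respect to `policy` (assumed helper)."""
    episode = []
    state = env.reset()
    while True:
        # exploit the greedy action when the state has been seen and the coin flip allows it;
        # otherwise explore with a uniformly random action
        if state in policy and np.random.random() > eps:
            action = policy[state]
        else:
            action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            return episode

# Storing the episode as (state, action, reward) tuples is what allows `zip(*episode)` above
# to unpack it into the parallel `states`, `actions`, and `rewards` sequences.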
# obtain the state-value function
V_glie = dict((k, np.max(v)) for k, v in Q_glie.items())

# plot the state-value function
plot_blackjack_values(V_glie)


# Finally, we visualize the policy that is estimated to be optimal.

from plot_utils import plot_policy

# plot the policy
plot_policy(policy_glie)


# The **true** optimal policy $\pi_*$ can be found on page 82 of the [textbook](http://go.udacity.com/rl-textbook) (and appears below). Compare your final estimate to the optimal policy - how close are you able to get? If you are not happy with the performance of your algorithm, take the time to tweak the decay rate of $\epsilon$ and/or run the algorithm for more episodes to attain better results.
#
# ![True Optimal Policy](images/optimal.png)

# ### Part 4: MC Control: Constant-$\alpha$
#
# In this section, you will write your own implementation of constant-$\alpha$ MC control.
#
# Your algorithm has four arguments:
# - `env`: This is an instance of an OpenAI Gym environment.
# - `num_episodes`: This is the number of episodes that are generated through agent-environment interaction.
# - `alpha`: This is the step-size parameter for the update step (the update rule is written out after this list).
# - `gamma`: This is the discount rate. It must be a value between 0 and 1, inclusive (default value: `1`).
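# As a point of reference for the `alpha` argument: the constant-$\alpha$ update applied to each
# state-action pair visited in an episode replaces the sample-average update used in GLIE MC control
# with a fixed step size,
#
# $$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \big( G_t - Q(S_t, A_t) \big),$$
#
# where $G_t$ is the discounted return that follows time step $t$.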