Example #1
    def test_mc_control(env):
        # obtain the estimated optimal policy and action-value function
        policy, Q = MC_control.mc_control(env, 500000, 0.02)
        # obtain the corresponding state-value function
        V = dict((k, np.max(v)) for k, v in Q.items())

        # plot the state-value function
        plot_blackjack_values(V)
        # plot the policy
        plot_policy(policy)
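The test above is not self-contained; judging by the names it references, it presumably relies on imports along these lines (module names inferred from the call sites, so treat them as assumptions):

    import numpy as np
    import MC_control
    from plot_utils import plot_blackjack_values, plot_policy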
Example #2
import gym
import numpy as np
from collections import defaultdict


def mc_control(env, num_episodes, alpha, gamma=1.0, eps=0.8, min_eps=0.05):
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    policy = {}
    for i_episode in range(1, num_episodes + 1):
        if i_episode % 1000 == 0:
            # decay epsilon every 1000 episodes and report progress
            eps *= 0.98
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
        # generate one episode by following an epsilon-greedy policy
        episode = episode_eps_greedy(env, max(eps, min_eps), policy)
        states, actions, rewards = zip(*episode)
        # work backward through the episode so that G is the discounted return
        # that follows each visited state-action pair
        G = 0
        for i in reversed(range(len(episode))):
            G = rewards[i] + gamma * G
            # constant-alpha update toward the sampled return
            Q[states[i]][actions[i]] += alpha * (G - Q[states[i]][actions[i]])
        # make the policy greedy with respect to the updated Q
        for s in Q:
            policy[s] = np.argmax(Q[s])
    return policy, Q


# create the Blackjack environment and inspect its spaces;
# each state is a tuple: (player's current sum, dealer's face-up card, usable ace)
env = gym.make('Blackjack-v0')
print(env.observation_space)
print(env.action_space)

# obtain the estimated optimal policy and action-value function
policy, Q = mc_control(env, 200000, 0.05)

# obtain the corresponding state-value function
V = dict((k, np.max(v)) for k, v in Q.items())

# plot the state-value function
plot_blackjack_values(V)
# plot the policy
plot_policy(policy)
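
# The helper `episode_eps_greedy` used above is not defined in this snippet.
# A minimal sketch of what it could look like is given below (an assumption,
# not the original implementation): it returns one episode as a list of
# (state, action, reward) tuples under an epsilon-greedy policy, using the
# pre-0.26 Gym step API that Blackjack-v0 expects.

def episode_eps_greedy(env, eps, policy):
    # play one episode, acting epsilon-greedily with respect to `policy`
    episode = []
    state = env.reset()
    while True:
        # exploit the greedy action when the state is known and we do not explore
        if state in policy and np.random.random() > eps:
            action = policy[state]
        else:
            action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            return episode
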
# obtain the state-value function
V_glie = dict((k, np.max(v)) for k, v in Q_glie.items())

# plot the state-value function
plot_blackjack_values(V_glie)


# Finally, we visualize the policy that is estimated to be optimal.

# In[33]:


from plot_utils import plot_policy

# plot the policy
plot_policy(policy_glie)


# The **true** optimal policy $\pi_*$ can be found on page 82 of the [textbook](http://go.udacity.com/rl-textbook) (and appears below).  Compare your final estimate to the optimal policy - how close are you able to get?  If you are not happy with the performance of your algorithm, take the time to tweak the decay rate of $\epsilon$ and/or run the algorithm for more episodes to attain better results.
# 
# ![True Optimal Policy](images/optimal.png)
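
# One (hypothetical) way to experiment with the decay rate is to compute
# epsilon from the episode index with an explicit decay parameter, rather than
# hard-coding the multiplicative factor, e.g.:

# In[ ]:


def epsilon_schedule(i_episode, eps_start=1.0, eps_decay=0.9999, eps_min=0.05):
    # geometric decay of epsilon with the episode index, clipped at eps_min
    return max(eps_start * (eps_decay ** i_episode), eps_min)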

# ### Part 4: MC Control: Constant-$\alpha$
# 
# In this section, you will write your own implementation of constant-$\alpha$ MC control.  
# 
# Your algorithm has four arguments:
# - `env`: This is an instance of an OpenAI Gym environment.
# - `num_episodes`: This is the number of episodes that are generated through agent-environment interaction.
# - `alpha`: This is the step-size parameter for the update step.
# - `gamma`: This is the discount rate.  It must be a value between 0 and 1, inclusive (default value: `1`).