def test_expected_sarsa(env):
    # obtain the estimated optimal policy and corresponding action-value function
    Q_expsarsa = TD.expected_sarsa(env, 500, .2)
    # print the estimated optimal policy
    policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1 for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_expsarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_expsarsa)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)])


def test_q_learning(env):
    # visualize the estimated optimal policy and the corresponding state-value function
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsamax = TD.q_learning(env, 500, .2)

    # print the estimated optimal policy
    policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4, 12))
    check_test.run_check('td_control_check', policy_sarsamax)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsamax)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])

def test_sarsa(env):
    # obtain the estimated optimal policy and corresponding action-value function
    # with eps=.1 Sarsa tends to learn the safe path; with eps=.01 it tends to find the optimal path
    Q_sarsa = sarsa(env, 5000, .01, eps=0.01)

    # print the estimated optimal policy
    policy_sarsa = np.array([np.argmax(
        Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_sarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsa)

    # plot the estimated optimal state-value function
    V_sarsa = (
        [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)])
    plot_values(V_sarsa)
def evaluate_sarsa():
    env = gym.make("CliffWalking-v0")
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsa = sarsa(env, 5000, 0.01)

    # print the estimated optimal policy
    policy_sarsa = np.array(
        [np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]
    ).reshape(4, 12)
    check_test.run_check("td_control_check", policy_sarsa)
    print(
        "\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):"
    )
    print(policy_sarsa)

    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
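

# `plot_values` is used throughout this file but comes from the course's plotting helper
# and is not included in this excerpt.  A rough stand-in under that assumption (the 4x12
# CliffWalking layout and the labeled-heatmap rendering are assumptions, not the original
# helper):

import matplotlib.pyplot as plt


def plot_values_sketch(V):
    # reshape the 48 state values into the 4x12 grid and show them as a labeled heatmap
    grid = np.reshape(V, (4, 12))
    fig, ax = plt.subplots(figsize=(12, 4))
    im = ax.imshow(grid, cmap='coolwarm')
    for (row, col), value in np.ndenumerate(grid):
        ax.text(col, row, '{:.1f}'.format(value), ha='center', va='center')
    ax.set_title('State-Value Function')
    fig.colorbar(im, ax=ax)
    plt.show()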

# Example #5
def q_learning(env, num_episodes, alpha, gamma=1.0, epsilon=0.1):
    # NOTE: the function header, Q initialization, and episode loop were missing from this
    # excerpt and are reconstructed here; the epsilon default is an assumption.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # begin an episode, observe S
        state = env.reset()
        while True:
            # choose action A from an epsilon-greedy policy derived from Q
            probabilities = epsilon_greedy(state, Q, epsilon, env.action_space.n)
            action = get_action(probabilities)

            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # Q-learning update: bootstrap with the greedy value of S'
            Qmax = np.max(Q[next_state])
            Q[state][action] = Q[state][action] + alpha * (
                reward + gamma * Qmax - Q[state][action])
            state = next_state
            if done:
                break

    return Q
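

# The `q_learning` reconstruction above calls `epsilon_greedy` and `get_action`, which are
# not defined in this excerpt.  A minimal sketch consistent with the call sites (the
# signatures are inferred from the calls, not taken from the original helpers):

def epsilon_greedy(state, Q, epsilon, nA):
    # return epsilon-greedy action probabilities for the given state
    probabilities = np.ones(nA) * epsilon / nA
    probabilities[np.argmax(Q[state])] += 1.0 - epsilon
    return probabilities


def get_action(probabilities):
    # sample an action index according to the given probabilities
    return np.random.choice(np.arange(len(probabilities)), p=probabilities)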

# obtain the estimated optimal policy and corresponding action-value function
Q_sarsamax = q_learning(env, 5000, .01)

# print the estimated optimal policy
policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4,12))
check_test.run_check('td_control_check', policy_sarsamax)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsamax)

# plot the estimated optimal state-value function
plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])
def policy_evaluation(env, policy, gamma=1, theta=1e-8):
    # NOTE: only the innermost update survived in this excerpt; the header and sweep
    # loops are reconstructed here as standard iterative policy evaluation.
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            Vs = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    Vs += action_prob * prob * (reward + gamma * V[next_state])
            delta = max(delta, np.abs(V[s] - Vs))
            V[s] = Vs
        if delta < theta:
            break
    return V


random_policy = np.ones([env.nS, env.nA]) / env.nA

# evaluate the policy
V = policy_evaluation(env, random_policy)

import check_test

check_test.run_check('policy_evaluation_check', policy_evaluation)


def q_from_v(env, V, s, gamma=1):
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + gamma * V[next_state])
    return q


Q = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    Q[s] = q_from_v(env, V, s)

check_test.run_check('q_from_v_check', q_from_v)

# Example #7
    return V


# Run the code cell below to test your implementation and visualize the estimated state-value function.  If the code cell returns **PASSED**, then you have implemented the function correctly!  Feel free to change the `num_episodes` and `alpha` parameters that are supplied to the function.  However, if you'd like to ensure the accuracy of the unit test, please do not change the value of `gamma` from the default.

# In[17]:

import check_test

# evaluate the policy and reshape the state-value function
V_pred = td_prediction(env, 5000, policy, .01)

# please do not change the code below this line
V_pred_plot = np.reshape(
    [V_pred[key] if key in V_pred else 0 for key in np.arange(48)], (4, 12))
check_test.run_check('td_prediction_check', V_pred_plot)
plot_values(V_pred_plot)

# How close is your estimated state-value function to the true state-value function corresponding to the policy?
#
# You might notice that some of the state values are not estimated by the agent.  This is because under this policy, the agent will not visit all of the states.  In the TD prediction algorithm, the agent can only estimate the values corresponding to states that are visited.
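#
# The `td_prediction` function called above is not included in this excerpt.  A minimal
# TD(0) prediction sketch under the signature implied by the call above, assuming `policy`
# is an array mapping each of the 48 states to an action (this is a sketch, not the
# notebook's implementation):

from collections import defaultdict


def td_prediction_sketch(env, num_episodes, policy, alpha, gamma=1.0):
    # estimate the state-value function of `policy` with one-step TD updates
    V = defaultdict(float)
    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        while True:
            action = policy[state]
            next_state, reward, done, info = env.step(action)
            # TD(0) update: V(S) <- V(S) + alpha * (R + gamma * V(S') - V(S))
            V[state] = V[state] + alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
            if done:
                break
    return V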

# ### Part 2: TD Control: Sarsa
#
# In this section, you will write your own implementation of the Sarsa control algorithm; a minimal reference sketch follows the argument list below.
#
# Your algorithm has four arguments:
# - `env`: This is an instance of an OpenAI Gym environment.
# - `num_episodes`: This is the number of episodes that are generated through agent-environment interaction.
# - `alpha`: This is the step-size parameter for the update step.
# - `gamma`: This is the discount rate.  It must be a value between 0 and 1, inclusive (default value: `1`).
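#
# The `sarsa` function exercised by the test code near the top of this file is not defined
# in this excerpt.  A minimal Sarsa(0) sketch consistent with the four arguments listed
# above; the extra `eps` keyword seen in the earlier call is assumed to be a fixed
# exploration rate (this is a sketch, not the notebook's implementation):

from collections import defaultdict


def sarsa_sketch(env, num_episodes, alpha, gamma=1.0, eps=0.1):
    # estimate the optimal action-value function with on-policy TD control (Sarsa)
    Q = defaultdict(lambda: np.zeros(env.nA))

    def eps_greedy_action(state):
        # with probability eps explore, otherwise exploit the current Q estimate
        if np.random.random() < eps:
            return np.random.randint(env.nA)
        return int(np.argmax(Q[state]))

    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        action = eps_greedy_action(state)
        while True:
            next_state, reward, done, info = env.step(action)
            if done:
                # terminal transition: the bootstrap target is just the reward
                Q[state][action] += alpha * (reward - Q[state][action])
                break
            next_action = eps_greedy_action(next_state)
            # Sarsa update uses the action actually selected in S'
            Q[state][action] += alpha * (
                reward + gamma * Q[next_state][next_action] - Q[state][action])
            state, action = next_state, next_action
    return Q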
            if done:
                tem_scores.append(score)
                break
        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tem_scores))
    plt.plot(np.linspace(0, num_episodes, len(avg_scores), endpoint=False),
             np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every),
          np.max(avg_scores))

    return Q


Q_learn = Q_learning(env, 5000, .01)
sarsamax_policy = np.array([
    np.argmax(Q_learn[key]) if key in Q_learn else -1 for key in np.arange(48)
]).reshape(4, 12)
check_test.run_check('td_control_check', sarsamax_policy)
print(
    "\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):"
)
print(sarsamax_policy)

V_sarsamax = ([
    np.max(Q_learn[key]) if key in Q_learn else 0 for key in np.arange(48)
])
plot_values(V_sarsamax)

# Example #9
    V_opt[0:13][2] = -np.arange(3, 15)[::-1] + 2
    V_opt[3][0] = -13

    # plot_values(V_opt)

    next_state, reward, done, info = env.step(env.action_space.sample())
    print(next_state)


if __name__ == "__main__":
    env = gym.make('CliffWalking-v0')

    # obtain the estimated optimal policy and corresponding action-value function
    Q_expsarsa = expected_sarsa(env, 10000, 0.3)

    # print the estimated optimal policy
    policy_expsarsa = np.array([
        np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1
        for key in np.arange(48)
    ]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_expsarsa)
    print(
        "\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):"
    )
    print(policy_expsarsa)

    # plot the estimated optimal state-value function

    # plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)])
    # plot_values(V_opt)
def q_learning(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode, observe S
        state = env.reset()
        while True:
            # get epsilon-greedy action probabilities
            policy_s = epsilon_greedy_probs(env, Q[state], i_episode)
            # pick next action A
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # update Q (Sarsamax: bootstrap with the max over Q(S', a))
            Q[state][action] = update_Q(Q[state][action], np.max(Q[next_state]),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False), np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q
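

# The `q_learning` and `expected_sarsa` implementations in this example rely on
# `epsilon_greedy_probs` and `update_Q`, which are not shown in this excerpt.  A sketch
# consistent with how they are called; the exact epsilon schedule (1/i_episode unless a
# fixed value is supplied) is an assumption:

def epsilon_greedy_probs(env, Q_s, i_episode, eps=None):
    # epsilon-greedy action probabilities for one state; epsilon decays as 1/i_episode
    # unless a fixed value is supplied
    epsilon = 1.0 / i_episode if eps is None else eps
    policy_s = np.ones(env.nA) * epsilon / env.nA
    policy_s[np.argmax(Q_s)] += 1 - epsilon
    return policy_s


def update_Q(Qsa, Qsa_next, reward, alpha, gamma):
    # one-step TD update of Q(S, A) toward the target R + gamma * Qsa_next
    return Qsa + alpha * (reward + gamma * Qsa_next - Qsa)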


# obtain the estimated optimal policy and corresponding action-value function
Q_sarsamax = q_learning(env, 5000, .01)

# print the estimated optimal policy
policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4, 12))
check_test.run_check('td_control_check', policy_sarsamax)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsamax)

# plot the estimated optimal state-value function
plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])

# Part 3: TD Control: Expected Sarsa

# Input: policy π, positive integer num_episodes, small positive fraction α, GLIE {ε_i}
# Output: value function Q (≈ q_π if num_episodes is large enough)
# Initialize Q arbitrarily (e.g., Q(s, a) = 0 for all s ∈ S and a ∈ A(s), and Q(terminal-state, ·) = 0)
# for i ← 1 to num_episodes do
#     ε ← ε_i
#     Observe S_0
#     t ← 0
#     repeat
#         Choose action A_t using the policy derived from Q (e.g., ε-greedy)
#         Take action A_t and observe R_{t+1}, S_{t+1}
#         Q(S_t, A_t) ← Q(S_t, A_t) + α * (R_{t+1} + γ * Σ_a π(a|S_{t+1}) Q(S_{t+1}, a) − Q(S_t, A_t))
#         t ← t + 1
#     until S_t is terminal
# end
# return Q

def expected_sarsa(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode
        state = env.reset()
        # get epsilon-greedy action probabilities
        policy_s = epsilon_greedy_probs(env, Q[state], i_episode, 0.005)
        while True:
            # pick next action
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # get epsilon-greedy action probabilities (for S')
            policy_s = epsilon_greedy_probs(env, Q[next_state], i_episode, 0.005)
            # update Q using the expected value of Q(S', .) under the epsilon-greedy policy
            Q[state][action] = update_Q(Q[state][action], np.dot(Q[next_state], policy_s),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False), np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q