# Dump the policy array itself.
print("Policy Probability Distribution:")
print(policy)
print("")

# Reduce the policy to the index of its largest entry along axis 1 and lay
# those indices out in the environment's grid shape. The grid is built once
# and shared by the numeric print and the print_policy call.
greedy_grid = np.reshape(np.argmax(policy, axis=1), env.shape)
print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
print(greedy_grid)
print_policy(greedy_grid)
# Reference arrow layout for this policy (presumably what print_policy
# renders; shown four per row on the assumption of a 4x4 grid -- TODO confirm):
#   ^ < < v
#   ^ ^ ^ v
#   ^ ^ > v
#   ^ > > ^
print("")

# Value estimates: first as given, then arranged in the grid shape.
print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(env.shape))
print("")

# Test the value function: it must match the expected values to 2 decimals.
expected_v = np.array([
    0, -1, -2, -3,
    -1, -2, -3, -2,
    -2, -3, -2, -1,
    -3, -2, -1, 0,
])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)

env._render()
# NOTE(review): the statements down to `return V_sorted` look like the tail of
# mc_policy_evaluation_random_policy(env, num_episodes), the function called
# in the __main__ block below. Its `def` line and the setup of `V` and
# `returns` are not visible here; `returns[S]` is appended to and `V[S]` is
# assigned without any prior membership check, which suggests defaultdict-like
# containers -- TODO confirm.
for i in range(num_episodes):
    # Steps of the current episode, each stored as [state, action, reward].
    episodes = []
    init_state = choice(list(set(
        env.P.keys())))  # draw a random state to start
    # Generate an episode. If the drawn start state is already terminal, the
    # loop body never runs and this episode records nothing.
    while not env.is_terminal(init_state):
        action = choice(list(env.P[init_state].keys(
        )))  # random policy: draw one of this state's actions at random
        # Only the first entry listed for (state, action) is read: its index 1
        # is used as the next state and its index 2 as the reward.
        # NOTE(review): presumably entries are (prob, next_state, reward, ...)
        # tuples and transitions are deterministic; any further listed
        # outcomes would be ignored -- confirm against the environment.
        next_state = env.P[init_state][action][0][1]
        reward = env.P[init_state][action][0][2]
        episodes.append([init_state, action, reward])
        init_state = next_state
    # Walk the episode backwards, accumulating the return G from its end.
    G = 0
    states_seen = set()
    for S, A, R in reversed(episodes):
        G = 1.0 * G + R  # assuming discount factor is 1.0
        # Because the scan runs in reverse, the first time S passes this check
        # is its LAST occurrence in the episode, so the G recorded is the
        # return following the last visit to S.
        # NOTE(review): if first-visit Monte Carlo is intended, the recorded
        # value should be the return from the earliest occurrence -- verify.
        if S not in states_seen:
            states_seen.add(S)
            returns[S].append(G)
            # Estimate for S is the mean of every return recorded for it so far.
            V[S] = np.mean(returns[S])
# The result is a list of (state, value) pairs ordered by state, not a mapping.
V_sorted = sorted(V.items(), key=lambda x: x[0])  # sort by state
return V_sorted


if __name__ == '__main__':
    # Demo run: build the environment with shape argument (9, 9), print its
    # P table, render it, then evaluate with 5000 episodes and print the
    # resulting list of (state, value) pairs.
    env = GridworldEnv((9, 9))
    print(env.P)
    env._render(mode="human")
    V = mc_policy_evaluation_random_policy(env, 5000)
    print(V)