print("Policy Probability Distribution:")
print(policy)
print("")

print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
grid_policy = np.reshape(np.argmax(policy, axis=1), env.shape)
print(grid_policy)
print_policy(grid_policy)
"""
 ^ < < v
 ^ ^ ^ v
 ^ ^ > v
 ^ > > ^
"""
print("")

print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(env.shape))
print("")

# Test the value function
expected_v = np.array(
    [0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)
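# expected_v is the known optimal value function for the classic 4x4
# gridworld: every transition yields reward -1, so each state's value is
# minus its step distance to the nearest terminal corner.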

env._render()
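
# Note: print_policy is called above but not defined in this snippet. A
# minimal sketch of what such a helper might look like, assuming the
# action encoding 0=up, 1=right, 2=down, 3=left used in the printout:
def print_policy(grid_policy):
    arrows = {0: "^", 1: ">", 2: "v", 3: "<"}
    for row in grid_policy:
        print(" " + " ".join(arrows[int(a)] for a in row))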
Example No. 2

from random import choice
from collections import defaultdict

import numpy as np


def mc_policy_evaluation_random_policy(env, num_episodes):
    """Monte Carlo evaluation of a uniform random policy on a gridworld.

    Records one return per state per episode and estimates each state's
    value as the mean of its recorded returns.
    """
    # GridworldEnv and its transition table env.P are defined elsewhere
    # in this project.
    V = defaultdict(float)       # state -> current value estimate
    returns = defaultdict(list)  # state -> returns observed so far
    for i in range(num_episodes):
        episode = []  # (state, action, reward) transitions for this rollout
        # draw a random state to start the episode
        init_state = choice(list(env.P.keys()))
        # generate an episode
        while not env.is_terminal(init_state):
            # uniform random policy: draw an action uniformly at random
            action = choice(list(env.P[init_state].keys()))
            # env.P[s][a] is a list of (prob, next_state, reward, done)
            # tuples; the gridworld is deterministic, so read entry 0
            next_state = env.P[init_state][action][0][1]
            reward = env.P[init_state][action][0][2]
            episode.append((init_state, action, reward))
            init_state = next_state
        # walk backwards through the episode, accumulating the return G
        G = 0
        states_seen = set()
        for S, A, R in reversed(episode):
            G = 1.0 * G + R  # G = gamma * G + R, assuming discount gamma = 1.0
            if S not in states_seen:  # one recorded return per state per episode
                states_seen.add(S)
                returns[S].append(G)
                V[S] = np.mean(returns[S])  # estimate = mean observed return
    V_sorted = sorted(V.items(), key=lambda x: x[0])  # sort by state
    return V_sorted


if __name__ == '__main__':
    env = GridworldEnv((9, 9))
    print(env.P)
    env._render(mode="human")
    V = mc_policy_evaluation_random_policy(env, 5000)
    print(V)
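
# Note: the rollout code above assumes the Gym-style transition table
# env.P[state][action] = [(prob, next_state, reward, done), ...], reading
# only the first tuple because the gridworld is deterministic. A minimal
# illustration of that assumed structure for one state:
P_example = {
    0: {                             # state 0
        0: [(1.0, 0, -1.0, False)],  # action 0: prob 1.0, stay in 0, reward -1
        1: [(1.0, 1, -1.0, False)],  # action 1: prob 1.0, move to 1, reward -1
    },
}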