(0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
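    # A fixed, deterministic policy: each non-terminal state (row, col) maps to a single action.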

    V = {}
    returns = {}  # dictionary of state -> list of returns we've received
    states = grid.all_states()
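    # Terminal states keep V(s) = 0; for all other states we collect a list of
    # sampled returns so that V(s) can be estimated as their average.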
    for s in states:
        if s in grid.actions:
            returns[s] = []
        else:
            V[s] = 0

    for t in range(100):
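        # play_game is assumed to run one episode under `policy` and return a
        # list of (state, G) pairs, where G is the (discounted) return observed
        # from each visited state.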
        states_and_returns = play_game(grid, policy)
        seen_states = set()
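        # First-visit Monte Carlo: only the first occurrence of a state in an
        # episode contributes a return, and V(s) is the average of the returns
        # collected so far.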
        for s, G in states_and_returns:
            if s not in seen_states:
                returns[s].append(G)
                V[s] = np.mean(returns[s])
                seen_states.add(s)

    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(policy, grid)
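    # Build an initial random policy: for every position with at least one
    # allowed move, pick one of its successors uniformly at random.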
    for s in states:
        allowed = actions[states.index(s)]  # allowed next positions from s
        if len(allowed) != 0:  # terminal or unreachable positions have no further action
            # Choose randomly one of the allowed next positions
            random_index = np.random.choice(np.arange(len(allowed)))
            policy_list.append(allowed[random_index])
        else:
            policy_list.append('     ')  # Terminal states have no further action

    # Create a dictionary -- keys: position, value: next position
    policy = dict(zip(states, policy_list))
    print("The initial random policy is:")
    print_policy(policy, grid)  # Print the initial policy
    print("")
    #######################################

    ### initialize the values V(s) randomly ###
    V = {}
    for s in states:
        if len(actions[states.index(s)]) != 0:
            # Non-terminal, reachable positions start with a random value
            V[s] = np.random.random()
        else:
            V[s] = 0  # Terminal and unreachable positions have value 0
    print("The values are initialized randomly, terminal and unreachable positions have value 0:")