            break

    return states, actions, rewards

blackjack = Blackjack()

# Running average of the greedy policy over all outer runs.
# Shape: (usable-ace flag, player sum, dealer showing card).
average_policy = np.zeros((2, 21, 11), dtype=float)

for k in range(1001):
    # Start each run from the same initial policy: hit everywhere,
    # except stick on the three highest player-sum indices.
    policy = np.ones((2, 21, 11))
    policy[:, 18:21, :] = 0

    # Action values: the last axis holds the two actions
    # (0 = stick, 1 = hit, matching the initial policy above).
    q_values = np.zeros((2, 21, 11, 2))

    for i in range(5000):
        initial_state = blackjack.new_state()
        states, actions, rewards = generate_episode(initial_state, policy)

        # Work backwards through the episode, accumulating the undiscounted return.
        G = 0
        for j in range(len(actions) - 1, -1, -1):
            G = G + rewards[j + 1]

            # Constant-step-size update of Q(s, a) towards the observed return G.
            sa = (states[j][0], states[j][1], states[j][2], int(actions[j]))
            q_values[sa] = 0.1 * q_values[sa] + 0.9 * G

        # Greedy policy improvement after every episode.
        policy = np.argmax(q_values, axis=-1)

    # Accumulate this run's greedy policy into the running average.
    average_policy += policy

# Average over all 1001 runs.
average_policy /= 1001
average_policy = np.round(average_policy, decimals=2)
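
# Optional sanity check (a hedged sketch, not part of the original script):
# assuming axis 0 is the usable-ace flag, axis 1 the player sum, axis 2 the
# dealer's showing card, and action 1 means "hit" (consistent with the
# (2, 21, 11) arrays and the initial policy above), this prints, for hands
# without a usable ace, the fraction of runs in which the greedy policy hit.
no_ace = average_policy[0]
for player_sum in range(20, 10, -1):
    row = " ".join(f"{no_ace[player_sum, dealer]:.2f}" for dealer in range(1, 11))
    print(f"player sum {player_sum:2d}: {row}")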