break return states, actions, rewards blackjack = Blackjack () average_policy = np.zeros ((2, 21, 11), dtype=np.float) for k in range (1001) : policy = np.ones ((2, 21, 11)) policy[:, 18:21, :] = 0 q_values = np.zeros ((2, 21, 11, 2)) for i in range (5000) : initial_state = blackjack.new_state () states, actions, rewards = generate_episode (initial_state, policy) G = 0 for j in range (len(actions)-1, -1, -1) : G = G + rewards[j+1] q_values[states[j][0], states[j][1], states[j][2], int (actions[j])] = 0.1 * q_values[states[j][0], states[j][1], states[j][2], int (actions[j])] + 0.9 * G policy = np.argmax (q_values, axis=-1) average_policy += policy average_policy /= 1001 average_policy = np.round (average_policy, decimals=2)