def generate_episode_from_Q(env: BlackjackEnv, Q, epsilon, action_count) -> List[Tuple[State, Action, Reward]]:
    """
    Generates an episode by following an epsilon-greedy policy derived from Q.
    @param env: the Blackjack environment
    @param Q: action-value table mapping each state to an array of action values
    @param epsilon: exploration rate of the epsilon-greedy policy
    @param action_count: number of available actions
    Returns a list of (state, action, reward) tuples.
    """
    episode = []
    # stores the initial state: (sum of player cards, open dealer card, has usable Ace)
    state = env.reset()
    while True:
        if state in Q:
            # choose the action with the Q table in mind
            action = np.random.choice(np.arange(action_count),
                                      p=get_probs(Q[state], epsilon, action_count))
        else:
            # if we have never visited this state before, just throw the dice
            action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode
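generate_episode_from_Q calls a get_probs helper that is not shown in this section. A minimal sketch of what it plausibly computes, assuming the usual epsilon-greedy action distribution:

import numpy as np

def get_probs(q_values, epsilon, action_count):
    # assumed implementation: every action gets a base probability of
    # epsilon / action_count, and the greedy action receives the
    # remaining 1 - epsilon probability mass
    probs = np.ones(action_count) * epsilon / action_count
    best_action = np.argmax(q_values)
    probs[best_action] += 1.0 - epsilon
    return probs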
def reset_env_with_s0(env: BlackjackEnv, s0: State) -> BlackjackEnv:
    """Resets the environment and forces it into the given start state s0."""
    env.reset()
    player_sum = s0[0]
    oppo_sum = s0[1]
    has_usable = s0[2]
    # overwrite the dealer's open card
    env.dealer[0] = oppo_sum
    if has_usable:
        # a usable ace counts as 11, the second card makes up the rest of the sum
        env.player[0] = 1
        env.player[1] = player_sum - 11
    else:
        if player_sum > 11:
            env.player[0] = 10
            env.player[1] = player_sum - 10
        else:
            env.player[0] = 2
            env.player[1] = player_sum - 2
    return env
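The type aliases used in these signatures (State, Action, Reward, ActionValue, Policy, DeterministicPolicy) are defined elsewhere; one plausible set of definitions, consistent with how they are used here and offered only as an assumption:

from typing import Callable, Dict, Tuple
import numpy as np

State = Tuple[int, int, bool]        # (player sum, dealer's open card, usable ace)
Action = int                         # 0 = stick, 1 = hit
Reward = float
ActionValue = np.ndarray             # per-action value vector for one state
Policy = Dict[State, np.ndarray]     # state -> action probabilities
DeterministicPolicy = Callable[[State], Action]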
def mc_control_exploring_starts_state(env: BlackjackEnv, s_0: State, num_episodes,
                                      discount_factor=1.0) -> Tuple[ActionValue, Policy]:
    """Monte Carlo control with exploring starts, estimating Q and the policy for the fixed start state s_0."""
    states = list(product(range(10, 22), range(1, 11), (True, False)))
    policy = {
        s: np.ones(env.action_space.n) * 1.0 / env.action_space.n for s in states
    }
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    for episode_i in range(1, num_episodes + 1):
        # force the environment into the start state s_0 (exploring start);
        # this duplicates the logic of reset_env_with_s0 above
        player_sum = s_0[0]
        oppo_sum = s_0[1]
        has_usable = s_0[2]
        env.reset()
        env.dealer[0] = oppo_sum
        if has_usable:
            env.player[0] = 1
            env.player[1] = player_sum - 11
        else:
            if player_sum > 11:
                env.player[0] = 10
                env.player[1] = player_sum - 10
            else:
                env.player[0] = 2
                env.player[1] = player_sum - 2

        episode_history = gen_custom_s0_stochastic_episode(policy, env, s_0)

        # accumulate the (undiscounted) return following the first action taken in s_0
        G = 0
        a = episode_history[0][1]
        for s_a_r in episode_history:
            G += s_a_r[2]
        returns_sum[s_0, a] += G
        returns_count[s_0, a] += 1.0
        Q[s_0][a] = returns_sum[s_0, a] / returns_count[s_0, a]

        # greedy policy improvement for s_0 (two actions: stick=0, hit=1)
        best_a = np.argmax(Q[s_0])
        policy[s_0][best_a] = 1.0
        policy[s_0][1 - best_a] = 0.0

    return Q, policy
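A short usage sketch, assuming the classic gym BlackjackEnv (gym.envs.toy_text.blackjack.BlackjackEnv) with the old four-value step API used above; the start state and episode count are illustrative:

from gym.envs.toy_text.blackjack import BlackjackEnv

env = BlackjackEnv()
s_0 = (13, 2, False)  # player sum 13, dealer showing 2, no usable ace
Q, policy = mc_control_exploring_starts_state(env, s_0, num_episodes=50000)
print(Q[s_0], policy[s_0])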
def gen_episode_data(policy: DeterministicPolicy, env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
def gen_stochastic_episode(policy: Policy, env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        # sample an action according to the policy's action probabilities for this state
        A: ActionValue = policy[state]
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
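To contrast the two generators, here is an illustrative sketch (the policies below are examples, not part of the original code): gen_episode_data takes a callable that returns an action, while gen_stochastic_episode samples from a per-state probability vector.

from collections import defaultdict
import numpy as np

env = BlackjackEnv()

# deterministic policy: hit while the player sum is below 20
hit_below_20 = lambda state: 1 if state[0] < 20 else 0

# stochastic policy: uniform probabilities over both actions in every state
uniform_policy = defaultdict(lambda: np.ones(env.action_space.n) / env.action_space.n)

print(gen_episode_data(hit_below_20, env))
print(gen_stochastic_episode(uniform_policy, env))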