Code Example #1
def generate_episode_from_Q(env: BlackjackEnv, Q, epsilon,
                            action_count) -> List[Tuple]:
    """
    Generates an episode by following an epsilon-greedy policy derived from Q.
    @param env: the Blackjack environment
    @param Q: action-value table mapping a state to an array of action values
    @param epsilon: exploration rate of the epsilon-greedy policy
    @param action_count: number of available actions
    Returns a list of (state, action, reward) tuples.
    """
    episode = []
    # store the initial state: (player card sum, dealer's showing card, has usable ace)
    state = env.reset()
    while True:
        if state in Q:
            # choose the action with the Q table in mind
            action = np.random.choice(np.arange(action_count),
                                      p=get_probs(Q[state], epsilon,
                                                  action_count))
        else:
            # if we have never visited this state before, just throw the dice
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode
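The helper get_probs called above is not shown in this excerpt. A minimal sketch, assuming it returns the epsilon-greedy action distribution over a state's Q-values (every action gets a small exploration share, the greedy action gets the rest):

import numpy as np

def get_probs(Q_s, epsilon, action_count):
    # hypothetical epsilon-greedy distribution: each action receives
    # epsilon / action_count, the greedy action gets the remaining mass
    probs = np.ones(action_count) * epsilon / action_count
    best_action = np.argmax(Q_s)
    probs[best_action] += 1.0 - epsilon
    return probs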
Code Example #2
def reset_env_with_s0(env: BlackjackEnv, s0: State) -> BlackjackEnv:
    """Resets the environment and overwrites the dealt cards so that the initial
    observation matches s0 = (player sum, dealer's showing card, usable ace)."""
    env.reset()
    player_sum = s0[0]
    oppo_sum = s0[1]   # the dealer's face-up card
    has_usable = s0[2]

    env.dealer[0] = oppo_sum
    if has_usable:
        # a usable ace counts as 11, so the second card makes up the rest of the sum
        env.player[0] = 1
        env.player[1] = player_sum - 11
    else:
        # without a usable ace, split the sum into two non-ace cards
        if player_sum > 11:
            env.player[0] = 10
            env.player[1] = player_sum - 10
        else:
            env.player[0] = 2
            env.player[1] = player_sum - 2
    return env
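A quick sanity check of this card surgery (a hypothetical usage, assuming the gym BlackjackEnv used throughout these examples), forcing the starting state (13, 2, True), i.e. a player sum of 13 with a usable ace against a dealer 2:

env = reset_env_with_s0(BlackjackEnv(), (13, 2, True))
print(env.player)     # [1, 2]: an ace counted as 11 plus a 2 gives a sum of 13
print(env.dealer[0])  # 2: the dealer's face-up card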
Code Example #3
def mc_control_exploring_starts_state(env: BlackjackEnv, s_0: State, num_episodes, discount_factor=1.0) \
        -> Tuple[ActionValue, Policy]:
    """Monte Carlo control with exploring starts, where every episode is forced
    to begin in the given state s_0 = (player sum, dealer's showing card, usable ace)."""
    states = list(product(range(10, 22), range(1, 11), (True, False)))
    # start from an equiprobable (stochastic) policy over all states
    policy = {
        s: np.ones(env.action_space.n) * 1.0 / env.action_space.n
        for s in states
    }
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    for episode_i in range(1, num_episodes + 1):
        player_sum = s_0[0]
        oppo_sum = s_0[1]   # the dealer's showing card
        has_usable = s_0[2]

        # force the freshly reset environment into the starting state s_0
        # (the same card surgery as reset_env_with_s0 above)
        env.reset()
        env.dealer[0] = oppo_sum
        if has_usable:
            env.player[0] = 1
            env.player[1] = player_sum - 11
        else:
            if player_sum > 11:
                env.player[0] = 10
                env.player[1] = player_sum - 10
            else:
                env.player[0] = 2
                env.player[1] = player_sum - 2

        episode_history = gen_custom_s0_stochastic_episode(policy, env, s_0)

        # compute the (discounted) return of the whole episode, then update the
        # estimate for s_0 and the first action a taken there
        a = episode_history[0][1]
        G = 0.0
        for s_a_r in reversed(episode_history):
            G = discount_factor * G + s_a_r[2]
        returns_sum[s_0, a] += G
        returns_count[s_0, a] += 1.0
        Q[s_0][a] = returns_sum[s_0, a] / returns_count[s_0, a]

        # greedy policy improvement at s_0 (two actions: 0 = stick, 1 = hit)
        best_a = np.argmax(Q[s_0])
        policy[s_0][best_a] = 1.0
        policy[s_0][1 - best_a] = 0.0

    return Q, policy
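The helper gen_custom_s0_stochastic_episode is not shown in this excerpt. Presumably it plays out one episode under the stochastic policy starting from the environment already forced into s_0, much like gen_stochastic_episode in Code Example #5 but without the fresh reset; a minimal sketch under that assumption (types and imports as in the other examples):

def gen_custom_s0_stochastic_episode(
        policy: Policy, env: BlackjackEnv,
        s_0: State) -> List[Tuple[State, Action, Reward]]:
    # hypothetical helper: the environment is assumed to have already been
    # set to s_0 (see reset_env_with_s0), so we act from there instead of
    # calling env.reset()
    episode_history = []
    state = s_0
    done = False
    while not done:
        A = policy[state]
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history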
Code Example #4
def gen_episode_data(policy: DeterministicPolicy,
                     env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    """Plays one episode following a deterministic policy and records a
    (state, action, reward) tuple for every step."""
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
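As a usage sketch (the stick_at_20 policy below is a made-up example, not part of the original code), any callable mapping a state to an action serves as a DeterministicPolicy here:

def stick_at_20(state: State) -> Action:
    player_sum, _, _ = state
    return 0 if player_sum >= 20 else 1   # 0 = stick, 1 = hit

episode = gen_episode_data(stick_at_20, BlackjackEnv())
print(episode)  # e.g. [((13, 10, False), 1, 0.0), ((19, 10, False), 1, -1.0)]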
Code Example #5
def gen_stochastic_episode(
        policy: Policy,
        env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    """Plays one episode following a stochastic policy given as a mapping from
    state to an array of (unnormalized) action weights."""
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        A: ActionValue = policy[state]
        # sample stick (0) or hit (1) in proportion to the policy's weights
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
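One possible way to drive it (hypothetical usage, reusing the np and defaultdict imports assumed by the earlier examples; a defaultdict avoids KeyErrors for the low starting sums the environment can deal):

uniform_policy = defaultdict(lambda: np.ones(2) / 2)  # 50/50 stick or hit everywhere
episode = gen_stochastic_episode(uniform_policy, BlackjackEnv())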