def iterate_policy(policy, env, gamma, n_iter=100):
    """
    input:
    - policy: array[states X actions] with probabilities for each action
    - env: environment
    - gamma: discount_rate
    - n_iter: max # of times to iterate policy
    output:
    - policy: optimal policy
    """
    for k in range(n_iter):
        policy_stable = True
        V = evaluate_policy(policy, env, gamma)
        for s in range(env.nS):
            old_a = get_policy_action(s, policy)
            greedy_a, _ = get_greedy_action(s, env, V, gamma)
            policy[s] = np.eye(env.nA)[greedy_a]
            if old_a != greedy_a:
                policy_stable = False

        if policy_stable:
            print(f'policy iteration stabilized after {k+1} iterations')
            break
        elif k == n_iter - 1:
            print('policy iteration never stabilized')

    return policy, V
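# iterate_policy and iterate_state_values rely on evaluate_policy and a
# (s, env, V, gamma) variant of get_greedy_action that are not shown in this
# snippet. Below is a minimal sketch of both, assuming a gym toy-text style
# environment that exposes env.nS, env.nA and a transition table env.P[s][a]
# of (prob, next_state, reward, done) tuples; these are illustrative guesses,
# not the author's original helpers.
import numpy as np


def evaluate_policy(policy, env, gamma, theta=1e-8):
    """Iterative policy evaluation: sweep states until values stop changing."""
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = 0
            for a in range(env.nA):
                for prob, next_s, reward, done in env.P[s][a]:
                    v += policy[s][a] * prob * (reward + gamma * V[next_s])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            return V


def get_greedy_action(s, env, V, gamma):
    """One-step lookahead: return the best action and its expected value."""
    action_values = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_s, reward, done in env.P[s][a]:
            action_values[a] += prob * (reward + gamma * V[next_s])
    greedy_a = int(np.argmax(action_values))
    return greedy_a, action_values[greedy_a]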
def iterate_state_values(policy, env, gamma, n_iter=100, epsilon=0.01):
    """
    input:
    - policy: array[states X actions] with probabilities for each action
    - env: environment
    - gamma: discount_rate
    - n_iter: # of times to iteration
    output:
    - policy: optimal policy
    """
    # initialize state value function
    V = np.zeros(env.nS)
    for k in range(n_iter):
        v_delta = 0
        for s in range(env.nS):
            greedy_a, greedy_a_value = get_greedy_action(s, env, V, gamma)
            v_delta = max(v_delta, abs(greedy_a_value - V[s]))
            V[s] = greedy_a_value
            # can create policy from final V or just update it in this loop
            policy[s] = np.eye(env.nA)[greedy_a]

        if v_delta < epsilon:
            print(f'value iteration converged after {k+1} iterations')
            break
        elif k == n_iter - 1:
            print('value iteration never converged')

    return policy, V
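# A minimal usage sketch for the two DP routines above, assuming the old gym
# API and a toy-text environment (FrozenLake-v0) whose unwrapped env exposes
# the nS, nA and P attributes used by the helpers; this block is not part of
# the original snippet.
import gym
import numpy as np

env = gym.make('FrozenLake-v0').unwrapped
gamma = 0.99

# start from a uniform-random policy and run both DP algorithms
uniform = np.ones((env.nS, env.nA)) / env.nA
vi_policy, vi_V = iterate_state_values(uniform.copy(), env, gamma)
pi_policy, pi_V = iterate_policy(uniform.copy(), env, gamma)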
def mc_control_importance_sampling(env,
                                   num_episodes,
                                   discount_rate=1.0,
                                   epsilon=0.2):
    """
    Off-policy Monte Carlo control with weighted importance sampling
    input:
    - env: environment
    - num_episodes: # of episodes to run
    - discount_rate: gamma
    - epsilon: probability of "exploring" or choosing a random action
      under the behavior policy
    output:
    - target_policy: deterministic greedy target policy
    - Q: state-action value function
    """
    b = get_random_policy(env)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    C = defaultdict(lambda: np.zeros(env.action_space.n))
    target_policy = defaultdict(int)  # deterministic

    for t in range(num_episodes):
        # generate episode with behavior policy (b)
        episode = get_episode_epsilon_greedy(b, env, epsilon)
        W = 1

        # loop backwards through the episode
        for i, (state, action, _reward) in enumerate(reversed(episode)):
            # map the reversed index back to a forward index into the episode
            step = len(episode) - 1 - i
            G = get_disounted_reward([x[2] for x in episode[step:]],
                                     discount_rate)

            # add to the running sum of importance-sampling weights
            # for this state-action pair
            C[state][action] += W

            # update state-action value
            Q[state][action] += W / C[state][action] * (G - Q[state][action])
            # update the target policy from the updated Q values
            target_policy[state] = get_greedy_action(Q, state)

            # if the action taken by the behavior policy is not the one the
            # (deterministic) target policy would take, the importance-sampling
            # ratio becomes 0, so the rest of this episode contributes nothing
            if action != target_policy[state]:
                break

            # update importance sampling ratio, numerator is 1 because target
            # policy is deterministic
            W /= b[state][action]

    return target_policy, Q
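# Sketches of two helpers assumed by mc_control_importance_sampling but not
# shown here: get_disounted_reward (name kept as used above) collapses a list
# of rewards into a single discounted return, and get_random_policy builds a
# uniform behavior policy over a discrete action space. Both are illustrative
# guesses at the missing code, not the author's originals.
import numpy as np
from collections import defaultdict


def get_disounted_reward(rewards, discount_rate):
    """Return sum_k gamma^k * r_k for the given reward sequence."""
    return sum(r * discount_rate ** k for k, r in enumerate(rewards))


def get_random_policy(env):
    """Uniform action probabilities for any state encountered."""
    n_actions = env.action_space.n
    return defaultdict(lambda: np.ones(n_actions) / n_actions)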
def _get_policy_from_state_values(V, env, gamma):
    """
    input:
    - V: state value function
    - env: environment
    - gamma: discount_rate
    output:
    - policy driven by highest expected state values from V
    """
    policy = initialize_policy(env.nA, env.nS)
    for s in range(env.nS):
        greedy_a, _ = get_greedy_action(s, env, V, gamma)
        policy[s] = np.eye(env.nA)[greedy_a]
    return policy
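# initialize_policy is another helper referenced but not defined in this
# snippet; a plausible version, shown as an assumption, starts every state
# with a uniform distribution over actions.
import numpy as np


def initialize_policy(nA, nS):
    """Uniform-random policy of shape [nS, nA]; each row sums to 1."""
    return np.ones((nS, nA)) / nA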
def mc_control_epsilon_greedy(env,
                              num_episodes,
                              discount_rate=1.0,
                              epsilon=0.2):
    """
    First-visit Monte Carlo control with an epsilon-greedy behavior policy
    input:
    - env: environment
    - num_episodes: # of episodes to run
    - discount_rate: gamma
    - epsilon: probability of "exploring" or choosing a random action
    output:
    - policy: greedy policy (one action per state) derived from Q
    - Q: state-action value function
    """
    state_counter = defaultdict(lambda: np.zeros(env.action_space.n))
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    policy = defaultdict(int)

    for t in range(num_episodes):
        episode = get_episode_epsilon_greedy(Q, env, epsilon)
        visited_states = set()

        for i, (state, action, _reward) in enumerate(episode):
            # only the first visit to each state-action pair contributes
            if (state, action) not in visited_states:
                visited_states.add((state, action))
                state_counter[state][action] += 1
                G = get_disounted_reward([x[2] for x in episode[i:]],
                                         discount_rate)

                # incremental mean update of the action value
                Q[state][action] += (G - Q[state][action]) \
                    / state_counter[state][action]
                policy[state] = get_greedy_action(Q, state)

    return policy, Q
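# The Monte Carlo routines above also lean on an episode generator and a
# tabular get_greedy_action(Q, state) (a different signature from the DP
# helper sketched earlier). The sketch below assumes Q is a dict-like table
# mapping each state to an array of action values and that the environment
# follows the old gym step API returning (obs, reward, done, info); it is a
# best guess at the missing helpers, not the original code.
import numpy as np


def get_greedy_action(Q, state):
    """Action with the highest estimated value for this state."""
    return int(np.argmax(Q[state]))


def get_episode_epsilon_greedy(Q, env, epsilon):
    """Roll out one episode, exploring with probability epsilon."""
    episode = []
    state = env.reset()
    done = False
    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = get_greedy_action(Q, state)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
    return episode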
Example #6
import time

import cv2
import gym
import numpy as np
import torch
import torchvision.transforms as T

Q = torch.load('DQN/trained_Q.pth')
Q.eval()
env = gym.make('Breakout-v0', frameskip=4)
env.reset()

m = 4
num_episodes = 2

transform = T.Compose([T.ToTensor()])

for _ in range(num_episodes):
    frame_sequence = initialize_frame_sequence(env, m)
    state = transform(np.stack(frame_sequence, axis=2))
    done = False

    while not done:
        action = get_greedy_action(Q, state.unsqueeze(0)).item()
        frame, reward, done, _ = env.step(action)

        frame_sequence.append(preprocess_frame(frame))
        state = transform(np.stack(frame_sequence, axis=2))

        env.render()
        time.sleep(.1)

        # cv2.imshow('', frame)
        # cv2.waitKey(100)
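# preprocess_frame and initialize_frame_sequence come from the DQN training
# code and are not included above. A rough sketch follows, assuming the usual
# Atari preprocessing (grayscale, resized to 84x84) and a deque that always
# holds the m most recent processed frames; treat it as an assumption about
# the missing helpers rather than the original implementation.
from collections import deque


def preprocess_frame(frame):
    """Grayscale and downsample an RGB Atari frame to 84x84 uint8."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)


def initialize_frame_sequence(env, m):
    """Reset the env and fill a rolling buffer with m copies of the first frame."""
    frame = env.reset()
    processed = preprocess_frame(frame)
    return deque([processed] * m, maxlen=m)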