import numpy as np


def iterate_policy(policy, env, gamma, n_iter=100):
    """
    input:
    - policy: array[states X actions] with probabilities for each action
    - env: environment
    - gamma: discount rate
    - n_iter: max # of times to iterate policy
    output:
    - policy: optimal policy
    - V: state value function under that policy
    """
    for k in range(n_iter):
        policy_stable = True
        # policy evaluation: compute V for the current policy
        V = evaluate_policy(policy, env, gamma)
        # policy improvement: make the policy greedy with respect to V
        for s in range(env.nS):
            old_a = get_policy_action(s, policy)
            greedy_a, _ = get_greedy_action(s, env, V, gamma)
            policy[s] = np.eye(env.nA)[greedy_a]
            if old_a != greedy_a:
                policy_stable = False
        if policy_stable:
            print(f'policy iteration stabilized after {k+1} iterations')
            break
        elif k == n_iter - 1:
            print('policy iteration never stabilized')
    return policy, V
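# A minimal usage sketch, assuming a tabular environment such as old-style
# FrozenLake-v0 (an assumption; any env exposing nS/nA and a transition table
# works) and the initialize_policy helper used elsewhere here:
import gym

env = gym.make('FrozenLake-v0')
policy = initialize_policy(env.nA, env.nS)  # start from a uniform random policy
policy, V = iterate_policy(policy, env, gamma=0.99)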
def iterate_state_values(policy, env, gamma, n_iter=100, epsilon=0.01):
    """
    input:
    - policy: array[states X actions] with probabilities for each action
    - env: environment
    - gamma: discount rate
    - n_iter: max # of iterations
    - epsilon: stop once the largest state-value change falls below this
    output:
    - policy: optimal policy
    - V: optimal state value function
    """
    # initialize state value function
    V = np.zeros(env.nS)
    for k in range(n_iter):
        v_delta = 0
        for s in range(env.nS):
            greedy_a, greedy_a_value = get_greedy_action(s, env, V, gamma)
            v_delta = max(v_delta, abs(greedy_a_value - V[s]))
            V[s] = greedy_a_value
            # can create policy from final V or just update it in this loop
            policy[s] = np.eye(env.nA)[greedy_a]
        if v_delta < epsilon:
            print(f'value iteration converged after {k+1} iterations')
            break
        elif k == n_iter - 1:
            print('value iteration never converged')
    return policy, V
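# The same sketch for value iteration, under the same FrozenLake-v0 assumption;
# note epsilon here is the convergence threshold, not an exploration rate:
env = gym.make('FrozenLake-v0')
policy = initialize_policy(env.nA, env.nS)
policy, V = iterate_state_values(policy, env, gamma=0.99, epsilon=1e-4)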
from collections import defaultdict


def mc_control_importance_sampling(env, num_episodes, discount_rate=1.0,
                                   epsilon=0.2):
    """
    Off-policy monte carlo control via weighted importance sampling
    input:
    - env: environment
    - num_episodes: # of episodes to run
    - discount_rate: gamma
    - epsilon: probability of "exploring" or choosing a random action
    output:
    - target_policy: greedy (deterministic) optimal policy
    - Q: state-action value function
    """
    b = get_random_policy(env)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    C = defaultdict(lambda: np.zeros(env.action_space.n))
    target_policy = defaultdict(int)  # deterministic
    for t in range(num_episodes):
        # generate episode with behavior policy (b)
        episode = get_episode_epsilon_greedy(b, env, epsilon)
        W = 1
        # loop backwards through episode
        for i, (state, action, _reward) in enumerate(reversed(episode)):
            # i counts from the end, so recover this step's index in the
            # unreversed episode before slicing out the remaining rewards
            step = len(episode) - 1 - i
            G = get_disounted_reward(
                list(map(lambda x: x[2], episode[step:])), discount_rate)
            # add to running sum of importance-sampling ratios
            # for this state-action pair
            C[state][action] += W
            # update state-action value
            Q[state][action] += W / C[state][action] * (G - Q[state][action])
            # update target policy with updated Q value
            target_policy[state] = get_greedy_action(Q, state)
            # if the action taken by the behavior policy is not the same as
            # what the target policy would take, the importance-sampling
            # ratio becomes 0 because the target policy is deterministic,
            # so break
            if action != target_policy[state]:
                break
            # update importance-sampling ratio; numerator is 1 because the
            # target policy is deterministic
            W /= b[state][action]
    return target_policy, Q
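# A hedged usage sketch: Blackjack-v0 is an assumption (any env whose
# observations are hashable works, since Q, C, and the policies are dicts
# keyed by state):
env = gym.make('Blackjack-v0')
target_policy, Q = mc_control_importance_sampling(env, num_episodes=500_000)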
def _get_policy_from_state_values(V, env, gamma):
    """
    input:
    - V: state value function
    - env: environment
    - gamma: discount rate
    output:
    - policy driven by highest expected state values from V
    """
    policy = initialize_policy(env.nA, env.nS)
    for s in range(env.nS):
        greedy_a, _ = get_greedy_action(s, env, V, gamma)
        policy[s] = np.eye(env.nA)[greedy_a]
    return policy
def mc_control_epsilon_greedy(env, num_episodes, discount_rate=1.0,
                              epsilon=0.2):
    """
    First visit monte carlo epsilon greedy control
    input:
    - env: environment
    - num_episodes: # of episodes to run
    - discount_rate: gamma
    - epsilon: probability of "exploring" or choosing a random action
    output:
    - policy: epsilon greedy optimal policy
    - Q: state-action value function
    """
    state_counter = defaultdict(lambda: np.zeros(env.action_space.n))
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    policy = defaultdict(int)
    for t in range(num_episodes):
        episode = get_episode_epsilon_greedy(Q, env, epsilon)
        visited_state_actions = set()
        for i, (state, action, _reward) in enumerate(episode):
            # only allow the first visit to a state-action pair to
            # contribute to its value
            if (state, action) not in visited_state_actions:
                visited_state_actions.add((state, action))
                state_counter[state][action] += 1
                G = get_disounted_reward(
                    list(map(lambda x: x[2], episode[i:])), discount_rate)
                # incremental mean
                Q[state][action] += (G - Q[state][action]) \
                    / state_counter[state][action]
                policy[state] = get_greedy_action(Q, state)
    return policy, Q
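# Usage sketch for the on-policy variant, again assuming Blackjack-v0; the
# state below is a hypothetical (player sum, dealer card, usable ace)
# observation, purely for illustration:
env = gym.make('Blackjack-v0')
policy, Q = mc_control_epsilon_greedy(env, num_episodes=500_000)
state = (18, 7, False)
print(policy[state], Q[state])  # greedy action and action values at one state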
import cv2
import time

import gym
import numpy as np
import torch
import torchvision.transforms as T

# load the trained DQN and put it in inference mode
Q = torch.load('DQN/trained_Q.pth')
Q.eval()

env = gym.make('Breakout-v0', frameskip=4)
env.reset()

m = 4  # number of stacked frames that make up a state
num_episodes = 2
transform = T.Compose([T.ToTensor()])

for _ in range(num_episodes):
    frame_sequence = initialize_frame_sequence(env, m)
    state = transform(np.stack(frame_sequence, axis=2))
    done = False
    while not done:
        with torch.no_grad():
            action = get_greedy_action(Q, state.unsqueeze(0)).item()
        frame, reward, done, _ = env.step(action)
        frame_sequence.append(preprocess_frame(frame))
        state = transform(np.stack(frame_sequence, axis=2))
        env.render()
        time.sleep(.1)
        # cv2.imshow('', frame)
        # cv2.waitKey(100)
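# To save the rollout to disk instead of rendering live, old-style gym
# (pre-0.20, assumed here) ships a Monitor wrapper; a sketch under that
# assumption:
from gym import wrappers

env = gym.make('Breakout-v0', frameskip=4)
env = wrappers.Monitor(env, './breakout_videos', force=True)  # writes mp4s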