def sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, yield_progress=False):
    # Tabular backward-view SARSA(lambda). 'lamba' avoids the reserved word lambda.
    q_sa = {}   # action-value estimates Q(s, a)
    n_s = {}    # visit counts N(s), used for the epsilon schedule
    n_sa = {}   # visit counts N(s, a), used as the step size 1 / N(s, a)
    for n in range(num_episodes):
        e_sa = {}  # eligibility traces, reset at the start of each episode
        state = State()
        s = state.as_tuple()
        a = epsilon_greedy_action(q_sa, s, calculate_epsilon(n_s, s))
        while not state.terminal:
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1
            s_next = state.as_tuple()
            a_next = epsilon_greedy_action(q_sa, s_next, calculate_epsilon(n_s, s_next))
            sa = s + (a, )
            sa_next = s_next + (a_next, )
            qsa = q_sa.get(sa, 0)
            qsa_next = q_sa.get(sa_next, 0)
            nsa = n_sa.get(sa, 0) + 1
            n_sa[sa] = nsa
            # TD error for the visited pair, and an accumulating trace.
            delta = reward + gamma * qsa_next - qsa
            e_sa[sa] = e_sa.get(sa, 0) + 1
            # Update every state-action pair in proportion to its trace,
            # using the step size 1 / N(s, a) of the pair just visited,
            # then decay the traces by gamma * lambda.
            for s_i, a_i in generate_all_state_action_pairs():
                sa_i = s_i + (a_i, )
                q_sa[sa_i] = q_sa.get(sa_i, 0) + (delta * e_sa.get(sa_i, 0)) / nsa
                e_sa[sa_i] = gamma * lamba * e_sa.get(sa_i, 0)
            s = s_next
            a = a_next
        if yield_progress:
            yield n + 1, q_sa
    if not yield_progress:
        yield num_episodes, q_sa
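
# Usage sketch (illustrative, not part of the original module): sarsa_lambda is
# a generator, so it must be iterated to run even with yield_progress=False.
# With yield_progress=True it yields (episode_number, q_sa) after every episode,
# which is useful for tracking learning progress.
def demo_sarsa_lambda(num_episodes=10000, lamba=0.5):
    episodes_run, q_sa = next(sarsa_lambda(num_episodes=num_episodes, lamba=lamba))
    return q_sa
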
def mc_control(num_episodes=10000):
    # GLIE Monte-Carlo control with an epsilon-greedy policy,
    # epsilon = N0 / (N0 + N(s)) and step size 1 / N(s, a).
    q_sa = {}  # action-value estimates Q(s, a)
    p = {}     # policy: state -> action selection probabilities
    n_s = {}   # visit counts N(s)
    n_sa = {}  # visit counts N(s, a)
    n0 = 100
    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []
        # Sample one episode, following the current policy
        # (random action for states not yet in the policy).
        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()
            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1
            sa = s + (a, )
            n_sa[sa] = n_sa.get(sa, 0) + 1
        # GLIE MC Control: the only reward arrives at the end of the episode,
        # so the return for every visited pair is the final reward.
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)
        # Improve policy: epsilon-greedy with respect to the updated Q.
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)
            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs
    return q_sa
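
# Usage sketch (illustrative, not part of the original module): run MC control,
# then read off the greedy policy and state values from the learned Q(s, a).
# Assumes State.as_tuple() returns (player, dealer), so the Q keys built above
# are (player, dealer, action) tuples.
def demo_mc_control(num_episodes=500000):
    q_sa = mc_control(num_episodes=num_episodes)
    policy, v_s = {}, {}
    for (player, dealer, action), q in q_sa.items():
        s = (player, dealer)
        if s not in v_s or q > v_s[s]:
            v_s[s] = q
            policy[s] = action
    return policy, v_s
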
from operator import itemgetter  # needed for the sort in greedy() below


def lfa_sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, alpha=0.01, yield_progress=False):
    # SARSA(lambda) with linear function approximation over a coarse coding:
    # each (dealer, player, action) cuboid is a binary feature with its own weight.
    # Set up the coarse codes and initial weights.
    action_codes = {}
    for action in list(Action):
        action_fns = []
        for dealer_interval in [(1, 4), (4, 7), (7, 10)]:
            for player_interval in [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]:
                cuboid_fn = create_cuboid_fn(dealer_interval, player_interval, action)
                action_fns.append(cuboid_fn)
        action_codes[action] = action_fns

    def greedy(s, w):
        # Pick the action whose active features have the largest total weight.
        p, d = s
        action_values = []
        for a in list(Action):
            value = 0
            for cuboid_fn in action_codes[a]:
                if cuboid_fn(p, d, a):
                    value += w.get(cuboid_fn, 0)
            action_values.append((a, value))
        action_values.sort(key=itemgetter(1), reverse=True)
        return action_values[0][0]

    def e_greedy(s, w, epsilon=0.05):
        # Epsilon-greedy action selection with a fixed exploration rate.
        a_best = greedy(s, w)
        selection_probs = []
        default_p = epsilon / len(Action)
        for a in list(Action):
            if a is a_best:
                selection_probs.append(1 - epsilon + default_p)
            else:
                selection_probs.append(default_p)
        return sample_action(selection_probs)

    def f_sa(s, a):
        # Yield the features (cuboid functions) active for this state-action pair.
        p, d = s
        for cuboid_fn in action_codes[a]:
            if cuboid_fn(p, d, a):
                yield cuboid_fn

    def compile_q_sa(w):
        # Expand the weight vector back into a tabular Q(s, a) for reporting.
        q_sa = {}
        for (p, d), a in generate_all_state_action_pairs():
            sa = (p, d, a)
            val = 0
            for i in f_sa((p, d), a):
                val += w.get(i, 0)
            q_sa[sa] = val
        return q_sa

    w_f = {}  # weights, keyed by feature (cuboid function)
    for n in range(num_episodes):
        state = State()
        s = state.as_tuple()
        a = e_greedy(s, w_f)
        z_f = {}  # eligibility traces, keyed by feature
        while not state.terminal:
            state, reward = step(state, a)
            # Start the TD error with the reward minus the current estimate,
            # and accumulate traces for the active features.
            delta = reward
            for i in f_sa(s, a):
                delta = delta - w_f.get(i, 0)
                z_f[i] = z_f.get(i, 0) + 1
            if state.terminal:
                # Terminal state: the bootstrapped value is zero, so update and stop.
                for i, zi in z_f.items():
                    w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                break
            s_next = state.as_tuple()
            a_next = e_greedy(s_next, w_f)
            # Complete the TD error with the bootstrapped next-state value.
            for i in f_sa(s_next, a_next):
                delta = delta + gamma * w_f.get(i, 0)
            # Gradient step on every traced weight, then decay the traces.
            for i, zi in z_f.items():
                w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                z_f[i] = gamma * lamba * zi
            s = s_next
            a = a_next
        if yield_progress:
            yield n + 1, compile_q_sa(w_f)
    if not yield_progress:
        yield num_episodes, compile_q_sa(w_f)
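
# Usage sketch (illustrative, not part of the original module): track how far
# the linear-function-approximation Q drifts from a Monte Carlo baseline,
# measured as mean-squared error over all state-action pairs as training runs.
def demo_lfa_mse(num_episodes=1000, lamba=0.5, mc_episodes=500000):
    q_mc = mc_control(num_episodes=mc_episodes)
    keys = [s + (a, ) for s, a in generate_all_state_action_pairs()]
    mse_by_episode = []
    for n, q_lfa in lfa_sarsa_lambda(num_episodes=num_episodes, lamba=lamba,
                                     yield_progress=True):
        mse = sum((q_lfa.get(k, 0) - q_mc.get(k, 0)) ** 2 for k in keys) / len(keys)
        mse_by_episode.append((n, mse))
    return mse_by_episode
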