def sarsa(lamb: float, num_episodes: int, Qstar, record=False):
    """Tabular SARSA(lambda) with accumulating eligibility traces.

    Args:
        lamb: trace-decay parameter lambda, expected in [0, 1]
            (was annotated ``int``; lambda is a fraction, so ``float``).
        num_episodes: number of episodes to run.
        Qstar: reference action-value table used only for MSE tracking.
        record: when True, append the MSE against ``Qstar`` after every
            episode.

    Returns:
        ``(Q, mses)``: the learned action-value table and the per-episode
        MSE history (an empty list when ``record`` is False).
    """
    # NOTE(review): a second `sarsa` defined later in this file shadows this
    # one at import time — confirm which version callers expect.
    Q = state_action_map(plus=True)
    N = state_action_map()       # per-(state, action) visit counts
    N_s = state_map(plus=True)   # per-state visit counts (drives epsilon decay)
    mses = []
    for _ in range(num_episodes):
        E = state_action_map()   # eligibility traces, reset each episode
        s = State(deal=True)
        a = get_e_greedy_action(Q, N_s, s)
        while not s.terminal():
            N_s[s.get_state()] += 1
            N[s.get_state(), a] += 1
            s_dash, r = step(s, a)
            a_dash = get_e_greedy_action(Q, N_s, s_dash)
            # TD error; gamma is implicitly 1 (undiscounted episodic task).
            delta = r + Q[s_dash.get_state(), a_dash] - Q[s.get_state(), a]
            E[s.get_state(), a] += 1  # accumulating trace
            # Backward-view update over every (state, action) pair.  The
            # 1e-9 guards against division by zero for never-visited pairs
            # (their trace is zero anyway, so the update is a no-op there).
            for d in DEALER_RANGE:
                for p in PLAYER_RANGE:
                    for action in ACTIONS:
                        step_size = 1 / (N[(d, p), action] + 1e-9)
                        Q[(d, p), action] += step_size * delta * E[(d, p), action]
                        E[(d, p), action] *= lamb
            s = s_dash
            a = a_dash
        if record:
            mses.append(calc_mse(Q, Qstar))
    return Q, mses
def sample_episode(pi):
    """Roll out a single episode under the deterministic policy ``pi``.

    Returns:
        ``(history, r)``: the list of visited ``[state, action]`` pairs and
        the final reward.  Intermediate rewards are not recorded because a
        reward is only granted on entering the terminal state.
    """
    trajectory = []
    state = State(deal=True)
    while not state.terminal():
        action = pi[state.get_state()]
        trajectory.append([state.get_state(), action])
        state, reward = step(state, action)
    return trajectory, reward
def sarsa(lamb: float, num_episodes: int, Qstar, record=False):
    """SARSA(lambda) with linear function approximation.

    The action-value function is approximated as
    ``q_hat(s, a, w) = phi(s, a) . w`` over a 36-dimensional feature vector.

    Args:
        lamb: trace-decay parameter lambda, expected in [0, 1]
            (was annotated ``int``; lambda is a fraction, so ``float``).
        num_episodes: number of episodes to run.
        Qstar: reference action-value table used only for MSE tracking.
        record: when True, append the MSE against ``Qstar`` after every
            episode.

    Returns:
        ``(w, mses)``: the learned weight vector and the per-episode MSE
        history (an empty list when ``record`` is False).
    """
    # NOTE(review): this definition shadows the tabular `sarsa` defined
    # earlier in the file — confirm the shadowing is intentional.
    alpha = ALPHA
    w = np.zeros(36)
    mses = []
    for _ in range(num_episodes):
        E = np.zeros(36)  # eligibility traces over the feature weights
        s = State(deal=True)
        a = get_e_greedy_action(s, w)
        while not s.terminal():
            x = phi(s, a)  # features of the current (state, action) pair
            s_dash, r = step(s, a)
            a_dash = get_e_greedy_action(s_dash, w)
            # TD error; gamma is implicitly 1 (undiscounted episodic task).
            delta = r + q_hat(s_dash, a_dash, w) - q_hat(s, a, w)
            E = lamb * E + x        # decay traces, then mark active features
            w += alpha * delta * E  # semi-gradient weight update
            s = s_dash
            a = a_dash
        if record:
            mses.append(calc_mse_linear(w, Qstar))
    return w, mses