def sarsa_lambda(w, alpha, lambd, nsimus, gamma):
    """Train tabular SARSA(lambda) with accumulating eligibility traces.

    Args:
        w: environment exposing width, height, reset(), take_action(a),
           player.cell.pos (current state index) and a `finished` flag.
        alpha: learning rate.
        lambd: trace-decay parameter (lambda).
        nsimus: number of training episodes.
        gamma: discount factor.

    Returns:
        A deterministic policy greedy w.r.t. the learned Q-values
        (via Policy.build_deterministic / Policy.qvs_to_table).
    """
    n = w.width * w.height
    Q = np.zeros((n, 4))
    for k in range(1, nsimus + 1):
        w.reset()
        E = np.zeros((n, 4))  # eligibility traces, reset every episode
        s = w.player.cell.pos
        # epsilon = 1/k: exploration decays as training progresses
        a = Policy.e_greedy_action_from_qvs(s, 1 / k, Q)
        while not w.finished:
            reward = w.take_action(a)
            s2 = w.player.cell.pos
            a2 = Policy.e_greedy_action_from_qvs(s2, 1 / k, Q)
            err = reward + gamma * Q[s2][a2] - Q[s][a]  # TD error
            E[s][a] += 1  # accumulating trace
            # Vectorized trace/value update: replaces the original
            # O(n*4) Python double loop with two NumPy array ops,
            # computing exactly the same element-wise updates.
            Q += alpha * err * E
            E *= gamma * lambd
            s = s2
            a = a2
    return Policy.build_deterministic(Policy.qvs_to_table(Q))
def qvs_to_policy(self, qvs):
    """Build the greedy deterministic policy from action-value estimates.

    Args:
        qvs: indexable action-value table; qvs[s] holds the action
             values for state s.

    Returns:
        A deterministic policy choosing argmax_a qvs[s][a] in each state.
    """
    w = self.world
    n = w.width * w.height
    # Idiomatic: list comprehension instead of preallocate-and-fill loop.
    policy = [np.argmax(qvs[s]) for s in range(n)]
    return Policy.build_deterministic(policy)
def policy_iteration(self, policy=None):
    """Run policy iteration until the greedy policy stops changing.

    Args:
        policy: optional starting policy; defaults to the deterministic
                "always take action 0" policy over all states.

    Returns:
        The converged deterministic policy.
    """
    w = self.world
    # Fix: `is None` instead of `== None` — identity test, and avoids
    # invoking Policy.__eq__ against None.
    if policy is None:
        policy = Policy.build_deterministic([0] * w.width * w.height)
    while True:
        old_policy = policy
        # 20 evaluation sweeps approximate v_pi before each improvement.
        vs = self.iterative_policy_evaluation(policy, 20)
        qvs = self.qvs_from_vs(vs)
        policy = self.qvs_to_policy(qvs)
        if policy == old_policy:
            break
    return policy
def sarsa_offline(w, alpha, nsimus, gamma):
    """Off-policy TD control over `nsimus` episodes with decaying epsilon.

    NOTE(review): the target uses max over Q[s2] rather than the value of
    the next on-policy action — this is a Q-learning-style update; confirm
    the "sarsa_offline" name is intentional.

    Returns a deterministic policy greedy w.r.t. the learned Q-values.
    """
    state_count = w.width * w.height
    qvalues = np.zeros((state_count, 4))
    for episode in range(1, nsimus + 1):
        w.reset()
        state = w.player.cell.pos
        eps = 1 / episode  # exploration rate shrinks each episode
        while not w.finished:
            action = Policy.e_greedy_action_from_qvs(state, eps, qvalues)
            r = w.take_action(action)
            nxt = w.player.cell.pos
            target = r + gamma * np.max(qvalues[nxt])
            qvalues[state][action] += alpha * (target - qvalues[state][action])
            state = nxt
    return Policy.build_deterministic(Policy.qvs_to_table(qvalues))
def sarsa(w, alpha, nsimus, gamma):
    """On-policy SARSA control over `nsimus` episodes.

    Epsilon for the epsilon-greedy behaviour policy is 1/episode, so
    exploration decays as training progresses. Returns a deterministic
    policy greedy w.r.t. the learned Q-values.
    """
    num_states = w.width * w.height
    qvals = np.zeros((num_states, 4))
    for episode in range(1, nsimus + 1):
        w.reset()
        eps = 1 / episode
        state = w.player.cell.pos
        action = Policy.e_greedy_action_from_qvs(state, eps, qvals)
        while not w.finished:
            r = w.take_action(action)
            next_state = w.player.cell.pos
            next_action = Policy.e_greedy_action_from_qvs(next_state, eps, qvals)
            # Standard SARSA TD update using the next on-policy action.
            td_error = r + gamma * qvals[next_state][next_action] - qvals[state][action]
            qvals[state][action] += alpha * td_error
            state, action = next_state, next_action
    return Policy.build_deterministic(Policy.qvs_to_table(qvals))