# Monte Carlo control, one iteration: play an episode under the current policy
# and average the first-visit returns for every (state, action) pair seen
biggest_change = 0
states_actions_returns = play_game(grid, policy)
seen_state_action_pairs = set()
for s, a, G in states_actions_returns:
    sa = (s, a)
    # first-visit: only the first return observed for (s, a) in this episode counts
    if sa not in seen_state_action_pairs:
        old_q = Q[s][a]
        returns[sa].append(G)
        Q[s][a] = np.mean(returns[sa])
        biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
        seen_state_action_pairs.add(sa)
deltas.append(biggest_change)

# policy improvement: act greedily with respect to the updated Q
for s in policy.keys():
    a, _ = max_dict(Q[s])
    policy[s] = a

plt.plot(deltas)
plt.show()

print("final policy")
print_policy(policy, grid)

# extract the state-value function V(s) = max_a Q(s, a)
V = {}
for s, Qs in Q.items():
    V[s] = max_dict(Qs)[1]

print("final values")
print_values(V, grid)
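Each of these loops calls a max_dict helper to pick the greedy action and its value from a dictionary of action-values. A minimal sketch of what that helper is assumed to do, inferred from the calls above (it returns the (key, value) pair with the largest value):

def max_dict(d):
    # return the (key, value) pair with the highest value;
    # ties go to whichever key is encountered first
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val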
# Q-learning: initialize the per-(state, action) update counts used for the
# adaptive learning rate (this runs once for every state s)
update_counts_sa[s] = {}
for a in ALL_POSSIBLE_ACTIONS:
    update_counts_sa[s][a] = 1.0

t = 1.0
deltas = []
for it in range(10000):
    # slowly decay the exploration rate eps = 0.5 / t
    if it % 100 == 0:
        t += 1e-2
    if it % 2000 == 0:
        print(it)

    # start every episode from the fixed start state
    s = (2, 0)
    grid.set_state(s)
    a = max_dict(Q[s])[0]

    biggest_change = 0
    while not grid.game_over():
        a = random_action(a, eps=0.5 / t)  # epsilon-greedy action selection
        r = grid.move(a)
        s2 = grid.current_state()

        # adaptive learning rate: shrink alpha for frequently updated (s, a) pairs
        alpha = ALPHA / update_counts_sa[s][a]
        update_counts_sa[s][a] += 0.005

        # Q-learning update: bootstrap from the greedy action-value at s2
        old_qsa = Q[s][a]
        a2, max_q_s2a2 = max_dict(Q[s2])
        Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])
        biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

        # next state becomes the current state
        s = s2
        a = a2
    deltas.append(biggest_change)
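The loop above (and the SARSA loop that follows) also relies on a random_action helper for epsilon-greedy exploration. A minimal sketch under the assumption that ALL_POSSIBLE_ACTIONS is the global collection of action labels (the specific labels below are illustrative):

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed gridworld action labels

def random_action(a, eps=0.1):
    # with probability 1 - eps keep the greedy action a,
    # otherwise pick uniformly from all actions (possibly a again)
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)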
# SARSA: initialize the per-(state, action) update counts used for the
# adaptive learning rate (this runs once for every state s)
update_counts_sa[s] = {}
for a in ALL_POSSIBLE_ACTIONS:
    update_counts_sa[s][a] = 1.0

t = 1.0
deltas = []
for it in range(10000):
    # slowly decay the exploration rate eps = 0.5 / t
    if it % 100 == 0:
        t += 1e-2
    if it % 2000 == 0:
        print("it:", it)

    # start every episode from the fixed start state
    s = (2, 0)
    grid.set_state(s)
    a = max_dict(Q[s])[0]
    a = random_action(a, eps=0.5 / t)

    biggest_change = 0
    while not grid.game_over():
        r = grid.move(a)
        s2 = grid.current_state()

        # choose the next action epsilon-greedily; SARSA bootstraps from Q(s2, a2)
        a2 = max_dict(Q[s2])[0]
        a2 = random_action(a2, eps=0.5 / t)

        # adaptive learning rate: shrink alpha for frequently updated (s, a) pairs
        alpha = ALPHA / update_counts_sa[s][a]
        update_counts_sa[s][a] += 0.005

        old_qsa = Q[s][a]
        Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])
        biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

        # next state and action become the current state and action
        s = s2
        a = a2
    deltas.append(biggest_change)
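Only one line distinguishes this SARSA loop from the Q-learning loop above: the bootstrap target. The two updates, reproduced side by side with the variable names used in the listings:

# Q-learning (off-policy): bootstrap from the best action available at s2,
# regardless of which action the behavior policy takes next
Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])

# SARSA (on-policy): bootstrap from the action a2 the epsilon-greedy
# behavior policy actually selects at s2
Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])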
# semi-gradient control with function approximation: Q(s, a) is estimated by a
# parametric model rather than a table
t = 1.0
t2 = 1.0
for it in range(20000):
    # decay exploration (via t) and the learning rate (via t2) on slow schedules
    if it % 100 == 0:
        t += 1e-2
        t2 += 1e-2
    if it % 1000 == 0:
        print(it)
    alpha = ALPHA / t2

    # start every episode from the fixed start state
    s = (2, 0)
    grid.set_state(s)
    Qs = getQs(model, s)
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5 / t)

    biggest_change = 0
    while not grid.game_over():
        r = grid.move(a)
        s2 = grid.current_state()

        old_theta = model.theta.copy()
        if grid.is_terminal(s2):
            # terminal state: the target is just the reward
            model.theta += alpha * (r - model.predict(s, a)) * model.grad(s, a)
        else:
            # non-terminal: bootstrap from the next (epsilon-greedy) action's value
            Qs2 = getQs(model, s2)
            a2 = max_dict(Qs2)[0]
            a2 = random_action(a2, eps=0.5 / t)
            model.theta += alpha * (r + GAMMA * Qs2[a2] - model.predict(s, a)) * model.grad(s, a)

            # next state and action become the current state and action
            s = s2
            a = a2

        biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
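This loop assumes a getQs helper that evaluates the model for every action in a given state, so that max_dict and random_action can be reused unchanged, and a model object exposing theta, predict(s, a), and grad(s, a) (for a linear model, grad would return the feature vector for the pair (s, a)). A minimal sketch of the helper under those assumptions:

def getQs(model, s):
    # evaluate the approximate Q(s, a) for every action and return a
    # dictionary in the same shape as the tabular Q[s] used earlier
    Qs = {}
    for a in ALL_POSSIBLE_ACTIONS:
        Qs[a] = model.predict(s, a)
    return Qs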