        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()
    return states_and_returns


if __name__ == '__main__':
    # use the standard grid again (0 for every step) so that we can compare
    # to iterative policy evaluation
    grid = standard_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }

    V = {}
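To sanity-check what the backward pass above produces, here is a minimal standalone illustration of the same discounted-return recursion G = r + GAMMA * G. The reward list and the printed values are made up for this example and are not part of the script:

# standalone illustration of the discounted-return recursion G = r + GAMMA * G
GAMMA = 0.9
rewards = [0, 0, 1]  # hypothetical rewards observed along one episode

G = 0
returns = []
for r in reversed(rewards):
    G = r + GAMMA * G
    returns.append(G)
returns.reverse()
print(returns)  # -> approximately [0.81, 0.9, 1.0]

Each entry is the discounted sum of that step's reward and all later rewards; the script applies the same recursion over (state, reward) pairs to build states_and_returns.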
from grid_world import standard_grid, negative_grid
from policy_evaluation import print_values, print_policy
from monte_carlo_es import max_dict
from td0_prediction import random_action

GAMMA = 0.9
ALPHA = 0.1
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

if __name__ == '__main__':
    grid = negative_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # initialize Q(s, a) to zero for every state-action pair
    Q = {}
    states = grid.all_states()
    for s in states:
        Q[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            Q[s][a] = 0

    # bookkeeping: how often each state and each (state, action) pair gets updated
    update_counts = {}
    update_counts_sa = {}
    for s in states:
        update_counts_sa[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            update_counts_sa[s][a] = 1.0
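For context on how a nested-dict Q-table like the one initialized above is typically used, the sketch below applies a single tabular Q-learning update, Q(s, a) += ALPHA * (r + GAMMA * max_a' Q(s', a') - Q(s, a)). The states 's1' and 's2', the action values, and the reward are placeholders invented for this illustration; it is not an excerpt of the rest of the script:

# standalone sketch of one tabular Q-learning update on a nested-dict Q-table
GAMMA = 0.9
ALPHA = 0.1

Q = {
    's1': {'U': 0.0, 'D': 0.0, 'L': 0.0, 'R': 0.0},  # hypothetical state-action values
    's2': {'U': 0.5, 'D': 0.1, 'L': 0.0, 'R': 0.2},
}

s, a, r, s2 = 's1', 'R', -0.1, 's2'   # one observed transition (s, a, r, s')
max_q_next = max(Q[s2].values())      # max over a' of Q(s', a')
Q[s][a] += ALPHA * (r + GAMMA * max_q_next - Q[s][a])
print(Q[s][a])  # -> approximately 0.035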