# (excerpt: the omitted setup above defines np, grid, V, policy, GAMMA,
# THRESHOLD, POSSIBLE_ACTIONS, print_value_func, and print_policy)

# Value iteration: sweep until the value function stops changing.
while True:
    delta = 0
    for s in policy.keys():
        old_V = V[s]
        # Bellman optimality backup: V(s) <- max_a [r + GAMMA * V(s')]
        new_V = float("-inf")
        for a in POSSIBLE_ACTIONS:
            grid.set_state(s)
            r = grid.move(a)
            v = r + GAMMA * V[grid.current_state()]
            if v > new_V:
                new_V = v
        V[s] = new_V
        delta = max(delta, np.abs(old_V - V[s]))
    if delta < THRESHOLD:
        break

# Extract the greedy policy with respect to the converged values.
for s in policy.keys():
    best_a = None
    best_value = float("-inf")
    for a in POSSIBLE_ACTIONS:
        grid.set_state(s)
        r = grid.move(a)
        v = r + GAMMA * V[grid.current_state()]
        if v > best_value:
            best_value = v
            best_a = a
    policy[s] = best_a

print("Final value function")
print_value_func(V, grid)
print("\n")
print("Final policy")
print_policy(policy, grid)
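
# The sweep and the extraction above repeat the same one-step lookahead; a
# compact helper form of it, as a sketch (assumes the grid API and globals
# used above; one_step_lookahead is not a name from these scripts):
def one_step_lookahead(grid, V, s):
    # Evaluate every action from s and return the greedy (action, value).
    best_a, best_value = None, float("-inf")
    for a in POSSIBLE_ACTIONS:
        grid.set_state(s)
        r = grid.move(a)
        v = r + GAMMA * V[grid.current_state()]
        if v > best_value:
            best_a, best_value = a, v
    return best_a, best_value
# With it, the extraction loop reduces to:
#     for s in policy.keys():
#         policy[s], _ = one_step_lookahead(grid, V, s)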

from grid_world import standard_grid
from dp_policy_evaluation import print_value_func

GAMMA = 0.9  # discount factor, as in the companion scripts


def play_game(grid, policy):
    # ... episode rollout omitted in this excerpt; it fills
    # states_and_rewards with the visited (state, reward) pairs in order ...

    # Walk the episode backwards, accumulating the discounted return G.
    G = 0
    first = True
    states_returns = []
    for s, r in reversed(states_and_rewards):
        # The terminal state's return is 0 by definition, so skip the first
        # (i.e. last-visited) state; every other state gets the return that
        # followed it.
        if first:
            first = False
        else:
            states_returns.append((s, G))
        G = r + GAMMA * G
    states_returns.reverse()  # back to chronological order
    return states_returns


if __name__ == "__main__":
    grid = standard_grid()

    print("Rewards")
    print_value_func(grid.rewards, grid)
    print("\n")

    # Fixed policy to evaluate
    policy = {
        (2, 0): "U",
        (1, 0): "U",
        (0, 0): "R",
        (0, 1): "R",
        (0, 2): "R",
        (1, 2): "U",
        (2, 1): "L",
        (2, 2): "U",
        (2, 3): "L",
    }

    V = {}
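
    # A minimal first-visit Monte Carlo evaluation loop driven by play_game
    # (a sketch: the episode count and the grid.all_states() initialization
    # are assumptions, not part of this excerpt):
    states = grid.all_states()
    returns = {s: [] for s in states}
    for s in states:
        V[s] = 0
    for _ in range(100):
        seen = set()
        for s, G in play_game(grid, policy):
            if s not in seen:  # first visit: count each state once per episode
                returns[s].append(G)
                V[s] = sum(returns[s]) / len(returns[s])
                seen.add(s)

    print("Values")
    print_value_func(V, grid)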

import numpy as np

from grid_world import standard_grid, negative_grid
from dp_policy_evaluation import print_value_func, print_policy
from monte_carlo_policy_iteration_es import max_dict
from td_zero_prediction import random_action

GAMMA = 0.9
ALPHA = 0.1
POSSIBLE_ACTIONS = ("U", "D", "L", "R")

if __name__ == "__main__":
    grid = negative_grid(step_cost=-0.1)

    print("Rewards")
    print_value_func(grid.rewards, grid)
    print("\n")

    # Initialize Q(s, a) to zero for every state-action pair.
    Q = {}
    states = grid.all_states()
    for s in states:
        Q[s] = {}
        for a in POSSIBLE_ACTIONS:
            Q[s][a] = 0

    # Keep track of how many times Q[s] has been updated.
    update_counts = {}
    update_counts_sa = {}
    for s in states:
        update_counts_sa[s] = {}
        for a in POSSIBLE_ACTIONS:
            # Start at 1.0 so an adaptive learning rate of the form
            # ALPHA / count is well-defined from the first update.
            update_counts_sa[s][a] = 1.0
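
    # A sketch of the Q-learning loop this setup leads into (assumptions:
    # max_dict(d) returns the (key, value) pair with the largest value,
    # random_action(a, eps) is the epsilon-greedy chooser from
    # td_zero_prediction, the start state is (2, 0), and grid exposes
    # game_over() alongside the methods used above):
    t = 1.0
    for it in range(10000):
        if it % 100 == 0:
            t += 1e-2  # slowly shrink exploration over training
        s = (2, 0)
        grid.set_state(s)
        while not grid.game_over():
            # Behave epsilon-greedily, but back up toward the greedy target.
            a, _ = max_dict(Q[s])
            a = random_action(a, eps=0.5 / t)
            r = grid.move(a)
            s2 = grid.current_state()

            # Adaptive learning rate: shrinks as (s, a) is updated more often.
            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            # Off-policy backup: bootstrap from max_a' Q(s', a') regardless
            # of which action is actually taken next.
            _, max_q_s2a2 = max_dict(Q[s2])
            Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])

            update_counts[s] = update_counts.get(s, 0) + 1
            s = s2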