    saG.reverse()
    return saG


# Return the argmax key from a dictionary together with its value
def max_dict(d):
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_val = v
            max_key = k
    return max_key, max_val


if __name__ == '__main__':
    grid = negative_maze(step_cost=-.5)

    # Randomly initialize a policy
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_ACTIONS)

    # Q = mean of the returns G collected for each state s and action a
    Q = {}
    G = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions.keys():
            Q[s] = {}
            for a in ALL_ACTIONS:
                Q[s][a] = 0
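# --- Illustrative sketch (not part of the script above) ---
# A minimal, self-contained example of how (state, action, return) triples like
# the saG list above typically feed a first-visit Monte Carlo control update:
# average the sampled returns into Q, then improve the policy greedily with
# max_dict. The toy states and episode data here are invented for illustration.
import numpy as np

def max_dict(d):
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_val = v
            max_key = k
    return max_key, max_val

ALL_ACTIONS = ('U', 'D', 'L', 'R')
toy_states = [(0, 0), (0, 1)]                                   # hypothetical playable states
Q = {s: {a: 0 for a in ALL_ACTIONS} for s in toy_states}
G = {s: {a: [] for a in ALL_ACTIONS} for s in toy_states}
policy = {s: np.random.choice(ALL_ACTIONS) for s in toy_states}

# One made-up episode's (state, action, return) triples
episode_saG = [((0, 0), 'R', 0.5), ((0, 1), 'U', 1.0)]

seen = set()
for s, a, g in episode_saG:
    if (s, a) not in seen:              # first-visit check
        seen.add((s, a))
        G[s][a].append(g)               # collect the sampled return
        Q[s][a] = np.mean(G[s][a])      # Q is the mean of returns seen so far
    policy[s] = max_dict(Q[s])[0]       # greedy policy improvement

print(policy)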
    return np.random.choice(ALL_ACTIONS)


# Argmax key from a dictionary
def max_dict(d):
    max_k, max_v = None, float('-inf')
    for k, v in d.items():
        if v > max_v:
            max_k = k
            max_v = v
    return max_k, max_v


if __name__ == '__main__':
    maze = negative_maze()
    print("Rewards: ")
    print_values(maze.rewards, maze)

    states = maze.all_states()

    # Initialize Q for all states and actions
    Q = {}
    for s in states:
        Q[s] = {}
        for a in ALL_ACTIONS:
            Q[s][a] = 0

    # Per-state counts used to decay the learning rate after each update
    state_lr_decay = {}
    for s in states:
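# --- Illustrative sketch (not part of the script above) ---
# A minimal, self-contained example of one way a per-state-action counter such
# as state_lr_decay can shrink the learning rate: every update of a (state,
# action) pair divides a base ALPHA by that pair's running count. The constant
# and the toy state/action below are invented for illustration.
ALPHA = 0.1
lr_decay = {}

def decayed_alpha(s, a):
    # Start the counter at 1.0 so the first update uses the full ALPHA,
    # then grow it slowly so later updates take progressively smaller steps.
    lr_decay.setdefault(s, {}).setdefault(a, 1.0)
    alpha = ALPHA / lr_decay[s][a]
    lr_decay[s][a] += 0.005
    return alpha

s, a = (0, 0), 'R'
for _ in range(3):
    print(decayed_alpha(s, a))   # 0.1, then slightly smaller on each call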
import numpy as np
from Maze import negative_maze, print_values, print_policy

# Minimum threshold used to check for convergence
THRESHOLD = 1e-3
# Discount factor: future rewards are weighted less than immediate ones
GAMMA = 0.9
ALL_ACTIONS = ('U', 'D', 'L', 'R')

if __name__ == '__main__':
    # Make a grid object
    grid = negative_maze()
    print("Rewards: ")
    print_values(grid.rewards, grid)

    # Randomly initialize a policy for all playable states
    policy = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions.keys():
            policy[s] = np.random.choice(ALL_ACTIONS)
    print("Random initial Policy")
    print_policy(policy, grid)

    # Randomly initialize a value for all playable states and 0 for the rest
    V = {}
    for s in states:
        if s in grid.actions.keys():
            V[s] = np.random.random()
        else:
            V[s] = 0
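# --- Illustrative sketch (not part of the script above) ---
# A minimal, self-contained example of the iterative policy-evaluation loop
# that an initialization like the one above usually feeds into: sweep the
# states, back V up with the Bellman equation for the fixed policy, and stop
# once the largest change falls below THRESHOLD. The tiny two-state chain
# below is invented for illustration.
THRESHOLD = 1e-3
GAMMA = 0.9

# transitions[s][a] = (next_state, reward); 'T' is a terminal state
transitions = {
    'A': {'R': ('B', -0.1)},
    'B': {'R': ('T', 1.0)},
}
policy = {'A': 'R', 'B': 'R'}
V = {'A': 0.0, 'B': 0.0, 'T': 0.0}

while True:
    biggest_change = 0
    for s, actions in transitions.items():
        a = policy[s]                  # deterministic policy
        s2, r = actions[a]
        old_v = V[s]
        V[s] = r + GAMMA * V[s2]       # Bellman backup under the fixed policy
        biggest_change = max(biggest_change, abs(old_v - V[s]))
    if biggest_change < THRESHOLD:
        break

print(V)   # converges to {'A': 0.8, 'B': 1.0, 'T': 0.0}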