        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it to be in order of state visited
    return states_actions_returns


if __name__ == '__main__':
    # use the standard grid again (0 for every step) so that we can compare
    # to iterative policy evaluation
    # grid = standard_grid()

    # try the negative grid too, to see if agent will learn to go past the "bad spot"
    # in order to minimize number of steps
    grid = negative_grid(step_cost=-0.1)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # initialize a random policy
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initialize Q(s,a) and returns
    Q = {}
    returns = {}  # dictionary of state -> list of returns we've received
    states = grid.all_states()
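# --- illustrative sketch (not part of the original script) ---
# a minimal, self-contained check of the backward return computation above:
# G accumulates discounted rewards from the end of the episode, and the first
# (s, a, r) triple processed (the last one visited) is skipped because its
# return is 0 by definition; the toy trajectory below is made up
GAMMA = 0.9
states_actions_rewards = [((2, 0), 'U', 0), ((1, 0), 'U', 0), ((0, 0), 'R', 1)]
G = 0
states_actions_returns = []
first = True
for s, a, r in reversed(states_actions_rewards):
    if first:
        first = False
    else:
        states_actions_returns.append((s, a, G))
    G = r + GAMMA * G
states_actions_returns.reverse()
print(states_actions_returns)  # [((2, 0), 'U', 0.9), ((1, 0), 'U', 1.0)]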
    '''
    p = np.random.random()
    if p > eps:
        return a
    else:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)


if __name__ == '__main__':
    grid_type = input('\nchoose grid type (standard/negative):\n').strip()

    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    # initialize the action-value function and the counts for s and (s, a) pairs:
    Q = {}
    N = {}
    alpha_divisor = {}
    for s in grid.all_states:
        Q[s] = {}
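# --- illustrative sketch (not part of the original script) ---
# how an epsilon-greedy helper like the one above is typically used inside an
# episode loop: exploit the best known action from Q[s], explore with
# probability eps; random_action's signature and max_dict are assumptions here
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def random_action(a, eps=0.1):
    # with probability 1 - eps keep the greedy action, otherwise act randomly
    if np.random.random() > eps:
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)

def max_dict(d):
    # return the (key, value) pair with the largest value
    best_key = max(d, key=d.get)
    return best_key, d[best_key]

Q_s = {'U': 0.1, 'D': -0.4, 'L': 0.0, 'R': 0.3}  # a made-up Q[s]
greedy_a, _ = max_dict(Q_s)
a = random_action(greedy_a, eps=0.1)  # usually 'R', sometimes a random move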
import numpy as np
import matplotlib.pyplot as plt

from gridworld import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'R', 'L')

if __name__ == '__main__':
    grid = negative_grid()

    print('rewards:')
    print_values(grid.rewards, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    # print('Initial policy:')
    # print_policy(policy, grid)

    V = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0
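# --- illustrative sketch (not part of the original script) ---
# what the evaluation step that typically follows this kind of initialization
# does: repeatedly back up V(s) = r + GAMMA * V(s') under a fixed policy until
# the largest change drops below SMALL_ENOUGH; shown here on a made-up 3-state
# chain (s0 -> s1 -> s2, terminal) instead of the grid
GAMMA = 0.9
SMALL_ENOUGH = 1e-3
rewards = {('s0', 's1'): 0, ('s1', 's2'): 1}   # deterministic policy: always move right
next_state = {'s0': 's1', 's1': 's2'}
V = {'s0': 0.0, 's1': 0.0, 's2': 0.0}          # terminal state s2 stays 0
while True:
    biggest_change = 0
    for s in ('s0', 's1'):
        old_v = V[s]
        s_prime = next_state[s]
        V[s] = rewards[(s, s_prime)] + GAMMA * V[s_prime]
        biggest_change = max(biggest_change, abs(old_v - V[s]))
    if biggest_change < SMALL_ENOUGH:
        break
print(V)  # {'s0': 0.9, 's1': 1.0, 's2': 0.0}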
import numpy as np

from gridworld import standard_grid, negative_grid
from iter_policy_eval_moving_penalty import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# next state and reward will now have some randomness
# you'll go in your desired direction with probability 0.5
# you'll go in a random direction a' != a with probability 0.5/3

if __name__ == '__main__':
    # this grid penalizes every non-terminal step (see step_cost below)
    # we want to see if this will encourage finding a shorter path to the goal
    grid = negative_grid(step_cost=-1.0)
    # grid = negative_grid(step_cost=-0.1)
    # grid = standard_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # we'll randomly choose an action and update as we learn
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initial policy
    print("initial policy:")
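# --- illustrative sketch (not part of the original script) ---
# a minimal, self-contained simulation of the "windy" transition described in
# the comments above: the desired action is executed with probability 0.5,
# otherwise one of the other three actions is taken (probability 0.5/3 each);
# the helper name windy_action is hypothetical
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def windy_action(a, wind_prob=0.5):
    # keep the intended action with probability wind_prob,
    # otherwise pick uniformly among the remaining actions
    if np.random.random() < wind_prob:
        return a
    return np.random.choice([x for x in ALL_POSSIBLE_ACTIONS if x != a])

print(windy_action('U'))  # 'U' about half the time, otherwise 'D', 'L' or 'R'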
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is deterministic (1 or 0)
    if grid_type == 'negative':
        # get the grid:
        grid = negative_grid()
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    # print_values prints any dict with a tuple of numbers as the key
    # and a number as the value
    print_values(grid.rewards, grid)

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        V[s] = 0
        # or perform a random initialization:
        # if s in grid.actions:  # if not a terminal state
        #     V[s] = np.random.random()
        # else:
        #     # terminal
        #     V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement:
    # repeat until convergence:
    i = 0
    while True:
        # STEP 2A: iterative policy evaluation
        while True:
            # NOTE: all of the actions, next states and rewards
            # are considered deterministic
            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state

                # check if not a terminal state:
                if s in grid.actions:
                    grid.set_state(s)
                    # take an action according to the policy and get the reward:
                    a = policy[s]
                    r = grid.move(a)
                    # the "look-ahead" - get the value of the next state, s_prime:
                    s_prime = grid.current_state
                    # s_prime is needed in order to calculate
                    # the value of the current state - the Bellman equation:
                    V[s] = r + GAMMA * V[s_prime]
                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values;
        # we choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:
            # check if not a terminal state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!
                # save the old policy:
                old_a = policy[s]
                max_v = float('-inf')  # any real value will beat this
                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    grid.set_state(s)
                    # take an action, receive your keto-chocolate bar:
                    r = grid.move(a)
                    s_prime = grid.current_state
                    new_v = r + GAMMA * V[s_prime]
                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a

                # change the policy:
                policy[s] = better_a
                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if the policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
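# --- illustrative usage (not part of the original script) ---
# how this module's entry point might be invoked; the prompt string mirrors the
# one used elsewhere in this section and is an assumption here
if __name__ == '__main__':
    main(grid_type=input('\nchoose grid type (standard/negative):\n').strip())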
def main(grid_type='negative'):
    # NOTE: the state transitions p(s',r|s,a) are now stochastic
    # (they can take any value in [0,1]), but the policy is still deterministic!
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    # print_values prints any dict with a tuple of numbers as the key
    # and a number as the value
    print_values(grid.rewards, grid)

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        # V[s] = 0
        # or perform a random initialization:
        if s in grid.actions:  # if not a terminal state
            V[s] = np.random.random()
        else:
            # terminal
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement
    # with random state-transitions:
    # repeat until convergence:
    i = 0
    while True:
        # STEP 2A: iterative policy evaluation
        while True:
            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state
                new_v = 0

                # check if not a terminal state:
                if s in grid.actions:
                    for a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)
                        # possible_actions = list(grid.actions[s])
                        # print('\npossible actions from the state (%d, %d):' % grid.current_state)
                        # print(possible_actions)

                        if a == policy[s]:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A
                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                            # same as: p(s',r|s,!policy[s])

                        # move in the chosen direction:
                        r = grid.move(a)
                        # the "look-ahead" - get the value of the next state, s_prime:
                        s_prime = grid.current_state
                        # s_prime is needed in order to calculate
                        # the value of the current state - the Bellman equation:
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])

                    V[s] = new_v
                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values, but now we also
        # take into account that our state-transitions are random!!!
        # we then choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:
            # check if not a terminal state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!
                # save the old policy:
                old_a = policy[s]
                max_v = float('-inf')  # any real value will beat this
                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    new_v = 0  # we're to accumulate the value
                    for another_a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)
                        # since the state transitions are random,
                        # we check if the action is the desired one:
                        if another_a == a:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A
                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)

                        # take an action, receive your keto-chocolate bar:
                        r = grid.move(another_a)
                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a

                # change the policy:
                policy[s] = better_a
                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if the policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
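# --- illustrative sketch (not part of the original script) ---
# a toy check of the expected backup used above: with P_A = 0.5 the intended
# action contributes with weight 0.5 and each of the other three actions with
# weight 0.5/3; the rewards and next-state values below are made up
GAMMA = 0.9
P_A = 0.5
# (reward, V[s']) for the intended action first, then the other three actions
outcomes = [(-0.1, 1.0), (-0.1, 0.2), (-0.1, 0.0), (-1.0, 0.0)]
probs = [P_A] + [(1 - P_A) / 3] * 3
new_v = sum(p * (r + GAMMA * v) for p, (r, v) in zip(probs, outcomes))
print(round(new_v, 4))  # 0.23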
def main(grid_type='negative'):
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # STEP 1: randomly initialize the value function, V(s):
    V = {}  # the values
    for s in states:
        # as an option, initialize to 0:
        # V[s] = 0
        # check if not a terminal state:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    # STEP 2: value iteration
    while True:
        max_change = 0
        for s in states:
            old_v = V[s]

            # if we're not in a terminal state:
            if s in grid.actions:
                # choose an action that results in the maximum value
                # for this state:
                best_v = float('-inf')
                # best_a = np.random.choice(ALL_POSSIBLE_ACTIONS)
                for a in ALL_POSSIBLE_ACTIONS:
                    # arrive in the state:
                    grid.set_state(s)
                    # take the action and receive the reward:
                    r = grid.move(a)
                    # calculate the Bellman equation:
                    v = r + GAMMA * V[grid.current_state]
                    if v > best_v:
                        best_v = v
                        # p[s] = a  # we'll do it in another loop later

                # update the value of this state:
                V[s] = best_v
                # update the maximum change:
                max_change = max(max_change, np.abs(old_v - V[s]))

        # check if converged:
        if max_change < THRESHOLD:
            break

    # STEP 3: take our optimal value function
    # and find our optimal policy
    p = {}  # the policy
    for s in states:
        best_a = None
        best_v = float('-inf')
        # if not a terminal state:
        if s in grid.actions:
            # find the best action:
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state]
                if v > best_v:
                    best_v = v
                    best_a = a
            p[s] = best_a

    # optimal values:
    print('\noptimal values:')
    print_values(V, grid)

    # optimal policy:
    print('\noptimal policy:')
    print_policy(p, grid)
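# --- illustrative sketch (not part of the original script) ---
# a toy, self-contained illustration of the greedy backup used twice above,
# V(s) <- max_a [ r + GAMMA * V(s') ]; the rewards and next-state values are made up
GAMMA = 0.9
candidates = {'U': 0 + GAMMA * 1.0,    # step toward the goal
              'R': 0 + GAMMA * 0.5,
              'D': -1 + GAMMA * 0.2,
              'L': 0 + GAMMA * 0.3}
best_a = max(candidates, key=candidates.get)
print(best_a, candidates[best_a])  # U 0.9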