    Printing out the actions that will be taken at each place on the grid,
    according to the policy.
    """
    for i in range(grid.height):
        print("-" + "-------" * grid.width)
        for j in range(grid.width):
            if not j:
                print("|", end="")  # begin row with a vertical line
            a = policy.get((i, j), ' ')
            print("  %s   |" % a, end="")  # pad each cell to the 7-dash width
        print("")  # new line
    print("-" + "-------" * grid.width)


if __name__ == '__main__':
    the_grid = standard_grid()
    states = the_grid.all_states()

    # # # policy with uniformly random actions # # #
    V = {}
    for s in states:
        V[s] = 0  # initialize all state values to 0
    gamma = 1  # discount factor

    # repeat until convergence
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]  # keep track so we can measure the change
            # terminal states have no value (no future returns)
    def calculate_state_values(self):
        """
        Calculate the state-value function from the state-action value
        function: for each state s, V(s) = max over a of Q(s, a).
        """
        visited = set()
        for (s, _) in self.Q.keys():
            if s not in visited:
                visited.add(s)
                self.V[s] = np.max(
                    [self.Q.get((s, a), 0) for a in ALL_ACTIONS])


if __name__ == '__main__':
    the_grid = standard_grid(step_cost=-0.1, windy=True)

    # print the reward for transitioning into each state on the grid
    print("Rewards:")
    the_grid.display_rewards()

    # Learn using SARSA or off-policy Q-learning control.
    # Both give very similar results (the optimal value function and policy
    # are found), though off-policy Q-learning takes longer because the agent
    # is not actively trying to reach the goal.
    SARSA = True
    if SARSA:
        # SARSA follows an explore-exploit strategy, so the agent moves
        # through the environment semi-greedily (epsilon-greedy here)
        # according to the policy.
        policy = TemporalDifferencePolicy(the_grid, alpha=.1, gamma=.9)
        policy.Qlearning(1000, SARSA=SARSA)
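

# A minimal sketch (not part of the original files) contrasting the two TD
# control updates compared above. It assumes Q is a dict keyed by
# (state, action) tuples and that ALL_ACTIONS lists the available actions;
# the helper name `td_update` is hypothetical.
def td_update(Q, s, a, r, s_prime, a_prime, alpha, gamma, SARSA=True):
    if SARSA:
        # on-policy SARSA: bootstrap from the action actually taken next
        target = r + gamma * Q.get((s_prime, a_prime), 0)
    else:
        # off-policy Q-learning: bootstrap from the greedy next action
        target = r + gamma * max(Q.get((s_prime, b), 0) for b in ALL_ACTIONS)
    old_q = Q.get((s, a), 0)
    Q[(s, a)] = old_q + alpha * (target - old_q)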
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is deterministic (1 or 0)
    if grid_type == 'negative':
        # get the grid:
        grid = negative_grid()
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with a tuple of
                                      # numbers as the key and a number
                                      # as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        V[s] = 0
        # or perform a random initialization:
        # if s in grid.actions:  # if not a terminal state
        #     V[s] = np.random.random()
        # else:
        #     # terminal
        #     V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement;
    # repeat until convergence:
    i = 0
    while True:
        # STEP 2A: iterative policy evaluation
        while True:
            # NOTE: all of the actions, next states and rewards
            # are considered deterministic
            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state
                # check if not a terminal state:
                if s in grid.actions:
                    grid.set_state(s)
                    # take an action according to the policy and get the reward:
                    a = policy[s]
                    r = grid.move(a)
                    # the "look-ahead" - get the value of the next state, s_prime;
                    # s_prime is needed in order to calculate the value of the
                    # current state - the Bellman equation:
                    s_prime = grid.current_state
                    V[s] = r + GAMMA * V[s_prime]
                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))
            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action: take all possible
        # actions from that state and calculate the resulting values,
        # then choose the action that yields the max value of the state.
        policy_improved = False
        for s in states:
            # check if not a terminal state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!
                # save the old policy:
                old_a = policy[s]
                max_v = float('-inf')  # any real value will beat this
                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    grid.set_state(s)
                    # take an action, receive your keto-chocolate bar:
                    r = grid.move(a)
                    s_prime = grid.current_state
                    new_v = r + GAMMA * V[s_prime]
                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                # change the policy:
                policy[s] = better_a
                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if the policy has changed, we need to recalculate the values of
        # all states - go back to STEP 2A;
        # otherwise we're done: since the policy hasn't changed,
        # the values remain the same.
        if not policy_improved:
            break
        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
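

# A compact sketch (a hypothetical helper, not in the original file) of the
# greedy improvement step from STEP 2B above, assuming the same grid
# interface (set_state / move / current_state as a property):
def greedy_action(grid, V, actions, s, gamma):
    best_a, best_v = None, float('-inf')
    for a in actions:
        grid.set_state(s)  # reset the position before each trial move
        r = grid.move(a)   # transitions and rewards are deterministic here
        v = r + gamma * V[grid.current_state]
        if v > best_v:
            best_v, best_a = v, a
    return best_a
# with it, STEP 2B reduces to:
# policy[s] = greedy_action(grid, V, ALL_POSSIBLE_ACTIONS, s, GAMMA)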
                        p_a = .5 / 3  # 0.5/3, i.e. ~16.7% chance for each of the other 3 directions
                    r = grid.move(a)  # move and get the associated reward
                    new_v += p_a * (r + gamma * V.get(grid.get_state(), 0))
                V[s] = new_v  # update first, then measure the change
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        if biggest_change < convergence_threshold:
            break
    return V


if __name__ == '__main__':
    # The agent will try to end the game as quickly as possible with step
    # costs this high, even if that means taking the negative terminal state.
    the_grid = standard_grid(step_cost=-1)
    all_actions = list(the_grid.moves.keys())

    # print the rewards
    print("Rewards:")
    display_values(the_grid.rewards, the_grid)
    print("")

    # state -> action:
    # randomly choose an action and update as we learn
    policy = {}
    for s, options in the_grid.actions.items():
        policy[s] = np.random.choice(options)
    print("Randomly initialized policy:")
    display_policy(policy, the_grid)
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is now random, i.e. lies in [0,1],
    # but the policy is deterministic!
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with a tuple of
                                      # numbers as the key and a number
                                      # as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we could simply initialize all to zero:
        # V[s] = 0
        # or perform a random initialization:
        if s in grid.actions:  # if not a terminal state
            V[s] = np.random.random()
        else:
            # terminal
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement
    # with random state-transitions;
    # repeat until convergence:
    i = 0
    while True:
        # STEP 2A: iterative policy evaluation
        while True:
            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state
                new_v = 0
                # check if not a terminal state:
                if s in grid.actions:
                    for a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)
                        # possible_actions = list(grid.actions[s])
                        # print('\npossible actions from the state (%d, %d):' % grid.current_state)
                        # print(possible_actions)
                        if a == policy[s]:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A
                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                            # same as: p(s',r|s,!policy[s])
                        # move in the chosen direction:
                        r = grid.move(a)
                        # the "look-ahead" - get the value of the next state,
                        # s_prime; it's needed in order to calculate the value
                        # of the current state - the Bellman equation:
                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])
                    V[s] = new_v
                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))
            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action: take all possible
        # actions from that state and calculate the values, now also taking
        # into account that the state-transitions are random!
        # we then choose the action that yields the max value of the state.
        policy_improved = False
        for s in states:
            # check if not a terminal state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!
                # save the old policy:
                old_a = policy[s]
                max_v = float('-inf')  # any real value will beat this
                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    new_v = 0  # accumulate the expected value
                    for another_a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)
                        # since the state-transitions are random,
                        # we check if the action is the intended one:
                        if another_a == a:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A
                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                        # take an action, receive your keto-chocolate bar:
                        r = grid.move(another_a)
                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])
                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                # change the policy:
                policy[s] = better_a
                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if the policy has changed, we need to recalculate the values of
        # all states - go back to STEP 2A;
        # otherwise we're done: since the policy hasn't changed,
        # the values remain the same.
        if not policy_improved:
            break
        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
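

# A minimal sketch (hypothetical helper, not in the original file) of the
# expected Bellman backup used in both steps above, assuming the same windy
# model: the intended action fires with probability P_A and each of the other
# actions with probability (1 - P_A) / (n - 1).
def expected_action_value(grid, V, s, intended_a, actions, p_a, gamma):
    n = len(actions)
    total = 0
    for a in actions:
        p = p_a if a == intended_a else (1 - p_a) / (n - 1)
        grid.set_state(s)
        r = grid.move(a)
        total += p * (r + gamma * V[grid.current_state])
    return total
# evaluation then becomes V[s] = expected_action_value(grid, V, s, policy[s], ...),
# and improvement picks the action that maximizes expected_action_value.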
def main():
    grid = standard_grid()
    states = grid.all_states()

    V = {}
    for s in states:
        V[s] = 0
    gamma = 1.0

    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            if s in grid._actions:
                new_v = 0
                p_a = 1.0 / len(grid._actions[s])
                for a in grid._actions[s]:  # only the actions available from s
                    grid.set_state(s)
                    r = grid.move(a)
                    new_v += p_a * (r + gamma * V[grid.current_state()])
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            break
    print("values for uniform random actions:")
    print_values(V, grid)
    print("\n\n")

    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    print_policy(policy, grid)

    # init V(s) = 0
    V = {}
    for s in states:
        V[s] = 0
    gamma = .9

    # repeat until convergence
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            # V(s) only has a value if it's not a terminal state
            if s in policy:
                a = policy[s]
                grid.set_state(s)
                r = grid.move(a)
                V[s] = r + gamma * V[grid.current_state()]
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            break
    print("values for a fixed policy:")
    print_values(V, grid)
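

# A tiny sketch (not part of the original file) of why the fixed-policy run
# above uses gamma = 0.9: with discounting, a path of n zero-reward steps
# followed by the terminal reward of 1 is worth gamma**n, so V(s) shrinks
# the further s is from the winning state.
def discounted_return(rewards, gamma):
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g  # fold rewards back-to-front: G_t = r + gamma * G_{t+1}
    return g

# e.g. discounted_return([0, 0, 0, 1], 0.9) -> 0.9**3, i.e. ~0.729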
    for s in states:
        evaluate the value of state s
        delta = max(delta, |new_val - old_val|)
    if delta < threshold:
        break
return values

TODO: Run all the code and analyse it.
'''
import numpy as np
from gridworld import standard_grid
import prettytable as pt
import math

# import the standard grid and define some variables
grid = standard_grid()
# the standard grid has rewards only at the terminal states
# and 0 reward for all other states
rewards = grid.rewards
tolerance = 1e-3  # the convergence tolerance
gamma = 1.0  # the discount factor

# initialize the value of each state
states = grid.all_states()
values = {st: 0.00 for st in states}  # set to 0 for each state


def print_in_gridworld(v):
    '''
    Function to print the values in the gridworld.
    '''
    # build the grid as a list of rows
    out = []
def main(policy='uniform'):
    # let's find the value function V(s), given a policy p(a|s).
    #
    # recall that there are 2 different policies:
    #   1) a completely random policy;
    #   2) a completely deterministic (fixed) policy.
    # we are going to find the value function for both.
    #
    # NOTE:
    # there are 2 probability distributions in the Bellman equation:
    #   1) p(a|s) - the policy, defines what action to take given the state;
    #   2) p(s',r|s,a) - the state-transition probability, defines the next
    #      state and reward given a state-action pair.
    # we only model the policy here; the state-transitions p(s',r|s,a)
    # remain deterministic.
    grid = standard_grid()

    # the states will be positions (i, j).
    # gridworld is simpler than tic-tac-toe, because there's only one player
    # (i.e., a robot) that can only be at one position at a time.
    states = grid.all_states

    if policy == 'uniform':
        #################### 1) UNIFORM POLICY ####################
        # initialize V(s) to 0:
        V = {}
        for s in states:
            V[s] = 0

        # define the discount factor:
        gamma = 1.0

        i = 0
        # repeat until convergence:
        while True:
            max_change = 0  # max change for the current iteration
            for s in states:
                # keep a copy of the old V(s), so that we can keep track
                # of the magnitude of each change:
                old_v = V[s]

                # NOTE: V(terminal_state) has no value;
                # check if not a terminal state:
                if s in grid.actions:
                    # accumulate the value of this state:
                    new_v = 0
                    # we consider a UNIFORM policy,
                    # i.e., the probability of taking any action is the same:
                    p_a = 1.0 / len(grid.actions[s])
                    # loop over all possible actions that can be taken
                    # from the current state, s:
                    for a in grid.actions[s]:
                        # set our current state on the grid:
                        grid.set_state(s)
                        # make a move to get the reward, r, and the next state, s':
                        r = grid.move(a)
                        s_prime = grid.current_state
                        # for debugging:
                        # print('s:', s, 's_prime:', s_prime, 'r:', r)
                        # accumulate the Bellman equation:
                        new_v += p_a * (r + gamma * V[s_prime])
                    # update the value of the current state:
                    V[s] = new_v
                    # update max_change:
                    max_change = max(max_change, np.abs(old_v - V[s]))
            i += 1
            # check if converged:
            if max_change < THRESHOLD:
                break

        print('iterations to converge:', i, '\n')
        print('values for uniform policy:')
        print_values(V, grid)

    else:
        #################### 2) FIXED POLICY ####################
        # define our policy:
        policy = {
            (0, 0): 'R',
            (0, 1): 'R',
            (0, 2): 'R',
            (1, 0): 'U',
            (1, 2): 'R',
            (2, 0): 'U',
            (2, 1): 'R',
            (2, 2): 'R',
            (2, 3): 'U',
        }
        # display the policy:
        print('the policy:')
        print_policy(policy, grid)
        print('\n')

        # initialize V(s) to 0:
        V = {}
        for s in states:
            V[s] = 0

        # define the discount factor:
        gamma = 0.9  # so now the further we get away from the winning state,
                     # the smaller V(s) should be

        # repeat until convergence:
        i = 0
        while True:
            max_change = 0  # max change for the current iteration
            # print('i:', i)
            for s in states:
                # copy the value of the current state:
                old_v = V[s]

                # NOTE: V(terminal_state) has no value:
                if s in policy:
                    # set our state:
                    grid.set_state(s)
                    # take the action and receive a reward:
                    a = policy[s]
                    r = grid.move(a)
                    s_prime = grid.current_state
                    # for debugging:
                    # print('s:', s, 's_prime:', s_prime, 'r:', r)
                    # update the value of the state:
                    V[s] = r + gamma * V[s_prime]
                    # update the maximum change:
                    max_change = max(max_change, np.abs(old_v - V[s]))
            i += 1
            # check if converged:
            if max_change < THRESHOLD:
                break

        print('iterations to converge:', i, '\n')
        print('values for fixed policy:')
        print_values(V, grid)
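

# A small sketch (hypothetical, not in the original files) that checks how
# well a value function V satisfies the Bellman expectation equation for the
# uniform policy above: V(s) = (1/|A(s)|) * sum_a [ r + gamma * V(s') ].
# It assumes the same grid interface (actions dict, set_state, move,
# current_state as a property).
def bellman_residual(grid, V, gamma=1.0):
    worst = 0
    for s, actions in grid.actions.items():
        expected = 0
        for a in actions:
            grid.set_state(s)
            r = grid.move(a)
            expected += (r + gamma * V[grid.current_state]) / len(actions)
        worst = max(worst, abs(V[s] - expected))
    return worst  # should be ~0 once iterative policy evaluation has converged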
def main(grid_type='negative'):
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # STEP 1: randomly initialize the value function, V(s):
    V = {}  # the values
    for s in states:
        # as an option, initialize to 0:
        # V[s] = 0
        # check if not a terminal state:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    # STEP 2: value iteration
    while True:
        max_change = 0
        for s in states:
            old_v = V[s]
            # if we're not in a terminal state:
            if s in grid.actions:
                # choose an action that results in the maximum value
                # for this state:
                best_v = float('-inf')
                # best_a = np.random.choice(ALL_POSSIBLE_ACTIONS)
                for a in ALL_POSSIBLE_ACTIONS:
                    # arrive in the state:
                    grid.set_state(s)
                    # take the action and receive the reward:
                    r = grid.move(a)
                    # calculate the Bellman equation:
                    v = r + GAMMA * V[grid.current_state]
                    if v > best_v:
                        best_v = v
                        # p[s] = a  # we'll do it in another loop later
                # update the value of this state:
                V[s] = best_v
                # update the maximum change:
                max_change = max(max_change, np.abs(old_v - V[s]))
        # check if converged:
        if max_change < THRESHOLD:
            break

    # STEP 3: take our optimal value function
    # and find the optimal policy
    p = {}  # the policy
    for s in states:
        best_a = None
        best_v = float('-inf')
        # if not a terminal state:
        if s in grid.actions:
            # find the best action:
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state]
                if v > best_v:
                    best_v = v
                    best_a = a
            p[s] = best_a

    # optimal values:
    print('\noptimal values:')
    print_values(V, grid)

    # optimal policy:
    print('\noptimal policy:')
    print_policy(p, grid)
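

# A minimal refactoring sketch (not part of the original file): one sweep of
# the value-iteration backup above, V(s) <- max_a [ r + gamma * V(s') ].
# Unlike policy iteration, the max over actions is folded directly into the
# update; the function returns the largest change so the caller can test
# for convergence.
def value_iteration_sweep(grid, V, actions, gamma):
    max_change = 0
    for s in grid.all_states:
        if s not in grid.actions:
            continue  # terminal states keep their value
        old_v = V[s]
        best_v = float('-inf')
        for a in actions:
            grid.set_state(s)
            r = grid.move(a)
            best_v = max(best_v, r + gamma * V[grid.current_state])
        V[s] = best_v
        max_change = max(max_change, abs(old_v - V[s]))
    return max_change
# usage, under the same assumptions:
# while value_iteration_sweep(grid, V, ALL_POSSIBLE_ACTIONS, GAMMA) >= THRESHOLD:
#     pass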
""" This file comtains the code implrmentation of the Policy Iteration Algorithm """ import numpy as np from gridworld import standard_grid, ACTION_SPACE # from IterativePolicyEvaluation_probabilistic import print_in_gridworld, print_policy grid = standard_grid() # initializing the grid gamma = 0.9 tol = 1e-3 def print_in_gridworld(v): ''' Function to print the values in the gridworld ''' # making the grid list out = [] for i in range(7): if i % 2 != 0: inv = [] for j in range(9): if j % 2 == 0: inv.append("|") else: if (i // 2, j // 2) == (1, 1): inv.append(0.00) else: _ = v[(i // 2, j // 2)] # _ = round(_, 2)