        return self.sa2x(s, a)


def getPolicyQ(model, s):
    # we need Q(s,a) to choose an action
    # i.e. a = argmax[a]{ Q(s,a) }
    policyQ = {}
    for a in allActions:
        q = model.predict(s, a)
        policyQ[a] = q
    return policyQ


if __name__ == '__main__':
    grid = negativeGrid(stepCost=-0.1)

    # initialize model
    model = Model()

    # repeat until convergence
    t = 1.0
    t2 = 1.0
    deltas = []
    for i in range(20000):
        if i % 100 == 0:
            t += 10e-3
            t2 += 0.01
        alpha = learningRate / t2

        s = (2, 0)
        grid.setState(s)
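# The training loop above will need to pick the greedy action from the Q-values
# returned by getPolicyQ, i.e. a = argmax[a]{ Q(s,a) }. A minimal sketch of such a
# helper follows; the name maxDict is not defined anywhere in this listing and is
# only an assumption used for illustration.
def maxDict(d):
    # return the (key, value) pair with the largest value in the dictionary
    maxKey, maxVal = None, float('-inf')
    for k, v in d.items():
        if v > maxVal:
            maxKey, maxVal = k, v
    return maxKey, maxVal

# example usage with the dictionary produced by getPolicyQ:
#   policyQ = getPolicyQ(model, s)
#   a, maxQ = maxDict(policyQ)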
    grid.setState(s)
    statesRewardsList.append((s, 0))

    # traverse the grid till we reach the terminal state
    while not grid.isGameOver():
        oldS = s
        a = randomAction(policy[s])
        r = grid.move(a)
        s = grid.getCurrentState()
        # TD(0) update: V(oldS) <- V(oldS) + alpha * (r + gamma * V(s) - V(oldS))
        valueF[oldS] += alpha * (r + gamma * valueF[s] - valueF[oldS])
    return valueF


if __name__ == '__main__':
    grid = negativeGrid()

    valueF = {}
    policy = {}
    for s in grid.allStates():
        valueF[s] = 0

    # state -> action
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
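# The TD(0) loop above calls a randomAction helper that is not shown in this
# fragment. Below is a minimal sketch assuming epsilon-soft exploration; the exact
# signature and the eps value of 0.1 are assumptions made for illustration only.
import numpy as np

def randomAction(a, eps=0.1, allActions=('U', 'D', 'L', 'R')):
    # keep the policy's suggested action with probability 1 - eps (plus eps/4 from
    # the uniform branch below), otherwise pick uniformly among all four actions
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(allActions)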
import numpy as np
from grid_world import standardGrid, negativeGrid
from iterative_policy_evaluation import printValues, printPolicy

# This refers to the windy grid problem: let's assume a wind is blowing across the
# cells, and therefore we are not sure that our agent will actually move where it
# intends to.
# The case taken here is that if our agent wants to move in, say, direction L, it can
# do so only with a probability of 0.5, and it moves in any other direction with an
# equal probability.
# (a sketch of the resulting expected one-step backup is given after this fragment)

if __name__ == '__main__':
    smallEnough = 10e-4
    gamma = 0.9
    grid = negativeGrid(stepCost=-1.0)

    states = grid.allStates()
    allActions = ('U', 'D', 'L', 'R')

    valueF = {}
    policy = {}
    for s in states:
        valueF[s] = 0

    # initial policy: a random action for every state that has available actions
    for s in grid.actions.keys():
        policy[s] = np.random.choice(allActions)

    while True:
        # policy evaluation step
        while True:
            largest = 0
            for s in states:
                if s in policy:
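# For the windy-grid dynamics described above (the intended action happens with
# probability 0.5, each of the other three actions with probability 0.5/3), the
# evaluation step has to average the one-step return over what actually happens.
# The helper below is only a sketch of that expectation, assuming the same grid API
# used elsewhere in this listing (setState, move, getCurrentState); it is not the
# exact body of the loop above.
def expectedValue(grid, valueF, s, intendedA, gamma, allActions=('U', 'D', 'L', 'R')):
    # sum over actual actions a of p(a | intendedA) * (r + gamma * V(s'))
    v = 0
    for a in allActions:
        p = 0.5 if a == intendedA else 0.5 / 3
        grid.setState(s)
        r = grid.move(a)
        s2 = grid.getCurrentState()
        v += p * (r + gamma * valueF[s2])
    return v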