import numpy as np


class TDL_solution:
    # Temporal-difference solutions for a 5x5 GridWorld (GridWorld is
    # assumed to be defined elsewhere in this project): episode-based TD
    # updates plus SARSA-style and Q-learning-style per-step updates.

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()
        self.alpha = 0.1  # learning rate
        self.gamma = 0.9  # discount factor

    def playTDLGame(self, startSquare, randomMove):
        # Play one episode under the current policy, deviating to a random
        # other move with probability (1 - randomMove), and record the
        # (square, return) pairs visited.
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare, 0)]
        while keepPlaying:
            # policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i, j))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append((self.game.currentSquare, theReturn))
            keepPlaying = not self.game.gameOver()
        # Accumulate discounted returns backwards through the episode.
        # The resulting list is in reverse chronological order (terminal
        # square first).
        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G
        #self.squares_and_values.reverse()

    def playSarsa(self, startSquare, randomMove):
        # On-policy TD update after every step:
        # V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)),
        # where s' is the square actually reached.
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        while keepPlaying:
            # policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1, j1))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha * (
                theReturn + self.gamma * self.game.valueGrid[i2][j2]
                - self.game.valueGrid[i1][j1])
            keepPlaying = not self.game.gameOver()

    def playQLearning(self, startSquare, randomMove):
        # Off-policy TD update: bootstrap from the square the *greedy* move
        # would reach, even when the exploration step below overrides it.
        # Move encoding (as in DP_Solution): 0 = up, 1 = right, 2 = down,
        # 3 = left.
        deltas = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        while keepPlaying:
            # policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
            # we use the best move even if random runs over it:
            # (i3, j3) is the greedy successor square
            di, dj = deltas[move]
            i3 = i1 + di
            j3 = j1 + dj
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1, j1))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha * (
                theReturn + self.gamma * self.game.valueGrid[i3][j3]
                - self.game.valueGrid[i1][j1])
            keepPlaying = not self.game.gameOver()

    def updateValueGrid(self):
        # TD-style update over consecutive entries of the recorded episode;
        # note that squares_and_values is stored terminal-first (see
        # playTDLGame).
        for t in range(len(self.squares_and_values) - 1):
            square, _ = self.squares_and_values[t]
            nextSquare, value = self.squares_and_values[t + 1]
            i1 = square[0]
            j1 = square[1]
            i2 = nextSquare[0]
            j2 = nextSquare[1]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha * (
                value + self.gamma * self.game.valueGrid[i2][j2]
                - self.game.valueGrid[i1][j1])

    def updatePolicyGrid(self):
        # Greedy policy improvement; returns True if any square's move
        # changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
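# Illustrative usage sketch (not part of the original code): alternating
# SARSA episodes with greedy policy improvement.  The start square, episode
# count, and exploration setting are hypothetical choices; note that
# randomMove is the probability of *following* the policy, so exploration
# happens with probability 1 - randomMove.
def demo_tdl_solution():
    solver = TDL_solution()
    for episode in range(500):
        solver.playSarsa(startSquare=(0, 0), randomMove=0.9)
        solver.updatePolicyGrid()
    solver.printGrids()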
class DP_Solution:
    # Dynamic-programming (policy iteration) solution for the 5x5
    # GridWorld, with a deterministic and a "windy" (stochastic) variant.

    def __init__(self, gamma, lower_limit):
        self.game = GridWorld((5, 5))
        self.gamma = gamma
        self.lower_limit = lower_limit

    def updateValueGrid(self):
        # One sweep of deterministic policy evaluation.  Move encoding:
        # 0 = up, 1 = right, 2 = down, 3 = left.  Note that this backup
        # discounts both the immediate return and the successor value.
        deltas = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}
        rows = self.game.size[0]
        cols = self.game.size[1]
        for i in range(rows):
            for j in range(cols):
                move = self.game.policyGrid[i][j]
                if move in [0, 1, 2, 3]:
                    di, dj = deltas[move]
                    theReturn = self.game.returnGrid[i + di][j + dj]
                    self.game.valueGrid[i][j] = self.gamma * (
                        theReturn + self.game.valueGrid[i + di][j + dj])

    def updateValueGridWindy(self, successRate=0.75):
        # Policy evaluation with a stochastic transition model: the chosen
        # move succeeds with probability successRate; otherwise one of the
        # other possible moves is taken uniformly at random.
        deltas = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}
        rows = self.game.size[0]
        cols = self.game.size[1]
        for i in range(rows):
            for j in range(cols):
                possibleMoves = self.game.possibleMoves((i, j))
                nrOfWrongMoves = len(possibleMoves) - 1
                chosenMove = self.game.policyGrid[i][j]
                if not self.game.policyGrid[i][j] in [-1, 9]:
                    self.game.valueGrid[i][j] = 0
                    for move in possibleMoves:
                        if move == chosenMove:
                            p = successRate
                        elif nrOfWrongMoves != 0:
                            p = (1 - successRate) / nrOfWrongMoves
                        else:
                            p = 0  # shouldn't happen: the chosen move is always possible
                        di, dj = deltas[move]
                        theReturn = self.game.returnGrid[i + di][j + dj]
                        self.game.valueGrid[i][j] += p * self.gamma * (
                            theReturn + self.game.valueGrid[i + di][j + dj])

    def updatePolicyGrid(self):
        # Greedy policy improvement; returns True if any square's move
        # changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def updatePolicyGridWindy(self):
        # Policy improvement is identical in the windy case.
        return self.updatePolicyGrid()

    def updateUntilConvergence(self):
        # Alternate policy improvement and windy policy evaluation until
        # the policy stops changing, with a safety cap on the iteration
        # count.
        change = True
        count = 0
        while change:
            change = self.updatePolicyGrid()
            self.updateValueGridWindy()
            count += 1
            if count % 1000 == 0:
                print("count: " + str(count))
            if count > 10000:
                print("didn't converge")
                break

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
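# Illustrative usage sketch (not part of the original code): policy
# iteration on the windy grid.  The gamma and lower_limit arguments are
# hypothetical values; lower_limit is stored but not used by the methods
# shown here.
def demo_dp_solution():
    solver = DP_Solution(gamma=0.9, lower_limit=-100)
    solver.updateUntilConvergence()
    solver.printGrids()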
class MC_solution:
    # Tabular first-visit Monte Carlo solution for the 5x5 GridWorld.

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()

    def playMCGame(self, startSquare, randomMove):
        # Play one episode under the current policy, deviating to a random
        # other move with probability (1 - randomMove), and compute the
        # discounted return at every visited square.
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare, 0)]
        while keepPlaying:
            # policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i, j))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append((self.game.currentSquare, theReturn))
            keepPlaying = not self.game.gameOver()
        # Accumulate discounted returns backwards; the resulting list is in
        # reverse chronological order (terminal square first).
        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G

    def updateValueGrid(self):
        # First-visit Monte Carlo: incrementally average the returns seen
        # at each square.  squares_and_values is stored terminal-first, so
        # iterate it reversed to walk the episode chronologically and keep
        # the return of the *first* visit to each square.
        visitedSquares = set()
        for square, G in reversed(self.squares_and_values):
            if square not in visitedSquares:
                visitedSquares.add(square)
                i = square[0]
                j = square[1]
                self.squareCountGrid[i][j] += 1
                self.game.valueGrid[i][j] = self.game.valueGrid[i][j] + (
                    G - self.game.valueGrid[i][j]) / self.squareCountGrid[i][j]

    def updatePolicyGrid(self):
        # Greedy policy improvement; returns True if any square's move
        # changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
        print(self.squareCountGrid)
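# Illustrative usage sketch (not part of the original code): first-visit
# Monte Carlo policy iteration.  Start square, episode count, and
# exploration setting are hypothetical choices.
def demo_mc_solution():
    solver = MC_solution()
    for episode in range(1000):
        solver.playMCGame(startSquare=(0, 0), randomMove=0.9)
        solver.updateValueGrid()
        solver.updatePolicyGrid()
    solver.printGrids()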
class MC_Aprox_Solution:
    # Monte Carlo with linear function approximation: V(s) is modelled as
    # theta . x(s) for a hand-crafted feature vector x(s).

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.learning_rate = 0.001
        self.theta = np.random.randn(4) / 2

    def s2x(self, square):
        # Feature vector: (roughly centred) row, column, a row*column
        # interaction term, and a bias term.
        return np.array(
            [square[0] - 1, square[1] - 1.5, square[0] * square[1] - 3, 1])

    def playMCGame(self, startSquare, randomMove):
        # Same episode generation as MC_solution, but abort and return
        # False if the episode exceeds 2000 steps.
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare, 0)]
        counter = 0
        while keepPlaying:
            counter += 1
            if counter > 2000:
                return False
            # policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i, j))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append((self.game.currentSquare, theReturn))
            keepPlaying = not self.game.gameOver()
        # Accumulate discounted returns backwards; the list ends up in
        # reverse chronological order.
        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G
        return True

    def updateValueGrid(self, t):
        # Gradient Monte Carlo update of theta with a 1/(t+1)-decaying step
        # size, then refresh the tabular valueGrid from the approximation.
        # As in MC_solution, iterate reversed so first visits are used.
        visitedSquares = set()
        alpha = self.learning_rate / (t + 1)
        for square, G in reversed(self.squares_and_values):
            if square not in visitedSquares:
                visitedSquares.add(square)
                x = self.s2x(square)
                V_hat = self.theta.dot(x)
                self.theta += alpha * (G - V_hat) * x
        rows = self.game.size[0]
        cols = self.game.size[1]
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.valueGrid[i][j] = self.theta.dot(self.s2x((i, j)))

    def updatePolicyGrid(self):
        # Greedy policy improvement; returns True if any square's move
        # changed.
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
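# Illustrative usage sketch (not part of the original code): training the
# linear value approximation across episodes; only episodes that terminate
# within the step cap contribute an update.  All constants here are
# hypothetical choices.
def demo_mc_approx_solution():
    solver = MC_Aprox_Solution()
    for t in range(2000):
        if solver.playMCGame(startSquare=(0, 0), randomMove=0.9):
            solver.updateValueGrid(t)
            solver.updatePolicyGrid()
    solver.printGrids()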