# Example #1
# 0
class TDL_solution:
    """Temporal-difference style value learning on a 5x5 ``GridWorld``.

    The learner keeps no tables of its own: it reads ``policyGrid`` and
    ``returnGrid`` from ``self.game`` and writes its estimates into
    ``self.game.valueGrid``.
    """

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()
        # Step size used by every value update in this class.
        self.alpha = 0.1
        # Discount applied to the bootstrapped value in the TD updates.
        self.gamma = 0.9

    def _chooseMove(self, square, randomMove):
        """Return the move to play from ``square``.

        The policy move is used unless ``randomMove < np.random.rand()``, in
        which case a uniformly random *other* legal move is drawn (if there is
        one).  ``randomMove`` is therefore the probability of following the
        policy, not the probability of exploring.

        The list returned by ``possibleMoves`` is filtered into a new list, so
        the game's own data is never mutated and a policy move that is not
        among the legal moves does not raise.
        """
        i = square[0]
        j = square[1]
        move = self.game.policyGrid[i][j]

        if randomMove < np.random.rand():
            alternatives = [
                m for m in self.game.possibleMoves((i, j)) if m != move
            ]
            if alternatives:
                move = alternatives[np.random.randint(0, len(alternatives))]
        return move

    def playTDLGame(self, startSquare, randomMove):
        """Play one episode from ``startSquare`` and record discounted returns.

        The result is stored in ``self.squares_and_values`` as
        ``(square, G)`` pairs in *reverse* time order (final square first),
        where ``G`` is the discounted sum of the returns collected after
        leaving ``square``.
        """
        self.game.currentSquare = startSquare
        squares_and_returns = [(self.game.currentSquare, 0)]

        while not self.game.gameOver():
            self.game.move(self._chooseMove(self.game.currentSquare, randomMove))
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            squares_and_returns.append(
                (self.game.currentSquare, self.game.returnGrid[i][j]))

        # NOTE(review): this discounts with the game's gamma while the TD
        # updates below use self.gamma -- confirm the two are meant to agree.
        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G

    def playSarsa(self, startSquare, randomMove):
        """Play one episode, updating ``valueGrid`` online after every move.

        Despite the name, the update is applied to state values: the value of
        the square that was left is moved towards
        ``return + gamma * value(square reached)``.
        """
        self.game.currentSquare = startSquare

        while not self.game.gameOver():
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]

            self.game.move(self._chooseMove((i1, j1), randomMove))

            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            values = self.game.valueGrid
            values[i1][j1] = values[i1][j1] + self.alpha * (
                theReturn + self.gamma * values[i2][j2] - values[i1][j1])

    def playQLearning(self, startSquare, randomMove):
        """Play one episode with an off-policy flavoured value update.

        NOTE(review): the bootstrap square is read *before* the move is made,
        so it is always the square being updated.  The intent appears to be
        "bootstrap from where the policy move would have led, even when a
        random move was played"; doing that needs a way to look up the
        successor of a move without playing it -- confirm against GridWorld.
        """
        self.game.currentSquare = startSquare

        while not self.game.gameOver():
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]

            # Bootstrap square (see NOTE above): identical to (i1, j1).
            i3 = i1
            j3 = j1

            self.game.move(self._chooseMove((i1, j1), randomMove))

            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            values = self.game.valueGrid
            values[i1][j1] = values[i1][j1] + self.alpha * (
                theReturn + self.gamma * values[i3][j3] - values[i1][j1])

    def updateValueGrid(self):
        """Apply a TD-style update for each adjacent pair of the last episode.

        Uses ``self.squares_and_values`` as produced by ``playTDLGame``.

        NOTE(review): that list is in reverse time order, so ``square`` is the
        later square and ``nextSquare`` the earlier one -- confirm this
        pairing is intended.
        """
        values = self.game.valueGrid
        for t in range(len(self.squares_and_values) - 1):
            square, _ = self.squares_and_values[t]
            nextSquare, value = self.squares_and_values[t + 1]
            i1 = square[0]
            j1 = square[1]
            i2 = nextSquare[0]
            j2 = nextSquare[1]
            values[i1][j1] = values[i1][j1] + self.alpha * (
                value + self.gamma * values[i2][j2] - values[i1][j1])

    def updatePolicyGrid(self):
        """Replace every move entry of the policy with ``game.bestMove()``.

        Only squares whose policy entry is one of the moves 0-3 are touched.
        ``self.game.currentSquare`` is overwritten while scanning.

        Returns:
            True if at least one policy entry changed, otherwise False.
        """
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                oldMove = self.game.policyGrid[i][j]
                if oldMove in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        """Print the policy, return and value grids of the game."""
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
# Example #2
# 0
class MC_solution:
    """Monte Carlo value estimation on a 5x5 ``GridWorld``.

    Episodes are played with ``playMCGame``; ``updateValueGrid`` then folds
    the recorded returns into ``self.game.valueGrid`` as running averages,
    using ``self.squareCountGrid`` as the per-square sample count.
    """

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()

    def _chooseMove(self, square, randomMove):
        """Return the move to play from ``square``.

        The policy move is used unless ``randomMove < np.random.rand()``, in
        which case a uniformly random *other* legal move is drawn (if there is
        one).  The list returned by ``possibleMoves`` is filtered into a new
        list, so it is never mutated and a policy move that is not among the
        legal moves does not raise.
        """
        i = square[0]
        j = square[1]
        move = self.game.policyGrid[i][j]

        if randomMove < np.random.rand():
            alternatives = [
                m for m in self.game.possibleMoves((i, j)) if m != move
            ]
            if alternatives:
                move = alternatives[np.random.randint(0, len(alternatives))]
        return move

    def playMCGame(self, startSquare, randomMove):
        """Play one episode from ``startSquare`` and record discounted returns.

        The result is stored in ``self.squares_and_values`` as
        ``(square, G)`` pairs in *reverse* time order (final square first),
        where ``G`` is the discounted sum (``self.game.gamma``) of the returns
        collected after leaving ``square``.
        """
        self.game.currentSquare = startSquare
        squares_and_returns = [(self.game.currentSquare, 0)]

        while not self.game.gameOver():
            self.game.move(self._chooseMove(self.game.currentSquare, randomMove))
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            squares_and_returns.append(
                (self.game.currentSquare, self.game.returnGrid[i][j]))

        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G

    def updateValueGrid(self):
        """Average the last episode's returns into the value grid.

        Each distinct square contributes once per episode.  Because
        ``self.squares_and_values`` is in reverse time order, the entry that
        is used for a square visited several times is its *last* visit in the
        episode.
        """
        visitedSquares = set()

        for square, G in self.squares_and_values:
            if square in visitedSquares:
                continue
            visitedSquares.add(square)
            i = square[0]
            j = square[1]
            self.squareCountGrid[i][j] += 1
            # Incremental mean: V += (G - V) / n
            self.game.valueGrid[i][j] = self.game.valueGrid[i][j] + (
                G - self.game.valueGrid[i][j]) / self.squareCountGrid[i][j]

    def updatePolicyGrid(self):
        """Replace every move entry of the policy with ``game.bestMove()``.

        Only squares whose policy entry is one of the moves 0-3 are touched.
        ``self.game.currentSquare`` is overwritten while scanning.

        Returns:
            True if at least one policy entry changed, otherwise False.
        """
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                oldMove = self.game.policyGrid[i][j]
                if oldMove in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        """Print the policy, return and value grids plus the visit counts."""
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
        print(self.squareCountGrid)
class MazeRunner:
    """pygame front end for playing a ``GridWorld`` maze with the arrow keys.

    The pygame module is injected through the constructor and every pygame
    call goes through ``self.pygame``.  The window is split into a maze area
    (``MAX_X`` x ``MAX_Y`` pixels) and an info column (``INFO_X`` pixels wide)
    on the right.  All tiles are 32x32 pixels.

    NOTE(review): the text helpers use ``self.pygame.freetype``; this assumes
    the caller has imported ``pygame.freetype`` -- confirm at the call site.
    """

    def __init__(self, pygameIn):
        """Initialise pygame, open the window and load the tile images.

        Args:
            pygameIn: the imported ``pygame`` module.
        """
        self.pygame = pygameIn
        self.pygame.init()

        # Move codes passed to GridWorld.move().
        self.UP = 0
        self.RIGHT = 1
        self.DOWN = 2
        self.LEFT = 3

        # Window geometry in pixels.
        self.MAX_Y = 20 * 32
        self.MAX_X = 16 * 32
        self.INFO_X = 6 * 32
        self.INFO_Y = self.MAX_Y

        self.stepSize = 32
        self.leftWall = 0
        self.upperWall = 0

        # Screen and background.
        logo = self.pygame.image.load("unicorn32.bmp")
        self.pygame.display.set_icon(logo)
        self.pygame.display.set_caption("Maze Runner")
        self.screen = self.pygame.display.set_mode(
            (self.MAX_X + self.INFO_X, self.MAX_Y))

        self.score = 0

        self.BLACK = (0, 0, 0)
        self.WHITE = (255, 255, 255)

        # Menu entries by their position in the menu; position 0 is the title.
        self.menuDict = {'Play': 1, 'DP': 2, 'MC': 3, 'Exit': 4}
        self.mazeDict = {'small': 1, 'medium': 2, 'large': 3, 'Exit': 4}

        # Set once the user closes the window; pygame is shut down by then,
        # so no further drawing may happen.
        self._windowClosed = False

        self.loadImages()

    def loadImages(self):
        """Load the 32x32 tile bitmaps from the working directory."""
        load = self.pygame.image.load
        self.unicornImage = load("unicorn32.bmp")
        self.rainbowImage = load("rainbow32.bmp")
        self.wallImage = load("brick32.bmp")
        self.hellImage = load("hell32.bmp")
        self.appleImage = load("apple32.bmp")
        self.bombImage = load("bomb32.bmp")

    def drawBorder(self):
        """Draw the white frame around the maze and flip the display."""
        FRAME = 8
        color = (255, 255, 255)

        left = self.START_X - FRAME
        right = self.START_X + self.MAZE_X + FRAME
        top = self.START_Y - FRAME
        bottom = self.START_Y + self.MAZE_Y + FRAME

        # The top and left edges start 2px early so the corner is closed.
        edges = (
            ((left - 2, top), (right, top)),      # top
            ((left, top - 2), (left, bottom)),    # left
            ((right, top), (right, bottom)),      # right
            ((left, bottom), (right, bottom)),    # bottom
        )
        for start, end in edges:
            self.pygame.draw.line(self.screen, color, start, end, FRAME)
        self.pygame.display.flip()

    def placeTokens(self):
        """Draw one tile per maze square, then the unicorn, and flip.

        A square shows at most one tile:
          * policy -1          -> wall
          * policy 9, return>0 -> rainbow (good exit)
          * policy 9, return<0 -> hell (bad exit)
          * otherwise, return above / below ``returnGridValue``
                               -> apple / bomb
        """
        rows = self.GridWorldGame.size[0]
        cols = self.GridWorldGame.size[1]
        stepReturn = self.GridWorldGame.returnGridValue

        for i in range(rows):
            for j in range(cols):
                policy = self.GridWorldGame.policyGrid[i][j]
                squareReturn = self.GridWorldGame.returnGrid[i][j]

                image = None
                if policy == -1:
                    image = self.wallImage
                elif policy == 9:
                    if squareReturn > 0:
                        image = self.rainbowImage
                    elif squareReturn < 0:
                        image = self.hellImage
                elif squareReturn > stepReturn:
                    image = self.appleImage
                elif squareReturn < stepReturn:
                    image = self.bombImage

                if image is not None:
                    x = self.START_X + j * 32
                    y = self.START_Y + i * 32
                    self.screen.blit(image, (x, y))

        self.screen.blit(self.unicornImage, self.smileyPos)
        self.pygame.display.flip()

    def run(self):
        """Show the main menu until the user exits, then shut pygame down."""
        self.mainMenu()
        # Safe even if the window was already closed.
        self.pygame.quit()

    def play(self):
        """Run the interactive game loop.

        Arrow keys move the unicorn; the loop polls every 100 ms.

        Returns:
            True when the game ended normally, False when the window was
            closed (pygame has been shut down in that case).
        """
        controls = (
            (self.pygame.K_LEFT, self.LEFT),
            (self.pygame.K_UP, self.UP),
            (self.pygame.K_RIGHT, self.RIGHT),
            (self.pygame.K_DOWN, self.DOWN),
        )

        while True:
            self.pygame.time.delay(100)
            self.pygame.event.pump()
            key = self.pygame.key.get_pressed()

            for keyCode, direction in controls:
                if key[keyCode]:
                    self.move(direction)
                    self.printScore()

            if self.GridWorldGame.gameOver():
                return True

            if self.quitting():
                return False

    def mazeMenu(self):
        """Draw the maze selection menu and let the user choose.

        Returns:
            True if a maze was created, False on 'Back' or window close.
        """
        self.clearScreen()

        self.menuItems = 0
        self.menuItemsPos = []
        self.menuItemIdx = 1
        self.printText('Choose Maze')
        self.printText('Small Maze')
        self.printText('Medium')
        self.printText('Large')
        self.printText('Back')

        self.pygame.display.flip()

        self.pygame.time.delay(100)
        return self.chooseMaze()

    def _navigateMenu(self, idx, key):
        """Move the unicorn cursor for UP/DOWN presses; return the new index.

        Index 0 is the menu title and is never selectable.
        """
        newIdx = idx
        if key[self.pygame.K_UP] and newIdx > 1:
            newIdx -= 1
        if key[self.pygame.K_DOWN] and newIdx < (self.menuItems - 1):
            newIdx += 1

        if newIdx != idx:
            oldX, oldY = self.menuItemsPos[idx]
            self.pygame.draw.rect(self.screen, self.BLACK,
                                  (oldX, oldY, 32, 32))
            self.screen.blit(self.unicornImage, self.menuItemsPos[newIdx])
        return newIdx

    def chooseMaze(self):
        """Handle input for the maze menu.

        Returns:
            True once a maze has been created, False on 'Back' or when the
            window was closed.
        """
        idx = 1
        self.screen.blit(self.unicornImage, self.menuItemsPos[idx])

        while True:
            self.pygame.time.delay(100)

            key = self.pygame.key.get_pressed()
            self.pygame.event.pump()

            idx = self._navigateMenu(idx, key)

            if key[self.pygame.K_RETURN]:
                if idx == self.mazeDict['small']:
                    self.createSmallMaze()
                    return True
                if idx == self.mazeDict['Exit']:
                    return False
                # 'medium' and 'large' have no maze yet; reporting success
                # without creating one would start a game with no (or a
                # stale) maze, so the selection is ignored.

            self.pygame.display.flip()
            if self.quitting():
                return False

    def createSmallMaze(self):
        """Create the 5x5 maze and centre it in the maze area."""
        # should be GridWorldSmall()
        self.GridWorldGame = GridWorld((5, 5))
        rows = self.GridWorldGame.size[0]
        cols = self.GridWorldGame.size[1]
        self.MAZE_X = cols * 32
        self.MAZE_Y = rows * 32
        FRAME = 8
        # Integer division keeps pixel coordinates integral.
        self.START_X = (self.MAX_X - self.MAZE_X) // 2 + FRAME
        self.START_Y = (self.MAX_Y - self.MAZE_Y) // 2 + FRAME
        self.smileyPos = (self.START_X, self.START_Y)

    def dynamicProgramming(self):
        """Placeholder for the 'Dynamic Programming' menu entry."""
        pass

    def setupGame(self):
        """Draw the maze and info column and reset the game state."""
        self.clearScreen()
        self.drawBorder()
        self.placeTokens()
        self.setupRightArea()
        self.GridWorldGame.returnCount = 0
        self.GridWorldGame.currentSquare = (0, 0)
        self.printScore()

    def setupRightArea(self):
        """Draw the info column: separator, 'Score' heading and tile legend."""
        FRAME = 4
        white = (255, 255, 255)
        black = (0, 0, 0)

        # Vertical separator between maze area and info column.
        self.pygame.draw.line(self.screen, white, (self.MAX_X, 0),
                              (self.MAX_X, self.MAX_Y), FRAME)

        fontScore = self.pygame.freetype.Font('freesansbold.ttf', 32)
        (textScore, _) = fontScore.render("Score", white, black)
        self.screen.blit(textScore, [self.MAX_X + 64, 64, 0, 0])

        # Legend rows: label, tile image, y position.
        legend = (
            ("Collect: ", self.appleImage, 192),
            ("Avoid: ", self.bombImage, 256),
            ("Good Exit: ", self.rainbowImage, 320),
            ("Bad Exit: ", self.hellImage, 384),
            ("Just a wall: ", self.wallImage, 448),
        )
        adjustY = 12  # lifts the tile so it lines up with the text
        x1 = self.MAX_X + 8
        for label, image, y1 in legend:
            lengthOfText = self.printTextRightArea(24, label, x1, y1)
            self.screen.blit(image, (x1 + lengthOfText, y1 - adjustY))

        self.pygame.display.flip()

    def printTextRightArea(self, fontSize, text, x, y):
        """Render ``text`` at ``(x, y)`` and return its rendered length.

        The length is computed from the rectangle returned by the font's
        ``render`` call, before the text is positioned.
        """
        fontToken = self.pygame.freetype.Font('freesansbold.ttf', fontSize)

        (textCollect,
         textposCollect) = fontToken.render(text, self.WHITE, self.BLACK)
        lengthOfCollect = textposCollect[2] - textposCollect[0]
        self.screen.blit(textCollect, [x, y, 0, 0])

        return lengthOfCollect

    def printScore(self):
        """Redraw the current ``returnCount`` in the info column and flip."""
        score = str(self.GridWorldGame.returnCount)

        x1 = self.MAX_X + self.INFO_X // 2
        y1 = 3 * 32
        # Erase the previous score before drawing the new one.
        self.pygame.draw.rect(self.screen, self.BLACK, (x1, y1, 32, 32))

        fontScore = self.pygame.freetype.Font('freesansbold.ttf', 32)
        (textScore, _) = fontScore.render(score, self.WHITE, self.BLACK)
        self.screen.blit(textScore, [x1, y1, 0, 0])

        self.pygame.display.flip()

    def move(self, direction):
        """Apply ``direction`` to the game and redraw the unicorn.

        The old square is blanked and the unicorn is drawn on whatever square
        the game reports afterwards.
        """
        oldSquare = self.GridWorldGame.currentSquare
        x1 = oldSquare[1] * 32 + self.START_X
        y1 = oldSquare[0] * 32 + self.START_Y

        self.GridWorldGame.move(direction)

        newSquare = self.GridWorldGame.currentSquare
        x2 = newSquare[1] * 32 + self.START_X
        y2 = newSquare[0] * 32 + self.START_Y

        self.pygame.draw.rect(self.screen, self.BLACK, (x1, y1, 32, 32))
        self.screen.blit(self.unicornImage, (x2, y2))
        self.pygame.display.flip()

    def mainMenu(self):
        """Redraw and run the main menu until the user exits."""
        # https://www.programcreek.com/python/example/93421/pygame.freetype
        running = True
        while running:
            self.clearScreen()

            self.menuItems = 0
            self.menuItemsPos = []
            self.menuItemIdx = 1

            self.printText('Main menu')
            self.printText('Play game')
            self.printText('Dynamic Programming')
            self.printText('Monte Carlo')
            self.printText('Exit')

            self.pygame.display.flip()

            self.pygame.time.delay(100)
            running = self.choseFromMenu()

    def printText(self, text):
        """Append a horizontally centred menu line.

        The first line printed after ``menuItems`` was reset is the title
        (48pt); later lines are entries (24pt), 48 pixels apart.  The cursor
        position for the line (32px left of the text) is appended to
        ``menuItemsPos``.
        """
        if self.menuItems == 0:
            fontSize = 48
            startY = 32
        else:
            fontSize = 24
            startY = 32 + self.menuItems * 48

        fontMenu = self.pygame.freetype.Font('freesansbold.ttf', fontSize)
        (textMenu, textposMenu) = fontMenu.render(text, self.WHITE,
                                                  self.BLACK)
        lengthOfText = textposMenu[2] - textposMenu[0]
        textposMenu[0] = (self.MAX_X + self.INFO_X - lengthOfText) // 2
        textposMenu[1] = startY
        textposMenu[2] = textposMenu[2] + lengthOfText
        textposMenu[3] = textposMenu[1] + 32

        self.menuItemsPos.append((textposMenu[0] - 32, startY))

        self.screen.blit(textMenu, textposMenu)

        self.menuItems += 1

    def choseFromMenu(self):
        """Handle input for the main menu.

        Returns:
            True if the main menu should be shown again, False if the
            program should end ('Exit' chosen or window closed).
        """
        idx = 1
        self.screen.blit(self.unicornImage, self.menuItemsPos[idx])

        while True:
            self.pygame.time.delay(100)

            key = self.pygame.key.get_pressed()
            self.pygame.event.pump()

            idx = self._navigateMenu(idx, key)

            if key[self.pygame.K_RETURN]:
                if idx == self.menuDict['Play']:
                    if self.mazeMenu():
                        self.setupGame()
                        if not self.play():
                            return False
                        return self.endScreen()
                    # 'Back' returns to the menu; a closed window ends it.
                    return not self._windowClosed
                if idx == self.menuDict['DP']:
                    self.dynamicProgramming()
                    return True
                if idx == self.menuDict['Exit']:
                    return False

            self.pygame.display.flip()
            if self.quitting():
                return False

    def quitting(self):
        """Drain the event queue; shut pygame down on a QUIT event.

        Returns:
            True if the window was closed, otherwise False.
        """
        for event in self.pygame.event.get():
            if event.type == self.pygame.QUIT:
                self._windowClosed = True
                self.pygame.quit()
                return True
        return False

    def clearScreen(self):
        """Paint the whole window black (without flipping the display)."""
        self.pygame.draw.rect(self.screen, self.BLACK,
                              (0, 0, self.MAX_X + self.INFO_X, self.MAX_Y))

    def _blitCentered(self, font, text, y):
        """Render ``text`` horizontally centred in the window at height ``y``."""
        (surface, rect) = font.render(text, self.WHITE, self.BLACK)
        lengthOfText = rect[2] - rect[0]
        rect[0] = (self.MAX_X + self.INFO_X - lengthOfText) // 2
        rect[1] = y
        rect[2] = 0
        rect[3] = 0
        self.screen.blit(surface, rect)

    def endScreen(self):
        """Show the final score and wait for Q.

        Returns:
            True when Q was pressed, False when the window was closed
            (pygame has been shut down in that case).
        """
        self.clearScreen()
        fontScore = self.pygame.freetype.Font('freesansbold.ttf', 32)

        self._blitCentered(fontScore, "your score is", 64)
        self._blitCentered(fontScore, str(self.GridWorldGame.returnCount),
                           128)
        self._blitCentered(fontScore, "Press Q to get back to menu", 192)

        self.pygame.display.flip()

        while True:
            self.pygame.time.delay(100)
            # Poll for a window close inside the wait loop so the window
            # can be closed from this screen.
            if self.quitting():
                return False
            key = self.pygame.key.get_pressed()
            if key[self.pygame.K_q]:
                return True
class MC_Aprox_Solution:
    """Monte Carlo value estimation with a linear function approximator.

    The value of a square is modelled as ``theta . s2x(square)``; ``theta``
    is trained on the returns of episodes played on a 5x5 ``GridWorld``.
    """

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.learning_rate = 0.001
        # Weights of the linear model, one per feature of s2x().
        self.theta = np.random.randn(4) / 2

    def s2x(self, square):
        """Map a ``(row, col)`` square to its 4-element feature vector.

        The features are the shifted row, the shifted column, their shifted
        product and a constant bias term.
        """
        return np.array(
            [square[0] - 1, square[1] - 1.5, square[0] * square[1] - 3, 1])

    def _chooseMove(self, square, randomMove):
        """Return the move to play from ``square``.

        The policy move is used unless ``randomMove < np.random.rand()``, in
        which case a uniformly random *other* legal move is drawn (if there is
        one).  The list returned by ``possibleMoves`` is filtered into a new
        list, so it is never mutated and a policy move that is not among the
        legal moves does not raise.
        """
        i = square[0]
        j = square[1]
        move = self.game.policyGrid[i][j]

        if randomMove < np.random.rand():
            alternatives = [
                m for m in self.game.possibleMoves((i, j)) if m != move
            ]
            if alternatives:
                move = alternatives[np.random.randint(0, len(alternatives))]
        return move

    def playMCGame(self, startSquare, randomMove):
        """Play one episode from ``startSquare`` and record discounted returns.

        On success the result is stored in ``self.squares_and_values`` as
        ``(square, G)`` pairs in *reverse* time order (final square first).

        Returns:
            True if the episode finished, False if it was abandoned after
            2000 moves (``self.squares_and_values`` is left untouched then).
        """
        self.game.currentSquare = startSquare
        squares_and_returns = [(self.game.currentSquare, 0)]

        counter = 0
        while not self.game.gameOver():
            # Guard against policies that never reach a terminal square.
            counter += 1
            if counter > 2000:
                return False

            self.game.move(self._chooseMove(self.game.currentSquare, randomMove))
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            squares_and_returns.append(
                (self.game.currentSquare, self.game.returnGrid[i][j]))

        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G

        return True

    def updateValueGrid(self, t):
        """Take one gradient step per distinct square, then refresh the grid.

        Args:
            t: iteration counter; the step size is
               ``learning_rate / (t + 1)``.

        Each distinct square of the last episode contributes once.  Afterwards
        ``valueGrid`` is recomputed from ``theta`` for every square whose
        policy entry is one of the moves 0-3.
        """
        visitedSquares = set()

        alpha = self.learning_rate / (t + 1)
        for square, G in self.squares_and_values:
            if square in visitedSquares:
                continue
            visitedSquares.add(square)

            x = self.s2x(square)
            V_hat = self.theta.dot(x)
            self.theta += alpha * (G - V_hat) * x

        rows = self.game.size[0]
        cols = self.game.size[1]
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.valueGrid[i][j] = self.theta.dot(
                        self.s2x((i, j)))

    def updatePolicyGrid(self):
        """Replace every move entry of the policy with ``game.bestMove()``.

        Only squares whose policy entry is one of the moves 0-3 are touched.
        ``self.game.currentSquare`` is overwritten while scanning.

        Returns:
            True if at least one policy entry changed, otherwise False.
        """
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                oldMove = self.game.policyGrid[i][j]
                if oldMove in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        """Print the policy, return and value grids of the game."""
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()