def expectiDepthMax(board, Net, gamma, currDepth, finalDepth):
    """Expectimax search for the best move on *board*.

    Max nodes are the player's moves; chance nodes are the random tile
    spawns enumerated by nn2048helper.addAllPossibleNums. At depth
    `finalDepth` the post-move board is scored directly with the value
    network via getScore; otherwise the search recurses, with the next
    depth limit looked up in the module-level `depthsPolicy` table by the
    number of empty cells.

    Returns:
        (bestScore, bestMove) — bestMove is None only when the board has
        no legal moves (game over), in which case bestScore is 0 to match
        getScore's dead-board value.
    """
    movesToDo = nn2048helper.checkMoves(board)
    if not movesToDo:
        # Game over: no legal moves. Keep the original (0, None) contract.
        return 0, None

    # Was initialized to 0: if every candidate scored <= 0, no move was ever
    # chosen and (0, None) was returned even though legal moves existed.
    # -inf matches the sibling makeMoveUnderPolicy and guarantees a move.
    maxScoreSoFar = float('-inf')
    chosenMove = None

    for candMove in movesToDo:
        candBoard, candRewToNet, candReward = nn2048helper.makeMove(
            board, candMove)
        if currDepth >= finalDepth:
            # Leaf: evaluate the post-move board with the value network.
            candScore = getScore(candBoard, Net,
                                 nn2048helper.makeTensor(candBoard))
        else:
            # Chance node: sum over every possible spawned tile, weighted by
            # its probability (candNextBoard = (board, probability)).
            candScore = 0
            candNextBoards = nn2048helper.addAllPossibleNums(candBoard)
            for candNextBoard in candNextBoards:
                nonZeroCt = np.count_nonzero(candNextBoard[0])
                # NOTE(review): gamma is applied here AND again below when
                # combining with candRewToNet, i.e. a gamma^2 discount per
                # ply on this branch — confirm this double discount is
                # intended (the leaf branch discounts only once).
                candScore += gamma * expectiDepthMax(
                    candNextBoard[0], Net, gamma, currDepth + 1,
                    depthsPolicy[16 - nonZeroCt])[0] * candNextBoard[1]
            # addAllPossibleNums yields two entries (a 2-tile and a 4-tile)
            # per empty cell, so len/2 is the empty-cell count being
            # averaged over.
            candScore *= 2 / len(candNextBoards)

        if candRewToNet + gamma * candScore > maxScoreSoFar:
            maxScoreSoFar = candRewToNet + gamma * candScore
            chosenMove = candMove
    return maxScoreSoFar, chosenMove
def getScore(board, Net=None, boardTensor=None):
    """Value estimate for *board* from the network.

    A dead board (no legal moves) scores 0, as does any call without a
    network. When *boardTensor* is omitted, the tensor is built from the
    board on the fly via nn2048helper.makeTensor.
    """
    # Guard clauses: dead board or missing network both score 0.
    if not nn2048helper.checkMoves(board):
        return 0
    if Net is None:
        return 0
    # Reuse the caller-supplied tensor when available to skip re-encoding.
    tensor = boardTensor if boardTensor is not None else nn2048helper.makeTensor(board)
    return Net.forward(tensor.view(-1, 18, 4, 4)).data.tolist()[0][0]
def makeMoveUnderPolicy(board, Net, gamma, eps):
    """Take one move under an epsilon-greedy policy.

    With probability *eps* a uniformly random legal move is played;
    otherwise the move maximizing immediate reward + gamma * network
    value of the resulting board is chosen.

    Returns:
        (newBoard, rewToNet, reward), or (None, 0, 0) when the board has
        no legal moves.
    """
    legalMoves = nn2048helper.checkMoves(board)
    if not legalMoves:
        return None, 0, 0

    # Exploration: makeMove already returns the (board, rewToNet, reward)
    # triple, so pass it straight through.
    if random.random() <= eps:
        return nn2048helper.makeMove(board, random.choice(legalMoves))

    # Exploitation: greedy over one-step lookahead values.
    bestScore = float('-inf')
    bestBoard = None
    bestRewToNet = 0
    bestReward = 0
    for move in legalMoves:
        nextBoard, rewToNet, reward = nn2048helper.makeMove(board, move)
        value = getScore(nextBoard, Net, nn2048helper.makeTensor(nextBoard))
        total = rewToNet + gamma * value
        if total > bestScore:
            bestScore = total
            bestBoard = nextBoard.copy()
            bestRewToNet = rewToNet
            bestReward = reward
    return bestBoard, bestRewToNet, bestReward
def run(self):
    """Play one self-play episode and record TD training targets.

    Starting board: the first 50 episodes (self.epNum <= 50) always begin
    a fresh game; afterwards 30% of episodes start fresh and 70% start
    from a random mid-game board (2–12 tiles) for state diversity.

    Each step appends [boardTensor, target] to self.data, where the
    target is the soft TD update
        currScore * (1 - alpha) + alpha * newGain(board).

    Side effects: mutates self.data and prints the episode's total reward.
    (Removed: a `currMax` running maximum of the board that was computed
    every iteration but never read.)
    """
    overallScore = 0
    if self.epNum <= 50:
        board = nn2048helper.initBoard()
    else:
        if random.random() <= 0.3:
            board = nn2048helper.initBoard()
        else:
            board = nn2048helper.randomBoard(random.randint(2, 12))

    # First move; makeMoveUnderPolicy returns board=None once no moves remain.
    board, rewToNet, reward = makeMoveUnderPolicy(
        board, self.Net, self.gamma, self.eps)
    overallScore += reward

    while board is not None:
        # Blend the current value estimate toward the one-step target.
        boardNewGain = newGain(board, self.Net, self.gamma)
        boardTensor = nn2048helper.makeTensor(board)
        currScore = getScore(board, self.Net, boardTensor)
        self.data.append([boardTensor,
                          currScore * (1 - self.alpha) + self.alpha * boardNewGain])
        # Spawn the random tile, then act again under the policy.
        board = nn2048helper.addNum(board)
        board, rewToNet, reward = makeMoveUnderPolicy(
            board, self.Net, self.gamma, self.eps)
        overallScore += reward
    print('Score: ', overallScore)