def updateQ(self, game):
    """Apply one temporal-difference update from the last recorded move.

    The newest entry in ``game.history`` describes the transition the
    mover just made, taking the board from state ``s_prev`` to ``s``.
    """
    last = game.history[-1]
    prev_key = getkey(last['s_prev'])
    next_key = getkey(last['s'])
    score = last['score']
    row, col = last['move']
    slot = 3 * row + col  # flatten (row, col) into the 9-wide Q row
    if game.done:
        # Terminal transition: the final score is the exact value.
        self.Q[prev_key][slot] = score
    else:
        # Non-terminal: blend the old estimate toward the discounted
        # optimum of the next state — max for player 1, min for player 0
        # (game.turn is the player to move in state s).
        best_or_worst = max if game.turn == 1 else min
        target = self.decay_rate * best_or_worst(self.Q[next_key])
        old = self.Q[prev_key][slot]
        self.Q[prev_key][slot] = old * (1 - self.learning_rate) + self.learning_rate * target
def learned_move(game, training):
    """Return the best legal (row, col) move for the current player.

    Looks up the Q row for the current state, restricts it to the legal
    moves, and lets player 1 maximise / player 0 minimise the Q value.
    """
    legal = {3 * r + c for r, c in game.allpossible()}
    q_row = training.Q[getkey(game.state)]
    candidates = [(idx, q) for idx, q in enumerate(q_row) if idx in legal]
    # Player 1 maximises the value; player 0 minimises it.
    choose = max if game.turn == 1 else min
    best, _ = choose(candidates, key=lambda pair: pair[1])
    return divmod(best, 3)
def simulate_game(self):
    """Play one full self-play game on a fresh board, updating Q per step."""
    board = TTTBoard()
    while not board.done:
        xy = self.action(board)
        if not board.possible(xy):
            # Illegal choice: penalise that slot directly (+0.5 hurts the
            # minimising player 0, -0.5 hurts the maximising player 1) so
            # the greedy policy learns to avoid it, then pick again.
            # NOTE(review): with random_rate == 0 the greedy policy could
            # re-select the same illegal slot forever — confirm the
            # trainer always keeps some exploration.
            self.Q[getkey(board.state)][3 * xy[0] + xy[1]] = .5 - board.turn
        else:
            board = board.update(xy)
            self.updateQ(board)
def action(self, board):
    """Choose a move for the current board: explore or exploit.

    With probability ``self.random_rate`` a uniformly random legal move
    is returned (exploration); otherwise the greedy move from the Q
    table (exploitation): player 1 takes the argmax of the state's Q
    row, player 0 the argmin.

    Returns a (row, col) tuple.
    """
    if np.random.rand() <= self.random_rate:
        # Explore. Fix: hoist board.allpossible() into a local — the
        # original called it twice, doing the legal-move scan redundantly.
        moves = board.allpossible()
        return moves[np.random.randint(len(moves))]
    # Exploit: greedy move from the Q table for this state.
    start = getkey(board.state)
    if board.turn == 1:
        i = np.argmax(self.Q[start])
    else:
        i = np.argmin(self.Q[start])
    return (i // 3, i % 3)
def recursive_train(self, game, move):
    """Exhaustively evaluate ``move`` from ``game`` and record it in Q.

    Plays the move, then recursively assumes optimal replies: the player
    who just moved is ``game.turn``, so player 0's opponent maximises and
    player 1's opponent minimises. Returns the discounted minimax value.
    """
    next_board = game.update(move)
    if next_board.done:
        val = next_board.score()
    else:
        replies = (self.recursive_train(next_board, xy)
                   for xy in next_board.allpossible())
        if game.turn == 0:
            val = self.decay_rate * max(replies)
        else:
            val = self.decay_rate * min(replies)
    self.Q[getkey(game.state)][3 * move[0] + move[1]] = val
    return val
def train(self):
    """Seed the Q table by exhaustively evaluating every opening move."""
    game = TTTBoard()
    for move in game.allpossible():
        slot = 3 * move[0] + move[1]
        self.Q[getkey(game.state)][slot] = self.recursive_train(game, move)