def train(debug, iterations, table):
    """Run a self-play Q-learning training loop between two agents.

    Args:
        debug: When True, print the board and hash after each move and
            sleep briefly so the game can be watched live.
        iterations: Remaining loop passes; counts down to 0, at which
            point the Q-table is saved and the loop exits.  Pass a
            negative value to train until interrupted with Ctrl-C.
        table: Shared Q-table object; must expose a ``table`` mapping and
            a ``save_q_table()`` method (inferred from usage here).

    The table is also saved on KeyboardInterrupt before the process exits.
    """
    game = TicTacToe()
    ai1 = Agent(True, table)
    ai2 = Agent(True, table)
    try:
        while True:
            if iterations == 0:
                table.save_q_table()
                break
            if iterations > 0:
                iterations -= 1
            if iterations % 100_000 == 0:
                # Periodic progress report.
                print(iterations)
                print("q_table len", len(table.table))
            # Hoist the winner lookup: the original called get_winner()
            # up to three times per pass.
            winner = game.get_winner()
            if game.is_board_full() or winner:
                # Game over: distribute rewards, then start a fresh game.
                if winner is Player.ONE:
                    ai1.reward(1)
                    ai2.reward(0)
                elif winner is Player.TWO:
                    ai1.reward(0)
                    ai2.reward(1)
                else:
                    # Draw.  The asymmetric rewards (0.1 vs 0.5) appear to
                    # bias player two toward accepting draws -- presumably
                    # to offset first-mover advantage; TODO confirm intent.
                    ai1.reward(0.1)
                    ai2.reward(0.5)
                game.reset()
                ai1.reset_history()
                ai2.reset_history()
                continue
            # Let whichever agent is to move take its turn.  The original
            # used a conditional expression purely for its side effects;
            # a plain if/else states the intent.  NOTE(review): only ai1
            # receives print_q=debug -- looks asymmetric; confirm.
            if game.get_player() == Player.ONE:
                ai1.iterate(game, print_q=debug)
            else:
                ai2.iterate(game)
            if debug:
                print()
                print(game.get_hash())
                game.print_board()
                time.sleep(0.1)
    except KeyboardInterrupt:
        # Ctrl-C: persist learned values before terminating.
        table.save_q_table()
        # exit() is a site-module convenience helper; raising SystemExit
        # directly is the equivalent that always exists.
        raise SystemExit
def iterate(self, game: TicTacToe, train: bool = False, print_q: bool = False):
    """Choose one move, record it in self.history, and play it on *game*.

    Args:
        game: The game to act on; queried for legal moves and its state
            hash, then advanced via ``game.input(...)``.
        train: Forwarded to get_optimal_move (exploration/learning flag,
            presumably -- confirm against that method).
        print_q: Forwarded to get_optimal_move; presumably prints Q-values
            for debugging.

    Side effects: prepends ``{"hash": ..., "move": ...}`` to self.history
    (newest move first) and mutates *game*.
    """
    moves = game.get_legal_moves()
    field_hash = game.get_hash()
    if self.ai:
        move = self.get_optimal_move(moves, field_hash, print_q, train)
    else:
        # Only fall back to a random move for non-AI agents; the original
        # computed it unconditionally and discarded it on the AI path,
        # wasting work and consuming RNG state.
        move = self.random_move(moves)
    # Prepend in place instead of rebuilding the whole list each move
    # ([entry] + history is O(len(history)) per call).
    self.history.insert(0, {"hash": field_hash, "move": str(move)})
    game.input(int(move))