def move(self, board: Board) -> (GameResult, bool):
    """
    Makes a move on the given input state
    :param board: The current state of the game
    :return: The GameResult after this move, Flag to indicate whether the move finished the game
    """
    self.board_position_log.append(board.state.copy())
    nn_input = self.board_state_to_nn_input(board.state)

    probs = self.get_valid_probs([nn_input], [board])
    probs = probs[0]

    # Most of the time our next move is the one with the highest probability after removing all illegal ones.
    # Occasionally, however, we choose a random move to encourage exploration
    if (self.training is True) and \
            (self.game_counter < self.pre_training_games):
        move = board.random_empty_spot()
    else:
        if np.isnan(probs).any():
            # Can happen when all probabilities degenerate to 0. The best thing we can do is
            # make a random legal move
            move = board.random_empty_spot()
        else:
            move = np.random.choice(np.arange(len(probs)), p=probs)
        if not board.is_legal(move):
            # Debug case only, I hope
            print("Illegal move!")

    # We record the action we selected for later use when adjusting the NN weights.
    self.action_log.append(move)

    _, res, finished = board.move(move, self.side)

    return res, finished
def play_game(board: Board, player1: Player, player2: Player):
    player1.new_game(CROSS)
    player2.new_game(NAUGHT)
    board.reset()

    finished = False
    while not finished:
        result, finished = player1.move(board)
        if finished:
            if result == GameResult.DRAW:
                final_result = GameResult.DRAW
            else:
                final_result = GameResult.CROSS_WIN
        else:
            result, finished = player2.move(board)
            if finished:
                if result == GameResult.DRAW:
                    final_result = GameResult.DRAW
                else:
                    final_result = GameResult.NAUGHT_WIN

    # noinspection PyUnboundLocalVariable
    player1.final_result(final_result)
    # noinspection PyUnboundLocalVariable
    player2.final_result(final_result)
    return final_result
def move(self, board: Board) -> (GameResult, bool):
    """
    Implements the Player interface and makes a move on Board `board`
    :param board: The Board to make a move on
    :return: A tuple of the GameResult and a flag indicating if the game is over after this move.
    """
    # We record all game positions to feed them into the NN for training with the corresponding updated Q
    # values.
    self.board_position_log.append(board.state.copy())

    nn_input = self.board_state_to_nn_input(board.state)
    probs, _ = self.get_valid_probs([nn_input], self.q_net, [board])
    probs = probs[0]

    # Most of the time our next move is the one with the highest probability after removing all illegal ones.
    # Occasionally, however, we choose a random move to encourage exploration
    if (self.training is True) and \
            ((self.game_counter < self.pre_training_games) or (np.random.rand(1) < self.random_move_prob)):
        move = board.random_empty_spot()
    else:
        move = np.argmax(probs)

    # We record the action we selected for later use when adjusting the NN weights.
    self.action_log.append(move)

    # We execute the move and return the result
    _, res, finished = board.move(move, self.side)
    return res, finished
def get_move(self, board: Board) -> int:
    """
    Return the next move given the board `board` based on the current values of next states
    :param board: The current board state
    :return: The next move based on the current values of next states, starting from input state
    """
    if self.move_strategy == MoveStrategy.EXPLORATION:
        # exploratory random move
        m = board.random_empty_spot()
        _ = self.get_v(board)  # just to ensure we have values for our board state
        return m
    else:
        # greedy move: exploiting current knowledge
        vals = self.get_v(board)  # type: np.ndarray
        while True:
            maxv_idxs = np.argwhere(vals == np.amax(vals))  # positions of max values in array
            m = np.random.choice(maxv_idxs.flatten().tolist())  # type: int
            # m = np.argmax(vals)  # type: int  # this instead would return the 1st occurrence
            if board.is_legal(m):
                # print("vals=", end='')
                # print(vals)
                # print("m={}".format(m))
                return m
            else:
                vals[m] = -1.0
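# Illustrative aside (not part of the agent): how the greedy tie-breaking above behaves.
# Among all positions holding the maximum value, one index is chosen uniformly at random,
# whereas np.argmax would always return the first occurrence.
import numpy as np

vals = np.array([0.5, 0.7, 0.7, 0.3, 0.7, 0.5, 0.5, 0.5, 0.5])
maxv_idxs = np.argwhere(vals == np.amax(vals))       # -> array([[1], [2], [4]])
m = np.random.choice(maxv_idxs.flatten().tolist())   # one of 1, 2 or 4, chosen uniformly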
def move(self, board: Board) -> (GameResult, bool):
    """
    Making a random move
    :param board: The board to make a move on
    :return: The result of the move
    """
    _, res, finished = board.move(board.random_empty_spot(), self.side)
    return res, finished
def move(self, board: Board):
    """
    Makes a move and returns the game result after this move and whether the move ended the game
    :param board: The board to make a move on
    :return: The GameResult after this move, Flag to indicate whether the move finished the game
    """
    m = self.get_move(board)
    self.move_history.append((board.hash_value(), m))
    _, res, finished = board.move(m, self.side)
    return res, finished
def get_move(self, board: Board) -> int:
    """
    Return the next move given the board `board` based on the current Q values
    :param board: The current board state
    :return: The next move based on the current Q values for the input state
    """
    board_hash = board.hash_value()  # type: int
    qvals = self.get_q(board_hash)  # type: np.ndarray
    while True:
        m = np.argmax(qvals)  # type: int
        if board.is_legal(m):
            return m
        else:
            qvals[m] = -1.0
def get_v(self, board: Board) -> np.ndarray:
    """
    Returns all values when moving from current state of 'board'
    :param board: The current board state
    :return: List of values of all possible next board states
    """
    # We build the value dictionary in a lazy manner, only adding a state when it is actually used
    # for the first time
    #
    board_hash = board.hash_value()  # needed because value dictionary maps *hashed* state to values
    if board_hash in self.v:
        vals = self.v[board_hash]
    else:
        vals = np.full(9, self.v_init)  # default initial value
        # set values for winning states to WIN_VALUE
        # (player cannot end up in a losing state after a move
        # so losing states need not be considered):
        for pos in range(vals.size):  # vals.size = BOARD_SIZE
            if board.is_legal(pos):
                b = Board(board.state)
                b.move(pos, self.side)
                if b.check_win():
                    vals[pos] = self.v_win
                elif b.num_empty() == 0:
                    # if it is not a win, and there are no other positions
                    # available, then it is a draw
                    vals[pos] = self.v_draw
        # Update dictionary:
        self.v[board_hash] = vals
    # print("v[{}]={}".format(board_hash, self.v[board_hash]))
    return vals
def play_random_game():
    board = Board()
    finished = False
    last_play = NAUGHT
    next_play = CROSS
    while not finished:
        _, result, finished = board.move(board.random_empty_spot(), next_play)
        print_board(board)
        last_play, next_play = next_play, last_play

    if result == GameResult.DRAW:
        print("Game is a draw")
    elif last_play == CROSS:
        print("Cross won!")
    else:
        print("Naught won!")
def battle(player1: Player = RandomPlayer(), player2: Player = RandomPlayer(),
           num_games: int = 100000, silent: bool = False):
    board = Board()
    draw_count = 0
    cross_count = 0
    naught_count = 0
    for _ in range(num_games):
        result = play_game(board, player1, player2)
        if result == GameResult.CROSS_WIN:
            cross_count += 1
        elif result == GameResult.NAUGHT_WIN:
            naught_count += 1
        else:
            draw_count += 1

    if not silent:
        print("After {} games we have draws: {}, Player 1 wins: {}, and Player 2 wins: {}.".format(
            num_games, draw_count, cross_count, naught_count))
        print("Which gives percentages of draws: {:.2%}, Player 1 wins: {:.2%}, and Player 2 wins: {:.2%}".format(
            draw_count / num_games, cross_count / num_games, naught_count / num_games))

    return cross_count, naught_count, draw_count
def evaluate_players(p1: Player, p2: Player, games_per_battle=100, num_battles=100):
    board = Board()
    p1_wins = []
    p2_wins = []
    draws = []
    game_number = []
    game_counter = 0

    TFSessionManager.set_session(tf.Session())
    TFSessionManager.get_session().run(tf.global_variables_initializer())

    for i in range(num_battles):
        p1win, p2win, draw = battle(p1, p2, games_per_battle, False)
        p1_wins.append(p1win)
        p2_wins.append(p2win)
        draws.append(draw)
        game_counter = game_counter + 1
        game_number.append(game_counter)

    TFSessionManager.set_session(None)

    return game_number, p1_wins, p2_wins, draws
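# A minimal usage sketch for evaluate_players (assumption: two NNQPlayer instances and
# matplotlib are available; the constructor arguments shown are placeholders). It plots
# the per-battle series that evaluate_players returns.
import matplotlib.pyplot as plt

game_number, p1_wins, p2_wins, draws = evaluate_players(NNQPlayer("p1"), NNQPlayer("p2"))

plt.plot(game_number, draws, label='Draws')
plt.plot(game_number, p1_wins, label='Player 1 wins')
plt.plot(game_number, p2_wins, label='Player 2 wins')
plt.legend(loc='best')
plt.show()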
def move(self, board: Board) -> (GameResult, bool):
    """
    Implements the Player interface and makes a move on Board `board`
    :param board: The Board to make a move on
    :return: A tuple of the GameResult and a flag indicating if the game is over after this move.
    """
    # We record all game positions to feed them into the NN for training with the corresponding updated Q
    # values.
    self.board_position_log.append(board.state.copy())

    nn_input = self.board_state_to_nn_input(board.state)

    probs, qvalues = self.get_probs(nn_input)
    qvalues = np.copy(qvalues)

    # We filter out all illegal moves by setting their probability to -1 so they can never win the argmax.
    # We don't change the Q values, as we don't want the NN to waste any effort learning different Q values
    # for moves that are illegal anyway.
    for index, p in enumerate(qvalues):
        if not board.is_legal(index):
            probs[index] = -1
        elif probs[index] < 0:
            probs[index] = 0.0

    # Most of the time our next move is the one with the highest probability after removing all illegal ones.
    # Occasionally, however, we choose a random move to encourage exploration
    if (self.training is True) and (np.random.rand(1) < self.random_move_prob):
        move = board.random_empty_spot()
    else:
        move = np.argmax(probs)

    # Unless this is the very first move, the max Q value of this state is also the max Q value of
    # the move that got the game from the previous state to this one.
    if len(self.action_log) > 0:
        self.next_max_log.append(qvalues[np.argmax(probs)])

    # We record the action we selected as well as the Q values of the current state for later use when
    # adjusting the NN weights.
    self.action_log.append(move)
    self.values_log.append(qvalues)

    # We execute the move and return the result
    _, res, finished = board.move(move, self.side)
    return res, finished
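# Context for next_max_log (an illustrative note, not code from this class): the recorded
# "next max" values later serve as the bootstrap term of the Q-learning target, roughly
#
#     target[i] = reward                              for the final move of the game
#     target[i] = reward_discount * next_max_log[i]   for every earlier move i
#
# (with intermediate rewards of zero, this is the usual rule
#  Q(s, a) <- r + gamma * max_a' Q(s', a')).
# A tiny self-contained sketch of that pairing, assuming plain Python lists:
def q_targets_sketch(final_reward, next_max_log, reward_discount=0.95):
    """Hypothetical helper: one target per recorded move, the last move gets the reward."""
    targets = [reward_discount * next_max for next_max in next_max_log]
    targets.append(final_reward)
    return targets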
def move(self, board: Board) -> (GameResult, bool):
    """
    Making a move according to the MinMax algorithm
    :param board: The board to make a move on
    :return: The result of the move
    """
    score, action = self._max(board)
    _, res, finished = board.move(action, self.side)
    return res, finished
def move(self, board: Board):
    """
    Makes a move and returns the game result after this move and whether the move ended the game
    :param board: The board to make a move on
    :return: The GameResult after this move, Flag to indicate whether the move finished the game
    """
    # Select strategy to choose next move: exploit known or explore unknown?
    if np.random.uniform(0, 1) <= self.epsilon:
        self.move_strategy = MoveStrategy.EXPLORATION
    else:
        self.move_strategy = MoveStrategy.EXPLOITATION

    m = self.get_move(board)
    self.move_history.append((board.hash_value(), m))
    self.backup_value()
    # print("v={}".format(self.v))
    _, res, finished = board.move(m, self.side)
    return res, finished
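# backup_value() itself is not shown in this section. Purely as an illustrative sketch
# (an assumption about its shape, not the author's implementation): a TD(0)-style backup
# over the last two entries of self.move_history could nudge the value of the previous
# afterstate toward the value of the newest one, assuming a `self.learning_rate`
# attribute and that self.v maps board hashes to per-move value arrays (as in get_v).
def backup_value_sketch(self):
    if len(self.move_history) < 2:
        return
    prev_hash, prev_move = self.move_history[-2]
    curr_hash, curr_move = self.move_history[-1]
    target = self.v[curr_hash][curr_move]   # value estimate of the newest afterstate
    old = self.v[prev_hash][prev_move]
    # move the older estimate a fraction of the way toward the newer one
    self.v[prev_hash][prev_move] = old + self.learning_rate * (target - old)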
def move(self, board: Board) -> (GameResult, bool):
    """
    Making a move according to the MinMax algorithm. If more than one best move exists, chooses amongst
    them randomly.
    :param board: The board to make a move on
    :return: The result of the move
    """
    score, action = self._max(board)
    _, res, finished = board.move(action, self.side)
    return res, finished
def _min(self, board: Board) -> (float, int):
    """
    Evaluate the board position `board` from the Minimizing player's point of view.
    :param board: The board position to evaluate
    :return: Tuple of (Best Result, Best Move in this situation). Returns -1 for best move if the game has
    already finished
    """

    #
    # First we check if we have seen this board position before, and if yes just return the cached value
    #
    board_hash = board.hash_value()
    if board_hash in self.cache:
        return self.cache[board_hash]

    #
    # Init the min value as well as action. Min value is set to DRAW as this value will pass through in case
    # of a draw
    #
    min_value = self.DRAW_VALUE
    action = -1

    #
    # If the game has already finished we return. Otherwise we look at possible continuations
    #
    winner = board.who_won()
    if winner == self.side:
        min_value = self.WIN_VALUE
        action = -1
    elif winner == board.other_side(self.side):
        min_value = self.LOSS_VALUE
        action = -1
    else:
        for index in [i for i, e in enumerate(board.state) if board.state[i] == EMPTY]:
            b = Board(board.state)
            b.move(index, board.other_side(self.side))

            res, _ = self._max(b)
            if res < min_value or action == -1:
                min_value = res
                action = index

                # Shortcut: Can't get better than that, so abort here and return this move
                if min_value == self.LOSS_VALUE:
                    self.cache[board_hash] = (min_value, action)
                    return min_value, action

    self.cache[board_hash] = (min_value, action)
    return min_value, action
def play_game(board: Board, player1: Player, player2: Player, silent: bool = True):
    player1.new_game(CROSS)
    player2.new_game(NAUGHT)
    board.reset()
    if not silent:
        print()
        board.print_board()
        time.sleep(1)

    finished = False
    while not finished:
        # player1 move
        result, finished = player1.move(board)
        if not silent:
            print()
            print("{} move:".format(player1.name))
            board.print_board()
            time.sleep(1)
        if finished:
            if result == GameResult.DRAW:
                final_result = GameResult.DRAW
            else:
                final_result = GameResult.CROSS_WIN
        else:
            # player 2 move
            result, finished = player2.move(board)
            if not silent:
                print()
                print("{} move:".format(player2.name))
                board.print_board()
                time.sleep(1)
            if finished:
                if result == GameResult.DRAW:
                    final_result = GameResult.DRAW
                else:
                    final_result = GameResult.NAUGHT_WIN

    player1.final_result(final_result)
    player2.final_result(final_result)

    if not silent:
        print()
        if final_result == GameResult.CROSS_WIN:
            print("{} wins!".format(player1.name))
        elif final_result == GameResult.NAUGHT_WIN:
            print("{} wins!".format(player2.name))
        else:
            print("Draw!")

    return final_result
def _min(self, board: Board) -> (float, int):
    """
    Evaluate the board position `board` from the Minimizing player's point of view.
    :param board: The board position to evaluate
    :return: Tuple of (Best Result, Best Move in this situation). Returns -1 for best move if the game has
    already finished
    """

    #
    # First we check if we have seen this board position before, and if yes just return a random choice
    # from the cached values
    #
    board_hash = board.hash_value()
    if board_hash in self.cache:
        return random.choice(self.cache[board_hash])

    #
    # If the game has already finished we return. Otherwise we look at possible continuations
    #
    winner = board.who_won()
    if winner == self.side:
        best_moves = {(self.WIN_VALUE, -1)}
    elif winner == board.other_side(self.side):
        best_moves = {(self.LOSS_VALUE, -1)}
    else:
        #
        # Init the min value as well as action. Min value is set to DRAW as this value will pass through in
        # case of a draw
        #
        min_value = self.DRAW_VALUE
        action = -1
        best_moves = {(min_value, action)}
        for index in [i for i, e in enumerate(board.state) if board.state[i] == EMPTY]:
            b = Board(board.state)
            b.move(index, board.other_side(self.side))

            res, _ = self._max(b)
            if res < min_value or action == -1:
                min_value = res
                action = index
                best_moves = {(min_value, action)}
            elif res == min_value:
                action = index
                best_moves.add((min_value, action))

    best_moves = tuple(best_moves)
    self.cache[board_hash] = best_moves

    return random.choice(best_moves)
def board_state_to_nn_input(self, state: np.ndarray) -> np.ndarray:
    """
    Converts a Tic Tac Toe board state to an input feature vector for the Neural Network. The input feature
    vector is a bit array of size 27. The first 9 bits are set to 1 on positions containing the player's own
    pieces, the second 9 bits are set to 1 on positions with the opponent's pieces, and the final 9 bits are
    set on empty positions on the board.
    :param state: The board state that is to be converted to a feature vector.
    :return: The feature vector representing the input Tic Tac Toe board state.
    """
    res = np.array([(state == self.side).astype(int),
                    (state == Board.other_side(self.side)).astype(int),
                    (state == EMPTY).astype(int)])
    return res.reshape(-1)
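# A small standalone sketch of the encoding above (the constant values here are
# illustrative assumptions; the real CROSS/NAUGHT/EMPTY constants live in
# tic_tac_toe.Board). The board
#   X | O | .
#   . | X | .
#   . | . | O
# seen from CROSS's perspective becomes a 27-element 0/1 vector: own pieces first,
# then the opponent's pieces, then the empty squares.
import numpy as np

EMPTY, CROSS, NAUGHT = 0, 1, 2
state = np.array([CROSS, NAUGHT, EMPTY,
                  EMPTY, CROSS, EMPTY,
                  EMPTY, EMPTY, NAUGHT])

own = (state == CROSS).astype(int)        # [1, 0, 0, 0, 1, 0, 0, 0, 0]
opponent = (state == NAUGHT).astype(int)  # [0, 1, 0, 0, 0, 0, 0, 0, 1]
empty = (state == EMPTY).astype(int)      # [0, 0, 1, 1, 0, 1, 1, 1, 0]

nn_input = np.concatenate([own, opponent, empty])
print(nn_input.shape)  # (27,)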
def move(self, board: Board) -> (GameResult, bool):
    """
    Make move corresponding to key pressed by user
    :param board: The board to make a move on
    :return: The result of the move
    """
    print()
    while True:
        key = input("Your move? ")
        if key in self.keys:
            break
    position = self.keys.index(key)
    _, res, finished = board.move(position, self.side)
    return res, finished
def battle(player1: Player, player2: Player, num_games: int = 100000, silent: bool = False):
    board = Board()
    draw_count = 0
    cross_count = 0
    naught_count = 0

    if not silent:
        print("Battling", end="", flush=True)

    for game_nr in range(1, num_games + 1):
        result = play_game(board, player1, player2)
        if result == GameResult.CROSS_WIN:
            cross_count += 1
        elif result == GameResult.NAUGHT_WIN:
            naught_count += 1
        else:
            draw_count += 1
        if not silent and game_nr % 1000 == 0:
            print(".", end="", flush=True)

    if not silent:
        print()
        print("After {} games we have draws: {}, {} wins: {}, and {} wins: {}.".format(
            num_games, draw_count, player1.name, cross_count, player2.name, naught_count))
        print("Which gives percentages of draws: {:.2%}, {} wins: {:.2%}, and {} wins: {:.2%}".format(
            draw_count / num_games, player1.name, cross_count / num_games,
            player2.name, naught_count / num_games))
        print()

    return cross_count, naught_count, draw_count
from tic_tac_toe.Board import Board, GameResult, CROSS, NAUGHT, EMPTY
from util import print_board, play_game, battle
from tic_tac_toe.RandomPlayer import RandomPlayer
from tic_tac_toe.MinMaxAgent import MinMaxAgent
from tic_tac_toe.RndMinMaxAgent import RndMinMaxAgent
from tic_tac_toe.TabularQPlayer import TQPlayer
from tic_tac_toe.SimpleNNQPlayer import NNQPlayer
from tic_tac_toe.TFSessionManager import TFSessionManager

import matplotlib.pyplot as plt
import tensorflow as tf
import random

board = Board()

# tf.reset_default_graph()

player1 = RandomPlayer()
player2 = RandomPlayer()

p1_wins = []
p1count = 0
p2_wins = []
p2count = 0
draws = []
drawcount = 0
count = []
num_battles = 100
games_per_battle = 10

TFSessionManager.set_session(tf.Session())
TFSessionManager.get_session().run(tf.global_variables_initializer())
def final_result(self, result: GameResult):
    """
    This method is called once the game is over. If `self.training` is True, we execute a training run for
    the Neural Network.
    :param result: The result of the game that just finished.
    """
    self.game_counter += 1

    # Compute the final reward based on the game outcome
    if (result == GameResult.NAUGHT_WIN and self.side == NAUGHT) or (
            result == GameResult.CROSS_WIN and self.side == CROSS):
        reward = self.win_value  # type: float
    elif (result == GameResult.NAUGHT_WIN and self.side == CROSS) or (
            result == GameResult.CROSS_WIN and self.side == NAUGHT):
        reward = self.loss_value  # type: float
    elif result == GameResult.DRAW:
        reward = self.draw_value  # type: float
    else:
        raise ValueError("Unexpected game result {}".format(result))

    self.add_game_to_replay_buffer(reward)

    # If we are in training mode we run the optimizer.
    if self.training and (self.game_counter > self.pre_training_games):

        batch_third = self.batch_size // 3
        train_batch = self.replay_buffer_win.sample(batch_third)
        train_batch.extend(self.replay_buffer_loss.sample(batch_third))
        train_batch.extend(self.replay_buffer_draw.sample(batch_third))
        train_batch = np.array(train_batch)

        #
        # Let's compute the target Q values for all non-terminal moves.
        # We extract the resulting state, run it through the target network and
        # get the maximum Q value (of all valid moves)
        next_states = [s[2] for s in train_batch if s[2] is not None]
        target_qs = []

        if len(next_states) > 0:
            probs, qvals = self.get_valid_probs(
                [self.board_state_to_nn_input(s) for s in next_states],
                self.target_net, [Board(s) for s in next_states])

            i = 0
            for t in train_batch:
                if t[2] is not None:
                    max_move = np.argmax(probs[i])
                    max_qval = qvals[i][max_move]
                    target_qs.append(max_qval * self.reward_discount)
                    i += 1
                else:
                    target_qs.append(t[3])

            if i != len(next_states):
                print("Something wrong here!!!")
        else:
            target_qs.extend(train_batch[:, 3])

        # We convert the input states we have recorded to feature vectors to feed into the training.
        nn_input = [self.board_state_to_nn_input(x[0]) for x in train_batch]
        actions = train_batch[:, 1]

        # We run the training step with the recorded inputs and new Q value targets.
        summary, _ = TFSN.get_session().run(
            [self.q_net.merge, self.q_net.train_step],
            feed_dict={
                self.q_net.input_positions: nn_input,
                self.q_net.target_q: target_qs,
                self.q_net.actions: actions
            })

        self.random_move_prob *= self.random_move_decrease

        if self.writer is not None:
            self.writer.add_summary(summary, self.game_counter)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag='Random_Move_Probability',
                                 simple_value=self.random_move_prob)
            ])
            self.writer.add_summary(summary, self.game_counter)

        TFSN.get_session().run(self.graph_copy_op)
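# Summary of the per-sample target rule used above (a restatement, not new logic): for a
# replay-buffer entry (state, action, next_state, reward) the target fed to the network is
#
#     target = reward                                                          if next_state is None (terminal)
#     target = reward_discount * max over valid a of Q_target(next_state, a)   otherwise
#
# where Q_target is the separate target network that graph_copy_op periodically syncs
# with q_net. When intermediate rewards are zero this matches the usual DQN target
# r + gamma * max_a' Q_target(s', a'). A tiny illustrative helper:
import numpy as np

def td_target_sketch(reward, next_state_qvals, reward_discount, terminal):
    """Hypothetical helper mirroring the per-sample rule above."""
    if terminal:
        return reward
    return reward_discount * np.max(next_state_qvals)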
from tic_tac_toe.Board import Board, GameResult
from tic_tac_toe.RandomPlayer import RandomPlayer
from tic_tac_toe.MinMaxAgent import MinMaxAgent
from tic_tac_toe.RndMinMaxAgent import RndMinMaxAgent
from tic_tac_toe.HumanPlayer import HumanPlayer
from tic_tac_toe.TQPlayer import TQPlayer
from tic_tac_toe.VFPlayer import VFPlayer
from util import *

# battle(RandomPlayer("RandomPlayer1"), RandomPlayer("RandomPlayer2"), num_games=10000)
# battle(MinMaxAgent(), RandomPlayer(), num_games=10000)
# battle(RandomPlayer(), MinMaxAgent(), num_games=10000)
# battle(MinMaxAgent(), RndMinMaxAgent(), num_games=10000)

# play_game(Board(), RndMinMaxAgent(), HumanPlayer(), silent=False)
# play_game(Board(), VFPlayer(), MinMaxAgent(), silent=False)

player1 = VFPlayer("VFPlayer1", learning_rate=0.1, exploration_rate=0.01, v_init=0.6)
# player1 = TQPlayer()

eval_players(player1, RndMinMaxAgent(), 50)

player1.set_exloration_rate(0.0)
eval_players(player1, RndMinMaxAgent(), 50)

while True:
    play_game(Board(), player1, HumanPlayer(), silent=False)