def self_play_one_game(self, game: ConnectNGame) \
        -> List[Tuple[NetGameState, ActionProbs, NDArray[(Any), np.float]]]:
    """
    :param game: the game to play out from its current position (modified in place).
    :return: sequence of (s, pi, z) tuples for the complete self-play game;
        the list length equals the number of moves played.
    """
    states: List[NetGameState] = []
    probs: List[ActionProbs] = []
    current_players: List[np.float] = []

    while not game.game_over:
        move, move_probs = self._get_action(game)
        states.append(convert_game_state(game))
        probs.append(move_probs)
        current_players.append(game.current_player)
        game.move(move)

    # z: +1 for states where the eventual winner was to move, -1 for the loser, 0 for a tie
    current_player_z = np.zeros(len(current_players))
    current_player_z[np.array(current_players) == game.game_result] = 1.0
    current_player_z[np.array(current_players) == -game.game_result] = -1.0
    self.reset()

    return list(zip(states, probs, current_player_z))
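For concreteness, here is a small standalone example of the z computation above. The values are hypothetical, and it assumes PLAYER_A is encoded as 1 and PLAYER_B as -1, which is what the comparison against ±game_result implies:

import numpy as np

# Hypothetical 5-ply game: PLAYER_A (1) moved on plies 0, 2, 4; PLAYER_B (-1) on plies 1, 3.
current_players = np.array([1, -1, 1, -1, 1])
game_result = 1  # PLAYER_A won

z = np.zeros(len(current_players))
z[current_players == game_result] = 1.0    # states where the eventual winner was to move
z[current_players == -game_result] = -1.0  # states where the eventual loser was to move
print(z)  # [ 1. -1.  1. -1.  1.]; a tie (game_result == 0) would leave all zeros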
def _playout(self, game: ConnectNGame, node: TreeNode):
    """
    Run a single playout from the root to a leaf, get a value at the leaf,
    and propagate it back through its parents.
    The game state is modified in place, so a copy must be provided.
    """
    while True:
        if node.is_leaf():
            break
        # Greedily select the next move.
        action, node = node.select()
        game.move(action)

    # Evaluate the leaf with rollout_policy_value_fn, which returns (action, probability)
    # pairs used for expansion. For pure MCTS the prior is uniform and the returned score
    # is unused: the leaf value comes from the random rollout below.
    action_and_probs, leaf_value = self.rollout_policy_value_fn(game)
    # Check for end of game.
    end, winner = game.game_over, game.game_result
    if not end:
        for action, prob in action_and_probs:
            child_node = node.expand(action, prob)

    player = game.current_player
    result = self._rollout_simulate_to_end(game)
    if result == ConnectNGame.RESULT_TIE:
        leaf_value = float(ConnectNGame.RESULT_TIE)
    else:
        leaf_value = 1.0 if result == player else -1.0
    node.propagate_to_root(leaf_value)
def _rollout_simulate_to_end(self, game: ConnectNGame) -> GameResult:
    """
    Use the rollout policy to play until the game ends, returning the final
    game result: the id of the winning player, or ConnectNGame.RESULT_TIE for
    a draw (the caller compares this against the player to move at the leaf).
    """
    while True:
        end, result = game.game_over, game.game_result
        if end:
            break
        action_probs = self.rollout_policy_fn(game)
        max_action = max(action_probs, key=itemgetter(1))[0]
        game.move(max_action)
    return result
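Since rollout_policy_fn (shown later) assigns each legal move an independent uniform random score, taking the argmax above is simply a uniformly random choice among the available positions. A hypothetical equivalent, purely for illustration:

import random

def random_rollout_move(game: ConnectNGame) -> Pos:
    # Equivalent in distribution to argmax over np.random.rand scores:
    # every available position is picked with equal probability.
    return random.choice(game.get_avail_pos())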
def train():
    initial_game = ConnectNGame(board_size=args.board_size, n=args.n_in_row)
    game_records: deque[Tuple[NetGameState, ActionProbs, NDArray[(Any), np.float]]]
    game_records = deque(maxlen=args.buffer_size)

    policy_value_net = PolicyValueNet(args.board_size, args.board_size, use_gpu=args.use_cuda)
    alpha_go_zero_player = MCTSAlphaGoZeroPlayer(policy_value_net, playout_num=args.playout_num)

    for i in range(args.game_batch_num):
        game = copy.deepcopy(initial_game)
        one_game_records = alpha_go_zero_player.self_play_one_game(game)
        episode_len = len(one_game_records)
        game_records.extend(one_game_records)
        logging.warning(f'batch i:{i + 1}, episode_len:{episode_len}, records_total:{len(game_records)}')

        if len(game_records) <= args.batch_size:
            continue

        # training
        training_batch = random.sample(game_records, args.batch_size)
        loss, entropy = update_policy(training_batch, policy_value_net)
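update_policy is defined outside this section. As a rough sketch of what such an update computes (the function name, tensor shapes, and optimizer handling here are assumptions, not the article's actual implementation), AlphaGo Zero's loss over an (s, pi, z) batch is a value MSE plus a policy cross-entropy term:

import torch
import torch.nn.functional as F

def alphago_zero_loss(log_act_probs: torch.Tensor,  # (B, board_size**2), log-softmax from the net
                      value: torch.Tensor,          # (B, 1), tanh output from the net
                      mcts_probs: torch.Tensor,     # (B, board_size**2), the pi targets
                      winner_z: torch.Tensor) -> torch.Tensor:  # (B,), the z targets in {-1, 0, 1}
    value_loss = F.mse_loss(value.view(-1), winner_z)
    policy_loss = -torch.mean(torch.sum(mcts_probs * log_act_probs, dim=1))
    # L2 regularization is typically handled via the optimizer's weight_decay
    return value_loss + policy_loss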
def minimax_mcts():
    initial_game = ConnectNGame(board_size=4, n=3)
    strategy = PlannedMinimaxStrategy(initial_game)
    strategy.load_state()
    planned_minimax_agent = AIAgent(strategy)
    mcts_rollout_player = MCTSRolloutPlayer(playout_num=1000)

    battle(initial_game, planned_minimax_agent, mcts_rollout_player, n_games=20)
def rollout_policy_value_fn(self, game: ConnectNGame) -> Tuple[Iterator[MoveWithProb], float]:
    """
    A function that takes in a game state and outputs a list of
    (action, probability) tuples and a score for the state.
    """
    # return uniform probabilities and a 0 score for pure MCTS
    move_list = game.get_avail_pos()
    action_probs = np.ones(len(move_list)) / len(move_list)
    return zip(move_list, action_probs), 0
def _get_action(self, game: ConnectNGame) -> Tuple[Pos, ActionProbs]:
    epsilon = 0.25
    avail_pos = game.get_avail_pos()
    move_probs: ActionProbs = np.zeros(game.board_size * game.board_size)
    assert len(avail_pos) > 0

    # acts and act_probs correspond to the pi defined in the AlphaGo Zero paper
    acts, act_probs = self._next_step_play_act_probs(game)
    move_probs[list(acts)] = act_probs
    if self._is_training:
        # add Dirichlet noise during training to encourage exploration
        p_ = (1 - epsilon) * act_probs + epsilon * np.random.dirichlet(0.3 * np.ones(len(act_probs)))
        move = np.random.choice(acts, p=p_)
        assert move in game.get_avail_pos()
    else:
        move = np.random.choice(acts, p=act_probs)

    self.reset()
    return move, move_probs
def action(self, game: ConnectNGame) -> Tuple[GameResult, Pos]:
    game = copy.deepcopy(game)

    player = game.current_player
    best_result = player * -1  # assume an opponent win as the worst result
    best_move = None
    for move in game.get_avail_pos():
        game.move(move)
        status = game.get_status()
        game.undo()
        result = self.dp_map[status]

        if player == ConnectNGame.PLAYER_A:
            best_result = max(best_result, result)
        else:
            best_result = min(best_result, result)
        # update best_move on any improvement
        best_move = move if best_result == result else best_move

    return best_result, best_move
def test_model(policy_value_net):
    initial_game = ConnectNGame(board_size=args.board_size, n=args.n_in_row)
    alphago_zero_player = MCTSAlphaGoZeroPlayer(policy_value_net, playout_num=args.playout_num)
    mcts_rollout_player = MCTSRolloutPlayer(playout_num=args.rollout_playout_num)

    win_ratio = battle(initial_game, alphago_zero_player, mcts_rollout_player, n_games=3)
    logging.warning(f'current self-play win_ratio:{win_ratio:.3f}')
    policy_value_net.save_model('./current_policy.model')
    if win_ratio > args.best_win_ratio:
        logging.warning(f'best policy {win_ratio:.3f}')
        args.best_win_ratio = win_ratio  # raise the bar for future evaluations
        # update the best_policy
        policy_value_net.save_model('./best_policy.model')
def policy_value_fn(self, board: ConnectNGame) -> Tuple[Iterator[MoveWithProb], float]:
    """
    input: board
    output: a list of (action, probability) tuples for each available action
        and the score of the board state
    """
    avail_pos_list = board.get_avail_pos()
    game_state = convert_game_state(board)
    current_state = np.ascontiguousarray(game_state.reshape(-1, 4, self.board_width, self.board_height))
    if self.use_gpu:
        log_act_probs, value = self.policy_value_net(Variable(torch.from_numpy(current_state)).cuda().float())
        pos_probs = np.exp(log_act_probs.data.cpu().numpy().flatten())
    else:
        log_act_probs, value = self.policy_value_net(Variable(torch.from_numpy(current_state)).float())
        pos_probs = np.exp(log_act_probs.data.numpy().flatten())
    value = float(value.data[0][0])
    # keep only the probabilities of the currently available positions
    return zip(avail_pos_list, pos_probs[avail_pos_list]), value
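policy_value_fn expects self.policy_value_net to map a (batch, 4, width, height) state tensor to per-cell log probabilities and a scalar value in [-1, 1]. The article's actual PolicyValueNet is defined elsewhere; the following is only a minimal compatible sketch (the class name and layer sizes are assumptions):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SimplePolicyValueNet(nn.Module):
    """Minimal sketch of a net whose forward() matches what policy_value_fn expects."""

    def __init__(self, board_width: int, board_height: int):
        super().__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        # policy head: a log-probability for every board cell
        self.policy_conv = nn.Conv2d(64, 4, kernel_size=1)
        self.policy_fc = nn.Linear(4 * board_width * board_height, board_width * board_height)
        # value head: a single scalar squashed into [-1, 1]
        self.value_conv = nn.Conv2d(64, 2, kernel_size=1)
        self.value_fc1 = nn.Linear(2 * board_width * board_height, 64)
        self.value_fc2 = nn.Linear(64, 1)

    def forward(self, state):
        x = F.relu(self.conv1(state))
        x = F.relu(self.conv2(x))
        p = F.relu(self.policy_conv(x)).view(state.size(0), -1)
        log_act_probs = F.log_softmax(self.policy_fc(p), dim=1)
        v = F.relu(self.value_conv(x)).view(state.size(0), -1)
        v = F.relu(self.value_fc1(v))
        value = torch.tanh(self.value_fc2(v))
        return log_act_probs, value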
def _playout(self, game: ConnectNGame):
    """
    Run a single playout from the current game state: follow the tree down to a
    leaf node (either a terminal state or an unexplored node), obtain the leaf
    value (the actual game outcome, or the action value estimated by the policy
    net), and propagate it back up to the root node.

    :param game: the game to play out (modified in place).
    """
    player_id = game.current_player

    node = self._current_root
    while True:
        if node.is_leaf():
            break
        act, node = node.select()
        game.move(act)

    # the game state now corresponds to a leaf of the tree: either a terminal node or an unexplored node
    act_and_probs: Iterator[MoveWithProb]
    act_and_probs, leaf_value = self._policy_value_net.policy_value_fn(game)

    if not game.game_over:
        # unexplored leaf node: expand it and propagate the leaf_value estimated by the policy net
        for act, prob in act_and_probs:
            game.move(act)
            child_node = node.expand(act, prob)
            game.undo()
    else:
        # the game has ended: propagate the actual outcome as leaf_value
        if game.game_result == ConnectNGame.RESULT_TIE:
            leaf_value = ConnectNGame.RESULT_TIE
        else:
            leaf_value = 1 if game.game_result == player_id else -1
        leaf_value = float(leaf_value)

    # propagate leaf_value up to the root; the sign is flipped because the
    # child node stores values from the opponent's perspective
    node.propagate_to_root(-leaf_value)
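The leaf value is negated before propagation because each node stores its value from the perspective of the player to move at that node, and that player alternates level by level. TreeNode.propagate_to_root is not shown in this section; a minimal sketch of the usual sign-alternating backup (the class and attribute names here are assumptions, not the article's actual TreeNode):

class TreeNodeSketch:
    """Hypothetical minimal node, only to illustrate the alternating-sign backup."""

    def __init__(self, parent=None):
        self._parent = parent
        self._visit_num = 0
        self._q = 0.0  # running mean of backed-up values

    def propagate_to_root(self, leaf_value: float):
        # Update this node, then recurse upward with the sign flipped,
        # since the parent evaluates the position from the opponent's perspective.
        self._visit_num += 1
        self._q += (leaf_value - self._q) / self._visit_num
        if self._parent is not None:
            self._parent.propagate_to_root(-leaf_value)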
                         [self.start_x + self.grid_size * (self.board_size - 1), y], 2)
    for c in range(self.board_size):
        x = self.start_x + c * self.grid_size
        pygame.draw.line(screen, (0, 0, 0), [x, self.start_y],
                         [x, self.start_y + self.grid_size * (self.board_size - 1)], 2)

    for r in range(self.board_size):
        for c in range(self.board_size):
            piece = self.connect_n_game.board[r][c]
            if piece != ConnectNGame.AVAILABLE:
                if piece == ConnectNGame.PLAYER_A:
                    color = (0, 0, 0)
                else:
                    color = (255, 255, 255)
                x = self.start_x + c * self.grid_size
                y = self.start_y + r * self.grid_size
                pygame.draw.circle(screen, color, [x, y], self.grid_size // 2)


if __name__ == '__main__':
    connectNGame = ConnectNGame()
    board = PyGameBoard(connectNGame)
    while not board.is_game_over():
        pos = board.next_user_input()
        board.move(pos)
    pygame.quit()
def rollout_policy_fn(self, game: ConnectNGame) -> Iterator[MoveWithProb]:
    """A coarse, fast version of policy_fn used in the rollout phase."""
    # rollout randomly
    action_probs = np.random.rand(len(game.get_avail_pos()))
    return zip(game.get_avail_pos(), action_probs)
                self.dp_map[game.get_status()] = result, move
            game.undo()

            ret = max(ret, result)
            best_move = move if ret == result else best_move
        self.dp_map[game_status] = ret, best_move
        return ret, best_move
    else:
        ret = math.inf
        for pos in game.get_avail_pos():
            move = pos
            result = game.move(pos)
            if result is None:
                assert not game.game_over
                result, opp_move = self.minimax(game.get_status())
                self.dp_map[game.get_status()] = result, opp_move
            else:
                self.dp_map[game.get_status()] = result, move
            game.undo()

            ret = min(ret, result)
            best_move = move if ret == result else best_move
        self.dp_map[game_status] = ret, best_move
        return ret, best_move


if __name__ == '__main__':
    tic_tac_toe = ConnectNGame(n=3, board_size=3)
    strategy = CountingMinimaxStrategy()
    strategy.action(tic_tac_toe)
    print(f'Game States Number {len(strategy.dp_map)}')
def action(self, game: ConnectNGame) -> Tuple[GameResult, Pos]:
    self.game = copy.deepcopy(game)
    self.dp_map = {}
    result, move = self.minimax(game.get_status())
    return result, move
    board = [[ConnectNGame.AVAILABLE] * N for _ in range(N)]
    for r in range(N):
        for c in range(N):
            board[c][N - 1 - r] = status[r][c]
    return tuple([tuple(board[i]) for i in range(N)])


def save_state(self):
    import pickle
    with open(f'planned_minimax_{self.game.n}_{self.game.board_size}.pickle', 'wb') as handle:
        pickle.dump(self.dp_map, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_state(self):
    import pickle
    with open(f'planned_minimax_{self.game.n}_{self.game.board_size}.pickle', 'rb') as handle:
        self.dp_map = pickle.load(handle)


if __name__ == '__main__':
    connect_n_game = ConnectNGame(n=3, board_size=4)
    strategy = PlannedMinimaxStrategy(connect_n_game)
    # strategy.save_state()
    strategy.load_state()
    print(strategy.action(connect_n_game))
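The loop above maps cell (r, c) to (c, N - 1 - r), i.e., a 90-degree board rotation, so applying it four times recovers the original status. A standalone sketch of that property (rotate90 is a hypothetical free-function version of the method above):

def rotate90(status):
    n = len(status)
    board = [[None] * n for _ in range(n)]
    for r in range(n):
        for c in range(n):
            board[c][n - 1 - r] = status[r][c]
    return tuple(tuple(row) for row in board)

status = ((1, 0, 0),
          (0, -1, 0),
          (0, 0, 0))
rotated = status
for _ in range(4):
    rotated = rotate90(rotated)
assert rotated == status  # four quarter turns give back the original board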
    play(env, planned_minimax_agent, planned_minimax_agent)


def play(env: ConnectNGym, agent1: BaseAgent, agent2: BaseAgent, render=True) -> GameResult:
    agents = [agent1, agent2]
    env.reset()
    board = env.pygame_board
    done = False
    agent_id = -1
    while not done:
        agent_id = (agent_id + 1) % 2
        agent = agents[agent_id]
        action = agent.get_action(board)
        _, reward, done, info = env.step(action)
        if render:
            env.render()
        if done:
            print(f'result={reward}')
            return reward


if __name__ == '__main__':
    board = PyGameBoard(connect_n_game=ConnectNGame(board_size=3, n=3))
    env = ConnectNGym(board)
    env.render(True)

    play_ai_vs_ai(env)
    # play_human_vs_ai(env)
            if result is None:
                assert not game.game_over
                self.alpha_beta_stack.append((alpha, beta))
                result, opp_move = self.alpha_beta_dp(game.get_status())
                self.alpha_beta_stack.pop()
            game.undo()

            beta = min(beta, result)
            ret = min(ret, result)
            best_move = move if ret == result else best_move
            if alpha >= beta or ret == -1:
                return ret, move
        return ret, best_move


if __name__ == '__main__':
    tic_tac_toe = ConnectNGame(n=5, board_size=7)
    # strategy = MinimaxDPStrategy(tic_tac_toe)
    strategy = AlphaBetaDPStrategy(tic_tac_toe)
    print(strategy.action())