def traverse(self, node: TreeNode, board: Board):
    """Descend to a leaf, then expand it with the policy-value function.

    :param node: Node to start the descent from.
    :param board: Board state matching ``node``; mutated in place as
        actions are applied during the descent.
    :return: ``(leaf_node, value)`` -- the node reached (and expanded if the
        game is not over) and the value to backpropagate.
    """
    # Selection: follow the greedily-best child until a leaf is reached.
    while node.children:
        action, node = node.choose_best_child(c=self.greedy_value)
        board.step(action)

    is_over, winner = board.result()
    if is_over:
        # Terminal state: the value comes from the game outcome,
        # not from the network.
        if winner == board.current_player:
            value = 1.0
        elif winner == -board.current_player:
            value = -1.0
        else:
            value = 0.0  # draw
        return node, value

    # Non-terminal leaf: query the policy-value function for move priors
    # and the state evaluation, then expand one child per legal action.
    action_probs, value = self.policy_value_function(board)
    for action, probability in action_probs:
        node.expand(action, probability)
    return node, value
def traverse(self, node: TreeNode, board: Board):
    """Descend to a leaf and expand it with uniform priors.

    :param node: Current node; descent starts here.
    :param board: The board; mutated in place during the descent.
    :return: <TreeNode> The leaf node that was reached (and expanded
        when the game is not yet over).
    """
    # Selection: walk down the tree while children exist.
    while node.children:
        action, node = node.choose_best_child(c=self.greedy_value)
        board.step(action)

    is_over, _ = board.result()
    if is_over:
        return node

    # Expansion: give every legal action an identical prior probability.
    legal = board.available_actions
    priors = np.full(len(legal), 1.0 / len(legal))
    for action, prior in zip(legal, priors):
        node.expand(action, prior)
    return node
def rollout(self, board: Board):
    """Play the game out with the rollout policy and return the winner.

    :param board: Board to simulate on; mutated in place.
    :return: Winner as reported by ``board.result()``.
    """
    is_over, winner = board.result()
    while not is_over:
        # Let the rollout policy pick and apply the next move.
        self.rollout_policy(board)
        is_over, winner = board.result()
    return winner
def traverse(self, node: TreeNode, board: Board):
    """Select a leaf node and expand it via the policy-value function.

    :param node: Node the selection phase starts from.
    :param board: Board matching ``node``; stepped in place while descending.
    :return: ``(node, value)`` -- the leaf and the value to backpropagate.
    """
    # Selection phase: keep choosing the best child while any exist.
    while len(node.children) > 0:
        action, node = node.choose_best_child(c=self.greedy_value)
        board.step(action)

    is_over, winner = board.result()
    if not is_over:
        # Expansion phase: priors and evaluation from the network.
        action_probs, value = self.policy_value_function(board)
        for action, probability in action_probs:
            node.expand(action, probability)
        return node, value

    # Terminal state: score relative to the player to move.
    value = (1.0 if winner == board.current_player
             else -1.0 if winner == -board.current_player
             else 0.0)
    return node, value
def start_until_game_over(player1: Player, player2: Player, board_renderer: BoardRenderer = None):
    """Let two players play a full game and return the winner.

    :param player1: Player 1 (moves when ``board.current_player == BOARD.o``).
    :param player2: Player 2.
    :param board_renderer: The board renderer; when provided, the board is
        drawn before every move and once more after the game ends.
    :return: <int> The winner returned by ``board.result()``.
    """
    board = Board()
    show = board_renderer is not None
    while True:
        # Render.
        if show:
            board.render(board_renderer)
        # Take action: pick whoever is to move.
        mover = player1 if board.current_player == BOARD.o else player2
        mover.take_action(board, is_output_action=show)
        # Game over?
        is_over, winner = board.result()
        if is_over:
            if show:
                board.render(board_renderer)
            return winner
def self_play(self, temp=1e-3):
    """Play one self-play game and collect training data.

    :param temp: Temperature parameter (degree of exploration) used when
        turning visit counts into move probabilities.
    :return: ``(board_inputs, all_action_probs, values)`` -- network inputs
        for every position, the search probabilities for every position,
        and the final reward from each mover's perspective.
    """
    board_inputs, all_action_probs, current_player = [], [], []
    board = Board()
    self.reset()
    while True:
        self.run(board, self.search_times)

        # Visit-count-derived move probabilities for this position.
        actions, probs = self.get_action_probs(temp=temp)
        action_probs = np.zeros((BOARD.board_size, BOARD.board_size))
        for move, p in zip(actions, probs):
            action_probs[move[0], move[1]] = p

        # Record the training sample for this position.
        board_inputs.append(self.board_to_xlabel(board))
        all_action_probs.append(action_probs)
        current_player.append(board.current_player)

        # action -> flatten_action
        flatten_actions = [move[0] * BOARD.board_size + move[1] for move in actions]

        # Add Dirichlet Noise for exploration in training.
        flatten_action = np.random.choice(
            flatten_actions,
            p=0.75 * probs + 0.25 * np.random.dirichlet(0.3 * np.ones(len(probs))))

        # flatten_action -> action
        action = (flatten_action // BOARD.board_size,
                  flatten_action % BOARD.board_size)
        board.step(action)

        # Reuse the searched subtree when the chosen move was expanded.
        if action in self.root.children:
            self.root = self.root.children[action]
            self.root.parent = None
        else:
            self.reset()

        is_over, winner = board.result()
        if is_over:
            values = np.zeros(len(current_player))
            if winner != 0:
                values[np.array(current_player) == winner] = 1
                values[np.array(current_player) != winner] = -1
            return board_inputs, all_action_probs, values
def play_web_game(is_stop, player1: Player, player2: Player, turn_to, send_board_step,
                  send_player1_running, send_player2_running, wait_human_action, game_over):
    """Drive a game over web callbacks until it ends or is stopped.

    :param is_stop: Callable polled to abort the game early.
    :param player1: Player on the ``BOARD.o`` side.
    :param player2: Opposing player.
    :param turn_to: Callback notified whose turn it is.
    :param send_board_step: Callback ``(side, action)`` after each move.
    :param send_player1_running: Progress callback for player 1's search.
    :param send_player2_running: Progress callback for player 2's search.
    :param wait_human_action: Callback ``(side, is_stop)`` that blocks for
        a human move.
    :param game_over: Callback invoked with the winner when the game ends.
    """
    board = Board()
    while not is_stop():
        turn_to(board.current_player)

        # Bind the side-specific pieces once; both sides share one path.
        if board.current_player == BOARD.o:
            player, side, send_running = player1, 1, send_player1_running
        else:
            player, side, send_running = player2, 2, send_player2_running

        if isinstance(player, Human):
            action = wait_human_action(side, is_stop)
            if is_stop():
                return
            board.step(action)
        else:
            action = player.take_action(board, is_output_action=False,
                                        running_output_function=send_running,
                                        is_stop=is_stop)
        send_board_step(side, action)

        is_over, winner = board.result()
        if is_over:
            game_over(winner)
            return
def start_until_game_over(player1: Player, player2: Player, board_renderer: BoardRenderer = None):
    """Player player1 and player2 play on the board until the game is over.

    :param player1: Player 1 (acts when it is ``BOARD.o``'s turn).
    :param player2: Player 2.
    :param board_renderer: The board renderer; rendering is skipped entirely
        when it is ``None``.
    :return: <int> The winner returned by ``board.result()``.
    """
    board = Board()
    is_over, winner = False, None
    while not is_over:
        # Render.
        if board_renderer is not None:
            board.render(board_renderer)
        # Take action.
        if board.current_player == BOARD.o:
            player1.take_action(board, is_output_action=board_renderer is not None)
        else:
            player2.take_action(board, is_output_action=board_renderer is not None)
        # Game over?
        is_over, winner = board.result()
    # Show the final position before reporting the result.
    if board_renderer is not None:
        board.render(board_renderer)
    return winner
def rollout(self, board: Board):
    """Simulation: play out the position and report the winner.

    :param board: The board; mutated in place by each rollout move.
    :return: winner<int> The winner reported by ``board.result()``.
    """
    while True:
        is_over, winner = board.result()
        if is_over:
            return winner
        # Decision making next step.
        self.rollout_policy(board)
def self_play(self, temp=1e-3):
    """Self-play one game; return positions, search probabilities, rewards.

    :param temp: Temperature parameter (degree of exploration) used when
        converting visit counts into move probabilities.
    :return: ``(board_inputs, all_action_probs, values)`` -- network inputs
        for each visited position, the search-derived action probability
        grids, and +1/-1/0 rewards from each mover's perspective.
    """
    board_inputs, all_action_probs, current_player = [], [], []
    board = Board()
    self.reset()
    while True:
        self.run(board, self.search_times)

        # actions, probs -> dense probability grid for this position.
        actions, probs = self.get_action_probs(temp=temp)
        action_probs = np.zeros((BOARD.board_size, BOARD.board_size))
        for action, prob in zip(actions, probs):
            action_probs[action[0], action[1]] = prob

        # Collect the self-play sample.
        board_inputs.append(self.board_to_xlabel(board))
        all_action_probs.append(action_probs)
        current_player.append(board.current_player)

        # action -> flatten_action
        flatten_actions = [one_action[0] * BOARD.board_size + one_action[1]
                           for one_action in actions]

        # Dirichlet noise keeps self-play exploratory during training.
        flatten_action = np.random.choice(
            flatten_actions,
            p=0.75 * probs + 0.25 * np.random.dirichlet(0.3 * np.ones(len(probs))))

        # flatten_action -> action
        action = (flatten_action // BOARD.board_size,
                  flatten_action % BOARD.board_size)
        board.step(action)

        # Reuse the searched subtree when the chosen move was expanded.
        if action in self.root.children:
            self.root = self.root.children[action]
            self.root.parent = None
        else:
            self.reset()

        is_over, winner = board.result()
        if is_over:
            values = np.zeros(len(current_player))
            if winner != 0:
                # Build the player array once instead of twice.
                players = np.array(current_player)
                values[players == winner] = 1
                values[players != winner] = -1
            return board_inputs, all_action_probs, values
def traverse(self, node: TreeNode, board: Board):
    """Descend to a leaf; expand it uniformly unless the game is over.

    :param node: Node the descent starts from.
    :param board: Board matching ``node``; stepped in place while descending.
    :return: The leaf node reached (expanded when the game continues).
    """
    # Follow the best child as long as the node has children.
    while len(node.children) > 0:
        action, node = node.choose_best_child(c=self.greedy_value)
        board.step(action)

    is_over, _ = board.result()
    if not is_over:
        # Uniform prior over every legal action.
        legal = board.available_actions
        priors = np.ones(len(legal)) / len(legal)
        for action, prior in zip(legal, priors):
            node.expand(action, prior)
    return node
def rollout(self, board: Board):
    """Simulation: play the position out with the rollout policy.

    Leftover per-step timing instrumentation (an unused ``time`` list filled
    with ``datetime`` deltas, plus a stray ``pass``) has been removed -- it
    was never read and had no effect on the result. The docstring also
    documented a nonexistent ``node`` parameter, now corrected.

    :param board: The board; mutated in place by each rollout move.
    :return: winner<int> The winner reported by ``board.result()``.
    """
    while True:
        is_over, winner = board.result()
        if is_over:
            return winner
        # Decide and apply the next move via the rollout policy.
        self.rollout_policy(board)
def start_until_game_over(player1: Player, player2: Player, board_renderer: BoardRenderer = None):
    """Run a complete game between two players and return the winner.

    :param player1: Player moving when ``board.current_player == BOARD.o``.
    :param player2: Opposing player.
    :param board_renderer: Optional renderer; used before each move and
        once more after the final move.
    :return: Winner as reported by ``board.result()``.
    """
    board = Board()
    while True:
        if board_renderer is not None:
            board.render(board_renderer)
        # Dispatch the move to whichever player is on turn.
        acting = player1 if board.current_player == BOARD.o else player2
        acting.take_action(board, is_output_action=board_renderer is not None)
        is_over, winner = board.result()
        if is_over:
            if board_renderer is not None:
                board.render(board_renderer)
            return winner
def traverse(self, node: TreeNode, board: Board):
    """Descend until the game ends or a leaf is expanded.

    :param node: Current node; descent starts here.
    :param board: The board; stepped in place during the descent.
    :return: ``(node, board)`` -- the reached node and the (mutated) board.
    """
    is_over, _ = board.result()
    while not is_over:
        if not node.children:
            # Leaf reached: expand all children with a uniform prior, then stop.
            legal = board.available_actions
            uniform = np.ones(len(legal)) / len(legal)
            for action, prior in zip(legal, uniform):
                node.expand(action, prior)
            break
        # NOTE(review): the exploration constant is hardcoded (c=5.0) here,
        # unlike sibling implementations that pass a configurable value --
        # confirm this is intentional.
        action, node = node.choose_best_child(c=5.0)
        board.step(action)
        is_over, _ = board.result()
    return node, board