def expand_node(node: Node, to_play: Player, actions: List[Action],
                network_output: NetworkOutput):
    """
    We expand a node using the value, reward and policy prediction obtained
    from the neural network.
    """
    node.to_play = to_play
    node.hidden_state = network_output.hidden_state
    node.reward = network_output.reward
    # Softmax over the policy logits, restricted to the legal actions.
    policy = {a: numpy.exp(network_output.policy_logits[a]) for a in actions}
    policy_sum = sum(policy.values())
    for action, p in policy.items():
        node.children[action] = Node(p / policy_sum)
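
# A minimal sketch of the Node structure the functions in this excerpt assume,
# following the DeepMind MuZero pseudocode; the field names match that
# reference and are an assumption about this repository, not taken from it.
class Node:

    def __init__(self, prior: float):
        self.visit_count = 0
        self.to_play = -1
        self.prior = prior
        self.value_sum = 0
        self.children = {}
        self.hidden_state = None
        self.reward = 0

    def expanded(self) -> bool:
        return len(self.children) > 0

    def value(self) -> float:
        # Mean value over all simulations that passed through this node.
        if self.visit_count == 0:
            return 0
        return self.value_sum / self.visit_count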
def play_game(config: MuZeroConfig, network: AbstractNetwork, train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until
    the end of the game is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences
        # and the model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
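
# play_game relies on two helpers not shown in this excerpt. The sketches
# below follow the DeepMind pseudocode, extended with a hypothetical 'mode'
# switch matching the call above: Dirichlet noise is mixed into the root
# priors during training, and moves are sampled from the visit-count
# distribution ('softmax') or taken greedily ('max').
import numpy

def add_exploration_noise(config: MuZeroConfig, node: Node):
    """Mix Dirichlet noise into the root priors to encourage exploration."""
    actions = list(node.children.keys())
    noise = numpy.random.dirichlet([config.root_dirichlet_alpha] * len(actions))
    frac = config.root_exploration_fraction
    for a, n in zip(actions, noise):
        node.children[a].prior = node.children[a].prior * (1 - frac) + n * frac


def select_action(config: MuZeroConfig, num_moves: int, node: Node,
                  network: AbstractNetwork, mode: str = 'softmax') -> Action:
    """Pick a move from the root visit counts; assumes the Node sketch above."""
    actions = list(node.children.keys())
    visit_counts = numpy.array(
        [node.children[a].visit_count for a in actions], dtype=numpy.float64)
    if mode == 'softmax':
        # Temperature schedule as in the pseudocode config (an assumption here).
        t = config.visit_softmax_temperature_fn(
            num_moves=num_moves, training_steps=network.training_steps())
        distribution = visit_counts ** (1 / t)
        distribution /= distribution.sum()
        return numpy.random.choice(actions, p=distribution)
    return actions[int(visit_counts.argmax())]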
def store_search_statistics(self, root: Node):
    """After each MCTS run, store the statistics generated by the search."""
    sum_visits = sum(child.visit_count for child in root.children.values())
    action_space = (Action(index) for index in range(self.action_space_size))
    self.child_visits.append([
        root.children[a].visit_count / sum_visits if a in root.children else 0
        for a in action_space
    ])
    self.root_values.append(root.value())
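
# Worked example (hypothetical numbers): after 100 simulations over an action
# space of size 4, with visit counts 30/50/20 on three of the actions, the
# stored policy target is the normalized visit distribution, and unvisited
# actions get probability 0.
visits = {0: 30, 1: 50, 2: 20}   # action index -> visit count
sum_visits = sum(visits.values())
policy_target = [visits.get(i, 0) / sum_visits for i in range(4)]
print(policy_target)             # [0.3, 0.5, 0.2, 0.0]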
def ucb_score(config: MuZeroConfig, parent: Node, child: Node,
              min_max_stats: MinMaxStats) -> float:
    """
    The score for a node is based on its value, plus an exploration bonus
    based on the prior.
    """
    pb_c = math.log((parent.visit_count + config.pb_c_base + 1) /
                    config.pb_c_base) + config.pb_c_init
    pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)

    prior_score = pb_c * child.prior
    value_score = min_max_stats.normalize(child.value())
    return prior_score + value_score
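
# ucb_score normalizes the value term with running min-max statistics over all
# values seen in the current search tree, so that value_score and prior_score
# are on a comparable scale (the MuZero paper uses pb_c_init = 1.25 and
# pb_c_base = 19652). A minimal sketch of MinMaxStats, simplified from the
# DeepMind pseudocode (which also accepts known value bounds):
class MinMaxStats:
    """Holds the min and max value observed inside the search tree."""

    def __init__(self):
        self.maximum = -float('inf')
        self.minimum = float('inf')

    def update(self, value: float):
        self.maximum = max(self.maximum, value)
        self.minimum = min(self.minimum, value)

    def normalize(self, value: float) -> float:
        if self.maximum > self.minimum:
            # Only normalize once both bounds have actually been observed.
            return (value - self.minimum) / (self.maximum - self.minimum)
        return value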
def play_game(config: MuZeroConfig, storage: SharedStorage, train: bool = True,
              visual: bool = False, queue: Queue = None) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until
    the end of the game is reached.
    """
    # Worker processes fetch their own copy of the latest network; the main
    # process reads it from shared storage directly.
    if queue:
        network = storage.latest_network_for_process()
    else:
        network = storage.current_network

    start = time()
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences
        # and the model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
        if visual:
            game.env.render()

    if visual:
        if game.terminal():
            print('Model lost game')
        else:
            print('Exceeded max moves')
        game.env.close()

    if queue:
        queue.put(game)

    print("Finished game episode after " + str(time() - start) +
          " seconds. Exceeded max moves? " + str(not game.terminal()))
    print("Score: ", sum(game.rewards))
    return game
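
# A hypothetical driver for the variant above, sketching how self-play workers
# might push finished games onto a multiprocessing queue for the trainer to
# drain; the process/queue wiring here is an assumption, not part of this repo.
from multiprocessing import Process, Queue

def launch_self_play_workers(config: MuZeroConfig, storage: SharedStorage,
                             num_workers: int) -> Queue:
    """Start num_workers self-play processes that each produce one game."""
    queue = Queue()
    for _ in range(num_workers):
        worker = Process(target=play_game, args=(config, storage),
                         kwargs={'train': True, 'queue': queue})
        worker.start()
    return queue  # the trainer calls queue.get() once per finished game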
def run_mcts(config: MuZeroConfig, action_history: ActionHistory,
             network: BaseNetwork, game, train):
    """
    Core Monte Carlo Tree Search algorithm.
    To decide on an action, we run N simulations, always starting at the root
    of the search tree and traversing the tree according to the UCB formula
    until we reach a leaf node.
    """
    root = Node(0)
    current_observation = game.make_observation(-1)
    # Debug: dump a textual view of the current observation.
    print(game.make_observation_str())
    expand_node(root, game.to_play(), game.legal_actions(),
                network.initial_inference(current_observation))
    if train:
        add_exploration_noise(config, root)

    # Track the min and max values seen in the tree so ucb_score can
    # normalize the value term.
    min_max_stats = MinMaxStats()

    for _ in range(config.num_simulations):
        t0 = time.time()
        history = action_history.clone()
        node = root
        search_path = [node]

        while node.expanded():
            action, node = select_child(config, node, min_max_stats)
            history.add_action(action)
            search_path.append(node)

        # Inside the search tree we use the dynamics function to obtain the
        # next hidden state given an action and the previous hidden state.
        parent = search_path[-2]
        t1 = time.time()
        network_output = network.recurrent_inference(parent.hidden_state,
                                                     history.last_action().index)
        t2 = time.time()
        expand_node(node, history.to_play(), history.action_space(), network_output)
        backpropagate(search_path, network_output.value, history.to_play(),
                      config.discount, min_max_stats)
        t3 = time.time()

        # Rough per-simulation profiling: tree traversal vs. network inference.
        print("cpu time", t1 - t0 + t3 - t2)
        print("gpu time", t2 - t1)
    return root
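
# run_mcts assumes select_child and backpropagate helpers. Minimal sketches,
# following the DeepMind pseudocode and the min_max_stats threading used above:
def select_child(config: MuZeroConfig, node: Node, min_max_stats: MinMaxStats):
    """Choose the child with the highest UCB score."""
    _, action, child = max(
        (ucb_score(config, node, child, min_max_stats), action, child)
        for action, child in node.children.items())
    return action, child


def backpropagate(search_path: List[Node], value: float, to_play: Player,
                  discount: float, min_max_stats: MinMaxStats):
    """Propagate the leaf evaluation up to the root, flipping sign for the opponent."""
    for node in reversed(search_path):
        node.value_sum += value if node.to_play == to_play else -value
        node.visit_count += 1
        min_max_stats.update(node.value())
        value = node.reward + discount * value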