def run_mcts(config: MuZeroConfig, root: Node, action_history: ActionHistory,
             network: BaseNetwork):
    """Core Monte Carlo Tree Search algorithm.

    To decide on an action, we run N simulations, always starting at the root
    of the search tree and traversing the tree according to the UCB formula
    until we reach a leaf node.
    """
    min_max_stats = MinMaxStats(config.known_bounds)

    for _ in range(config.num_simulations):
        history = action_history.clone()
        node = root
        search_path = [node]

        while node.expanded():
            action, node = select_child(config, node, min_max_stats)
            history.add_action(action)
            search_path.append(node)

        # Inside the search tree we use the dynamics function to obtain the
        # next hidden state given an action and the previous hidden state.
        parent = search_path[-2]
        network_output = network.recurrent_inference(parent.hidden_state,
                                                     history.last_action())
        expand_node(node, history.to_play(), history.action_space(),
                    network_output)

        backpropagate(search_path, network_output.value, history.to_play(),
                      config.discount, min_max_stats)
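# select_child and the UCB formula mentioned in the docstring are defined
# elsewhere. A minimal sketch, assuming the pUCT rule from the published
# MuZero pseudocode with values normalized by MinMaxStats; pb_c_base and
# pb_c_init are assumed to live on the config, and the exact scoring in this
# codebase may differ:
import math

def select_child(config: MuZeroConfig, node: Node, min_max_stats: MinMaxStats):
    # Pick the child with the highest pUCT score.
    action, child = max(
        node.children.items(),
        key=lambda item: ucb_score(config, node, item[1], min_max_stats))
    return action, child

def ucb_score(config: MuZeroConfig, parent: Node, child: Node,
              min_max_stats: MinMaxStats) -> float:
    # Exploration bonus: grows with the parent's visit count, shrinks with
    # the child's, and is scaled by the child's prior probability.
    pb_c = math.log((parent.visit_count + config.pb_c_base + 1) /
                    config.pb_c_base) + config.pb_c_init
    pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)
    prior_score = pb_c * child.prior
    # Exploitation term: the child's mean value, normalized to [0, 1] using
    # the bounds observed so far in this search.
    value_score = min_max_stats.normalize(child.value())
    return prior_score + value_score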
import time

def run_mcts(config: MuZeroConfig, action_history: ActionHistory,
             network: BaseNetwork, game, train):
    """Core Monte Carlo Tree Search algorithm.

    To decide on an action, we run N simulations, always starting at the root
    of the search tree and traversing the tree according to the UCB formula
    until we reach a leaf node.
    """
    # Build and expand the root from the current observation, rather than
    # receiving it as an argument.
    root = Node(0)
    current_observation = game.make_observation(-1)
    print(game.make_observation_str())  # Debug: show the current position.
    expand_node(root, game.to_play(), game.legal_actions(),
                network.initial_inference(current_observation))
    if train:
        add_exploration_noise(config, root)

    for _ in range(config.num_simulations):
        t0 = time.time()
        history = action_history.clone()
        node = root
        search_path = [node]

        while node.expanded():
            action, node = select_child(config, node)
            history.add_action(action)
            search_path.append(node)

        # Inside the search tree we use the dynamics function to obtain the
        # next hidden state given an action and the previous hidden state.
        parent = search_path[-2]
        t1 = time.time()
        network_output = network.recurrent_inference(
            parent.hidden_state, history.last_action().index)
        t2 = time.time()
        expand_node(node, history.to_play(), history.action_space(),
                    network_output)
        backpropagate(search_path, network_output.value, history.to_play(),
                      config.discount)
        t3 = time.time()
        # Rough per-simulation timing: tree traversal plus expansion and
        # backprop ("cpu") versus the network call ("gpu").
        print("cpu time", t1 - t0 + t3 - t2)
        print("gpu time", t2 - t1)

    return root
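# add_exploration_noise, called above only when train is True, is not shown.
# A minimal sketch, assuming the Dirichlet root noise from the published
# MuZero pseudocode; root_dirichlet_alpha and root_exploration_fraction are
# assumed to live on the config:
import numpy as np

def add_exploration_noise(config: MuZeroConfig, node: Node):
    actions = list(node.children.keys())
    noise = np.random.dirichlet([config.root_dirichlet_alpha] * len(actions))
    frac = config.root_exploration_fraction
    for action, n in zip(actions, noise):
        # Blend the network prior with noise so every root action keeps a
        # nonzero chance of being explored.
        node.children[action].prior = (
            node.children[action].prior * (1 - frac) + n * frac)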
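# backpropagate closes the simulation loop in both run_mcts variants. A
# minimal sketch, assuming the two-player form from the published MuZero
# pseudocode; the timed variant above drops the MinMaxStats argument, and a
# single-player game would also drop the to_play sign flip:
from typing import List

def backpropagate(search_path: List[Node], value: float, to_play: Player,
                  discount: float, min_max_stats: MinMaxStats):
    # Walk the search path leaf-to-root, folding each node's reward into the
    # bootstrapped value estimate as we go.
    for node in reversed(search_path):
        node.value_sum += value if node.to_play == to_play else -value
        node.visit_count += 1
        min_max_stats.update(node.value())
        value = node.reward + discount * value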
def action_history(self) -> ActionHistory:
    """Return the actions executed inside the search."""
    return ActionHistory(self.history, self.action_space_size)
def clone(self):
    return ActionHistory(self.history, self.action_space_size)
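# For context on the two methods above: clone() can pass self.history through
# directly because the constructor copies the list. A minimal sketch of the
# surrounding ActionHistory class, assuming the shape used in the published
# MuZero pseudocode; Action and Player are this codebase's own types:
from typing import List

class ActionHistory:
    """Simple history container used inside the search.

    Only used to keep track of the actions executed.
    """

    def __init__(self, history: List[Action], action_space_size: int):
        self.history = list(history)
        self.action_space_size = action_space_size

    def add_action(self, action: Action):
        self.history.append(action)

    def last_action(self) -> Action:
        return self.history[-1]

    def action_space(self) -> List[Action]:
        return [Action(i) for i in range(self.action_space_size)]

    def to_play(self) -> Player:
        return Player()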