Example #1
def expand_node(node: Node, to_play: Player, actions: List[Action],
                network_output: NetworkOutput):
    """
    We expand a node using the value, reward and policy prediction obtained from
    the neural networks.
    """
    node.to_play = to_play
    node.hidden_state = network_output.hidden_state
    node.reward = network_output.reward
    policy = {a: numpy.exp(network_output.policy_logits[a]) for a in actions}
    policy_sum = sum(policy.values())
    for action, p in policy.items():
        node.children[action] = Node(p / policy_sum)
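For intuition, the dictionary comprehension above is a softmax restricted to the legal actions: illegal actions get no child node at all, and the remaining probability mass is renormalized. A minimal sketch of that arithmetic with invented logits (the numbers and action indices are made up for illustration):

import numpy

policy_logits = {0: 1.0, 1: 0.0, 2: -1.0}  # hypothetical logits, keyed by action index
legal_actions = [0, 2]                      # action 1 is assumed illegal here

policy = {a: numpy.exp(policy_logits[a]) for a in legal_actions}
policy_sum = sum(policy.values())
priors = {a: p / policy_sum for a, p in policy.items()}
print(priors)  # {0: ~0.88, 2: ~0.12} -- mass is renormalized over legal actions only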
Example #2
def play_game(config: MuZeroConfig,
              network: AbstractNetwork,
              train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config,
                               len(game.history),
                               root,
                               network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
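select_action is referenced here but not shown. A plausible sketch, assuming it chooses a move from the root's visit counts: sampling proportionally in 'softmax' mode for exploration during training, taking the argmax in 'max' mode for evaluation. The real implementation may instead apply a visit-count temperature schedule; this is not the repository's code.

import numpy

def select_action(config, num_moves, node, network, mode='softmax'):
    # Hypothetical sketch. num_moves and network are accepted only to match
    # the call site above; a temperature schedule could make use of them.
    actions = list(node.children.keys())
    visit_counts = numpy.array(
        [node.children[a].visit_count for a in actions], dtype=float)
    if mode == 'softmax':
        # Sample an action in proportion to its visit count.
        probs = visit_counts / visit_counts.sum()
        return actions[numpy.random.choice(len(actions), p=probs)]
    # Greedy: pick the most-visited child.
    return actions[int(visit_counts.argmax())]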
Example #3
    def store_search_statistics(self, root: Node):
        """After each MCTS run, store the statistics generated by the search."""

        sum_visits = sum(child.visit_count for child in root.children.values())
        action_space = (Action(index) for index in range(self.action_space_size))
        self.child_visits.append([
            root.children[a].visit_count / sum_visits if a in root.children else 0
            for a in action_space
        ])
        self.root_values.append(root.value())
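With invented numbers: a root whose children at action indices 0 and 2 were visited 8 and 2 times, over an action space of size 4, yields the policy target [0.8, 0.0, 0.2, 0.0]; unvisited and illegal actions contribute zero.

visit_counts = {0: 8, 2: 2}   # hypothetical visit counts, keyed by action index
sum_visits = sum(visit_counts.values())
target = [visit_counts.get(a, 0) / sum_visits for a in range(4)]
print(target)  # [0.8, 0.0, 0.2, 0.0]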
Example #4
def ucb_score(config: MuZeroConfig, parent: Node, child: Node,
              min_max_stats: MinMaxStats) -> float:
    """
    The score for a node is based on its value, plus an exploration bonus based on
    the prior.
    """
    pb_c = math.log((parent.visit_count + config.pb_c_base + 1) /
                    config.pb_c_base) + config.pb_c_init
    pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)

    prior_score = pb_c * child.prior
    value_score = min_max_stats.normalize(child.value())
    return prior_score + value_score
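This score drives the in-tree traversal: select_child (called from run_mcts in Example #6 below) presumably picks the child with the highest ucb_score. A minimal sketch; note that Example #6 calls a two-argument variant, which would have to obtain its MinMaxStats elsewhere.

def select_child(config, node, min_max_stats):
    # Hypothetical sketch: return the (action, child) pair maximizing ucb_score.
    return max(node.children.items(),
               key=lambda item: ucb_score(config, node, item[1], min_max_stats))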
Example #5
def play_game(config: MuZeroConfig,
              storage: SharedStorage,
              train: bool = True,
              visual: bool = False,
              queue: Queue = None) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    if queue:
        network = storage.latest_network_for_process()
    else:
        network = storage.current_network

    start = time()
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'
    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config,
                               len(game.history),
                               root,
                               network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
        if visual:
            game.env.render()
    if visual:
        if game.terminal():
            print('Model lost game')
        else:
            print('Exceeded max moves')
        game.env.close()

    if queue:
        queue.put(game)
    print("Finished game episode after " + str(time() - start) +
          " seconds. Exceeded max moves? " + str(not game.terminal()))
    print("Score: ", sum(game.rewards))
    return game
Example #6
def run_mcts(config: MuZeroConfig, action_history: ActionHistory,
             network: BaseNetwork, game, train):
    """
    Core Monte Carlo Tree Search algorithm.
    To decide on an action, we run N simulations, always starting at the root of
    the search tree and traversing the tree according to the UCB formula until we
    reach a leaf node.
    """
    root = Node(0)
    current_observation = game.make_observation(-1)

    print(game.make_observation_str())

    expand_node(root, game.to_play(), game.legal_actions(),
                network.initial_inference(current_observation))
    if train:
        add_exploration_noise(config, root)

    for _ in range(config.num_simulations):
        t0 = time.time()
        history = action_history.clone()
        node = root
        search_path = [node]

        while node.expanded():
            action, node = select_child(config, node)
            history.add_action(action)
            search_path.append(node)

        # Inside the search tree we use the dynamics function to obtain the next
        # hidden state given an action and the previous hidden state.
        parent = search_path[-2]
        t1 = time.time()
        network_output = network.recurrent_inference(
            parent.hidden_state,
            history.last_action().index)
        t2 = time.time()
        expand_node(node, history.to_play(), history.action_space(),
                    network_output)

        backpropagate(search_path, network_output.value, history.to_play(),
                      config.discount)
        t3 = time.time()
        print("cpu time", t1 - t0 + t3 - t2)
        print("gpu time", t2 - t1)
    return root
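backpropagate is called above but not shown. A sketch matching the four-argument call site (the reference MuZero pseudocode additionally updates a MinMaxStats normalizer inside this loop; that is omitted here to match the signature used above):

def backpropagate(search_path, value, to_play, discount):
    # Walk the search path leaf-to-root: credit the value to each node with
    # the correct sign for the player to move, then discount it through the
    # node's predicted reward on the way up.
    for node in reversed(search_path):
        node.value_sum += value if node.to_play == to_play else -value
        node.visit_count += 1
        value = node.reward + discount * value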
Example #7
def play_game(config: MuZeroConfig, network: AbstractNetwork, train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(), network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network, mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
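A minimal sketch of how play_game is typically driven during self-play. The replay_buffer argument and its save_game method follow the reference MuZero pseudocode but are assumptions here, not part of the examples above.

def run_selfplay(config: MuZeroConfig, network: AbstractNetwork,
                 replay_buffer, num_games: int = 100) -> None:
    # Each finished game becomes training data for the networks.
    for _ in range(num_games):
        game = play_game(config, network, train=True)
        replay_buffer.save_game(game)  # save_game is assumed from the replay buffer API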