Example #1
    def _search(self, state: GameState, model: Model, root: bool = False):
        """
        Helper function for self.search allowing for recursive search calls using only one state instance
        :param state: The game state on which the mcts should be performed
        :param model: The model that will provide the mcts with (policy, value) estimations
        :param root: A boolean indicating whether this is the root of the search tree. Dirichlet noise is added to the
                     policy at the root to increase exploration
        :return: The z-value, corresponding to whether the search resulted in a win (z=1), loss (z=-1) or draw (z=0)
                 The z-value is negated once returned to account for changing player perspectives
        """
        if state.is_terminal():
            return -state.get_reward()

        actions = state.get_actions()
        if state not in self:  # Expand the search tree
            # Store for each node:
            # - Q: The expected reward for taking action a from the game state
            # - N: The number of times action a was performed from the game state during simulations
            # - P: A policy determining which move to take as decided by the neural network
            # - v: A value for this game state (from the current player's perspective) as decided by the neural network
            q, n = {a: 0 for a in actions}, {a: 0 for a in actions}
            p, v = model.predict(state)
            self[state] = (p, v, q, n)
            return -v

        # The game state already occurs in the search tree
        # Recall policy p, value v, expected reward q, simulation count n
        p, v, q, n = self[state]

        # Add noise to the action selection in the root node for increased exploration
        if root:
            noise = np.random.dirichlet([self.alpha] * len(p))
            for i, a in enumerate(p):
                p[a] = (1 - self.epsilon) * p[a] + self.epsilon * noise[i]

        # Pick an action to explore by maximizing U, the upper confidence bound on the Q-values
        u = {
            a: self.c_puct * p[a] * sqrt(sum(n.values())) / (1 + n[a])
            for a in actions
        }
        a = max(actions, key=lambda a: q[a] + u[a])

        # Perform the selected action on the state and continue the simulation
        # Negate the value returned, as the 'current player' perspective is changed
        v = self._search(state.do_move(a), model)

        # Propagate the reward back up the tree
        q[a] = (n[a] * q[a] + v) / (n[a] + 1)
        n[a] += 1
        return -v
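
The docstring above refers to a public self.search wrapper. The sketch below shows one way such a driver could look, purely as an illustration: the num_simulations attribute is assumed (it does not appear in the example), the move probabilities are taken proportional to the visit counts n, and do_move is assumed not to invalidate the root state between simulations.

    def search(self, state: GameState, model: Model):
        # Hypothetical driver (assumed attribute num_simulations): run repeated
        # simulations from the root, then read back the visit counts stored by _search
        for _ in range(self.num_simulations):
            self._search(state, model, root=True)

        _, _, _, n = self[state]  # Visit counts per action at the root
        total = sum(n.values())
        # Move probabilities proportional to how often each action was explored
        return {a: n[a] / total for a in n}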
Example #2
    def predict(self, s: GameState):
        """
        Predict a (p, v)-pair by giving the state to the neural network, where
            p is a dictionary containing a probability distribution over all legal actions
            v is a scalar value estimating the probability of winning from state s
        :param s: The input state for the network
        :return: The predicted (p, v)-tuple
        """
        state = self.state_input(s)
        # Reshape the input into a batch containing a single data point
        state = np.reshape(state, (1,) + state.shape)

        # Obtain the predictions of the network
        [p_all], [[v]] = self.get_model().predict(state)

        # Mask all illegal actions
        legal_actions = s.get_actions()
        mask = np.zeros(self.output_size)
        for a in legal_actions:
            mask[self.action_index[a]] = 1
        p_all *= mask

        # Normalize the distribution so the probabilities sum to 1
        p_all /= p_all.sum()

        # Get the distribution over all legal actions
        p = {a: p_all[self.action_index[a]] for a in legal_actions}

        return p, v
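
To isolate the masking and renormalization step, here is a minimal, self-contained numpy sketch with made-up numbers; the indices and probabilities are illustrative only and do not come from the example.

import numpy as np

# Raw network output over five actions; assume only indices 1 and 3 are legal
p_all = np.array([0.10, 0.30, 0.25, 0.15, 0.20])
mask = np.zeros_like(p_all)
mask[[1, 3]] = 1

p_all *= mask          # Zero out the illegal actions
p_all /= p_all.sum()   # Renormalize so the remaining probabilities sum to 1

print(p_all)  # approximately [0, 0.667, 0, 0.333, 0]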
Example #3
def ask_move(state: GameState):
    print("Current game state:")
    print(state)
    print("Choose from possible actions: (by index)")
    actions = state.get_actions()
    print(list(enumerate(actions)))
    while True:
        input_index = input()
        try:
            input_index = int(input_index)
        except ValueError:
            continue
        if 0 <= input_index < len(actions):
            break
    a = actions[input_index]
    return a
Example #4
def ask_model(state: GameState, model: Model, take_max: bool = False):
    p, _ = model.predict(state)

    actions = state.get_actions()

    # Normalize the distribution to only include valid actions
    valid_a_dist = {a: p[a] for a in actions}
    a_dist = {a: valid_a_dist[a] / sum(valid_a_dist.values()) for a in actions}

    # Sample an action from the distribution
    if take_max:
        a = max(a_dist, key=a_dist.get)
    else:
        items = list(a_dist.items())
        probs = [prob for _, prob in items]  # Avoid shadowing the policy dict p
        a = items[np.random.choice(len(items), p=probs)][0]
    return a
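
The two selection modes in ask_model (greedy versus sampled) can be demonstrated in isolation with a toy distribution; the action labels below are made up and only serve to show the np.random.choice pattern.

import numpy as np

# Toy policy over three invented action labels
a_dist = {"left": 0.2, "stay": 0.5, "right": 0.3}

# Greedy selection (take_max=True): pick the most probable action
greedy = max(a_dist, key=a_dist.get)

# Sampled selection (take_max=False): draw an index according to the
# probabilities, then map it back to the corresponding action
items = list(a_dist.items())
sampled = items[np.random.choice(len(items), p=[prob for _, prob in items])][0]

print(greedy, sampled)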
Example #5
    def state_input(self, s: GameState):
        device = torch.device('cuda' if self.use_cuda else 'cpu')
        return torch.Tensor(s.grid * s.get_current_player()).unsqueeze(0).to(device)
Example #6
def random_move(state: GameState):
    actions = state.get_actions()
    return actions[np.random.randint(0, len(actions))]
Example #7
    def predict(self, s: GameState) -> tuple:
        # Gives a uniform distribution as policy and a value of 0.5
        actions = s.get_actions()
        return {a: 1 / len(actions) for a in actions}, 0.5
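
As a closing sketch, the pieces above could be combined into a simple game loop. Everything here is an assumption for illustration: UniformModel just wraps the uniform predict from Example #7, ask_model is taken from Example #4, and GameState is assumed to expose is_terminal, do_move and get_reward as in the earlier examples, with do_move returning the successor state.

class UniformModel:
    # Baseline model mirroring Example #7: uniform policy, constant value
    def predict(self, s):
        actions = s.get_actions()
        return {a: 1 / len(actions) for a in actions}, 0.5


def play_game(state: GameState, model):
    # Alternate moves chosen by ask_model until a terminal state is reached
    while not state.is_terminal():
        a = ask_model(state, model)
        state = state.do_move(a)
    return state.get_reward()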