Example 1
    def search_recursive(self,
                         state: State,
                         agent: RLAgent,
                         root: bool = False,
                         logic: Logic = Logic()):
        """
        This function performs one iteration of MCTS. It is recursively called
        until a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.
        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
        updated.

        Notes
        -----
        Since the board value is computed after game termination during the next
        recursive step, which includes a player-view shift, the returned value is
        always from the perspective of the opponent.

        Returns
        -------
        float,
            the board value from the opponent's perspective (see Notes).
        """
        if state.active_team == Team.red:
            # the network is trained only from the perspective of team blue
            state.flip_teams()
        # get string representation of state
        s = str(state)

        if s not in self.Es:
            self.Es[s] = logic.get_status(state)
        if self.Es[s] != Status.ongoing:
            # terminal node
            return -self.Es[s].value

        if s not in self.Ps:
            # leaf node
            return -self._fill_leaf_node(state, s, agent)

        valids = self.Vs[s]
        policy = self.Ps[s]
        if root:
            policy = self._make_policy_noisy(policy, valids)

        a = self._select_action(s, policy, valids)
        move = self.action_map.action_to_move(a, state, Team.blue)

        self.logic.execute_move(state, move)

        value = self.search_recursive(state, agent, root=False)

        self._update_qsa(s, a, value)
        self.Ns[s] += 1

        return -value
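
Both search variants pick the child action via _select_action, which is not included in
these snippets. The sketch below shows what such a helper could look like under the
standard PUCT rule from the AlphaZero paper, reusing the Qsa, Nsa, Ns and cpuct members
defined in the MCTS constructor shown in Example 8; it assumes `import math` and
`import numpy as np` at module level and is an illustration, not the project's actual code.

    def _select_action(self, s: str, policy: np.ndarray, valids: np.ndarray) -> int:
        # PUCT: U(s, a) = Q(s, a) + cpuct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))
        best_action, best_ucb = -1, -float("inf")
        for a in map(int, np.flatnonzero(valids)):
            if (s, a) in self.Qsa:
                ucb = self.Qsa[(s, a)] + self.cpuct * policy[a] * math.sqrt(
                    self.Ns[s]) / (1 + self.Nsa[(s, a)])
            else:
                # unvisited edge: fall back to the prior (Q defaults to 0)
                ucb = self.cpuct * policy[a] * math.sqrt(self.Ns[s] + 1e-8)
            if ucb > best_ucb:
                best_ucb, best_action = ucb, a
        return best_action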
Example 2
    def __init__(
        self,
        student: AZAgent,
        action_map: ActionMap,
        logic: Logic = Logic(),
        num_iterations: int = 100,
        num_selfplay_episodes: int = 100,
        acceptance_rate: float = 0.55,
        mcts_simulations: int = 100,
        temperature: int = 100,
        model_folder: str = "./checkpoints/models",
        train_data_folder: str = "./checkpoints/data",
        seed: Optional[Union[int, np.random.Generator]] = None,
        **kwargs,
    ):
        super().__init__(
            student,
            action_map,
            logic,
            model_folder,
            train_data_folder,
            **kwargs,
        )
        self.n_iters = num_iterations
        self.n_episodes = num_selfplay_episodes
        self.n_mcts_sim = mcts_simulations

        self.acceptance_rate = acceptance_rate

        self.model_folder = model_folder
        self.train_data_folder = train_data_folder
        self.temp_thresh = temperature

        self.skip_first_self_play = False
        self.rng = np.random.default_rng(seed)
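
The `temperature` argument is stored as `temp_thresh`, which suggests a turn-count
threshold rather than a literal softmax temperature. Under the usual AlphaZero
convention this would switch self-play from exploratory sampling to greedy move
selection after a fixed number of turns; the coach's self-play loop is not part of
these snippets, so the helper below is only an illustrative sketch.

def temperature_for_turn(turn_counter: int, temp_thresh: int) -> float:
    # Hypothetical helper: sample proportionally to visit counts (temperature 1)
    # for the first `temp_thresh` turns, then play (near-)greedily (temperature 0).
    return 1.0 if turn_counter < temp_thresh else 0.0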
Example 3
    def __init__(
        self,
        student: RLAgent,
        action_map: ActionMap,
        logic: Logic = Logic(),
        model_folder: str = "./checkpoints/models",
        train_data_folder: str = "./checkpoints/data",
        **kwargs,
    ):

        self.model_folder = model_folder
        self.train_data_folder = train_data_folder
        if not os.path.exists(model_folder):
            os.makedirs(model_folder)
        if not os.path.exists(train_data_folder):
            os.makedirs(train_data_folder)

        assert isinstance(
            student, RLAgent
        ), f"Student agent to coach has to be of type '{RLAgent}'. Given type '{type(student).__name__}'."
        self.student: RLAgent = student
        self.student_mirror: RLAgent = deepcopy(student)  # a copy of the student to fight against
        self.action_map = action_map
        self.game = Game(self.student,
                         self.student_mirror,
                         logic=logic,
                         **kwargs)
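
The deep copy `student_mirror` serves as the frozen opponent when evaluating a freshly
trained student, and the AlphaZero coach in Example 2 carries an `acceptance_rate` for
exactly that comparison. The evaluation loop itself is not shown; a minimal sketch of
the acceptance decision, with hypothetical `wins`/`n_games` results from pitting the
student against `student_mirror`, could look like this:

def accept_new_model(wins: int, n_games: int, acceptance_rate: float) -> bool:
    # Keep the new weights only if they beat the previous checkpoint often enough.
    return n_games > 0 and wins / n_games >= acceptance_rate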
Example 4
 def select_random_action(
         self,
         state: State,
         logic: Logic = Logic(),
 ) -> Action:
     action_mask = self.action_map.actions_mask(state.board, self.team,
                                                logic)
     return self.rng.choice(self.action_map.actions,
                            p=action_mask / action_mask.sum())
Example 5
 def decide_move(self, state, logic: Logic = Logic()):
     """
     Depending on the amount of enemy pieces left, we are entering the start, mid or endgame
     and planning through the minimax algorithm.
     :return: tuple of tuple positions representing the move
     """
     if self.ext_depth is None:
         self.set_max_depth()  # set max_depth each turn
     else:
         self.max_depth = self.ext_depth
     # make sure a flag win will be discounted by a factor that guarantees a preference towards immediate flag kill
     self.winGameReward = max(self.winGameReward,
                              self.max_depth * self.kill_reward)
     return self.minimax(max_depth=self.max_depth)
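
The `max(self.winGameReward, self.max_depth * self.kill_reward)` guard ensures that a
flag capture is worth at least as much as collecting a kill on every ply of the deepest
search line, so minimax never trades the win away for a chain of captures. A small
worked example with hypothetical reward values (the real values are not shown here):

# Hypothetical numbers, purely for illustration.
kill_reward = 1.0
max_depth = 5
win_game_reward = max(4.0, max_depth * kill_reward)  # raised from 4.0 to 5.0

# The most kills any line within the horizon can accumulate is
# max_depth * kill_reward = 5.0, so a flag win valued at 5.0 or more can never
# be outweighed by captures alone, even before any depth discounting.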
Example 6
def test_logic():
    """Smoke test: move generation should run on both minimal example states."""
    state = minimal_state()
    logic = Logic()

    moves_blue = list(logic.possible_moves_iter(state.board, Team.blue))
    moves_red = list(logic.possible_moves_iter(state.board, Team.red))

    state = minimal_state2()
    moves_blue = list(logic.possible_moves_iter(state.board, Team.blue))
    moves_red = list(logic.possible_moves_iter(state.board, Team.red))
Example 7
    def decide_move(self, state: State, logic: Logic = Logic()) -> Move:
        """
        Decide the move to make for the given state of the game.

        Parameters
        ----------
        state: State,
            the state on which the decision is to be made.
        logic: Logic,
            the logic to use in the engine. Can be changed to vary the game mode if desirable.

        Returns
        -------
        Move,
            the chosen move to make on the state.
        """
        raise NotImplementedError
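
Example 9 below is one concrete implementation of this contract. A minimal usage
sketch of how an engine might drive it, assuming `state`, `agent`, and `logic`
instances already exist (the surrounding game loop is not part of these snippets):

move = agent.decide_move(state, logic=logic)
if move is not None:
    logic.execute_move(state, move)  # apply the chosen move to the game state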
Example 8
    def __init__(
            self,
            network: torch.nn.Module,
            action_map: ActionMap,
            cpuct: float = 4.0,
            n_mcts_sims: int = 100,
            logic: Logic = Logic(),
    ):
        self.network = network
        self.action_map = action_map
        self.logic = logic
        self.cpuct = cpuct
        self.n_mcts_sims = max(1, n_mcts_sims)

        self.Qsa: Dict[Tuple[str, int], float] = {}  # stores Q values for (s, a)
        self.Nsa: Dict[Tuple[str, int], int] = {}  # stores #times edge (s, a) was visited
        self.Ns: Dict[str, float] = {}  # stores #times board s was visited
        self.Ps: Dict[str, np.ndarray] = {}  # stores the policy returned by the neural net

        self.Es: Dict[str, Status] = {}  # stores game end status for state s
        self.Vs: Dict[str, np.ndarray] = {}  # stores valid moves for state s
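
Both search variants back values up through `_update_qsa`, which is not included in
these snippets. The sketch below is a plausible reconstruction assuming the standard
incremental mean over the backed-up values; the project's actual implementation may
differ.

    def _update_qsa(self, s: str, a: int, value: float):
        # Running average of all values backed up through the edge (s, a),
        # together with its visit counter.
        if (s, a) in self.Qsa:
            n = self.Nsa[(s, a)]
            self.Qsa[(s, a)] = (n * self.Qsa[(s, a)] + value) / (n + 1)
            self.Nsa[(s, a)] = n + 1
        else:
            self.Qsa[(s, a)] = value
            self.Nsa[(s, a)] = 1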
Example 9
 def decide_move(self, state: State, logic: Logic = Logic()):
     all_moves = list(logic.possible_moves_iter(state.board, self.team))
     if not all_moves:
         return None
     else:
         return self.rng.choice(all_moves)
Example 10
    def search(self,
               state: State,
               agent: RLAgent,
               perspective: Team,
               logic: Logic = Logic()):
        """
        This function performs one iteration of MCTS. It iterates, until a leaf node is found.
        The action chosen at each node is one that has the maximum upper confidence bound as
        in the paper.
        Once a leaf node is found, the neural network is called to return an initial
        policy P and a value value for the state. This value is propagated up the search path.
        In case the leaf node is a terminal state, the outcome is propagated up the search path.
        The values of Ns, Nsa, Qsa are updated.

        Returns
        -------
        float,
            the board value for the agent.
        """
        turn_counter_pre = state.turn_counter

        # (state, action) -> value sign
        sa_to_sign = dict()

        # this simply initializes the variable.
        # If one finds this value later in the tree, then there is a bug in the logic.
        value = float("inf")
        # the first iteration is always the root
        root = True

        while True:
            if state.active_team == Team.red:
                # the network is trained only from the perspective of team blue
                state.flip_teams()
            # get string representation of state
            s = str(state)

            if (state.active_team == perspective) == state.flipped_teams:
                # adjust for the correct perspective:
                # The value needs to be always seen from the perspective of the 'agent'.

                # The condition is logically equivalent to:
                #       (selected team == active player AND teams flipped)
                #    OR (selected team != active player AND teams not flipped)
                # -> Opponent perspective.
                # and in these cases we then need to multiply with -1
                # (assuming symmetric rewards).
                value_sign = -1
            else:
                value_sign = 1

            if s not in self.Es:
                self.Es[s] = logic.get_status(state)

            if self.Es[s] != Status.ongoing:
                # terminal node
                value = self.Es[s].value
                break
            elif s not in self.Ps:
                # leaf node
                value = self._fill_leaf_node(state, s, agent)
                break
            else:
                # has not reached a leaf or terminal node yet, so keep searching
                # by playing according to the current policy
                valids = self.Vs[s]
                policy = self.Ps[s]
                if root:
                    policy = self._make_policy_noisy(policy, valids)
                    # the root is only the first iteration. This information was used
                    # only to add noise to the policy. So now we can deactivate this.
                    root = False

                a = self._select_action(s, policy, valids)
                sa_to_sign[(s, a)] = value_sign

                move = self.action_map.action_to_move(a, state, Team.blue)
                self.logic.execute_move(state, move)

        for (s, a), per in sa_to_sign.items():
            # for every (state, action) pair: update its Q-value and visitation counter.
            self._update_qsa(s, a, value * per)
            # increment the visitation counter of this state
            self.Ns[s] += 1

        # adjust for team perspective and return the value
        return value * value_sign
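
After `n_mcts_sims` calls to `search` from the same root, the visit counts in `Nsa` are
typically normalized into a move distribution, with `temp_thresh` deciding between
sampling and greedy play as sketched after Example 2. The method below is a hypothetical
helper along those lines (it assumes `from copy import deepcopy`, a root state already
given from team blue's perspective, and that `action_map.actions` has a length); it is
not part of the shown code base.

    def action_probabilities(self,
                             state: State,
                             agent: RLAgent,
                             temperature: float = 1.0) -> np.ndarray:
        # Run the simulations on copies so the root state is not mutated.
        for _ in range(self.n_mcts_sims):
            self.search(deepcopy(state), agent, perspective=state.active_team)
        s = str(state)
        counts = np.array(
            [self.Nsa.get((s, a), 0) for a in range(len(self.action_map.actions))],
            dtype=float)
        if temperature == 0.0:
            # Greedy: all probability mass on (one of) the most visited action(s).
            probs = np.zeros_like(counts)
            probs[int(np.argmax(counts))] = 1.0
            return probs
        counts = counts**(1.0 / temperature)
        # Assumes at least one root edge has been visited by the simulations.
        return counts / counts.sum()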