    def get_action(self, step_env=True, need_probs=False):
        """ !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTIONSEQUENCE !! """

        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
        range_idx = self._internal_env_wrapper.env.get_range_idx(
            p_id=p_id_acting)

        if self._mode == self.EVAL_MODE_AVG:
            if need_probs:  # only do if really necessary
                a_probs_all_hands = self.get_a_probs_for_each_hand()
                a_probs = a_probs_all_hands[range_idx]
            else:
                a_probs_all_hands = None  # not needed
                a_probs = self.policies[p_id_acting].get_a_probs(
                    pub_obses=[
                        self._internal_env_wrapper.get_current_obs(
                            use_canonical=self.use_canonical, p_id=p_id_acting)
                    ],
                    range_idxs=np.array([range_idx], dtype=np.int32),
                    legal_actions_lists=[
                        self._internal_env_wrapper.env.get_legal_actions()
                    ])[0]

            action = np.random.choice(np.arange(self.env_bldr.N_ACTIONS),
                                      p=a_probs)

            if step_env:
                self._internal_env_wrapper.step(action=action)

            return action, a_probs_all_hands

        else:
            raise UnknownModeError(self._mode)
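The docstring above requires that the agent's internal env wrapper already mirrors every action taken in the real game before get_action is queried. A minimal driver sketch of that contract is shown below; the names my_eval_agent, notify_of_action and its argument names are assumptions made for illustration, not taken from this snippet.

def play_one_action(my_eval_agent, past_actions):
    # Replay the observed action sequence into the agent first, so its
    # internal env wrapper is in sync with the real table (assumed API).
    for p_id, a in past_actions:
        my_eval_agent.notify_of_action(p_id_acted=p_id, action_he_did=a)

    # Only now is it safe to ask for a decision.
    action, _ = my_eval_agent.get_action(step_env=True, need_probs=False)
    return action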
Example No. 2
    def get_action(self, step_env=True, need_probs=False):
        """ !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTIONSEQUENCE !! """

        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
        range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)

        # """"""""""""""""""""""""""""
        # Deep CFR
        # """"""""""""""""""""""""""""
        if self._mode == self.EVAL_MODE_AVRG_NET:
            if need_probs:  # only do if necessary
                a_probs_all_hands = self.get_a_probs_for_each_hand()
                a_probs = a_probs_all_hands[range_idx]
            else:
                a_probs_all_hands = None  # not needed

                a_probs = self.avrg_net_policies[p_id_acting].get_a_probs(
                    pub_obses=[self._internal_env_wrapper.get_current_obs()],
                    range_idxs=np.array([range_idx], dtype=np.int32),
                    legal_actions_lists=[self._internal_env_wrapper.env.get_legal_actions()]
                )[0]

            action = np.random.choice(np.arange(self.env_bldr.N_ACTIONS), p=a_probs)

            if step_env:
                self._internal_env_wrapper.step(action=action)

            return action, a_probs_all_hands

        # """"""""""""""""""""""""""""
        # SD-CFR
        # """"""""""""""""""""""""""""
        elif self._mode == self.EVAL_MODE_SINGLE:
            if need_probs:
                a_probs_all_hands = self.get_a_probs_for_each_hand()
            else:
                a_probs_all_hands = None  # not needed

            legal_actions_list = self._internal_env_wrapper.env.get_legal_actions()

            if self._episode_net_idxs[p_id_acting] is None:  # Iteration 0
                action = legal_actions_list[np.random.randint(len(legal_actions_list))]
            else:  # Iteration > 0
                action = self._strategy_buffers[p_id_acting].get(self._episode_net_idxs[p_id_acting]).get_action(
                    pub_obses=[self._internal_env_wrapper.get_current_obs()],
                    range_idxs=[range_idx],
                    legal_actions_lists=[legal_actions_list],
                )[0].item()

            if step_env:
                # add to history before modifying env state
                self._add_history_entry(p_id_acting=p_id_acting, action_hes_gonna_do=action)

                # make INTERNAL step to keep up with the game state.
                self._internal_env_wrapper.step(action=action)

            return action, a_probs_all_hands
        else:
            raise UnknownModeError(self._mode)
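In the SD-CFR branch above, _episode_net_idxs[p_id_acting] pins one strategy snapshot from the player's strategy buffer for the whole episode; it is None on iteration 0, which is why a uniform-random legal action is taken there. The snippet below is a self-contained sketch of how such an index could be drawn in proportion to stored iteration weights. It illustrates the idea only and is not the repository's actual sampling routine.

import numpy as np

def sample_episode_net_idx(iteration_weights, rng=None):
    """Pick one stored strategy for the episode, weighted by its iteration.

    Returns None while the buffer is still empty (iteration 0), mirroring the
    `_episode_net_idxs[p_id_acting] is None` case handled above.
    """
    rng = np.random.default_rng() if rng is None else rng
    w = np.asarray(iteration_weights, dtype=np.float64)
    if w.size == 0:
        return None
    return int(rng.choice(w.size, p=w / w.sum()))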
Example No. 3
    def get_a_probs_for_each_hand(self):
        """ BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTIONSEQUENCE!!!!! """
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id

        if self._mode == self.EVAL_MODE_AVG:
            return self.policies[p_id_acting].get_a_probs_for_each_hand(
                pub_obs=self._internal_env_wrapper.get_current_obs(),
                legal_actions_list=self._internal_env_wrapper.env.get_legal_actions())

        else:
            raise UnknownModeError(self._mode)
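For callers that only need the acting player's own hand, the matrix returned above can simply be indexed by that hand's range index, as get_action does in the first example. Sketch with assumed names (eval_agent and range_idx stand in for whatever the caller already has):

probs_all_hands = eval_agent.get_a_probs_for_each_hand()  # dim [RANGE_SIZE, N_ACTIONS]
probs_one_hand = probs_all_hands[range_idx]               # dim [N_ACTIONS]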
Example No. 4
    def get_a_probs(self):
        pub_obs = self._internal_env_wrapper.get_current_obs()
        legal_actions_list = self._internal_env_wrapper.env.get_legal_actions()
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
        range_idx = self._internal_env_wrapper.env.get_range_idx(
            p_id=p_id_acting)

        # """"""""""""""""""""""""""""
        # Deep CFR
        # """"""""""""""""""""""""""""
        if self._mode == self.EVAL_MODE_AVRG_NET:
            return self.avrg_net_policies[p_id_acting].get_a_probs(
                pub_obses=[pub_obs],
                range_idxs=np.array([range_idx], dtype=np.int32),
                legal_actions_lists=[legal_actions_list])[0]

        # """"""""""""""""""""""""""""
        # SD-CFR
        # """"""""""""""""""""""""""""
        elif self._mode == self.EVAL_MODE_SINGLE:

            if self._strategy_buffers[p_id_acting].size == 0:
                unif_rand_legal = np.full(
                    shape=self.env_bldr.N_ACTIONS,
                    fill_value=1.0 / len(legal_actions_list)
                ) * rl_util.get_legal_action_mask_np(
                    n_actions=self.env_bldr.N_ACTIONS,
                    legal_actions_list=legal_actions_list,
                    dtype=np.float32)
                return unif_rand_legal
            else:
                # """""""""""""""""""""
                # Weighted by Iteration
                # """"""""""""""""""""""
                # Dim: [model_idx, action_p]
                a_probs_each_model = np.array([
                    weight * strat.get_a_probs(
                        pub_obses=[pub_obs],
                        range_idxs=[range_idx],
                        legal_actions_lists=[legal_actions_list])[0]
                    for strat, weight
                    in self._strategy_buffers[p_id_acting].get_strats_and_weights()
                ])

                # """"""""""""""""""""""
                # Weighted by Reach
                # """"""""""""""""""""""
                # a_probs_each_model has dim [model_idx, action_p]; the reach of
                # each model for this hand has dim [model_idx] and is expanded
                # to [model_idx, 1] so it broadcasts over the action axis.

                a_probs_each_model *= np.expand_dims(
                    self._get_reach_for_each_model(
                        p_id_acting=p_id_acting,
                        range_idx=range_idx,
                    ),
                    axis=1)

                # """"""""""""""""""""""
                # Normalize
                # """"""""""""""""""""""
                # Dim: [action_p]
                a_probs = np.sum(a_probs_each_model, axis=0)

                # Dim: []
                a_probs_sum = np.sum(a_probs)

                # Dim: [action_p]
                return a_probs / a_probs_sum

        else:
            raise UnknownModeError(self._mode)
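The SD-CFR branch above forms a weighted average over all stored strategies: each model's action distribution is scaled by its (linear) iteration weight and by its reach probability for the acting hand, the scaled distributions are summed over models, and the result is renormalised. A small self-contained numpy illustration with made-up numbers:

import numpy as np

iter_weights = np.array([1.0, 2.0])      # iteration weights of two stored models
reaches = np.array([0.5, 0.25])          # each model's reach for this hand
per_model = np.array([[0.2, 0.3, 0.5],   # model 0's distribution over 3 actions
                      [0.6, 0.2, 0.2]])  # model 1's distribution over 3 actions

weighted = per_model * iter_weights[:, None] * reaches[:, None]
a_probs = weighted.sum(axis=0)           # sum over models
a_probs /= a_probs.sum()                 # normalise, as in the code above
print(a_probs)                           # mixture strategy over the 3 actions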
Example No. 5
    def get_a_probs_for_each_hand(self):
        """ BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTIONSEQUENCE!!!!! """
        pub_obs = self._internal_env_wrapper.get_current_obs()
        legal_actions_list = self._internal_env_wrapper.env.get_legal_actions()
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id

        # """"""""""""""""""""""""""""
        # Deep CFR
        # """"""""""""""""""""""""""""
        if self._mode == self.EVAL_MODE_AVRG_NET:
            return self.avrg_net_policies[p_id_acting].get_a_probs_for_each_hand(
                pub_obs=pub_obs, legal_actions_list=legal_actions_list)

        # """"""""""""""""""""""""""""
        # SD-CFR
        # """"""""""""""""""""""""""""
        elif self._mode == self.EVAL_MODE_SINGLE:

            unif_rand_legal = np.full(
                shape=self.env_bldr.N_ACTIONS,
                fill_value=1.0 / len(legal_actions_list)
            ) * rl_util.get_legal_action_mask_np(
                n_actions=self.env_bldr.N_ACTIONS,
                legal_actions_list=legal_actions_list,
                dtype=np.float32)

            n_models = self._strategy_buffers[p_id_acting].size
            if n_models == 0:
                return np.repeat(np.expand_dims(unif_rand_legal, axis=0),
                                 repeats=self.env_bldr.rules.RANGE_SIZE,
                                 axis=0)
            else:
                # Dim: [model_idx, range_idx]
                reaches = self._get_reach_for_each_model_each_hand(
                    p_id_acting=p_id_acting)

                # """"""""""""""""""""""
                # Compute strategy for
                # all infosets with
                # reach >0. Initialize
                # All others stay unif.
                # """"""""""""""""""""""
                contrib_each_model = np.zeros(
                    shape=(n_models, self.env_bldr.rules.RANGE_SIZE,
                           self.env_bldr.N_ACTIONS),
                    dtype=NP_FLOAT_TYPE)

                for m_i, (strat, weight) in enumerate(
                        self._strategy_buffers[p_id_acting].get_strats_and_weights()):
                    range_idxs = np.nonzero(reaches[m_i])[0]
                    if range_idxs.shape[0] > 0:
                        a_probs_m = strat.get_a_probs_for_each_hand_in_list(
                            pub_obs=pub_obs,
                            range_idxs=range_idxs,
                            legal_actions_list=legal_actions_list)
                        contrib_each_model[m_i, range_idxs] = a_probs_m * weight

                # Dim: [range_idx, action_p]
                a_probs = (np.sum(contrib_each_model *
                                  np.expand_dims(reaches, axis=2),
                                  axis=0)).astype(NP_FLOAT_TYPE)

                # Dim: [range_idx]
                a_probs_sum = np.expand_dims(np.sum(a_probs, axis=1), axis=1)

                # Dim: [range_idx, action_p]
                with np.errstate(divide='ignore', invalid='ignore'):
                    return np.where(
                        a_probs_sum == 0,
                        np.repeat(
                            np.expand_dims(unif_rand_legal, axis=0),
                            repeats=self.env_bldr.rules.RANGE_SIZE,
                            axis=0), a_probs / a_probs_sum)

        else:
            raise UnknownModeError(self._mode)
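The per-hand variant above performs the same reach- and iteration-weighted mixture for every hand at once and falls back to the uniform distribution over legal actions wherever no stored model reaches a hand (its row sums to zero). A minimal stand-alone illustration of that fallback, with made-up shapes and values:

import numpy as np

unif = np.array([0.5, 0.0, 0.5])          # uniform over the two legal actions
a_probs = np.array([[0.2, 0.0, 0.6],      # hand 0: reached by some model
                    [0.0, 0.0, 0.0]])     # hand 1: reach 0 under every model
a_probs_sum = a_probs.sum(axis=1, keepdims=True)

with np.errstate(divide='ignore', invalid='ignore'):
    result = np.where(a_probs_sum == 0, unif[None, :], a_probs / a_probs_sum)
print(result)  # hand 0 is normalised, hand 1 gets the uniform fallback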