Example #1
    def get_eval_ddqn_state_dicts(self):
        """Exports an inference-ready DDQN state dict for every seat."""
        ddqns = []
        for p in range(self._eval_env_bldr.N_SEATS):
            ddqn = DDQN(owner=p,
                        ddqn_args=self._args.ddqn_args,
                        env_bldr=self._eval_env_bldr)

            # Copy the trained Q-network weights and sync the target network.
            ddqn.load_net_state_dict(self._nets[p].state_dict())
            ddqn.update_target_net()

            # Drop exploration and replay-buffer state; only inference is needed downstream.
            ddqn.eps = None
            ddqn.buf = None

            ddqns.append(ddqn.state_dict())
        return ddqns
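
The exported state dicts are meant to be turned back into inference-only networks on the evaluation side, as Examples #2 and #4 do. A minimal usage sketch, assuming ps is a handle to the object exposing the method above and eval_env_bldr is the matching evaluation env builder (both names are assumptions, not from the excerpt):

    # Usage sketch: rebuild inference-only DDQNs from the exported state dicts.
    state_dicts = ps.get_eval_ddqn_state_dicts()
    eval_ddqns = [
        DDQN.inference_version_from_state_dict(state_dict=state_dicts[p],
                                               env_bldr=eval_env_bldr)
        for p in range(eval_env_bldr.N_SEATS)
    ]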
Example #2
    def compute_rlbr(self, n_hands_each_seat, ddqn_state_dicts, stack_size):
        """Plays n_hands_each_seat hands from each seat with the RL-BR agent against the
        frozen opponent and returns the RL-BR agent's scaled per-hand rewards."""
        agent_losses = np.empty(shape=n_hands_each_seat * 2, dtype=np.float32)
        rlbr_dqn_each_seat = [
            DDQN.inference_version_from_state_dict(
                state_dict=ddqn_state_dicts[p], env_bldr=self._eval_env_bldr)
            for p in range(self._t_prof.n_seats)
        ]
        rlbr_env_wrapper = self._eval_env_bldr.get_new_wrapper(
            is_evaluating=True, stack_size=stack_size)

        for rlbr_seat_id in range(rlbr_env_wrapper.env.N_SEATS):
            rlbr_agent = rlbr_dqn_each_seat[rlbr_seat_id]
            for iteration_id in range(n_hands_each_seat):

                # """""""""""""""""
                # Reset
                # """""""""""""""""
                obs, r_for_all, done, info = _util.reset_episode_multi_action_space(
                    rlbr_env_wrapper=rlbr_env_wrapper,
                    opponent_agent=self._opponent)
                range_idx_rlbr = rlbr_env_wrapper.env.get_range_idx(
                    p_id=rlbr_seat_id)

                # """""""""""""""""
                # Play Episode
                # """""""""""""""""
                while not done:
                    p_id_acting = rlbr_env_wrapper.env.current_player.seat_id

                    # RL-BR acting
                    if p_id_acting == rlbr_seat_id:
                        action_int = rlbr_agent.select_br_a(
                            pub_obses=[obs],
                            range_idxs=np.array([range_idx_rlbr],
                                                dtype=np.int32),
                            legal_actions_lists=[
                                rlbr_env_wrapper.env.get_legal_actions()
                            ],
                            explore=False,
                        )[0]
                        self._opponent.notify_of_action(
                            p_id_acted=rlbr_seat_id, action_he_did=action_int)

                    # EvalAgent (opponent) acting
                    else:
                        action_int, _ = self._opponent.get_action(
                            step_env=True, need_probs=False)

                    # Step
                    obs, r_for_all, done, info = rlbr_env_wrapper.step(
                        action=action_int)

                # Record the RL-BR seat's reward for this hand, scaled by the env's
                # reward normalization constants.
                agent_losses[iteration_id + rlbr_seat_id * n_hands_each_seat] = \
                    r_for_all[rlbr_seat_id] \
                    * rlbr_env_wrapper.env.REWARD_SCALAR \
                    * rlbr_env_wrapper.env.EV_NORMALIZER

        return agent_losses.tolist()
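
Example #4 turns these per-hand scores into a mean and a 95% confidence interval via _get_95confidence, whose body is not shown in these excerpts. A rough, self-contained sketch of what such a summary typically computes under a normal approximation (the function name and the 1.96 z-value are assumptions):

    import numpy as np

    def summarize_scores_95(scores):
        # Mean score and approximate 95% confidence half-width (normal approximation).
        scores = np.asarray(scores, dtype=np.float64)
        mean = float(scores.mean())
        half_width = float(1.96 * scores.std(ddof=1) / np.sqrt(scores.size))
        return mean, half_width

    # e.g. with the list returned by compute_rlbr:
    # mean, d = summarize_scores_95(scores=rlbr_scores)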
Example #3
    def reset(self, p_training, eval_opponent_state_dict, stack_size):
        """Prepares a fresh RL-BR training setup for seat p_training against a frozen opponent."""
        self._rlbr_seat_id = p_training
        self._agent_seat_id = 1 - p_training

        # Rebuild the frozen opponent (the agent being evaluated) from its state dict.
        self._opponent = self._eval_agent_cls(t_prof=self._t_prof)
        self._opponent.load_state_dict(eval_opponent_state_dict)

        self._rlbr_env_wrapper = self._eval_env_bldr.get_new_wrapper(
            is_evaluating=True, stack_size=stack_size)

        # Fresh best-response learner, replay buffer, and memory saver for the training seat.
        self._ddqns[p_training] = DDQN(owner=p_training,
                                       ddqn_args=self._args.ddqn_args,
                                       env_bldr=self._eval_env_bldr)
        self._buf = self.CircularBufferCls(
            env_bldr=self._env_bldr,
            max_size=self._args.ddqn_args.cir_buf_size)
        self._br_memory_saver = self.BRMemorySaverCls(
            env_bldr=self._eval_env_bldr, buffer=self._buf)
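
A hedged sketch of how reset might be driven from the surrounding retraining loop; the names rlbr_la, opponent_state_dict, and n_seats are illustrative and not taken from the excerpt:

    # Illustrative only: prepare an RL-BR learner for each seat against the frozen opponent.
    for p_training in range(n_seats):
        rlbr_la.reset(p_training=p_training,
                      eval_opponent_state_dict=opponent_state_dict,
                      stack_size=stack_size)
        # ...play hands, fill rlbr_la._buf via rlbr_la._br_memory_saver,
        #    and train rlbr_la._ddqns[p_training] on sampled batches...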
Example #4
    def evaluate(self, global_iter_nr):
        """Retrains RL-BR against the current agent and logs its mean winnings with a
        95% confidence interval for every eval mode and stack size."""
        for mode in self._t_prof.eval_modes_of_algo:
            for stack_size_idx, stack_size in enumerate(
                    self._t_prof.eval_stack_sizes):
                self._eval_agent.set_mode(mode=mode)
                self._eval_agent.set_stack_size(stack_size=stack_size)
                if self._eval_agent.can_compute_mode():
                    self._retrain(mode=mode,
                                  stack_size=stack_size,
                                  stack_size_idx=stack_size_idx,
                                  global_iter_nr=global_iter_nr)
                    # """""""""""""""""""
                    # Compute RL-BR
                    # """""""""""""""""""
                    print("Running rollout matches between RL-BR and agent.")
                    ddqn_states = self._ray.get(
                        self._ray.remote(
                            self._ps_handle.get_eval_ddqn_state_dicts))
                    ddqns = [
                        DDQN.inference_version_from_state_dict(
                            state_dict=ddqn_states[p],
                            env_bldr=self._eval_env_bldr)
                        for p in range(self._t_prof.n_seats)
                    ]
                    scores = self._compute_rlbr(
                        n_hands_each_seat=self._args.n_hands_each_seat,
                        rlbr_dqn_each_seat=ddqns,
                        rlbr_env_wrapper=self._eval_env_bldr.get_new_wrapper(
                            is_evaluating=True, stack_size=stack_size),
                        opponent=self._eval_agent)

                    mean, d = self._get_95confidence(scores=scores)

                    self._log_results(iter_nr=global_iter_nr,
                                      agent_mode=mode,
                                      stack_size_idx=stack_size_idx,
                                      score=mean,
                                      lower_conf95=mean - d,
                                      upper_conf95=mean + d)
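
The evaluator above is typically invoked periodically from the training driver. A minimal, illustrative call pattern; the driver loop, rlbr_master, n_iterations, and eval_every are assumptions:

    # Hypothetical driver loop: run the RL-BR evaluation every eval_every iterations.
    for global_iter_nr in range(n_iterations):
        run_training_iteration()  # assumed NFSP training step, not shown in these examples
        if global_iter_nr % eval_every == 0:
            rlbr_master.evaluate(global_iter_nr=global_iter_nr)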
Example #5
    def __init__(self, t_prof, worker_id, chief_handle):
        """Builds per-seat replay and reservoir buffers, memory savers, DDQN best-response
        learners, and average-policy learners, plus the parallel envs they act in."""
        super().__init__(t_prof=t_prof)

        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
        self._id = worker_id
        self._chief_handle = chief_handle

        self._ddqn_args = t_prof.module_args["ddqn"]
        self._avg_args = t_prof.module_args["avg"]

        if t_prof.nn_type == "recurrent":
            from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN
            from PokerRL.rl.buffers.BRMemorySaverRNN import BRMemorySaverRNN
            from NFSP.workers.la.action_buffer.ActionBufferRNN import ActionBufferRNN, AvgMemorySaverRNN

            BR_BUF_CLS = CircularBufferRNN
            BR_MEM_SAVER = BRMemorySaverRNN
            AVG_BUF_CLS = ActionBufferRNN
            AVG_MEM_SAVER = AvgMemorySaverRNN

        elif t_prof.nn_type == "feedforward":
            from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT
            from PokerRL.rl.buffers.BRMemorySaverFLAT import BRMemorySaverFLAT
            from NFSP.workers.la.action_buffer.ActionBufferFLAT import ActionBufferFLAT, AvgMemorySaverFLAT

            BR_BUF_CLS = CircularBufferFLAT
            BR_MEM_SAVER = BRMemorySaverFLAT
            AVG_BUF_CLS = ActionBufferFLAT
            AVG_MEM_SAVER = AvgMemorySaverFLAT
        else:
            raise ValueError(t_prof.nn_type)

        self._avg_bufs = [
            AVG_BUF_CLS(env_bldr=self._env_bldr,
                        max_size=self._avg_args.res_buf_size,
                        min_prob=self._avg_args.min_prob_res_buf)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._br_bufs = [
            BR_BUF_CLS(env_bldr=self._env_bldr,
                       max_size=self._ddqn_args.cir_buf_size)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._action_and_hand_buffer = ActionAndHandBufferFLAT(
            env_bldr=self._env_bldr,
            max_size=self._t_prof.action_and_hand_buffer_size)
        self._avg_memory_savers = [[
            AVG_MEM_SAVER(env_bldr=self._env_bldr, buffer=self._avg_bufs[p])
            for _ in range(self._t_prof.n_envs)
        ] for p in range(self._env_bldr.N_SEATS)]
        self._br_memory_savers = [[
            BR_MEM_SAVER(env_bldr=self._env_bldr, buffer=self._br_bufs[p])
            for _ in range(self._t_prof.n_envs)
        ] for p in range(self._env_bldr.N_SEATS)]
        # Per-seat best-response (DDQN) and average-policy learners, as in NFSP.
        self._br_learner = [
            DDQN(owner=p, ddqn_args=self._ddqn_args, env_bldr=self._env_bldr)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._avg_learner = [
            AvgWrapper(owner=p,
                       env_bldr=self._env_bldr,
                       avg_training_args=self._avg_args)
            for p in range(self._env_bldr.N_SEATS)
        ]

        self._seat_actors = [
            SeatActor(t_prof=t_prof,
                      env_bldr=self._env_bldr,
                      seat_id=p,
                      br_memory_savers=self._br_memory_savers[p],
                      avg_buf_savers=self._avg_memory_savers[p],
                      br_learner=self._br_learner[p],
                      avg_learner=self._avg_learner[p])
            for p in range(self._env_bldr.N_SEATS)
        ]

        self._parallel_env = ParallelEnvs(t_prof=t_prof,
                                          env_bldr=self._env_bldr,
                                          n_envs=self._t_prof.n_envs)

        self._last_step_wrappers = self._parallel_env.reset()
        # Hand every SeatActor the flattened list of initial step wrappers from all parallel envs.
        for p in range(self._env_bldr.N_SEATS):
            self._seat_actors[p].init([
                sw for plyr_sws in self._last_step_wrappers for sw in plyr_sws
            ])
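
For context, a worker like this is constructed once per process in the distributed setup. A minimal instantiation sketch; the class name LearnerActor and the variables t_prof and chief are assumptions based on the constructor signature above:

    # Hypothetical instantiation mirroring the __init__ signature shown above.
    la = LearnerActor(t_prof=t_prof, worker_id=0, chief_handle=chief)

    # After construction the worker holds, per seat:
    #   - a circular replay buffer + BR memory saver feeding a DDQN best-response learner,
    #   - a reservoir-style buffer + avg memory saver feeding an average-policy learner,
    # plus n_envs parallel environments whose initial step wrappers were handed to the SeatActors.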