def __init__(self, t_prof, chief_handle, eval_agent_cls): super().__init__(t_prof=t_prof, eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof), chief_handle=chief_handle, eval_type="Offline_Winnings", log_conf_interval=True) self._args = t_prof.module_args["offline"] self._env_bldr = rl_util.get_env_builder(t_prof=t_prof) self._eval_agents = [ eval_agent_cls(t_prof=t_prof) for _ in range(self._env_bldr.N_SEATS) ] self._REFERENCE_AGENT = 0
def __init__(self, t_prof):
    super().__init__(t_prof=t_prof)
    self._ps_handles = None
    self._la_handles = None
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)

    self._SINGLE = EvalAgentDeepCFR.EVAL_MODE_SINGLE in self._t_prof.eval_modes_of_algo
    self._AVRG = EvalAgentDeepCFR.EVAL_MODE_AVRG_NET in self._t_prof.eval_modes_of_algo

    # """"""""""""""""""""""""""""
    # SD-CFR
    # """"""""""""""""""""""""""""
    if self._SINGLE:
        self._strategy_buffers = [
            StrategyBuffer(t_prof=t_prof,
                           owner=p,
                           env_bldr=self._env_bldr,
                           max_size=None,
                           device=self._t_prof.device_inference)
            for p in range(t_prof.n_seats)
        ]

    if self._t_prof.log_verbose:
        self._exp_mem_usage = self.create_experiment(self._t_prof.name + " Chief_Memory_Usage")

def __init__(self, t_prof):
    super().__init__(t_prof=t_prof)
    self._t_prof = t_prof
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    self._ps_handles = None
    self._la_handles = None

def __init__(self, t_prof, chief_handle, eval_agent_cls): super().__init__(t_prof=t_prof) self._args = t_prof.module_args["rlbr"] self._env_bldr = rl_util.get_env_builder(t_prof=t_prof) self._chief_handle = chief_handle self._eval_agent_cls = eval_agent_cls self._eval_env_bldr = _util.get_env_builder_rlbr(t_prof=t_prof) self._ddqns = [None for _ in range(self._eval_env_bldr.N_SEATS)] self._rlbr_seat_id = None self._agent_seat_id = None self._rlbr_env_wrapper = None self._opponent = None self._buf = None self._br_memory_saver = None if t_prof.nn_type == "recurrent": from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN from PokerRL.rl.buffers.BRMemorySaverRNN import BRMemorySaverRNN self.CircularBufferCls = CircularBufferRNN self.BRMemorySaverCls = BRMemorySaverRNN elif t_prof.nn_type == "feedforward": from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT from PokerRL.rl.buffers.BRMemorySaverFLAT import BRMemorySaverFLAT self.CircularBufferCls = CircularBufferFLAT self.BRMemorySaverCls = BRMemorySaverFLAT else: raise ValueError(t_prof.nn_type)
def __init__(self, t_prof, chief_handle):
    super().__init__(t_prof=t_prof)
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    self._chief_handle = chief_handle
    self._device = torch.device(t_prof.device_parameter_server)

def __init__(self, t_prof, chief_handle, eval_agent_cls): super().__init__(t_prof=t_prof, eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof), chief_handle=chief_handle, evaluator_name="Head2Head_Winnings", log_conf_interval=True) self._args = t_prof.module_args["h2h"] self._env_bldr = rl_util.get_env_builder(t_prof=t_prof) assert self._env_bldr.N_SEATS == 2 self._eval_agents = [ eval_agent_cls(t_prof=t_prof) for _ in range(self._env_bldr.N_SEATS) ] self._REFERENCE_AGENT = 0
def __init__(self, t_prof, chief_handle, eval_agent_cls): super().__init__(t_prof=t_prof, eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof), chief_handle=chief_handle, eval_type="BR") self._env_bldr = rl_util.get_env_builder(t_prof=t_prof) assert self._env_bldr.N_SEATS == 2 self._eval_agent = eval_agent_cls(t_prof=t_prof) self._game_trees = [ PublicTree(env_bldr=self._env_bldr, stack_size=stack_size, stop_at_street=None, put_out_new_round_after_limit=True, is_debugging=self._t_prof.DEBUGGING) for stack_size in self._t_prof.eval_stack_sizes ] for gt in self._game_trees: gt.build_tree() print("Tree with stack size", gt.stack_size, "has", gt.n_nodes, "nodes out of which", gt.n_nonterm, "are non-terminal.")
def __init__(self, t_prof, mode=None, device=None):
    """
    Args:
        t_prof (TrainingProfile):
        mode:                   Any mode your algorithm's eval agent can be evaluated in. Specify modes
                                as class variables and pass one of them here. Can be changed later by
                                calling .to_mode(new_mode) on this instance.
        device (torch.device):  The device the eval agent shall live and act on.
    """
    self.t_prof = t_prof
    self.ray = MaybeRay(runs_distributed=t_prof.DISTRIBUTED, runs_cluster=t_prof.CLUSTER)
    self.env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    self._internal_env_wrapper = self.env_bldr.get_new_wrapper(is_evaluating=True, stack_size=None)
    self._mode = mode

    if device is None:
        self.device = self.t_prof.device_inference
    else:
        self.device = device

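A minimal usage sketch of the interface described in the docstring above, assuming the EvalAgentDeepCFR subclass that appears elsewhere in this section and an already-built TrainingProfile t_prof; only the constructor signature and .to_mode(...) come from the code itself:

import torch

agent = EvalAgentDeepCFR(t_prof=t_prof, device=torch.device("cpu"))
agent.to_mode(EvalAgentDeepCFR.EVAL_MODE_SINGLE)  # mode can be switched any time after construction
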
def __init__(self, t_prof, br_agent, mode=None, device=None):
    super().__init__(t_prof=t_prof, mode=mode, device=device)
    self.tree = PublicTree(
        env_bldr=rl_util.get_env_builder(t_prof=t_prof),
        stack_size=t_prof.eval_stack_sizes[0],
        stop_at_street=None,
        put_out_new_round_after_limit=True,
        is_debugging=t_prof.DEBUGGING,
    )
    self.tree.build_tree()

    self.br_agent = br_agent  # agent to play best response against
    self.solve_br()

    self.modes = ["EVAL", "BR", "BAYESIAN"]
    self.mode = mode if mode else "EVAL"  # defaults to EVAL

    if self.mode == "BAYESIAN":
        self._fill_tree_w_prior()

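The constructor above accepts any mode string but only special-cases "BAYESIAN". A hedged validation sketch, not present in the original, that would reject unknown modes right after self.mode is set:

if self.mode not in self.modes:
    raise ValueError("Unknown mode " + repr(self.mode) + "; expected one of " + str(self.modes))
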
def __init__(self, t_prof, worker_id, chief_handle):
    super().__init__(t_prof=t_prof)
    self._adv_args = t_prof.module_args["adv_training"]
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    self._id = worker_id
    self._chief_handle = chief_handle

    self._adv_buffers = [
        AdvReservoirBuffer(owner=p,
                           env_bldr=self._env_bldr,
                           max_size=self._adv_args.max_buffer_size,
                           nn_type=t_prof.nn_type,
                           iter_weighting_exponent=self._t_prof.iter_weighting_exponent)
        for p in range(self._t_prof.n_seats)
    ]
    self._adv_wrappers = [
        AdvWrapper(owner=p,
                   env_bldr=self._env_bldr,
                   adv_training_args=self._adv_args,
                   device=self._adv_args.device_training)
        for p in range(self._t_prof.n_seats)
    ]

    self._AVRG = EvalAgentDeepCFR.EVAL_MODE_AVRG_NET in self._t_prof.eval_modes_of_algo
    self._SINGLE = EvalAgentDeepCFR.EVAL_MODE_SINGLE in self._t_prof.eval_modes_of_algo

    # """"""""""""""""""""""""""""
    # Deep CFR
    # """"""""""""""""""""""""""""
    if self._AVRG:
        self._avrg_args = t_prof.module_args["avrg_training"]
        self._avrg_buffers = [
            AvrgReservoirBuffer(owner=p,
                                env_bldr=self._env_bldr,
                                max_size=self._avrg_args.max_buffer_size,
                                nn_type=t_prof.nn_type,
                                iter_weighting_exponent=self._t_prof.iter_weighting_exponent)
            for p in range(self._t_prof.n_seats)
        ]
        self._avrg_wrappers = [
            AvrgWrapper(owner=p,
                        env_bldr=self._env_bldr,
                        avrg_training_args=self._avrg_args,
                        device=self._avrg_args.device_training)
            for p in range(self._t_prof.n_seats)
        ]

    # The two sampler branches differed only in whether avrg_buffers are
    # passed, so the construction is shared here.
    if self._t_prof.sampler.lower() == "mo":
        self._data_sampler = MultiOutcomeSampler(
            env_bldr=self._env_bldr,
            adv_buffers=self._adv_buffers,
            avrg_buffers=self._avrg_buffers if self._AVRG else None,
            n_actions_traverser_samples=self._t_prof.n_actions_traverser_samples)
    else:
        raise ValueError("Currently we don't support "
                         + self._t_prof.sampler.lower() + " sampling.")

    if self._t_prof.log_verbose:
        self._exp_mem_usage = self._ray.get(
            self._ray.remote(self._chief_handle.create_experiment,
                             self._t_prof.name + "_LA" + str(worker_id) + "_Memory_Usage"))
        self._exps_adv_buffer_size = self._ray.get([
            self._ray.remote(self._chief_handle.create_experiment,
                             self._t_prof.name + "_LA" + str(worker_id) + "_P" + str(p) + "_ADV_BufSize")
            for p in range(self._t_prof.n_seats)
        ])
        if self._AVRG:
            self._exps_avrg_buffer_size = self._ray.get([
                self._ray.remote(self._chief_handle.create_experiment,
                                 self._t_prof.name + "_LA" + str(worker_id) + "_P" + str(p) + "_AVRG_BufSize")
                for p in range(self._t_prof.n_seats)
            ])

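The verbose-logging block above derives experiment names mechanically from the profile name, worker id, and seat index. A worked example of the scheme (the profile name "SD-CFR" and worker id 3 are illustrative values, not from the code):

profile_name, worker_id, p = "SD-CFR", 3, 0
assert profile_name + "_LA" + str(worker_id) + "_Memory_Usage" == "SD-CFR_LA3_Memory_Usage"
assert profile_name + "_LA" + str(worker_id) + "_P" + str(p) + "_ADV_BufSize" == "SD-CFR_LA3_P0_ADV_BufSize"
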
def __init__(self, t_prof, worker_id, chief_handle):
    super().__init__(t_prof=t_prof)
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    self._id = worker_id
    self._chief_handle = chief_handle

    self._ddqn_args = t_prof.module_args["ddqn"]
    self._avg_args = t_prof.module_args["avg"]

    if t_prof.nn_type == "recurrent":
        from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN
        from PokerRL.rl.buffers.BRMemorySaverRNN import BRMemorySaverRNN
        from NFSP.workers.la.action_buffer.ActionBufferRNN import ActionBufferRNN, AvgMemorySaverRNN
        BR_BUF_CLS = CircularBufferRNN
        BR_MEM_SAVER = BRMemorySaverRNN
        AVG_BUF_CLS = ActionBufferRNN
        AVG_MEM_SAVER = AvgMemorySaverRNN
    elif t_prof.nn_type == "feedforward":
        from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT
        from PokerRL.rl.buffers.BRMemorySaverFLAT import BRMemorySaverFLAT
        from NFSP.workers.la.action_buffer.ActionBufferFLAT import ActionBufferFLAT, AvgMemorySaverFLAT
        BR_BUF_CLS = CircularBufferFLAT  # the FLAT circular buffer is the intended choice for feedforward nets
        BR_MEM_SAVER = BRMemorySaverFLAT
        AVG_BUF_CLS = ActionBufferFLAT
        AVG_MEM_SAVER = AvgMemorySaverFLAT
    else:
        raise ValueError("Unsupported nn_type: " + str(t_prof.nn_type))

    self._avg_bufs = [
        AVG_BUF_CLS(env_bldr=self._env_bldr,
                    max_size=self._avg_args.res_buf_size,
                    min_prob=self._avg_args.min_prob_res_buf)
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._br_bufs = [
        BR_BUF_CLS(env_bldr=self._env_bldr, max_size=self._ddqn_args.cir_buf_size)
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._action_and_hand_buffer = ActionAndHandBufferFLAT(
        env_bldr=self._env_bldr,
        max_size=self._t_prof.action_and_hand_buffer_size)

    # One memory saver per (seat, parallel env) pair, all feeding the seat's shared buffer.
    self._avg_memory_savers = [
        [AVG_MEM_SAVER(env_bldr=self._env_bldr, buffer=self._avg_bufs[p])
         for _ in range(self._t_prof.n_envs)]
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._br_memory_savers = [
        [BR_MEM_SAVER(env_bldr=self._env_bldr, buffer=self._br_bufs[p])
         for _ in range(self._t_prof.n_envs)]
        for p in range(self._env_bldr.N_SEATS)
    ]

    self._br_learner = [
        DDQN(owner=p, ddqn_args=self._ddqn_args, env_bldr=self._env_bldr)
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._avg_learner = [
        AvgWrapper(owner=p, env_bldr=self._env_bldr, avg_training_args=self._avg_args)
        for p in range(self._env_bldr.N_SEATS)
    ]

    self._seat_actors = [
        SeatActor(t_prof=t_prof,
                  env_bldr=self._env_bldr,
                  seat_id=p,
                  br_memory_savers=self._br_memory_savers[p],
                  avg_buf_savers=self._avg_memory_savers[p],
                  br_learner=self._br_learner[p],
                  avg_learner=self._avg_learner[p])
                  # action_and_hand_buffer=self._action_and_hand_bufs[p])
        for p in range(self._env_bldr.N_SEATS)
    ]

    self._parallel_env = ParallelEnvs(t_prof=t_prof, env_bldr=self._env_bldr, n_envs=self._t_prof.n_envs)
    self._last_step_wrappers = self._parallel_env.reset()

    for p in range(self._env_bldr.N_SEATS):
        self._seat_actors[p].init([
            sw for plyr_sws in self._last_step_wrappers for sw in plyr_sws
        ])

def __init__(self, t_prof):
    self._t_prof = t_prof
    self._env_bldr = rl_util.get_env_builder(t_prof)
    self._env_wrapper = self._env_bldr.get_new_wrapper(is_evaluating=False)

def __init__(self, t_prof, worker_id, chief_handle):
    super().__init__(t_prof=t_prof)
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    self._id = worker_id
    self._chief_handle = chief_handle

    self._ddqn_args = t_prof.module_args["ddqn"]
    self._avg_args = t_prof.module_args["avg"]

    if t_prof.nn_type == "recurrent":
        from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN
        from NFSP.workers.la.action_buffer.ActionBufferRNN import ActionBufferRNN
        BR_BUF_CLS = CircularBufferRNN
        AVG_BUF_CLS = ActionBufferRNN
    elif t_prof.nn_type == "feedforward":
        from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT
        from NFSP.workers.la.action_buffer.ActionBufferFLAT import ActionBufferFLAT
        BR_BUF_CLS = CircularBufferFLAT
        AVG_BUF_CLS = ActionBufferFLAT
    else:
        raise ValueError("Unsupported nn_type: " + str(t_prof.nn_type))

    self._avg_buf2 = [
        AVG_BUF_CLS(env_bldr=self._env_bldr,
                    max_size=self._avg_args.res_buf_size,
                    min_prob=self._avg_args.min_prob_res_buf)
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._br_buf2 = [
        BR_BUF_CLS(env_bldr=self._env_bldr, max_size=self._ddqn_args.cir_buf_size)
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._br_learner2 = [
        DDQN(owner=p, ddqn_args=self._ddqn_args, env_bldr=self._env_bldr)
        for p in range(self._env_bldr.N_SEATS)
    ]
    self._avg_learner2 = [
        AvgWrapper(owner=p, env_bldr=self._env_bldr, avg_training_args=self._avg_args)
        for p in range(self._env_bldr.N_SEATS)
    ]

    if self._t_prof.sampling == "adam":
        self._sampler = AdamSampler(t_prof=t_prof,
                                    env_bldr=self._env_bldr,
                                    br_buf2=self._br_buf2,
                                    avg_buf2=self._avg_buf2,
                                    br_learner2=self._br_learner2,
                                    avg_learner2=self._avg_learner2,
                                    constant_eps=self._t_prof.constant_eps_expl)
    elif self._t_prof.sampling == "clean":
        self._sampler = CleanSampler(t_prof=t_prof,
                                     env_bldr=self._env_bldr,
                                     br_buf2=self._br_buf2,
                                     avg_buf2=self._avg_buf2,
                                     br_learner2=self._br_learner2,
                                     avg_learner2=self._avg_learner2,
                                     constant_eps=self._t_prof.constant_eps_expl)
    else:
        # Any other value falls back to vanilla NFSP sampling.
        self._sampler = VanillaSampler(t_prof=t_prof,
                                       env_bldr=self._env_bldr,
                                       br_buf2=self._br_buf2,
                                       avg_buf2=self._avg_buf2,
                                       br_learner2=self._br_learner2,
                                       avg_learner2=self._avg_learner2)

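The adam and clean samplers take a constant_eps exploration parameter while the vanilla fallback does not, as seen above. A minimal sketch of the same dispatch written as a lookup table; _make_sampler is a hypothetical helper, not part of the NFSP code:

def _make_sampler(t_prof, **kwargs):
    # kwargs carries env_bldr, br_buf2, avg_buf2, br_learner2, avg_learner2, exactly as above.
    cls = {"adam": AdamSampler, "clean": CleanSampler}.get(t_prof.sampling, VanillaSampler)
    if cls is VanillaSampler:
        return cls(t_prof=t_prof, **kwargs)
    return cls(t_prof=t_prof, constant_eps=t_prof.constant_eps_expl, **kwargs)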