def act(self, step_wrappers):
    """Select actions for the given steps and record them into the buffers.

    Acts with a per-env mix of BR and AVG policies (driven by
    ``self._current_policy_tags``), exploring with probability
    ``self.br_learner.eps``. Afterwards each step is stored in the
    average-policy buffer (only for BR-tagged envs, and optionally
    excluding random actions) and always in the BR replay memory.
    """
    # NOTE(review): this method passes owner=self.seat_id while the sibling
    # act_for_* methods pass owner=self.owner — confirm both name the same seat.
    SeatActorBase.act_mixed(step_wrappers=step_wrappers,
                            owner=self.seat_id,
                            br_learner=self.br_learner,
                            avg_learner=self.avg_learner,
                            current_policy_tags=self._current_policy_tags,
                            random_prob=self.br_learner.eps)

    for wrapper in step_wrappers:
        env_i = wrapper.env_idx

        # Avg buffer only learns from BR-tagged envs; random actions are
        # included only if the training profile asks for them.
        tagged_br = self._current_policy_tags[env_i] == SeatActorBase.BR
        keep_step = self._t_prof.add_random_actions_to_buffer or not wrapper.action_was_random
        if tagged_br and keep_step:
            mask = rl_util.get_legal_action_mask_np(
                n_actions=self._env_bldr.N_ACTIONS,
                legal_actions_list=wrapper.legal_actions_list)
            self._avg_buf_savers[env_i].add_step(pub_obs=wrapper.obs,
                                                 a=wrapper.action,
                                                 legal_actions_mask=mask)

        # BR memory receives every transition, random or not.
        self._br_memory_savers[env_i].add_experience(
            obs_t_before_acted=wrapper.obs,
            a_selected_t=wrapper.action,
            legal_actions_list_t=wrapper.legal_actions_list)
def act_for_br_opp(self, step_wrappers):
    """Act as the opponent while a BR strategy traverses.

    Anticipatory mixture of greedy BR and AVG policies per the per-env
    policy tags, with zero exploration (``random_prob=0``).
    """
    SeatActorBase.act_mixed(step_wrappers=step_wrappers,
                            owner=self.owner,
                            br_learner=self.br_learner,
                            avg_learner=self.avg_learner,
                            current_policy_tags=self._current_policy_tags_OPP_BR,
                            random_prob=0)
def act_for_avg_opp(self, step_wrappers):
    """Act purely at random as the opponent of an AVG traverser.

    Full exploration (``explore=True``) is required here so that the
    reach probabilities used for the average-policy update are correct.
    """
    SeatActorBase.act_mixed(step_wrappers=step_wrappers,
                            owner=self.owner,
                            br_learner=self.br_learner,
                            avg_learner=self.avg_learner,
                            current_policy_tags=self._current_policy_tags_O_AVG,
                            explore=True)
def act_for_br_trav(self, step_wrappers):
    """Act as the BR traverser and store the transitions.

    Uses the per-env policy tags with constant-epsilon exploration
    (``self._constant_eps``), then writes every step — random or not —
    into the BR replay memory.
    """
    SeatActorBase.act_mixed(step_wrappers=step_wrappers,
                            owner=self.owner,
                            br_learner=self.br_learner,
                            avg_learner=self.avg_learner,
                            current_policy_tags=self._current_policy_tags_T_BR,
                            random_prob=self._constant_eps)

    for wrapper in step_wrappers:
        self._br_memory_savers[wrapper.env_idx].add_experience(
            obs_t_before_acted=wrapper.obs,
            a_selected_t=wrapper.action,
            legal_actions_list_t=wrapper.legal_actions_list)