def _compute_best_responses(self):
    """Computes each player best-response against the pool of other players.

    Wraps the per-infostate policy lookup in a policy object and computes,
    for every player, a best response against it via
    `exploitability.best_response`, storing the result dicts in
    `self._best_responses`.
    """
    def policy_fn(state):
        # `information_state_string()` is the current OpenSpiel accessor
        # (the legacy `information_state()` was renamed); likewise
        # `tabular_policy_from_callable` supersedes `PolicyFromCallable`.
        key = state.information_state_string()
        return self._get_infostate_policy(key)

    current_policy = policy.tabular_policy_from_callable(self._game, policy_fn)
    for player_id in range(self._game.num_players()):
        self._best_responses[player_id] = exploitability.best_response(
            self._game, current_policy, player_id)
def _compute_best_responses(self):
    """Computes each player best-response against the pool of other players."""
    # Expose the infostate->action-probabilities table as a policy object.
    current_policy = policy.tabular_policy_from_callable(
        self._game,
        lambda state: self._get_infostate_policy(
            state.information_state_string()))
    # One best-response computation per player against that joint policy.
    for pid in range(self._game.num_players()):
        self._best_responses[pid] = exploitability.best_response(
            self._game, current_policy, pid)
def compute_best_reponses(self):
    """Updates self._oracles to hold best responses for each player.

    For every player i, builds the joint policy pi_{-i} from the current
    pool, computes a best response to it, and records the resulting full
    best-response policy in `self._best_responses` (and, when oracle
    tracking is enabled, appends it to `self._oracles[i]`).
    """
    # NOTE(review): method name keeps the historical "reponses" spelling
    # for caller compatibility.
    for player in range(self._num_players):
        # Wrap the pool's joint policy so the BR solver can query it.
        pool_callable = _joint_policy(self._policies)
        pool_policy = policy.PolicyFromCallable(self._game, pool_callable)
        br_info = exploitability.best_response(self._game, pool_policy, player)
        br_policy = _full_best_response_policy(br_info["best_response_action"])
        self._best_responses[player] = br_policy
        if self._oracles is not None:
            self._oracles[player].append(br_policy)
def test_kuhn_poker_uniform_random_best_response_pid1(self):
    """Checks player 1's best response to uniform random in Kuhn poker."""
    game = pyspiel.load_game("kuhn_poker")
    uniform = policy.UniformRandomPolicy(game)
    results = exploitability.best_response(game, uniform, player_id=1)
    expected_actions = {
        # After a pass, betting is always best.
        "0p": 1,
        "1p": 1,
        "2p": 1,
        # Facing a bet, call unless we know we're beaten.
        "0b": 0,
        "1b": 1,
        "2b": 1,
    }
    self.assertEqual(results["best_response_action"], expected_actions)
    # A uniform random opponent must be exploitable.
    self.assertGreater(results["nash_conv"], 0.1)
def test_kuhn_poker_uniform_random_best_response_pid0(self):
    """Checks player 0's best response to uniform random in Kuhn poker."""
    game = pyspiel.load_game("kuhn_poker")
    uniform = policy.UniformRandomPolicy(game)
    results = exploitability.best_response(game, uniform, player_id=0)
    expected_actions = {
        "0": 1,  # Bet in case opponent folds when winning
        "1": 1,  # Bet in case opponent folds when winning
        "2": 0,  # Both equally good (we return the lowest action)
        # Some of these will never happen under the best-response policy,
        # but we have computed best-response actions anyway.
        "0pb": 0,  # Fold - we're losing
        "1pb": 1,  # Call - we're 50-50
        "2pb": 1,  # Call - we've won
    }
    self.assertEqual(results["best_response_action"], expected_actions)
    # A uniform random opponent must be exploitable.
    self.assertGreater(results["nash_conv"], 0.1)
print(f"saving to: {save_prefix + '_infostates.npy'}") np.save(save_prefix + '_infostates', np.array(cfr_infostates)) if algorithm == 'cfr': solver = cfr.CFRSolver(game) run(solver, iterations) elif algorithm == 'xfp': solver = fictitious_play.XFPSolver(game) run(solver, iterations) elif algorithm == 'xdo': brs = [] info_test = [] for i in range(2): br_info = exploitability.best_response( game, cfr.CFRSolver(game).average_policy(), i) full_br_policy = _full_best_response_policy( br_info["best_response_action"]) info_sets = br_info['info_sets'] info_test.append(info_sets) brs.append(full_br_policy) br_list = [brs] start_time = time.time() xdo_times = [] xdo_exps = [] xdo_episodes = [] xdo_infostates = [] br_conv_threshold = starting_br_conv_threshold