def test_tic_tac_toe_number_histories(self):
  game = pyspiel.load_game("tic_tac_toe")
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=True,
      include_chance_states=False,
      to_string=lambda s: s.history_str())
  self.assertLen(states, 549946)
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=True,
      include_chance_states=False,
      to_string=str)
  self.assertLen(states, 5478)
def main(_):
  games_list = pyspiel.registered_games()
  print("Registered games:")
  print(games_list)
  print("Creating game: " + FLAGS.game)
  if FLAGS.players is not None:
    # If passing parameters, must use game creator.
    game = pyspiel.load_game(
        FLAGS.game, {"players": pyspiel.GameParameter(FLAGS.players)})
  else:
    # Otherwise can create directly.
    game = pyspiel.load_game(FLAGS.game)
  print("Getting all states; depth_limit = {}".format(FLAGS.depth_limit))
  all_states = get_all_states.get_all_states(game, FLAGS.depth_limit,
                                             FLAGS.include_terminals,
                                             FLAGS.include_chance_states)
  count = 0
  for state in all_states:
    print("")
    print(str(state))
    count += 1
  print("")
  print("Total: {} states.".format(count))
def test_cpp_and_python_implementations_are_identical(self, game_name):
  game = pyspiel.load_game(game_name)
  python_policy = policy.UniformRandomPolicy(game)
  pyspiel_policy = pyspiel.UniformRandomPolicy(game)
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())
  for current_player in range(game.num_players()):
    python_br = best_response.BestResponsePolicy(game, current_player,
                                                 python_policy)
    cpp_br = pyspiel.TabularBestResponse(
        game, current_player, pyspiel_policy).get_best_response_policy()
    for state in all_states.values():
      if state.current_player() != current_player:
        continue
      # TODO(b/141737795): Decide what to do about this.
      self.assertEqual(
          python_br.action_probabilities(state), {
              a: prob
              for a, prob in cpp_br.action_probabilities(state).items()
              if prob != 0
          })
def test_simultaneous_python_game_get_all_state(self):
  game = pyspiel.load_game(
      "python_iterated_prisoners_dilemma(max_game_length=6)")
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=True,
      include_chance_states=False,
      to_string=lambda s: s.history_str())
  self.assertLen(states, 10921)
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=True,
      include_chance_states=False,
      to_string=str)
  self.assertLen(states, 5461)
def test_simultaneous_game_noisy_policy(self, game_name):
  game = pyspiel.load_game(game_name)
  policy = openspiel_policy.UniformRandomPolicy(game)
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=10,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.history_str())
  for current_player in range(game.num_players()):
    noise = noisy_policy.NoisyPolicy(
        policy, player_id=current_player, alpha=0.5, beta=10.)
    for state in all_states.values():
      if state.current_player() == pyspiel.PlayerId.SIMULTANEOUS:
        for player_id in range(game.num_players()):
          if player_id != current_player:
            self.assertEqual(
                policy.action_probabilities(state, player_id),
                noise.action_probabilities(state, player_id))
          else:
            self.assertNotEqual(
                policy.action_probabilities(state, player_id),
                noise.action_probabilities(state, player_id))
def test_cpp_and_python_implementations_are_identical(self, game_name):
  game = pyspiel.load_game(game_name)
  policy = openspiel_policy.UniformRandomPolicy(game)
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())
  for current_player in range(game.num_players()):
    noise = noisy_policy.NoisyPolicy(
        policy, player_id=current_player, alpha=0.5, beta=10.)
    for state in all_states.values():
      if state.current_player() < 0:
        continue
      if state.current_player() != current_player:
        self.assertEqual(policy.action_probabilities(state),
                         noise.action_probabilities(state))
      else:
        self.assertNotEqual(policy.action_probabilities(state),
                            noise.action_probabilities(state))
def main(_):
  games_list = pyspiel.registered_games()
  print("Registered games:")
  for game in games_list:
    print(" ", game.short_name)
  print()
  print("Creating game:", FLAGS.game)
  params = {}
  if FLAGS.players is not None:
    params["players"] = FLAGS.players
  game = pyspiel.load_game(FLAGS.game, params)
  print("Getting all states; depth_limit = {}".format(FLAGS.depth_limit))
  all_states = get_all_states.get_all_states(game, FLAGS.depth_limit,
                                             FLAGS.include_terminals,
                                             FLAGS.include_chance_states)
  count = 0
  for state in all_states:
    print(state)
    count += 1
  print()
  print("Total: {} states.".format(count))
def summarize_infostates(game, num_player=2, num_actions=2):
  """Collects information states and enumerates pure strategies per player."""
  info_states = [[] for _ in range(num_player)]
  init_states = []  # All possible states right after the chance nodes assign.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=True,
      include_chance_states=False,
      to_string=lambda s: s.history_str())
  # Extract all information states.
  for his, state in states.items():
    if not state.is_player_node():
      continue
    cur_p = state.current_player()
    info_states[cur_p].append(state.information_state_string())
    if len(his.split(' ')) == num_player:
      init_states.append(state)
  info_states = [list(set(ele)) for ele in info_states]
  print('info states for players', info_states)
  # Enumerate pure strategies for the info states: each integer in
  # [0, num_actions**len(info_states)) encodes one action per info state.
  strategies = [
      list(range(0, math.floor(math.pow(num_actions, len(ele)))))
      for ele in info_states
  ]
  return info_states, strategies, init_states
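# A minimal sketch (not from the original code) of how a strategy index
# returned by summarize_infostates could be decoded: each integer is read in
# base `num_actions`, one digit per information state. The helper name
# `decode_strategy` is hypothetical.
def decode_strategy(strategy_index, info_states, num_actions=2):
  """Maps a strategy index to a dict {info_state_string: action}."""
  assignment = {}
  for info_state in info_states:
    assignment[info_state] = strategy_index % num_actions
    strategy_index //= num_actions
  return assignment

# Example: with two info states and two actions, index 2 decodes to (0, 1):
# decode_strategy(2, ["infoset_a", "infoset_b"])
#   == {"infoset_a": 0, "infoset_b": 1}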
def test_legal_actions_returns_empty_list_on_opponent(self, game_name):
  game = pyspiel.load_game(game_name)
  some_states = get_all_states.get_all_states(
      game, depth_limit=5, include_terminals=True, include_chance_states=True)
  # We check we have some non-terminal non-random states.
  self.assertTrue(
      any(not s.is_terminal() and not s.is_chance_node()
          for s in some_states.values()))
  for state in some_states.values():
    if not state.is_terminal():
      self.assertNotEqual(state.get_type(), pyspiel.StateType.TERMINAL)
      current_player = state.current_player()
      for player in range(game.num_players()):
        if player != current_player:
          msg = ("The game {!r} does not return an empty list on "
                 "legal_actions(<not current player>)").format(game_name)
          # It is illegal to call legal_actions(player) on a chance node for
          # a non chance player.
          if not (state.is_chance_node() and player != current_player):
            self.assertEmpty(state.legal_actions(player), msg=msg)
    else:
      self.assertEqual(state.get_type(), pyspiel.StateType.TERMINAL)
def compute_states_and_info_states_if_none(game,
                                           all_states=None,
                                           state_to_information_state=None):
  """Returns all_states and/or state_to_information_state for the game.

  To recompute everything, pass in None for both all_states and
  state_to_information_state. Otherwise, this function will use the passed in
  values to reconstruct either of them.

  Args:
    game: The open_spiel game.
    all_states: The result of calling get_all_states.get_all_states. Cached
      for improved performance.
    state_to_information_state: A dict mapping str(state) to
      state.information_state for every state in the game. Cached for improved
      performance.
  """
  if all_states is None:
    all_states = get_all_states.get_all_states(
        game,
        depth_limit=-1,
        include_terminals=False,
        include_chance_states=False)
  if state_to_information_state is None:
    state_to_information_state = {
        state: all_states[state].information_state_string()
        for state in all_states
    }
  return all_states, state_to_information_state
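# A brief usage sketch (assumes kuhn_poker, registered in standard OpenSpiel
# builds): compute both maps once, then pass them back in so repeated calls
# skip the full tree traversal.
import pyspiel

game = pyspiel.load_game("kuhn_poker")
all_states, state_to_info_state = compute_states_and_info_states_if_none(game)
# Later calls reuse the cached values and return them unchanged.
all_states, state_to_info_state = compute_states_and_info_states_if_none(
    game, all_states, state_to_info_state)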
def setUpClass(cls):
  super(EnforceAPIOnFullTreeBase, cls).setUpClass()
  cls.all_states = set(
      get_all_states.get_all_states(
          cls.game,
          depth_limit=-1,
          include_terminals=True,
          include_chance_states=True).values())
def test_simultaneous_game_get_all_state(self):
  game = pyspiel.load_game("goofspiel", {"num_cards": 3})
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=True,
      include_chance_states=False,
      to_string=lambda s: s.history_str())
  self.assertLen(states, 273)
def __init__(self,
             game,
             players=None,
             to_string=lambda s: s.history_str(),
             states=None):
  """Initializes a uniform random policy for all players in the game."""
  players = sorted(players or range(game.num_players()))
  super().__init__(game, players)
  self.game_type = game.get_type()
  # Get all states in the game at which players have to make decisions unless
  # they are explicitly specified.
  states = states or get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      include_mean_field_states=False,
      to_string=to_string)
  # Assemble legal actions for every valid (state, player) pair, keyed by
  # information state string.
  self.state_lookup = {}
  self.states_per_player = [[] for _ in range(game.num_players())]
  self.states = []
  legal_actions_list = []
  state_in_list = []
  for player in players:
    # States are ordered by their history.
    for _, state in sorted(states.items(), key=lambda pair: pair[0]):
      if state.is_simultaneous_node() or player == state.current_player():
        legal_actions = state.legal_actions_mask(player)
        if any(legal_actions):
          key = self._state_key(state, player)
          if key not in self.state_lookup:
            state_index = len(legal_actions_list)
            self.state_lookup[key] = state_index
            legal_actions_list.append(legal_actions)
            self.states_per_player[player].append(key)
            self.states.append(state)
            if self.game_type.provides_information_state_tensor:
              state_in_list.append(state.information_state_tensor(player))
            elif self.game_type.provides_observation_tensor:
              state_in_list.append(state.observation_tensor(player))
  # Put legal action masks in a numpy array and create the uniform random
  # policy.
  self.state_in = None
  if state_in_list:
    self.state_in = np.array(state_in_list)
  self.legal_actions_mask = np.array(legal_actions_list)
  self.action_probability_array = (
      self.legal_actions_mask /
      np.sum(self.legal_actions_mask, axis=-1, keepdims=True))
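# A small standalone illustration of the final normalization step in the
# constructor above: dividing each legal-action mask by its row sum yields a
# uniform distribution over the legal actions only.
import numpy as np

legal_actions_mask = np.array([[1, 0, 1], [1, 1, 1]])
probs = legal_actions_mask / np.sum(legal_actions_mask, axis=-1, keepdims=True)
# probs == [[0.5, 0.0, 0.5],
#           [1/3, 1/3, 1/3]]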
def test_consistent(self):
  """Checks the Python and C++ game implementations are the same."""
  py_game = pyspiel.load_game("python_tic_tac_toe")
  cc_game = pyspiel.load_game("tic_tac_toe")
  py_obs = make_observation(py_game)
  cc_obs = make_observation(cc_game)
  py_states = get_all_states(py_game, to_string=str)
  cc_states = get_all_states(cc_game, to_string=str)
  self.assertCountEqual(list(cc_states), list(py_states))
  for key, cc_state in cc_states.items():
    py_state = py_states[key]
    np.testing.assert_array_equal(py_state.history(), cc_state.history())
    np.testing.assert_array_equal(py_state.returns(), cc_state.returns())
    py_obs.set_from(py_state, 0)
    cc_obs.set_from(cc_state, 0)
    np.testing.assert_array_equal(py_obs.tensor, cc_obs.tensor)
def value_iteration(game, depth_limit, threshold):
  """Solves for the optimal value function of a game.

  For small games only! Solves the game using value iteration, with the
  maximum error for the value function less than threshold. This algorithm
  works for sequential 1-player games or 2-player zero-sum games, with or
  without chance nodes.

  Arguments:
    game: The game to analyze, as returned by `load_game`.
    depth_limit: How deeply to analyze the game tree. Negative means no limit,
      0 means root-only, etc.
    threshold: Maximum error for state values.

  Returns:
    A `dict` with string keys and float values, mapping string encoding of
    states to the values of those states.
  """
  if game.num_players() not in (1, 2):
    raise ValueError("Game must be a 1-player or 2-player game")
  if (game.num_players() == 2 and
      game.get_type().utility != pyspiel.GameType.Utility.ZERO_SUM):
    raise ValueError("2-player games must be zero sum games")
  # We expect Value Iteration to be used with perfect information games, in
  # which `str` is assumed to display the state of the game.
  states = get_all_states.get_all_states(
      game, depth_limit, True, False, to_string=str)
  values = {}
  transitions = {}
  _initialize_maps(states, values, transitions)
  error = threshold + 1  # A value larger than threshold
  min_utility = game.min_utility()
  while error > threshold:
    error = 0
    for key, state in states.items():
      if state.is_terminal():
        continue
      player = state.current_player()
      value = min_utility if player == 0 else -min_utility
      for action in state.legal_actions():
        next_states = transitions[(key, action)]
        q_value = sum(p * values[next_state] for next_state, p in next_states)
        if player == 0:
          value = max(value, q_value)
        else:
          value = min(value, q_value)
      error = max(abs(values[key] - value), error)
      values[key] = value
  return values
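# A brief usage sketch (assumes the standard OpenSpiel tic_tac_toe game): run
# value iteration over the full tree and read off the root value. With optimal
# play tic-tac-toe is a draw, so the root value should be 0.
import pyspiel

game = pyspiel.load_game("tic_tac_toe")
values = value_iteration(game, depth_limit=-1, threshold=0.01)
root_key = str(game.new_initial_state())
print("Root value:", values[root_key])  # Expected: 0.0 (a draw).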
def test_consistent(self):
  """Checks the Python and C++ game implementations are the same."""
  py_game = pyspiel.load_game("python_kuhn_poker")
  cc_game = pyspiel.load_game("kuhn_poker")
  obs_types = [None, pyspiel.IIGObservationType(perfect_recall=True)]
  py_observations = [make_observation(py_game, o) for o in obs_types]
  cc_observations = [make_observation(cc_game, o) for o in obs_types]
  py_states = get_all_states(py_game)
  cc_states = get_all_states(cc_game)
  self.assertCountEqual(list(cc_states), list(py_states))
  for key, cc_state in cc_states.items():
    py_state = py_states[key]
    np.testing.assert_array_equal(py_state.history(), cc_state.history())
    np.testing.assert_array_equal(py_state.returns(), cc_state.returns())
    for py_obs, cc_obs in zip(py_observations, cc_observations):
      for player in (0, 1):
        py_obs.set_from(py_state, player)
        cc_obs.set_from(cc_state, player)
        np.testing.assert_array_equal(py_obs.tensor, cc_obs.tensor)
def test_has_at_least_an_action(self, game_name):
  """Checks that all of the population's states have at least one action."""
  game = pyspiel.load_game(game_name)
  to_string = lambda s: s.observation_string(
      pyspiel.PlayerId.DEFAULT_PLAYER_ID)
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      include_mean_field_states=False,
      to_string=to_string)
  for state in states.values():
    self.assertNotEmpty(state.legal_actions())
def self_train():
  env = rl_environment.Environment("kuhn_poker")
  num_actions = env.action_spec()["num_actions"]
  player1 = QLearner(0, num_actions)
  player2 = QLearner(1, num_actions)
  state_size = env.observation_spec()["info_state"][0]
  with tf.Session() as sess:
    player1 = DQN(sess, 0, 11,
                  state_representation_size=state_size,
                  num_actions=num_actions)
    player2 = DQN(sess, 1, 11,
                  state_representation_size=state_size,
                  num_actions=num_actions)
    players = [player1, player2]
    iterations = 1000000
    for episode in range(iterations):
      if episode % 1000 == 0:
        print("Curr_episode", str(episode))
      time_step = env.reset()
      while not time_step.last():
        curr_player_id = time_step.current_player()
        agent_output = players[curr_player_id].step(time_step)
        time_step = env.step([agent_output.action])
      # Episode is over; let both agents observe the final step.
      for player in players:
        player.step(time_step)
  # Note: `_q_values` exists on the tabular QLearner agents created above,
  # not on DQN; this print only applies when the QLearners are used.
  print(player1._q_values)
  game = pyspiel.load_game("kuhn_poker")
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())
  # Initialized to uniform for each state.
  tabular_policy = TabularPolicy(game)
  for state in all_states:
    state_policy = tabular_policy.policy_for_key(state)
    print("State: {}, state_policy: {}".format(state, state_policy))
def test_compression_binary(self):
  # All infostates for leduc are binary, so we can compress them effectively.
  game = pyspiel.load_game("leduc_poker")
  obs1 = make_observation(game, INFO_STATE_OBS_TYPE)
  obs2 = make_observation(game, INFO_STATE_OBS_TYPE)
  self.assertLen(obs1.tensor, 30)  # 30 floats = 120 bytes
  for state in get_all_states.get_all_states(game).values():
    for player in range(game.num_players()):
      obs1.set_from(state, player)
      compressed = obs1.compress()
      self.assertEqual(type(compressed), bytes)
      self.assertLen(compressed, 5)
      obs2.decompress(compressed)
      np.testing.assert_array_equal(obs1.tensor, obs2.tensor)
def policy_to_dict_but_we_can_actually_use_it(player_policy,
                                              game,
                                              all_states=None,
                                              state_to_information_state=None,
                                              player_id: Optional[int] = None):
  """Converts a Policy instance into a tabular policy represented as a dict.

  This is compatible with the C++ TabularExploitability code (i.e.
  pyspiel.exploitability, pyspiel.TabularBestResponse, etc.).

  While you do not have to pass the all_states and state_to_information_state
  arguments, creating them outside of this function will speed your code up
  dramatically.

  Args:
    player_policy: The policy you want to convert to a dict.
    game: The game the policy is for.
    all_states: The result of calling get_all_states.get_all_states. Can be
      cached for improved performance.
    state_to_information_state: A dict mapping str(state) to
      state.information_state for every state in the game. Can be cached for
      improved performance.
    player_id: If set, only states where this player is to act are included.

  Returns:
    A dictionary version of player_policy that can be passed to the C++
    TabularBestResponse, Exploitability, and BestResponse functions/classes.
  """
  if all_states is None:
    all_states = get_all_states.get_all_states(
        game,
        depth_limit=-1,
        include_terminals=False,
        include_chance_states=False)
  if state_to_information_state is None:
    state_to_information_state = {
        state: str(
            np.asarray(
                all_states[state].information_state_as_normalized_vector(),
                dtype=np.float32).tolist()) for state in all_states
    }
  tabular_policy = dict()
  for state in all_states:
    if (player_id is not None and
        all_states[state].current_player() != player_id):
      continue
    information_state = state_to_information_state[state]
    tabular_policy[information_state] = list(
        player_policy.action_probabilities(all_states[state]).items())
  return tabular_policy
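# A brief usage sketch: build the dict form of a uniform random policy. Note
# that, unlike the standard policy_to_dict, the keys here are string-encoded
# information-state tensors (see the comprehension above), so any consumer
# must key its lookups the same way.
import pyspiel
from open_spiel.python import policy

game = pyspiel.load_game("kuhn_poker")
policy_dict = policy_to_dict_but_we_can_actually_use_it(
    policy.UniformRandomPolicy(game), game, player_id=0)
for info_state_key, action_probs in list(policy_dict.items())[:3]:
  print(info_state_key, action_probs)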
def test_tabular_policy_to_csv(tmpdir):
  # Setup game and policy.
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save policy as CSV.
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  assert list(tmpdir.listdir()) == [output]
  # Check created CSV.
  csv = pd.read_csv(output, index_col=0)
  # Get all states in the game at which players have to make decisions.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False)
  assert set(csv.index.values) <= set(states.keys())
  assert len(csv.columns) == game.num_distinct_actions()
def test_policy_on_game(self, game, policy_object):
  """Checks the policy conforms to the conventions.

  Checks that `Policy.action_probabilities` contains only legal actions (but
  not necessarily all of them). Checks that the probabilities are positive
  and sum to 1.

  Args:
    self: The test class. This method is intended to be used as a utility
      function to test policies.
    game: A `pyspiel.Game`, same as the one used in the policy.
    policy_object: A `policy.Policy` object on `game` to test.
  """
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())
  for state in all_states.values():
    legal_actions = set(state.legal_actions())
    action_probabilities = policy_object.action_probabilities(state)
    for action in action_probabilities.keys():
      # We want a clearer error message to be able to debug.
      actions_missing = set(legal_actions) - set(action_probabilities.keys())
      illegal_actions = set(action_probabilities.keys()) - set(legal_actions)
      self.assertIn(
          action,
          legal_actions,
          msg="The action {} is present in the policy but is not a legal "
          "action (these are {})\n"
          "Legal actions missing from policy: {}\n"
          "Illegal actions present in policy: {}".format(
              action, legal_actions, actions_missing, illegal_actions))
    sum_ = 0
    for prob in action_probabilities.values():
      sum_ += prob
      self.assertGreaterEqual(prob, 0)
    self.assertAlmostEqual(1, sum_)
def get_tabular_policy_states(game):
  """Returns the states of the game for a tabular policy."""
  if game.get_type().dynamics == pyspiel.GameType.Dynamics.MEAN_FIELD:
    # TODO(perolat): We use s.observation_string(DEFAULT_MFG_PLAYER) here as
    # the number of histories is exponential in the depth of the MFG. What we
    # really need is a representation of the state. For many-player mean field
    # games, the state will be (x0, x1, x2, ..., xn) and observation_string(0)
    # will output the string of x0. In that case we would need something like
    # str([observation_string(i) for i in range(num_player)]).
    to_string = lambda s: s.observation_string(
        pyspiel.PlayerId.DEFAULT_PLAYER_ID)
  else:
    to_string = lambda s: s.history_str()
  return get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      include_mean_field_states=False,
      to_string=to_string)
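# A brief usage sketch (assumes mfg_crowd_modelling, a mean field game
# registered in standard OpenSpiel builds): the helper picks the appropriate
# state key automatically, observation strings for mean field games and
# history strings otherwise.
import pyspiel

states = get_tabular_policy_states(pyspiel.load_game("mfg_crowd_modelling"))
print(len(states), "decision states")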
def test_callable_policy_to_csv(tmpdir):
  def _uniform_policy(state):
    actions = state.legal_actions()
    p = 1.0 / len(actions)
    return [(a, p) for a in actions]

  # Setup game and policy.
  game = pyspiel.load_game("kuhn_poker")
  callable_policy = policy.PolicyFromCallable(game, _uniform_policy)
  # Save policy as CSV.
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, callable_policy, output)
  assert list(tmpdir.listdir()) == [output]
  # Check created CSV.
  csv = pd.read_csv(output, index_col=0)
  # Get all states in the game at which players have to make decisions.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False)
  assert set(csv.index.values) <= set(states.keys())
def test_compression_none(self):
  # Most observations for leduc have non-binary data, so we can't
  # currently compress them.
  game = pyspiel.load_game("leduc_poker")
  obs1 = make_observation(game)
  obs2 = make_observation(game)
  self.assertLen(obs1.tensor, 16)  # 16 floats = 64 bytes
  freq = collections.Counter()
  for state in get_all_states.get_all_states(game).values():
    for player in range(game.num_players()):
      obs1.set_from(state, player)
      compressed = obs1.compress()
      self.assertEqual(type(compressed), bytes)
      freq[len(compressed)] += 1
      obs2.decompress(compressed)
      np.testing.assert_array_equal(obs1.tensor, obs2.tensor)
  expected_freq = {
      3: 840,  # Compressible states take 3 bytes.
      65: 17760,  # Uncompressible states take 65 bytes.
  }
  self.assertEqual(freq, expected_freq)
def print_policy_analysis(policies, game, verbose=False):
  """Prints policy diversity among a game's known policies.

  Warning: only works with deterministic policies.

  Args:
    policies: List of lists of policies (one list per game player).
    game: OpenSpiel game object.
    verbose: Whether to print policy diversity information. (True: print)

  Returns:
    List of lists of unique policies (one list per player).
  """
  states_dict = get_all_states.get_all_states(game, np.infty, False, False)
  unique_policies = []
  for player in range(len(policies)):
    cur_policies = policies[player]
    cur_set = set()
    for pol in cur_policies:
      cur_str = ""
      for state_str in states_dict:
        if states_dict[state_str].current_player() == player:
          pol_action_dict = pol(states_dict[state_str])
          max_prob = max(list(pol_action_dict.values()))
          max_prob_actions = [
              a for a in pol_action_dict if pol_action_dict[a] == max_prob
          ]
          cur_str += "__" + state_str
          for a in max_prob_actions:
            cur_str += "-" + str(a)
      cur_set.add(cur_str)
    unique_policies.append(cur_set)
  if verbose:
    print("\n---------------------------\nPolicy Diversity :")
    for player, cur_set in enumerate(unique_policies):
      print("Player {} : {} unique policies.".format(player, len(cur_set)))
    print("")
  return unique_policies
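# A minimal usage sketch with two hypothetical deterministic callables per
# player; each maps a state to an {action: probability} dict, as the analysis
# above expects.
import pyspiel

def _always_first(state):
  return {state.legal_actions()[0]: 1.0}

def _always_last(state):
  return {state.legal_actions()[-1]: 1.0}

game = pyspiel.load_game("kuhn_poker")
policies = [[_always_first, _always_last], [_always_first, _always_first]]
unique = print_policy_analysis(policies, game, verbose=True)
# Expect 2 unique policies for player 0 and 1 for player 1.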
def __call__(self, player, player_policy, info_states):
  """Computes action values per state for the player.

  Args:
    player: The id of the player (0 <= player < game.num_players()). This
      player will play `player_policy`, while the opponent will play a best
      response.
    player_policy: A `policy.Policy` object.
    info_states: A list of info state strings.

  Returns:
    A `_CalculatorReturn` namedtuple. See its docstring for the documentation.
  """
  self.player = player
  opponent = 1 - player

  def best_response_policy(state):
    infostate = state.information_state_string(opponent)
    action = best_response_actions[infostate]
    return [(action, 1.0)]

  # If the policy is a TabularPolicy, we can directly copy the infostate
  # strings & values from the class. This is significantly faster than having
  # to create the infostate strings.
  if isinstance(player_policy, policy.TabularPolicy):
    tabular_policy = {
        key: _tuples_from_policy(player_policy.policy_for_key(key))
        for key in player_policy.state_lookup
    }
  # Otherwise, we have to calculate all the infostate strings every time.
  # This is ~2x slower.
  else:
    # We cache these as they are expensive to compute & do not change.
    if self._all_states is None:
      self._all_states = get_all_states.get_all_states(
          self.game,
          depth_limit=-1,
          include_terminals=False,
          include_chance_states=False)
      self._state_to_information_state = {
          state: self._all_states[state].information_state_string()
          for state in self._all_states
      }
    tabular_policy = policy_utils.policy_to_dict(
        player_policy, self.game, self._all_states,
        self._state_to_information_state)

  # When constructed, TabularBestResponse does a lot of work; we can save
  # that work by caching it.
  if self._best_responder[player] is None:
    self._best_responder[player] = pyspiel.TabularBestResponse(
        self.game, opponent, tabular_policy)
  else:
    self._best_responder[player].set_policy(tabular_policy)

  # Computing the value at the root calculates best responses everywhere.
  history = str(self.game.new_initial_state())
  best_response_value = self._best_responder[player].value(history)
  best_response_actions = self._best_responder[
      player].get_best_response_actions()

  # Compute action values.
  self._action_value_calculator.compute_all_states_action_values({
      player: player_policy,
      opponent: policy.PolicyFromCallable(self.game, best_response_policy),
  })
  obj = self._action_value_calculator._get_tabular_statistics(  # pylint: disable=protected-access
      ((player, s) for s in info_states))

  # Return values.
  return _CalculatorReturn(
      exploitability=best_response_value,
      values_vs_br=obj.action_values,
      counterfactual_reach_probs_vs_br=obj.counterfactual_reach_probs,
      player_reach_probs_vs_br=obj.player_reach_probs)
def value_iteration(game, depth_limit, threshold, cyclic_game=False):
  """Solves for the optimal value function of a game.

  For small games only! Solves the game using value iteration, with the
  maximum error for the value function less than threshold. This algorithm
  works for sequential 1-player games or 2-player zero-sum games, with or
  without chance nodes.

  Arguments:
    game: The game to analyze, as returned by `load_game`.
    depth_limit: How deeply to analyze the game tree. Negative means no limit,
      0 means root-only, etc.
    threshold: Maximum error for state values.
    cyclic_game: Set to True if the game has cycles (from state A we can get
      to state B, and from state B we can get back to state A).

  Returns:
    A `dict` with string keys and float values, mapping string encoding of
    states to the values of those states.
  """
  assert game.num_players() in (1, 2), (
      "Game must be a 1-player or 2-player game")
  if game.num_players() == 2:
    assert game.get_type().utility == pyspiel.GameType.Utility.ZERO_SUM, (
        "2-player games must be zero sum games")
  # Must be perfect information or one-shot (not imperfect information).
  assert (game.get_type().information ==
          pyspiel.GameType.Information.ONE_SHOT or
          game.get_type().information ==
          pyspiel.GameType.Information.PERFECT_INFORMATION)

  # We expect Value Iteration to be used with perfect information games, in
  # which `str` is assumed to display the state of the game.
  states = get_all_states.get_all_states(
      game,
      depth_limit,
      True,
      False,
      to_string=str,
      stop_if_encountered=cyclic_game)
  values = {}
  transitions = {}

  _initialize_maps(states, values, transitions)
  error = threshold + 1  # A value larger than threshold
  min_utility = game.min_utility()
  while error > threshold:
    error = 0
    for key, state in states.items():
      if state.is_terminal():
        continue
      elif state.is_simultaneous_node():
        # Simultaneous node: assemble a matrix game from the child utilities
        # and solve it using a matrix game solver.
        p0_utils = []  # row player
        p1_utils = []  # col player
        row = 0
        for p0action in state.legal_actions(0):
          # New row.
          p0_utils.append([])
          p1_utils.append([])
          for p1action in state.legal_actions(1):
            # Loop from left-to-right of columns.
            next_states = transitions[(key, p0action, p1action)]
            joint_q_value = sum(
                p * values[next_state] for next_state, p in next_states)
            p0_utils[row].append(joint_q_value)
            p1_utils[row].append(-joint_q_value)
          row += 1
        stage_game = pyspiel.create_matrix_game(p0_utils, p1_utils)
        solution = lp_solver.solve_zero_sum_matrix_game(stage_game)
        value = solution[2]
      else:
        # Regular decision node.
        player = state.current_player()
        value = min_utility if player == 0 else -min_utility
        for action in state.legal_actions():
          next_states = transitions[(key, action)]
          q_value = sum(
              p * values[next_state] for next_state, p in next_states)
          if player == 0:
            value = max(value, q_value)
          else:
            value = min(value, q_value)
      error = max(abs(values[key] - value), error)
      values[key] = value
  return values
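# A small illustration of the solver call used at simultaneous nodes above,
# on matching pennies. As the function above assumes, the mixed strategies
# come first in the returned tuple and solution[2] is player 0's game value
# (0.0 here, with both players mixing 50/50).
import pyspiel
from open_spiel.python.algorithms import lp_solver

stage_game = pyspiel.create_matrix_game([[1, -1], [-1, 1]],
                                        [[-1, 1], [1, -1]])
solution = lp_solver.solve_zero_sum_matrix_game(stage_game)
print("Value for player 0:", solution[2])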
def __call__(self, player, player_policy, info_states):
  """Computes action values per state for the player.

  Args:
    player: The id of the player (0 <= player < game.num_players()).
    player_policy: A `policy.Policy` object.
    info_states: A list of info state strings.

  Returns:
    A `_CalculatorReturn` namedtuple. See its docstring for the documentation.
  """
  self.player = player
  opponent = 1 - player

  def best_response_policy(state):
    infostate = state.information_state_string(opponent)
    action = best_response_actions[infostate]
    return [(action, 1.0)]

  # If the policy is a TabularPolicy, we can directly copy the infostate
  # strings & values from the class. This is significantly faster than having
  # to create the infostate strings.
  if isinstance(player_policy, policy.TabularPolicy):
    tabular_policy = {
        key: _tuples_from_policy(player_policy.policy_for_key(key))
        for key in player_policy.state_lookup
    }
  # Otherwise, we have to calculate all the infostate strings every time.
  # This is ~2x slower.
  else:
    # We cache these as they are expensive to compute & do not change.
    if self._all_states is None:
      self._all_states = get_all_states.get_all_states(
          self.game,
          depth_limit=-1,
          include_terminals=False,
          include_chance_states=False)
      self._state_to_information_state = {
          state: self._all_states[state].information_state_string()
          for state in self._all_states
      }
    tabular_policy = policy_utils.policy_to_dict(
        player_policy, self.game, self._all_states,
        self._state_to_information_state)

  # When constructed, TabularBestResponse does a lot of work; we can save
  # that work by caching it.
  if self._best_responder[player] is None:
    self._best_responder[player] = pyspiel.TabularBestResponse(
        self.game, opponent, tabular_policy)
  else:
    self._best_responder[player].set_policy(tabular_policy)

  # Computing the value at the root calculates best responses everywhere.
  history = str(self.game.new_initial_state())
  best_response_value = self._best_responder[player].value(history)
  best_response_actions = self._best_responder[
      player].get_best_response_actions()

  # Compute action values.
  self.action_values = collections.defaultdict(
      lambda: collections.defaultdict(lambda: np.zeros(2)))
  self.info_state_prob = collections.defaultdict(float)
  self.info_state_player_prob = collections.defaultdict(float)
  self.info_state_cf_prob = collections.defaultdict(float)
  self.info_state_chance_prob = collections.defaultdict(float)
  self.get_action_values(
      self.game.new_initial_state(), {
          player: player_policy,
          opponent: policy.PolicyFromCallable(self.game,
                                              best_response_policy),
      })

  # Collect normalized action values for each information state.
  rv = []
  cfrp = []
  player_reach_probs_vs_br = []
  for info_state in info_states:
    key = (player, info_state)
    av = self.action_values[key]
    norm_prob = self.info_state_prob[key]
    rv.append([(av[a][player] / norm_prob) if
               (a in av and norm_prob > 0) else 0
               for a in range(self.num_actions)])
    cfrp.append(self.info_state_cf_prob[key])
    player_reach_probs_vs_br.append(self.info_state_player_prob[key])

  # Return values.
  return _CalculatorReturn(
      exploitability=best_response_value,
      values_vs_br=rv,
      counterfactual_reach_probs_vs_br=cfrp,
      player_reach_probs_vs_br=player_reach_probs_vs_br)