def load_strategy(strategy_type, strategy_kwargs, env, player_id,
                  strategy_weight):
  """Loads a strategy of the given type, initializing it first if required."""
  if strategy_type == "BR":
    agent = policy.TabularPolicy(env.game)
    agent.set_weight(strategy_weight)
    return agent
  elif strategy_type == "ARS":
    agent_class = rl_policy.ARSPolicy
  elif strategy_type == "DQN":
    agent_class = rl_policy.DQNPolicy
  elif strategy_type == "PG":
    agent_class = rl_policy.PGPolicy
  elif strategy_type == "ARS_parallel":
    agent_class = rl_policy.ARSPolicy_parallel
  else:
    raise NotImplementedError

  if "ARS" in strategy_type:
    strategy_kwargs["session"] = None
  else:
    strategy_kwargs["session"] = tf.Session()

  agent = agent_class(env, player_id, **strategy_kwargs)
  agent.set_weights(strategy_weight)
  agent.freeze()
  return agent
def __init__(self,
             game,
             initialize_cumulative_values=False,
             linear_averaging=True,
             regret_matching_plus=True):
  # pyformat: disable
  """Initializer.

  Args:
    game: The `pyspiel.Game` to run on.
    initialize_cumulative_values: Whether to initialize the average policy to
      the uniform policy (and the initial cumulative regret to an epsilon
      value). This is independent of the first CFR iteration, which, when the
      policy is fixed during traversal and we perform non-alternating
      updates, will also compute the uniform policy and add it to the average
      of policies.
    linear_averaging: Whether to use linear averaging, i.e.
      cumulative_policy[info_state][action] += (
        iteration_number * reach_prob * action_prob)
      or not:
      cumulative_policy[info_state][action] += reach_prob * action_prob
    regret_matching_plus: Whether to use Regret Matching+:
      cumulative_regrets = max(cumulative_regrets + regrets, 0)
      or simply regret matching:
      cumulative_regrets = cumulative_regrets + regrets
  """
  # pyformat: enable
  if game.num_players() != 2:
    raise ValueError("Game {} does not have {} players.".format(game, 2))

  self._game = game
  self._num_players = game.num_players()
  self._root_node = self._game.new_initial_state()

  if initialize_cumulative_values:
    initial_positive_value = _INITIAL_POSITIVE_VALUE
  else:
    initial_positive_value = 0
  self._info_state_nodes = {}
  _initialize_info_state_nodes(
      self._root_node,
      info_state_nodes=self._info_state_nodes,
      initial_positive_value=initial_positive_value)

  self._policy_cache = {}

  # This is for returning the current policy and average policy to a caller.
  self._current_policy = policy.TabularPolicy(game)
  self._average_policy = self._current_policy.__copy__()

  self._linear_averaging = linear_averaging
  self._iteration = 0  # For possible linear-averaging.
  self._regret_matching_plus = regret_matching_plus

  self._best_responses = {i: None for i in range(game.num_players())}
def main(_):
  game = pyspiel.load_game(FLAGS.game)
  evaluator = pyspiel.RandomRolloutEvaluator(1, SEED)
  min_expl = game.max_utility() - game.min_utility()

  print("{:>5} {:>10} {:>50} {:>20}".format(
      "max_sims", "uct_c", "final_policy_type", "exploitability"))
  for max_simulations in [10, 100, 1000, 10000]:
    for uct_c in [0.2, 0.5, 1.0, 2.0, 4.0]:  # These values are for Kuhn.
      for final_policy_type in [
          pyspiel.ISMCTSFinalPolicyType.NORMALIZED_VISIT_COUNT,
          pyspiel.ISMCTSFinalPolicyType.MAX_VISIT_COUNT,
          pyspiel.ISMCTSFinalPolicyType.MAX_VALUE
      ]:
        tabular_policy = policy.TabularPolicy(game)
        bot = pyspiel.ISMCTSBot(SEED, evaluator, uct_c, max_simulations, -1,
                                final_policy_type, False, False)
        searched = {}
        construct_is_mcts_policy(game, game.new_initial_state(),
                                 tabular_policy, bot, searched)
        expl = exploitability.exploitability(game, tabular_policy)
        print("{:>5} {:>10} {:>50} {:>20}".format(
            max_simulations, uct_c, str(final_policy_type), expl))
        if expl < min_expl:
          min_expl = expl
  print("Min expl: {}".format(min_expl))
def kuhn_nash_equilibrium(alpha):
  """Returns a Nash equilibrium in Kuhn poker parameterized by alpha in [0, 1/3].

  See https://en.wikipedia.org/wiki/Kuhn_poker#Optimal_strategy

  Args:
    alpha: The probability for Player 0 to bet when holding a Jack.

  Raises:
    ValueError: If `alpha` is not within [0, 1/3].
  """
  if not 0 <= alpha <= 1 / 3:
    raise ValueError("alpha ({}) must be in [0, 1/3]".format(alpha))
  bet_probability = {
      # Player 0
      "0": alpha,
      "0pb": 0,
      "1": 0,
      "1pb": 1 / 3 + alpha,
      "2": 3 * alpha,
      "2pb": 1,
      # Player 1
      "0p": 1 / 3,
      "0b": 0,
      "1p": 0,
      "1b": 1 / 3,
      "2p": 1,
      "2b": 1,
  }
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  for state, p in bet_probability.items():
    tabular_policy.policy_for_key(state)[:] = [1 - p, p]
  return tabular_policy
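# A quick sanity check for the helper above (a minimal sketch, assuming the
# standard OpenSpiel `exploitability` module is importable alongside it): every
# alpha in [0, 1/3] parameterizes a Nash equilibrium, so each returned tabular
# policy should have exploitability numerically close to zero.
import pyspiel
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
for alpha in [0.0, 0.2, 1 / 3]:
  eq_policy = kuhn_nash_equilibrium(alpha)
  print(alpha, exploitability.exploitability(game, eq_policy))  # ~0 for each alpha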
def __init__(self, game, initialize_cumulative_values, alternating_updates,
             linear_averaging, regret_matching_plus):
  # pyformat: disable
  """Initializer.

  Args:
    game: The `pyspiel.Game` to run on.
    initialize_cumulative_values: Whether to initialize the average policy to
      the uniform policy (and the initial cumulative regret to an epsilon
      value). This is independent of the first CFR iteration, which, when the
      policy is fixed during traversal and we perform non-alternating
      updates, will also compute the uniform policy and add it to the average
      of policies.
    alternating_updates: If `True`, alternating updates are performed: for
      each player, we compute and update the cumulative regrets and policies.
      In that case, and when the policy is frozen during tree traversal, the
      cache is reset after each update for one player. Otherwise, the update
      is simultaneous.
    linear_averaging: Whether to use linear averaging, i.e.
      cumulative_policy[info_state][action] += (
        iteration_number * reach_prob * action_prob)
      or not:
      cumulative_policy[info_state][action] += reach_prob * action_prob
    regret_matching_plus: Whether to use Regret Matching+:
      cumulative_regrets = max(cumulative_regrets + regrets, 0)
      or simply regret matching:
      cumulative_regrets = cumulative_regrets + regrets
  """
  # pyformat: enable
  self._game = game
  self._num_players = game.num_players()
  self._root_node = self._game.new_initial_state()

  # Map from information state string representations and actions to the
  # counterfactual regrets, accumulated over the policy iterations.
  self._cumulative_regret = collections.defaultdict(
      lambda: collections.defaultdict(float))
  # Same as above for the cumulative of the policy probabilities computed
  # during the policy iterations.
  self._cumulative_policy = collections.defaultdict(
      lambda: collections.defaultdict(float))
  if initialize_cumulative_values:
    _initialize_uniform_policy(self._root_node, self._cumulative_regret,
                               self._cumulative_policy)

  self._policy = {}

  # This is for returning the current average policy to a caller.
  self._average_policy = policy.TabularPolicy(game)

  self._linear_averaging = linear_averaging
  self._iteration = 0  # For possible linear-averaging.
  self._alternating_updates = alternating_updates
  self._regret_matching_plus = regret_matching_plus
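# The linear_averaging and regret_matching_plus flags documented above reduce
# to one-line accumulation rules. A standalone numpy illustration of that
# arithmetic only (not the solver's actual per-info-state dictionaries):
import numpy as np

cumulative_regrets = np.array([1.0, -2.0, 0.5])
regrets = np.array([-3.0, 1.0, 0.5])
plain_rm_update = cumulative_regrets + regrets                  # regret matching
rm_plus_update = np.maximum(cumulative_regrets + regrets, 0.0)  # Regret Matching+

iteration_number, reach_prob, action_prob = 10, 0.5, 0.25
uniform_avg_increment = reach_prob * action_prob                    # linear_averaging=False
linear_avg_increment = iteration_number * reach_prob * action_prob  # linear_averaging=True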
def init_br_responder(env):
  """Initializes the tabular best-response based responder and agents."""
  random_policy = policy.TabularPolicy(env.game)
  oracle = best_response_oracle.BestResponseOracle(
      game=env.game, policy=random_policy)
  agents = [random_policy.__copy__() for _ in range(FLAGS.n_players)]
  return oracle, agents
def __init__(self, game):
  """Initializes a loss calculation for the given game."""
  if game.num_players() != _NUM_PLAYERS:
    raise ValueError("Game {} does not have {} players.".format(
        game, _NUM_PLAYERS))
  self.tabular_policy = policy.TabularPolicy(game)
  self.q_value_calculator = action_value_vs_best_response.Calculator(game)
def test__update_current_policy(self):
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)

  cumulative_regrets = np.arange(0, 12 * 2).reshape((12, 2))
  expected_policy = cumulative_regrets / np.sum(
      cumulative_regrets, axis=-1, keepdims=True)
  nodes_indices = {
      u"0": 0,
      u"0pb": 1,
      u"1": 2,
      u"1pb": 3,
      u"2": 4,
      u"2pb": 5,
      u"1p": 6,
      u"1b": 7,
      u"2p": 8,
      u"2b": 9,
      u"0p": 10,
      u"0b": 11,
  }
  # pylint: disable=g-complex-comprehension
  info_state_nodes = {
      key: cfr._InfoStateNode(
          legal_actions=[0, 1],
          index_in_tabular_policy=None,
          cumulative_regret=dict(enumerate(cumulative_regrets[index])),
          cumulative_policy=None) for key, index in nodes_indices.items()
  }
  # pylint: enable=g-complex-comprehension

  cfr._update_current_policy(tabular_policy, info_state_nodes)

  np.testing.assert_array_equal(expected_policy,
                                tabular_policy.action_probability_array)
def test_tabular_policy_from_csv(tmpdir):
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save the policy as CSV, then load it back.
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  tabular_policy_from_csv(game, output)
def intilize_policy(game, player, policy_init):
  """Returns initial policy."""
  if policy_init == "uniform":
    new_policy = policy.TabularPolicy(game, players=(player,))
  elif policy_init == "random_deterministic":
    new_policy = policy.TabularPolicy(game, players=(player,))
    for i in range(new_policy.action_probability_array.shape[0]):
      new_policy.action_probability_array[i] = np.random.multinomial(
          1, new_policy.action_probability_array[i]).astype(np.float64)
  else:
    raise ValueError(
        "policy_init must be a valid initialization strategy: %s. "
        "Received: %s" % (INIT_POLICIES, policy_init))
  return new_policy
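# For intuition on the "random_deterministic" branch above: one draw from a
# multinomial with a uniform row as its probabilities is a one-hot vector, so
# each state's row becomes a randomly chosen deterministic action. Standalone
# illustration only, not tied to any particular game.
import numpy as np

uniform_row = np.array([0.5, 0.5])
one_hot_row = np.random.multinomial(1, uniform_row).astype(np.float64)
print(one_hot_row)  # either [1., 0.] or [0., 1.]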
def csv_policy(tmpdir):
  # Setup game and policy
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save policy as CSV
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  return output
def main(unused_argv):
  env = rl_environment.Environment(FLAGS.game_name)

  # pylint: disable=g-complex-comprehension
  policies = [[
      policy.TabularPolicy(env.game).copy_with_noise(alpha=float(i), beta=1.0)
      for i in range(2)
  ] for _ in range(2)]
  # pylint: enable=g-complex-comprehension

  probabilities = [
      list(np.ones(len(policies[i])) / len(policies[i])) for i in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policies = pol_ag.aggregate([0, 1], policies, probabilities)

  exploitabilities = exploitability.nash_conv(env.game, aggr_policies)
  print("Exploitability : {}".format(exploitabilities))

  print(policies[0][0].action_probability_array)
  print(policies[0][1].action_probability_array)
  print(aggr_policies.policy)

  print("\nCopy Example")

  mother_policy = policy.TabularPolicy(env.game).copy_with_noise(1, 10)
  policies = [[mother_policy.__copy__() for _ in range(2)] for _ in range(2)]
  probabilities = [
      list(np.ones(len(policies)) / len(policies)) for _ in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policy = pol_ag.aggregate([0], policies, probabilities)

  for state, value in aggr_policy.policy[0].items():
    polici = mother_policy.policy_for_key(state)
    value_normal = {
        action: probability
        for action, probability in enumerate(polici)
        if probability > 0
    }
    for key in value.keys():
      print("State : {}. Key : {}. Aggregated : {}. Real : {}. Passed : {}"
            .format(state, key, value[key], value_normal[key],
                    np.abs(value[key] - value_normal[key]) < 1e-8))
def __init__(self, game, initialize_cumulative_values, alternating_updates,
             linear_averaging, regret_matching_plus):
  # pyformat: disable
  """Initializer.

  Args:
    game: The `pyspiel.Game` to run on.
    initialize_cumulative_values: Whether to initialize the average policy to
      the uniform policy (and the initial cumulative regret to an epsilon
      value). This is independent of the first CFR iteration, which, when the
      policy is fixed during traversal and we perform non-alternating
      updates, will also compute the uniform policy and add it to the average
      of policies.
    alternating_updates: If `True`, alternating updates are performed: for
      each player, we compute and update the cumulative regrets and policies.
      In that case, and when the policy is frozen during tree traversal, the
      cache is reset after each update for one player. Otherwise, the update
      is simultaneous.
    linear_averaging: Whether to use linear averaging, i.e.
      cumulative_policy[info_state][action] += (
        iteration_number * reach_prob * action_prob)
      or not:
      cumulative_policy[info_state][action] += reach_prob * action_prob
    regret_matching_plus: Whether to use Regret Matching+:
      cumulative_regrets = max(cumulative_regrets + regrets, 0)
      or simply regret matching:
      cumulative_regrets = cumulative_regrets + regrets
  """
  # pyformat: enable
  self._game = game
  self._num_players = game.num_players()
  self._root_node = self._game.new_initial_state()

  if initialize_cumulative_values:
    initial_positive_value = _INITIAL_POSITIVE_VALUE
  else:
    initial_positive_value = 0
  self._info_state_nodes = {}
  _initialize_info_state_nodes(
      self._root_node,
      info_state_nodes=self._info_state_nodes,
      initial_positive_value=initial_positive_value)

  self._policy_cache = {}

  # This is for returning the current policy and average policy to a caller.
  self._current_policy = policy.TabularPolicy(game)
  self._average_policy = self._current_policy.__copy__()

  self._linear_averaging = linear_averaging
  self._iteration = 0  # For possible linear-averaging.
  self._alternating_updates = alternating_updates
  self._regret_matching_plus = regret_matching_plus
def test_update_slice(self):
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  state = "2b"
  np.testing.assert_array_equal(
      tabular_policy.policy_for_key(state), [0.5, 0.5])
  tabular_policy.policy_for_key(state)[:] = [0.8, 0.2]
  np.testing.assert_array_equal(
      tabular_policy.policy_for_key(state), [0.8, 0.2])
def test_states(self):
  game = pyspiel.load_game("leduc_poker")
  tabular_policy = policy.TabularPolicy(game)
  i = 0
  for state in tabular_policy.states:
    self.assertEqual(i, tabular_policy.state_index(state))
    i += 1
  self.assertEqual(936, i)
def test_cpp_to_python_policy(self):
  game = pyspiel.load_game("kuhn_poker")
  pyspiel_policy = pyspiel.UniformRandomPolicy(game)
  python_policy = policy.policy_from_pyspiel_policy(pyspiel_policy)

  for info_state_str in policy.TabularPolicy(game).state_lookup.keys():
    self.assertEqual({
        0: 0.5,
        1: 0.5
    }, python_policy.action_probabilities(info_state_str))
def test_update_elementwise(self):
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  state = "0pb"
  np.testing.assert_array_equal(
      tabular_policy.policy_for_key(state), [0.5, 0.5])
  tabular_policy.policy_for_key(state)[0] = 0.9
  tabular_policy.policy_for_key(state)[1] = 0.1
  np.testing.assert_array_equal(
      tabular_policy.policy_for_key(state), [0.9, 0.1])
def test_python_same_as_cpp_for_multiplayer_uniform_random_nash_conv(
    self, game_name, num_players):
  game = pyspiel.load_game(game_name, {"players": num_players})

  # TabularPolicy defaults to being a uniform random policy.
  test_policy = policy.TabularPolicy(game)
  python_nash_conv = exploitability.nash_conv(game, test_policy)
  cpp_nash_conv = pyspiel.nash_conv(
      game, policy_utils.policy_to_dict(test_policy, game))
  self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
def test_play_tournament(tmpdir):
  game = pyspiel.load_game("kuhn_poker")
  for team in ["python", "ruby", "java"]:
    for player in ["p1", "p2"]:
      tabular_policy = policy.TabularPolicy(game)
      # Save policy as CSV
      output = os.path.join(tmpdir, f'{team}_{player}.csv')
      policy_to_csv(game, tabular_policy, output)
  ranking, results = play_tournament(game, str(tmpdir))
  assert len(list(ranking.keys())) == 3
  assert len(results) == 3 * 2 * 2
def test_cpp_and_python_value_are_identical(self, game_name, num_players):
  game = pyspiel.load_game(game_name, {"players": num_players})
  test_policy = policy.TabularPolicy(game)
  root_state = game.new_initial_state()
  for i_player in range(num_players):
    best_resp_py_backend = best_response.BestResponsePolicy(
        game, i_player, test_policy)
    best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
        game, i_player, test_policy)

    value_py_backend = best_resp_py_backend.value(root_state)
    value_cpp_backend = best_resp_cpp_backend.value(root_state)

    self.assertTrue(np.allclose(value_py_backend, value_cpp_backend))
class CommonTest(parameterized.TestCase):

  @parameterized.parameters([
      policy.TabularPolicy(_LEDUC_POKER),
      policy.UniformRandomPolicy(_LEDUC_POKER),
      policy.FirstActionPolicy(_LEDUC_POKER),
  ])
  def test_policy_on_leduc(self, policy_object):
    test_policy_on_game(self, _LEDUC_POKER, policy_object)

  @parameterized.named_parameters([
      ("pyspiel.UniformRandom", pyspiel.UniformRandomPolicy(_LEDUC_POKER)),
  ])
  def test_cpp_policies_on_leduc(self, policy_object):
    test_policy_on_game(self, _LEDUC_POKER, policy_object)
def test_record_batched_trajectories(self):
  for game_name in ["kuhn_poker", "leduc_poker", "liars_dice"]:
    game = pyspiel.load_game(game_name)
    python_policy = policy.TabularPolicy(game)
    tabular_policy = policy.python_policy_to_pyspiel_policy(python_policy)
    policies = [tabular_policy] * 2

    # We test that we can create a batch of trajectories.
    seed = 0
    batch_size = 128
    include_full_observations = False
    pyspiel.record_batched_trajectories(game, policies,
                                        python_policy.state_lookup, batch_size,
                                        include_full_observations, seed, -1)
def test_cpp_python_best_response_oracle(self, game_name, num_players):
  # Tests that these best responses interface well with BestResponseOracle.
  game = pyspiel.load_game(
      game_name, {"players": pyspiel.GameParameter(num_players)})
  all_states, _ = best_response.compute_states_and_info_states_if_none(
      game, all_states=None, state_to_information_state=None)

  current_best = [
      [policy.TabularPolicy(game).__copy__()] for _ in range(num_players)
  ]
  probabilities_of_playing_policies = [[1.] for _ in range(num_players)]

  # Construct the python oracle.
  py_oracle = best_response_oracle.BestResponseOracle(
      best_response_backend="py")

  # Construct the cpp oracle. Note that in this regime, BestResponseOracle
  # uses base_policy to construct and cache TabularBestResponse internally.
  cpp_oracle = best_response_oracle.BestResponseOracle(
      game=game, best_response_backend="cpp")

  # Prepare the computation of the best responses with each backend.
  # pylint: disable=g-complex-comprehension
  training_params = [[{
      "total_policies": current_best,
      "current_player": i,
      "probabilities_of_playing_policies": probabilities_of_playing_policies
  }] for i in range(num_players)]
  # pylint: enable=g-complex-comprehension

  py_best_rep = py_oracle(game, training_params)
  cpp_best_rep = cpp_oracle(game, training_params)

  # Compare the policies.
  for state in all_states.values():
    i_player = state.current_player()
    py_dict = py_best_rep[i_player][0].action_probabilities(state)
    cpp_dict = cpp_best_rep[i_player][0].action_probabilities(state)

    for action in py_dict.keys():
      self.assertEqual(py_dict.get(action, 0.0), cpp_dict.get(action, 0.0))
    for action in cpp_dict.keys():
      self.assertEqual(py_dict.get(action, 0.0), cpp_dict.get(action, 0.0))
def __init__(self, game, alternating_updates, linear_averaging,
             regret_matching_plus):
  # pyformat: disable
  """Initializer.

  Args:
    game: The `pyspiel.Game` to run on.
    alternating_updates: If `True`, alternating updates are performed: for
      each player, we compute and update the cumulative regrets and policies.
      In that case, and when the policy is frozen during tree traversal, the
      cache is reset after each update for one player. Otherwise, the update
      is simultaneous.
    linear_averaging: Whether to use linear averaging, i.e.
      cumulative_policy[info_state][action] += (
        iteration_number * reach_prob * action_prob)
      or not:
      cumulative_policy[info_state][action] += reach_prob * action_prob
    regret_matching_plus: Whether to use Regret Matching+:
      cumulative_regrets = max(cumulative_regrets + regrets, 0)
      or simply regret matching:
      cumulative_regrets = cumulative_regrets + regrets
  """
  # pyformat: enable
  assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, (
      "CFR requires sequential games. If you're trying to run it " +
      "on a simultaneous (or normal-form) game, please first transform it " +
      "using turn_based_simultaneous_game.")

  self._game = game
  self._num_players = game.num_players()
  self._root_node = self._game.new_initial_state()

  # This is for returning the current policy and average policy to a caller.
  self._current_policy = policy.TabularPolicy(game)
  self._average_policy = self._current_policy.__copy__()

  self._info_state_nodes = {}
  self._initialize_info_state_nodes(self._root_node)

  self._iteration = 0  # For possible linear-averaging.
  self._linear_averaging = linear_averaging
  self._alternating_updates = alternating_updates
  self._regret_matching_plus = regret_matching_plus
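# For context, a minimal end-to-end sketch of how a CFR solver built on an
# initializer like the one above is typically driven. This assumes the stock
# OpenSpiel `cfr` and `exploitability` modules; `CFRPlusSolver` (alternating
# updates, linear averaging, Regret Matching+) stands in here and is not
# necessarily the concrete subclass this snippet belongs to.
import pyspiel
from open_spiel.python.algorithms import cfr, exploitability

game = pyspiel.load_game("kuhn_poker")
solver = cfr.CFRPlusSolver(game)
for _ in range(200):
  solver.evaluate_and_update_policy()
avg_policy = solver.average_policy()  # a policy.TabularPolicy
print(exploitability.exploitability(game, avg_policy))  # shrinks as iterations grow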
def test_identity_redundant(self):
  num_players = 2
  game = pyspiel.load_game("kuhn_poker", {"players": num_players})
  tabular_policies = [  # Policy for all players.
      policy.TabularPolicy(game, players=None) for player in range(num_players)
  ]
  for player, tabular_policy in enumerate(tabular_policies):
    tabular_policy.action_probability_array[:] = 0
    tabular_policy.action_probability_array[:, player] = 1.0
  merged_tabular_policy = policy.merge_tabular_policies(
      tabular_policies, game)

  self.assertIdentityPoliciesEqual(tabular_policies, merged_tabular_policy,
                                   game)
def ficticious_play(seq_game, number_of_iterations, compute_metrics=False):
  xfp_solver = fictitious_play.XFPSolver(seq_game)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    xfp_solver.iteration()
  timing = time.time() - tick_time
  # print('done')
  # average_policies = xfp_solver.average_policy_tables()
  tabular_policy = policy_module.TabularPolicy(seq_game)
  if compute_metrics:
    nash_conv = exploitability.nash_conv(seq_game, xfp_solver.average_policy())
    average_policy_values = expected_game_score.policy_value(
        seq_game.new_initial_state(), [tabular_policy])
    return timing, tabular_policy, nash_conv, average_policy_values
  return timing, tabular_policy
def test_tabular_policy_to_csv(tmpdir):
  # Setup game and policy
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save policy as CSV
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  assert list(tmpdir.listdir()) == [output]

  # Check created CSV
  csv = pd.read_csv(output, index_col=0)
  # Get all states in the game at which players have to make decisions.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False)
  assert set(csv.index.values) <= set(states.keys())
  assert len(csv.columns) == game.num_distinct_actions()
def test_identity(self):
  num_players = 2
  game = pyspiel.load_game(
      "kuhn_poker", {"players": pyspiel.GameParameter(num_players)})
  tabular_policies = [  # Policy limited to player.
      policy.TabularPolicy(game, players=(player,))
      for player in range(num_players)
  ]
  for player, tabular_policy in enumerate(tabular_policies):
    tabular_policy.action_probability_array[:] = 0
    tabular_policy.action_probability_array[:, player] = 1.0
  merged_tabular_policy = policy.merge_tabular_policies(
      tabular_policies, game)

  self.assertIdentityPoliciesEqual(tabular_policies, merged_tabular_policy,
                                   game)
def _get_exploitability(self):
  tabular_policy = policy.TabularPolicy(self._game)
  for player_id in range(2):
    for info_state, state_policy in self.average_policy_tables()[
        player_id].items():
      policy_to_update_tabular = tabular_policy.policy_for_key(info_state)
      for action, probability in state_policy.items():
        policy_to_update_tabular[action] = probability
  average_policy_values = expected_game_score.policy_value(
      self._game.new_initial_state(), [tabular_policy, tabular_policy])
  # print("Kuhn 2P average values after %s iterations" % iters)
  # print("P0: {}".format(average_policy_values[0]))
  # print("P1: {}".format(average_policy_values[1]))
  exp = exploitability.exploitability(self._game, tabular_policy)
  print("exploitability: {}".format(exp))
  return exp
def __init__(self, game, linear_averaging=True, regret_matching_plus=True):
  # pyformat: disable
  """Initializer.

  Args:
    game: The `pyspiel.Game` to run on.
    linear_averaging: Whether to use linear averaging, i.e.
      cumulative_policy[info_state][action] += (
        iteration_number * reach_prob * action_prob)
      or not:
      cumulative_policy[info_state][action] += reach_prob * action_prob
    regret_matching_plus: Whether to use Regret Matching+:
      cumulative_regrets = max(cumulative_regrets + regrets, 0)
      or simply regret matching:
      cumulative_regrets = cumulative_regrets + regrets
  """
  # pyformat: enable
  if game.num_players() != 2:
    raise ValueError("Game {} does not have {} players.".format(game, 2))

  assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, (
      "CFR requires sequential games. If you're trying to run it " +
      "on a simultaneous (or normal-form) game, please first transform it " +
      "using turn_based_simultaneous_game.")

  self._game = game
  self._num_players = game.num_players()
  self._root_node = self._game.new_initial_state()

  self._info_state_nodes = {}
  _initialize_info_state_nodes(self._root_node, self._info_state_nodes)

  # This is for returning the current policy and average policy to a caller.
  self._current_policy = policy.TabularPolicy(game)
  self._average_policy = self._current_policy.__copy__()

  self._linear_averaging = linear_averaging
  self._iteration = 0  # For possible linear-averaging.
  self._regret_matching_plus = regret_matching_plus

  self._best_responses = {i: None for i in range(game.num_players())}