def xfsp_train(_):
  exploit_history = list()
  exploit_idx = list()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  fsp_solver = fictitious_play.XFPSolver(game)
  checkpoint = datetime.now()
  for ep in range(FLAGS.episodes):
    if (ep % 1000) == 0:
      delta = datetime.now() - checkpoint
      pol = policy.PolicyFromCallable(game,
                                      fsp_solver.average_policy_callable())
      conv = exploitability.exploitability(game, pol)
      exploit_history.append(conv)
      exploit_idx.append(ep)
      print("[XFSP] Iteration {} exploitability {} - {} seconds since last "
            "checkpoint".format(ep, conv, delta.seconds))
      checkpoint = datetime.now()
    fsp_solver.iteration()
  agent_name = "xfsp"
  with open(FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
            "wb") as f:
    pickle.dump([exploit_idx, exploit_history], f)
  pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
  now = datetime.now()  # Timestamp used in the CSV file names.
  for pid in [1, 2]:
    policy_to_csv(
        game, pol, "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") + "_" +
        agent_name + "_" + str(pid) + "_" + str(FLAGS.episodes) +
        "episodes.csv")
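# The exploitability trace pickled by xfsp_train can be reloaded for
# inspection. A minimal sketch, assuming matplotlib is available; the file
# layout ([exploit_idx, exploit_history]) matches the pickle.dump call above.
import pickle

import matplotlib.pyplot as plt


def plot_exploitability(dat_path):
  """Plots an exploitability curve saved by xfsp_train."""
  with open(dat_path, "rb") as f:
    exploit_idx, exploit_history = pickle.load(f)
  plt.plot(exploit_idx, exploit_history)
  plt.xlabel("Iteration")
  plt.ylabel("Exploitability")
  plt.yscale("log")  # Convergence curves are easier to read on a log scale.
  plt.show()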
def test_runs_with_uniform_policies(self, game_name):
  game = pyspiel.load_game(game_name)
  calc = action_value.TreeWalkCalculator(game)
  calc.compute_all_states_action_values([
      policy.PolicyFromCallable(game, _uniform_policy),
      policy.PolicyFromCallable(game, _uniform_policy)
  ])
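# The action-value tests reference a module-level `_uniform_policy` helper
# that is not shown in this listing. A minimal sketch, matching the inline
# definition used in test_callable_policy_to_csv further below:
def _uniform_policy(state):
  actions = state.legal_actions()
  p = 1.0 / len(actions)
  return [(a, p) for a in actions]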
def test_kuhn_poker_always_pass_p0(self):
  game = pyspiel.load_game("kuhn_poker")
  calc = action_value_vs_best_response.Calculator(game)
  (expl, avvbr, cfrp, player_reach_probs) = calc(
      0, policy.PolicyFromCallable(game, lambda state: [(0, 1.0), (1, 0.0)]),
      ["0", "1", "2", "0pb", "1pb", "2pb"])
  self.assertAlmostEqual(expl, 1.)
  np.testing.assert_allclose(
      avvbr,
      [
          # Opening bet. If we pass, we always lose (pass-pass with op's K,
          # otherwise pass-bet-pass).
          # If we bet, we always win (because op's best response is to pass,
          # because this is an unreachable state and we break ties in favour
          # of the lowest action).
          [-1, 1],
          [-1, 1],
          [-1, 1],
          # We pass, opp bets into us. This can be either J or Q (K will pass
          # because of the tie-break rules).
          # So we are guaranteed to be winning with Q or K.
          [-1, -2],  # 0pb
          [-1, 2],  # 1pb
          [-1, 2],  # 2pb
      ])
  np.testing.assert_allclose(cfrp, [1 / 3, 1 / 3, 1 / 3, 1 / 6, 1 / 6, 1 / 3])
  np.testing.assert_allclose([1., 1., 1., 1., 1., 1.], player_reach_probs)
def solve(self):
  """Solution logic for Deep CFR."""
  advantage_losses = collections.defaultdict(list)
  start = datetime.now()
  expl_idx = []
  expl_hist = []
  for it in range(self._num_iterations):
    if (it % self._eval_freq == 0) and it != 0:
      conv = self.get_exploitability()
      elapsed = datetime.now() - start
      print("Iteration {}/{}, running for {} seconds - Exploitability = {}"
            .format(it, self._num_iterations, elapsed.seconds, conv))
      expl_idx.append(it)
      expl_hist.append(conv)
    for p in range(self._num_players):
      for _ in range(self._num_traversals):
        self._traverse_game_tree(self._root_node, p)
      # Re-initialize advantage networks and train from scratch.
      self.reinitialize_advantage_networks()
      advantage_losses[p].append(self._learn_advantage_network(p))
    self._iteration += 1
  # Train policy network.
  policy_loss = self._learn_strategy_network()
  conv = exploitability.exploitability(
      self._game,
      policy.PolicyFromCallable(self._game, self.action_probabilities))
  print("Final exploitability: {}".format(conv))
  return (self._policy_network, advantage_losses, policy_loss, expl_idx,
          expl_hist)
def main(unused_argv): logging.info("Loading %s", FLAGS.game_name) game = pyspiel.load_game(FLAGS.game_name) with tf.Session() as sess: deep_cfr_solver = deep_cfr.DeepCFRSolver( sess, game, policy_network_layers=(32, 32), advantage_network_layers=(16, 16), num_iterations=FLAGS.num_iterations, num_traversals=FLAGS.num_traversals, learning_rate=1e-3, batch_size_advantage=None, batch_size_strategy=None, memory_capacity=1e7) sess.run(tf.global_variables_initializer()) _, advantage_losses, policy_loss = deep_cfr_solver.solve() for player, losses in six.iteritems(advantage_losses): logging.info("Advantage for player %d: %s", player, losses[:2] + ["..."] + losses[-2:]) logging.info("Advantage Buffer Size for player %s: '%s'", player, len(deep_cfr_solver.advantage_buffers[player])) logging.info("Strategy Buffer Size: '%s'", len(deep_cfr_solver.strategy_buffer)) logging.info("Final policy loss: '%s'", policy_loss) conv = exploitability.nash_conv( game, policy.PolicyFromCallable(game, deep_cfr_solver.action_probabilities)) logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
def get_exploitability(self):
  # Define placeholders.
  iter_ph = tf.placeholder(
      shape=[None, 1], dtype=tf.float32, name="iter_ph")
  action_probs_ph = tf.placeholder(
      shape=[None, self._num_actions],
      dtype=tf.float32,
      name="action_probs_ph")
  info_state_ph = tf.placeholder(
      shape=[None, self._embedding_size],
      dtype=tf.float32,
      name="info_state_ph")
  policy_network = snt.nets.MLP(
      list(self._policy_network_layers) + [self._num_actions])
  action_logits = policy_network(info_state_ph)
  # Illegal actions are handled in the traversal code where expected payoff
  # and sampled regret is computed from the advantage networks.
  action_probs = tf.nn.softmax(action_logits)
  loss_policy = tf.reduce_mean(
      tf.losses.mean_squared_error(
          labels=tf.math.sqrt(iter_ph) * action_probs_ph,
          predictions=tf.math.sqrt(iter_ph) * action_probs))
  optimizer_policy = tf.train.AdamOptimizer(learning_rate=self._learning_rate)
  learn_step_policy = optimizer_policy.minimize(loss_policy)
  self._session.run(tf.global_variables_initializer())

  def _local_action_probabilities(state):
    """Returns action probabilities dict for a single batch."""
    cur_player = state.current_player()
    legal_actions = state.legal_actions(cur_player)
    info_state_vector = np.array(state.information_state_tensor())
    if len(info_state_vector.shape) == 1:
      info_state_vector = np.expand_dims(info_state_vector, axis=0)
    probs = self._session.run(
        action_probs, feed_dict={info_state_ph: info_state_vector})
    return {action: probs[0][action] for action in legal_actions}

  info_states_l = []
  action_probs_l = []
  iterations_l = []
  for s in self._strategy_memories.sample(self._batch_size_strategy):
    info_states_l.append(s.info_state)
    action_probs_l.append(s.strategy_action_probs)
    iterations_l.append([s.iteration])
  self._session.run(
      [loss_policy, learn_step_policy],
      feed_dict={
          info_state_ph: np.array(info_states_l),
          action_probs_ph: np.array(np.squeeze(action_probs_l)),
          iter_ph: np.array(iterations_l),
      })
  conv = exploitability.exploitability(
      self._game,
      policy.PolicyFromCallable(self._game, _local_action_probabilities))
  return conv
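# Note on the sqrt(iter_ph) factors in the loss above: multiplying both
# labels and predictions by sqrt(t) makes each sample's squared error
# proportional to the iteration t at which it was recorded, which reproduces
# the linear (iteration-weighted) averaging of strategies used when training
# the Deep CFR policy network (Brown et al., 2019).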
def _compute_best_responses(self):
  """Computes each player's best response against the pool of other players."""
  # pylint: disable=g-long-lambda
  current_policy = policy.PolicyFromCallable(
      self._game,
      lambda state: self._get_infostate_policy(state.information_state()))
  for player_id in range(self._game.num_players()):
    self._best_responses[player_id] = exploitability.best_response(
        self._game, current_policy, player_id)
def test_shapleys_game(self):
  game = pyspiel.load_game_as_turn_based("matrix_shapleys_game")
  xfp_solver = fictitious_play.XFPSolver(game)
  for i in range(1000):
    xfp_solver.iteration()
    if i % 10 == 0:
      conv = exploitability.nash_conv(
          game,
          policy.PolicyFromCallable(game,
                                    xfp_solver.average_policy_callable()))
      print("FP in Shapley's Game. Iter: {}, NashConv: {}".format(i, conv))
def test_outcome_sampling_kuhn_2p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker")
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(1000):
    os_solver.iteration()
  conv = exploitability.nash_conv(
      game, policy.PolicyFromCallable(game, os_solver.callable_avg_policy()))
  print("Kuhn2P, conv = {}".format(conv))
  self.assertGreater(conv, 0.2)
  self.assertLess(conv, 0.3)
def compute_best_responses(self):
  """Updates self._oracles to hold best responses for each player."""
  for i in range(self._num_players):
    # Compute a best response policy to pi_{-i}.
    # First, construct pi_{-i}.
    joint_policy = _joint_policy(self._policies)
    br_info = exploitability.best_response(
        self._game, policy.PolicyFromCallable(self._game, joint_policy), i)
    full_br_policy = _full_best_response_policy(
        br_info["best_response_action"])
    self._best_responses[i] = full_br_policy
    if self._oracles is not None:
      self._oracles[i].append(full_br_policy)
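# `_joint_policy` and `_full_best_response_policy` are module-level helpers
# not shown in this listing. A hypothetical sketch of the latter, assuming
# br_info["best_response_action"] maps information-state strings to the
# best-response action at that infostate:
def _full_best_response_policy(br_infoset_dict):
  """Wraps a best-response action dict into a callable policy."""

  def wrap(state):
    infostate_key = state.information_state_string(state.current_player())
    br_action = br_infoset_dict[infostate_key]
    # Probability 1.0 on the best-response action, 0.0 elsewhere.
    return [(action, 1.0 if action == br_action else 0.0)
            for action in state.legal_actions()]

  return wrap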
def main(_):
  game = pyspiel.load_game(FLAGS.game,
                           {"players": pyspiel.GameParameter(FLAGS.players)})
  xfp_solver = fictitious_play.XFPSolver(game)
  for i in range(FLAGS.iterations):
    xfp_solver.iteration()
    conv = exploitability.exploitability(
        game,
        policy.PolicyFromCallable(game, xfp_solver.average_policy_callable()))
    if i % FLAGS.print_freq == 0:
      print("Iteration: {} Conv: {}".format(i, conv))
      sys.stdout.flush()
def test_matching_pennies_3p(self): game = pyspiel.load_game_as_turn_based("matching_pennies_3p") xfp_solver = fictitious_play.XFPSolver(game) for i in range(1000): xfp_solver.iteration() if i % 10 == 0: conv = exploitability.nash_conv( game, policy.PolicyFromCallable( game, xfp_solver.average_policy_callable())) print( "FP in Matching Pennies 3p. Iter: {}, NashConv: {}".format( i, conv))
def main(unused_argv): logging.info("Loading %s", FLAGS.game_name) env = rl_environment.Environment(FLAGS.game_name) num_players = env.num_players num_actions = env.action_spec()["num_actions"] state_size = env.observation_spec()["info_state"][0] eva_agents = [] with tf.Session() as sess: for player in range(num_players): eva_agents.append( eva.EVAAgent(sess, env, player, state_size, num_actions, embedding_network_layers=(64, 32), embedding_size=12, learning_rate=1e-4, mixing_parameter=0.5, memory_capacity=1e6, discount_factor=1.0, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_duration=int(1e6))) sess.run(tf.global_variables_initializer()) time_step = env.reset() for _ in range(FLAGS.num_episodes): while not time_step.last(): current_player = time_step.observations["current_player"] current_agent = eva_agents[current_player] step_out = current_agent.step(time_step) time_step = env.step([step_out.action]) for agent in eva_agents: agent.step(time_step) game = pyspiel.load_game(FLAGS.game_name) joint_policy = JointPolicy(eva_agents) conv = exploitability.nash_conv( game, policy.PolicyFromCallable(game, joint_policy.action_probabilities)) logging.info("EVA in '%s' - NashConv: %s", FLAGS.game_name, conv)
def test_callable_policy_to_csv(tmpdir):
  def _uniform_policy(state):
    actions = state.legal_actions()
    p = 1.0 / len(actions)
    return [(a, p) for a in actions]

  # Set up the game and policy.
  game = pyspiel.load_game("kuhn_poker")
  callable_policy = policy.PolicyFromCallable(game, _uniform_policy)
  # Save the policy as CSV.
  output = os.path.join(tmpdir, "policy.csv")
  policy_to_csv(game, callable_policy, output)
  assert list(tmpdir.listdir()) == [output]
  # Check the created CSV.
  csv = pd.read_csv(output, index_col=0)
  # Get all states in the game at which players have to make decisions.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False)
  assert set(csv.index.values) <= set(states.keys())
def test_matching_pennies_3p(self):
  # We don't expect Deep CFR to necessarily converge on 3-player games, but
  # it's nonetheless interesting to see this result.
  game = pyspiel.load_game_as_turn_based("matching_pennies_3p")
  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(16, 8),
        advantage_network_layers=(32, 16),
        num_iterations=2,
        num_traversals=2,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=1e7)
    sess.run(tf.global_variables_initializer())
    deep_cfr_solver.solve()
    conv = exploitability.nash_conv(
        game,
        policy.PolicyFromCallable(game, deep_cfr_solver.action_probabilities))
    print("Deep CFR in Matching Pennies 3p. NashConv: {}".format(conv))
def test_kuhn_poker_always_pass_p0(self): game = pyspiel.load_game("kuhn_poker") calc = action_value.TreeWalkCalculator(game) for always_pass_policy in [ lambda state: [(0, 1.0), (1, 0.0)], # On purpose, we use a policy that do not list all the legal actions. lambda state: [(0, 1.0), (1, 0.0)], ]: tabular_policy = policy.tabular_policy_from_policy( game, policy.PolicyFromCallable(game, always_pass_policy)) # States are ordered using tabular_policy.states_per_player: # ['0', '0pb', '1', '1pb', '2', '2pb'] + # ['1p', '1b', '2p', '2b', '0p', '0b'] np.testing.assert_array_equal( np.asarray([ [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], ]), tabular_policy.action_probability_array) returned_values = calc([ policy.PolicyFromCallable(game, always_pass_policy), policy.PolicyFromCallable(game, _uniform_policy) ], tabular_policy) # Action 0 == Pass. Action 1 == Bet # Some values are 0 because the states are not reached, thus the expected # value of that node is undefined. np.testing.assert_array_almost_equal( np.asarray([ [-1.0, -0.5], [-1.0, -2.0], [-0.5, 0.5], [-1.0, 0.0], [0.0, 1.5], [-1.0, 2.0], [0.0, 1.0], [0, 0], [1.0, 1.0], [0, 0], [-1.0, 1.0], [0, 0], ]), returned_values.action_values) np.testing.assert_array_almost_equal( np.asarray([ # Player 0 states 1 / 3, # '0' 1 / 6, # '0pb' 1 / 3, # '1' 1 / 6, # '1pb' 1 / 3, # '2' 1 / 6, # '2pb' # Player 1 states 1 / 3, # '1p' 0.0, # '1b': zero because player 0 always play pass 1 / 3, # 2p' 0.0, # '2b': zero because player 0 always play pass 1 / 3, # '0p' 0.0, # '0b': zero because player 0 always play pass ]), returned_values.counterfactual_reach_probs) # The reach probabilities are always one, even though we have player 0 # who only plays pass, because the unreachable nodes for player 0 are # terminal nodes: e.g. 'x x b b p' has a player 0 reach of 0, but it is # a terminal node, thus it does not appear in the tabular policy # states. np.testing.assert_array_equal( [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], returned_values.player_reach_probs) np.testing.assert_array_almost_equal( np.asarray([ np.array([-1 / 3, -1 / 6]), np.array([-1 / 6, -1 / 3]), np.array([-1 / 6, 1 / 6]), np.array([-1 / 6, 0.]), np.array([0., 0.5]), np.array([-1 / 6, 1 / 3]), np.array([0., 1 / 3]), np.array([0., 0.]), np.array([1 / 3, 1 / 3]), np.array([0., 0.]), np.array([-1 / 3, 1 / 3]), np.array([0., 0.]) ]), returned_values.sum_cfr_reach_by_action_value)
def __call__(self, player, player_policy, info_states): """Computes action values per state for the player. Args: player: The id of the player 0 <= player < game.num_players(). player_policy: A `policy.Policy` object. info_states: A list of info state strings. Returns: A `_CalculatorReturn` nametuple. See its docstring for the documentation. """ self.player = player opponent = 1 - player def best_response_policy(state): infostate = state.information_state_string(opponent) action = best_response_actions[infostate] return [(action, 1.0)] # If the policy is a TabularPolicy, we can directly copy the infostate # strings & values from the class. This is significantly faster than having # to create the infostate strings. if isinstance(player_policy, policy.TabularPolicy): tabular_policy = { key: _tuples_from_policy(player_policy.policy_for_key(key)) for key in player_policy.state_lookup } # Otherwise, we have to calculate all the infostate strings everytime. This # is ~2x slower. else: # We cache these as they are expensive to compute & do not change. if self._all_states is None: self._all_states = get_all_states.get_all_states( self.game, depth_limit=-1, include_terminals=False, include_chance_states=False) self._state_to_information_state = { state: self._all_states[state].information_state_string() for state in self._all_states } tabular_policy = policy_utils.policy_to_dict( player_policy, self.game, self._all_states, self._state_to_information_state) # When constructed, TabularBestResponse does a lot of work; we can save that # work by caching it. if self._best_responder[player] is None: self._best_responder[player] = pyspiel.TabularBestResponse( self.game, opponent, tabular_policy) else: self._best_responder[player].set_policy(tabular_policy) # Computing the value at the root calculates best responses everywhere. history = str(self.game.new_initial_state()) best_response_value = self._best_responder[player].value(history) best_response_actions = self._best_responder[ player].get_best_response_actions() # Compute action values self.action_values = collections.defaultdict( lambda: collections.defaultdict(lambda: np.zeros(2))) self.info_state_prob = collections.defaultdict(float) self.info_state_player_prob = collections.defaultdict(float) self.info_state_cf_prob = collections.defaultdict(float) self.info_state_chance_prob = collections.defaultdict(float) self.get_action_values( self.game.new_initial_state(), { player: player_policy, opponent: policy.PolicyFromCallable(self.game, best_response_policy), }) # Collect normalized action values for each information state rv = [] cfrp = [] player_reach_probs_vs_br = [] for info_state in info_states: key = (player, info_state) av = self.action_values[key] norm_prob = self.info_state_prob[key] rv.append([(av[a][player] / norm_prob) if (a in av and norm_prob > 0) else 0 for a in range(self.num_actions)]) cfrp.append(self.info_state_cf_prob[key]) player_reach_probs_vs_br.append(self.info_state_player_prob[key]) # Return values return _CalculatorReturn( exploitability=best_response_value, values_vs_br=rv, counterfactual_reach_probs_vs_br=cfrp, player_reach_probs_vs_br=player_reach_probs_vs_br)
def __call__(self, player, player_policy, info_states): """Computes action values per state for the player. Args: player: The id of the player (0 <= player < game.num_players()). This player will play `player_policy`, while the opponent will play a best response. player_policy: A `policy.Policy` object. info_states: A list of info state strings. Returns: A `_CalculatorReturn` nametuple. See its docstring for the documentation. """ self.player = player opponent = 1 - player def best_response_policy(state): infostate = state.information_state_string(opponent) action = best_response_actions[infostate] return [(action, 1.0)] # If the policy is a TabularPolicy, we can directly copy the infostate # strings & values from the class. This is significantly faster than having # to create the infostate strings. if isinstance(player_policy, policy.TabularPolicy): tabular_policy = { key: _tuples_from_policy(player_policy.policy_for_key(key)) for key in player_policy.state_lookup } # Otherwise, we have to calculate all the infostate strings everytime. This # is ~2x slower. else: # We cache these as they are expensive to compute & do not change. if self._all_states is None: self._all_states = get_all_states.get_all_states( self.game, depth_limit=-1, include_terminals=False, include_chance_states=False) self._state_to_information_state = { state: self._all_states[state].information_state_string() for state in self._all_states } tabular_policy = policy_utils.policy_to_dict( player_policy, self.game, self._all_states, self._state_to_information_state) # When constructed, TabularBestResponse does a lot of work; we can save that # work by caching it. if self._best_responder[player] is None: self._best_responder[player] = pyspiel.TabularBestResponse( self.game, opponent, tabular_policy) else: self._best_responder[player].set_policy(tabular_policy) # Computing the value at the root calculates best responses everywhere. history = str(self.game.new_initial_state()) best_response_value = self._best_responder[player].value(history) best_response_actions = self._best_responder[ player].get_best_response_actions() # Compute action values self._action_value_calculator.compute_all_states_action_values({ player: player_policy, opponent: policy.PolicyFromCallable(self.game, best_response_policy), }) obj = self._action_value_calculator._get_tabular_statistics( # pylint: disable=protected-access ((player, s) for s in info_states)) # Return values return _CalculatorReturn( exploitability=best_response_value, values_vs_br=obj.action_values, counterfactual_reach_probs_vs_br=obj.counterfactual_reach_probs, player_reach_probs_vs_br=obj.player_reach_probs)