def test_copy_strategy(self):
    strategy1 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })
    strategy2 = strategy1.copy()

    self.assertEqual(strategy1['info1'], strategy2['info1'])
    self.assertEqual(strategy1['info2'], strategy2['info2'])

    # Mutating strategy1 must not affect the copy.
    strategy1['info1'] = extensive_game.ActionFloat({'a1': 0.2, 'a2': 0.8})
    self.assertEqual(strategy1['info2'], strategy2['info2'])
    self.assertEqual(strategy2['info1'], extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}))
def test_expected_value_exact(self):
    game, _, _ = create_neural_rock_paper_scissors()

    strategy1 = extensive_game.Strategy({
        game.info_set_ids[game.get_node(())]:
            extensive_game.ActionFloat({'R': 0.5, 'P': 0.2, 'S': 0.3})
    })
    strategy2 = extensive_game.Strategy({
        game.info_set_ids[game.get_node(('R',))]:
            extensive_game.ActionFloat({'R': 0.2, 'P': 0.3, 'S': 0.5})
    })

    computed1, computed2 = game.expected_value_exact(strategy1=strategy1, strategy2=strategy2)

    # Player 1's expected utility: sum over (a1, a2) of p1(a1) * p2(a2) * u1(a1, a2).
    expected1 = (
        0.5 * (0.2 * 0 + 0.3 * -1 + 0.5 * 1) +
        0.2 * (0.2 * 1 + 0.3 * 0 + 0.5 * -1) +
        0.3 * (0.2 * -1 + 0.3 * 1 + 0.5 * 0))

    self.assertEqual(computed1, expected1)
    self.assertEqual(computed2, -expected1)
def test_add(self):
    action_float1 = extensive_game.ActionFloat({'a': 1.0, 'b': -1.0})
    action_float2 = extensive_game.ActionFloat({'a': 1.0, 'c': 2.0})

    action_float = extensive_game.ActionFloat.sum(action_float1, action_float2)
    expected = extensive_game.ActionFloat({'a': 2.0, 'b': -1.0, 'c': 2.0})

    self.assertEqual(action_float, expected)
def test_copy(self):
    action_floats1 = extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6})
    action_floats2 = action_floats1.copy()

    self.assertEqual(action_floats1, action_floats2)

    action_floats1 = extensive_game.ActionFloat({'a1': 0.3, 'a2': 0.7})
    self.assertNotEqual(action_floats1, action_floats2)
def test_is_strategy_complete(self):
    game, _, _ = create_neural_rock_paper_scissors()

    # Incomplete because missing an information set.
    strategy = extensive_game.Strategy({
        (): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5, 'S': 0.1}),
    })
    computed = game.is_strategy_complete(strategy)
    self.assertEqual(computed, False)

    # Incomplete because missing an action.
    strategy = extensive_game.Strategy({
        (): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5, 'S': 0.1}),
        (-1,): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5}),
    })
    computed = game.is_strategy_complete(strategy)
    self.assertEqual(computed, False)

    # Complete.
    strategy = extensive_game.Strategy({
        (): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5, 'S': 0.1}),
        (-1,): extensive_game.ActionFloat({'R': 0.4, 'P': 0.3, 'S': 0.3}),
    })
    computed = game.is_strategy_complete(strategy)
    self.assertEqual(computed, True)
def test_cfr_traverse_advantage_memory(self):
    game, action_indexer, info_set_vectoriser = create_neural_rock_paper_scissors()
    node = game.root
    player = 1

    network1 = Mock()
    network1.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.2, 'P': 0.7, 'S': 0.1}))
    network2 = Mock()
    network2.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.3, 'P': 0.6, 'S': 0.1}))

    advantage_memory1 = buffer.Reservoir(maxlen=100)
    advantage_memory2 = buffer.Reservoir(maxlen=100)
    strategy_memory = buffer.Reservoir(maxlen=100)

    deep_cfr.cfr_traverse(
        game, action_indexer, info_set_vectoriser, node, player,
        network1, network2, advantage_memory1, advantage_memory2, strategy_memory, t=2)

    # We add to the traverser's advantage memory at each of their nodes, of which there is 1.
    self.assertEqual(len(advantage_memory1), 1)

    # We don't update player 2's advantage memory.
    self.assertEqual(len(advantage_memory2), 0)

    # We add to the strategy memory at each node of the non-traversing player, of which there are 3.
    self.assertEqual(len(strategy_memory), 3)
def predict_advantages(self, info_set_vector,
                       action_indexer: neural_game.ActionIndexer) -> extensive_game.ActionFloat:
    advantages = self.sess.run(
        self.tensors['advantages'],
        feed_dict={self.tensors['input_layer']: [info_set_vector]})

    # Map each action to its predicted advantage via the passed action indexer
    # (the parameter was previously shadowed by self.action_indexer).
    return extensive_game.ActionFloat({
        action: advantages[0, action_indexer.get_index(action)]
        for action in action_indexer.actions
    })
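# A hedged usage sketch (hypothetical names `network` and `vector`): given a
# vectorised information set, predict_advantages returns an ActionFloat of
# predicted advantages, which compute_regret_matching (below) can convert
# into action probabilities:
#
#     advantages = network.predict_advantages(vector, action_indexer)
#     # e.g. ActionFloat({'R': -0.5, 'P': 0.5, 'S': -1.5})
#     probs = cfr_util.compute_regret_matching(advantages)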
def test_rock_paper_scissors(self):
    game = rock_paper_scissors.create_rock_paper_scissors()

    info_set_1 = game.info_set_ids[game.root]
    info_set_2 = game.info_set_ids[game.root.children['R']]

    # Player 1 plays (R, P, S) with probabilities (0.5, 0.3, 0.2), respectively.
    sigma_1 = extensive_game.Strategy({
        info_set_1: extensive_game.ActionFloat({'R': 0.5, 'P': 0.3, 'S': 0.2})
    })

    # Player 2 plays (R, P, S) with probabilities (0.3, 0.3, 0.4), respectively.
    sigma_2 = extensive_game.Strategy({
        info_set_2: extensive_game.ActionFloat({'R': 0.3, 'P': 0.3, 'S': 0.4}),
    })

    # Check the values.
    expected_utility_1, expected_utility_2 = cfr_metrics.compute_expected_utility(
        game, sigma_1, sigma_2)

    utility_root = (
        0.5 * (0 * 0.3 + -1 * 0.3 + 1 * 0.4) +   # RR, RP, RS
        0.3 * (1 * 0.3 + 0 * 0.3 + -1 * 0.4) +   # PR, PP, PS
        0.2 * (-1 * 0.3 + 1 * 0.3 + 0 * 0.4))    # SR, SP, SS
    self.assertEqual(expected_utility_1[game.get_node(())], utility_root)

    utility_R = 0 * 0.3 + 1 * 0.3 + -1 * 0.4  # RR, RP, RS
    self.assertEqual(expected_utility_2[game.get_node(('R',))], utility_R)

    utility_P = -1 * 0.3 + 0 * 0.3 + 1 * 0.4  # PR, PP, PS
    self.assertEqual(expected_utility_2[game.get_node(('P',))], utility_P)

    utility_S = 1 * 0.3 + -1 * 0.3 + 0 * 0.4  # SR, SP, SS
    self.assertEqual(expected_utility_2[game.get_node(('S',))], utility_S)
def normalise_probs(probs: extensive_game.ActionFloat, epsilon=1e-7):
    """Clips each probability below at epsilon, then normalises by dividing by the sum.

    Args:
        probs: extensive_game.ActionFloat. Must all be non-negative.
        epsilon: the minimum probability to assign to each action, for numerical stability.

    Returns:
        norm_probs: extensive_game.ActionFloat.
    """
    assert min(probs.values()) >= 0.0
    clipped = {a: max(prob, epsilon) for a, prob in probs.items()}
    total = sum(clipped.values())
    return extensive_game.ActionFloat({a: prob / total for a, prob in clipped.items()})
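# A minimal worked example of normalise_probs (hypothetical values): a zero
# probability is clipped up to epsilon before normalising, so every action
# keeps non-zero mass and the result still sums to 1:
#
#     probs = extensive_game.ActionFloat({'R': 0.0, 'P': 0.3, 'S': 0.7})
#     norm = normalise_probs(probs, epsilon=1e-7)
#     # norm ≈ ActionFloat({'R': 1e-7, 'P': 0.3, 'S': 0.7}), up to the
#     # normalising constant 1 + 1e-7.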
def compute_regret_matching(action_regrets: extensive_game.ActionFloat, epsilon=1e-7,
                            highest_regret=False):
    """Given regrets r_i for actions a_i, computes the regret-matching strategy as follows.

    If sum_i max(0, r_i) > 0:
        play action a_i proportionally to max(0, r_i).
    Else:
        play all actions uniformly.

    Args:
        action_regrets: extensive_game.ActionFloat. The regret for each action.
        epsilon: the minimum probability to return for each action, for numerical stability.
        highest_regret: if True, then when all regrets are non-positive, return epsilon for all
            but the highest-regret action.

    Returns:
        extensive_game.ActionFloat. The probability of taking each action in this information set.
    """
    if max(action_regrets.values()) <= 0.0:
        # No regrets are positive.
        if highest_regret:
            # Play the highest-regret action with probability close to 1.
            probs = {action: epsilon for action in action_regrets}
            best_action = max(action_regrets, key=action_regrets.get)
            probs[best_action] = 1.0
            return normalise_probs(extensive_game.ActionFloat(probs), epsilon=epsilon)
        else:
            # Return the uniform probability distribution on the available actions.
            return extensive_game.ActionFloat.initialise_uniform(action_regrets.action_list)
    else:
        # Take the positive part of each regret (i.e. the maximum of the regret and zero),
        # and play actions with probability proportional to positive regret.
        return normalise_probs(
            extensive_game.ActionFloat({k: max(0.0, v) for k, v in action_regrets.items()}),
            epsilon=epsilon)
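# A worked example of compute_regret_matching (hypothetical regret values).
# With regrets {'R': 2.0, 'P': -1.0, 'S': 6.0}, the positive parts are
# {'R': 2.0, 'P': 0.0, 'S': 6.0}, so R and S are played in proportion 2:6
# and P only receives the epsilon floor:
#
#     regrets = extensive_game.ActionFloat({'R': 2.0, 'P': -1.0, 'S': 6.0})
#     probs = compute_regret_matching(regrets)
#     # probs ≈ ActionFloat({'R': 0.25, 'P': ~1e-8, 'S': 0.75})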
def test_equals(self):
    strategy1 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })
    # Same as strategy1.
    strategy2 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })
    # Different to strategy1.
    strategy3 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.3, 'a2': 0.7}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })

    self.assertEqual(strategy1, strategy2)
    self.assertNotEqual(strategy1, strategy3)
def test_compute_weighted_strategy(self):
    strategies = {
        'info1': [
            (1.0, extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6})),
            (2.0, extensive_game.ActionFloat({'a1': 0.5, 'a2': 0.5})),
        ],
        'info2': [
            (3.0, extensive_game.ActionFloat({'a1': 0.6, 'a2': 0.4})),
            (2.0, extensive_game.ActionFloat({'a1': 0.3, 'a2': 0.7})),
            (1.0, extensive_game.ActionFloat({'a1': 0.0, 'a2': 1.0})),
        ],
    }

    expected = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({
            'a1': (1.0 * 0.4 + 2.0 * 0.5) / (1.0 + 2.0),
            'a2': (1.0 * 0.6 + 2.0 * 0.5) / (1.0 + 2.0),
        }),
        'info2': extensive_game.ActionFloat({
            'a1': (3.0 * 0.6 + 2.0 * 0.3 + 1.0 * 0.0) / (3.0 + 2.0 + 1.0),
            'a2': (3.0 * 0.4 + 2.0 * 0.7 + 1.0 * 1.0) / (3.0 + 2.0 + 1.0),
        }),
    })

    computed = extensive_game.compute_weighted_strategy(strategies)
    self.assertEqual(computed, expected)
def test_rock_paper_scissors_recursive(self):
    game = rock_paper_scissors.create_rock_paper_scissors()

    info_set_1 = game.info_set_ids[game.root]
    info_set_2 = game.info_set_ids[game.root.children['R']]

    # Player 1 plays (R, P, S) with probabilities (0.5, 0.3, 0.2), respectively.
    sigma_1 = extensive_game.Strategy({
        info_set_1: extensive_game.ActionFloat({'R': 0.5, 'P': 0.3, 'S': 0.2})
    })

    # Player 2 plays (R, P, S) with probabilities (0.3, 0.3, 0.4), respectively.
    sigma_2 = extensive_game.Strategy({
        info_set_2: extensive_game.ActionFloat({'R': 0.3, 'P': 0.3, 'S': 0.4}),
    })

    # Check that terminal nodes have value equal to their utility to the player.
    terminal_nodes = [
        game.get_node((a1, a2)) for a1 in ['R', 'P', 'S'] for a2 in ['R', 'P', 'S']
    ]
    for node in terminal_nodes:
        v1, v2 = cfr_metrics.compute_expected_utility_recursive(
            game, node, sigma_1, sigma_2,
            collections.defaultdict(float), collections.defaultdict(float), 1.0, 1.0, 1.0)
        expected_v1 = node.utility[1]
        expected_v2 = node.utility[2]
        self.assertEqual(v1, expected_v1)
        self.assertEqual(v2, expected_v2)

    # Check the values of the player 2 nodes.
    v1, v2 = cfr_metrics.compute_expected_utility_recursive(
        game, game.get_node(('R',)), sigma_1, sigma_2,
        collections.defaultdict(float), collections.defaultdict(float), 0.5, 1.0, 1.0)
    self.assertEqual(v1, 0 * 0.3 + -1 * 0.3 + 1 * 0.4)
    self.assertEqual(v2, 0 * 0.3 + 1 * 0.3 + -1 * 0.4)

    # Check the values of the (only) player 1 node.
    v1, v2 = cfr_metrics.compute_expected_utility_recursive(
        game, game.get_node(()), sigma_1, sigma_2,
        collections.defaultdict(float), collections.defaultdict(float), 0.5, 1.0, 1.0)
    self.assertEqual(
        v1,
        (0.5 * (0 * 0.3 + -1 * 0.3 + 1 * 0.4) +   # RR, RP, RS
         0.3 * (1 * 0.3 + 0 * 0.3 + -1 * 0.4) +   # PR, PP, PS
         0.2 * (-1 * 0.3 + 1 * 0.3 + 0 * 0.4)))   # SR, SP, SS
    self.assertEqual(
        v2,
        (0.5 * (0 * 0.3 + 1 * 0.3 + -1 * 0.4) +   # RR, RP, RS
         0.3 * (-1 * 0.3 + 0 * 0.3 + 1 * 0.4) +   # PR, PP, PS
         0.2 * (1 * 0.3 + -1 * 0.3 + 0 * 0.4)))   # SR, SP, SS
def test_iter(self):
    action_float = extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6})
    actions = [a for a in action_float]
    self.assertEqual(set(actions), {'a1', 'a2'})
def test_cfr_traverse_advantage_memory_player2(self):
    game, action_indexer, info_set_vectoriser = create_neural_rock_paper_scissors()
    node = game.root.children['R']
    assert node.player == 2
    player = 2
    t = 3

    network1 = Mock()
    network1.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.2, 'P': 0.7, 'S': 0.1}))
    network2 = Mock()
    network2.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.3, 'P': 0.6, 'S': 0.1}))

    advantage_memory1 = buffer.Reservoir(maxlen=100)
    advantage_memory2 = buffer.Reservoir(maxlen=100)
    strategy_memory = buffer.Reservoir(maxlen=100)

    deep_cfr.cfr_traverse(
        game, action_indexer, info_set_vectoriser, node, player,
        network1, network2, advantage_memory1, advantage_memory2, strategy_memory, t)

    # Shouldn't have used network 1.
    network1.compute_action_probs.assert_not_called()

    # Should have used network 2 once.
    network2.compute_action_probs.assert_called_once()

    # Player 1 is not the traverser, so their advantage memory is untouched.
    self.assertEqual(len(advantage_memory1), 0)

    # We add to the traverser's advantage memory at each of their nodes, of which there is 1.
    self.assertEqual(len(advantage_memory2), 1)

    # No nodes of the non-traversing player are visited, so nothing is added to strategy memory.
    self.assertEqual(len(strategy_memory), 0)

    advantage = advantage_memory2.buffer[0]
    expected = deep_cfr.AdvantageMemoryElement(
        game.get_info_set_id(node), t, {'R': -0.5, 'P': 0.5, 'S': -1.5})
    self.assertEqual(advantage, expected)
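# Where the expected advantages above come from: at ('R',), player 2's
# utilities against rock are {'R': 0, 'P': 1, 'S': -1}, and network2 predicts
# action probabilities {'R': 0.3, 'P': 0.6, 'S': 0.1}, giving an expected
# value of 0.3 * 0 + 0.6 * 1 + 0.1 * -1 = 0.5. Each action's advantage is its
# utility minus this baseline: R: -0.5, P: 0.5, S: -1.5.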
def external_sampling_cfr_recursive(
        game: extensive_game.ExtensiveGame,
        node: extensive_game.ExtensiveGameNode,
        player: int,
        regrets: Dict,
        strategy_t: extensive_game.Strategy,
        strategy_t_1: extensive_game.Strategy,
        cfr_state: cfr_util.CFRState,
):
    """Computes the 'expected player utility' sum_{z in Q ∩ Z_I} pi_i^sigma(z[I], z) u_i(z).

    Samples the actions of chance nodes and the nodes of the other players.

    Accumulates the immediate sampled counterfactual regret:

        rtilde(I, a) = sum_{z in Q ∩ Z_I} u_i(z) (pi_i^sigma(z[I]a, z) - pi_i^sigma(z[I], z)).

    Args:
        game: the extensive-form game being solved.
        node: the current node in the game tree.
        player: the traversing player, whose regrets are updated.
        regrets: dict mapping information sets to accumulated regrets per action.
        strategy_t: the strategy used at time t. We don't update this one.
        strategy_t_1: the strategy to use at time t + 1. We update this one in this function call.
        cfr_state: general state about CFR progress.

    Returns:
        expected_player_utility
    """
    cfr_state.node_touched()
    if node.player == -1:
        # Terminal node. Just return the utility to the player.
        return node.utility[player]
    elif node.player == 0:
        # Chance player. We sample an action and then return the expected utility for that action.
        a = cfr_game.sample_chance_action(node)
        return external_sampling_cfr_recursive(
            game, node.children[a], player, regrets, strategy_t, strategy_t_1, cfr_state)
    elif node.player == player:
        # The traversing player: return sum_{z in Q ∩ Z_I} pi_i^sigma(z[I], z) u_i(z).
        expected_utilities = dict()
        action_probs = dict()
        information_set = cfr_game.get_information_set(game, node)
        expected_utility = 0.0
        if information_set not in strategy_t.get_info_sets():
            strategy_t.set_uniform_action_probs(information_set, list(node.children.keys()))

        for action, child in node.children.items():
            expected_utilities[action] = external_sampling_cfr_recursive(
                game, child, player, regrets, strategy_t, strategy_t_1, cfr_state)
            action_probs[action] = strategy_t[information_set][action]
            expected_utility += action_probs[action] * expected_utilities[action]

        immediate_regrets = dict()
        for action in node.children:
            immediate_regrets[action] = expected_utilities[action] - expected_utility

        if information_set not in regrets:
            regrets[information_set] = extensive_game.ActionFloat(immediate_regrets)
        else:
            regrets[information_set] = extensive_game.ActionFloat.sum(
                regrets[information_set], extensive_game.ActionFloat(immediate_regrets))

        # Update the strategy for the next iteration.
        strategy_t_1[information_set] = cfr_util.compute_regret_matching(
            regrets[information_set])

        return expected_utility
    else:
        # It is the other player. Sample an action and return the value.
        information_set = cfr_game.get_information_set(game, node)
        if information_set not in strategy_t.get_info_sets():
            strategy_t.set_uniform_action_probs(information_set, list(node.children.keys()))
        a = cfr_game.sample_action(strategy_t[information_set])
        return external_sampling_cfr_recursive(
            game, node.children[a], player, regrets, strategy_t, strategy_t_1, cfr_state)
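# A minimal sketch of a driver loop for external_sampling_cfr_recursive (the
# real entry point isn't shown here, and the empty-Strategy and CFRState
# constructors are assumptions). Each iteration traverses once per player,
# accumulating that player's regrets and writing the regret-matched strategy
# for the next iteration:
#
#     regrets = dict()
#     strategy_t = extensive_game.Strategy(dict())
#     strategy_t_1 = extensive_game.Strategy(dict())
#     cfr_state = cfr_util.CFRState()
#     for t in range(num_iterations):
#         for player in [1, 2]:
#             external_sampling_cfr_recursive(
#                 game, game.root, player, regrets, strategy_t, strategy_t_1, cfr_state)
#         strategy_t = strategy_t_1.copy()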