def test_copy_strategy(self):
    strategy1 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })
    strategy2 = strategy1.copy()

    self.assertEqual(strategy1['info1'], strategy2['info1'])
    self.assertEqual(strategy1['info2'], strategy2['info2'])

    # Mutating strategy1 must not affect the copy.
    strategy1['info1'] = extensive_game.ActionFloat({'a1': 0.2, 'a2': 0.8})
    self.assertEqual(strategy1['info2'], strategy2['info2'])
    self.assertEqual(strategy2['info1'], extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}))
def test_expected_value_exact(self):
    game, _, _ = create_neural_rock_paper_scissors()

    strategy1 = extensive_game.Strategy({
        game.info_set_ids[game.get_node(())]:
            extensive_game.ActionFloat({'R': 0.5, 'P': 0.2, 'S': 0.3})
    })
    strategy2 = extensive_game.Strategy({
        game.info_set_ids[game.get_node(('R',))]:
            extensive_game.ActionFloat({'R': 0.2, 'P': 0.3, 'S': 0.5})
    })

    computed1, computed2 = game.expected_value_exact(strategy1=strategy1, strategy2=strategy2)

    # Player 1's expected utility: sum over (a1, a2) of p1(a1) * p2(a2) * u1(a1, a2).
    expected1 = (
        0.5 * (0.2 * 0 + 0.3 * -1 + 0.5 * 1) +
        0.2 * (0.2 * 1 + 0.3 * 0 + 0.5 * -1) +
        0.3 * (0.2 * -1 + 0.3 * 1 + 0.5 * 0))

    self.assertEqual(computed1, expected1)
    self.assertEqual(computed2, -expected1)
def test_add(self):
    action_float1 = extensive_game.ActionFloat({'a': 1.0, 'b': -1.0})
    action_float2 = extensive_game.ActionFloat({'a': 1.0, 'c': 2.0})

    action_float = extensive_game.ActionFloat.sum(action_float1, action_float2)
    expected = extensive_game.ActionFloat({'a': 2.0, 'b': -1.0, 'c': 2.0})

    self.assertEqual(action_float, expected)
def test_copy(self):
    action_floats1 = extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6})
    action_floats2 = action_floats1.copy()

    self.assertEqual(action_floats1, action_floats2)

    action_floats1 = extensive_game.ActionFloat({'a1': 0.3, 'a2': 0.7})
    self.assertNotEqual(action_floats1, action_floats2)
def test_is_strategy_complete(self):
    game, _, _ = create_neural_rock_paper_scissors()

    # Incomplete because missing an information set.
    strategy = extensive_game.Strategy({
        (): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5, 'S': 0.1}),
    })
    computed = game.is_strategy_complete(strategy)
    self.assertEqual(computed, False)

    # Incomplete because missing an action.
    strategy = extensive_game.Strategy({
        (): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5, 'S': 0.1}),
        (-1,): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5}),
    })
    computed = game.is_strategy_complete(strategy)
    self.assertEqual(computed, False)

    # Complete.
    strategy = extensive_game.Strategy({
        (): extensive_game.ActionFloat({'R': 0.4, 'P': 0.5, 'S': 0.1}),
        (-1,): extensive_game.ActionFloat({'R': 0.4, 'P': 0.3, 'S': 0.3}),
    })
    computed = game.is_strategy_complete(strategy)
    self.assertEqual(computed, True)
def test_cfr_traverse_advantage_memory(self):
    game, action_indexer, info_set_vectoriser = create_neural_rock_paper_scissors()
    node = game.root
    player = 1

    network1 = Mock()
    network1.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.2, 'P': 0.7, 'S': 0.1}))
    network2 = Mock()
    network2.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.3, 'P': 0.6, 'S': 0.1}))

    advantage_memory1 = buffer.Reservoir(maxlen=100)
    advantage_memory2 = buffer.Reservoir(maxlen=100)
    strategy_memory = buffer.Reservoir(maxlen=100)

    deep_cfr.cfr_traverse(
        game, action_indexer, info_set_vectoriser, node, player,
        network1, network2, advantage_memory1, advantage_memory2, strategy_memory, t=2)

    # We add to the traverser's advantage memory at each of their nodes, of which there is 1.
    self.assertEqual(len(advantage_memory1), 1)

    # We don't update player 2's advantage memory.
    self.assertEqual(len(advantage_memory2), 0)

    # We add to the strategy memory at each node of the non-traversing player, of which there are 3.
    self.assertEqual(len(strategy_memory), 3)
def predict_advantages(self, info_set_vector,
                       action_indexer: neural_game.ActionIndexer) -> extensive_game.ActionFloat:
    advantages = self.sess.run(
        self.tensors['advantages'],
        feed_dict={self.tensors['input_layer']: [info_set_vector]})

    # Map each action to its predicted advantage via the passed action indexer
    # (the parameter was previously shadowed by self.action_indexer).
    return extensive_game.ActionFloat({
        action: advantages[0, action_indexer.get_index(action)]
        for action in action_indexer.actions
    })
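# A hedged usage sketch (hypothetical names `network` and `vector`): given a
# vectorised information set, predict_advantages returns an ActionFloat of
# predicted advantages, which compute_regret_matching (below) can convert
# into action probabilities:
#
#     advantages = network.predict_advantages(vector, action_indexer)
#     # e.g. ActionFloat({'R': -0.5, 'P': 0.5, 'S': -1.5})
#     probs = cfr_util.compute_regret_matching(advantages)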
def test_rock_paper_scissors(self):
    game = rock_paper_scissors.create_rock_paper_scissors()

    info_set_1 = game.info_set_ids[game.root]
    info_set_2 = game.info_set_ids[game.root.children['R']]

    # Player 1 plays (R, P, S) with probabilities (0.5, 0.3, 0.2), respectively.
    sigma_1 = extensive_game.Strategy({
        info_set_1: extensive_game.ActionFloat({'R': 0.5, 'P': 0.3, 'S': 0.2})
    })

    # Player 2 plays (R, P, S) with probabilities (0.3, 0.3, 0.4), respectively.
    sigma_2 = extensive_game.Strategy({
        info_set_2: extensive_game.ActionFloat({'R': 0.3, 'P': 0.3, 'S': 0.4}),
    })

    # Check the values.
    expected_utility_1, expected_utility_2 = cfr_metrics.compute_expected_utility(
        game, sigma_1, sigma_2)

    utility_root = (
        0.5 * (0 * 0.3 + -1 * 0.3 + 1 * 0.4) +   # RR, RP, RS
        0.3 * (1 * 0.3 + 0 * 0.3 + -1 * 0.4) +   # PR, PP, PS
        0.2 * (-1 * 0.3 + 1 * 0.3 + 0 * 0.4))    # SR, SP, SS
    self.assertEqual(expected_utility_1[game.get_node(())], utility_root)

    utility_R = 0 * 0.3 + 1 * 0.3 + -1 * 0.4  # RR, RP, RS
    self.assertEqual(expected_utility_2[game.get_node(('R',))], utility_R)

    utility_P = -1 * 0.3 + 0 * 0.3 + 1 * 0.4  # PR, PP, PS
    self.assertEqual(expected_utility_2[game.get_node(('P',))], utility_P)

    utility_S = 1 * 0.3 + -1 * 0.3 + 0 * 0.4  # SR, SP, SS
    self.assertEqual(expected_utility_2[game.get_node(('S',))], utility_S)
def normalise_probs(probs: extensive_game.ActionFloat, epsilon=1e-7):
    """Clips each probability below at epsilon, then normalises by dividing by the sum.

    Args:
        probs: extensive_game.ActionFloat. Must all be non-negative.
        epsilon: the minimum probability to assign to each action, for numerical stability.

    Returns:
        norm_probs: extensive_game.ActionFloat.
    """
    assert min(probs.values()) >= 0.0
    clipped = {a: max(prob, epsilon) for a, prob in probs.items()}
    total = sum(clipped.values())
    return extensive_game.ActionFloat({a: prob / total for a, prob in clipped.items()})
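# A minimal worked example of normalise_probs (hypothetical values): a zero
# probability is clipped up to epsilon before normalising, so every action
# keeps non-zero mass and the result still sums to 1:
#
#     probs = extensive_game.ActionFloat({'R': 0.0, 'P': 0.3, 'S': 0.7})
#     norm = normalise_probs(probs, epsilon=1e-7)
#     # norm ≈ ActionFloat({'R': 1e-7, 'P': 0.3, 'S': 0.7}), up to the
#     # normalising constant 1 + 1e-7.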
def compute_regret_matching(action_regrets: extensive_game.ActionFloat, epsilon=1e-7,
                            highest_regret=False):
    """Given regrets r_i for actions a_i, computes the regret-matching strategy as follows.

    If sum_i max(0, r_i) > 0:
        play action a_i proportionally to max(0, r_i).
    Else:
        play all actions uniformly.

    Args:
        action_regrets: extensive_game.ActionFloat. The regret for each action.
        epsilon: the minimum probability to return for each action, for numerical stability.
        highest_regret: if True, then when all regrets are non-positive, return epsilon for all
            but the highest-regret action.

    Returns:
        extensive_game.ActionFloat. The probability of taking each action in this information set.
    """
    if max(action_regrets.values()) <= 0.0:
        # No regrets are positive.
        if highest_regret:
            # Play the highest-regret action with probability close to 1.
            probs = {action: epsilon for action in action_regrets}
            best_action = max(action_regrets, key=action_regrets.get)
            probs[best_action] = 1.0
            return normalise_probs(extensive_game.ActionFloat(probs), epsilon=epsilon)
        else:
            # Return the uniform probability distribution on the available actions.
            return extensive_game.ActionFloat.initialise_uniform(action_regrets.action_list)
    else:
        # Take the positive part of each regret (i.e. the maximum of the regret and zero),
        # and play actions with probability proportional to positive regret.
        return normalise_probs(
            extensive_game.ActionFloat({k: max(0.0, v) for k, v in action_regrets.items()}),
            epsilon=epsilon)
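# A worked example of compute_regret_matching (hypothetical regret values).
# With regrets {'R': 2.0, 'P': -1.0, 'S': 6.0}, the positive parts are
# {'R': 2.0, 'P': 0.0, 'S': 6.0}, so R and S are played in proportion 2:6
# and P only receives the epsilon floor:
#
#     regrets = extensive_game.ActionFloat({'R': 2.0, 'P': -1.0, 'S': 6.0})
#     probs = compute_regret_matching(regrets)
#     # probs ≈ ActionFloat({'R': 0.25, 'P': ~1e-8, 'S': 0.75})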
def test_equals(self):
    strategy1 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })
    # Same as strategy1.
    strategy2 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })
    # Different to strategy1.
    strategy3 = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({'a1': 0.3, 'a2': 0.7}),
        'info2': extensive_game.ActionFloat({'a2': 0.3, 'a4': 0.7}),
    })

    self.assertEqual(strategy1, strategy2)
    self.assertNotEqual(strategy1, strategy3)
def test_compute_weighted_strategy(self):
    strategies = {
        'info1': [
            (1.0, extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6})),
            (2.0, extensive_game.ActionFloat({'a1': 0.5, 'a2': 0.5})),
        ],
        'info2': [
            (3.0, extensive_game.ActionFloat({'a1': 0.6, 'a2': 0.4})),
            (2.0, extensive_game.ActionFloat({'a1': 0.3, 'a2': 0.7})),
            (1.0, extensive_game.ActionFloat({'a1': 0.0, 'a2': 1.0})),
        ],
    }

    expected = extensive_game.Strategy({
        'info1': extensive_game.ActionFloat({
            'a1': (1.0 * 0.4 + 2.0 * 0.5) / (1.0 + 2.0),
            'a2': (1.0 * 0.6 + 2.0 * 0.5) / (1.0 + 2.0),
        }),
        'info2': extensive_game.ActionFloat({
            'a1': (3.0 * 0.6 + 2.0 * 0.3 + 1.0 * 0.0) / (3.0 + 2.0 + 1.0),
            'a2': (3.0 * 0.4 + 2.0 * 0.7 + 1.0 * 1.0) / (3.0 + 2.0 + 1.0),
        }),
    })

    computed = extensive_game.compute_weighted_strategy(strategies)
    self.assertEqual(computed, expected)
def test_rock_paper_scissors_recursive(self):
    game = rock_paper_scissors.create_rock_paper_scissors()

    info_set_1 = game.info_set_ids[game.root]
    info_set_2 = game.info_set_ids[game.root.children['R']]

    # Player 1 plays (R, P, S) with probabilities (0.5, 0.3, 0.2), respectively.
    sigma_1 = extensive_game.Strategy({
        info_set_1: extensive_game.ActionFloat({'R': 0.5, 'P': 0.3, 'S': 0.2})
    })

    # Player 2 plays (R, P, S) with probabilities (0.3, 0.3, 0.4), respectively.
    sigma_2 = extensive_game.Strategy({
        info_set_2: extensive_game.ActionFloat({'R': 0.3, 'P': 0.3, 'S': 0.4}),
    })

    # Check that terminal nodes have value equal to their utility to the player.
    terminal_nodes = [
        game.get_node((a1, a2)) for a1 in ['R', 'P', 'S'] for a2 in ['R', 'P', 'S']
    ]
    for node in terminal_nodes:
        v1, v2 = cfr_metrics.compute_expected_utility_recursive(
            game, node, sigma_1, sigma_2,
            collections.defaultdict(float), collections.defaultdict(float), 1.0, 1.0, 1.0)
        expected_v1 = node.utility[1]
        expected_v2 = node.utility[2]
        self.assertEqual(v1, expected_v1)
        self.assertEqual(v2, expected_v2)

    # Check the values of the player 2 nodes.
    v1, v2 = cfr_metrics.compute_expected_utility_recursive(
        game, game.get_node(('R',)), sigma_1, sigma_2,
        collections.defaultdict(float), collections.defaultdict(float), 0.5, 1.0, 1.0)
    self.assertEqual(v1, 0 * 0.3 + -1 * 0.3 + 1 * 0.4)
    self.assertEqual(v2, 0 * 0.3 + 1 * 0.3 + -1 * 0.4)

    # Check the values of the (only) player 1 node.
    v1, v2 = cfr_metrics.compute_expected_utility_recursive(
        game, game.get_node(()), sigma_1, sigma_2,
        collections.defaultdict(float), collections.defaultdict(float), 0.5, 1.0, 1.0)
    self.assertEqual(
        v1,
        (0.5 * (0 * 0.3 + -1 * 0.3 + 1 * 0.4) +   # RR, RP, RS
         0.3 * (1 * 0.3 + 0 * 0.3 + -1 * 0.4) +   # PR, PP, PS
         0.2 * (-1 * 0.3 + 1 * 0.3 + 0 * 0.4)))   # SR, SP, SS
    self.assertEqual(
        v2,
        (0.5 * (0 * 0.3 + 1 * 0.3 + -1 * 0.4) +   # RR, RP, RS
         0.3 * (-1 * 0.3 + 0 * 0.3 + 1 * 0.4) +   # PR, PP, PS
         0.2 * (1 * 0.3 + -1 * 0.3 + 0 * 0.4)))   # SR, SP, SS
def test_iter(self):
    action_float = extensive_game.ActionFloat({'a1': 0.4, 'a2': 0.6})
    actions = [a for a in action_float]
    self.assertEqual(set(actions), {'a1', 'a2'})
def test_cfr_traverse_advantage_memory_player2(self):
    game, action_indexer, info_set_vectoriser = create_neural_rock_paper_scissors()
    node = game.root.children['R']
    assert node.player == 2
    player = 2
    t = 3

    network1 = Mock()
    network1.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.2, 'P': 0.7, 'S': 0.1}))
    network2 = Mock()
    network2.compute_action_probs = Mock(
        return_value=extensive_game.ActionFloat({'R': 0.3, 'P': 0.6, 'S': 0.1}))

    advantage_memory1 = buffer.Reservoir(maxlen=100)
    advantage_memory2 = buffer.Reservoir(maxlen=100)
    strategy_memory = buffer.Reservoir(maxlen=100)

    deep_cfr.cfr_traverse(
        game, action_indexer, info_set_vectoriser, node, player,
        network1, network2, advantage_memory1, advantage_memory2, strategy_memory, t)

    # Shouldn't have used network 1.
    network1.compute_action_probs.assert_not_called()

    # Should have used network 2 once.
    network2.compute_action_probs.assert_called_once()

    # Player 1 is not the traverser, so their advantage memory is untouched.
    self.assertEqual(len(advantage_memory1), 0)

    # We add to the traverser's advantage memory at each of their nodes, of which there is 1.
    self.assertEqual(len(advantage_memory2), 1)

    # No nodes of the non-traversing player are visited, so nothing is added to strategy memory.
    self.assertEqual(len(strategy_memory), 0)

    advantage = advantage_memory2.buffer[0]
    expected = deep_cfr.AdvantageMemoryElement(
        game.get_info_set_id(node), t, {'R': -0.5, 'P': 0.5, 'S': -1.5})
    self.assertEqual(advantage, expected)
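# Where the expected advantages above come from: at ('R',), player 2's
# utilities against rock are {'R': 0, 'P': 1, 'S': -1}, and network2 predicts
# action probabilities {'R': 0.3, 'P': 0.6, 'S': 0.1}, giving an expected
# value of 0.3 * 0 + 0.6 * 1 + 0.1 * -1 = 0.5. Each action's advantage is its
# utility minus this baseline: R: -0.5, P: 0.5, S: -1.5.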
def external_sampling_cfr_recursive(
        game: extensive_game.ExtensiveGame,
        node: extensive_game.ExtensiveGameNode,
        player: int,
        regrets: Dict,
        strategy_t: extensive_game.Strategy,
        strategy_t_1: extensive_game.Strategy,
        cfr_state: cfr_util.CFRState,
):
    """Computes the 'expected player utility' sum_{z in Q ∩ Z_I} pi_i^sigma(z[I], z) u_i(z).

    Samples the actions of chance nodes and the nodes of the other players.

    Accumulates the immediate sampled counterfactual regret:

        rtilde(I, a) = sum_{z in Q ∩ Z_I} u_i(z) (pi_i^sigma(z[I]a, z) - pi_i^sigma(z[I], z)).

    Args:
        game: the extensive-form game being solved.
        node: the current node in the game tree.
        player: the traversing player, whose regrets are updated.
        regrets: dict mapping information sets to accumulated regrets per action.
        strategy_t: the strategy used at time t. We don't update this one.
        strategy_t_1: the strategy to use at time t + 1. We update this one in this function call.
        cfr_state: general state about CFR progress.

    Returns:
        expected_player_utility
    """
    cfr_state.node_touched()
    if node.player == -1:
        # Terminal node. Just return the utility to the player.
        return node.utility[player]
    elif node.player == 0:
        # Chance player. We sample an action and then return the expected utility for that action.
        a = cfr_game.sample_chance_action(node)
        return external_sampling_cfr_recursive(
            game, node.children[a], player, regrets, strategy_t, strategy_t_1, cfr_state)
    elif node.player == player:
        # The traversing player: return sum_{z in Q ∩ Z_I} pi_i^sigma(z[I], z) u_i(z).
        expected_utilities = dict()
        action_probs = dict()
        information_set = cfr_game.get_information_set(game, node)
        expected_utility = 0.0
        if information_set not in strategy_t.get_info_sets():
            strategy_t.set_uniform_action_probs(information_set, list(node.children.keys()))

        for action, child in node.children.items():
            expected_utilities[action] = external_sampling_cfr_recursive(
                game, child, player, regrets, strategy_t, strategy_t_1, cfr_state)
            action_probs[action] = strategy_t[information_set][action]
            expected_utility += action_probs[action] * expected_utilities[action]

        immediate_regrets = dict()
        for action in node.children:
            immediate_regrets[action] = expected_utilities[action] - expected_utility

        if information_set not in regrets:
            regrets[information_set] = extensive_game.ActionFloat(immediate_regrets)
        else:
            regrets[information_set] = extensive_game.ActionFloat.sum(
                regrets[information_set], extensive_game.ActionFloat(immediate_regrets))

        # Update the strategy for the next iteration.
        strategy_t_1[information_set] = cfr_util.compute_regret_matching(
            regrets[information_set])

        return expected_utility
    else:
        # It is the other player. Sample an action and return the value.
        information_set = cfr_game.get_information_set(game, node)
        if information_set not in strategy_t.get_info_sets():
            strategy_t.set_uniform_action_probs(information_set, list(node.children.keys()))
        a = cfr_game.sample_action(strategy_t[information_set])
        return external_sampling_cfr_recursive(
            game, node.children[a], player, regrets, strategy_t, strategy_t_1, cfr_state)
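# A minimal sketch of a driver loop for external_sampling_cfr_recursive (the
# real entry point isn't shown here, and the empty-Strategy and CFRState
# constructors are assumptions). Each iteration traverses once per player,
# accumulating that player's regrets and writing the regret-matched strategy
# for the next iteration:
#
#     regrets = dict()
#     strategy_t = extensive_game.Strategy(dict())
#     strategy_t_1 = extensive_game.Strategy(dict())
#     cfr_state = cfr_util.CFRState()
#     for t in range(num_iterations):
#         for player in [1, 2]:
#             external_sampling_cfr_recursive(
#                 game, game.root, player, regrets, strategy_t, strategy_t_1, cfr_state)
#         strategy_t = strategy_t_1.copy()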