Example #1
    def _log_strategy(
            self,
            policy: Policy,
            infoset: Optional[LeducInfoset],
            global_step: int):
        def recurse(new_action):
            after_action_infoset = copy.deepcopy(infoset)
            after_action_infoset.add_action(new_action)
            self._log_strategy(policy, after_action_infoset, global_step)

        if infoset is None:
            # Walk the tree from the root: enumerate the three card ranks (0=J, 1=Q, 2=K).
            for card in range(3):
                infoset = LeducInfoset(card, bet_sequences=[(), ()], board_card=None)
                self._log_strategy(policy, infoset, global_step)
        elif infoset.player_to_act == -1:
            # Chance node: the betting round is over, so enumerate the possible board cards.
            for board_card in range(3):
                infoset = LeducInfoset(card=infoset.card, bet_sequences=infoset.bet_sequences, board_card=board_card)
                self._log_strategy(policy, infoset, global_step)
        elif infoset.is_terminal:
            return
        else:
            # Log the learned policy's deviation from the Nash policy rather
            # than the raw action probabilities.
            action_probs = policy.action_prob(infoset)
            nash_action_probs = self.nash_policy.action_prob(infoset)
            action_probs -= nash_action_probs

            node_name = "strategy/" + str(infoset)
            node_name = node_name.replace(":", "_")
            for action in PlayerActions.ALL_ACTIONS:
                if action == PlayerActions.FOLD and infoset.can_fold:
                    if not self.text_only:
                        self.writer.add_scalar(node_name+"/f", action_probs[action], global_step=global_step)
                    logger.debug("Epoch %s Strategy %s %s", e, node_name+"/f", action_probs[action])
                    self.total_error += abs(action_probs[action])
                    self.state_cnt += 1
                    recurse(action)
                elif action == PlayerActions.BET_RAISE and infoset.can_raise:
                    if not self.text_only:
                        self.writer.add_scalar(node_name+"/r", action_probs[action], global_step=global_step)
                    logger.debug("Epoch %s Strategy %s %s", e, node_name+"/r", action_probs[action])
                    self.total_error += abs(action_probs[action])
                    self.state_cnt += 1
                    recurse(action)
                elif action == PlayerActions.CHECK_CALL:
                    if not self.text_only:
                        self.writer.add_scalar(node_name + "/c", action_probs[action], global_step=global_step)
                    logger.debug("Epoch %s Strategy %s %s", e, node_name+"/c", action_probs[action])
                    self.total_error += abs(action_probs[action])
                    self.state_cnt += 1
                    recurse(action)
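A minimal sketch of how this walker might be driven once per evaluation epoch; the `strategy_logger` instance name, the reset of the accumulators, and the use of the epoch as `global_step` are assumptions about the surrounding code, not part of the original class.

# Hypothetical driver (names assumed): reset the accumulators, walk every
# reachable infoset starting from the empty history, then report the mean
# absolute per-action deviation of the learned policy from the Nash policy.
strategy_logger.total_error = 0.0
strategy_logger.state_cnt = 0
strategy_logger._log_strategy(policy, infoset=None, global_step=epoch)
if strategy_logger.state_cnt > 0:
    print("mean |pi - pi_nash|:", strategy_logger.total_error / strategy_logger.state_cnt)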
Example #2
def log_qvals(
        writer: SummaryWriter,
        policy: QPolicy,
        infoset: Optional[LeducInfoset],
        global_step: int,
        text_only: bool):
    def recurse(new_action):
        after_action_infoset = copy.deepcopy(infoset)
        after_action_infoset.add_action(new_action)
        log_qvals(writer, policy, after_action_infoset, global_step, text_only)

    if infoset is None:
        for card in range(3):
            infoset = LeducInfoset(card, bet_sequences=[(), ()], board_card=None)
            log_qvals(writer, policy, infoset, global_step, text_only)
    elif infoset.player_to_act == -1:
        for board_card in range(3):
            infoset = LeducInfoset(card=infoset.card, bet_sequences=infoset.bet_sequences, board_card=board_card)
            log_qvals(writer, policy, infoset, global_step, text_only)
    elif infoset.is_terminal:
        return
    else:
        state = infoset_to_state(infoset)
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate the local Q-network without tracking gradients.
        with torch.no_grad():
            q_vals = policy.qnetwork_local(state).cpu().numpy()[0]

        node_name = "q_vals/" + str(infoset)
        node_name = node_name.replace(":", "_")

        for action in PlayerActions.ALL_ACTIONS:
            if action == PlayerActions.FOLD and infoset.can_fold:
                if not text_only:
                    writer.add_scalar(node_name+"/f", q_vals[action], global_step=global_step)
                logger.debug("Epoch %s QValue %s %s", e, node_name+"/f", q_vals[action])
                recurse(action)
            elif action == PlayerActions.BET_RAISE and infoset.can_raise:
                if not text_only:
                    writer.add_scalar(node_name+"/r", q_vals[action], global_step=global_step)
                logger.debug("Epoch %s QValue %s %s", e, node_name+"/r", q_vals[action])
                recurse(action)
            elif action == PlayerActions.CHECK_CALL:
                if not text_only:
                    writer.add_scalar(node_name + "/c", q_vals[action], global_step=global_step)
                logger.debug("Epoch %s QValue %s %s", e, node_name+"/c", q_vals[action])
                recurse(action)
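A minimal call sketch, assuming a TensorBoard `SummaryWriter` and that `q_policy` and `epoch` come from the surrounding training loop; passing `infoset=None` makes the function enumerate all three hole cards itself.

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/leduc_nfsp")  # log directory is illustrative
log_qvals(writer, q_policy, infoset=None, global_step=epoch, text_only=False)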
Example #3
    def test_action_prob_supervised(self):
        self.sut = LeducPoker.NFSP.Agent.NfspAgent(self.mock_q_policy, self.mock_supervised_trainer, nu=0)

        self.sut.leduc_supervised_policy.action_prob = MagicMock(return_value=[1, 0, 0])
        infoset = LeducInfoset(card=1, bet_sequences=[(PlayerActions.BET_RAISE,), ()], board_card=None)

        retval = self.sut.action_prob(infoset)

        self.assertEqual([1, 0, 0], retval)
        self.sut.leduc_supervised_policy.action_prob.assert_called_with(infoset)
Example #4
    def test_notify_reward(self):
        self.sut = LeducPoker.NFSP.Agent.NfspAgent(self.mock_q_policy, self.mock_supervised_trainer, nu=0)
        self.sut.leduc_supervised_policy.action_prob = MagicMock(return_value=[0, 1, 0])

        infoset = LeducInfoset(card=1, bet_sequences=[(PlayerActions.CHECK_CALL,), ()], board_card=None)
        infoset_state = infoset_to_state(infoset)
        self.sut.get_action(infoset)

        self.mock_q_policy.add_sars = MagicMock()

        infoset_next = LeducInfoset(
            card=1, bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE), ()], board_card=None)
        infoset_next_state = infoset_to_state(infoset_next)

        self.sut.notify_reward(next_infoset=infoset_next, reward=123, is_terminal=True)

        # call_args[0] holds the positional args
        self.assertEqual(self.mock_q_policy.add_sars.call_args[0], tuple())
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["state"].tolist(), infoset_state.tolist())
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["action"], PlayerActions.CHECK_CALL)
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["reward"], 123)
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["next_state"].tolist(), infoset_next_state.tolist())
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["is_terminal"], True)
Example #5
    def test_bet_fold_game(self):
        def mock_random_sample(a, b):
            return [1, 0]

        def get_agent0_action(infoset: LeducInfoset):
            return PlayerActions.BET_RAISE

        def get_agent1_action(infoset: LeducInfoset):
            return PlayerActions.FOLD

        # P0 has queen, P1 has jack
        with mock.patch('random.sample', mock_random_sample):
            self.agents[0].get_action = MagicMock(side_effect=get_agent0_action)
            self.agents[1].get_action = MagicMock(side_effect=get_agent1_action)
            LeducPoker.NFSP.Agent.collect_trajectories(self.agents, num_games=1)

        self.agents[0].reset.assert_called_once_with()
        self.agents[1].reset.assert_called_once_with()

        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[0][2],
            {"next_infoset": LeducInfoset(card=1, bet_sequences=[(), ()], board_card=None),
             "reward": 0, "is_terminal": False})
        self.assertEqual(
            self.agents[1].notify_reward.mock_calls[0][2],
            {"next_infoset": LeducInfoset(card=0, bet_sequences=[(PlayerActions.BET_RAISE,), ()], board_card=None),
             "reward": 0, "is_terminal": False})

        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[1][2],
            {"next_infoset": None, "reward": 1, "is_terminal": True})
        self.assertEqual(
            self.agents[1].notify_reward.mock_calls[1][2],
            {"next_infoset": None, "reward": -1, "is_terminal": True})

        self.assertEqual(2, len(self.agents[0].notify_reward.mock_calls))
        self.assertEqual(2, len(self.agents[1].notify_reward.mock_calls))
Example #6
def make_agent(q_policy_parameters, supervised_trainer_parameters, nu):
    network_units = [64]
    state_size = infoset_to_state(LeducInfoset(card=0, bet_sequences=[(), ()], board_card=None)).shape[0]
    q_network_local = QNetwork(state_size=state_size, action_size=3, hidden_units=network_units).to(device)
    #q_network_target = QNetwork(state_size=state_size, action_size=3, hidden_units=network_units).to(device)
    q_network_target = None

    q_policy = QPolicy(
        nn_local=q_network_local,
        nn_target=q_network_target,
        parameters=q_policy_parameters)

    supervised_network = SupervisedNetwork(state_size=state_size, action_size=3, hidden_units=network_units).to(device)
    supervised_trainer = SupervisedTrainer(
        supervised_trainer_parameters=supervised_trainer_parameters, network=supervised_network)

    return NfspAgent(q_policy=q_policy, supervised_trainer=supervised_trainer, nu=nu)
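A hedged sketch of how two such agents could be wired together for self-play with the `collect_trajectories` helper exercised in the tests above; the parameter objects are project-specific and built elsewhere, and `nu=0.1` is only an illustrative anticipatory-mixing value.

# Hypothetical self-play wiring; q_policy_parameters and
# supervised_trainer_parameters are assumed to be constructed elsewhere.
agents = [
    make_agent(q_policy_parameters, supervised_trainer_parameters, nu=0.1)
    for _ in range(2)
]
LeducPoker.NFSP.Agent.collect_trajectories(agents, num_games=128)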
Example #7
    def test_action_prob_q(self):
        self.sut = LeducPoker.NFSP.Agent.NfspAgent(self.mock_q_policy, self.mock_supervised_trainer, nu=1.1)

        self.sut.use_q_policy = True
        self.sut.leduc_rl_policy.get_action = MagicMock(return_value=1)
        self.sut.supervised_trainer.add_observation = MagicMock()

        infoset = LeducInfoset(card=1, bet_sequences=[(PlayerActions.CHECK_CALL,), ()], board_card=None)
        infoset_state = infoset_to_state(infoset)

        retval = self.sut.action_prob(infoset)

        self.assertListEqual([0, 1, 0], retval.tolist())
        self.assertEqual(infoset_state.tolist(), self.sut.last_state.tolist())

        self.sut.leduc_rl_policy.get_action.assert_called_with(infoset)

        self.assertEqual(self.sut.supervised_trainer.add_observation.call_args[0][0].tolist(), infoset_state.tolist())
        self.assertEqual(self.sut.supervised_trainer.add_observation.call_args[0][1], 1)
Example #8
    def _get_terminal_game_state_value(
            self, my_infoset: LeducPoker.LeducInfoset,
            opponent_card_probs: np.ndarray) -> np.ndarray:
        retval = np.zeros(2)
        player_cards = [0, 0]
        player_cards[self.player_num] = my_infoset.card

        for opponent_card in LeducPoker.LeducPokerGame.DECK:
            if opponent_card == my_infoset.card or opponent_card == my_infoset.board_card:
                assert opponent_card_probs[opponent_card] == 0
                continue
            player_cards[self.opponent_num] = opponent_card

            # Equivalent to subtracting off the cumulative value of the bets
            infoset_payoffs = my_infoset.get_payoffs(player_cards).astype(float)
            infoset_payoffs -= infoset_payoffs.sum() / 2

            payoffs = opponent_card_probs[opponent_card] * infoset_payoffs
            retval += payoffs
        return retval
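The `sum() / 2` adjustment above turns gross pot shares into zero-sum net winnings before they are weighted by the opponent-card probabilities. A small worked illustration of that step, with made-up payoff numbers:

import numpy as np

# Made-up gross payoffs: the winner collects 5 chips from the pot, the loser 2.
infoset_payoffs = np.array([5.0, 2.0])

# Subtracting half the pot (3.5 here) removes what each player paid in,
# leaving zero-sum net winnings.
infoset_payoffs -= infoset_payoffs.sum() / 2
print(infoset_payoffs)  # [ 1.5 -1.5]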
Example #9
    def test_check_bet_call_game(self):
        def mock_random_sample(a, b):
            return [1, 2]

        def mock_random_choice(a):
            return 2

        def get_agent0_action(infoset: LeducInfoset):
            return PlayerActions.CHECK_CALL

        def get_agent1_action(infoset: LeducInfoset):
            return PlayerActions.BET_RAISE

        # P0 has queen, P1 has king
        with mock.patch('random.sample', mock_random_sample):
            with mock.patch('random.choice', mock_random_choice):
                self.agents[0].get_action = MagicMock(side_effect=get_agent0_action)
                self.agents[1].get_action = MagicMock(side_effect=get_agent1_action)
                LeducPoker.NFSP.Agent.collect_trajectories(self.agents, num_games=1)

        self.agents[0].reset.assert_called_once_with()
        self.agents[1].reset.assert_called_once_with()

        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[0][2],
            {"next_infoset": LeducInfoset(1, bet_sequences=[(), ()], board_card=None), "reward": 0, "is_terminal": False})
        self.assertEqual(
            self.agents[1].notify_reward.mock_calls[0][2],
            {"next_infoset": LeducInfoset(2, bet_sequences=[(PlayerActions.CHECK_CALL,), ()], board_card=None),
             "reward": 0, "is_terminal": False})
        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[1][2],
            {"next_infoset": LeducInfoset(1, bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE), ()],
                                          board_card=None),
             "reward": 0, "is_terminal": False})

        # 2nd round
        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[2][2],
            {"next_infoset": LeducInfoset(
                1,
                bet_sequences=[
                    (PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE, PlayerActions.CHECK_CALL), ()],
                board_card=2),
             "reward": 0, "is_terminal": False})
        self.assertEqual(
            self.agents[1].notify_reward.mock_calls[1][2],
            {"next_infoset": LeducInfoset(
                2,
                bet_sequences=[
                    (PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE, PlayerActions.CHECK_CALL),
                    (PlayerActions.CHECK_CALL,)],
                board_card=2),
             "reward": 0, "is_terminal": False})
        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[3][2],
            {"next_infoset": LeducInfoset(
                1,
                bet_sequences=[
                    (PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE, PlayerActions.CHECK_CALL),
                    (PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE)],
                board_card=2),
             "reward": 0, "is_terminal": False})

        # Terminals
        self.assertEqual(
            self.agents[1].notify_reward.mock_calls[2][2],
            {"next_infoset": None, "reward": 7, "is_terminal": True})
        self.assertEqual(
            self.agents[0].notify_reward.mock_calls[4][2],
            {"next_infoset": None, "reward": -7, "is_terminal": True})

        self.assertEqual(5, len(self.agents[0].notify_reward.mock_calls))
        self.assertEqual(3, len(self.agents[1].notify_reward.mock_calls))