Example #1
    def test_notify_reward(self):
        # nu=0 forces the supervised (average-policy) branch in get_action
        self.sut = KuhnPoker.NFSP.Agent.NfspAgent(self.mock_q_policy,
                                                  self.mock_supervised_trainer,
                                                  nu=0)
        self.sut.kuhn_supervised_policy.aggressive_action_prob = MagicMock(
            return_value=1)

        infoset = KuhnInfoset(card=1, bet_sequence=(0, ))
        infoset_state = infoset_to_state(infoset)
        self.sut.get_action(infoset)

        self.mock_q_policy.add_sars = MagicMock()

        infoset_next = KuhnInfoset(card=1, bet_sequence=(0, 1))
        infoset_next_state = infoset_to_state(infoset_next)

        self.sut.notify_reward(next_infoset=infoset_next,
                               reward=123,
                               is_terminal=True)

        # call_args[0] holds the positional args; add_sars should be
        # called with keyword arguments only
        self.assertEqual(self.mock_q_policy.add_sars.call_args[0], tuple())
        self.assertEqual(
            self.mock_q_policy.add_sars.call_args[1]["state"].tolist(),
            infoset_state.tolist())
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["action"], 1)
        self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["reward"],
                         123)
        self.assertEqual(
            self.mock_q_policy.add_sars.call_args[1]["next_state"].tolist(),
            infoset_next_state.tolist())
        self.assertEqual(
            self.mock_q_policy.add_sars.call_args[1]["is_terminal"], True)
Example #2
    def aggressive_action_prob(self, infoset: KuhnInfoset):
        state = infoset_to_state(infoset)
        state = torch.from_numpy(
            np.array(state)).float().unsqueeze(0).to(device)
        # Forward pass, then detach once before converting to a Python float
        nn_retval = self.network.forward(state).cpu().detach()
        return nn_retval.numpy()[0][0]
Example #3
def collect_trajectories(policy: Policies.Policy, num_games: int):
    nash_policy = Policies.NashPolicy(0)
    nash_player = 0
    player_trajectories = [PlayerTrajectories(), PlayerTrajectories()]

    for _ in range(num_games):
        game = KuhnPokerGame.KuhnPokerGame()

        while not game.game_state.is_terminal:
            player_to_act = game.game_state.player_to_act
            infoset = game.game_state.infosets[player_to_act]

            if player_to_act == nash_player:
                action = nash_policy.get_action(infoset)
            else:
                state = infoset_to_state(infoset)
                state_tensor = torch.from_numpy(
                    np.array(state)).float().to(device)
                aggressive_action_prob = policy.forward(
                    state_tensor).cpu().detach()

                # Sample the action from the probability directly so we
                # don't have to re-evaluate the infoset
                action = int(
                    random.random() < aggressive_action_prob.numpy()[0])

            new_bet_sequence = game.game_state.bet_sequence + (action, )
            game.game_state.bet_sequence = new_bet_sequence
            if game.game_state.is_terminal:
                game_rewards = game.game_state.get_payoffs()
            else:
                game_rewards = 0, 0

            if player_to_act != nash_player:
                player_trajectories[player_to_act].add_transition(
                    state, action, aggressive_action_prob,
                    game_rewards[player_to_act])

            if game.game_state.is_terminal:
                other_player = (player_to_act + 1) % 2
                if other_player != nash_player:
                    player_trajectories[other_player].amend_last_reward(
                        game_rewards[other_player])
                player_trajectories[(nash_player + 1) %
                                    2].complete_trajectory()

    return player_trajectories
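Note: nash_player is hard-coded to seat 0 above, so only seat 1 ever
accumulates transitions. A hypothetical call site:

trajectories = collect_trajectories(policy, num_games=1000)
learner_trajectories = trajectories[1]  # the non-Nash player's data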
Example #4
def log_qvals(writer: SummaryWriter, policy: QPolicy, global_step: int):
    infoset = KuhnInfoset(0, ())

    # (bet_sequence, node label) pairs covering every decision point
    nodes = [
        ((), "p0_open"),
        ((0, ), "p0_check/p1"),
        ((0, 1), "p0_check/p1_bet/p0"),
        ((1, ), "p0_bet/p1"),
    ]

    for card in range(3):
        infoset.card = card

        for bet_sequence, label in nodes:
            infoset.bet_sequence = bet_sequence
            state = torch.from_numpy(
                infoset_to_state(infoset)).float().unsqueeze(0).to(device)
            q_vals = policy.qnetwork_local.forward(
                state).cpu().detach().numpy()[0]
            node_name = "q_vals/%s/%s" % (card_to_str(card), label)
            # Log the aggressive action's advantage over the passive one
            writer.add_scalar(node_name,
                              q_vals[1] - q_vals[0],
                              global_step=global_step)
Example #5
    def aggressive_action_prob(self, infoset: KuhnInfoset):
        state = infoset_to_state(infoset)

        # With probability nu, act with the RL (best-response) policy and
        # record the chosen action for the supervised average-policy trainer;
        # otherwise play from the supervised average policy.
        use_q = random.random() < self.nu
        if use_q:
            retval = self.kuhn_rl_policy.get_action(infoset)
            self.supervised_trainer.add_observation(state, retval)
        else:
            retval = self.kuhn_supervised_policy.aggressive_action_prob(infoset)

        self.last_state = state

        return retval
Example #6
    def notify_reward(self, next_infoset: Optional[KuhnInfoset], reward: float, is_terminal: bool):
        if self.last_action is None:
            # No action has been taken yet, so there is no transition to record
            assert reward == 0
            return

        if next_infoset is None:
            assert is_terminal

        assert self.last_state is not None

        next_state = infoset_to_state(next_infoset)
        self.q_policy.add_sars(
            state=self.last_state,
            action=self.last_action,
            reward=reward,
            next_state=next_state,
            is_terminal=is_terminal)
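Note: a minimal sketch of the per-hand loop these two methods assume, where
get_action stores last_state/last_action and notify_reward then completes the
SARS transition via add_sars. The env-style reset/step helpers here are
hypothetical, not part of the project:

def play_hand(agent, env):
    infoset = env.reset()  # hypothetical: deal and return the first infoset
    while True:
        action = agent.get_action(infoset)  # records last_state/last_action
        infoset, reward, is_terminal = env.step(action)  # hypothetical
        agent.notify_reward(next_infoset=infoset,
                            reward=reward,
                            is_terminal=is_terminal)
        if is_terminal:
            break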
Example #7
    def test_aggressive_action_prob_q(self):
        # nu=1.1 guarantees random.random() < nu, forcing the RL branch
        self.sut = KuhnPoker.NFSP.Agent.NfspAgent(self.mock_q_policy,
                                                  self.mock_supervised_trainer,
                                                  nu=1.1)

        self.sut.kuhn_rl_policy.get_action = MagicMock(return_value=1)
        self.sut.supervised_trainer.add_observation = MagicMock()

        infoset = KuhnInfoset(card=1, bet_sequence=(1, ))
        infoset_state = infoset_to_state(infoset)

        retval = self.sut.aggressive_action_prob(infoset)

        self.assertEqual(1, retval)
        self.assertEqual(infoset_state.tolist(), self.sut.last_state.tolist())

        self.sut.kuhn_rl_policy.get_action.assert_called_with(infoset)

        self.assertEqual(
            self.sut.supervised_trainer.add_observation.call_args[0]
            [0].tolist(), infoset_state.tolist())
        self.assertEqual(
            self.sut.supervised_trainer.add_observation.call_args[0][1], 1)
Example #8
    def test_game_start_jack(self):
        infoset = KuhnInfoset(0, ())
        state = infoset_to_state(infoset)
        self.assertEqual([1, 0, 0, 0, 0, 0, 0], state.tolist())
Example #9
    def test_game_p0_check_p1_bet_queen(self):
        infoset = KuhnInfoset(1, (0, 1))
        state = infoset_to_state(infoset)
        self.assertEqual([0, 1, 0, 1, 0, 0, 1], state.tolist())
Example #10
    def test_game_p0_check_king(self):
        infoset = KuhnInfoset(2, (0, ))
        state = infoset_to_state(infoset)
        self.assertEqual([0, 0, 1, 1, 0, 0, 0], state.tolist())
Example #11
    def test_game_start_queen(self):
        infoset = KuhnInfoset(1, ())
        state = infoset_to_state(infoset)
        self.assertEqual([0, 1, 0, 0, 0, 0, 0], state.tolist())
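Note: together, the four tests above pin down the 7-element state encoding:
the first three entries one-hot the card (jack, queen, king) and each action
in the bet sequence gets its own (check, bet) one-hot pair. A minimal sketch
of an infoset_to_state consistent with those assertions (the project's actual
implementation may differ):

import numpy as np

def infoset_to_state(infoset):
    # 3 card slots + one (check, bet) pair per betting round
    state = np.zeros(7, dtype=np.float32)
    state[infoset.card] = 1  # jack=0, queen=1, king=2
    for i, action in enumerate(infoset.bet_sequence):
        # slots 3-4 encode the first action, slots 5-6 the second
        state[3 + 2 * i + action] = 1
    return state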
Example #12
    def aggressive_action_prob(self, infoset: KuhnPokerGame.KuhnInfoset):
        state = infoset_to_state(infoset)
        state = torch.from_numpy(np.array(state)).float().to(device)
        retval = self.nn_policy.forward(state)
        # Detach before converting the single-element output to a float
        return retval.cpu().detach().numpy()[0]
Example #13
    def get_action(self, infoset: KuhnInfoset):
        state = infoset_to_state(infoset)
        # greedy=False lets the Q-policy explore rather than always argmax
        q_policy_action = self.q_policy.act(state, greedy=False)
        return q_policy_action