def test_notify_reward(self):
    """notify_reward must forward the buffered SARS tuple to the Q-policy
    using keyword arguments only."""
    self.sut = KuhnPoker.NFSP.Agent.NfspAgent(
        self.mock_q_policy, self.mock_supervised_trainer, nu=0)
    # Force a deterministic aggressive action so the agent records a
    # (last_state, last_action) pair for the upcoming reward notification.
    self.sut.kuhn_supervised_policy.aggressive_action_prob = MagicMock(
        return_value=1)
    infoset = KuhnInfoset(card=1, bet_sequence=(0, ))
    expected_state = infoset_to_state(infoset)
    self.sut.get_action(infoset)

    self.mock_q_policy.add_sars = MagicMock()
    next_infoset = KuhnInfoset(card=1, bet_sequence=(0, 1))
    expected_next_state = infoset_to_state(next_infoset)
    self.sut.notify_reward(next_infoset=next_infoset, reward=123,
                           is_terminal=True)

    # call_args[0] are the positional args — there must be none.
    self.assertEqual(tuple(), self.mock_q_policy.add_sars.call_args[0])
    kwargs = self.mock_q_policy.add_sars.call_args[1]
    self.assertEqual(expected_state.tolist(), kwargs["state"].tolist())
    self.assertEqual(1, kwargs["action"])
    self.assertEqual(123, kwargs["reward"])
    self.assertEqual(expected_next_state.tolist(),
                     kwargs["next_state"].tolist())
    self.assertEqual(True, kwargs["is_terminal"])
def aggressive_action_prob(self, infoset: KuhnInfoset):
    """Return the network's scalar output for this infoset.

    The infoset is encoded to a state vector, batched (unsqueeze) and run
    through ``self.network``; the single scalar at [0][0] is returned as a
    plain Python float-like numpy scalar.

    Fix: the original chained ``.cpu().detach()`` twice — once on the
    network output and again before ``.numpy()``. One detach/cpu is enough.
    """
    state = infoset_to_state(infoset)
    state = torch.from_numpy(
        np.array(state)).float().unsqueeze(0).to(device)
    nn_retval = self.network.forward(state).cpu().detach()
    return nn_retval.numpy()[0][0]
def collect_trajectories(policy: Policies.Policy, num_games: int):
    """Play ``num_games`` of Kuhn poker between a fixed Nash policy
    (seated as player 0) and ``policy``, recording the learning player's
    transitions.

    Returns a list of two PlayerTrajectories; only the non-nash seat's
    trajectories receive transitions.

    Fix: the original called ``infoset_to_state(infoset)`` twice per
    non-nash decision (once for the tensor, once for the stored state);
    the encoding is now computed once and reused.
    """
    nash_policy = Policies.NashPolicy(0)
    nash_player = 0
    player_trajectories = [PlayerTrajectories(), PlayerTrajectories()]
    for _ in range(num_games):
        game = KuhnPokerGame.KuhnPokerGame()
        while not game.game_state.is_terminal:
            player_to_act = game.game_state.player_to_act
            infoset = game.game_state.infosets[player_to_act]
            if player_to_act == nash_player:
                action = nash_policy.get_action(infoset)
            else:
                # Encode once; reused below when storing the transition.
                state = infoset_to_state(infoset)
                state_tensor = torch.from_numpy(
                    np.array(state)).float().to(device)
                aggressive_action_prob = policy.forward(
                    state_tensor).cpu().detach()
                # Manually calculate the action so we don't have to
                # re-evaluate the infoset.
                action = int(
                    random.random() < aggressive_action_prob.numpy()[0])
            game.game_state.bet_sequence = (
                game.game_state.bet_sequence + (action, ))
            if game.game_state.is_terminal:
                game_rewards = game.game_state.get_payoffs()
            else:
                game_rewards = 0, 0
            if player_to_act != nash_player:
                # `state`/`aggressive_action_prob` are set above because
                # this branch only runs for the non-nash player.
                player_trajectories[player_to_act].add_transition(
                    state, action, aggressive_action_prob,
                    game_rewards[player_to_act])
            if game.game_state.is_terminal:
                other_player = (player_to_act + 1) % 2
                if other_player != nash_player:
                    player_trajectories[other_player].amend_last_reward(
                        game_rewards[other_player])
        # One trajectory per game for the learning (non-nash) seat.
        player_trajectories[(nash_player + 1) % 2].complete_trajectory()
    return player_trajectories
def log_qvals(writer: SummaryWriter, policy: QPolicy, global_step: int):
    """Log the Q-value gap (bet minus check) at each Kuhn decision node,
    for each of the three cards, to TensorBoard.

    Fix: the original repeated the encode/forward/log sequence four times
    per card; the nodes are now data-driven.
    """
    # (bet_sequence, node label) for every decision point in Kuhn poker.
    nodes = (
        ((), "p0_open"),
        ((0, ), "p0_check/p1"),
        ((0, 1), "p0_check/p1_bet/p0"),
        ((1, ), "p0_bet/p1"),
    )
    infoset = KuhnInfoset(0, ())
    for card in range(3):
        infoset.card = card
        for bet_sequence, label in nodes:
            infoset.bet_sequence = bet_sequence
            state = torch.from_numpy(
                infoset_to_state(infoset)).float().unsqueeze(0).to(device)
            q_vals = policy.qnetwork_local.forward(state).cpu().numpy()[0]
            node_name = "q_vals/%s/%s" % (card_to_str(card), label)
            # Positive gap => betting is valued above checking here.
            writer.add_scalar(node_name, q_vals[1] - q_vals[0],
                              global_step=global_step)
def aggressive_action_prob(self, infoset: KuhnInfoset):
    """Mix the RL (Q) policy and the supervised average policy.

    With probability ``self.nu`` the Q-policy picks the action and the
    (state, action) pair is fed to the supervised trainer; otherwise the
    supervised policy's aggressive-action probability is returned.
    The encoded state is always remembered in ``self.last_state``.
    """
    state = infoset_to_state(infoset)
    if random.random() < self.nu:
        # Best-response branch: act via Q-policy and record the choice
        # as a training observation for the average policy.
        retval = self.kuhn_rl_policy.get_action(infoset)
        self.supervised_trainer.add_observation(state, retval)
    else:
        retval = self.kuhn_supervised_policy.aggressive_action_prob(infoset)
    self.last_state = state
    return retval
def notify_reward(self, next_infoset: Optional[KuhnInfoset], reward: float,
                  is_terminal: bool):
    """Record the reward for the agent's last action as a SARS transition
    in the Q-policy's replay buffer.

    No-op (after asserting a zero reward) when the agent has not acted yet.
    """
    if self.last_action is None:
        # The agent never acted this hand — nothing to attribute reward to.
        assert reward == 0
        return
    if next_infoset is None:
        assert is_terminal
    assert self.last_state is not None
    assert self.last_action is not None
    # NOTE(review): if next_infoset is None (terminal with no successor
    # infoset), infoset_to_state(None) is still called below — verify
    # infoset_to_state handles None, or that callers always pass an
    # infoset at terminal states (as the unit test does).
    next_state = infoset_to_state(next_infoset)
    self.q_policy.add_sars(
        state=self.last_state,
        action=self.last_action,
        reward=reward,
        next_state=next_state,
        is_terminal=is_terminal)
def test_aggressive_action_prob_q(self):
    """With nu > 1 the agent always takes the RL (Q-policy) branch and
    logs the chosen action with the supervised trainer."""
    self.sut = KuhnPoker.NFSP.Agent.NfspAgent(
        self.mock_q_policy, self.mock_supervised_trainer, nu=1.1)
    self.sut.kuhn_rl_policy.get_action = MagicMock(return_value=1)
    self.sut.supervised_trainer.add_observation = MagicMock()

    infoset = KuhnInfoset(card=1, bet_sequence=(1, ))
    expected_state = infoset_to_state(infoset)

    result = self.sut.aggressive_action_prob(infoset)

    self.assertEqual(1, result)
    self.assertEqual(expected_state.tolist(), self.sut.last_state.tolist())
    self.sut.kuhn_rl_policy.get_action.assert_called_with(infoset)
    observed = self.sut.supervised_trainer.add_observation.call_args[0]
    self.assertEqual(expected_state.tolist(), observed[0].tolist())
    self.assertEqual(1, observed[1])
def test_game_start_jack(self):
    """Jack (card 0) at the opening node: only the first card bit is set."""
    encoded = infoset_to_state(KuhnInfoset(0, ()))
    self.assertEqual([1, 0, 0, 0, 0, 0, 0], encoded.tolist())
def test_game_p0_check_p1_bet_queen(self):
    """Queen after check-then-bet: queen bit plus both betting-round bits."""
    encoded = infoset_to_state(KuhnInfoset(1, (0, 1)))
    self.assertEqual([0, 1, 0, 1, 0, 0, 1], encoded.tolist())
def test_game_p0_check_king(self):
    """King after player 0 checks: king bit plus first action-taken bit."""
    encoded = infoset_to_state(KuhnInfoset(2, (0, )))
    self.assertEqual([0, 0, 1, 1, 0, 0, 0], encoded.tolist())
def test_game_start_queen(self):
    """Queen (card 1) at the opening node: only the queen bit is set."""
    encoded = infoset_to_state(KuhnInfoset(1, ()))
    self.assertEqual([0, 1, 0, 0, 0, 0, 0], encoded.tolist())
def aggressive_action_prob(self, infoset: KuhnPokerGame.KuhnInfoset):
    """Evaluate the policy network on the infoset's state encoding and
    return its first output element as a numpy scalar."""
    encoded = infoset_to_state(infoset)
    net_input = torch.from_numpy(np.array(encoded)).float().to(device)
    net_output = self.nn_policy.forward(net_input)
    return net_output.cpu().detach().numpy()[0]
def get_action(self, infoset: KuhnInfoset):
    """Sample a (non-greedy, i.e. exploring) action from the Q-policy
    for this infoset's encoded state."""
    return self.q_policy.act(infoset_to_state(infoset), greedy=False)