def test_encoder_encode(card_phase_observation):
    encoder = CardPhaseObservationEncoder()
    output = encoder.encode(card_phase_observation)

    assert len(output) == 78
    assert np.min(output) == 0
    assert np.max(output) == 1
    assert np.sum(output) == 18
def test_play_episode():
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    round_1_output = play_episode(encoder)
    encoder.episode_done()
    round_2_output = play_episode(encoder)

    assert len(set(map(lambda entry: entry.mdp_id, round_1_output))) == 4
    assert len(round_1_output) == 72
    assert len(set(map(lambda entry: entry.mdp_id,
                       round_1_output + round_2_output))) == 8
def test_play_episode_card_phase_agent():
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    agent = AllPhaseAgent(
        card_phase_agent=CardPhaseAgent(Linear(len(CARDS), len(CARDS))))
    round_1_output = play_episode(encoder, agent=agent)
    encoder.episode_done()
    round_2_output = play_episode(encoder, agent=agent)

    assert len(set(map(lambda entry: entry.mdp_id, round_1_output))) == 4
    assert len(round_1_output) == 72
    assert len(set(map(lambda entry: entry.mdp_id,
                       round_1_output + round_2_output))) == 8
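# NOTE: `play_episode` is defined elsewhere in the project. As an illustration
# only, reverse-engineered from the assertions above (4 players times 18
# tricks yields 72 rows per round, one mdp_id per player per round), it
# behaves roughly like:
#
#     def play_episode(encoder, agent=None):
#         rows = []
#         for _ in range(18):  # one trick per iteration
#             for player_position_towards_taker in range(4):
#                 observation, action, reward = ...  # step the environment
#                 rows.append(encoder.encode(
#                     player_position_towards_taker, observation,
#                     action, reward))
#         return rows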
def test_encode_2_episodes(card_phase_observation, action, reward):
    player_position_towards_taker = 0
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    output_a = encoder.encode(player_position_towards_taker,
                              card_phase_observation, action, reward)
    encoder.episode_done()
    output_b = encoder.encode(player_position_towards_taker,
                              card_phase_observation, action, reward)

    assert output_a.mdp_id != output_b.mdp_id
class CardPhaseAgent(Agent):
    def __init__(self, policy_net: nn.Module):
        self.policy_net = policy_net
        self._initialize_internals()
        self._random_state = RandomState(seed=1988)
        self._card_phase_observation_encoder = CardPhaseObservationEncoder()

    def update_policy_net(self, policy_net_state_dict: Dict):
        self.policy_net.load_state_dict(policy_net_state_dict)

    def get_action(self, observation: CardPhaseObservation) -> ActionWithProbability:
        self._step += 1
        if not self._random_action_policy.should_play_randomly(self._step):
            # Exploit: sample from the policy network's distribution over
            # the currently allowed cards.
            self.policy_net.eval()
            with torch.no_grad():
                action_with_probability = self.max_return_action(observation)
        else:
            # Explore: pick uniformly among the allowed cards.
            action_with_probability = self.random_action(observation)
        return action_with_probability

    def max_return_action(
            self, observation: CardPhaseObservation) -> ActionWithProbability:
        indices = retrieve_allowed_card_indices(observation)
        encoded = tensor(
            self._card_phase_observation_encoder.encode(observation)).float()
        # Softmax only over the logits of the allowed cards, then sample an
        # action in proportion to those probabilities.
        probabilities = torch.softmax(self.policy_net(encoded)[indices], dim=0)
        action = self._random_state.choice(indices,
                                           p=probabilities.detach().numpy())
        action_probability = probabilities[indices.index(
            action)].detach().numpy().item()
        return ActionWithProbability(action=action,
                                     probability=action_probability)

    def random_action(
            self, observation: CardPhaseObservation) -> ActionWithProbability:
        indices = retrieve_allowed_card_indices(observation)
        return ActionWithProbability(
            action=self._random_state.choice(indices),
            probability=1. / float(len(indices)))

    @property
    def device(self) -> str:
        return "cuda" if next(self.policy_net.parameters()).is_cuda else "cpu"

    def _initialize_internals(self):
        self._step = 0
        self._random_action_policy = Policy()
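# Minimal usage sketch for CardPhaseAgent. The `observation` below is assumed
# to be a CardPhaseObservation produced by the environment (not constructed
# here), and `trained_state_dict` is a hypothetical state dict from training;
# the Linear policy net maps the 78-dimensional card encoding to one logit
# per card, matching the tests above:
#
#     agent = CardPhaseAgent(policy_net=Linear(len(CARDS), len(CARDS)))
#     action_with_probability = agent.get_action(observation)
#     agent.update_policy_net(trained_state_dict)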
def test_encoder(card_phase_observation, action, reward,
                 state_feature_expected_output):
    player_position_towards_taker = 0
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    output = encoder.encode(player_position_towards_taker,
                            card_phase_observation, action, reward)
    later_output = encoder.encode(player_position_towards_taker,
                                  card_phase_observation, action, reward)

    assert output.mdp_id == "0_0"
    assert later_output.sequence_number > output.sequence_number
    assert output.state_features == state_feature_expected_output
    assert all(
        map(lambda x: isinstance(x, float), output.state_features.values()))
    assert output.action == 28
    assert output.reward == reward
    assert output.possible_actions == [
        2, 5, 6, 13, 18, 25, 26, 28, 30, 36, 42, 47, 51, 59, 66, 68, 70, 77
    ]
    assert isinstance(output.action_probability, float)
    assert isinstance(output.dictionary, dict)
    assert isinstance(output.dictionary["state_features"], dict)
def play_episodes() -> Generator[List[ReAgentDataRow], None, None]:
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    while True:
        yield play_episode(encoder)
        encoder.episode_done()
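# Usage sketch: `play_episodes` yields forever, so cap the stream when
# collecting training data. `islice` is from the standard library; the helper
# name `collect_episodes` is ours, not part of the project.
from itertools import islice


def collect_episodes(n_episodes: int) -> List[List[ReAgentDataRow]]:
    # Each inner list holds the ReAgentDataRow entries of one full episode.
    return list(islice(play_episodes(), n_episodes))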