def test_encoder_encode(card_phase_observation):
    encoder = CardPhaseObservationEncoder()
    output = encoder.encode(card_phase_observation)
    assert len(output) == 78
    assert np.min(output) == 0
    assert np.max(output) == 1
    assert np.sum(output) == 18
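The assertions above (a 78-long binary vector summing to 18) are consistent with a multi-hot encoding of the player's 18-card starting hand over the 78-card tarot deck. A minimal sketch of such an encoder, assuming the observation exposes the hand as a collection of card indices; the class and attribute names below are illustrative, not the project's API:

import numpy as np


class MultiHotHandEncoder:
    """Illustrative stand-in for CardPhaseObservationEncoder."""

    N_CARDS = 78  # size of the French tarot deck

    def encode(self, observation) -> np.ndarray:
        output = np.zeros(self.N_CARDS, dtype=np.float32)
        # `observation.hand` is assumed to hold the indices of the cards in hand.
        output[list(observation.hand)] = 1.0
        return output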
Example No. 2
def test_play_episode():
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    round_1_output = play_episode(encoder)
    encoder.episode_done()
    round_2_output = play_episode(encoder)

    assert len(set(map(lambda entry: entry.mdp_id, round_1_output))) == 4
    assert len(round_1_output) == 72
    assert len(set(map(lambda entry: entry.mdp_id, round_1_output + round_2_output))) == 8
Example No. 3
def test_play_episode_card_phase_agent():
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    agent = AllPhaseAgent(card_phase_agent=CardPhaseAgent(Linear(len(CARDS), len(CARDS))))
    round_1_output = play_episode(encoder, agent=agent)
    encoder.episode_done()
    round_2_output = play_episode(encoder, agent=agent)

    assert len(set(map(lambda entry: entry.mdp_id, round_1_output))) == 4
    assert len(round_1_output) == 72
    assert len(set(map(lambda entry: entry.mdp_id, round_1_output + round_2_output))) == 8
Example No. 4
def test_encode_2_episodes(card_phase_observation, action, reward):
    player_position_towards_taker = 0
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())

    output_a = encoder.encode(player_position_towards_taker,
                              card_phase_observation, action, reward)
    encoder.episode_done()
    output_b = encoder.encode(player_position_towards_taker,
                              card_phase_observation, action, reward)

    assert output_a.mdp_id != output_b.mdp_id
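This test only requires that episode_done() changes the mdp_id handed out by subsequent encode calls. A minimal sketch of that bookkeeping, assuming an mdp_id of the form "<player position>_<episode counter>" as suggested by the "0_0" value asserted in Example No. 6 below; the class is illustrative, not the actual CardPhaseStateActionEncoder internals:

class EpisodeIdTracker:
    """Illustrative episode/step bookkeeping, not the real encoder internals."""

    def __init__(self):
        self._episode = 0
        self._sequence_number = 0

    def next_ids(self, player_position_towards_taker: int):
        mdp_id = f"{player_position_towards_taker}_{self._episode}"
        sequence_number = self._sequence_number
        self._sequence_number += 1
        return mdp_id, sequence_number

    def episode_done(self):
        # Starting a new episode changes every subsequent mdp_id.
        self._episode += 1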
class CardPhaseAgent(Agent):
    def __init__(self, policy_net: nn.Module):
        self.policy_net = policy_net
        self._initialize_internals()
        self._random_state = RandomState(seed=1988)
        self._card_phase_observation_encoder = CardPhaseObservationEncoder()

    def update_policy_net(self, policy_net_state_dict: Dict):
        self.policy_net.load_state_dict(policy_net_state_dict)

    def get_action(self,
                   observation: CardPhaseObservation) -> ActionWithProbability:
        self._step += 1
        # Exploit the policy net unless the exploration schedule says to act randomly.
        if not self._random_action_policy.should_play_randomly(self._step):
            self.policy_net.eval()
            with torch.no_grad():
                action_with_probability = self.max_return_action(observation)
        else:
            action_with_probability = self.random_action(observation)
        return action_with_probability

    def max_return_action(
            self, observation: CardPhaseObservation) -> ActionWithProbability:
        # Restrict the policy net's output to the cards the player is allowed to play,
        # then sample an action from the resulting softmax distribution.
        indices = retrieve_allowed_card_indices(observation)
        encode = tensor(
            self._card_phase_observation_encoder.encode(observation)).float()
        probabilities = torch.softmax(self.policy_net(encode)[indices], dim=0)
        action = self._random_state.choice(indices,
                                           p=probabilities.detach().numpy())
        action_probability = probabilities[indices.index(
            action)].detach().numpy().item()
        return ActionWithProbability(action=action,
                                     probability=action_probability)

    def random_action(
            self, observation: CardPhaseObservation) -> ActionWithProbability:
        # Uniform choice among the allowed cards.
        indices = retrieve_allowed_card_indices(observation)
        return ActionWithProbability(action=self._random_state.choice(indices),
                                     probability=1. / float(len(indices)))

    @property
    def device(self) -> str:
        return "cuda" if next(self.policy_net.parameters()).is_cuda else "cpu"

    def _initialize_internals(self):
        self._step = 0
        self._random_action_policy = Policy()
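CardPhaseAgent delegates the explore/exploit decision to the Policy object created in _initialize_internals via should_play_randomly(step). That class is not shown here; one plausible version is an exponentially decaying exploration rate, sketched below (the class name and decay constants are assumptions, not the project's values):

import math

from numpy.random import RandomState


class DecayingRandomActionPolicy:
    """Illustrative stand-in for Policy: explore with a probability that decays over steps."""

    def __init__(self, start: float = 1.0, end: float = 0.05,
                 decay: float = 2000.0, seed: int = 1988):
        self._start = start
        self._end = end
        self._decay = decay
        self._random_state = RandomState(seed=seed)

    def should_play_randomly(self, step: int) -> bool:
        epsilon = self._end + (self._start - self._end) * math.exp(-step / self._decay)
        return bool(self._random_state.uniform() < epsilon)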
Example No. 6
def test_encoder(card_phase_observation, action, reward,
                 state_feature_expected_output):
    player_position_towards_taker = 0

    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    output = encoder.encode(player_position_towards_taker,
                            card_phase_observation, action, reward)
    later_output = encoder.encode(player_position_towards_taker,
                                  card_phase_observation, action, reward)

    assert output.mdp_id == "0_0"
    assert later_output.sequence_number > output.sequence_number
    assert output.state_features == state_feature_expected_output
    assert all(
        map(lambda x: isinstance(x, float), output.state_features.values()))
    assert output.action == 28
    assert output.reward == reward
    assert output.possible_actions == [
        2, 5, 6, 13, 18, 25, 26, 28, 30, 36, 42, 47, 51, 59, 66, 68, 70, 77
    ]
    assert isinstance(output.action_probability, float)
    assert isinstance(output.dictionary, dict)
    assert isinstance(output.dictionary["state_features"], dict)
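The fields exercised by this test suggest a row shape roughly like the following; this is an illustrative dataclass inferred from the assertions, not ReAgent's actual ReAgentDataRow definition:

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class IllustrativeDataRow:
    """Rough shape of one logged transition, inferred from the assertions above."""
    mdp_id: str                       # "<player position>_<episode>", e.g. "0_0"
    sequence_number: int              # step index within the episode
    state_features: Dict[str, float]  # encoded observation, values asserted to be floats
    action: int                       # index of the card that was played
    reward: float
    possible_actions: List[int]       # indices of the cards the player was allowed to play
    action_probability: float

    @property
    def dictionary(self) -> dict:
        return {
            "mdp_id": self.mdp_id,
            "sequence_number": self.sequence_number,
            "state_features": self.state_features,
            "action": self.action,
            "reward": self.reward,
            "possible_actions": self.possible_actions,
            "action_probability": self.action_probability,
        }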
def play_episodes() -> Generator[List[ReAgentDataRow], None, None]:
    encoder = CardPhaseStateActionEncoder(CardPhaseObservationEncoder())
    while True:
        yield play_episode(encoder)
        encoder.episode_done()
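play_episodes yields rows forever, so callers are expected to bound the generator themselves, for example with itertools.islice (a usage sketch, not code from the project):

from itertools import islice

# Collect the rows produced by the first three episodes.
rows = [row for episode in islice(play_episodes(), 3) for row in episode]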