def get_next_state(state, player_id, action, hanabi_game):
    """Simulate applying the action in place and return the resulting QState and reward."""
    move = HanabiEnv.build_move_static(action)  # build a pyhanabi move object from the action description
    prev_state = state.copy()
    state.apply_move(move)
    observation = HanabiEnv.extract_dict_static(player_id,
                                                state.observation(player_id),
                                                state)
    reward = calculate_reward(prev_state, state, move, hanabi_game)
    return QState(observation), reward
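Usage note: a minimal sketch of calling get_next_state. The pyhanabi calls (new_initial_state, cur_player, deal_random_card) are standard hanabi_learning_environment API; the rl_env-style action dict and the project helpers (build_move_static, QState, calculate_reward) are assumptions about the surrounding codebase.

# Hypothetical usage sketch; `game` is assumed to be a pyhanabi HanabiGame.
state = game.new_initial_state()
while state.cur_player() < 0:  # resolve the initial chance (card-deal) events
    state.deal_random_card()
action = {'action_type': 'PLAY', 'card_index': 0}  # rl_env-style move dict (assumed format)
q_state, reward = get_next_state(state, state.cur_player(), action, game)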
Example #2
    def __init__(self,
                 colors: int = 5,
                 ranks: int = 5,
                 players: int = 2,
                 hand_size: int = 2,
                 max_information_tokens: int = 8,
                 max_life_tokens: int = 3,
                 observation_type: int = 1,
                 seed=None,
                 random_start_player: bool = False,
                 ):

        """
        Parameter descriptions :
              - colors: int, Number of colors in [2,5].
              - ranks: int, Number of ranks in [2,5].
              - players: int, Number of players in [2,5].
              - hand_size: int, Hand size in [2,5].
              - max_information_tokens: int, Number of information tokens (>=0).
              - max_life_tokens: int, Number of life tokens (>=1).
              - observation_type: int.
                    0: Minimal observation.
                    1: First-order common knowledge observation.
              - seed: int, Random seed or None.
              - random_start_player: bool, Random start player.

        Common game configurations:
            Hanabi-Full (default) :  {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 8,
                "max_life_tokens": 3,
                "observation_type": 1,
                "hand_size": 2
                }

            Hanabi-Small : {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens":
                "max_life_tokens":
                "observation_type": 1}

            Hanabi-Very-Small : {
                "colors": 2,
                "ranks": 5,
                "players": 2,
                "max_information_tokens":
                "max_life_tokens":
                "observation_type": 1}

        """
        EzPickle.__init__(
            self,
            colors,
            ranks,
            players,
            hand_size,
            max_information_tokens,
            max_life_tokens,
            observation_type,
            seed,
            random_start_player,
        )

        seed = seeding.create_seed(seed, max_bytes=3)

        # Import hanabi_learning_environment; raise a helpful error if the PyPI package is missing.
        try:
            from hanabi_learning_environment.rl_env import HanabiEnv, make

        except ModuleNotFoundError:
            raise ImportError(
                "Hanabi is not installed.\n"
                "Run `pip3 install hanabi_learning_environment` from within your project environment.\n"
                "Consult hanabi/README.md for detailed information."
            )

        else:

            # Check that all config values are within their allowed ranges.
            self._raise_error_if_config_values_out_of_range(colors,
                                                            ranks,
                                                            players,
                                                            hand_size,
                                                            max_information_tokens,
                                                            max_life_tokens,
                                                            observation_type,
                                                            random_start_player)

            self.hanabi_env: HanabiEnv = HanabiEnv(config={'colors': colors,
                                                           'ranks': ranks,
                                                           'players': players,
                                                           'hand_size': hand_size,
                                                           'max_information_tokens': max_information_tokens,
                                                           'max_life_tokens': max_life_tokens,
                                                           'observation_type': observation_type,
                                                           'random_start_player': random_start_player,
                                                           'seed': seed})

            # List of agent names
            self.agents = ["player_{}".format(i) for i in range(self.hanabi_env.players)]

            self.agent_selection: str

            # Sets hanabi game to clean state and updates all internal dictionaries
            self.reset(observe=False)

            # Set action_spaces and observation_spaces based on params in hanabi_env
            self.action_spaces = {name: spaces.Discrete(self.hanabi_env.num_moves()) for name in self.agents}
            self.observation_spaces = {player_name: spaces.Box(low=0,
                                                               high=1,
                                                               shape=(self.hanabi_env.vectorized_observation_shape()[0],),
                                                               dtype=np.float32)
                                       for player_name in self.agents}
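Usage note: in this variant each observation space is a flat Box over the vectorized observation, and there is no action mask. A quick sanity-check sketch, assuming the class above is instantiated as env:

# Sketch; `env` is assumed to be an instance of the class above.
space = env.observation_spaces['player_0']
assert space.shape == (env.hanabi_env.vectorized_observation_shape()[0],)
assert env.action_spaces['player_0'].n == env.hanabi_env.num_moves()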
Example #3
    def __init__(
        self,
        colors: int = 5,
        ranks: int = 5,
        players: int = 2,
        hand_size: int = 5,
        max_information_tokens: int = 8,
        max_life_tokens: int = 3,
        observation_type: int = 1,
        random_start_player: bool = False,
    ):
        """
        Parameter descriptions :
              - colors: int, Number of colors in [2,5].
              - ranks: int, Number of ranks in [2,5].
              - players: int, Number of players in [2,5].
              - hand_size: int, Hand size in [2,5].
              - max_information_tokens: int, Number of information tokens (>=0).
              - max_life_tokens: int, Number of life tokens (>=1).
              - observation_type: int.
                    0: Minimal observation.
                    1: First-order common knowledge observation.
              - random_start_player: bool, Random start player.

        Common game configurations:
            Hanabi-Full (default) :  {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 8,
                "max_life_tokens": 3,
                "hand_size": (4 if players >= 4 else 5),
                "observation_type": 1
                }

            Hanabi-Small : {
                "colors": 2,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 3,
                "hand_size": 2,
                "max_life_tokens": 1,
                "observation_type": 1}

            Hanabi-Very-Small : {
                "colors": 1,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 3,
                "hand_size": 2,
                "max_life_tokens": 1,
                "observation_type": 1}

        """
        EzPickle.__init__(
            self,
            colors,
            ranks,
            players,
            hand_size,
            max_information_tokens,
            max_life_tokens,
            observation_type,
            random_start_player,
        )

        # Check that all config values are within their allowed ranges.
        self._raise_error_if_config_values_out_of_range(
            colors, ranks, players, hand_size, max_information_tokens,
            max_life_tokens, observation_type, random_start_player)

        self._config = {
            'colors': colors,
            'ranks': ranks,
            'players': players,
            'hand_size': hand_size,
            'max_information_tokens': max_information_tokens,
            'max_life_tokens': max_life_tokens,
            'observation_type': observation_type,
            'random_start_player': random_start_player,
        }
        self.hanabi_env: HanabiEnv = HanabiEnv(config=self._config)

        # List of agent names
        self.agents = [f"player_{i}" for i in range(self.hanabi_env.players)]
        self.possible_agents = self.agents[:]

        self.agent_selection: str

        # Sets hanabi game to clean state and updates all internal dictionaries
        self.reset()

        # Set action_spaces and observation_spaces based on params in hanabi_env
        self.action_spaces = {
            name: spaces.Discrete(self.hanabi_env.num_moves())
            for name in self.agents
        }
        self.observation_spaces = {
            player_name: spaces.Dict({
                'observation': spaces.Box(
                    low=0,
                    high=1,
                    shape=(self.hanabi_env.vectorized_observation_shape()[0],),
                    dtype=np.float32),
                'action_mask': spaces.Box(
                    low=0,
                    high=1,
                    shape=(self.hanabi_env.num_moves(),),
                    dtype=np.int8)
            })
            for player_name in self.agents
        }
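Usage note: here each observation is a Dict pairing the vectorized observation with an action mask over move ids. A short consumer-side sketch, assuming env is an instance of the full class (which exposes the observe() endpoint shown in the next example):

import numpy as np

# Sketch; `env` is assumed to be an instance of the full environment class.
obs = env.observe('player_0')
assert set(obs) == {'observation', 'action_mask'}
legal_ids = np.flatnonzero(obs['action_mask'])  # move ids accepted by step()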
Example #4
class raw_env(AECEnv, EzPickle):
    """This class capsules endpoints provided within deepmind/hanabi-learning-environment/rl_env.py."""

    metadata = {
        "render_modes": ["human"],
        "name": "hanabi_v4",
        "is_parallelizable": False,
        "render_fps": 2,
    }

    # set of all required params
    required_keys: set = {
        'colors',
        'ranks',
        'players',
        'hand_size',
        'max_information_tokens',
        'max_life_tokens',
        'observation_type',
        'random_start_player',
    }

    def __init__(
        self,
        colors: int = 5,
        ranks: int = 5,
        players: int = 2,
        hand_size: int = 5,
        max_information_tokens: int = 8,
        max_life_tokens: int = 3,
        observation_type: int = 1,
        random_start_player: bool = False,
    ):
        """
        Parameter descriptions :
              - colors: int, Number of colors in [2,5].
              - ranks: int, Number of ranks in [2,5].
              - players: int, Number of players in [2,5].
              - hand_size: int, Hand size in [2,5].
              - max_information_tokens: int, Number of information tokens (>=0).
              - max_life_tokens: int, Number of life tokens (>=1).
              - observation_type: int.
                    0: Minimal observation.
                    1: First-order common knowledge observation.
              - random_start_player: bool, Random start player.

        Common game configurations:
            Hanabi-Full (default) :  {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 8,
                "max_life_tokens": 3,
                "hand_size": (4 if players >= 4 else 5),
                "observation_type": 1
                }

            Hanabi-Small : {
                "colors": 2,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 3,
                "hand_size": 2,
                "max_life_tokens": 1,
                "observation_type": 1}

            Hanabi-Very-Small : {
                "colors": 1,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 3,
                "hand_size": 2,
                "max_life_tokens": 1,
                "observation_type": 1}

        """
        EzPickle.__init__(
            self,
            colors,
            ranks,
            players,
            hand_size,
            max_information_tokens,
            max_life_tokens,
            observation_type,
            random_start_player,
        )

        # Check that all config values are within their allowed ranges.
        self._raise_error_if_config_values_out_of_range(
            colors, ranks, players, hand_size, max_information_tokens,
            max_life_tokens, observation_type, random_start_player)

        self._config = {
            'colors': colors,
            'ranks': ranks,
            'players': players,
            'hand_size': hand_size,
            'max_information_tokens': max_information_tokens,
            'max_life_tokens': max_life_tokens,
            'observation_type': observation_type,
            'random_start_player': random_start_player,
        }
        self.hanabi_env: HanabiEnv = HanabiEnv(config=self._config)

        # List of agent names
        self.agents = [f"player_{i}" for i in range(self.hanabi_env.players)]
        self.possible_agents = self.agents[:]

        self.agent_selection: str

        # Sets hanabi game to clean state and updates all internal dictionaries
        self.reset()

        # Set action_spaces and observation_spaces based on params in hanabi_env
        self.action_spaces = {
            name: spaces.Discrete(self.hanabi_env.num_moves())
            for name in self.agents
        }
        self.observation_spaces = {
            player_name: spaces.Dict({
                'observation': spaces.Box(
                    low=0,
                    high=1,
                    shape=(self.hanabi_env.vectorized_observation_shape()[0],),
                    dtype=np.float32),
                'action_mask': spaces.Box(
                    low=0,
                    high=1,
                    shape=(self.hanabi_env.num_moves(),),
                    dtype=np.int8)
            })
            for player_name in self.agents
        }

    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def action_space(self, agent):
        return self.action_spaces[agent]

    def seed(self, seed=None):
        config = dict(seed=seed, **self._config)
        self.hanabi_env = HanabiEnv(config=config)

    @staticmethod
    def _raise_error_if_config_values_out_of_range(
            colors, ranks, players, hand_size, max_information_tokens,
            max_life_tokens, observation_type, random_start_player):

        if not (2 <= colors <= 5):
            raise ValueError(
                f'Config parameter colors: {colors} is out of bounds. See description in hanabi.py.'
            )

        elif not (2 <= ranks <= 5):
            raise ValueError(
                f'Config parameter ranks: {ranks} is out of bounds. See description in hanabi.py.'
            )

        elif not (2 <= players <= 5):
            raise ValueError(
                f'Config parameter players: {players} is out of bounds. See description in hanabi.py.'
            )

        elif not (players <= colors):
            raise ValueError(
                f'Config parameter colors: {colors} is smaller than players: {players}, which is not allowed. See description in hanabi.py.'
            )

        elif not (2 <= hand_size <= 5):
            raise ValueError(
                f'Config parameter hand_size: {hand_size} is out of bounds. See description in hanabi.py.'
            )

        elif not (0 <= max_information_tokens):
            raise ValueError(
                f'Config parameter max_information_tokens: {max_information_tokens} is out of bounds. See description in hanabi.py.'
            )

        elif not (1 <= max_life_tokens):
            raise ValueError(
                f'Config parameter max_life_tokens: {max_life_tokens} is out of bounds. See description in hanabi.py.'
            )

        elif not (0 <= observation_type <= 1):
            raise ValueError(
                f'Config parameter observation_type: {observation_type} is out of bounds. See description in hanabi.py.'
            )

    @property
    def observation_vector_dim(self):
        return self.hanabi_env.vectorized_observation_shape()

    @property
    def legal_moves(self) -> List[int]:
        return self.infos[self.agent_selection]['legal_moves']

    @property
    def all_moves(self) -> List[int]:
        return list(range(0, self.hanabi_env.num_moves()))

    def reset(self, seed=None):
        """Resets the environment for a new game.

        Returns nothing; the observation of the agent at turn is available
        afterwards via observe() and the self.infos dict.
        """

        if seed is not None:
            self.seed(seed=seed)

        self.agents = self.possible_agents[:]
        # Reset underlying hanabi reinforcement learning environment
        obs = self.hanabi_env.reset()

        # Reset agent and agent_selection
        self._reset_agents(player_number=obs['current_player'])

        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {name: 0 for name in self.agents}
        # Reset internal state
        self._process_latest_observations(obs=obs)

    def _reset_agents(self, player_number: int):
        """Rearranges self.agents, as pyhanabi may start a different player after each reset()."""

        # Rotate self.agents until its first entry matches the starting player_number
        while not self.agents[0] == 'player_' + str(player_number):
            self.agents = self.agents[1:] + [self.agents[0]]

        # Agent order list on which the agent selector operates
        self._agent_selector = agent_selector(self.agents)

        # Reset agent_selection
        self.agent_selection = self._agent_selector.reset()

    def _step_agents(self):
        self.agent_selection = self._agent_selector.next()

    def step(self, action: int) -> None:
        """ Advances the environment by one step. Action must be within self.legal_moves, otherwise a ValueError is raised.

        Returns nothing; the new observation of the agent at turn is available afterwards via observe().
        """
        if self.dones[self.agent_selection]:
            return self._was_done_step(action)
        action = int(action)

        agent_on_turn = self.agent_selection

        if action not in self.legal_moves:
            raise ValueError(
                'Illegal action. Please choose between legal actions, as documented in dict self.infos'
            )

        else:
            # Iterate agent_selection
            self._step_agents()

            # Apply action
            all_observations, reward, done, _ = self.hanabi_env.step(
                action=action)

            # Update internal state
            self._process_latest_observations(obs=all_observations,
                                              reward=reward,
                                              done=done)

            # Reset the acting agent's cumulative reward to 0 before accumulating new rewards
            self._cumulative_rewards[agent_on_turn] = 0
            self._accumulate_rewards()

    def observe(self, agent_name: str):
        if agent_name in self.infos:
            observation = np.array(self.infos[agent_name]['observations_vectorized'], np.float32)
        else:
            # Fall back to an all-zero vector matching the Box inside the Dict space
            observation = np.zeros_like(self.observation_spaces[agent_name]['observation'].low)

        legal_moves = self.infos[agent_name]['legal_moves']
        action_mask = np.zeros(self.hanabi_env.num_moves(), 'int8')
        for i in legal_moves:
            action_mask[i] = 1

        return {'observation': observation, 'action_mask': action_mask}

    def _process_latest_observations(self,
                                     obs: Dict,
                                     reward: Optional[float] = 0,
                                     done: Optional[bool] = False):
        """Updates internal state"""

        self.latest_observations = obs
        self.rewards = {a: reward for a in self.agents}
        self.dones = {player_name: done for player_name in self.agents}

        # Each agent name has the form 'player_<i>'; use <i> to index into
        # the per-player observations.
        player_observations = self.latest_observations['player_observations']
        self.infos = {
            player_name: dict(
                legal_moves=player_observations[int(player_name[-1])]['legal_moves_as_int'],
                observations_vectorized=player_observations[int(player_name[-1])]['vectorized'],
            )
            for player_name in self.agents
        }

    def render(self, mode='human'):
        """Supports console printing only. Prints each player's view of the game."""
        player_data = self.latest_observations['player_observations']
        print("Active player:",
              self.possible_agents[player_data[0]['current_player_offset']])
        for i, d in enumerate(player_data):
            print(self.possible_agents[i])
            print("========")
            print(d['pyhanabi'])
            print()

    def close(self):
        pass
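Usage note: a minimal end-to-end sketch driving raw_env with a random legal-move policy. agent_iter comes from the AECEnv base class, and the env follows the older dones-based PettingZoo API; the policy itself is purely illustrative.

import numpy as np

env = raw_env(players=2)
env.reset(seed=42)
for agent in env.agent_iter():
    obs = env.observe(agent)
    if env.dones[agent]:
        env.step(None)  # agents that are already done must step with None
        continue
    legal = np.flatnonzero(obs['action_mask'])
    env.step(int(np.random.choice(legal)))  # pick a random legal move
env.close()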