Example No. 1
    def test_simple_game(self):
        game = pyspiel.load_efg_game(SIMPLE_EFG_DATA)
        env = rl_environment.Environment(game=game)
        agent = dqn.DQN(
            0,
            state_representation_size=game.information_state_tensor_shape()[0],
            num_actions=game.num_distinct_actions(),
            hidden_layers_sizes=[16],
            replay_buffer_capacity=100,
            batch_size=5,
            epsilon_start=0.02,
            epsilon_end=0.01)
        total_reward = 0

        for _ in range(100):
            time_step = env.reset()
            while not time_step.last():
                agent_output = agent.step(time_step)
                time_step = env.step([agent_output.action])
                total_reward += time_step.rewards[0]
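                # Episode is over: step the agent with the terminal time step so it records the final transition.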
            agent.step(time_step)
        self.assertGreaterEqual(total_reward, 75)
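These test snippets assume the standard OpenSpiel imports (import pyspiel, from open_spiel.python import rl_environment, from open_spiel.python.pytorch import dqn). Continuing the snippet above, the trained agent can also be run without exploration. A minimal sketch, assuming the PyTorch DQN's step() accepts the is_evaluation flag used by the OpenSpiel RL agents to act greedily and skip learning; num_eval_episodes is an illustrative value:

num_eval_episodes = 10
eval_reward = 0
for _ in range(num_eval_episodes):
    time_step = env.reset()
    while not time_step.last():
        # is_evaluation=True is assumed to pick the greedy action and skip replay-buffer updates.
        agent_output = agent.step(time_step, is_evaluation=True)
        time_step = env.step([agent_output.action])
        eval_reward += time_step.rewards[0]
print("Mean evaluation reward:", eval_reward / num_eval_episodes)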
Example No. 2
    def test_run_tic_tac_toe(self):
        env = rl_environment.Environment("tic_tac_toe")
        state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        agents = [
            dqn.DQN(  # pylint: disable=g-complex-comprehension
                player_id,
                state_representation_size=state_size,
                num_actions=num_actions,
                hidden_layers_sizes=[16],
                replay_buffer_capacity=10,
                batch_size=5) for player_id in [0, 1]
        ]
        time_step = env.reset()
        while not time_step.last():
            current_player = time_step.observations["current_player"]
            current_agent = agents[current_player]
            agent_output = current_agent.step(time_step)
            time_step = env.step([agent_output.action])

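        # Episode is over: step both agents with the terminal time step so they see the final rewards.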
        for agent in agents:
            agent.step(time_step)
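The single episode above can be wrapped in an outer loop to actually train the two agents. A minimal sketch, assuming the loss property that the NFSP agent reads from its inner DQN in Example No. 5 is also available here; num_episodes is an illustrative value:

num_episodes = 1000
for ep in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
        current_player = time_step.observations["current_player"]
        agent_output = agents[current_player].step(time_step)
        time_step = env.step([agent_output.action])
    # Terminal step for both agents so they record the final rewards.
    for agent in agents:
        agent.step(time_step)
    if (ep + 1) % 100 == 0:
        # loss is assumed to hold the most recent DQN training loss (None before the first update).
        print(ep + 1, [agent.loss for agent in agents])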
Example No. 3
    def test_run_hanabi(self):
        # Hanabi is an optional game, so check we have it before running the test.
        game = "hanabi"
        if game not in pyspiel.registered_names():
            return

        num_players = 3
        env_configs = {
            "players": num_players,
            "max_life_tokens": 1,
            "colors": 2,
            "ranks": 3,
            "hand_size": 2,
            "max_information_tokens": 3,
            "discount": 0.
        }
        env = rl_environment.Environment(game, **env_configs)
        state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        agents = [
            dqn.DQN(  # pylint: disable=g-complex-comprehension
                player_id,
                state_representation_size=state_size,
                num_actions=num_actions,
                hidden_layers_sizes=[16],
                replay_buffer_capacity=10,
                batch_size=5) for player_id in range(num_players)
        ]
        time_step = env.reset()
        while not time_step.last():
            current_player = time_step.observations["current_player"]
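            # Every agent observes the time step, but only the current player's action is applied to the environment.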
            agent_output = [agent.step(time_step) for agent in agents]
            time_step = env.step([agent_output[current_player].action])

        for agent in agents:
            agent.step(time_step)
Example No. 4
    def __init__(self,
                 game,
                 player_id,
                 state_size,
                 num_actions,
                 embedding_network_layers=(128, ),
                 embedding_size=16,
                 dqn_hidden_layers=(128, 128),
                 batch_size=16,
                 trajectory_len=10,
                 num_neighbours=5,
                 learning_rate=1e-4,
                 mixing_parameter=0.9,
                 memory_capacity=int(1e6),
                 discount_factor=1.0,
                 update_target_network_every=1000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_duration=int(1e4),
                 embedding_as_parametric_input=False):
        """Initialize the Ephemeral VAlue Adjustment algorithm.

    Args:
      game: (rl_environment.Environment) Open Spiel game.
      player_id: (int) Player id for this player.
      state_size: (int) Size of info state vector.
      num_actions: (int) number of actions.
      embedding_network_layers: (list[int]) Layer sizes of strategy net MLP.
      embedding_size: (int) Size of memory embeddings.
      dqn_hidden_layers: (list[int]) MLP layer sizes of the DQN network.
      batch_size: (int) Size of batches for DQN learning steps.
      trajectory_len: (int) Length of trajectories from replay buffer.
      num_neighbours: (int) Number of neighbours to fetch from replay buffer.
      learning_rate: (float) Learning rate.
      mixing_parameter: (float) Value mixing parameter between 0 and 1.
      memory_capacity: (int) Number of samples that can be stored in memory.
      discount_factor: (float) Discount factor for Q-Learning.
      update_target_network_every: (int) Number of steps between updates of the DQN target network.
      epsilon_start: (float) Starting epsilon-greedy value.
      epsilon_end: (float) Final epsilon-greedy value.
      epsilon_decay_duration: (int) Number of steps over which epsilon decays.
      embedding_as_parametric_input: (bool) Whether we use embeddings as input
        to the parametric model.
    """
        assert 0 <= mixing_parameter <= 1
        self._game = game
        self.player_id = player_id
        self._env = game
        self._num_actions = num_actions
        self._info_state_size = state_size
        self._embedding_size = embedding_size
        self._lambda = mixing_parameter
        self._trajectory_len = trajectory_len
        self._num_neighbours = num_neighbours
        self._discount = discount_factor
        self._epsilon_start = epsilon_start
        self._epsilon_end = epsilon_end
        self._epsilon_decay_duration = epsilon_decay_duration
        self._last_time_step = None
        self._last_action = None
        self._embedding_as_parametric_input = embedding_as_parametric_input

        self._embedding_network = dqn.MLP(self._info_state_size,
                                          list(embedding_network_layers),
                                          embedding_size)

        # The DQN agent requires this be an integer.
        if not isinstance(memory_capacity, int):
            raise ValueError("Memory capacity not an integer.")

        # Initialize the parametric & non-parametric Q-networks.
        self._agent = dqn.DQN(
            player_id,
            state_representation_size=self._info_state_size,
            num_actions=self._num_actions,
            hidden_layers_sizes=list(dqn_hidden_layers),
            replay_buffer_capacity=memory_capacity,
            replay_buffer_class=QueryableFixedSizeRingBuffer,
            batch_size=batch_size,
            learning_rate=learning_rate,
            update_target_network_every=update_target_network_every,
            learn_every=batch_size,
            discount_factor=1.0,
            epsilon_start=1.0,
            epsilon_end=0.1,
            epsilon_decay_duration=int(1e6))
        # Initialize Value Buffers - Fetch Replay buffers from agents.
        self._value_buffer = QueryableFixedSizeRingBuffer(memory_capacity)
        self._replay_buffer = self._agent.replay_buffer

        # Initialize non-parametric & EVA Q-values.
        self._v_np = collections.defaultdict(float)
        self._q_np = collections.defaultdict(lambda: [0] * self._num_actions)
        self._q_eva = collections.defaultdict(lambda: [0] * self._num_actions)
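The mixing_parameter stored in self._lambda blends the parametric DQN estimate with the non-parametric estimate kept in self._q_np to produce self._q_eva. A rough sketch of that blend; the helper name mix_q_values and its inputs are illustrative, not part of the class:

def mix_q_values(q_theta, q_np, lam):
    # EVA-style blend: lam * parametric Q-values + (1 - lam) * non-parametric Q-values.
    return [lam * qt + (1.0 - lam) * qn for qt, qn in zip(q_theta, q_np)]

With the default mixing_parameter=0.9 the parametric network dominates and the episodic (non-parametric) estimate contributes the remaining 10%.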
Example No. 5
    def __init__(self,
                 player_id,
                 state_representation_size,
                 num_actions,
                 hidden_layers_sizes,
                 reservoir_buffer_capacity,
                 anticipatory_param,
                 batch_size=128,
                 rl_learning_rate=0.01,
                 sl_learning_rate=0.01,
                 min_buffer_size_to_learn=1000,
                 learn_every=64,
                 optimizer_str="sgd",
                 **kwargs):
        """Initialize the `NFSP` agent."""
        self.player_id = player_id
        self._num_actions = num_actions
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._learn_every = learn_every
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Inner RL agent
        kwargs.update({
            "batch_size": batch_size,
            "learning_rate": rl_learning_rate,
            "learn_every": learn_every,
            "min_buffer_size_to_learn": min_buffer_size_to_learn,
            "optimizer_str": optimizer_str,
        })
        self._rl_agent = dqn.DQN(player_id, state_representation_size,
                                 num_actions, hidden_layers_sizes, **kwargs)

        # Keep track of the last training loss achieved in an update step.
        self._last_rl_loss_value = lambda: self._rl_agent.loss
        self._last_sl_loss_value = None

        # Average policy network.
        self._avg_network = dqn.MLP(state_representation_size,
                                    self._layer_sizes, num_actions)

        self._savers = [("q_network", self._rl_agent._q_network),
                        ("avg_network", self._avg_network)]

        if optimizer_str == "adam":
            self.optimizer = torch.optim.Adam(self._avg_network.parameters(),
                                              lr=sl_learning_rate)
        elif optimizer_str == "sgd":
            self.optimizer = torch.optim.SGD(self._avg_network.parameters(),
                                             lr=sl_learning_rate)
        else:
            raise ValueError("Not implemented. Choose from ['adam', 'sgd'].")

        self._sample_episode_policy()
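The constructor ends by calling self._sample_episode_policy(), which picks, once per episode, whether the agent acts with its DQN best response or with the average-policy network. A hedged sketch of that anticipatory sampling; the MODE enum and the NumPy draw follow the NFSP paper's description and are assumptions here, since the method body is not shown in this excerpt:

import enum
import numpy as np

MODE = enum.Enum("mode", "best_response average_policy")

def sample_episode_policy(anticipatory_param):
    # With probability anticipatory_param act with the RL best-response policy,
    # otherwise act with the supervised average-policy network.
    if np.random.rand() < anticipatory_param:
        return MODE.best_response
    return MODE.average_policy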