Example #1
    def __init__(self,
                 session,
                 game,
                 policy_network_layers=(256, 256),
                 advantage_network_layers=(128, 128),
                 num_iterations: int = 100,
                 num_traversals: int = 20,
                 learning_rate: float = 1e-4,
                 batch_size_advantage=None,
                 batch_size_strategy=None,
                 memory_capacity: int = int(1e6),
                 policy_network_train_steps: int = 1,
                 advantage_network_train_steps: int = 1,
                 reinitialize_advantage_networks: bool = True):
        """Initialize the Deep CFR algorithm.

        Args:
          session: (tf.Session) TensorFlow session.
          game: Open Spiel game.
          policy_network_layers: (list[int]) Layer sizes of strategy net MLP.
          advantage_network_layers: (list[int]) Layer sizes of advantage net MLP.
          num_iterations: Number of iterations.
          num_traversals: Number of traversals per iteration.
          learning_rate: Learning rate.
          batch_size_advantage: (int or None) Batch size to sample from advantage
            memories.
          batch_size_strategy: (int or None) Batch size to sample from strategy
            memories.
          memory_capacity: Number of samples that can be stored in memory.
          policy_network_train_steps: Number of policy network training steps
            (per iteration).
          advantage_network_train_steps: Number of advantage network training
            steps (per iteration).
          reinitialize_advantage_networks: Whether to re-initialize the
            advantage network before training on each iteration.
        """
        all_players = list(range(game.num_players()))
        super(DeepCFRSolver, self).__init__(game, all_players)
        self._game = game
        if game.get_type().dynamics == pyspiel.GameType.Dynamics.SIMULTANEOUS:
            # `_traverse_game_tree` does not take simultaneous games into account.
            raise ValueError("Simultaneous games are not supported.")
        self._session = session
        self._batch_size_advantage = batch_size_advantage
        self._batch_size_strategy = batch_size_strategy
        self._policy_network_train_steps = policy_network_train_steps
        self._advantage_network_train_steps = advantage_network_train_steps
        self._num_players = game.num_players()
        self._root_node = self._game.new_initial_state()
        # TODO(author6) Allow embedding size (and network) to be specified.
        self._embedding_size = len(self._root_node.information_state_tensor(0))
        self._num_iterations = num_iterations
        self._num_traversals = num_traversals
        self._reinitialize_advantage_networks = reinitialize_advantage_networks
        self._num_actions = game.num_distinct_actions()
        self._iteration = 1

        # Create the TensorFlow placeholders used by the policy and advantage
        # network updates.
        self._info_state_ph = tf.placeholder(
            shape=[None, self._embedding_size],
            dtype=tf.float32,
            name="info_state_ph")
        self._info_state_action_ph = tf.placeholder(
            shape=[None, self._embedding_size + 1],
            dtype=tf.float32,
            name="info_state_action_ph")
        self._action_probs_ph = tf.placeholder(shape=[None, self._num_actions],
                                               dtype=tf.float32,
                                               name="action_probs_ph")
        self._iter_ph = tf.placeholder(shape=[None, 1],
                                       dtype=tf.float32,
                                       name="iter_ph")
        self._advantage_ph = []
        for p in range(self._num_players):
            self._advantage_ph.append(
                tf.placeholder(shape=[None, self._num_actions],
                               dtype=tf.float32,
                               name="advantage_ph_" + str(p)))

        # Define strategy network, loss & memory.
        self._strategy_memories = ReservoirBuffer(memory_capacity)
        self._policy_network = simple_nets.MLP(self._embedding_size,
                                               list(policy_network_layers),
                                               self._num_actions)
        action_logits = self._policy_network(self._info_state_ph)
        # Illegal actions are handled in the traversal code, where the expected
        # payoff and sampled regret are computed from the advantage networks.
        self._action_probs = tf.nn.softmax(action_logits)
        self._loss_policy = tf.reduce_mean(
            tf.losses.mean_squared_error(
                labels=tf.math.sqrt(self._iter_ph) * self._action_probs_ph,
                predictions=tf.math.sqrt(self._iter_ph) * self._action_probs))
        self._optimizer_policy = tf.train.AdamOptimizer(
            learning_rate=learning_rate)
        self._learn_step_policy = self._optimizer_policy.minimize(
            self._loss_policy)

        # Define advantage network, loss & memory. (One per player)
        self._advantage_memories = [
            ReservoirBuffer(memory_capacity) for _ in range(self._num_players)
        ]
        self._advantage_networks = [
            simple_nets.MLP(self._embedding_size,
                            list(advantage_network_layers), self._num_actions)
            for _ in range(self._num_players)
        ]
        self._advantage_outputs = [
            self._advantage_networks[i](self._info_state_ph)
            for i in range(self._num_players)
        ]
        self._loss_advantages = []
        self._optimizer_advantages = []
        self._learn_step_advantages = []
        for p in range(self._num_players):
            self._loss_advantages.append(
                tf.reduce_mean(
                    tf.losses.mean_squared_error(
                        labels=tf.math.sqrt(self._iter_ph) *
                        self._advantage_ph[p],
                        predictions=tf.math.sqrt(self._iter_ph) *
                        self._advantage_outputs[p])))
            self._optimizer_advantages.append(
                tf.train.AdamOptimizer(learning_rate=learning_rate))
            self._learn_step_advantages.append(
                self._optimizer_advantages[p].minimize(
                    self._loss_advantages[p]))
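
Below is a minimal usage sketch, not part of the original listing, showing how the solver above might be constructed and run. The `open_spiel.python.algorithms.deep_cfr` module path, the TF1-compat import, and the `solve()` call are assumptions based on OpenSpiel conventions rather than code shown here.

# Hypothetical usage sketch; module path, TF1-compat import and solve() call
# are assumptions based on OpenSpiel conventions, not shown in the listing.
import pyspiel
import tensorflow.compat.v1 as tf
from open_spiel.python.algorithms import deep_cfr

tf.disable_v2_behavior()  # The listing uses TF1-style placeholders/sessions.

game = pyspiel.load_game("kuhn_poker")
with tf.Session() as sess:
    solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(256, 256),
        advantage_network_layers=(128, 128),
        num_iterations=100,
        num_traversals=20,
        learning_rate=1e-4)
    sess.run(tf.global_variables_initializer())
    # solve() is assumed to run the outer Deep CFR loop and return the trained
    # policy network plus the recorded losses.
    policy_network, advantage_losses, policy_loss = solver.solve()
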
Example #2
  def __init__(self,
               session,
               player_id,
               state_representation_size,
               num_actions,
               hidden_layers_sizes=128,
               replay_buffer_capacity=10000,
               batch_size=128,
               replay_buffer_class=ReplayBuffer,
               learning_rate=0.01,
               update_target_network_every=1000,
               learn_every=10,
               discount_factor=1.0,
               min_buffer_size_to_learn=1000,
               epsilon_start=1.0,
               epsilon_end=0.1,
               epsilon_decay_duration=int(1e6),
               optimizer_str="sgd",
               loss_str="mse"):
    """Initialize the DQN agent."""

    # This call to locals() is used to store every argument used to initialize
    # the class instance, so it can be copied with no hyperparameter change.
    self._kwargs = locals()

    self.player_id = player_id
    self._session = session
    self._num_actions = num_actions
    if isinstance(hidden_layers_sizes, int):
      hidden_layers_sizes = [hidden_layers_sizes]
    self._layer_sizes = hidden_layers_sizes
    self._batch_size = batch_size
    self._update_target_network_every = update_target_network_every
    self._learn_every = learn_every
    self._min_buffer_size_to_learn = min_buffer_size_to_learn
    self._discount_factor = discount_factor

    self._epsilon_start = epsilon_start
    self._epsilon_end = epsilon_end
    self._epsilon_decay_duration = epsilon_decay_duration

    # TODO(author6) Allow for optional replay buffer config.
    if not isinstance(replay_buffer_capacity, int):
      raise ValueError("Replay buffer capacity not an integer.")
    self._replay_buffer = replay_buffer_class(replay_buffer_capacity)
    self._prev_timestep = None
    self._prev_action = None

    # Step counter to keep track of learning, eps decay and target network.
    self._step_counter = 0

    # Keep track of the last training loss achieved in an update step.
    self._last_loss_value = None

    # Create required TensorFlow placeholders to perform the Q-network updates.
    self._info_state_ph = tf.placeholder(
        shape=[None, state_representation_size],
        dtype=tf.float32,
        name="info_state_ph")
    self._action_ph = tf.placeholder(
        shape=[None], dtype=tf.int32, name="action_ph")
    self._reward_ph = tf.placeholder(
        shape=[None], dtype=tf.float32, name="reward_ph")
    self._is_final_step_ph = tf.placeholder(
        shape=[None], dtype=tf.float32, name="is_final_step_ph")
    self._next_info_state_ph = tf.placeholder(
        shape=[None, state_representation_size],
        dtype=tf.float32,
        name="next_info_state_ph")
    self._legal_actions_mask_ph = tf.placeholder(
        shape=[None, num_actions],
        dtype=tf.float32,
        name="legal_actions_mask_ph")

    self._q_network = simple_nets.MLP(state_representation_size,
                                      self._layer_sizes, num_actions)
    self._q_values = self._q_network(self._info_state_ph)

    self._target_q_network = simple_nets.MLP(state_representation_size,
                                             self._layer_sizes, num_actions)
    self._target_q_values = self._target_q_network(self._next_info_state_ph)

    # Stop gradient to prevent updates to the target network while learning
    self._target_q_values = tf.stop_gradient(self._target_q_values)

    self._update_target_network = self._create_target_network_update_op(
        self._q_network, self._target_q_network)

    # Create the loss operations.
    # Add a large negative constant to the illegal action logits before taking
    # the max. This prevents illegal action values from being used as the target.
    illegal_actions = 1 - self._legal_actions_mask_ph
    illegal_logits = illegal_actions * ILLEGAL_ACTION_LOGITS_PENALTY
    max_next_q = tf.reduce_max(
        tf.math.add(tf.stop_gradient(self._target_q_values), illegal_logits),
        axis=-1)
    target = (
        self._reward_ph +
        (1 - self._is_final_step_ph) * self._discount_factor * max_next_q)

    action_indices = tf.stack(
        [tf.range(tf.shape(self._q_values)[0]), self._action_ph], axis=-1)
    predictions = tf.gather_nd(self._q_values, action_indices)

    if loss_str == "mse":
      loss_class = tf.losses.mean_squared_error
    elif loss_str == "huber":
      loss_class = tf.losses.huber_loss
    else:
      raise ValueError("Not implemented, choose from 'mse', 'huber'.")

    self._loss = tf.reduce_mean(
        loss_class(labels=target, predictions=predictions))

    if optimizer_str == "adam":
      self._optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif optimizer_str == "sgd":
      self._optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    else:
      raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")

    self._learn_step = self._optimizer.minimize(self._loss)
    self._initialize()
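
For context, here is a hedged usage sketch, not in the original listing: two DQN agents trained by self-play through OpenSpiel's `rl_environment` wrapper. The module paths, the `step()`/`TimeStep` API, and the game choice are assumptions based on common OpenSpiel example code.

# Hypothetical usage sketch; module paths and the rl_environment/step() API are
# assumptions based on common OpenSpiel example code.
import tensorflow.compat.v1 as tf
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import dqn

tf.disable_v2_behavior()

env = rl_environment.Environment("tic_tac_toe")
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
    agents = [
        dqn.DQN(
            sess,
            player_id=pid,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[128],
            replay_buffer_capacity=10000,
            batch_size=128) for pid in range(2)
    ]
    for _ in range(1000):  # Self-play episodes.
        time_step = env.reset()
        while not time_step.last():
            current_player = time_step.observations["current_player"]
            agent_output = agents[current_player].step(time_step)
            time_step = env.step([agent_output.action])
        for agent in agents:  # Let both agents observe the terminal step.
            agent.step(time_step)
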
Example #3
    def __init__(self,
                 session,
                 player_id,
                 state_representation_size,
                 num_actions,
                 hidden_layers_sizes,
                 reservoir_buffer_capacity,
                 anticipatory_param,
                 batch_size=128,
                 rl_learning_rate=0.01,
                 sl_learning_rate=0.01,
                 min_buffer_size_to_learn=1000,
                 learn_every=64,
                 optimizer_str="sgd",
                 **kwargs):
        """Initialize the `NFSP` agent."""
        self.player_id = player_id
        self._session = session
        self._num_actions = num_actions
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._learn_every = learn_every
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Inner RL agent
        kwargs.update({
            "batch_size": batch_size,
            "learning_rate": rl_learning_rate,
            "learn_every": learn_every,
            "min_buffer_size_to_learn": min_buffer_size_to_learn,
            "optimizer_str": optimizer_str,
        })
        self._rl_agent = dqn.DQN(session, player_id, state_representation_size,
                                 num_actions, hidden_layers_sizes, **kwargs)

        # Keep track of the last training loss achieved in an update step.
        self._last_rl_loss_value = lambda: self._rl_agent.loss
        self._last_sl_loss_value = None

        # Placeholders.
        self._info_state_ph = tf.placeholder(
            shape=[None, state_representation_size],
            dtype=tf.float32,
            name="info_state_ph")

        self._action_probs_ph = tf.placeholder(shape=[None, num_actions],
                                               dtype=tf.float32,
                                               name="action_probs_ph")

        self._legal_actions_mask_ph = tf.placeholder(
            shape=[None, num_actions],
            dtype=tf.float32,
            name="legal_actions_mask_ph")

        # Average policy network.
        self._avg_network = simple_nets.MLP(state_representation_size,
                                            self._layer_sizes, num_actions)
        self._avg_policy = self._avg_network(self._info_state_ph)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        self._savers = [
            ("q_network", tf.train.Saver(self._rl_agent._q_network.variables)),
            ("avg_network", tf.train.Saver(self._avg_network.variables))
        ]

        # Loss
        self._loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        if optimizer_str == "adam":
            optimizer = tf.train.AdamOptimizer(learning_rate=sl_learning_rate)
        elif optimizer_str == "sgd":
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=sl_learning_rate)
        else:
            raise ValueError("Not implemented. Choose from ['adam', 'sgd'].")

        self._learn_step = optimizer.minimize(self._loss)
        self._sample_episode_policy()
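
A hedged usage sketch, not in the original listing: one NFSP agent per player on Kuhn poker. The module path and the environment/step API are assumptions; the constructor arguments mirror the signature above.

# Hypothetical usage sketch; module path and environment API are assumptions.
import tensorflow.compat.v1 as tf
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import nfsp

tf.disable_v2_behavior()

env = rl_environment.Environment("kuhn_poker")
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
    agents = [
        nfsp.NFSP(
            sess,
            player_id=pid,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[128],
            reservoir_buffer_capacity=int(2e6),
            anticipatory_param=0.1) for pid in range(2)
    ]
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
        current_player = time_step.observations["current_player"]
        agent_output = agents[current_player].step(time_step)
        time_step = env.step([agent_output.action])
    for agent in agents:
        agent.step(time_step)  # Terminal step for both agents.
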
Example #4
    def __init__(self,
                 session,
                 game,
                 player_id,
                 state_size,
                 num_actions,
                 embedding_network_layers=(128, ),
                 embedding_size=16,
                 dqn_hidden_layers=(128, 128),
                 batch_size=16,
                 trajectory_len=10,
                 num_neighbours=5,
                 learning_rate=1e-4,
                 mixing_parameter=0.9,
                 memory_capacity=int(1e6),
                 discount_factor=1.0,
                 update_target_network_every=1000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_duration=int(1e4),
                 embedding_as_parametric_input=False):
        """Initialize the Ephemeral VAlue Adjustment algorithm.

    Args:
      session: (tf.Session) TensorFlow session.
      game: (rl_environment.Environment) Open Spiel game.
      player_id: (int) Player id for this player.
      state_size: (int) Size of info state vector.
      num_actions: (int) number of actions.
      embedding_network_layers: (list[int]) Layer sizes of strategy net MLP.
      embedding_size: (int) Size of memory embeddings.
      dqn_hidden_layers: (list(int)) MLP layer sizes of DQN network.
      batch_size: (int) Size of batches for DQN learning steps.
      trajectory_len: (int) Length of trajectories from replay buffer.
      num_neighbours: (int) Number of neighbours to fetch from replay buffer.
      learning_rate: (float) Learning rate.
      mixing_parameter: (float) Value mixing parameter between 0 and 1.
      memory_capacity: Number af samples that can be stored in memory.
      discount_factor: (float) Discount factor for Q-Learning.
      update_target_network_every: How often to update DQN target network.
      epsilon_start: (float) Starting epsilon-greedy value.
      epsilon_end: (float) Final epsilon-greedy value.
      epsilon_decay_duration: (float) Number of steps over which epsilon decays.
      embedding_as_parametric_input: (bool) Whether we use embeddings as input
        to the parametric model.
    """
        assert (mixing_parameter >= 0 and mixing_parameter <= 1)
        self._game = game
        self._session = session
        self.player_id = player_id
        self._env = game
        self._num_actions = num_actions
        self._info_state_size = state_size
        self._embedding_size = embedding_size
        self._lambda = mixing_parameter
        self._trajectory_len = trajectory_len
        self._num_neighbours = num_neighbours
        self._discount = discount_factor
        self._epsilon_start = epsilon_start
        self._epsilon_end = epsilon_end
        self._epsilon_decay_duration = epsilon_decay_duration
        self._last_time_step = None
        self._last_action = None
        self._embedding_as_parametric_input = embedding_as_parametric_input

        # Create the info-state placeholder that feeds the embedding network.
        self._info_state_ph = tf.placeholder(
            shape=[None, self._info_state_size],
            dtype=tf.float32,
            name="info_state_ph")
        self._embedding_network = simple_nets.MLP(
            self._info_state_size, list(embedding_network_layers),
            embedding_size)
        self._embedding = self._embedding_network(self._info_state_ph)

        # The DQN agent requires this be an integer.
        if not isinstance(memory_capacity, int):
            raise ValueError("Memory capacity not an integer.")

        # Initialize the parametric & non-parametric Q-networks.
        self._agent = dqn.DQN(
            session,
            player_id,
            state_representation_size=self._info_state_size,
            num_actions=self._num_actions,
            hidden_layers_sizes=list(dqn_hidden_layers),
            replay_buffer_capacity=memory_capacity,
            replay_buffer_class=QueryableFixedSizeRingBuffer,
            batch_size=batch_size,
            learning_rate=learning_rate,
            update_target_network_every=update_target_network_every,
            learn_every=batch_size,
            discount_factor=1.0,
            epsilon_start=1.0,
            epsilon_end=0.1,
            epsilon_decay_duration=int(1e6))
        # Initialize the value buffer and reuse the DQN agent's replay buffer.
        self._value_buffer = QueryableFixedSizeRingBuffer(memory_capacity)
        self._replay_buffer = self._agent.replay_buffer

        # Initialize non-parametric & EVA Q-values.
        self._v_np = collections.defaultdict(float)
        self._q_np = collections.defaultdict(lambda: [0] * self._num_actions)
        self._q_eva = collections.defaultdict(lambda: [0] * self._num_actions)
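
Finally, a hedged usage sketch, not in the original listing. The class name `EVAAgent`, the module path, and the single-player `catch` environment are assumptions, since the listing above only shows the constructor body; the `game` argument receives the `rl_environment.Environment`, as documented in the docstring.

# Hypothetical usage sketch; the class name EVAAgent, module path and game
# choice are assumptions, since the listing only shows the constructor body.
import tensorflow.compat.v1 as tf
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import eva

tf.disable_v2_behavior()

env = rl_environment.Environment("catch")  # Single-player game, for simplicity.
state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
    agent = eva.EVAAgent(
        sess,
        env,  # Passed as `game`; the docstring expects an rl_environment.Environment.
        player_id=0,
        state_size=state_size,
        num_actions=num_actions,
        embedding_network_layers=(64,),
        embedding_size=8,
        learning_rate=1e-4,
        mixing_parameter=0.9,
        memory_capacity=int(1e5))
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
        agent_output = agent.step(time_step)
        time_step = env.step([agent_output.action])
    agent.step(time_step)  # Let the agent observe the terminal step.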