def test_simple_game(self):
  game = pyspiel.load_efg_game(SIMPLE_EFG_DATA)
  env = rl_environment.Environment(game=game)

  agent = dqn.DQN(
      0,
      state_representation_size=game.information_state_tensor_shape()[0],
      num_actions=game.num_distinct_actions(),
      hidden_layers_sizes=[16],
      replay_buffer_capacity=100,
      batch_size=5,
      epsilon_start=0.02,
      epsilon_end=0.01)
  total_reward = 0

  for _ in range(100):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      time_step = env.step([agent_output.action])
      total_reward += time_step.rewards[0]
    agent.step(time_step)
  self.assertGreaterEqual(total_reward, 75)
def test_run_tic_tac_toe(self):
  env = rl_environment.Environment("tic_tac_toe")
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  agents = [
      dqn.DQN(  # pylint: disable=g-complex-comprehension
          player_id,
          state_representation_size=state_size,
          num_actions=num_actions,
          hidden_layers_sizes=[16],
          replay_buffer_capacity=10,
          batch_size=5) for player_id in [0, 1]
  ]
  time_step = env.reset()
  while not time_step.last():
    current_player = time_step.observations["current_player"]
    current_agent = agents[current_player]
    agent_output = current_agent.step(time_step)
    time_step = env.step([agent_output.action])

  for agent in agents:
    agent.step(time_step)
def test_run_hanabi(self):
  # Hanabi is an optional game, so check we have it before running the test.
  game = "hanabi"
  if game not in pyspiel.registered_names():
    return

  num_players = 3
  env_configs = {
      "players": num_players,
      "max_life_tokens": 1,
      "colors": 2,
      "ranks": 3,
      "hand_size": 2,
      "max_information_tokens": 3,
      "discount": 0.
  }
  env = rl_environment.Environment(game, **env_configs)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  agents = [
      dqn.DQN(  # pylint: disable=g-complex-comprehension
          player_id,
          state_representation_size=state_size,
          num_actions=num_actions,
          hidden_layers_sizes=[16],
          replay_buffer_capacity=10,
          batch_size=5) for player_id in range(num_players)
  ]
  time_step = env.reset()
  while not time_step.last():
    current_player = time_step.observations["current_player"]
    agent_output = [agent.step(time_step) for agent in agents]
    time_step = env.step([agent_output[current_player].action])

  for agent in agents:
    agent.step(time_step)
def __init__(self,
             game,
             player_id,
             state_size,
             num_actions,
             embedding_network_layers=(128,),
             embedding_size=16,
             dqn_hidden_layers=(128, 128),
             batch_size=16,
             trajectory_len=10,
             num_neighbours=5,
             learning_rate=1e-4,
             mixing_parameter=0.9,
             memory_capacity=int(1e6),
             discount_factor=1.0,
             update_target_network_every=1000,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_duration=int(1e4),
             embedding_as_parametric_input=False):
  """Initialize the Ephemeral Value Adjustment algorithm.

  Args:
    game: (rl_environment.Environment) Open Spiel game.
    player_id: (int) Player id for this player.
    state_size: (int) Size of the info state vector.
    num_actions: (int) Number of actions.
    embedding_network_layers: (list[int]) Layer sizes of the embedding net MLP.
    embedding_size: (int) Size of memory embeddings.
    dqn_hidden_layers: (list[int]) MLP layer sizes of the DQN network.
    batch_size: (int) Size of batches for DQN learning steps.
    trajectory_len: (int) Length of trajectories drawn from the replay buffer.
    num_neighbours: (int) Number of neighbours to fetch from the replay buffer.
    learning_rate: (float) Learning rate.
    mixing_parameter: (float) Value mixing parameter between 0 and 1.
    memory_capacity: (int) Number of samples that can be stored in memory.
    discount_factor: (float) Discount factor for Q-learning.
    update_target_network_every: (int) How often to update the DQN target
      network.
    epsilon_start: (float) Starting epsilon-greedy value.
    epsilon_end: (float) Final epsilon-greedy value.
    epsilon_decay_duration: (float) Number of steps over which epsilon decays.
    embedding_as_parametric_input: (bool) Whether we use embeddings as input
      to the parametric model.
  """
  assert 0 <= mixing_parameter <= 1
  self._game = game
  self.player_id = player_id
  self._env = game
  self._num_actions = num_actions
  self._info_state_size = state_size
  self._embedding_size = embedding_size
  self._lambda = mixing_parameter
  self._trajectory_len = trajectory_len
  self._num_neighbours = num_neighbours
  self._discount = discount_factor
  self._epsilon_start = epsilon_start
  self._epsilon_end = epsilon_end
  self._epsilon_decay_duration = epsilon_decay_duration
  self._last_time_step = None
  self._last_action = None
  self._embedding_as_parametric_input = embedding_as_parametric_input

  # Embedding network used to map info states to memory keys for
  # nearest-neighbour lookups.
  self._embedding_network = dqn.MLP(self._info_state_size,
                                    list(embedding_network_layers),
                                    embedding_size)

  # The DQN agent requires this to be an integer.
  if not isinstance(memory_capacity, int):
    raise ValueError("Memory capacity not an integer.")

  # Initialize the parametric & non-parametric Q-networks.
  self._agent = dqn.DQN(
      player_id,
      state_representation_size=self._info_state_size,
      num_actions=self._num_actions,
      hidden_layers_sizes=list(dqn_hidden_layers),
      replay_buffer_capacity=memory_capacity,
      replay_buffer_class=QueryableFixedSizeRingBuffer,
      batch_size=batch_size,
      learning_rate=learning_rate,
      update_target_network_every=update_target_network_every,
      learn_every=batch_size,
      discount_factor=1.0,
      epsilon_start=1.0,
      epsilon_end=0.1,
      epsilon_decay_duration=int(1e6))

  # Initialize value buffers - fetch replay buffers from agents.
  self._value_buffer = QueryableFixedSizeRingBuffer(memory_capacity)
  self._replay_buffer = self._agent.replay_buffer

  # Initialize non-parametric & EVA Q-values.
  self._v_np = collections.defaultdict(float)
  self._q_np = collections.defaultdict(lambda: [0] * self._num_actions)
  self._q_eva = collections.defaultdict(lambda: [0] * self._num_actions)
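
# A minimal usage sketch for the EVA agent above. This assumes the class is
# exposed as `eva.EVA` in `open_spiel.python.pytorch.eva` and exposes the same
# `step(time_step)` API as the DQN agents in the tests above; the game choice
# and hyperparameters are illustrative placeholders only.
from open_spiel.python import rl_environment
from open_spiel.python.pytorch import eva

env = rl_environment.Environment("tic_tac_toe")
state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

# One EVA agent per player; small layer and buffer sizes keep the sketch cheap.
agents = [
    eva.EVA(
        env,
        player_id,
        state_size=state_size,
        num_actions=num_actions,
        embedding_network_layers=(64,),
        dqn_hidden_layers=(64, 64),
        memory_capacity=int(1e4)) for player_id in [0, 1]
]

for _ in range(10):
  time_step = env.reset()
  while not time_step.last():
    current_player = time_step.observations["current_player"]
    agent_output = agents[current_player].step(time_step)
    time_step = env.step([agent_output.action])
  # Episode is over; let each agent observe the terminal time step.
  for agent in agents:
    agent.step(time_step)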
def __init__(self,
             player_id,
             state_representation_size,
             num_actions,
             hidden_layers_sizes,
             reservoir_buffer_capacity,
             anticipatory_param,
             batch_size=128,
             rl_learning_rate=0.01,
             sl_learning_rate=0.01,
             min_buffer_size_to_learn=1000,
             learn_every=64,
             optimizer_str="sgd",
             **kwargs):
  """Initialize the `NFSP` agent."""
  self.player_id = player_id
  self._num_actions = num_actions
  self._layer_sizes = hidden_layers_sizes
  self._batch_size = batch_size
  self._learn_every = learn_every
  self._anticipatory_param = anticipatory_param
  self._min_buffer_size_to_learn = min_buffer_size_to_learn

  self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
  self._prev_timestep = None
  self._prev_action = None

  # Step counter to keep track of learning.
  self._step_counter = 0

  # Inner RL agent.
  kwargs.update({
      "batch_size": batch_size,
      "learning_rate": rl_learning_rate,
      "learn_every": learn_every,
      "min_buffer_size_to_learn": min_buffer_size_to_learn,
      "optimizer_str": optimizer_str,
  })
  self._rl_agent = dqn.DQN(player_id, state_representation_size, num_actions,
                           hidden_layers_sizes, **kwargs)

  # Keep track of the last training loss achieved in an update step.
  self._last_rl_loss_value = lambda: self._rl_agent.loss
  self._last_sl_loss_value = None

  # Average policy network.
  self._avg_network = dqn.MLP(state_representation_size, self._layer_sizes,
                              num_actions)

  self._savers = [("q_network", self._rl_agent._q_network),
                  ("avg_network", self._avg_network)]

  if optimizer_str == "adam":
    self.optimizer = torch.optim.Adam(
        self._avg_network.parameters(), lr=sl_learning_rate)
  elif optimizer_str == "sgd":
    self.optimizer = torch.optim.SGD(
        self._avg_network.parameters(), lr=sl_learning_rate)
  else:
    raise ValueError("Not implemented. Choose from ['adam', 'sgd'].")

  self._sample_episode_policy()
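
# A minimal self-play sketch for the NFSP agent above. This assumes the class
# is exposed as `nfsp.NFSP` in `open_spiel.python.pytorch.nfsp` and shares the
# `step(time_step)` API used by the DQN tests above; the game and
# hyperparameters are illustrative placeholders.
from open_spiel.python import rl_environment
from open_spiel.python.pytorch import nfsp

env = rl_environment.Environment("kuhn_poker")
state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

agents = [
    nfsp.NFSP(  # pylint: disable=g-complex-comprehension
        player_id,
        state_representation_size=state_size,
        num_actions=num_actions,
        hidden_layers_sizes=[64],
        reservoir_buffer_capacity=int(1e4),
        anticipatory_param=0.1) for player_id in [0, 1]
]

for _ in range(100):
  time_step = env.reset()
  while not time_step.last():
    current_player = time_step.observations["current_player"]
    agent_output = agents[current_player].step(time_step)
    time_step = env.step([agent_output.action])
  # Let both agents see the terminal time step so they can learn from it.
  for agent in agents:
    agent.step(time_step)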