def test_simple_game(self):
  game = pyspiel.load_efg_game(SIMPLE_EFG_DATA)
  env = rl_environment.Environment(game=game)

  with self.session() as sess:
    agent = dqn.DQN(
        sess,
        0,
        state_representation_size=game.information_state_tensor_shape()[0],
        num_actions=game.num_distinct_actions(),
        hidden_layers_sizes=[16],
        replay_buffer_capacity=100,
        batch_size=5,
        epsilon_start=0.02,
        epsilon_end=0.01)
    total_reward = 0
    sess.run(tf.global_variables_initializer())

    for _ in range(100):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        time_step = env.step([agent_output.action])
        total_reward += time_step.rewards[0]
      agent.step(time_step)
    self.assertGreaterEqual(total_reward, 75)
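# The test above assumes SIMPLE_EFG_DATA holds a small extensive-form game in
# Gambit's EFG text format. A minimal illustrative sketch of such a constant
# (the exact game string used by the test may differ):
SIMPLE_EFG_DATA = """
  EFG 2 R "Simple single-agent problem" { "Player 1" } ""
  p "ROOT" 1 1 "ROOT" { "L" "R" } 0
    t "L" 1 "Outcome L" { 1.0 }
    t "R" 2 "Outcome R" { -1.0 }
"""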
def main(_): game = "breakthrough" num_players = 2 env_configs = {"columns": 5, "rows": 5} env = rl_environment.Environment(game, **env_configs) info_state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] with tf.Session() as sess: hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] # pylint: disable=g-complex-comprehension agents = [ dqn.DQN(session=sess, player_id=idx, state_representation_size=info_state_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=FLAGS.replay_buffer_capacity, batch_size=FLAGS.batch_size) for idx in range(num_players) ] saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) print(type(agents[0].get_weights()), agents[0].get_weights())
def test_run_tic_tac_toe(self):
  env = rl_environment.Environment("tic_tac_toe")
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with self.session() as sess:
    agents = [
        dqn.DQN(  # pylint: disable=g-complex-comprehension
            sess,
            player_id,
            state_representation_size=state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[16],
            replay_buffer_capacity=10,
            batch_size=5) for player_id in [0, 1]
    ]
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
      current_player = time_step.observations["current_player"]
      current_agent = agents[current_player]
      agent_output = current_agent.step(time_step)
      time_step = env.step([agent_output.action])

    for agent in agents:
      agent.step(time_step)
def create_training_agents(num_players, sess, num_actions, info_state_size,
                           hidden_layers_sizes):
  """Create the agents we want to use for learning."""
  if FLAGS.learner == "qlearning":
    # pylint: disable=g-complex-comprehension
    return [
        tabular_qlearner.QLearner(
            player_id=idx,
            num_actions=num_actions,
            # step_size=0.02,
            step_size=0.1,
            # epsilon_schedule=rl_tools.ConstantSchedule(0.5),
            epsilon_schedule=rl_tools.LinearSchedule(0.5, 0.2, 1000000),
            discount_factor=0.99) for idx in range(num_players)
    ]
  elif FLAGS.learner == "dqn":
    # pylint: disable=g-complex-comprehension
    return [
        dqn.DQN(session=sess,
                player_id=idx,
                state_representation_size=info_state_size,
                num_actions=num_actions,
                discount_factor=0.99,
                epsilon_start=0.5,
                epsilon_end=0.1,
                hidden_layers_sizes=hidden_layers_sizes,
                replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
  else:
    raise RuntimeError("Unknown learner")
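# Hypothetical usage sketch of create_training_agents. The names num_players,
# num_actions and info_state_size are assumed to come from an
# rl_environment.Environment set up as in the surrounding snippets:
with tf.Session() as sess:
  agents = create_training_agents(
      num_players, sess, num_actions, info_state_size,
      hidden_layers_sizes=[64, 64])
  sess.run(tf.global_variables_initializer())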
def main(_):
  game = FLAGS.game
  num_players = 1

  games, rewards, _, _ = mst.game_params(FLAGS.num_nodes)
  env_configs = games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  # env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)

  # random agents for evaluation
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 1)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", rewards[0])

      # env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = agents[player_id].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [
              agent_output.action for agent_output in agents_output
          ]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    print("Actual MST: ", rewards)
def main(_):
  game = "breakthrough"
  num_players = 2

  env_configs = {"columns": 5, "rows": 5}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  # random agents for evaluation
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(session=sess,
                player_id=idx,
                state_representation_size=info_state_size,
                num_actions=num_actions,
                hidden_layers_sizes=hidden_layers_sizes,
                replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    sess.run(tf.global_variables_initializer())

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 1000)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
      if (ep + 1) % FLAGS.save_every == 0:
        for agent in agents:
          agent.save(FLAGS.checkpoint_dir)

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = agents[player_id].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [
              agent_output.action for agent_output in agents_output
          ]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
def test_run_landlord(self):
  # landlord is an optional game, so check we have it before running the test.
  game = "landlord"
  if game not in pyspiel.registered_names():
    return

  num_players = 3
  env_configs = {}
  env = rl_environment.Environment(game, **env_configs)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with self.session() as sess:
    agents = [
        dqn.DQN(  # pylint: disable=g-complex-comprehension
            sess,
            player_id,
            state_representation_size=state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[16],
            replay_buffer_capacity=10,
            batch_size=5) for player_id in range(num_players)
    ]
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
      current_player = time_step.observations["current_player"]
      # agent_output = [agent.step(time_step) for agent in agents]
      # time_step = env.step([agent_output[current_player].action])
      if env.is_turn_based:
        agent_output = agents[current_player].step(time_step)
        action_list = [agent_output.action]
      else:
        agents_output = [agent.step(time_step) for agent in agents]
        action_list = [agent_output.action for agent_output in agents_output]
      print_iteration(time_step, current_player, action_list)
      time_step = env.step(action_list)

    for agent in agents:
      agent.step(time_step)
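# print_iteration is referenced above but not defined in these snippets. A
# minimal sketch of such a debugging helper (assumed, illustration only):
def print_iteration(time_step, player_id, action_list):
  """Prints the current player, its observation and the chosen action(s)."""
  print("Player: ", player_id)
  print("Info state: ", time_step.observations["info_state"][player_id])
  print("Actions: ", action_list)
  print("-" * 80)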
def test_run_hanabi(self):
  # Hanabi is an optional game, so check we have it before running the test.
  game = "hanabi"
  if game not in pyspiel.registered_names():
    return

  num_players = 3
  env_configs = {
      "players": num_players,
      "max_life_tokens": 1,
      "colors": 2,
      "ranks": 3,
      "hand_size": 2,
      "max_information_tokens": 3,
      "discount": 0.
  }
  env = rl_environment.Environment(game, **env_configs)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with self.session() as sess:
    agents = [
        dqn.DQN(  # pylint: disable=g-complex-comprehension
            sess,
            player_id,
            state_representation_size=state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[16],
            replay_buffer_capacity=10,
            batch_size=5) for player_id in range(num_players)
    ]
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
      current_player = time_step.observations["current_player"]
      agent_output = [agent.step(time_step) for agent in agents]
      time_step = env.step([agent_output[current_player].action])

    for agent in agents:
      agent.step(time_step)
def build_graph(self,
                scope_name,
                current_player,
                info_state_size,
                num_actions,
                hidden_layers_sizes=[64, 64],
                replay_buffer_capacity=1e5,
                batch_size=32,
                entropy_cost=0.001,
                critic_learning_rate=0.01,
                pi_learning_rate=0.01,
                num_critic_before_pi=32):
  with tf.variable_scope(scope_name) as scope:
    if self._oracle == "dqn":
      training_agent = dqn.DQN(
          session=self._session,
          player_id=current_player,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          hidden_layers_sizes=hidden_layers_sizes,
          replay_buffer_capacity=replay_buffer_capacity,
          batch_size=batch_size)
    elif self._oracle in ["rpg", "qpg", "rm", "a2c"]:
      training_agent = policy_gradient.PolicyGradient(  # pylint: disable=g-complex-comprehension
          session=self._session,
          player_id=current_player,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=self._oracle,
          hidden_layers_sizes=hidden_layers_sizes,
          batch_size=batch_size,
          entropy_cost=entropy_cost,
          critic_learning_rate=critic_learning_rate,
          pi_learning_rate=pi_learning_rate,
          num_critic_before_pi=num_critic_before_pi)
    else:
      raise ValueError("Oracle selected is not supported.")
  return training_agent
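# Hypothetical usage sketch of build_graph from another method of the same
# class (assumes self._oracle and self._session were set in the constructor;
# the scope name is illustrative only):
#
#   agent = self.build_graph(
#       scope_name="best_response_p0",
#       current_player=0,
#       info_state_size=info_state_size,
#       num_actions=num_actions)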
def dqn_train(unused_arg):
  env = rl_environment.Environment(FLAGS.game)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  sess = tf.Session()
  players = [
      dqn.DQN(sess,
              idx,
              state_representation_size=state_size,
              num_actions=num_actions,
              hidden_layers_sizes=[64],
              # reservoir_buffer_capacity=2e6,  # NFSP-only parameter; not accepted by dqn.DQN.
              batch_size=128,
              learn_every=64,
              replay_buffer_capacity=2e5,
              epsilon_decay_duration=FLAGS.episodes,
              epsilon_start=0.06,
              epsilon_end=0.001) for idx in range(2)
  ]
  expl_policies_avg = NFSPPolicies(env, players, nfsp.MODE.average_policy)
  run_agents(sess, env, players, expl_policies_avg)
  sess.close()
def main(_): game = "tic_tac_toe" num_players = 2 env = rl_environment.Environment(game) state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] hidden_layers_sizes = [32, 32] replay_buffer_capacity = int(1e4) train_episodes = FLAGS.num_episodes loss_report_interval = 1000 with tf.Session() as sess: dqn_agent = dqn.DQN(sess, player_id=0, state_representation_size=state_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=replay_buffer_capacity) tabular_q_agent = tabular_qlearner.QLearner(player_id=1, num_actions=num_actions) agents = [dqn_agent, tabular_q_agent] sess.run(tf.global_variables_initializer()) # Train agent for ep in range(train_episodes): if ep and ep % loss_report_interval == 0: logging.info("[%s/%s] DQN loss: %s", ep, train_episodes, agents[0].loss) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) # Evaluate against random agent random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] r_mean = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Mean episode rewards: %s", r_mean) if not FLAGS.iteractive_play: return # Play from the command line against the trained DQN agent. human_player = 1 while True: logging.info("You are playing as %s", "O" if human_player else "X") time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == human_player: agent_out = agents[human_player].step(time_step, is_evaluation=True) logging.info("\n%s", agent_out.probs.reshape((3, 3))) logging.info("\n%s", pretty_board(time_step)) action = command_line_action(time_step) else: agent_out = agents[1 - human_player].step( time_step, is_evaluation=True) action = agent_out.action time_step = env.step([action]) logging.info("\n%s", pretty_board(time_step)) logging.info("End of game!") if time_step.rewards[human_player] > 0: logging.info("You win") elif time_step.rewards[human_player] < 0: logging.info("You lose") else: logging.info("Draw") # Switch order of players human_player = 1 - human_player
def main(_): game = "skat" num_players = 3 env_configs = {} env = rl_environment.Environment(game, **env_configs) observation_tensor_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] with tf.Session() as sess: summaries_dir = os.path.join(FLAGS.checkpoint_dir, "random_eval") summary_writer = tf.summary.FileWriter(summaries_dir, tf.get_default_graph()) hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] # pylint: disable=g-complex-comprehension agents = [ dqn.DQN(session=sess, player_id=idx, state_representation_size=observation_tensor_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=FLAGS.replay_buffer_capacity, batch_size=FLAGS.batch_size) for idx in range(num_players) ] saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.eval_every == 0: r_mean = eval_against_random_bots(env, agents, random_agents, FLAGS.num_eval_games) logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean) for i in range(num_players): summary = tf.Summary() summary.value.add(tag="mean_reward/random_{}".format(i), simple_value=r_mean[i]) summary_writer.add_summary(summary, ep) summary_writer.flush() saver.save(sess, FLAGS.checkpoint_dir, ep) time_step = env.reset() # Randomize position. if FLAGS.randomize_positions: positions = random.sample(range(len(agents)), len(agents)) while not time_step.last(): player_id = time_step.observations["current_player"] if FLAGS.randomize_positions: position = positions[player_id] agents[position].player_id = player_id else: position = player_id agent_output = agents[position].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)
def __init__(self,
             session,
             game,
             player_id,
             state_size,
             num_actions,
             embedding_network_layers=(128,),
             embedding_size=16,
             dqn_hidden_layers=(128, 128),
             batch_size=16,
             trajectory_len=10,
             num_neighbours=5,
             learning_rate=1e-4,
             mixing_parameter=0.9,
             memory_capacity=int(1e6),
             discount_factor=1.0,
             update_target_network_every=1000,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_duration=int(1e4),
             embedding_as_parametric_input=False):
  """Initialize the Ephemeral Value Adjustment algorithm.

  Args:
    session: (tf.Session) TensorFlow session.
    game: (rl_environment.Environment) Open Spiel game.
    player_id: (int) Player id for this player.
    state_size: (int) Size of info state vector.
    num_actions: (int) number of actions.
    embedding_network_layers: (list[int]) Layer sizes of strategy net MLP.
    embedding_size: (int) Size of memory embeddings.
    dqn_hidden_layers: (list(int)) MLP layer sizes of DQN network.
    batch_size: (int) Size of batches for DQN learning steps.
    trajectory_len: (int) Length of trajectories from replay buffer.
    num_neighbours: (int) Number of neighbours to fetch from replay buffer.
    learning_rate: (float) Learning rate.
    mixing_parameter: (float) Value mixing parameter between 0 and 1.
    memory_capacity: Number of samples that can be stored in memory.
    discount_factor: (float) Discount factor for Q-Learning.
    update_target_network_every: How often to update DQN target network.
    epsilon_start: (float) Starting epsilon-greedy value.
    epsilon_end: (float) Final epsilon-greedy value.
    epsilon_decay_duration: (float) Number of steps over which epsilon decays.
    embedding_as_parametric_input: (bool) Whether we use embeddings as input
      to the parametric model.
  """
  assert (mixing_parameter >= 0 and mixing_parameter <= 1)
  self._game = game
  self._session = session
  self.player_id = player_id
  self._env = game
  self._num_actions = num_actions
  self._info_state_size = state_size
  self._embedding_size = embedding_size
  self._lambda = mixing_parameter
  self._trajectory_len = trajectory_len
  self._num_neighbours = num_neighbours
  self._discount = discount_factor
  self._epsilon_start = epsilon_start
  self._epsilon_end = epsilon_end
  self._epsilon_decay_duration = epsilon_decay_duration
  self._last_time_step = None
  self._last_action = None
  self._embedding_as_parametric_input = embedding_as_parametric_input

  # Create required TensorFlow placeholders to perform the Q-network updates.
  self._info_state_ph = tf.placeholder(
      shape=[None, self._info_state_size],
      dtype=tf.float32,
      name="info_state_ph")
  self._embedding_network = snt.nets.MLP(
      list(embedding_network_layers) + [embedding_size])
  self._embedding = self._embedding_network(self._info_state_ph)

  # The DQN agent requires this be an integer.
  if not isinstance(memory_capacity, int):
    raise ValueError("Memory capacity not an integer.")

  # Initialize the parametric & non-parametric Q-networks.
  self._agent = dqn.DQN(
      session,
      player_id,
      state_representation_size=self._info_state_size,
      num_actions=self._num_actions,
      hidden_layers_sizes=list(dqn_hidden_layers),
      replay_buffer_capacity=memory_capacity,
      replay_buffer_class=QueryableFixedSizeRingBuffer,
      batch_size=batch_size,
      learning_rate=learning_rate,
      update_target_network_every=update_target_network_every,
      learn_every=batch_size,
      discount_factor=1.0,
      epsilon_start=1.0,
      epsilon_end=0.1,
      epsilon_decay_duration=int(1e6))

  # Initialize Value Buffers - Fetch Replay buffers from agents.
  self._value_buffer = QueryableFixedSizeRingBuffer(memory_capacity)
  self._replay_buffer = self._agent.replay_buffer

  # Initialize non-parametric & EVA Q-values.
  self._v_np = collections.defaultdict(float)
  self._q_np = collections.defaultdict(lambda: [0] * self._num_actions)
  self._q_eva = collections.defaultdict(lambda: [0] * self._num_actions)
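# For context (not from the source): EVA acts on a mixture of the parametric
# Q-network estimate and the non-parametric, trajectory-based estimate stored
# above, weighted by the mixing parameter self._lambda. A sketch of that
# combination step, using the hypothetical helper name mixed_q_values:
def mixed_q_values(q_theta, q_np, mixing_parameter):
  """Returns lambda * Q_theta + (1 - lambda) * Q_np, elementwise per action."""
  return [
      mixing_parameter * qt + (1.0 - mixing_parameter) * qn
      for qt, qn in zip(q_theta, q_np)
  ]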
def __init__(self,
             session,
             player_id,
             state_representation_size,
             num_actions,
             hidden_layers_sizes,
             reservoir_buffer_capacity,
             anticipatory_param,
             batch_size=128,
             rl_learning_rate=0.01,
             sl_learning_rate=0.01,
             min_buffer_size_to_learn=1000,
             learn_every=64,
             optimizer_str="sgd",
             **kwargs):
  """Initialize the `NFSP` agent."""
  self.player_id = player_id
  self._session = session
  self._num_actions = num_actions
  self._layer_sizes = hidden_layers_sizes + [num_actions]
  self._batch_size = batch_size
  self._learn_every = learn_every
  self._anticipatory_param = anticipatory_param
  self._min_buffer_size_to_learn = min_buffer_size_to_learn

  self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
  self._prev_timestep = None
  self._prev_action = None

  # Step counter to keep track of learning.
  self._step_counter = 0

  # Inner RL agent
  kwargs.update({
      "batch_size": batch_size,
      "learning_rate": rl_learning_rate,
      "learn_every": learn_every,
      "min_buffer_size_to_learn": min_buffer_size_to_learn,
      "optimizer_str": optimizer_str,
  })
  self._rl_agent = dqn.DQN(session, player_id, state_representation_size,
                           num_actions, hidden_layers_sizes, **kwargs)

  # Keep track of the last training loss achieved in an update step.
  self._last_rl_loss_value = lambda: self._rl_agent.loss
  self._last_sl_loss_value = None

  # Placeholders.
  self._info_state_ph = tf.placeholder(
      shape=[None, state_representation_size],
      dtype=tf.float32,
      name="info_state_ph")
  self._action_probs_ph = tf.placeholder(
      shape=[None, num_actions], dtype=tf.float32, name="action_probs_ph")
  self._legal_actions_mask_ph = tf.placeholder(
      shape=[None, num_actions],
      dtype=tf.float32,
      name="legal_actions_mask_ph")

  # Average policy network.
  self._avg_network = snt.nets.MLP(output_sizes=self._layer_sizes)
  self._avg_policy = self._avg_network(self._info_state_ph)
  self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

  # Loss
  self._loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits_v2(
          labels=tf.stop_gradient(self._action_probs_ph),
          logits=self._avg_policy))

  if optimizer_str == "adam":
    optimizer = tf.train.AdamOptimizer(learning_rate=sl_learning_rate)
  elif optimizer_str == "sgd":
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=sl_learning_rate)
  else:
    raise ValueError("Not implemented. Choose from ['adam', 'sgd'].")

  self._learn_step = optimizer.minimize(self._loss)
  self._sample_episode_policy()
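# ReservoirBuffer is referenced above but not defined in these snippets. A
# minimal sketch of a reservoir-sampling buffer with that interface (assumed;
# uses numpy and random for the sampling):
import random

import numpy as np


class ReservoirBuffer(object):
  """Keeps a uniform sample over a stream of added elements."""

  def __init__(self, reservoir_buffer_capacity):
    self._reservoir_buffer_capacity = reservoir_buffer_capacity
    self._data = []
    self._add_calls = 0

  def add(self, element):
    """Adds `element`; every stream element is kept with equal probability."""
    if len(self._data) < self._reservoir_buffer_capacity:
      self._data.append(element)
    else:
      idx = np.random.randint(0, self._add_calls + 1)
      if idx < self._reservoir_buffer_capacity:
        self._data[idx] = element
    self._add_calls += 1

  def sample(self, num_samples):
    """Returns `num_samples` elements sampled uniformly without replacement."""
    if len(self._data) < num_samples:
      raise ValueError("{} elements could not be sampled from size {}".format(
          num_samples, len(self._data)))
    return random.sample(self._data, num_samples)

  def __len__(self):
    return len(self._data)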
num_actions = env.action_spec()["num_actions"]

hidden_layers_sizes = parameters.hidden_layers_sizes
replay_buffer_capacity = int(1e4)
train_episodes = 50000
loss_report_interval = 1000
delta_rank = 10
max_rank = 20
min_rank = -20

with tf.Session() as sess:
  sess.run(tf.compat.v1.global_variables_initializer())
  dqn_agents = [
      dqn.DQN(sess,
              player_id=idx,
              state_representation_size=state_size,
              num_actions=num_actions,
              hidden_layers_sizes=hidden_layers_sizes,
              replay_buffer_capacity=replay_buffer_capacity)
      for idx in range(num_players)
  ]

  stds = []
  # This loop runs a ladder over the desired agent versions: to rank only a
  # single version of the agent, use the first line; otherwise, use the second.
  for version in [159]:
  # for version in range(1, 160, 5):
    jjs = []
    for i in range(num_players):
      dqn_agents[i].restore(parameters.agent_path, str(i), str(version))
    # Create the population of jean jacques, with its
def main(_): game = "lewis_signaling" num_players = 2 num_states = FLAGS.num_states num_messages = FLAGS.num_messages if FLAGS.payoffs == "random": payoffs = np.random.random((num_states, num_states)) payoffs_str = ",".join([str(x) for x in payoffs.flatten()]) elif FLAGS.payoffs == "climbing": # This is a particular payoff matrix that is hard for decentralized # algorithms. Introduced in C. Claus and C. Boutilier, "The dynamics of # reinforcement learning in cooperative multiagent systems", 1998, for # simultaneous action games, but it is difficult even in the case of # signaling games. payoffs = np.array([[11, -30, 0], [-30, 7, 6], [0, 0, 5]]) / 30 payoffs_str = ",".join([str(x) for x in payoffs.flatten()]) else: payoffs_str = FLAGS.payoffs try: payoffs_list = [float(x) for x in payoffs_str.split(",")] payoffs = np.array(payoffs_list).reshape((num_states, num_states)) except ValueError: raise ValueError( "There should be {} (states * actions) elements in payoff. Found {} elements" .format(num_states * num_states, len(payoffs_list))) env_configs = { "num_states": num_states, "num_messages": num_messages, "payoffs": payoffs_str } env = rl_environment.Environment(game, **env_configs) state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] replay_buffer_capacity = FLAGS.replay_buffer_capacity # Results to store num_runs = FLAGS.num_runs training_episodes = FLAGS.num_episodes log_interval = FLAGS.log_interval rewards = np.zeros((num_runs, training_episodes // log_interval)) opts = np.zeros((num_runs, training_episodes // log_interval)) converge_point = np.zeros((num_states, num_states)) percent_opt = 0 # Repeat the experiment num_runs times for i in range(num_runs): with tf.Session() as sess: # pylint: disable=g-complex-comprehension agents = [ dqn.DQN( sess, player_id=idx, state_representation_size=state_size, num_actions=num_actions, learning_rate=FLAGS.step_size, replay_buffer_capacity=replay_buffer_capacity, epsilon_start=FLAGS.eps_init, epsilon_end=FLAGS.eps_final, epsilon_decay_duration=FLAGS.eps_decay_steps * 2) for idx in range(num_players) ] # 1. Train the agents for cur_episode in range(training_episodes): time_step = env.reset() # Find cur_state for logging. See lewis_signaling.cc for info_state # details cur_state = time_step.observations["info_state"][0][3:].index(1) while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) time_step = env.step([agent_output.action]) # Episode is over, step all agents with final info state. 
        for agent in agents:
          agent.step(time_step)

        # Store rewards
        reward = time_step.rewards[0]
        max_reward = payoffs[cur_state].max()
        cur_idx = (i, cur_episode // log_interval)
        rewards[cur_idx] += reward / log_interval
        opts[cur_idx] += np.isclose(reward, max_reward) / log_interval

      base_info_state0 = [1.0, 0.0, 0.0] + [0.0] * num_states
      base_info_state1 = [0.0, 1.0, 0.0] + [0.0] * num_states

      for s in range(num_states):
        info_state0 = copy.deepcopy(base_info_state0)
        info_state0[3 + s] = 1.0
        # pylint: disable=protected-access
        m, _ = agents[0]._epsilon_greedy(info_state0, np.arange(num_messages),
                                         0)
        info_state1 = copy.deepcopy(base_info_state1)
        info_state1[3 + m] = 1.0
        a, _ = agents[1]._epsilon_greedy(info_state1, np.arange(num_states), 0)
        converge_point[s, a] += 1
        best_act = payoffs[s].argmax()
        percent_opt += int(a == best_act) / num_runs / num_states

  if FLAGS.plot:
    # pylint: disable=g-import-not-at-top
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    params = {
        "font.size": 13,
        "axes.labelsize": 13,
        "xtick.labelsize": 13,
        "ytick.labelsize": 13,
    }
    mpl.rcParams.update(params)

    def init_fig():
      fig, ax = plt.subplots(1, 1)
      ax.spines["top"].set_visible(False)
      ax.spines["right"].set_visible(False)
      return fig, ax

    def plot_scalars(scalars,
                     repetition_axis=0,
                     scalar_labels=None,
                     title=None,
                     ax_labels=None):
      """Plots scalar on ax by filling 1 standard error.

      Args:
        scalars: List of scalars to plot (mean taken over repetition axis)
        repetition_axis: Axis to take the mean over
        scalar_labels: Labels for the scalars (for legend)
        title: Figure title
        ax_labels: Labels for x and y axis (list of 2 strings)
      """
      if not all([len(s.shape) == 2 for s in scalars]):
        raise ValueError("Only 2D arrays supported for plotting")

      if scalar_labels is None:
        scalar_labels = [None] * len(scalars)
      if len(scalars) != len(scalar_labels):
        raise ValueError(
            "Wrong number of scalar labels, expected {} but received {}".format(
                len(scalars), len(scalar_labels)))

      _, plot_axis = init_fig()
      for i, scalar in enumerate(scalars):
        xs = np.arange(scalar.shape[1 - repetition_axis]) * FLAGS.log_interval
        mean = scalar.mean(axis=repetition_axis)
        sem = stats.sem(scalar, axis=repetition_axis)
        plot_axis.plot(xs, mean, label=scalar_labels[i])
        plot_axis.fill_between(xs, mean - sem, mean + sem, alpha=0.5)

      if title is not None:
        plot_axis.set_title(title)
      if ax_labels is not None:
        plot_axis.set_xlabel(ax_labels[0])
        plot_axis.set_ylabel(ax_labels[1])

    def plot_confusion_matrix(cm, cmap=plt.cm.Blues, title=None):
      """Plot the confusion matrix.

      Args:
        cm (np.ndarray): Confusion matrix to plot
        cmap: Color map to be used in matplotlib's imshow
        title: Figure title

      Returns:
        Figure and axis on which the confusion matrix is plotted.
      """
      fig, ax = plt.subplots()
      ax.imshow(cm, interpolation="nearest", cmap=cmap)
      ax.set_xticks([])
      ax.set_yticks([])
      ax.set_xlabel("Receiver's action", fontsize=14)
      ax.set_ylabel("Sender's state", fontsize=14)

      # Loop over data dimensions and create text annotations.
      fmt = "d"
      thresh = cm.max() / 2.
      for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
          ax.text(
              j,
              i,
              format(cm[i, j], fmt),
              ha="center",
              va="center",
              color="white" if cm[i, j] > thresh else "black")
      fig.tight_layout()

      if title is not None:
        ax.set_title(title)
      return fig, ax

    plot_scalars([rewards],
                 title="Reward graph (DQN)",
                 ax_labels=["Episodes", "Reward per episode"])
    plot_scalars([opts],
                 title="Percentage of optimal actions (DQN)",
                 ax_labels=["Episodes", "% optimal actions"])
    plot_confusion_matrix(
        converge_point.astype(np.int), title="Final policy (DQN)")
    plt.show()

  return percent_opt
def main(_):
  game = FLAGS.game  # Set the game
  num_players = 1

  train_games, train_rewards, test_games, test_rewards = mst.game_params(
      FLAGS.num_nodes)  # Load from files

  env_configs = train_games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  # env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]  # number of possible actions

  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)

  # random agents for evaluation
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.125)
  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(session=sess,
                player_id=idx,
                state_representation_size=info_state_size,
                num_actions=num_actions,
                hidden_layers_sizes=hidden_layers_sizes,
                replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    # saver = tf.train.import_meta_graph('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/dqn_test-399999.meta')
    # saver.restore(sess, tf.train.latest_checkpoint('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/'))

    for ep in range(FLAGS.num_train_episodes):
      print(env_configs)
      # env_configs = train_games[ep % len(train_games)]
      # env = rl_environment.Environment(game, **env_configs)
      episode_reward = train_rewards[ep % len(train_games)]

      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 0)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        # saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", episode_reward)

      if (ep + 1) % FLAGS.test_every == 0:
        test_accuracy = test_trained_bot(test_games, test_rewards, agents[0],
                                         ep, FLAGS.num_nodes, game,
                                         FLAGS.game_version)
        logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

      # env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      # print("TRAIN" + "*" * 80)
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
        # print("(Action, Reward): ", action_list[0], time_step.rewards[0])

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
def main_loop(unused_arg):
  """Trains a DQN agent in the catch environment."""
  env = catch.Environment()
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes

  with tf.Session() as sess:
    if FLAGS.algorithm in {"rpg", "qpg", "rm", "a2c"}:
      agent = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=FLAGS.algorithm,
          hidden_layers_sizes=[128, 128],
          batch_size=128,
          entropy_cost=0.01,
          critic_learning_rate=0.1,
          pi_learning_rate=0.1,
          num_critic_before_pi=3)
    elif FLAGS.algorithm == "dqn":
      agent = dqn.DQN(
          sess,
          player_id=0,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          learning_rate=0.1,
          replay_buffer_capacity=10000,
          hidden_layers_sizes=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    elif FLAGS.algorithm == "eva":
      agent = eva.EVAAgent(
          sess,
          env,
          player_id=0,
          state_size=info_state_size,
          num_actions=num_actions,
          learning_rate=1e-3,
          trajectory_len=2,
          num_neighbours=2,
          mixing_parameter=0.95,
          memory_capacity=10000,
          dqn_hidden_layers=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    else:
      raise ValueError("Algorithm not implemented!")

    sess.run(tf.global_variables_initializer())

    # Train agent
    for ep in range(train_episodes):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
      # Episode is over, step agent with final info state.
      agent.step(time_step)

      if ep and ep % FLAGS.eval_every == 0:
        logging.info("-" * 80)
        logging.info("Episode %s", ep)
        logging.info("Loss: %s", agent.loss)
        avg_return = _eval_agent(env, agent, 100)
        logging.info("Avg return: %s", avg_return)
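# _eval_agent is called above but not defined in these snippets. A minimal
# sketch, assuming it returns the mean undiscounted episode return of `agent`
# over num_episodes evaluation-mode (greedy) episodes in this single-player
# environment:
def _eval_agent(env, agent, num_episodes):
  """Evaluates `agent` for `num_episodes`."""
  rewards = 0.0
  for _ in range(num_episodes):
    time_step = env.reset()
    episode_reward = 0
    while not time_step.last():
      agent_output = agent.step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      episode_reward += time_step.rewards[0]
    rewards += episode_reward
  return rewards / num_episodes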
def main_loop(unused_arg): """RL main loop example.""" logging.info("Registered games: %s", rl_environment.registered_games()) logging.info("Creating game %s", FLAGS.game) #env_configs = {"players": FLAGS.num_players} if FLAGS.num_players else {} env_configs = {} env = rl_environment.Environment(FLAGS.game, **env_configs) state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] hidden_layers_sizes = [512, 512] replay_buffer_capacity = int(1e4) train_episodes = FLAGS.num_episodes loss_report_interval = 1000 logging.info("Env specs: %s", env.observation_spec()) logging.info("Action specs: %s", env.action_spec()) with tf.Session() as sess: agents = [ dqn.DQN( # pylint: disable=g-complex-comprehension sess, player_id, state_representation_size=state_size, num_actions=num_actions, #hidden_layers_sizes=[16], #replay_buffer_capacity=10, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=replay_buffer_capacity, batch_size=128) for player_id in range(3) ] saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) #latest_checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) latest_checkpoint_path = tf.train.latest_checkpoint( FLAGS.checkpoint_dir) if latest_checkpoint_path: print('Restoring checkpoint: {0}'.format(latest_checkpoint_path)) saver.restore(sess, latest_checkpoint_path) # Train agent for ep in range(train_episodes): #if ep and ep % loss_report_interval == 0: if (ep + 1) % FLAGS.eval_every == 0: logging.info("[%s/%s] DQN loss: %s %s %s", ep, train_episodes, agents[0].loss, agents[1].loss, agents[2].loss) saver.save(sess, FLAGS.checkpoint_dir, ep) time_step = env.reset() while not time_step.last(): current_player = time_step.observations["current_player"] #agent_output = [agent.step(time_step) for agent in agents] #time_step = env.step([agent_output[current_player].action]) if env.is_turn_based: agent_output = agents[current_player].step(time_step) action_list = [agent_output.action] else: agents_output = [agent.step(time_step) for agent in agents] action_list = [ agent_output.action for agent_output in agents_output ] #print_iteration(time_step, current_player, action_list) time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)