def create_training_agents(num_players, sess, num_actions, info_state_size,
                           hidden_layers_sizes):
  """Create the agents we want to use for learning."""
  if FLAGS.learner == "qlearning":
    # pylint: disable=g-complex-comprehension
    return [
        tabular_qlearner.QLearner(
            player_id=idx,
            num_actions=num_actions,
            # step_size=0.02,
            step_size=0.1,
            # epsilon_schedule=rl_tools.ConstantSchedule(0.5),
            epsilon_schedule=rl_tools.LinearSchedule(0.5, 0.2, 1000000),
            discount_factor=0.99) for idx in range(num_players)
    ]
  elif FLAGS.learner == "dqn":
    # pylint: disable=g-complex-comprehension
    return [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            discount_factor=0.99,
            epsilon_start=0.5,
            epsilon_end=0.1,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
  else:
    raise RuntimeError("Unknown learner")
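
# A minimal usage sketch (hedged): assuming FLAGS.learner == "qlearning" and a
# two-player signaling game, this builds one independent learner per player.
# The argument values below are illustrative, not taken from this script:
#
#   agents = create_training_agents(
#       num_players=2, sess=None, num_actions=3, info_state_size=6,
#       hidden_layers_sizes=[32, 32])
#   assert len(agents) == 2
#
# Note that `sess` and `hidden_layers_sizes` are only consumed on the "dqn"
# path; the tabular Q-learning path ignores them.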
def create_epsilon_schedule(sched_str):
  """Creates an epsilon schedule from the string as described in the flags."""
  values = sched_str.split(",")
  if values[0] == "linear":
    assert len(values) == 4
    return rl_tools.LinearSchedule(
        float(values[1]), float(values[2]), int(values[3]))
  elif values[0] == "constant":
    assert len(values) == 2
    return rl_tools.ConstantSchedule(float(values[1]))
  else:
    print("Unrecognized schedule string: {}".format(sched_str))
    sys.exit()
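
# Illustrative schedule strings accepted by create_epsilon_schedule (the
# format follows the parser above; the concrete numbers are assumptions):
#
#   create_epsilon_schedule("linear,0.5,0.1,100000")
#       -> rl_tools.LinearSchedule(0.5, 0.1, 100000)
#   create_epsilon_schedule("constant,0.2")
#       -> rl_tools.ConstantSchedule(0.2)
#
# Any other prefix prints an error and exits the process.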
def run_experiment(num_players, env, payoffs, centralized):
  """Run the experiments."""
  num_states = FLAGS.num_states
  num_messages = FLAGS.num_messages
  num_actions = env.action_spec()["num_actions"]

  # Results to store
  num_runs = FLAGS.num_runs
  training_episodes = FLAGS.num_episodes
  log_interval = FLAGS.log_interval
  rewards = np.zeros((num_runs, training_episodes // log_interval))
  opts = np.zeros((num_runs, training_episodes // log_interval))
  converge_point = np.zeros((num_states, num_states))
  percent_opt = 0

  # Repeat the experiment num_runs times
  for i in range(num_runs):
    # *2 on the decay steps since there are 2 agent steps per episode.
    eps_schedule = rl_tools.LinearSchedule(
        FLAGS.eps_init, FLAGS.eps_final, FLAGS.eps_decay_steps * 2)
    agents = [
        # pylint: disable=g-complex-comprehension
        tabular_qlearner.QLearner(
            player_id=idx,
            num_actions=num_actions,
            step_size=FLAGS.step_size,
            epsilon_schedule=eps_schedule,
            centralized=centralized) for idx in range(num_players)
    ]

    # 1. Train the agents
    for cur_episode in range(training_episodes):
      time_step = env.reset()
      # Find cur_state for logging. See lewis_signaling.cc for info_state
      # details.
      cur_state = time_step.observations["info_state"][0][3:].index(1)
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        time_step = env.step([agent_output.action])

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

      # Store rewards
      reward = time_step.rewards[0]
      max_reward = payoffs[cur_state].max()
      cur_idx = (i, cur_episode // log_interval)
      rewards[cur_idx] += reward / log_interval
      opts[cur_idx] += np.isclose(reward, max_reward) / log_interval

    # 2. Evaluate the trained agents' greedy policies on every state.
    base_info_state0 = [1.0, 0.0, 0.0] + [0.0] * num_states
    base_info_state1 = [0.0, 1.0, 0.0] + [0.0] * num_states

    if centralized:
      base_info_state0 = [base_info_state0, base_info_state0.copy()]
      base_info_state1 = [base_info_state1, base_info_state1.copy()]

    for s in range(num_states):
      info_state0 = copy.deepcopy(base_info_state0)
      if centralized:
        info_state0[0][3 + s] = 1.0
      else:
        info_state0[3 + s] = 1.0
      # pylint: disable=protected-access
      m, _ = agents[0]._epsilon_greedy(
          str(info_state0), np.arange(num_messages), 0)
      info_state1 = copy.deepcopy(base_info_state1)
      if centralized:
        info_state1[0][3 + s] = 1.0
        info_state1[1][3 + m] = 1.0
      else:
        info_state1[3 + m] = 1.0
      a, _ = agents[1]._epsilon_greedy(
          str(info_state1), np.arange(num_states), 0)
      converge_point[s, a] += 1
      best_act = payoffs[s].argmax()
      percent_opt += int(a == best_act) / num_runs / num_states
  return rewards, opts, converge_point, percent_opt
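
# A sketch of how run_experiment might be driven (assumed setup; the actual
# entry point lives elsewhere in this script, and the game parameter names
# and the identity payoff matrix here are assumptions, not confirmed by this
# section). `rl_environment` is assumed to be imported at the top of the file:
#
#   game = "lewis_signaling(num_states={},num_messages={})".format(
#       FLAGS.num_states, FLAGS.num_messages)
#   env = rl_environment.Environment(game)
#   payoffs = np.eye(FLAGS.num_states)  # one distinct best action per state
#   rewards, opts, converge_point, percent_opt = run_experiment(
#       2, env, payoffs, centralized=False)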