def main(_):
  env = rl_environment.Environment(FLAGS.game)
  num_players = env.num_players
  num_actions = env.action_spec()["num_actions"]

  agents = []
  if FLAGS.epsilon_schedule is not None:
    for idx in range(num_players):
      agents.append(
          tabular_qlearner.QLearner(
              player_id=idx,
              num_actions=num_actions,
              epsilon_schedule=create_epsilon_schedule(
                  FLAGS.epsilon_schedule)))
  else:
    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

  # 1. Train the agents
  training_episodes = FLAGS.num_train_episodes
  for cur_episode in range(training_episodes):
    if cur_episode % int(FLAGS.eval_freq) == 0:
      avg_rewards = eval_agents(env, agents, FLAGS.num_eval_episodes)
      print("Training episodes: {}, Avg rewards: {}".format(
          cur_episode, avg_rewards))
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      time_step = env.step([agent_output.action])
    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)
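# The snippet above relies on two helpers that are not shown:
# create_epsilon_schedule and eval_agents. A minimal sketch of what they
# might look like; the schedule-spec strings ("constant" / "linear"), the
# constants, and the reward averaging are assumptions, not the canonical
# implementations. rl_tools.ConstantSchedule and rl_tools.LinearSchedule
# are the real schedule classes used elsewhere in these snippets.

import numpy as np

from open_spiel.python import rl_tools


def create_epsilon_schedule(schedule_str):
  """Builds an epsilon schedule from a string spec (assumed format)."""
  if schedule_str == "constant":
    return rl_tools.ConstantSchedule(0.1)
  elif schedule_str == "linear":
    # Decay epsilon from 1.0 to 0.1 over 1M agent steps.
    return rl_tools.LinearSchedule(1.0, 0.1, 1000000)
  else:
    raise ValueError("Unknown epsilon schedule: {}".format(schedule_str))


def eval_agents(env, agents, num_episodes):
  """Plays num_episodes greedy episodes and returns mean reward per player."""
  rewards = np.zeros(len(agents))
  for _ in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      # is_evaluation=True disables exploration and learning.
      agent_output = agents[player_id].step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
    rewards += np.asarray(time_step.rewards) / num_episodes
  return rewards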
def main_loop(unused_arg):
  """Trains a tabular qlearner agent in the cliff walking environment."""
  env = cliff_walking.Environment(width=5, height=3)
  num_actions = env.action_spec()["num_actions"]
  train_episodes = FLAGS.num_episodes
  eval_interval = 50

  agent = tabular_qlearner.QLearner(
      player_id=0, step_size=0.05, num_actions=num_actions)

  # Train the agent
  for ep in range(train_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)
    # Episode is over, step agent with final info state.
    agent.step(time_step)

    if ep and ep % eval_interval == 0:
      logging.info("-" * 80)
      logging.info("Episode %s", ep)
      logging.info("Last loss: %s", agent.loss)
      avg_return = eval_agent(env, agent, 100)
      logging.info("Avg return: %s", avg_return)
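# eval_agent is referenced above but not defined here. A minimal sketch,
# assuming it runs the agent greedily (is_evaluation=True, so no exploration
# or Q-updates) and averages the undiscounted episode returns; the exact
# original may differ.


def eval_agent(env, agent, num_episodes):
  """Returns the average episode return over num_episodes greedy rollouts."""
  total_returns = 0.0
  for _ in range(num_episodes):
    time_step = env.reset()
    episode_return = 0.0
    while not time_step.last():
      agent_output = agent.step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      episode_return += time_step.rewards[0]
    total_returns += episode_return
  return total_returns / num_episodes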
def create_training_agents(num_players, sess, num_actions, info_state_size,
                           hidden_layers_sizes):
  """Create the agents we want to use for learning."""
  if FLAGS.learner == "qlearning":
    # pylint: disable=g-complex-comprehension
    return [
        tabular_qlearner.QLearner(
            player_id=idx,
            num_actions=num_actions,
            # step_size=0.02,
            step_size=0.1,
            # epsilon_schedule=rl_tools.ConstantSchedule(0.5),
            epsilon_schedule=rl_tools.LinearSchedule(0.5, 0.2, 1000000),
            discount_factor=0.99) for idx in range(num_players)
    ]
  elif FLAGS.learner == "dqn":
    # pylint: disable=g-complex-comprehension
    return [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            discount_factor=0.99,
            epsilon_start=0.5,
            epsilon_end=0.1,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
  else:
    raise RuntimeError("Unknown learner")
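# A hypothetical call site for create_training_agents, for illustration only.
# Because the "dqn" branch builds TensorFlow ops, the agents must be created
# inside a session and the variables initialized before any training step;
# the game name and layer sizes here are assumptions.

env = rl_environment.Environment("tic_tac_toe")
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
  agents = create_training_agents(
      num_players=2,
      sess=sess,
      num_actions=num_actions,
      info_state_size=info_state_size,
      hidden_layers_sizes=[32, 32])
  sess.run(tf.global_variables_initializer())
  # ... run the usual episode loop over `agents` here ...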
def main():
  env = cliff_walking.Environment(width=12, height=4)
  num_actions = env.action_spec()["num_actions"]
  learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

  # Sweep over learning rates: train a fresh agent at each rate, then
  # report its average evaluation reward alongside the rate that produced it.
  for learning_rate in learning_rates:
    agent = tabular_qlearner.QLearner(
        player_id=0, step_size=learning_rate, num_actions=num_actions)
    train(env, agent, 100)
    avg_reward = evaluate(env, agent, 50)
    print("step_size: {}, avg_reward: {}".format(learning_rate, avg_reward))
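# train and evaluate are referenced above but not defined. A sketch of train
# under the assumption that it simply runs exploratory episodes; evaluate can
# follow the same shape as the eval_agent sketch earlier (greedy rollouts,
# averaged returns).


def train(env, agent, num_episodes):
  """Runs num_episodes of training with exploration and Q-updates enabled."""
  for _ in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      time_step = env.step([agent_output.action])
    # Final step so the agent learns from the terminal reward.
    agent.step(time_step)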
def main(): print("finished") game = "matrix_pd" num_players = 2 env = rl_environment.Environment(game) num_actions = env.action_spec()["num_actions"] agents = [ tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] training_episodes = FLAGS.num_episodes for cur_episode in range(training_episodes): time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) time_step = env.step([agent_output.action]) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)
def main(_): game = "tic_tac_toe" num_players = 2 env = rl_environment.Environment(game) state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] hidden_layers_sizes = [32, 32] replay_buffer_capacity = int(1e4) train_episodes = FLAGS.num_episodes loss_report_interval = 1000 with tf.Session() as sess: dqn_agent = dqn.DQN(sess, player_id=0, state_representation_size=state_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=replay_buffer_capacity) tabular_q_agent = tabular_qlearner.QLearner(player_id=1, num_actions=num_actions) agents = [dqn_agent, tabular_q_agent] sess.run(tf.global_variables_initializer()) # Train agent for ep in range(train_episodes): if ep and ep % loss_report_interval == 0: logging.info("[%s/%s] DQN loss: %s", ep, train_episodes, agents[0].loss) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) # Evaluate against random agent random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] r_mean = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Mean episode rewards: %s", r_mean) if not FLAGS.iteractive_play: return # Play from the command line against the trained DQN agent. human_player = 1 while True: logging.info("You are playing as %s", "O" if human_player else "X") time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == human_player: agent_out = agents[human_player].step(time_step, is_evaluation=True) logging.info("\n%s", agent_out.probs.reshape((3, 3))) logging.info("\n%s", pretty_board(time_step)) action = command_line_action(time_step) else: agent_out = agents[1 - human_player].step( time_step, is_evaluation=True) action = agent_out.action time_step = env.step([action]) logging.info("\n%s", pretty_board(time_step)) logging.info("End of game!") if time_step.rewards[human_player] > 0: logging.info("You win") elif time_step.rewards[human_player] < 0: logging.info("You lose") else: logging.info("Draw") # Switch order of players human_player = 1 - human_player
def run_experiment(num_players, env, payoffs, centralized):
  """Run the experiments."""
  num_states = FLAGS.num_states
  num_messages = FLAGS.num_messages
  num_actions = env.action_spec()["num_actions"]

  # Results to store
  num_runs = FLAGS.num_runs
  training_episodes = FLAGS.num_episodes
  log_interval = FLAGS.log_interval
  rewards = np.zeros((num_runs, training_episodes // log_interval))
  opts = np.zeros((num_runs, training_episodes // log_interval))
  converge_point = np.zeros((num_states, num_states))
  percent_opt = 0

  # Repeat the experiment num_runs times
  for i in range(num_runs):
    eps_schedule = rl_tools.LinearSchedule(
        FLAGS.eps_init, FLAGS.eps_final,
        FLAGS.eps_decay_steps * 2)  # *2 since there are 2 agent steps per episode

    agents = [
        # pylint: disable=g-complex-comprehension
        tabular_qlearner.QLearner(
            player_id=idx,
            num_actions=num_actions,
            step_size=FLAGS.step_size,
            epsilon_schedule=eps_schedule,
            centralized=centralized) for idx in range(num_players)
    ]

    # 1. Train the agents
    for cur_episode in range(training_episodes):
      time_step = env.reset()
      # Find cur_state for logging. See lewis_signaling.cc for info_state
      # details.
      cur_state = time_step.observations["info_state"][0][3:].index(1)
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        time_step = env.step([agent_output.action])
      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

      # Store rewards
      reward = time_step.rewards[0]
      max_reward = payoffs[cur_state].max()
      cur_idx = (i, cur_episode // log_interval)
      rewards[cur_idx] += reward / log_interval
      opts[cur_idx] += np.isclose(reward, max_reward) / log_interval

    base_info_state0 = [1.0, 0.0, 0.0] + [0.0] * num_states
    base_info_state1 = [0.0, 1.0, 0.0] + [0.0] * num_states
    if centralized:
      base_info_state0 = [base_info_state0, base_info_state0.copy()]
      base_info_state1 = [base_info_state1, base_info_state1.copy()]

    for s in range(num_states):
      info_state0 = copy.deepcopy(base_info_state0)
      if centralized:
        info_state0[0][3 + s] = 1.0
      else:
        info_state0[3 + s] = 1.0
      # pylint: disable=protected-access
      m, _ = agents[0]._epsilon_greedy(
          str(info_state0), np.arange(num_messages), 0)
      info_state1 = copy.deepcopy(base_info_state1)
      if centralized:
        info_state1[0][3 + s] = 1.0
        info_state1[1][3 + m] = 1.0
      else:
        info_state1[3 + m] = 1.0
      a, _ = agents[1]._epsilon_greedy(
          str(info_state1), np.arange(num_states), 0)
      converge_point[s, a] += 1
      best_act = payoffs[s].argmax()
      percent_opt += int(a == best_act) / num_runs / num_states

  return rewards, opts, converge_point, percent_opt
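# run_experiment reads a number of absl flags that are defined elsewhere in
# the script. A plausible set of definitions; the default values and help
# strings here are illustrative assumptions, only the flag names are taken
# from the code above.

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer("num_states", 3, "Number of states in the signaling game.")
flags.DEFINE_integer("num_messages", 3, "Number of messages available.")
flags.DEFINE_integer("num_runs", 10, "Number of independent runs.")
flags.DEFINE_integer("num_episodes", 100000, "Training episodes per run.")
flags.DEFINE_integer("log_interval", 1000, "Episodes between logged points.")
flags.DEFINE_float("step_size", 0.1, "Q-learning step size.")
flags.DEFINE_float("eps_init", 1.0, "Initial exploration rate.")
flags.DEFINE_float("eps_final", 0.0, "Final exploration rate.")
flags.DEFINE_integer("eps_decay_steps", 100000, "Steps over which to decay epsilon.")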
'''
game.new_initial_state()
game.num_players()
state.is_terminal()
state.is_simultaneous_node()
state.is_chance_node()
state.legal_actions(int playerId)
state.apply_actions([a1, a2, ...])  # one action per playerId
state.returns()  # gives the result of the game
'''

env = rl_environment.Environment("matrix_pd")
num_actions = env.action_spec()["num_actions"]

q_agent = tabular_qlearner.QLearner(0, num_actions)
ra2 = random_agent.RandomAgent(1, num_actions)
players = [q_agent, ra2]

for cur_episode in range(5000):
  if cur_episode % 1000 == 0:
    print("cur_episode: " + str(cur_episode))
  time_step = env.reset()
  while not time_step.last():
    player_id = time_step.observations["current_player"]
    agent_output = players[player_id].step(time_step)
    time_step = env.step([agent_output.action])
  # Episode is over, step all agents with final info state.
  for agent in players:
    agent.step(time_step)
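# The API summary at the top of the previous snippet can be exercised
# directly on the underlying pyspiel game, without the RL environment
# wrapper. A minimal sketch playing one round of matrix_pd with fixed
# actions (the chosen actions are arbitrary):

import pyspiel

game = pyspiel.load_game("matrix_pd")
state = game.new_initial_state()
print(game.num_players())            # 2
print(state.is_simultaneous_node())  # True: matrix games move simultaneously
print(state.legal_actions(0))        # actions available to player 0
state.apply_actions([0, 1])          # one action per player
print(state.is_terminal())           # True: the game is one-shot
print(state.returns())               # final payoffs for both players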
def main(_): game = "tic_tac_toe" num_players = 2 env = rl_environment.Environment(game) num_actions = env.action_spec()["num_actions"] agents = [ tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # 1. Train the agents training_episodes = FLAGS.num_episodes for cur_episode in range(training_episodes): if cur_episode % int(1e4) == 0: win_rates = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Starting episode %s, win_rates %s", cur_episode, win_rates) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) time_step = env.step([agent_output.action]) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) if not FLAGS.iteractive_play: return # 2. Play from the command line against the trained agent. human_player = 1 while True: logging.info("You are playing as %s", "O" if human_player else "X") time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == human_player: agent_out = agents[human_player].step(time_step, is_evaluation=True) logging.info("\n%s", agent_out.probs.reshape((3, 3))) logging.info("\n%s", pretty_board(time_step)) action = command_line_action(time_step) else: agent_out = agents[1 - human_player].step(time_step, is_evaluation=True) action = agent_out.action time_step = env.step([action]) logging.info("\n%s", pretty_board(time_step)) logging.info("End of game!") if time_step.rewards[human_player] > 0: logging.info("You win") elif time_step.rewards[human_player] < 0: logging.info("You lose") else: logging.info("Draw") # Switch order of players human_player = 1 - human_player
def main(_): game = "kuhn_poker" num_players = 2 env = rl_environment.Environment(game) num_actions = env.action_spec()["num_actions"] agents = [ tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # 1. Train the agents if FLAGS.should_train: training_episodes = FLAGS.num_episodes for cur_episode in range(training_episodes): if cur_episode % int(1e4) == 0: win_rates = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Starting episode %s, win_rates %s", cur_episode, win_rates) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) time_step = env.step([agent_output.action]) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) if not FLAGS.iteractive_play: return # 2. Play from the command line against the trained agent. if FLAGS.should_play: # Pretty print state player_1 = 0 while True: time_step = env.reset() pretty_print_state(env) while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == player_1: agent_out = agents[player_1].step(time_step, is_evaluation=True) logging.info("Pick action for player %s", player_id) action = command_line_action(env, time_step) else: agent_out = agents[1 - player_1].step(time_step, is_evaluation=True) logging.info("Pick action for player %s", player_id) #action = command_line_action(env, time_step) action = agent_out.action logging.info("Agent action: %s", action) time_step = env.step([action]) logging.info("Rewards: Player_0 %s | Player_1 %s", time_step.rewards[player_1], time_step.rewards[1 - player_1]) logging.info("End of game!")