def main(_):
  env = rl_environment.Environment(FLAGS.game)
  num_players = env.num_players
  num_actions = env.action_spec()["num_actions"]

  agents = []
  if FLAGS.epsilon_schedule is not None:
    for idx in range(num_players):
      agents.append(
          tabular_qlearner.QLearner(
              player_id=idx,
              num_actions=num_actions,
              epsilon_schedule=create_epsilon_schedule(FLAGS.epsilon_schedule)))
  else:
    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

  # 1. Train the agents
  training_episodes = FLAGS.num_train_episodes
  for cur_episode in range(training_episodes):
    if cur_episode % int(FLAGS.eval_freq) == 0:
      avg_rewards = eval_agents(env, agents, FLAGS.num_eval_episodes)
      print("Training episodes: {}, Avg rewards: {}".format(
          cur_episode, avg_rewards))
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      time_step = env.step([agent_output.action])

    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)
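The create_epsilon_schedule and eval_agents helpers referenced above are not shown. A minimal sketch of what they might look like, built only on the public rl_tools API and the same TimeStep loop, is below; the schedule constants are assumptions.

import numpy as np

from open_spiel.python import rl_tools


def create_epsilon_schedule(schedule_str):
  """Illustrative mapping from a flag string to an rl_tools schedule."""
  if schedule_str == "constant":
    return rl_tools.ConstantSchedule(0.2)
  # Assume any other value requests a linear decay over training.
  return rl_tools.LinearSchedule(0.5, 0.05, int(1e6))


def eval_agents(env, agents, num_episodes):
  """Plays greedy (is_evaluation=True) episodes; returns mean rewards per player."""
  rewards = np.zeros(len(agents))
  for _ in range(num_episodes):
    time_step = env.reset()
    episode_rewards = np.zeros(len(agents))
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      episode_rewards += np.asarray(time_step.rewards)
    rewards += episode_rewards
  return rewards / num_episodes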
Example #2
def main_loop(unused_arg):
  """Trains a tabular qlearner agent in the cliff walking environment."""
  env = cliff_walking.Environment(width=5, height=3)
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes
  eval_interval = 50

  agent = tabular_qlearner.QLearner(
      player_id=0, step_size=0.05, num_actions=num_actions)

  # Train the agent
  for ep in range(train_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)
    # Episode is over, step agent with final info state.
    agent.step(time_step)

    if ep and ep % eval_interval == 0:
      logging.info("-" * 80)
      logging.info("Episode %s", ep)
      logging.info("Last loss: %s", agent.loss)
      avg_return = eval_agent(env, agent, 100)
      logging.info("Avg return: %s", avg_return)
Example #3
def create_training_agents(num_players, sess, num_actions, info_state_size,
                           hidden_layers_sizes):
    """Create the agents we want to use for learning."""
    if FLAGS.learner == "qlearning":
        # pylint: disable=g-complex-comprehension
        return [
            tabular_qlearner.QLearner(
                player_id=idx,
                num_actions=num_actions,
                # step_size=0.02,
                step_size=0.1,
                # epsilon_schedule=rl_tools.ConstantSchedule(0.5),
                epsilon_schedule=rl_tools.LinearSchedule(0.5, 0.2, 1000000),
                discount_factor=0.99) for idx in range(num_players)
        ]
    elif FLAGS.learner == "dqn":
        # pylint: disable=g-complex-comprehension
        return [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    discount_factor=0.99,
                    epsilon_start=0.5,
                    epsilon_end=0.1,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
    else:
        raise RuntimeError("Unknown learner")
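For context, a hypothetical call site for create_training_agents could look like the following; FLAGS.game and FLAGS.hidden_layers_sizes are assumed flag names rather than part of the original snippet.

import tensorflow.compat.v1 as tf

from open_spiel.python import rl_environment

env = rl_environment.Environment(FLAGS.game)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
    agents = create_training_agents(
        num_players=env.num_players,
        sess=sess,
        num_actions=num_actions,
        info_state_size=info_state_size,
        hidden_layers_sizes=[int(l) for l in FLAGS.hidden_layers_sizes])
    sess.run(tf.global_variables_initializer())
    # ... training loop as in the other examples ...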
Example #4
def main():
    env = cliff_walking.Environment(width=12, height=4)
    num_actions = env.action_spec()["num_actions"]

    learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    for learning_rate in learning_rates:
        agent = tabular_qlearner.QLearner(player_id=0,
                                          step_size=learning_rate,
                                          num_actions=num_actions)

        train(env, agent, 100)
        avg_reward = evaluate(env, agent, 50)

        print(avg_reward)
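The train and evaluate helpers driving this learning-rate sweep are not shown; minimal versions consistent with the call signatures above (assumptions, not the originals) might be:

def train(env, agent, num_episodes):
    """Runs Q-learning updates for num_episodes in a single-agent env."""
    for _ in range(num_episodes):
        time_step = env.reset()
        while not time_step.last():
            agent_output = agent.step(time_step)
            time_step = env.step([agent_output.action])
        agent.step(time_step)  # Final update with the terminal state.


def evaluate(env, agent, num_episodes):
    """Greedy rollouts; returns the average undiscounted return."""
    total = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        while not time_step.last():
            agent_output = agent.step(time_step, is_evaluation=True)
            time_step = env.step([agent_output.action])
            total += time_step.rewards[0]
    return total / num_episodes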
Example #5
def main():
    print("finished")
    game = "matrix_pd"
    num_players = 2

    env = rl_environment.Environment(game)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    training_episodes = FLAGS.num_episodes
    for cur_episode in range(training_episodes):
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            time_step = env.step([agent_output.action])

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)
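    # Follow-up (not in the original example): one greedy rollout with
    # is_evaluation=True shows what the trained prisoner's-dilemma agents
    # converged to, without doing any further Q-updates.
    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step, is_evaluation=True)
        time_step = env.step([agent_output.action])
    print("Greedy joint rewards:", time_step.rewards)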
Example #6
def main(_):
    game = "tic_tac_toe"
    num_players = 2
    env = rl_environment.Environment(game)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [32, 32]
    replay_buffer_capacity = int(1e4)
    train_episodes = FLAGS.num_episodes
    loss_report_interval = 1000

    with tf.Session() as sess:
        dqn_agent = dqn.DQN(sess,
                            player_id=0,
                            state_representation_size=state_size,
                            num_actions=num_actions,
                            hidden_layers_sizes=hidden_layers_sizes,
                            replay_buffer_capacity=replay_buffer_capacity)
        tabular_q_agent = tabular_qlearner.QLearner(player_id=1,
                                                    num_actions=num_actions)
        agents = [dqn_agent, tabular_q_agent]

        sess.run(tf.global_variables_initializer())

        # Train agent
        for ep in range(train_episodes):
            if ep and ep % loss_report_interval == 0:
                logging.info("[%s/%s] DQN loss: %s", ep, train_episodes,
                             agents[0].loss)
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        # Evaluate against random agent
        random_agents = [
            random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
            for idx in range(num_players)
        ]
        r_mean = eval_against_random_bots(env, agents, random_agents, 1000)
        logging.info("Mean episode rewards: %s", r_mean)

        if not FLAGS.iteractive_play:
            return

        # Play from the command line against the trained DQN agent.
        human_player = 1
        while True:
            logging.info("You are playing as %s", "O" if human_player else "X")
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == human_player:
                    agent_out = agents[human_player].step(time_step,
                                                          is_evaluation=True)
                    logging.info("\n%s", agent_out.probs.reshape((3, 3)))
                    logging.info("\n%s", pretty_board(time_step))
                    action = command_line_action(time_step)
                else:
                    agent_out = agents[1 - human_player].step(
                        time_step, is_evaluation=True)
                    action = agent_out.action
                time_step = env.step([action])

            logging.info("\n%s", pretty_board(time_step))

            logging.info("End of game!")
            if time_step.rewards[human_player] > 0:
                logging.info("You win")
            elif time_step.rewards[human_player] < 0:
                logging.info("You lose")
            else:
                logging.info("Draw")
            # Switch order of players
            human_player = 1 - human_player
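eval_against_random_bots is used above but not reproduced; a minimal sketch, assuming a turn-based game such as tic_tac_toe, is:

import numpy as np


def eval_against_random_bots(env, trained_agents, random_agents, num_episodes):
    """Mean episode reward of each trained agent with all opponents random."""
    num_players = len(trained_agents)
    sum_episode_rewards = np.zeros(num_players)
    for player_pos in range(num_players):
        # The trained agent sits at player_pos; every other seat plays randomly.
        cur_agents = list(random_agents)
        cur_agents[player_pos] = trained_agents[player_pos]
        for _ in range(num_episodes):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = cur_agents[player_id].step(
                    time_step, is_evaluation=True)
                time_step = env.step([agent_output.action])
            sum_episode_rewards[player_pos] += time_step.rewards[player_pos]
    return sum_episode_rewards / num_episodes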
def run_experiment(num_players, env, payoffs, centralized):
  """Run the experiments."""
  num_states = FLAGS.num_states
  num_messages = FLAGS.num_messages
  num_actions = env.action_spec()["num_actions"]

  # Results to store
  num_runs = FLAGS.num_runs
  training_episodes = FLAGS.num_episodes
  log_interval = FLAGS.log_interval
  rewards = np.zeros((num_runs, training_episodes // log_interval))
  opts = np.zeros((num_runs, training_episodes // log_interval))
  converge_point = np.zeros((num_states, num_states))
  percent_opt = 0

  # Repeat the experiment num_runs times
  for i in range(num_runs):
    eps_schedule = rl_tools.LinearSchedule(
        FLAGS.eps_init, FLAGS.eps_final, FLAGS.eps_decay_steps *
        2)  # *2 since there are 2 agent steps per episode

    agents = [
        # pylint: disable=g-complex-comprehension
        tabular_qlearner.QLearner(
            player_id=idx,
            num_actions=num_actions,
            step_size=FLAGS.step_size,
            epsilon_schedule=eps_schedule,
            centralized=centralized) for idx in range(num_players)
    ]

    # 1. Train the agents
    for cur_episode in range(training_episodes):
      time_step = env.reset()
      # Find cur_state for logging. See lewis_signaling.cc for info_state
      # details.
      cur_state = time_step.observations["info_state"][0][3:].index(1)
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        time_step = env.step([agent_output.action])

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

      # Store rewards
      reward = time_step.rewards[0]
      max_reward = payoffs[cur_state].max()
      cur_idx = (i, cur_episode // log_interval)
      rewards[cur_idx] += reward / log_interval
      opts[cur_idx] += np.isclose(reward, max_reward) / log_interval

    base_info_state0 = [1.0, 0.0, 0.0] + [0.0] * num_states
    base_info_state1 = [0.0, 1.0, 0.0] + [0.0] * num_states
    if centralized:
      base_info_state0 = [base_info_state0, base_info_state0.copy()]
      base_info_state1 = [base_info_state1, base_info_state1.copy()]

    for s in range(num_states):
      info_state0 = copy.deepcopy(base_info_state0)
      if centralized:
        info_state0[0][3 + s] = 1.0
      else:
        info_state0[3 + s] = 1.0
      # pylint: disable=protected-access
      m, _ = agents[0]._epsilon_greedy(
          str(info_state0), np.arange(num_messages), 0)
      info_state1 = copy.deepcopy(base_info_state1)
      if centralized:
        info_state1[0][3 + s] = 1.0
        info_state1[1][3 + m] = 1.0
      else:
        info_state1[3 + m] = 1.0
      a, _ = agents[1]._epsilon_greedy(
          str(info_state1), np.arange(num_states), 0)
      converge_point[s, a] += 1
      best_act = payoffs[s].argmax()
      percent_opt += int(a == best_act) / num_runs / num_states
  return rewards, opts, converge_point, percent_opt
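A hypothetical driver for run_experiment is sketched below; the "lewis_signaling" parameter names and the flattened payoff string are assumptions, so check the game implementation (lewis_signaling.cc) before relying on them.

import numpy as np

from open_spiel.python import rl_environment


def run_default_experiment():
  """Illustrative driver for run_experiment; parameter names are assumptions."""
  num_states = FLAGS.num_states
  payoffs = np.eye(num_states)  # Assumed: only exact state/action matches pay off.
  env_configs = {
      "num_states": num_states,
      "num_messages": FLAGS.num_messages,
      "payoffs": ",".join(str(x) for x in payoffs.flatten()),
  }
  env = rl_environment.Environment("lewis_signaling", **env_configs)
  return run_experiment(
      num_players=2, env=env, payoffs=payoffs, centralized=False)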
'''
game.new_initial_state()
game.num_players()
state.is_terminal()
state.is_simultaneous_node()
state.is_chance_node()
state.legal_actions(int playerId)
state.apply_actions([a1, a2, ...])   | one action per playerId
state.returns()    | returns the result of the game

'''

env = rl_environment.Environment("matrix_pd")
num_actions = env.action_spec()["num_actions"]
q_agent = tabular_qlearner.QLearner(0, num_actions)
ra2 = random_agent.RandomAgent(1, num_actions)
players = [q_agent, ra2]

for cur_episode in range(5000):
    if cur_episode % 1000 == 0:
        print("cur_episode: {}".format(cur_episode))

    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = players[player_id].step(time_step)
        time_step = env.step([agent_output.action])

    for agent in players:
        agent.step(time_step)
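As a companion to the API calls listed in the comment block above, the sketch below (not part of the original snippet) plays one random episode of matrix_pd through the low-level pyspiel State interface instead of rl_environment:

import random

import pyspiel

game = pyspiel.load_game("matrix_pd")
state = game.new_initial_state()
while not state.is_terminal():
    if state.is_simultaneous_node():
        # Matrix games are simultaneous-move: pick one legal action per player.
        state.apply_actions([
            random.choice(state.legal_actions(pid))
            for pid in range(game.num_players())
        ])
    else:
        state.apply_action(random.choice(state.legal_actions()))
print("returns:", state.returns())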
def main(_):
    game = "tic_tac_toe"
    num_players = 2

    env = rl_environment.Environment(game)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # 1. Train the agents
    training_episodes = FLAGS.num_episodes
    for cur_episode in range(training_episodes):
        if cur_episode % int(1e4) == 0:
            win_rates = eval_against_random_bots(env, agents, random_agents,
                                                 1000)
            logging.info("Starting episode %s, win_rates %s", cur_episode,
                         win_rates)
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            time_step = env.step([agent_output.action])

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)

    if not FLAGS.iteractive_play:
        return

    # 2. Play from the command line against the trained agent.
    human_player = 1
    while True:
        logging.info("You are playing as %s", "O" if human_player else "X")
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == human_player:
                agent_out = agents[human_player].step(time_step,
                                                      is_evaluation=True)
                logging.info("\n%s", agent_out.probs.reshape((3, 3)))
                logging.info("\n%s", pretty_board(time_step))
                action = command_line_action(time_step)
            else:
                agent_out = agents[1 - human_player].step(time_step,
                                                          is_evaluation=True)
                action = agent_out.action
            time_step = env.step([action])

        logging.info("\n%s", pretty_board(time_step))

        logging.info("End of game!")
        if time_step.rewards[human_player] > 0:
            logging.info("You win")
        elif time_step.rewards[human_player] < 0:
            logging.info("You lose")
        else:
            logging.info("Draw")
        # Switch order of players
        human_player = 1 - human_player
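pretty_board and command_line_action are referenced above but not shown; the sketches below assume the usual tic-tac-toe info-state layout (three 9-cell planes: empty, X, O) and may differ from the originals.

import numpy as np


def pretty_board(time_step):
    """Renders the board from player 0's info state as a 3x3 character array."""
    info_state = time_step.observations["info_state"][0]
    x_locations = np.nonzero(info_state[9:18])[0]
    o_locations = np.nonzero(info_state[18:])[0]
    board = np.full(3 * 3, ".")
    board[x_locations] = "X"
    board[o_locations] = "0"
    return board.reshape((3, 3))


def command_line_action(time_step):
    """Asks the human for a legal action id until a valid one is entered."""
    current_player = time_step.observations["current_player"]
    legal_actions = time_step.observations["legal_actions"][current_player]
    action = -1
    while action not in legal_actions:
        print("Choose an action from {}:".format(legal_actions))
        try:
            action = int(input())
        except ValueError:
            continue
    return action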
Example #10
def main(_):
    game = "kuhn_poker"
    num_players = 2

    env = rl_environment.Environment(game)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # 1. Train the agents
    if FLAGS.should_train:
        training_episodes = FLAGS.num_episodes
        for cur_episode in range(training_episodes):
            if cur_episode % int(1e4) == 0:
                win_rates = eval_against_random_bots(env, agents,
                                                     random_agents, 1000)
                logging.info("Starting episode %s, win_rates %s", cur_episode,
                             win_rates)
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                time_step = env.step([agent_output.action])

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        if not FLAGS.iteractive_play:
            return

    # 2. Play from the command line against the trained agent.
    if FLAGS.should_play:
        # Pretty print state

        player_1 = 0
        while True:
            time_step = env.reset()
            pretty_print_state(env)
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == player_1:
                    agent_out = agents[player_1].step(time_step,
                                                      is_evaluation=True)

                    logging.info("Pick action for player %s", player_id)
                    action = command_line_action(env, time_step)
                else:
                    agent_out = agents[1 - player_1].step(time_step,
                                                          is_evaluation=True)

                    logging.info("Pick action for player %s", player_id)
                    #action = command_line_action(env, time_step)

                    action = agent_out.action
                    logging.info("Agent action: %s", action)
                time_step = env.step([action])

            logging.info("Rewards: Player_0 %s | Player_1 %s",
                         time_step.rewards[player_1],
                         time_step.rewards[1 - player_1])
            logging.info("End of game!")