Example #1
def main(_):
    game = "breakthrough"
    num_players = 2

    env_configs = {"columns": 5, "rows": 5}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    with tf.Session() as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        print(type(agents[0].get_weights()), agents[0].get_weights())
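
The example above constructs a tf.train.Saver but never writes a checkpoint. A minimal sketch of how checkpointing could be wired in (standard TF1 Saver calls; the directory name below is illustrative, not from the original):

import os

def save_and_restore(sess, saver, checkpoint_dir="/tmp/breakthrough_dqn", step=0):
    # Write all graph variables to a checkpoint file and return its path.
    path = saver.save(sess, os.path.join(checkpoint_dir, "dqn"), global_step=step)
    print("Saved checkpoint to:", path)
    # Restore the same variables later, e.g. before evaluation.
    saver.restore(sess, path)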
Example #2
def main(_):
  game = FLAGS.game
  num_players = 1
  games, rewards, _, _ = mst.game_params(FLAGS.num_nodes)

  env_configs = games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3 #env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)  
    
  # random agents for evaluation
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 1)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", rewards[0])

      #env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = agents[player_id].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [agent_output.action for agent_output in agents_output]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    print("Actual MST: ", rewards)
Example #3
def main(_):
  game = "leduc_poker"
  num_players = 2
  env = rl_environment.Environment(game)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]

  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }



  with tf.Session() as sess:
    agents = [
        nfsp.NFSP(sess, idx, state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]

    # for agent in agents[2:]:
    #     agent.restore("/home/benedikt/Dokumente/Uni/HCI/openspiel_saves/half_trained")

    for agent in agents:
      agent.restore(FLAGS.checkpoint_dir)
    # agents[1].restore("/home/benedikt/Dokumente/Uni/HCI/openspiel_saves/half_trained")




    # Evaluate against random agent
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players)
    ]

    r_mean = evaluateBotAgainstBot(env, agents[0], agents[1], 10000)
    logging.info("Mean episode rewards: %s", r_mean)

    #analyzeHistory()

    #r_mean = eval_against_random_bots(env, agents, random_agents, 10000)
    #logging.info("Mean episode rewards: %s", r_mean)

    '''if not FLAGS.iteractive_play:
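
evaluateBotAgainstBot above is user code that is not included in the snippet. A minimal head-to-head evaluation along those lines might look like this (the name, signature and return shape are assumptions):

import numpy as np

def evaluateBotAgainstBot(env, agent0, agent1, num_episodes):
  """Mean episode reward per player over `num_episodes` of agent0 vs. agent1."""
  agents = [agent0, agent1]
  sum_rewards = np.zeros(2)
  for _ in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
    sum_rewards += np.asarray(time_step.rewards)
  return sum_rewards / num_episodes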
Example #4
def main(_):
    game = "breakthrough"
    num_players = 2

    env_configs = {"columns": 5, "rows": 5}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    with tf.Session() as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        sess.run(tf.global_variables_initializer())

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  1000)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
            if (ep + 1) % FLAGS.save_every == 0:
                for agent in agents:
                    agent.save(FLAGS.checkpoint_dir)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if env.is_turn_based:
                    agent_output = agents[player_id].step(time_step)
                    action_list = [agent_output.action]
                else:
                    agents_output = [agent.step(time_step) for agent in agents]
                    action_list = [
                        agent_output.action for agent_output in agents_output
                    ]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #5
def main_loop(unused_arg):
    """RL main loop example."""
    logging.info("Registered games: %s", rl_environment.registered_games())
    logging.info("Creating game %s", FLAGS.game)

    env_configs = mst.params(FLAGS.num_nodes)
    env = rl_environment.Environment(FLAGS.game, **env_configs)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        random_agent.RandomAgent(player_id=i, num_actions=num_actions)
        for i in range(FLAGS.num_players)
    ]

    logging.info("Env specs: %s", env.observation_spec())
    logging.info("Action specs: %s", env.action_spec())

    for cur_episode in range(FLAGS.num_episodes):
        logging.info("Starting episode %s", cur_episode)
        time_step = env.reset()
        while not time_step.last():
            pid = time_step.observations["current_player"]

            if env.is_turn_based:
                agent_output = agents[pid].step(time_step)
                action_list = [agent_output.action]
            else:
                agents_output = [agent.step(time_step) for agent in agents]
                action_list = [
                    agent_output.action for agent_output in agents_output
                ]

            print_iteration(time_step, pid, action_list)
            time_step = env.step(action_list)

        # Episode is over, step all agents with final state.
        for agent in agents:
            agent.step(time_step)

        # Print final state of end game.
        for pid in range(env.num_players):
            print_iteration(time_step, pid)
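
print_iteration is not defined in this snippet; a rough sketch in the spirit of the OpenSpiel rl_example script (the exact fields logged are an assumption) could be:

def print_iteration(time_step, player_id, action_list=None):
    """Log the observation, reward and chosen action(s) for one step."""
    obs = time_step.observations
    logging.info("Player: %s", player_id)
    logging.info("Info state: %s", obs["info_state"][player_id])
    if time_step.rewards is not None:
        logging.info("Reward: %s", time_step.rewards[player_id])
    if action_list is not None:
        logging.info("Actions taken: %s", action_list)
    logging.info("-" * 80)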
  def test_step(self):
    agent = random_agent.RandomAgent(player_id=0, num_actions=10)

    legal_actions = [0, 2, 3, 5]
    time_step = rl_environment.TimeStep(
        observations={
            "info_state": [[0], [1]],
            "legal_actions": [legal_actions, []],
            "current_player": 0
        },
        rewards=None,
        discounts=None,
        step_type=None)
    agent_output = agent.step(time_step)

    self.assertIn(agent_output.action, legal_actions)
    self.assertAlmostEqual(sum(agent_output.probs), 1.0)
    self.assertEqual(
        len([x for x in agent_output.probs if x > 0]), len(legal_actions))
    self.assertTrue(
        np.allclose(agent_output.probs[legal_actions], [.25] * 4, atol=1e-5))
Example #7
def main(_):
    game = "tic_tac_toe"
    num_players = 2
    env = rl_environment.Environment(game)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [32, 32]
    replay_buffer_capacity = int(1e4)
    train_episodes = FLAGS.num_episodes
    loss_report_interval = 1000

    with tf.Session() as sess:
        dqn_agent = dqn.DQN(sess,
                            player_id=0,
                            state_representation_size=state_size,
                            num_actions=num_actions,
                            hidden_layers_sizes=hidden_layers_sizes,
                            replay_buffer_capacity=replay_buffer_capacity)
        tabular_q_agent = tabular_qlearner.QLearner(player_id=1,
                                                    num_actions=num_actions)
        agents = [dqn_agent, tabular_q_agent]

        sess.run(tf.global_variables_initializer())

        # Train agent
        for ep in range(train_episodes):
            if ep and ep % loss_report_interval == 0:
                logging.info("[%s/%s] DQN loss: %s", ep, train_episodes,
                             agents[0].loss)
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        # Evaluate against random agent
        random_agents = [
            random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
            for idx in range(num_players)
        ]
        r_mean = eval_against_random_bots(env, agents, random_agents, 1000)
        logging.info("Mean episode rewards: %s", r_mean)

        if not FLAGS.iteractive_play:
            return

        # Play from the command line against the trained DQN agent.
        human_player = 1
        while True:
            logging.info("You are playing as %s", "O" if human_player else "X")
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == human_player:
                    agent_out = agents[human_player].step(time_step,
                                                          is_evaluation=True)
                    logging.info("\n%s", agent_out.probs.reshape((3, 3)))
                    logging.info("\n%s", pretty_board(time_step))
                    action = command_line_action(time_step)
                else:
                    agent_out = agents[1 - human_player].step(
                        time_step, is_evaluation=True)
                    action = agent_out.action
                time_step = env.step([action])

            logging.info("\n%s", pretty_board(time_step))

            logging.info("End of game!")
            if time_step.rewards[human_player] > 0:
                logging.info("You win")
            elif time_step.rewards[human_player] < 0:
                logging.info("You lose")
            else:
                logging.info("Draw")
            # Switch order of players
            human_player = 1 - human_player
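
pretty_board and command_line_action come from the surrounding example file and are not shown here. Rough sketches (the tic_tac_toe info_state plane layout assumed below, 9 entries per plane, is an assumption):

import sys
import numpy as np

def pretty_board(time_step):
    """Render the board from player 0's info_state."""
    info_state = time_step.observations["info_state"][0]
    x_locations = np.nonzero(info_state[9:18])[0]
    o_locations = np.nonzero(info_state[18:])[0]
    board = np.full(3 * 3, ".")
    board[x_locations] = "X"
    board[o_locations] = "O"
    return board.reshape((3, 3))

def command_line_action(time_step):
    """Prompt until the user enters one of the current legal actions."""
    current_player = time_step.observations["current_player"]
    legal_actions = time_step.observations["legal_actions"][current_player]
    action = -1
    while action not in legal_actions:
        print("Choose an action from {}:".format(legal_actions))
        sys.stdout.flush()
        try:
            action = int(input())
        except ValueError:
            continue
    return action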
def main(argv):
    del argv

    # calculate state values:
    num_cards = 5
    values = solve_goofspiel(num_cards)

    # setup environment:
    game = pyspiel.load_game(
        'goofspiel(imp_info=False,num_cards={})'.format(num_cards))
    env = rl_environment.Environment(game)
    num_actions = env.action_spec()["num_actions"]

    # define agent:
    value_it_agent = ValueItAgent(0, num_actions, values)
    rand_agent = random_agent.RandomAgent(player_id=1, num_actions=num_actions)

    # value-iteration agent vs. random agent:
    print("=============================")
    num_episodes = 1000
    wins = 0
    draws = 0
    logging.info(
        "Playing goofspiel with {} cards over {} episodes. value_it_agent (p0) vs. random_agent (p1)"
        .format(num_cards, num_episodes))
    for i in range(num_episodes):

        logging.info("episode {}".format(i))
        time_step = env.reset()

        while not time_step.last():

            # print current state:
            curr_state = env.get_state
            # print("Next turn. Current state is: ")
            # print(str(curr_state))
            logging.info(str(curr_state))

            # value it player:
            agent_out = value_it_agent.step(time_step, curr_state)
            logging.info("\n%s", agent_out.probs)
            p0_action = agent_out.action
            logging.info('Agent 0 played: {}'.format(p0_action + 1))

            # random player:
            agent_out = rand_agent.step(time_step)
            p1_action = agent_out.action
            logging.info('Agent 1 played: {}'.format(p1_action + 1))

            # print(time_step.observations['info_state'][0])
            # print(time_step.observations['info_state'][1])
            # print(len(time_step.observations['info_state'][0]))
            # print(env.observation_spec())

            # state = time_step.observations['info_state'][0]
            # state = np.asarray(state)
            #
            # P_ob = np.where(state[points_ob_b:points_op_b] == 1)[0][0]
            # P_op = np.where(state[points_op_b:seq_b] == 1)[0][0]
            # logging.info('Points: P%d = %d P%d = %d', player_id, P_ob, 0 if player_id == 1 else 1, P_op)
            #
            # which = num_cards - np.sum(state[np.size(state) - num_cards:])
            # curr = np.where(state[int(seq_b + which * num_cards): int(seq_b + (which + 1) * num_cards)] == 1)[0][0] + 1
            # logging.info('Point Card (Middle Card): %d', curr)
            #

            time_step = env.step([p0_action, p1_action])

        # logging.info("\n%s", pretty_board(time_step))

        logging.info("End of game!")
        if time_step.rewards[1] > 0:
            logging.info("Random agent (p1) wins")
        elif time_step.rewards[1] < 0:
            logging.info("Value-iteration agent (p0) wins")
        else:
            logging.info("Draw")

        if time_step.rewards[0] > 0:
            wins += 1
        if time_step.rewards[0] == 0:
            draws += 1

        p0_win = wins / (i + 1)  # win rate over the episodes played so far

        logging.info("Running summary: ==============")
        logging.info("Wins: {}, Draws: {}, Estimated pwin: {}".format(
            wins, draws, p0_win))
Example #9
def main(_):
    ## LOAD GAMES
    # print(pyspiel.registered_games())
    # games = [pyspiel.load_game("matrix_sh"), pyspiel.load_game("matrix_rps"), pyspiel.load_game("matrix_mp"), pyspiel.load_game("matrix_pd"),  _battle_of_the_sexes_easy()]
    games = [
        _battle_of_the_sexes_easy(),
        pyspiel.load_game("matrix_mp"),
        pyspiel.load_game("matrix_sh"),
        pyspiel.load_game("matrix_pd"),
        _biased_rock_paper_scissors_easy()
    ]
    # Best to do this with 4-5 games; for other dynamics, change them in
    # utils_matrix.py::_phaseplot
    _phaseplot(games, bstreamplot=True)
    # Best to do this with 4-5 games and 5 or 6 k-values; for other dynamics,
    # change them in utils_matrix.py::_dynamics_kplot
    _dynamics_kplot([1, 2, 3, 5, 10, 25], games)

    for game in games:
        ## GAME INFO
        # print(game.get_type().long_name.upper())
        # state = game.new_initial_state()
        # print(state)
        # print("-"*80)

        population_histories = []
        player1_probs = []
        player2_probs = []
        for _ in range(FLAGS.pop_iter):
            env = rl_environment.Environment(game=game)
            num_actions = env.action_spec()["num_actions"]
            agents = []
            for idx in range(env.num_players):
                if FLAGS.learner == "eps":
                    agents.append(
                        epsilongreedy_QLearner.EpsilonGreedy_QLearner(
                            player_id=idx,
                            num_actions=num_actions,
                            step_size=FLAGS.lr,
                            discount_factor=1,
                            epsilon=FLAGS.expl,
                            epsilon_annealing=FLAGS.expl_ann,
                            epsilon_min=FLAGS.expl_min))
                elif FLAGS.learner == "boltz":
                    agents.append(
                        boltzmann_QLearner.Boltzman_QLearner(
                            player_id=idx,
                            num_actions=num_actions,
                            step_size=FLAGS.lr,
                            discount_factor=1,
                            temperature=FLAGS.expl,
                            temperature_annealing=FLAGS.expl_ann,
                            temperature_min=FLAGS.expl_min))
                elif FLAGS.learner == "faq":
                    agents.append(
                        boltzmann_FAQLeaner.Boltzmann_FAQLearner(
                            player_id=idx,
                            num_actions=num_actions,
                            step_size=FLAGS.lr,
                            discount_factor=1,
                            temperature=FLAGS.expl,
                            temperature_annealing=FLAGS.expl_ann,
                            temperature_min=FLAGS.expl_min,
                            beta=FLAGS.beta))
                else:
                    agents.append(
                        boltzmann_LFAQLearner.Boltzmann_LFAQLearner(
                            player_id=idx,
                            num_actions=num_actions,
                            step_size=FLAGS.lr,
                            discount_factor=1,
                            temperature=FLAGS.expl,
                            temperature_annealing=FLAGS.expl_ann,
                            temperature_min=FLAGS.expl_min,
                            beta=FLAGS.beta,
                            k=FLAGS.k))

            random_agents = [
                random_agent.RandomAgent(player_id=idx,
                                         num_actions=num_actions)
                for idx in range(env.num_players)
            ]
            ## PLAY BEFORE TRAIN
            # print("BEFORE TRAINING: 1 episode of self-play")
            # play_episode(env, agents)

            ## TRAIN
            history = train_qlearning(
                agents, env, FLAGS.train_iter,
                random_agents)  # needs to be high for LFAQ
            population_histories.append(history)
            agents_output, _ = _env_play_episode(env, agents, evaluating=True)
            player1_probs.append(agents_output[0].probs)
            player2_probs.append(agents_output[1].probs)

            ## PLAY AFTER TRAIN
            # print("AFTER TRAINING: 1 episode of self-play")
            # play_episode(env, agents)
            # print("-"*80)
        _trajectoryplot(game, population_histories, FLAGS.k)

        for i in range(len(player1_probs)):
            print(f"\t\tPlayer 1\t Player 2")
            print(
                f"{env.get_state.action_to_string(0, 0)}:\t{player1_probs[i][0]:.2f}\t\t{player2_probs[i][0]:.2f}"
            )
            print(
                f"{env.get_state.action_to_string(0, 1)}:\t{player1_probs[i][1]:.2f}\t\t{player2_probs[i][1]:.2f}"
            )
            print()
Example #10
def main(_):
    game = "skat"
    num_players = 3

    env_configs = {}
    env = rl_environment.Environment(game, **env_configs)
    observation_tensor_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    with tf.Session() as sess:
        summaries_dir = os.path.join(FLAGS.checkpoint_dir, "random_eval")
        summary_writer = tf.summary.FileWriter(summaries_dir,
                                               tf.get_default_graph())
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=observation_tensor_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  FLAGS.num_eval_games)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
                for i in range(num_players):
                    summary = tf.Summary()
                    summary.value.add(tag="mean_reward/random_{}".format(i),
                                      simple_value=r_mean[i])
                    summary_writer.add_summary(summary, ep)
                summary_writer.flush()
                saver.save(sess, FLAGS.checkpoint_dir, ep)

            time_step = env.reset()
            # Randomize position.
            if FLAGS.randomize_positions:
                positions = random.sample(range(len(agents)), len(agents))
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if FLAGS.randomize_positions:
                    position = positions[player_id]
                    agents[position].player_id = player_id
                else:
                    position = player_id
                agent_output = agents[position].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #11
def main(_):
    np.random.seed(FLAGS.seed)
    tf.random.set_random_seed(FLAGS.seed)

    num_players = FLAGS.num_players

    env = rl_environment.Environment(FLAGS.game, include_full_state=True)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # Exploitee agents
    if FLAGS.exploitee == "first":
        exploitee_agents = [
            FirstActionAgent(idx, num_actions) for idx in range(num_players)
        ]
    elif FLAGS.exploitee == "random":
        exploitee_agents = [
            random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
            # FirstActionAgent(player_id=idx, num_actions=num_actions)
            for idx in range(num_players)
        ]
    else:
        raise RuntimeError("Unknown exploitee")

    rolling_averager = RollingAverage(FLAGS.window_size)
    rolling_averager_p0 = RollingAverage(FLAGS.window_size)
    rolling_averager_p1 = RollingAverage(FLAGS.window_size)
    rolling_value = 0
    total_value = 0
    total_value_n = 0

    with tf.Session() as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        learning_agents = create_training_agents(num_players, sess,
                                                 num_actions, info_state_size,
                                                 hidden_layers_sizes)
        sess.run(tf.global_variables_initializer())

        print("Starting...")

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_fixed_bots(env, learning_agents,
                                                 exploitee_agents,
                                                 FLAGS.eval_episodes)
                value = r_mean[0] + r_mean[1]
                rolling_averager.add(value)
                rolling_averager_p0.add(r_mean[0])
                rolling_averager_p1.add(r_mean[1])
                rolling_value = rolling_averager.mean()
                rolling_value_p0 = rolling_averager_p0.mean()
                rolling_value_p1 = rolling_averager_p1.mean()
                total_value += value
                total_value_n += 1
                avg_value = total_value / total_value_n
                print(("[{}] Mean episode rewards {}, value: {}, " +
                       "rval: {} (p0/p1: {} / {}), aval: {}").format(
                           ep + 1, r_mean, value, rolling_value,
                           rolling_value_p0, rolling_value_p1, avg_value))

            agents_round1 = [learning_agents[0], exploitee_agents[1]]
            agents_round2 = [exploitee_agents[0], learning_agents[1]]

            for agents in [agents_round1, agents_round2]:
                time_step = env.reset()
                while not time_step.last():
                    player_id = time_step.observations["current_player"]
                    if env.is_turn_based:
                        agent_output = agents[player_id].step(time_step)
                        action_list = [agent_output.action]
                    else:
                        agents_output = [
                            agent.step(time_step) for agent in agents
                        ]
                        action_list = [
                            agent_output.action
                            for agent_output in agents_output
                        ]
                    time_step = env.step(action_list)

                # Episode is over, step all agents with final info state.
                for agent in agents:
                    agent.step(time_step)
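
FirstActionAgent and RollingAverage are defined elsewhere in the file this was taken from. Plausible sketches (plain classes with the interfaces the loop above relies on; the details are assumptions):

import collections
import numpy as np
from open_spiel.python import rl_agent

class FirstActionAgent(object):
    """Deterministic baseline exploitee: always plays the first legal action."""

    def __init__(self, player_id, num_actions):
        self._player_id = player_id
        self._num_actions = num_actions

    def step(self, time_step, is_evaluation=False):
        if time_step.last():
            return
        legal_actions = time_step.observations["legal_actions"][self._player_id]
        action = legal_actions[0]
        probs = np.zeros(self._num_actions)
        probs[action] = 1.0
        return rl_agent.StepOutput(action=action, probs=probs)

class RollingAverage(object):
    """Mean over a sliding window of the most recent `size` values."""

    def __init__(self, size):
        self._values = collections.deque(maxlen=size)

    def add(self, value):
        self._values.append(value)

    def mean(self):
        return sum(self._values) / len(self._values) if self._values else 0.0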
'''
game.new_initial_state()
game.num_players()
state.is_terminal()
state.is_simultaneous_node()
state.is_chance_node()
state.legal_actions(int playerId)
state.apply_actions([a1, a2, ...])   | one action per playerId
state.returns()                      | returns the result of the game

'''
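
As a quick illustration of the calls listed above, here is a short sketch on the matrix_pd game used below (simultaneous-move matrix games take one action per player via apply_actions):

import pyspiel

pd_game = pyspiel.load_game("matrix_pd")
pd_state = pd_game.new_initial_state()
print(pd_game.num_players())            # 2
print(pd_state.is_terminal())           # False at the root
print(pd_state.is_simultaneous_node())  # True: both players act at once
print(pd_state.is_chance_node())        # False
print(pd_state.legal_actions(0))        # legal actions for player 0
pd_state.apply_actions([0, 1])          # one action per playerId
print(pd_state.returns())               # payoff vector of the finished game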

env = rl_environment.Environment("matrix_pd")
num_actions = env.action_spec()["num_actions"]
q_agent = tabular_qlearner.QLearner(0, num_actions)
ra2 = random_agent.RandomAgent(1, num_actions)
players = [q_agent, ra2]

for cur_episode in range(5000):
    if (cur_episode % 1000 == 0):
        print("cur_episode: " + str(cur_episode))

    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = players[player_id].step(time_step)
        time_step = env.step([agent_output.action])

    for agent in players:
        agent.step(time_step)
def main_loop(unused_arg):
    """RL main loop example."""
    logging.info("Registered games: %s", rl_environment.registered_games())
    logging.info("Creating game %s", FLAGS.game)

    #env_configs = {"players": FLAGS.num_players} if FLAGS.num_players else {}
    env_configs = {}
    env = rl_environment.Environment(FLAGS.game, **env_configs)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [512, 512]
    replay_buffer_capacity = int(1e4)
    train_episodes = FLAGS.num_episodes
    loss_report_interval = 1000

    logging.info("Env specs: %s", env.observation_spec())
    logging.info("Action specs: %s", env.action_spec())

    with tf.Session() as sess:
        # agents = [
        #     dqn.DQN(  # pylint: disable=g-complex-comprehension
        #         sess,
        #         player_id,
        #         state_representation_size=state_size,
        #         num_actions=num_actions,
        #         #hidden_layers_sizes=[16],
        #         #replay_buffer_capacity=10,
        #         hidden_layers_sizes=hidden_layers_sizes,
        #         replay_buffer_capacity=replay_buffer_capacity,
        #         batch_size=128) for player_id in range(3)
        # ]

        dqn_agents = [
            dqn.DQN(  # pylint: disable=g-complex-comprehension
                sess,
                0,
                state_representation_size=state_size,
                num_actions=num_actions,
                #hidden_layers_sizes=[16],
                #replay_buffer_capacity=10,
                hidden_layers_sizes=hidden_layers_sizes,
                replay_buffer_capacity=replay_buffer_capacity,
                batch_size=128)
        ]
        random_agents = [
            random_agent.RandomAgent(player_id=i, num_actions=num_actions)
            for i in range(1, 3)
        ]
        agents = dqn_agents + random_agents
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        latest_checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_dir)
        if latest_checkpoint_path:
            print('Restoring checkpoint: {0}'.format(latest_checkpoint_path))
            saver.restore(sess, latest_checkpoint_path)

        # Train agent
        for ep in range(train_episodes):
            #if ep and ep % loss_report_interval == 0:
            if (ep + 1) % FLAGS.eval_every == 0:
                logging.info("[%s/%s] DQN loss: %s", ep, train_episodes,
                             agents[0].loss)
                saver.save(sess, FLAGS.checkpoint_dir, ep)

            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                #agent_output = [agent.step(time_step) for agent in agents]
                #time_step = env.step([agent_output[current_player].action])
                if env.is_turn_based:
                    agent_output = agents[current_player].step(time_step)
                    action_list = [agent_output.action]
                else:
                    agents_output = [agent.step(time_step) for agent in agents]
                    action_list = [
                        agent_output.action for agent_output in agents_output
                    ]
                #print_iteration(time_step, current_player, action_list)
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #14
def main(_):
    game = FLAGS.game  # Set the game
    num_players = 1
    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)  # Load from files
    env_configs = train_games[0]
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  #env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()[
        "num_actions"]  # number of possible actions

    print("Info State Size: ", info_state_size)
    print("Num Actions: ", num_actions)

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.125)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        #saver = tf.train.import_meta_graph('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/dqn_test-399999.meta')
        #saver.restore(sess, tf.train.latest_checkpoint('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/'))

        for ep in range(FLAGS.num_train_episodes):
            print(env_configs)
            #env_configs = train_games[ep % len(train_games)]
            #env = rl_environment.Environment(game, **env_configs)
            episode_reward = train_rewards[ep % len(train_games)]
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  0)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
                #saver.save(sess, FLAGS.checkpoint_dir, ep)
                print("Actual MST Value: ", episode_reward)
            if (ep + 1) % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(test_games, test_rewards,
                                                 agents[0], ep,
                                                 FLAGS.num_nodes, game,
                                                 FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

            #env = rl_environment.Environment(game, **games[ep])
            time_step = env.reset()
            # print("TRAIN"+"*"*80)
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)
                #print("(Action, Reward): ", action_list[0], time_step.rewards[0])

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #15
def main(_):
    game = "kuhn_poker"
    num_players = 2

    env = rl_environment.Environment(game)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # 1. Train the agents
    if FLAGS.should_train:
        training_episodes = FLAGS.num_episodes
        for cur_episode in range(training_episodes):
            if cur_episode % int(1e4) == 0:
                win_rates = eval_against_random_bots(env, agents,
                                                     random_agents, 1000)
                logging.info("Starting episode %s, win_rates %s", cur_episode,
                             win_rates)
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                time_step = env.step([agent_output.action])

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        if not FLAGS.iteractive_play:
            return

    # 2. Play from the command line against the trained agent.
    if FLAGS.should_play:
        # Pretty print state

        player_1 = 0
        while True:
            time_step = env.reset()
            pretty_print_state(env)
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == player_1:
                    agent_out = agents[player_1].step(time_step,
                                                      is_evaluation=True)

                    logging.info("Pick action for player %s", player_id)
                    action = command_line_action(env, time_step)
                else:
                    agent_out = agents[1 - player_1].step(time_step,
                                                          is_evaluation=True)

                    logging.info("Pick action for player %s", player_id)
                    #action = command_line_action(env, time_step)

                    action = agent_out.action
                    logging.info("Agent action: %s", action)
                time_step = env.step([action])

            logging.info("Rewards: Player_0 %s | Player_1 %s",
                         time_step.rewards[player_1],
                         time_step.rewards[1 - player_1])
            logging.info("End of game!")
Example #16
    with tf.Session() as sess:
        dqn_agents = [
            dqn.DQN(sess,
                    player_id=idx,
                    state_representation_size=state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=replay_buffer_capacity)
            for idx in range(num_players)
        ]
        # If you want to restore the agents to continue training them, uncomment the two lines below.
        #for i in range(num_players) :
        #    dqn_agents[i].restore("agents/better_puissance4/", str(i), "99")
        rand_agents = [
            random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
            for idx in range(num_players)
        ]
        sess.run(tf.compat.v1.global_variables_initializer())

        # training
        for ep in range(train_episodes):
            if (ep + 1) % save_every_n_step == 0:
                win_rates = eval_against_random_bots(env, dqn_agents,
                                                     rand_agents, 100)
                win_rates_history.append(win_rates)
                print("Episode {} of {}, win {}".format(
                    ep + 1, train_episodes, win_rates))
                for i, agent in enumerate(dqn_agents):
                    agent.save(parameters.agent_path, str(i),
                               str(int(ep / save_every_n_step)))
def main(_):
    game = "tic_tac_toe"
    num_players = 2

    env = rl_environment.Environment(game)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # 1. Train the agents
    training_episodes = FLAGS.num_episodes
    for cur_episode in range(training_episodes):
        if cur_episode % int(1e4) == 0:
            win_rates = eval_against_random_bots(env, agents, random_agents,
                                                 1000)
            logging.info("Starting episode %s, win_rates %s", cur_episode,
                         win_rates)
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            time_step = env.step([agent_output.action])

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)

    if not FLAGS.iteractive_play:
        return

    # 2. Play from the command line against the trained agent.
    human_player = 1
    while True:
        logging.info("You are playing as %s", "O" if human_player else "X")
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == human_player:
                agent_out = agents[human_player].step(time_step,
                                                      is_evaluation=True)
                logging.info("\n%s", agent_out.probs.reshape((3, 3)))
                logging.info("\n%s", pretty_board(time_step))
                action = command_line_action(time_step)
            else:
                agent_out = agents[1 - human_player].step(time_step,
                                                          is_evaluation=True)
                action = agent_out.action
            time_step = env.step([action])

        logging.info("\n%s", pretty_board(time_step))

        logging.info("End of game!")
        if time_step.rewards[human_player] > 0:
            logging.info("You win")
        elif time_step.rewards[human_player] < 0:
            logging.info("You lose")
        else:
            logging.info("Draw")
        # Switch order of players
        human_player = 1 - human_player