Example #1
def main(_):
  game = FLAGS.game
  num_players = 1
  games, rewards, _, _ = mst.game_params(FLAGS.num_nodes)

  env_configs = games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3 #env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)  
    
  # random agents for evaluation
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 1)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", rewards[0])

      #env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = agents[player_id].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [agent_output.action for agent_output in agents_output]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    print("Actual MST: ", rewards)
Example #2
def main(_):
    game = "mst"
    num_players = 1
    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)

    env_configs = train_games[0]
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3
    num_actions = env.action_spec()["num_actions"]

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            policy_gradient.PolicyGradient(sess,
                                           idx,
                                           info_state_size,
                                           num_actions,
                                           loss_str=FLAGS.loss_str,
                                           hidden_layers_sizes=(128, ))
            for idx in range(num_players)
        ]
        expl_policies_avg = PolicyGradientPolicies(env, agents)

        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_episodes):
            env_configs = train_games[ep % len(train_games)]
            env = rl_environment.Environment(game, **env_configs)
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                #expl = exploitability.exploitability(env.game, expl_policies_avg)
                msg = "-" * 80 + "\n"
                msg += "{}: {}\n".format(ep + 1, losses)  #expl, losses)
                logging.info("%s", msg)
            if (ep + 1) % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(test_games, test_rewards,
                                                 agents[0], ep,
                                                 FLAGS.num_nodes, game,
                                                 FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
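PolicyGradientPolicies is defined elsewhere in this script and is only needed for the (commented-out) exploitability computation. In the OpenSpiel policy-gradient examples it is a thin policy.Policy wrapper that asks the trained agents for action probabilities; a sketch along those lines, whose details may differ from the project's version:

from open_spiel.python import policy
from open_spiel.python import rl_environment


class PolicyGradientPolicies(policy.Policy):
    """Joint policy wrapper over the trained RL agents (sketch)."""

    def __init__(self, env, agents):
        player_ids = list(range(env.num_players))
        super().__init__(env.game, player_ids)
        self._agents = agents
        self._obs = {
            "info_state": [None] * env.num_players,
            "legal_actions": [None] * env.num_players,
        }

    def action_probabilities(self, state, player_id=None):
        cur_player = state.current_player()
        legal_actions = state.legal_actions(cur_player)
        self._obs["current_player"] = cur_player
        self._obs["info_state"][cur_player] = state.information_state_tensor(
            cur_player)
        self._obs["legal_actions"][cur_player] = legal_actions
        # Package the observation as a TimeStep so the agent's step() can consume it.
        time_step = rl_environment.TimeStep(
            observations=self._obs, rewards=None, discounts=None, step_type=None)
        probs = self._agents[cur_player].step(
            time_step, is_evaluation=True).probs
        return {action: probs[action] for action in legal_actions}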
Example #3
def main(_):
    game = FLAGS.game  # Set the game
    num_players = 1
    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)  # Load from files
    env_configs = train_games[0]
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  #env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()[
        "num_actions"]  # number of possible actions

    print("Info State Size: ", info_state_size)
    print("Num Actions: ", num_actions)

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.125)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        #saver = tf.train.import_meta_graph('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/dqn_test-399999.meta')
        #saver.restore(sess, tf.train.latest_checkpoint('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/'))

        for ep in range(FLAGS.num_train_episodes):
            print(env_configs)
            #env_configs = train_games[ep % len(train_games)]
            #env = rl_environment.Environment(game, **env_configs)
            episode_reward = train_rewards[ep % len(train_games)]
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  0)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
                #saver.save(sess, FLAGS.checkpoint_dir, ep)
                print("Actual MST Value: ", episode_reward)
            if (ep + 1) % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(test_games, test_rewards,
                                                 agents[0], ep,
                                                 FLAGS.num_nodes, game,
                                                 FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

            #env = rl_environment.Environment(game, **games[ep])
            time_step = env.reset()
            # print("TRAIN"+"*"*80)
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)
                #print("(Action, Reward): ", action_list[0], time_step.rewards[0])

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
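test_trained_bot is project-specific and not shown either. A plausible sketch, assuming it replays each held-out game with the trained agent in evaluation mode and counts how often the episode return matches the known MST value (the real helper, and its use of ep, num_nodes, and game_version, may differ):

from open_spiel.python import rl_environment


def test_trained_bot(test_games, test_rewards, agent, ep, num_nodes, game,
                     game_version):
    """Fraction of test instances on which the agent reaches the optimal MST value (sketch)."""
    num_correct = 0
    for env_configs, mst_value in zip(test_games, test_rewards):
        env = rl_environment.Environment(game, **env_configs)
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.last():
            agent_output = agent.step(time_step, is_evaluation=True)
            time_step = env.step([agent_output.action])
            episode_return += time_step.rewards[0]
        if abs(episode_return - mst_value) < 1e-6:
            num_correct += 1
    return num_correct / len(test_games)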
Example #4
def main(_):
    action_string = None
    train_rewards = [8]
    print("Creating game: " + FLAGS.game)
    if FLAGS.num_nodes not in [5, 7, 10, 20]:
        distances = np.random.random((FLAGS.num_nodes, 2))
        dist_mat = np.round(distance_matrix(distances, distances), 2).flatten()
        generated_weights = str(dist_mat[0])
        for i in range(1, dist_mat.size):
            generated_weights += "," + str(dist_mat[i])

        game = pyspiel.load_game(
            FLAGS.game, {
                "num_nodes": pyspiel.GameParameter(FLAGS.num_nodes),
                "weights": pyspiel.GameParameter(generated_weights)
            })

    elif FLAGS.num_nodes == 7:
        # Note: this branch loads a hard-coded 5-node instance, not a 7-node one.
        game = pyspiel.load_game(
            FLAGS.game, {
                "num_nodes":
                pyspiel.GameParameter(5),
                "weights":
                pyspiel.GameParameter(
                    "inf,0.169,inf,inf,inf,inf,inf,inf,0.693,inf,inf,inf,inf,inf,0.121,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf"
                )
            })
    else:
        #game = pyspiel.load_game(FLAGS.game, {"num_nodes": pyspiel.GameParameter(5),
        #"weights": pyspiel.GameParameter("0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0")})
        train_games, train_rewards, _, _ = mst.game_params(
            FLAGS.num_nodes)  # Load from files
        print(train_games[0])
        game = pyspiel.load_game(
            FLAGS.game, {
                "num_nodes": pyspiel.GameParameter(FLAGS.num_nodes),
                "weights": pyspiel.GameParameter(train_games[0]['weights'])
            })

    # Get a new state
    if FLAGS.load_state is not None:
        # Load a specific state
        state_string = ""
        with open(FLAGS.load_state, encoding="utf-8") as input_file:
            for line in input_file:
                state_string += line
        state_string = state_string.rstrip()
        print("Loading state:")
        print(state_string)
        print("")
        state = game.deserialize_state(state_string)
    else:
        state = game.new_initial_state()

    # Print the initial state
    print(str(state))

    while not state.is_terminal():
        # In general a state can be a chance node, a simultaneous node, or a
        # decision node; this game only produces decision nodes.

        legal_actions = state.legal_actions(state.current_player())
        print("Legal Actions: ", [(i // FLAGS.num_nodes, i % FLAGS.num_nodes)
                                  for i in legal_actions])
        # Decision node: sample action for the single current player
        action = random.choice(legal_actions)
        action_string = state.action_to_string(state.current_player(), action)
        print("Player ", state.current_player(), ", randomly sampled action: ",
              action_string)
        state.apply_action(action)

        print(str(state))

        print("Information State: ", state.information_state_string())
        #print("Edge Values: ", dist_mat)
        print("Actual MST Reward: ", train_rewards[0])

    # Game is now done. Print utilities for each player
    returns = state.returns()
    for pid in range(game.num_players()):
        print("Utility for player {} is {}".format(pid, returns[pid]))