def main(_):
    """Train DQN agent(s) on the first MST game and periodically evaluate.

    The environment is built once from the first game configuration returned
    by ``mst.game_params(FLAGS.num_nodes)`` and reused for every episode.
    Every ``FLAGS.eval_every`` episodes the agents are evaluated with
    ``eval_against_random_bots``, the mean reward is logged, a checkpoint is
    written to ``FLAGS.checkpoint_dir`` and the reference reward of the first
    game is printed.

    Args:
      _: Unused positional argument supplied by the application runner.
    """
    game = FLAGS.game
    num_players = 1

    games, rewards, _, _ = mst.game_params(FLAGS.num_nodes)
    env = rl_environment.Environment(game, **games[0])

    # The state size is computed by hand (3 values per node pair) instead of
    # being read from env.observation_spec()["info_state"][0].
    num_nodes = FLAGS.num_nodes
    info_state_size = num_nodes * num_nodes * 3
    num_actions = env.action_spec()["num_actions"]

    print("Info State Size: ", info_state_size)
    print("Num Actions: ", num_actions)

    # Random agents used as the evaluation baseline.
    random_agents = []
    for player in range(num_players):
        random_agents.append(
            random_agent.RandomAgent(player_id=player, num_actions=num_actions))

    with tf.Session() as sess:
        layer_sizes = [int(size) for size in FLAGS.hidden_layers_sizes]

        agents = []
        for player in range(num_players):
            agents.append(
                dqn.DQN(
                    session=sess,
                    player_id=player,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=layer_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        for ep in range(FLAGS.num_train_episodes):
            episode_number = ep + 1
            if episode_number % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents, 1)
                logging.info("[%s] Mean episode rewards %s", episode_number,
                             r_mean)
                saver.save(sess, FLAGS.checkpoint_dir, ep)
                print("Actual MST Value: ", rewards[0])

            # Play one training episode on the shared environment.
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if env.is_turn_based:
                    # Only the player to move acts.
                    step_output = agents[player_id].step(time_step)
                    action_list = [step_output.action]
                else:
                    # Every agent acts; step them all before reading actions.
                    step_outputs = [agent.step(time_step) for agent in agents]
                    action_list = [output.action for output in step_outputs]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

    print("Actual MST: ", rewards)
def main(_):
    """Train a policy-gradient agent on the MST training games.

    A fresh environment is created for every episode, cycling through the
    training configurations returned by ``mst.game_params``. Agent losses are
    logged every ``FLAGS.eval_every`` episodes and ``test_trained_bot`` is run
    on the held-out games every ``FLAGS.test_every`` episodes.

    Args:
      _: Unused positional argument supplied by the application runner.
    """
    game = "mst"
    num_players = 1

    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)

    # The first training game is only used here to size the network and to
    # construct the policy wrapper; the loop below rebuilds `env` per episode.
    env_configs = train_games[0]
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3
    num_actions = env.action_spec()["num_actions"]

    with tf.Session() as sess:
        agents = []
        for player in range(num_players):
            agents.append(
                policy_gradient.PolicyGradient(
                    sess,
                    player,
                    info_state_size,
                    num_actions,
                    loss_str=FLAGS.loss_str,
                    hidden_layers_sizes=(128,)))

        # Not read again in this function: the exploitability computation
        # that would consume it is disabled.
        expl_policies_avg = PolicyGradientPolicies(env, agents)

        sess.run(tf.global_variables_initializer())

        num_train_games = len(train_games)
        for ep in range(FLAGS.num_episodes):
            episode_number = ep + 1

            # Cycle through the training games, one new environment each.
            env_configs = train_games[ep % num_train_games]
            env = rl_environment.Environment(game, **env_configs)

            if episode_number % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                separator = "-" * 80
                msg = f"{separator}\n{episode_number}: {losses}\n"
                logging.info("%s", msg)

            if episode_number % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(
                    test_games, test_rewards, agents[0], ep, FLAGS.num_nodes,
                    game, FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", episode_number,
                             test_accuracy)

            # Run the episode: the current player's agent picks each action.
            time_step = env.reset()
            while not time_step.last():
                mover = time_step.observations["current_player"]
                step_output = agents[mover].step(time_step)
                time_step = env.step([step_output.action])

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
def main(_):
    """Train DQN agent(s) on one MST game with periodic evaluation and testing.

    The environment is built once from a single training configuration and
    reused for every episode. Every ``FLAGS.eval_every`` episodes the agents
    are evaluated with ``eval_against_random_bots`` and the reference reward
    of the game being trained on is printed; every ``FLAGS.test_every``
    episodes ``test_trained_bot`` is run on the held-out games.

    Args:
      _: Unused positional argument supplied by the application runner.
    """
    game = FLAGS.game
    num_players = 1

    # Training is pinned to a single game. The same index selects both the
    # environment configuration and the reference reward, so the printed
    # "Actual MST Value" always belongs to the game actually being played.
    train_game_index = 0

    # Game configurations and reference rewards come from mst.game_params.
    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)
    env_configs = train_games[train_game_index]
    episode_reward = train_rewards[train_game_index]
    env = rl_environment.Environment(game, **env_configs)

    # The state size is computed by hand (3 values per node pair) instead of
    # being read from env.observation_spec()["info_state"][0].
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3
    num_actions = env.action_spec()["num_actions"]

    print("Info State Size: ", info_state_size)
    print("Num Actions: ", num_actions)

    # Random agents used as the evaluation baseline.
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    # Cap this process at an eighth of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.125)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        hidden_layers_sizes = [int(size) for size in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        # Checkpoint saving is currently disabled; the saver is still created
        # so that re-enabling it only requires a saver.save(...) call.
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        # The configuration never changes inside the loop, so show it once
        # instead of once per episode.
        print(env_configs)

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                # NOTE(review): the last argument is 0 here (the other DQN
                # script passes 1). If it is an episode count, no evaluation
                # episodes run -- TODO confirm against
                # eval_against_random_bots.
                r_mean = eval_against_random_bots(env, agents, random_agents, 0)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
                print("Actual MST Value: ", episode_reward)

            if (ep + 1) % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(test_games, test_rewards,
                                                 agents[0], ep, FLAGS.num_nodes,
                                                 game, FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

            # Play one training episode on the shared environment.
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
def main(_):
    """Play one game with uniformly random actions, printing every step.

    The game's edge weights come from one of three sources, chosen by
    ``FLAGS.num_nodes``:

    * not in {5, 7, 10, 20}: random 2-D points are sampled and their rounded
      pairwise distance matrix is used;
    * exactly 7: a fixed 5-node weight matrix is used;
    * otherwise: the first training game from ``mst.game_params`` is used,
      together with its reference rewards.

    If ``FLAGS.load_state`` is set, the starting state is deserialized from
    that file; otherwise a new initial state is created.

    Args:
      _: Unused positional argument supplied by the application runner.
    """
    # NOTE(review): 8 is a placeholder that is only replaced when rewards are
    # loaded from files, so "Actual MST Reward" is not a real reference value
    # for the generated or hard-coded games -- confirm whether that is
    # intended.
    train_rewards = [8]
    print("Creating game: " + FLAGS.game)

    # `num_nodes` is the node count the game is actually built with. It can
    # differ from FLAGS.num_nodes (see the 7-node branch) and is what action
    # indices must be decoded against.
    if FLAGS.num_nodes not in [5, 7, 10, 20]:
        num_nodes = FLAGS.num_nodes
        points = np.random.random((num_nodes, 2))
        dist_mat = np.round(distance_matrix(points, points), 2).flatten()
        weights = ",".join(str(weight) for weight in dist_mat)
    elif FLAGS.num_nodes == 7:
        # Hard-coded 5x5 weight matrix (25 entries), hence 5 nodes.
        num_nodes = 5
        weights = (
            "inf,0.169,inf,inf,inf,inf,inf,inf,0.693,inf,inf,inf,inf,inf,0.121,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf"
        )
    else:
        num_nodes = FLAGS.num_nodes
        # Load game configurations and reference rewards from files.
        train_games, train_rewards, _, _ = mst.game_params(num_nodes)
        print(train_games[0])
        weights = train_games[0]['weights']

    game = pyspiel.load_game(
        FLAGS.game, {
            "num_nodes": pyspiel.GameParameter(num_nodes),
            "weights": pyspiel.GameParameter(weights),
        })

    # Get a new state
    if FLAGS.load_state is not None:
        # Load a specific state
        with open(FLAGS.load_state, encoding="utf-8") as input_file:
            state_string = input_file.read().rstrip()
        print("Loading state:")
        print(state_string)
        print("")
        state = game.deserialize_state(state_string)
    else:
        state = game.new_initial_state()

    # Print the initial state
    print(str(state))

    while not state.is_terminal():
        player = state.current_player()
        legal_actions = state.legal_actions(player)
        # Show each action index as a (row, column) pair of the weight matrix
        # using the node count of the loaded game.
        print("Legal Actions: ",
              [(i // num_nodes, i % num_nodes) for i in legal_actions])

        # Decision node: sample action for the single current player
        action = random.choice(legal_actions)
        action_string = state.action_to_string(player, action)
        print("Player ", player, ", randomly sampled action: ", action_string)
        state.apply_action(action)

        print(str(state))
        print("Information State: ", state.information_state_string())

    print("Actual MST Reward: ", train_rewards[0])

    # Game is now done. Print utilities for each player
    returns = state.returns()
    for pid in range(game.num_players()):
        print("Utility for player {} is {}".format(pid, returns[pid]))