def main(_): game = "breakthrough" num_players = 2 env_configs = {"columns": 5, "rows": 5} env = rl_environment.Environment(game, **env_configs) info_state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] with tf.Session() as sess: hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] # pylint: disable=g-complex-comprehension agents = [ dqn.DQN(session=sess, player_id=idx, state_representation_size=info_state_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=FLAGS.replay_buffer_capacity, batch_size=FLAGS.batch_size) for idx in range(num_players) ] saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) print(type(agents[0].get_weights()), agents[0].get_weights())
def main(_):
  game = FLAGS.game
  num_players = 1
  games, rewards, _, _ = mst.game_params(FLAGS.num_nodes)
  env_configs = games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  # env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)

  # Random agents for evaluation.
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 1)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", rewards[0])

      # env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = agents[player_id].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [agent_output.action for agent_output in agents_output]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    print("Actual MST: ", rewards)
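# NOTE: `eval_against_random_bots` is called throughout these snippets but is
# not defined in this section. The sketch below is a minimal plausible
# implementation, modeled on the OpenSpiel example scripts rather than the
# exact helper used here: it seats each trained agent in its position against
# random opponents and returns the mean episode reward per position.
import numpy as np


def eval_against_random_bots(env, trained_agents, random_agents, num_episodes):
  """Evaluates `trained_agents` against `random_agents` for `num_episodes`."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  for player_pos in range(num_players):
    cur_agents = random_agents[:]
    cur_agents[player_pos] = trained_agents[player_pos]
    for _ in range(num_episodes):
      time_step = env.reset()
      episode_rewards = 0
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = cur_agents[player_id].step(
              time_step, is_evaluation=True)
          action_list = [agent_output.action]
        else:
          agents_output = [
              agent.step(time_step, is_evaluation=True) for agent in cur_agents
          ]
          action_list = [agent_output.action for agent_output in agents_output]
        time_step = env.step(action_list)
        episode_rewards += time_step.rewards[player_pos]
      sum_episode_rewards[player_pos] += episode_rewards
  return sum_episode_rewards / num_episodes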
def main(_): game = "leduc_poker" num_players = 2 env = rl_environment.Environment(game) state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] kwargs = { "replay_buffer_capacity": FLAGS.replay_buffer_capacity, "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity, "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn, "anticipatory_param": FLAGS.anticipatory_param, "batch_size": FLAGS.batch_size, "learn_every": FLAGS.learn_every, "rl_learning_rate": FLAGS.rl_learning_rate, "sl_learning_rate": FLAGS.sl_learning_rate, "optimizer_str": FLAGS.optimizer_str, "loss_str": FLAGS.loss_str, "update_target_network_every": FLAGS.update_target_network_every, "discount_factor": FLAGS.discount_factor, "epsilon_decay_duration": FLAGS.epsilon_decay_duration, "epsilon_start": FLAGS.epsilon_start, "epsilon_end": FLAGS.epsilon_end, } with tf.Session() as sess: agents = [ nfsp.NFSP(sess, idx, state_size, num_actions, hidden_layers_sizes, **kwargs) for idx in range(num_players) ] # for agent in agents[2:]: # agent.restore("/home/benedikt/Dokumente/Uni/HCI/openspiel_saves/half_trained") for agent in agents: agent.restore(FLAGS.checkpoint_dir) # agents[1].restore("/home/benedikt/Dokumente/Uni/HCI/openspiel_saves/half_trained") # Evaluate against random agent random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] r_mean = evaluateBotAgainstBot(env, agents[0], agents[1], 10000) logging.info("Mean episode rewards: %s", r_mean) #analyzeHistory() #r_mean = eval_against_random_bots(env, agents, random_agents, 10000) #logging.info("Mean episode rewards: %s", r_mean) '''if not FLAGS.iteractive_play:
def main(_): game = "breakthrough" num_players = 2 env_configs = {"columns": 5, "rows": 5} env = rl_environment.Environment(game, **env_configs) info_state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] with tf.Session() as sess: hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] # pylint: disable=g-complex-comprehension agents = [ dqn.DQN(session=sess, player_id=idx, state_representation_size=info_state_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=FLAGS.replay_buffer_capacity, batch_size=FLAGS.batch_size) for idx in range(num_players) ] sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.eval_every == 0: r_mean = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean) if (ep + 1) % FLAGS.save_every == 0: for agent in agents: agent.save(FLAGS.checkpoint_dir) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if env.is_turn_based: agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] else: agents_output = [agent.step(time_step) for agent in agents] action_list = [ agent_output.action for agent_output in agents_output ] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)
def main_loop(unused_arg):
  """RL main loop example."""
  logging.info("Registered games: %s", rl_environment.registered_games())
  logging.info("Creating game %s", FLAGS.game)

  env_configs = mst.params(FLAGS.num_nodes)
  env = rl_environment.Environment(FLAGS.game, **env_configs)
  num_actions = env.action_spec()["num_actions"]

  agents = [
      random_agent.RandomAgent(player_id=i, num_actions=num_actions)
      for i in range(FLAGS.num_players)
  ]

  logging.info("Env specs: %s", env.observation_spec())
  logging.info("Action specs: %s", env.action_spec())

  for cur_episode in range(FLAGS.num_episodes):
    logging.info("Starting episode %s", cur_episode)
    time_step = env.reset()
    while not time_step.last():
      pid = time_step.observations["current_player"]
      if env.is_turn_based:
        agent_output = agents[pid].step(time_step)
        action_list = [agent_output.action]
      else:
        agents_output = [agent.step(time_step) for agent in agents]
        action_list = [agent_output.action for agent_output in agents_output]
      print_iteration(time_step, pid, action_list)
      time_step = env.step(action_list)

    # Episode is over, step all agents with final state.
    for agent in agents:
      agent.step(time_step)

    # Print final state of end game.
    for pid in range(env.num_players):
      print_iteration(time_step, pid)
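# NOTE: `print_iteration` is used above (both with and without an action
# list) but is not defined in this section. A minimal sketch with an assumed
# signature matching both call sites:
from absl import logging


def print_iteration(time_step, player_id, action_list=None):
  """Logs the player to act, the chosen action(s), and the observation."""
  logging.info("Player: %s", player_id)
  if action_list is not None:
    logging.info("Action taken: %s", action_list)
  logging.info("Info state: %s",
               time_step.observations["info_state"][player_id])
  logging.info("Rewards: %s", time_step.rewards)
  logging.info("-" * 80)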
def test_step(self):
  agent = random_agent.RandomAgent(player_id=0, num_actions=10)

  legal_actions = [0, 2, 3, 5]
  time_step = rl_environment.TimeStep(
      observations={
          "info_state": [[0], [1]],
          "legal_actions": [legal_actions, []],
          "current_player": 0
      },
      rewards=None,
      discounts=None,
      step_type=None)
  agent_output = agent.step(time_step)

  self.assertIn(agent_output.action, legal_actions)
  self.assertAlmostEqual(sum(agent_output.probs), 1.0)
  self.assertEqual(
      len([x for x in agent_output.probs if x > 0]), len(legal_actions))
  self.assertTrue(
      np.allclose(agent_output.probs[legal_actions], [.25] * 4, atol=1e-5))
def main(_): game = "tic_tac_toe" num_players = 2 env = rl_environment.Environment(game) state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] hidden_layers_sizes = [32, 32] replay_buffer_capacity = int(1e4) train_episodes = FLAGS.num_episodes loss_report_interval = 1000 with tf.Session() as sess: dqn_agent = dqn.DQN(sess, player_id=0, state_representation_size=state_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=replay_buffer_capacity) tabular_q_agent = tabular_qlearner.QLearner(player_id=1, num_actions=num_actions) agents = [dqn_agent, tabular_q_agent] sess.run(tf.global_variables_initializer()) # Train agent for ep in range(train_episodes): if ep and ep % loss_report_interval == 0: logging.info("[%s/%s] DQN loss: %s", ep, train_episodes, agents[0].loss) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) # Evaluate against random agent random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] r_mean = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Mean episode rewards: %s", r_mean) if not FLAGS.iteractive_play: return # Play from the command line against the trained DQN agent. human_player = 1 while True: logging.info("You are playing as %s", "O" if human_player else "X") time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == human_player: agent_out = agents[human_player].step(time_step, is_evaluation=True) logging.info("\n%s", agent_out.probs.reshape((3, 3))) logging.info("\n%s", pretty_board(time_step)) action = command_line_action(time_step) else: agent_out = agents[1 - human_player].step( time_step, is_evaluation=True) action = agent_out.action time_step = env.step([action]) logging.info("\n%s", pretty_board(time_step)) logging.info("End of game!") if time_step.rewards[human_player] > 0: logging.info("You win") elif time_step.rewards[human_player] < 0: logging.info("You lose") else: logging.info("Draw") # Switch order of players human_player = 1 - human_player
def main(argv):
  del argv

  # Calculate state values:
  num_cards = 5
  values = solve_goofspiel(num_cards)

  # Set up environment:
  game = pyspiel.load_game(
      'goofspiel(imp_info=False,num_cards={})'.format(num_cards))
  env = rl_environment.Environment(game)
  num_actions = env.action_spec()["num_actions"]

  # Define agents:
  value_it_agent = ValueItAgent(0, num_actions, values)
  rand_agent = random_agent.RandomAgent(player_id=1, num_actions=num_actions)

  # Play the value-iteration agent against the random agent:
  print("=============================")
  num_episodes = 1000
  wins = 0
  draws = 0
  logging.info(
      "Playing goofspiel with {} cards over {} episodes. "
      "value_it_agent (p0) vs. random_agent (p1)".format(
          num_cards, num_episodes))
  for i in range(num_episodes):
    logging.info("episode {}".format(i))
    time_step = env.reset()
    while not time_step.last():
      # Log current state:
      curr_state = env.get_state
      # print("Next turn. Current state is: ")
      # print(str(curr_state))
      logging.info(str(curr_state))

      # Value-iteration player:
      agent_out = value_it_agent.step(time_step, curr_state)
      logging.info("\n%s", agent_out.probs)
      p0_action = agent_out.action
      logging.info('Agent 0 played: {}'.format(p0_action + 1))

      # Random player:
      agent_out = rand_agent.step(time_step)
      p1_action = agent_out.action
      logging.info('Agent 1 played: {}'.format(p1_action + 1))

      # print(time_step.observations['info_state'][0])
      # print(time_step.observations['info_state'][1])
      # print(len(time_step.observations['info_state'][0]))
      # print(env.observation_spec())
      # state = time_step.observations['info_state'][0]
      # state = np.asarray(state)
      # P_ob = np.where(state[points_ob_b:points_op_b] == 1)[0][0]
      # P_op = np.where(state[points_op_b:seq_b] == 1)[0][0]
      # logging.info('Points: P%d = %d P%d = %d', player_id, P_ob,
      #              0 if player_id == 1 else 1, P_op)
      # which = num_cards - np.sum(state[np.size(state) - num_cards:])
      # curr = np.where(state[int(seq_b + which * num_cards):
      #                       int(seq_b + (which + 1) * num_cards)] == 1)[0][0] + 1
      # logging.info('Point Card (Middle Card): %d', curr)

      time_step = env.step([p0_action, p1_action])
      # logging.info("\n%s", pretty_board(time_step))

    logging.info("End of game!")
    if time_step.rewards[1] > 0:
      logging.info("Agent 1 (random) wins")
    elif time_step.rewards[1] < 0:
      logging.info("Agent 1 (random) loses")
    else:
      logging.info("Draw")
    if time_step.rewards[0] > 0:
      wins += 1
    if time_step.rewards[0] == 0:
      draws += 1

  p0_win = wins / num_episodes
  logging.info("Summary: ==============")
  logging.info("Wins: {}, Draws: {}, Estimated pwin: {}".format(
      wins, draws, p0_win))
def main(_):
  ## LOAD GAMES
  # print(pyspiel.registered_games())
  # games = [pyspiel.load_game("matrix_sh"), pyspiel.load_game("matrix_rps"),
  #          pyspiel.load_game("matrix_mp"), pyspiel.load_game("matrix_pd"),
  #          _battle_of_the_sexes_easy()]
  games = [
      _battle_of_the_sexes_easy(),
      pyspiel.load_game("matrix_mp"),
      pyspiel.load_game("matrix_sh"),
      pyspiel.load_game("matrix_pd"),
      _biased_rock_paper_scissors_easy()
  ]

  # Best done with 4-5 games. For other dynamics, change them in
  # utils_matrix.py::_phaseplot.
  _phaseplot(games, bstreamplot=True)
  # Best done with 4-5 games and 5 or 6 k-values. For other dynamics, change
  # them in utils_matrix.py::_dynamics_kplot.
  _dynamics_kplot([1, 2, 3, 5, 10, 25], games)

  for game in games:
    ## GAME INFO
    # print(game.get_type().long_name.upper())
    # state = game.new_initial_state()
    # print(state)
    # print("-" * 80)

    population_histories = []
    player1_probs = []
    player2_probs = []
    for _ in range(FLAGS.pop_iter):
      env = rl_environment.Environment(game=game)
      num_actions = env.action_spec()["num_actions"]

      agents = []
      for idx in range(env.num_players):
        if FLAGS.learner == "eps":
          agents.append(
              epsilongreedy_QLearner.EpsilonGreedy_QLearner(
                  player_id=idx,
                  num_actions=num_actions,
                  step_size=FLAGS.lr,
                  discount_factor=1,
                  epsilon=FLAGS.expl,
                  epsilon_annealing=FLAGS.expl_ann,
                  epsilon_min=FLAGS.expl_min))
        elif FLAGS.learner == "boltz":
          agents.append(
              boltzmann_QLearner.Boltzman_QLearner(
                  player_id=idx,
                  num_actions=num_actions,
                  step_size=FLAGS.lr,
                  discount_factor=1,
                  temperature=FLAGS.expl,
                  temperature_annealing=FLAGS.expl_ann,
                  temperature_min=FLAGS.expl_min))
        elif FLAGS.learner == "faq":
          agents.append(
              boltzmann_FAQLeaner.Boltzmann_FAQLearner(
                  player_id=idx,
                  num_actions=num_actions,
                  step_size=FLAGS.lr,
                  discount_factor=1,
                  temperature=FLAGS.expl,
                  temperature_annealing=FLAGS.expl_ann,
                  temperature_min=FLAGS.expl_min,
                  beta=FLAGS.beta))
        else:
          agents.append(
              boltzmann_LFAQLearner.Boltzmann_LFAQLearner(
                  player_id=idx,
                  num_actions=num_actions,
                  step_size=FLAGS.lr,
                  discount_factor=1,
                  temperature=FLAGS.expl,
                  temperature_annealing=FLAGS.expl_ann,
                  temperature_min=FLAGS.expl_min,
                  beta=FLAGS.beta,
                  k=FLAGS.k))

      random_agents = [
          random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
          for idx in range(env.num_players)
      ]

      ## PLAY BEFORE TRAIN
      # print("BEFORE TRAINING: 1 episode of self-play")
      # play_episode(env, agents)

      ## TRAIN
      # The iteration count needs to be high for LFAQ.
      history = train_qlearning(agents, env, FLAGS.train_iter, random_agents)
      population_histories.append(history)

      agents_output, _ = _env_play_episode(env, agents, evaluating=True)
      player1_probs.append(agents_output[0].probs)
      player2_probs.append(agents_output[1].probs)

      ## PLAY AFTER TRAIN
      # print("AFTER TRAINING: 1 episode of self-play")
      # play_episode(env, agents)
      # print("-" * 80)

    _trajectoryplot(game, population_histories, FLAGS.k)

    for i in range(len(player1_probs)):
      print("\t\tPlayer 1\t Player 2")
      print(
          f"{env.get_state.action_to_string(0, 0)}:\t{player1_probs[i][0]:.2f}\t\t{player2_probs[i][0]:.2f}"
      )
      print(
          f"{env.get_state.action_to_string(0, 1)}:\t{player1_probs[i][1]:.2f}\t\t{player2_probs[i][1]:.2f}"
      )
      print()
def main(_): game = "skat" num_players = 3 env_configs = {} env = rl_environment.Environment(game, **env_configs) observation_tensor_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] with tf.Session() as sess: summaries_dir = os.path.join(FLAGS.checkpoint_dir, "random_eval") summary_writer = tf.summary.FileWriter(summaries_dir, tf.get_default_graph()) hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] # pylint: disable=g-complex-comprehension agents = [ dqn.DQN(session=sess, player_id=idx, state_representation_size=observation_tensor_size, num_actions=num_actions, hidden_layers_sizes=hidden_layers_sizes, replay_buffer_capacity=FLAGS.replay_buffer_capacity, batch_size=FLAGS.batch_size) for idx in range(num_players) ] saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.eval_every == 0: r_mean = eval_against_random_bots(env, agents, random_agents, FLAGS.num_eval_games) logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean) for i in range(num_players): summary = tf.Summary() summary.value.add(tag="mean_reward/random_{}".format(i), simple_value=r_mean[i]) summary_writer.add_summary(summary, ep) summary_writer.flush() saver.save(sess, FLAGS.checkpoint_dir, ep) time_step = env.reset() # Randomize position. if FLAGS.randomize_positions: positions = random.sample(range(len(agents)), len(agents)) while not time_step.last(): player_id = time_step.observations["current_player"] if FLAGS.randomize_positions: position = positions[player_id] agents[position].player_id = player_id else: position = player_id agent_output = agents[position].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)
def main(_):
  np.random.seed(FLAGS.seed)
  tf.random.set_random_seed(FLAGS.seed)

  num_players = FLAGS.num_players

  env = rl_environment.Environment(FLAGS.game, include_full_state=True)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  # Exploitee agents.
  if FLAGS.exploitee == "first":
    exploitee_agents = [
        FirstActionAgent(idx, num_actions) for idx in range(num_players)
    ]
  elif FLAGS.exploitee == "random":
    exploitee_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        # FirstActionAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]
  else:
    raise RuntimeError("Unknown exploitee")

  rolling_averager = RollingAverage(FLAGS.window_size)
  rolling_averager_p0 = RollingAverage(FLAGS.window_size)
  rolling_averager_p1 = RollingAverage(FLAGS.window_size)
  rolling_value = 0
  total_value = 0
  total_value_n = 0

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    learning_agents = create_training_agents(num_players, sess, num_actions,
                                             info_state_size,
                                             hidden_layers_sizes)
    sess.run(tf.global_variables_initializer())

    print("Starting...")
    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_fixed_bots(env, learning_agents,
                                         exploitee_agents,
                                         FLAGS.eval_episodes)
        value = r_mean[0] + r_mean[1]
        rolling_averager.add(value)
        rolling_averager_p0.add(r_mean[0])
        rolling_averager_p1.add(r_mean[1])
        rolling_value = rolling_averager.mean()
        rolling_value_p0 = rolling_averager_p0.mean()
        rolling_value_p1 = rolling_averager_p1.mean()
        total_value += value
        total_value_n += 1
        avg_value = total_value / total_value_n
        print(("[{}] Mean episode rewards {}, value: {}, " +
               "rval: {} (p0/p1: {} / {}), aval: {}").format(
                   ep + 1, r_mean, value, rolling_value, rolling_value_p0,
                   rolling_value_p1, avg_value))

      agents_round1 = [learning_agents[0], exploitee_agents[1]]
      agents_round2 = [exploitee_agents[0], learning_agents[1]]

      for agents in [agents_round1, agents_round2]:
        time_step = env.reset()
        while not time_step.last():
          player_id = time_step.observations["current_player"]
          if env.is_turn_based:
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
          else:
            agents_output = [agent.step(time_step) for agent in agents]
            action_list = [
                agent_output.action for agent_output in agents_output
            ]
          time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for agent in agents:
          agent.step(time_step)
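# NOTE: `RollingAverage` is used above but not defined in this section. A
# minimal sketch with the assumed interface (`add(value)` and `mean()` over a
# fixed-size window):
import collections


class RollingAverage(object):
  """Mean over the last `size` values added."""

  def __init__(self, size):
    self._values = collections.deque(maxlen=size)

  def add(self, value):
    self._values.append(value)

  def mean(self):
    if not self._values:
      return 0.0
    return sum(self._values) / len(self._values)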
'''
Quick reference for the pyspiel API used here:
game.new_initial_state()
game.num_players()
state.is_terminal()
state.is_simultaneous_node()
state.is_chance_node()
state.legal_actions(int playerId)
state.apply_actions([a1, a2, ...])  # one action per playerId
state.returns()  # gives the result of the game
'''
env = rl_environment.Environment("matrix_pd")
num_actions = env.action_spec()["num_actions"]

q_agent = tabular_qlearner.QLearner(0, num_actions)
ra2 = random_agent.RandomAgent(1, num_actions)
players = [q_agent, ra2]

for cur_episode in range(5000):
  if cur_episode % 1000 == 0:
    print("cur_episode: " + str(cur_episode))
  time_step = env.reset()
  while not time_step.last():
    player_id = time_step.observations["current_player"]
    agent_output = players[player_id].step(time_step)
    time_step = env.step([agent_output.action])
  for agent in players:
    agent.step(time_step)
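# After training, the learned policy can be inspected by stepping the agents
# in evaluation mode (greedy action selection, no Q-updates). A minimal
# sketch, assuming the `env` and `players` defined above:
time_step = env.reset()
for pid, agent in enumerate(players):
  out = agent.step(time_step, is_evaluation=True)
  print("Player {}: action={}, probs={}".format(pid, out.action, out.probs))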
def main_loop(unused_arg):
  """RL main loop example."""
  logging.info("Registered games: %s", rl_environment.registered_games())
  logging.info("Creating game %s", FLAGS.game)

  # env_configs = {"players": FLAGS.num_players} if FLAGS.num_players else {}
  env_configs = {}
  env = rl_environment.Environment(FLAGS.game, **env_configs)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [512, 512]
  replay_buffer_capacity = int(1e4)
  train_episodes = FLAGS.num_episodes
  loss_report_interval = 1000

  logging.info("Env specs: %s", env.observation_spec())
  logging.info("Action specs: %s", env.action_spec())

  with tf.Session() as sess:
    # agents = [
    #     dqn.DQN(  # pylint: disable=g-complex-comprehension
    #         sess,
    #         player_id,
    #         state_representation_size=state_size,
    #         num_actions=num_actions,
    #         # hidden_layers_sizes=[16],
    #         # replay_buffer_capacity=10,
    #         hidden_layers_sizes=hidden_layers_sizes,
    #         replay_buffer_capacity=replay_buffer_capacity,
    #         batch_size=128) for player_id in range(3)
    # ]
    dqn_agents = [
        dqn.DQN(  # pylint: disable=g-complex-comprehension
            sess,
            0,
            state_representation_size=state_size,
            num_actions=num_actions,
            # hidden_layers_sizes=[16],
            # replay_buffer_capacity=10,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=replay_buffer_capacity,
            batch_size=128)
    ]
    random_agents = [
        random_agent.RandomAgent(player_id=i, num_actions=num_actions)
        for i in range(1, 3)
    ]
    agents = dqn_agents + random_agents

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    latest_checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_checkpoint_path:
      print('Restoring checkpoint: {0}'.format(latest_checkpoint_path))
      saver.restore(sess, latest_checkpoint_path)

    # Train agent.
    for ep in range(train_episodes):
      # if ep and ep % loss_report_interval == 0:
      if (ep + 1) % FLAGS.eval_every == 0:
        logging.info("[%s/%s] DQN loss: %s", ep, train_episodes,
                     agents[0].loss)
        saver.save(sess, FLAGS.checkpoint_dir, ep)

      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        # agent_output = [agent.step(time_step) for agent in agents]
        # time_step = env.step([agent_output[current_player].action])
        if env.is_turn_based:
          agent_output = agents[current_player].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [
              agent_output.action for agent_output in agents_output
          ]
        # print_iteration(time_step, current_player, action_list)
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
def main(_):
  game = FLAGS.game  # Set the game.
  num_players = 1

  # Load train/test instances from files.
  train_games, train_rewards, test_games, test_rewards = mst.game_params(
      FLAGS.num_nodes)
  env_configs = train_games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  # env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]  # number of possible actions

  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)

  # Random agents for evaluation.
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.125)
  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    # saver = tf.train.import_meta_graph('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/dqn_test-399999.meta')
    # saver.restore(sess, tf.train.latest_checkpoint('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/'))

    for ep in range(FLAGS.num_train_episodes):
      print(env_configs)
      # env_configs = train_games[ep % len(train_games)]
      # env = rl_environment.Environment(game, **env_configs)
      episode_reward = train_rewards[ep % len(train_games)]

      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 0)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        # saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", episode_reward)

      if (ep + 1) % FLAGS.test_every == 0:
        test_accuracy = test_trained_bot(test_games, test_rewards, agents[0],
                                         ep, FLAGS.num_nodes, game,
                                         FLAGS.game_version)
        logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

      # env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      # print("TRAIN" + "*" * 80)
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
        # print("(Action, Reward): ", action_list[0], time_step.rewards[0])

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
def main(_): game = "kuhn_poker" num_players = 2 env = rl_environment.Environment(game) num_actions = env.action_spec()["num_actions"] agents = [ tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # 1. Train the agents if FLAGS.should_train: training_episodes = FLAGS.num_episodes for cur_episode in range(training_episodes): if cur_episode % int(1e4) == 0: win_rates = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Starting episode %s, win_rates %s", cur_episode, win_rates) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) time_step = env.step([agent_output.action]) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) if not FLAGS.iteractive_play: return # 2. Play from the command line against the trained agent. if FLAGS.should_play: # Pretty print state player_1 = 0 while True: time_step = env.reset() pretty_print_state(env) while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == player_1: agent_out = agents[player_1].step(time_step, is_evaluation=True) logging.info("Pick action for player %s", player_id) action = command_line_action(env, time_step) else: agent_out = agents[1 - player_1].step(time_step, is_evaluation=True) logging.info("Pick action for player %s", player_id) #action = command_line_action(env, time_step) action = agent_out.action logging.info("Agent action: %s", action) time_step = env.step([action]) logging.info("Rewards: Player_0 %s | Player_1 %s", time_step.rewards[player_1], time_step.rewards[1 - player_1]) logging.info("End of game!")
with tf.Session() as sess:
  dqn_agents = [
      dqn.DQN(
          sess,
          player_id=idx,
          state_representation_size=state_size,
          num_actions=num_actions,
          hidden_layers_sizes=hidden_layers_sizes,
          replay_buffer_capacity=replay_buffer_capacity)
      for idx in range(num_players)
  ]
  # If you want to restore the agents to keep training them, you can
  # uncomment the two lines below:
  # for i in range(num_players):
  #   dqn_agents[i].restore("agents/better_puissance4/", str(i), "99")
  rand_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]
  sess.run(tf.compat.v1.global_variables_initializer())

  # Training
  for ep in range(train_episodes):
    if (ep + 1) % save_every_n_step == 0:
      win_rates = eval_against_random_bots(env, dqn_agents, rand_agents, 100)
      win_rates_history.append(win_rates)
      print("Episode {} of {}, win {}".format(ep + 1, train_episodes,
                                              win_rates))
      for i, agent in enumerate(dqn_agents):
        agent.save(parameters.agent_path, str(i),
                   str(int(ep / save_every_n_step)))
def main(_): game = "tic_tac_toe" num_players = 2 env = rl_environment.Environment(game) num_actions = env.action_spec()["num_actions"] agents = [ tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # random agents for evaluation random_agents = [ random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players) ] # 1. Train the agents training_episodes = FLAGS.num_episodes for cur_episode in range(training_episodes): if cur_episode % int(1e4) == 0: win_rates = eval_against_random_bots(env, agents, random_agents, 1000) logging.info("Starting episode %s, win_rates %s", cur_episode, win_rates) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) time_step = env.step([agent_output.action]) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) if not FLAGS.iteractive_play: return # 2. Play from the command line against the trained agent. human_player = 1 while True: logging.info("You are playing as %s", "O" if human_player else "X") time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == human_player: agent_out = agents[human_player].step(time_step, is_evaluation=True) logging.info("\n%s", agent_out.probs.reshape((3, 3))) logging.info("\n%s", pretty_board(time_step)) action = command_line_action(time_step) else: agent_out = agents[1 - human_player].step(time_step, is_evaluation=True) action = agent_out.action time_step = env.step([action]) logging.info("\n%s", pretty_board(time_step)) logging.info("End of game!") if time_step.rewards[human_player] > 0: logging.info("You win") elif time_step.rewards[human_player] < 0: logging.info("You lose") else: logging.info("Draw") # Switch order of players human_player = 1 - human_player