def test_add_point(self):
    logger = Logger(xlabel="x", ylabel="y", legend="test",
                    csv_path="./newtest/test_csv.csv")
    logger.add_point(x=1, y=1)
    self.assertEqual(logger.xs[0], 1)
    self.assertEqual(logger.ys[0], 1)
def test_close_file(self):
    logger = Logger(xlabel="x", ylabel="y", legend="test",
                    log_path="./newtest/test_log.txt",
                    csv_path="./newtest/test_csv.csv")
    logger.close_file()
    self.assertTrue(os.path.exists('./newtest/'))
def train():
    env = rlcard.make('mahjong', {'allow_step_back': True})
    # env = rlcard.make('mahjong')

    # Set the iteration numbers and how frequently we evaluate/save the plot
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 100000

    # The paths for saving the logs and learning curves
    root_path = './experiments/mahjong_cfr_result/'
    log_path = root_path + 'log.txt'
    csv_path = root_path + 'performance.csv'
    figure_path = root_path + 'figures/'

    # Set a global seed
    set_global_seed(0)

    # Initialize CFR agent
    agent = MCCFRAgent(env)

    # Init a Logger to plot the learning curve
    logger = Logger(root_path)

    for episode in range(episode_num + 1):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        if episode % 5000 == 0:
            agent.save(episode)

        # # Evaluate the performance. Play with NFSP agents.
        # if episode % evaluate_every == 0:
        #     reward = 0
        #     for eval_episode in range(evaluate_num):
        #         _, payoffs = eval_env.run(is_training=False)
        #         reward += payoffs[0]
        #
        #     logger.log('\n########## Evaluation ##########')
        #     logger.log('Iteration: {} Average reward is {}'.format(episode, float(reward)/evaluate_num))
        #
        #     # Add point to logger
        #     logger.add_point(x=env.timestep, y=float(reward)/evaluate_num)
        #
        #     # Make plot
        #     if episode % save_plot_every == 0 and episode > 0:
        #         logger.make_plot(save_path=figure_path+str(episode)+'.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')
def test_log(self): log_dir = "experiments/newtest/test_log.txt" if os.path.exists(log_dir): shutil.rmtree(log_dir) with Logger(log_dir) as logger: logger.log("test text") logger.log_performance(1, 1) logger.log_performance(2, 2) logger.log_performance(3, 3)
def test_add_point(self):
    logger = Logger(xlabel="x", ylabel="y", legend="test",
                    csv_path="./newtest/test_csv.csv")
    logger.add_point(x=1, y=1)
    self.assertEqual(logger.xs[0], 1)
    self.assertEqual(logger.ys[0], 1)
    with self.assertRaises(ValueError):
        logger.add_point(None, None)
def test_log(self):
    logger = Logger(xlabel="x", ylabel="y", legend="test",
                    log_path="./newtest/test_log.txt")
    logger.log("test text")
    f = open("./newtest/test_log.txt", "r")
    contents = f.read()
    self.assertEqual(contents, "test text\n")
    logger.close_file()
def test_make_plot(self):
    logger = Logger(xlabel="x", ylabel="y", legend="test")
    for x in range(10):
        logger.add_point(x=x, y=x * x)
    self.assertEqual(9 * 9, logger.ys[9])
    save_path = './newtest/test.png'
    save_dir = os.path.dirname(save_path)
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    logger.make_plot(save_path=save_path)
    shutil.rmtree(save_dir)
def test_log(self):
    log_path = "./newtest/test_log.txt"
    log_dir = os.path.dirname(log_path)
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    logger = Logger(xlabel="x", ylabel="y", legend="test", log_path=log_path)
    logger.log("test text")
    f = open("./newtest/test_log.txt", "r")
    contents = f.read()
    self.assertEqual(contents, "test text\n")
    logger.close_file()
    shutil.rmtree(log_dir)
def test_make_plot(self):
    logger = Logger(xlabel="x", ylabel="y", legend="test")
    for x in range(10):
        logger.add_point(x=x, y=x * x)
    self.assertEqual(9 * 9, logger.ys[9])
    logger.make_plot(save_path='./newtest/test.png')
class ExperimentRunner:
    def __init__(self, env, eval_env, log_every, save_every, base_dir,
                 config, training_agent, vs_agent, feed_function, save_function):
        self.save_dir = "{}/{}".format(base_dir, datetime.now().strftime("%Y%m%d"))
        self.log_dir = os.path.join(self.save_dir, "logs/")
        self.model_dir = os.path.join(self.save_dir, "model/")
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

        self.log_every = log_every
        self.save_every = save_every
        self.config = config

        self.env = env
        self.eval_env = eval_env
        self.agent = training_agent
        self.training_agents = [self.agent, vs_agent]
        self.env.set_agents(self.training_agents)

        self.logger = Logger(self.log_dir)
        self.logger.log("CONFIG: ")
        self.logger.log(str(config))
        self.stat_logger = YanivStatLogger(self.logger)

        self.feed_function = feed_function
        self.save_function = save_function

        self.action_space = (utils.JOINED_ACTION_SPACE
                             if config['single_step_actions']
                             else utils.ACTION_SPACE)

    def feed_game(self, agent, trajectories, player_id):
        self.feed_function(agent, trajectories[player_id])

        if self.config.get("feed_both_games"):
            # Parenthesize (player_id + 1) before taking the modulus; the
            # original `player_id + 1 % len(...)` indexed out of range for
            # the second player.
            other_id = (player_id + 1) % len(self.training_agents)
            other_traj = trajectories[other_id]
            if self.training_agents[other_id].use_raw:
                self.feed_function(
                    agent,
                    list(map(lambda t: [t[0], self.action_space[t[1]], *t[2:]],
                             other_traj)))
            else:
                self.feed_function(agent, other_traj)

    def run_training(self, episode_num, eval_every, eval_vs, eval_num):
        for episode in trange(episode_num, desc="Episodes", file=sys.stdout):
            # Generate data from the environment
            trajectories, _ = self.env.run(is_training=True)
            self.stat_logger.add_game(trajectories, self.env, 0)

            self.feed_game(self.agent, trajectories, 0)
            if self.config['feed_both_agents']:
                self.feed_game(self.training_agents[1], trajectories, 1)

            if episode != 0 and episode % self.log_every == 0:
                self.stat_logger.log_stats()

            if episode != 0 and episode % self.save_every == 0:
                self.save_function(self.agent, self.model_dir)

            if episode != 0 and episode % eval_every == 0:
                self.logger.log(
                    "\n\n########## Evaluation {} ##########".format(episode))
                self.evaluate_perf(eval_vs, eval_num)

        self.evaluate_perf(eval_vs, eval_num)
        self.save_function(self.agent, self.model_dir)

    def evaluate_perf(self, eval_vs, eval_num):
        if isinstance(eval_vs, list):
            for vs in eval_vs:
                self.run_evaluation(vs, eval_num)
        else:
            self.run_evaluation(eval_vs, eval_num)

    def run_evaluation(self, vs, num):
        self.eval_env.set_agents([self.agent, vs])
        self.logger.log("eval vs {}".format(vs.__class__.__name__))
        r = tournament(self.eval_env, num)

        eval_vs = "eval_{}_".format(vs.__class__.__name__)
        wandb.log({
            eval_vs + "payoff": r["payoffs"][0],
            eval_vs + "draws": r["draws"],
            eval_vs + "roundlen": r["roundlen"],
            eval_vs + "assafs": r["assafs"][0],
            eval_vs + "win_rate": r["wins"][0] / num,
        })

        self.logger.log("Timestep: {}, avg roundlen: {}".format(
            self.env.timestep, r["roundlen"]))
        for i in range(self.env.player_num):
            self.logger.log(
                "Agent {}:\nWins: {}, Draws: {}, Assafs: {}, Payoff: {}".format(
                    i, r["wins"][i], r["draws"], r["assafs"][i], r["payoffs"][i]))

        self.logger.log_performance(self.env.timestep, r["payoffs"][0])
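# A minimal usage sketch for ExperimentRunner. Everything below is
# illustrative: `example_feed`, `example_save`, and the config keys are
# assumptions for demonstration, not the project's actual entry point;
# the caller is assumed to supply compatible env/eval_env and agents.

def example_feed(agent, trajectory):
    # Feed one player's transitions into the agent's replay memory
    for ts in trajectory:
        agent.feed(ts)

def example_save(agent, model_dir):
    # Persist the agent; assumes the agent exposes a save() method
    agent.save(model_dir)

def run_example(env, eval_env, agent, opponent):
    runner = ExperimentRunner(
        env=env, eval_env=eval_env,
        log_every=100, save_every=1000,
        base_dir="./experiments/yaniv",
        config={'single_step_actions': False,
                'feed_both_agents': False,
                'feed_both_games': False},
        training_agent=agent, vs_agent=opponent,
        feed_function=example_feed, save_function=example_save)
    runner.run_training(episode_num=50000, eval_every=1000,
                        eval_vs=opponent, eval_num=100)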
global_step = tf.Variable(0, name='global_step', trainable=False)
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_size=int(1e5),
                 replay_memory_init_size=memory_init_size,
                 norm_step=norm_step,
                 state_shape=env.state_shape,
                 mlp_layers=[128, 128])
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='reward',
                legend='DQN on Leduc Holdem',
                log_path=log_path, csv_path=csv_path)

state = env.reset()

for timestep in range(timesteps):
    action = agent.step(state)
    next_state, reward, done = env.step(action)
    ts = (state, action, reward, next_state, done)
    agent.feed(ts)

    train_count = timestep - (memory_init_size + norm_step)
    if train_count > 0:
        loss = agent.train()
        print('\rINFO - Step {}, loss: {}'.format(timestep, loss), end='')
def train_mahjong():
    # Make environment
    env = rlcard.make('mahjong', config={'seed': 0})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 1000
    episode_num = 10000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_nfsp_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[512, 512],
                              anticipatory_param=0.5,
                              batch_size=256,
                              rl_learning_rate=0.00005,
                              sl_learning_rate=0.00001,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=int(1e5),
                              q_replay_memory_init_size=memory_init_size,
                              train_every=train_every,
                              q_train_every=train_every,
                              q_batch_size=256,
                              q_mlp_layers=[512, 512])
            agents.append(agent)
        random_agent = RandomAgent(action_num=eval_env.action_num)

        env.set_agents(agents)
        eval_env.set_agents([agents[0], random_agent, random_agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in tqdm(range(episode_num)):
            # First sample a policy for the episode
            for agent in agents:
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for i in range(env.player_num):
                for ts in trajectories[i]:
                    agents[i].feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('NFSP')

        # Save model
        save_dir = 'models/mahjong_nfsp'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
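# A short sketch (an assumption, not part of the script above) of how the
# checkpoint written by train_mahjong() could be restored later via the
# standard tf.train.Saver API. `agents_builder` is a hypothetical callable
# that reconstructs the same NFSP graph (same scopes and layer sizes)
# inside `sess` before the variables are restored.
def restore_mahjong_model(sess, agents_builder, save_dir='models/mahjong_nfsp'):
    agents = agents_builder(sess)
    saver = tf.train.Saver()
    saver.restore(sess, os.path.join(save_dir, 'model'))
    return agents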
                  q_norm_step=norm_step,
                  q_mlp_layers=[512, 512])
agents.append(agent)
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents(agents)
eval_env.set_agents([agents[0], random_agent])

# Count the number of steps
step_counters = [0 for _ in range(env.player_num)]

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='reward',
                legend='NFSP on Limit Texas Holdem',
                log_path=log_path, csv_path=csv_path)

for episode in range(episode_num):
    # First sample a policy for the episode
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)
# Initialize a global step
global_step = tf.Variable(0, name='global_step', trainable=False)

# Set up the agents
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[128, 128])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

state = env.reset()

for timestep in range(timesteps):
    action = agent.step(state)
    next_state, reward, done = env.step(action)
    ts = (state, action, reward, next_state, done)
    agent.feed(ts)

    if timestep % evaluate_every == 0:
        rewards = []
        state = eval_env.reset()
        for _ in range(evaluate_num):
            action, _ = agent.eval_step(state)
            # Step the evaluation environment (the original stepped `env`
            # here, which would mix training and evaluation state)
            _, reward, done = eval_env.step(action)
def test_log(self): log_dir = "./newtest/test_log.txt" if os.path.exists(log_dir): shutil.rmtree(log_dir) logger = Logger(log_dir) logger.log("test text") logger.log_performance(1, 1) logger.log_performance(2, 2) logger.log_performance(3, 3) logger.close_files() logger.plot('aaa')
                  q_replay_memory_size=int(1e5),
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every,
                  q_train_every=train_every,
                  q_batch_size=256,
                  q_mlp_layers=[512, 1024, 2048, 1024, 512])
agents.append(agent)
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents(agents)
eval_env.set_agents([agents[0], random_agent])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    # First sample a policy for the episode
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)
                 norm_step=norm_step,
                 state_shape=env.state_shape,
                 mlp_layers=[512, 512])
random_agent = RandomAgent(action_num=eval_env.action_num)
sess.run(tf.global_variables_initializer())

env.set_agents([agent, random_agent, random_agent, random_agent, random_agent])
eval_env.set_agents([agent, random_agent, random_agent, random_agent, random_agent])

# Count the number of steps
step_counter = 0

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='hand reward',
                legend='DQN on Badugi',
                log_path=log_path, csv_path=csv_path)

if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)
        step_counter += 1

        # Train the agent
        train_count = step_counter - (memory_init_size + norm_step)
    agents.append(agent)
sess.run(tf.global_variables_initializer())

random_agent = RandomAgent(action_num=eval_env.action_num)
env.set_agents(agents)
eval_env.set_agents([agents[0], random_agent, random_agent])

# Count the number of steps
step_counters = [0 for _ in range(env.player_num)]

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='reward',
                legend='NFSP on Dou Dizhu',
                log_path=log_path, csv_path=csv_path)

for episode in range(episode_num):
    # First sample a policy for the episode
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)
self_played = True
self_record_number = 1
models = {'0': RandomAgent, '250007': TarotDQNModelV250007}

# Model save path
save_path = 'examples/statistics/tarot_v{}/'.format(str(stats_on_model))
if not os.path.exists('examples/statistics'):
    os.makedirs('examples/statistics')
if not os.path.exists(save_path):
    os.makedirs(save_path)

csv_path_taking = save_path + 'taking_stats.csv'
logger_taking = Logger(xlabel='hand_value', ylabel='nb_bouts',
                       zlabel='taking_bid_order', legend='',
                       csv_path=csv_path_taking)

csv_path_game = save_path + 'games_stats.csv'
logger_game = Logger(label_list=[
    'game_id', 'hand_value', 'nb_bouts', 'nb_bouts_dog', 'taking',
    'taking_bid_order', 'number_of_points_achieved', 'nb_bouts_achieved',
    'reward'
], legend='', csv_path=csv_path_game)

# Testing bid strategy of this agent
with tf.compat.v1.Session() as sess:
# Set a global seed
set_global_seed(0)

agents = [
    DQNAgent(scope=f'dqn_{i}',
             action_num=env.action_num,
             replay_memory_init_size=memory_init_size,
             train_every=train_every,
             state_shape=env.state_shape,
             mlp_layers=[128, 128],
             device=torch.device('cpu'))
    for i in range(env.player_num)
]
env.set_agents(agents)
eval_env.set_agents(agents)

logger = Logger(log_dir)

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for agent_idx, traj in enumerate(trajectories):
        for ts in traj:
            agents[agent_idx].feed(ts)

    # Evaluate the performance.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[512, 512])
random_agent = RandomAgent(action_num=eval_env.action_num)
env.set_agents([agent, random_agent, random_agent])
eval_env.set_agents([agent, random_agent, random_agent])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        t = tournament(eval_env, evaluate_num)[0]
        logger.log_performance(env.timestep, t)

# Close files in the logger
sess.run(tf.compat.v1.global_variables_initializer())
saver = tf.compat.v1.train.Saver()

env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
eval_env.set_agents([agent] + [random_agent] * (env.player_num - 1))

# Count the number of steps
step_counter = 0

# Init a Logger to plot the learning curve against random
logger_random = Logger(
    xlabel='timestep',
    ylabel='reward',
    legend='DQN on TAROT against Random',
    legend_hist='Histogram of last evaluations against Random',
    log_path=log_path_random,
    csv_path=csv_path_random)

# Init a Logger to plot the learning curve against last opponent
logger_opponent = Logger(
    xlabel='timestep',
    ylabel='reward',
    legend='DQN on TAROT against last agent',
    legend_hist='Histogram of last evaluations against last agent',
    log_path=log_path_opponent,
    csv_path=csv_path_opponent)

total_game_played = 0
seconds = time.time()
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 norm_step=norm_step,
                 state_shape=env.state_shape,
                 mlp_layers=[10, 10])
env.set_agents([agent])
eval_env.set_agents([agent])
sess.run(tf.global_variables_initializer())

# Count the number of steps
step_counter = 0

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='reward',
                legend='DQN on Blackjack',
                log_path=log_path, csv_path=csv_path)

for episode in range(episode_num // evaluate_every):
    # Generate data from the environment
    tasks = assign_task(evaluate_every, PROCESS_NUM)
    for task in tasks:
        INPUT_QUEUE.put((task, True, None, None))
    for _ in range(evaluate_every):
        trajectories = OUTPUT_QUEUE.get()

        # Feed transitions into agent memory, and train
        for ts in trajectories[0]:
            agent.feed(ts)
# Set agents
global_step = tf.Variable(0, name='global_step', trainable=False)
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 state_shape=env.state_shape,
                 train_every=train_every,
                 mlp_layers=[128, 128])
random_agent = RandomAgent(action_num=env.action_num)
env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num // evaluate_every):
    # Generate data from the environment
    tasks = assign_task(evaluate_every, PROCESS_NUM)
    for task in tasks:
        INPUT_QUEUE.put((task, True, None, None))
    for _ in range(evaluate_every):
        trajectories, timestep = OUTPUT_QUEUE.get()
        env.timestep += timestep

        # Feed transitions into agent memory, and train
        for ts in trajectories[0]:
            agent.feed(ts)

    # Evaluate the performance
    reward = 0
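# For context, a plausible implementation of the assign_task helper used
# above. This is an assumption for illustration; the actual helper lives
# in the example's multiprocessing utilities.
def assign_task(task_num, process_num):
    # Split task_num episodes as evenly as possible across the workers
    chunk = task_num // process_num
    tasks = [chunk] * process_num
    for i in range(task_num % process_num):
        tasks[i] += 1
    return tasks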
                       batch_size=32,
                       model_path='./DeepCFR_model')
#agent0 = MCCFRagent2.MCCFRagent(env, isAbs=False, CFR_num=1, tra_num=10)
#agent1 = cfr_plus_agent2.CFRAPlusgent(env, isAbs=True, CFR_num=1, tra_num=2)
#agent2 = cfr_plus_agent2.CFRAPlusgent(env, isAbs=False, CFR_num=1, tra_num=2)
agent3 = RandomAgent(action_num=env.action_num)
l = []

from rlcard.utils.logger import Logger

root_path = './model_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'
logger = Logger(xlabel='iteration', ylabel='exploitability',
                legend='DeepCFR+_model',
                log_path=log_path, csv_path=csv_path)

r = utils.reward()

'''
start = time.perf_counter()
e1 = np.mean(r.computer_reward(agent0, agent2, evaluate_num*20, Process_num, eval_env))
e2 = np.mean(r.computer_reward(agent1, agent2, evaluate_num*20, Process_num, eval_env))
end = time.perf_counter()
logger.log('episode {}:{:.5f},{:.5f} test time:{}'.format(0, e1, e2, end-start))
'''

for i in range(100):
    start = time.perf_counter()
    agent0.deepCFR(i, 8)
# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_cfr_result/'

# Set a global seed
set_global_seed(0)

# Initialize CFR agent
agent = CFRAgent(env)
agent.load()  # If we have a saved model, load it first

# Evaluate CFR against pre-trained NFSP
eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    agent.train()
    print('\rIteration {}'.format(episode), end='')

    # Evaluate the performance. Play with NFSP agents.
    if episode % evaluate_every == 0:
        agent.save()  # Save model
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('CFR')
episode_num = 10000000

# The paths for saving the logs and learning curves
root_path = './experiments/mahjong_cfr_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

# Initialize CFR agent
agent = CFRAgent(env)

# Init a Logger to plot the learning curve
logger = Logger(root_path)

for episode in range(episode_num):
    agent.train()
    print('\rIteration {}'.format(episode), end='')
    agent.save()

    # Evaluate the performance. Play with NFSP agents.
    if episode % evaluate_every == 0:
        reward = 0
        for eval_episode in range(evaluate_num):
            _, payoffs = eval_env.run(is_training=False)
            reward += payoffs[0]

        logger.log('\n########## Evaluation ##########')
        logger.log('Iteration: {} Average reward is {}'.format(
            episode, float(reward) / evaluate_num))
agent = models[str(against_model)](sess.graph, sess).dqn_agent
opponent_agent = agent
sess.run(tf.compat.v1.global_variables_initializer())
saver = tf.compat.v1.train.Saver()

env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
eval_env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))

# Count the number of steps
step_counter = 0

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='reward',
                legend='DQN on TAROT',
                legend_hist='Histogram of last evaluations',
                log_path=log_path, csv_path=csv_path)

total_game_played = 0
seconds = time.time()

for episode in range(episode_num):
    print('\rEPISODE {} - Number of games played {} - {}'.format(
        episode, total_game_played,
        time_difference_good_format(seconds, time.time())), end='')

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
    total_game_played += 1

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)
# Set a global seed
set_global_seed(0)

# Initialize CFR agent
opponent = CFRAgent(env)
#opponent = RandomAgent(action_num=env.action_num)
#opponent.load()  # If we have a saved model, load it first

#agent = RandomAgent(action_num=env.action_num)
agent = BRAgent(eval_env, opponent)
#agent = CFRAgent(env)

# Evaluate CFR against pre-trained NFSP
# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    opponent.train()
    #agent.train()
    print('\rIteration {}'.format(episode), end='')

    # Evaluate the performance. Play with NFSP agents.
    if episode % evaluate_every == 0:
        exploitability(eval_env, opponent)
        #logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()
logger.plot('BR')