def __init__(self):
    """Build the Leduc Hold'em NFSP agents and restore pretrained weights.

    Creates a private graph/session pair so the restored variables do not
    collide with any default-graph state, then loads the latest checkpoint
    from ``ROOT_PATH/leduc_holdem_nfsp``.
    """
    super().__init__()
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    # Environment is only needed here for its action/state dimensions.
    env = rlcard.make('leduc-holdem')
    with self.graph.as_default():
        self.nfsp_agents = []
        for player_id in range(env.player_num):
            self.nfsp_agents.append(
                NFSPAgent(self.sess,
                          scope='nfsp' + str(player_id),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          q_norm_step=1000,
                          q_mlp_layers=[128, 128]))
        # NOTE(review): `normalize` presumably fits the agents' input
        # normalisation statistics before weights are restored — confirm
        # against the helper's definition.
        normalize(env, self.nfsp_agents, 1000)
        self.sess.run(tf.global_variables_initializer())

    check_point_path = os.path.join(ROOT_PATH, 'leduc_holdem_nfsp')
    with self.sess.as_default():
        with self.graph.as_default():
            # Restore only model variables; the normalisation stats set
            # above are left untouched.
            saver = tf.train.Saver(tf.model_variables())
            saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
def load_nfsp_leduc_agent(model_path):
    """Load a pretrained NFSP agent for Leduc Hold'em.

    Args:
        model_path (str): directory containing the TensorFlow checkpoint.

    Returns:
        NFSPAgent: the player-0 agent with weights restored from the
        latest checkpoint in ``model_path``.
    """
    # Set a global seed
    set_global_seed(0)

    # BUG FIX: the original body referenced `env` without ever defining it,
    # which raises NameError unless a module-level `env` happened to exist.
    # Build the Leduc Hold'em environment here so the agent's action/state
    # dimensions always match the checkpoint being loaded.
    env = rlcard.make('leduc-holdem')

    # Load pretrained model into a private graph/session pair.
    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with graph.as_default():
        nfsp_agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[128, 128],
                              q_mlp_layers=[128, 128])
            nfsp_agents.append(agent)

    # We have a pretrained model here. Change the path for your model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path
    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
    return nfsp_agents[0]
def __init__(self):
    ''' Load pretrained model '''
    # Imports are local so that TensorFlow is only required when this
    # model is actually instantiated.
    import tensorflow as tf
    from rlcard.agents.nfsp_agent import NFSPAgent

    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    env = rlcard.make('leduc-holdem')
    with self.graph.as_default():
        self.nfsp_agents = []
        for i in range(env.player_num):
            self.nfsp_agents.append(
                NFSPAgent(self.sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          q_mlp_layers=[128, 128]))

    # Restore the pretrained weights from the packaged checkpoint.
    check_point_path = os.path.join(ROOT_PATH, 'leduc_holdem_nfsp')
    with self.sess.as_default():
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
def test_train(self):
    '''Feed random transitions through a tiny NFSP agent and check every
    predicted action stays inside the legal-action range [0, 1].'''
    memory_init_size = 20
    num_steps = 1000

    agent = NFSPAgent(num_actions=2,
                      state_shape=[2],
                      hidden_layers_sizes=[10, 10],
                      reservoir_buffer_capacity=50,
                      batch_size=4,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=50,
                      q_replay_memory_init_size=memory_init_size,
                      q_batch_size=4,
                      q_mlp_layers=[10, 10],
                      device=torch.device('cpu'))

    # A single evaluation step before any training has happened.
    predicted_action, _ = agent.eval_step({
        'obs': np.random.random_sample((2, )),
        'legal_actions': {0: None, 1: None},
        'raw_legal_actions': ['call', 'raise'],
    })
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    for _ in range(num_steps):
        agent.sample_episode_policy()
        predicted_action = agent.step({
            'obs': np.random.random_sample((2, )),
            'legal_actions': {0: None, 1: None},
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        # Transition: (state, action, reward, next_state, done).
        ts = [
            {'obs': np.random.random_sample((2, )),
             'legal_actions': {0: None, 1: None}},
            np.random.randint(2),
            0,
            {'obs': np.random.random_sample((2, )),
             'legal_actions': {0: None, 1: None},
             'raw_legal_actions': ['call', 'raise']},
            True,
        ]
        agent.feed(ts)
def test_init(self):
    '''Construct an NFSP agent and verify it records the action count.'''
    agent = NFSPAgent(
        num_actions=10,
        state_shape=[10],
        hidden_layers_sizes=[10, 10],
        q_mlp_layers=[10, 10],
        device=torch.device('cpu'),
    )
    self.assertEqual(agent._num_actions, 10)
def test_train(self):
    '''Train a tiny TF NFSP agent on random transitions; every predicted
    action must fall inside the legal range [0, 1].'''
    memory_init_size = 20
    step_num = 1000

    sess = tf.compat.v1.InteractiveSession()
    # Some agent internals expect a `global_step` variable to exist.
    tf.Variable(0, name='global_step', trainable=False)

    agent = NFSPAgent(sess=sess,
                      scope='nfsp',
                      action_num=2,
                      state_shape=[2],
                      hidden_layers_sizes=[10, 10],
                      reservoir_buffer_capacity=50,
                      batch_size=4,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=50,
                      q_replay_memory_init_size=memory_init_size,
                      q_batch_size=4,
                      q_mlp_layers=[10, 10])
    sess.run(tf.compat.v1.global_variables_initializer())

    predicted_action, _ = agent.eval_step({
        'obs': np.random.random_sample((2, )),
        'legal_actions': [0, 1],
    })
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    for _ in range(step_num):
        agent.sample_episode_policy()
        predicted_action = agent.step({
            'obs': np.random.random_sample((2, )),
            'legal_actions': [0, 1],
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        # Transition: (state, action, reward, next_state, done).
        ts = [
            {'obs': np.random.random_sample((2, )), 'legal_actions': [0, 1]},
            np.random.randint(2),
            0,
            {'obs': np.random.random_sample((2, )), 'legal_actions': [0, 1]},
            True,
        ]
        agent.feed(ts)

    sess.close()
    tf.compat.v1.reset_default_graph()
def test_init(self):
    '''Construct a TF NFSP agent and verify it records the action count.'''
    sess = tf.compat.v1.InteractiveSession()
    # Some agent internals expect a `global_step` variable to exist.
    tf.Variable(0, name='global_step', trainable=False)

    agent = NFSPAgent(
        sess=sess,
        scope='nfsp',
        action_num=10,
        state_shape=[10],
        hidden_layers_sizes=[10, 10],
        q_mlp_layers=[10, 10],
    )
    self.assertEqual(agent._action_num, 10)

    sess.close()
    tf.compat.v1.reset_default_graph()
def test_evaluate_with(self):
    '''Check the `evaluate_with` option: 'average_policy' must yield a
    legal action, while an unknown mode must raise ValueError.'''
    # Test average policy and value error here
    sess = tf.compat.v1.InteractiveSession()
    tf.Variable(0, name='global_step', trainable=False)
    agent = NFSPAgent(sess=sess,
                      scope='nfsp',
                      action_num=2,
                      state_shape=[2],
                      hidden_layers_sizes=[10, 10],
                      q_mlp_layers=[10, 10],
                      evaluate_with='average_policy')
    sess.run(tf.compat.v1.global_variables_initializer())

    predicted_action, _ = agent.eval_step({
        'obs': np.random.random_sample((2, )),
        'legal_actions': [0, 1],
    })
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    sess.close()
    tf.compat.v1.reset_default_graph()

    # Rebuild with an unsupported evaluation mode and expect a failure.
    sess = tf.compat.v1.InteractiveSession()
    tf.Variable(0, name='global_step', trainable=False)
    agent = NFSPAgent(sess=sess,
                      scope='nfsp',
                      action_num=2,
                      state_shape=[2],
                      hidden_layers_sizes=[10, 10],
                      q_mlp_layers=[10, 10],
                      evaluate_with='random')
    sess.run(tf.compat.v1.global_variables_initializer())

    with self.assertRaises(ValueError):
        predicted_action = agent.eval_step({
            'obs': np.random.random_sample((2, )),
            'legal_actions': [0, 1],
        })

    sess.close()
    tf.compat.v1.reset_default_graph()
# Output locations for the run's metrics and learning curves.
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[512, 512],
                                anticipatory_param=0.1,
                                min_buffer_size_to_learn=memory_init_size,
                                q_replay_memory_init_size=memory_init_size,
                                q_norm_step=norm_step,
                                q_mlp_layers=[512, 512]))
    sess.run(tf.global_variables_initializer())

    # Training plays NFSP vs NFSP; evaluation pits player 0 against a
    # random opponent.
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Count the number of steps
    step_counters = [0 for _ in range(env.player_num)]
def train_mahjong():
    """Train NFSP agents on Mahjong (self-play) and periodically evaluate
    player 0 against three random agents, then plot curves and save the
    trained model to ``models/mahjong_nfsp``.
    """
    # Make environment
    env = rlcard.make('mahjong', config={'seed': 0})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the iterations numbers and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 1000
    episode_num = 10000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_nfsp_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[512, 512],
                              anticipatory_param=0.5,
                              batch_size=256,
                              rl_learning_rate=0.00005,
                              sl_learning_rate=0.00001,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=int(1e5),
                              q_replay_memory_init_size=memory_init_size,
                              train_every=train_every,
                              q_train_every=train_every,
                              q_batch_size=256,
                              q_mlp_layers=[512, 512])
            agents.append(agent)
        random_agent = RandomAgent(action_num=eval_env.action_num)

        env.set_agents(agents)
        eval_env.set_agents(
            [agents[0], random_agent, random_agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        # (fixed: a stray "from rlcard.agents.random_agent import RandomAgent"
        # had been fused onto the end of this comment)
        logger = Logger(log_dir)

        for episode in tqdm(range(episode_num)):
            # First sample a policy for the episode
            for agent in agents:
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for i in range(env.player_num):
                for ts in trajectories[i]:
                    agents[i].feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('NFSP')

        # Save model
        save_dir = 'models/mahjong_nfsp'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[128, 128],
                                min_buffer_size_to_learn=memory_init_size,
                                q_replay_memory_init_size=memory_init_size,
                                train_every=train_every,
                                q_train_every=train_every,
                                q_mlp_layers=[128, 128]))

    # Training is self-play; evaluation pits player 0 against a random agent.
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)
# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
                                anticipatory_param=0.5,
                                batch_size=256,
                                rl_learning_rate=0.00005,
                                sl_learning_rate=0.00001,
                                min_buffer_size_to_learn=memory_init_size,
                                q_replay_memory_size=int(1e5),
                                q_replay_memory_init_size=memory_init_size,
                                q_norm_step=norm_step,
                                q_batch_size=256,
                                q_mlp_layers=[512, 1024, 2048, 1024, 512]))
    sess.run(tf.global_variables_initializer())

    # Three-player evaluation: trained player 0 vs two random opponents.
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent, random_agent])
# Make environment
env = rlcard.make('leduc-holdem')

# Set a global seed
set_global_seed(0)

# Load pretrained model
graph = tf.Graph()
sess = tf.Session(graph=graph)
with graph.as_default():
    nfsp_agents = []
    for i in range(env.player_num):
        nfsp_agents.append(NFSPAgent(sess,
                                     scope='nfsp' + str(i),
                                     action_num=env.action_num,
                                     state_shape=env.state_shape,
                                     hidden_layers_sizes=[128, 128],
                                     q_mlp_layers=[128, 128]))

# We have a pretrained model here. Change the path for your model.
check_point_path = os.path.join(rlcard.__path__[0],
                                'models/pretrained/leduc_holdem_nfsp')
with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                anticipatory_param=0.1,
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                rl_learning_rate=0.1,
                                sl_learning_rate=0.005,
                                hidden_layers_sizes=[128, 128],
                                min_buffer_size_to_learn=memory_init_size,
                                q_replay_memory_init_size=memory_init_size,
                                q_epsilon_start=0.06,
                                q_epsilon_end=0.0,
                                q_norm_step=norm_step,
                                q_mlp_layers=[128, 128]))
    sess.run(tf.global_variables_initializer())

    # Saver created up front so checkpoints can be written during training.
    saver = tf.train.Saver()

    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents(agents)
### Step 2: Initialize the NFSP agents. ###
import tensorflow.compat.v1 as tf
from rlcard.agents.nfsp_agent import NFSPAgent

# Run the TF1-style NFSP agent on TF2 installs.
tf.disable_v2_behavior()

memory_init_size = 1000
norm_step = 100

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[128, 128],
                                min_buffer_size_to_learn=1000,
                                q_replay_memory_init_size=memory_init_size,
                                q_update_target_estimator_every=norm_step,
                                q_mlp_layers=[128, 128]))

    # with sess.as_default():  # uncomment when loading
    #     saver = tf.train.Saver()
    #     saver.restore(sess, tf.train.latest_checkpoint(save_dir))
    sess.run(tf.global_variables_initializer())  # comment out when loading

    env.set_agents(agents)  # Setup all nfsp agents into training environments
    # Setup random agent for evaluation
    random_agent = RandomAgent(action_num=eval_env.action_num)
    eval_env.set_agents([agents[0], random_agent])