def load_nfsp_leduc_agent(model_path):
    # Set a global seed
    set_global_seed(0)

    # Load pretrained model
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with graph.as_default():
        nfsp_agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[128, 128],
                              q_mlp_layers=[128, 128])
            nfsp_agents.append(agent)

    # We have a pretrained model here. Change the path for your model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path
    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

    return nfsp_agents[0]
def load_dqn_leduc_agent(model_path):
    # Set a global seed
    set_global_seed(0)

    # Load pretrained model
    # tf.reset_default_graph()
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with graph.as_default():
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])

    # We have a pretrained model here. Change the path for your model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path
    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

    return agent
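# Usage sketch for the two loaders above. Assumptions: `eval_env` is a
# Leduc Hold'em environment and both checkpoint paths are hypothetical.
nfsp_agent = load_nfsp_leduc_agent('./models/leduc_holdem_nfsp')
dqn_agent = load_dqn_leduc_agent('./models/leduc_holdem_dqn')
eval_env.set_agents([nfsp_agent, dqn_agent])
_, payoffs = eval_env.run(is_training=False)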
def multi_traverse(self, q, player_id, num):
    regretMemory = []
    policyMemory = []
    # Seed each worker with its PID so parallel traversals differ
    set_global_seed(os.getpid())
    for i in range(num):
        self.env.init_game()
        probs = np.ones(self.env.player_num)
        self.traverse_tree(max(self.iteration - self.startPolicy, 0), 1,
                           player_id, regretMemory, policyMemory)
    q.put([regretMemory, policyMemory])
    return [regretMemory, policyMemory]
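# A minimal fan-out sketch for multi_traverse (assumptions: `agent` is an
# instance of the class above; the caller merges the per-worker memories).
import multiprocessing

def parallel_traverse(agent, player_id, num_process, num_per_process):
    q = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=agent.multi_traverse,
                                       args=(q, player_id, num_per_process))
               for _ in range(num_process)]
    for w in workers:
        w.start()
    regret_memory, policy_memory = [], []
    for _ in workers:
        # Drain one [regretMemory, policyMemory] pair per worker before
        # joining, so a full queue cannot block a worker from exiting
        regrets, policies = q.get()
        regret_memory.extend(regrets)
        policy_memory.extend(policies)
    for w in workers:
        w.join()
    return regret_memory, policy_memory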
def train():
    env = rlcard.make('mahjong', {'allow_step_back': True})
    # env = rlcard.make('mahjong')

    # Set the iteration numbers and how frequently we evaluate/save the plot
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 100000

    # The paths for saving the logs and learning curves
    root_path = './experiments/mahjong_cfr_result/'
    log_path = root_path + 'log.txt'
    csv_path = root_path + 'performance.csv'
    figure_path = root_path + 'figures/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the MCCFR agent
    agent = MCCFRAgent(env)

    # Init a Logger to plot the learning curve
    logger = Logger(root_path)

    for episode in range(episode_num + 1):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        if episode % 5000 == 0:
            agent.save(episode)

        # # Evaluate the performance. Play with NFSP agents.
        # if episode % evaluate_every == 0:
        #     reward = 0
        #     for eval_episode in range(evaluate_num):
        #         _, payoffs = eval_env.run(is_training=False)
        #         reward += payoffs[0]
        #
        #     logger.log('\n########## Evaluation ##########')
        #     logger.log('Iteration: {} Average reward is {}'.format(episode, float(reward) / evaluate_num))
        #
        #     # Add point to logger
        #     logger.add_point(x=env.timestep, y=float(reward) / evaluate_num)
        #
        # # Make plot
        # if episode % save_plot_every == 0 and episode > 0:
        #     logger.make_plot(save_path=figure_path + str(episode) + '.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')
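# Entry point (assumed; matches how the other example scripts here run)
if __name__ == '__main__':
    train()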
def traverse(self, agent1, agent2, evaluate_num, eval_env):
    reward = []
    set_global_seed(random.randint(0, 100))
    for eval_episode in range(evaluate_num):
        # Reset the agents' oppoCV cache, if present, before each hand
        if hasattr(agent1, 'oppoCV'):
            agent1.oppoCV = None
        if hasattr(agent2, 'oppoCV'):
            agent2.oppoCV = None
        his, payoffs = eval_env.run(is_training=False)
        reward.append(payoffs[0])
    return np.mean(reward)
def main():
    # Make environment
    env = rlcard.make('blackjack')
    episode_num = 2

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent_0 = RandomAgent(action_num=env.action_num)
    env.set_agents([agent_0])

    for episode in range(episode_num):
        # Generate data from the environment
        trajectories, _ = env.run(is_training=False)

        # Print out the trajectories
        print('\nEpisode {}'.format(episode))
        for ts in trajectories[0]:
            print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.format(
                ts[0], ts[1], ts[2], ts[3], ts[4]))
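# Entry point (assumed; RLCard example scripts are typically run this way)
if __name__ == '__main__':
    main()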
root_path = './experiments/tarot_dqn_result_v{}/'.format(str(record_number))
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Model save path
if not os.path.exists('rlcard/models'):
    os.makedirs('rlcard/models')
if not os.path.exists('rlcard/models/pretrained'):
    os.makedirs('rlcard/models/pretrained')
if not os.path.exists('rlcard/models/pretrained/tarot_v' + str(record_number)):
    os.makedirs('rlcard/models/pretrained/tarot_v' + str(record_number))
model_path = 'rlcard/models/pretrained/tarot_v' + str(record_number) + '/model'

# Set a global seed
set_global_seed(0)

with tf.compat.v1.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = models[str(against_model)](sess.graph, sess).dqn_agent
    opponent_agent = agent
    sess.run(tf.compat.v1.global_variables_initializer())
    saver = tf.compat.v1.train.Saver()

    env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
    eval_env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
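    # The training loop that would follow is omitted here; a minimal
    # sketch, assuming `episode_num`, `evaluate_every`, `evaluate_num`,
    # and a `logger` are defined as in the other scripts in this repo:
    # for episode in range(episode_num):
    #     trajectories, _ = env.run(is_training=True)
    #     for ts in trajectories[0]:
    #         agent.feed(ts)
    #     if episode % evaluate_every == 0:
    #         logger.log_performance(env.timestep,
    #                                tournament(eval_env, evaluate_num)[0])
    # saver.save(sess, model_path)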
def test_set_global_seed(self):
    set_global_seed(0)
    self.assertEqual(np.random.get_state()[1][0], 0)
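# A companion check (sketch): identical seeds should reproduce the same
# NumPy draws, which is the property the test above relies on.
def test_seed_reproducibility(self):
    set_global_seed(0)
    first = np.random.rand(5)
    set_global_seed(0)
    second = np.random.rand(5)
    self.assertTrue(np.array_equal(first, second))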
def train_mahjong():
    # Make environment
    env = rlcard.make('mahjong', config={'seed': 0})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 1000
    episode_num = 10000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_nfsp_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[512, 512],
                              anticipatory_param=0.5,
                              batch_size=256,
                              rl_learning_rate=0.00005,
                              sl_learning_rate=0.00001,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=int(1e5),
                              q_replay_memory_init_size=memory_init_size,
                              train_every=train_every,
                              q_train_every=train_every,
                              q_batch_size=256,
                              q_mlp_layers=[512, 512])
            agents.append(agent)
        random_agent = RandomAgent(action_num=eval_env.action_num)

        env.set_agents(agents)
        eval_env.set_agents([agents[0], random_agent, random_agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in tqdm(range(episode_num)):
            # First sample a policy for the episode
            for agent in agents:
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for i in range(env.player_num):
                for ts in trajectories[i]:
                    agents[i].feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('NFSP')

        # Save model
        save_dir = 'models/mahjong_nfsp'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
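# Reloading the saved model later follows the same pattern as the Leduc
# loaders above (a minimal sketch; assumes the graph has been rebuilt
# with identical agent scopes first):
# with tf.Session() as sess:
#     saver = tf.train.Saver()
#     saver.restore(sess, tf.train.latest_checkpoint('models/mahjong_nfsp'))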
# Set the number of processes
process_num = 8

# Set episode_num
episode_num = 10000

# Assign tasks to the processes
per_tasks = assign_task(episode_num, process_num)

# Set game and make environment
game = 'doudizhu'
env = rlcard.make(game)

# Set global seed
set_global_seed(1)

# Set up agents
agent_num = env.player_num
env.set_agents([RandomAgent(action_num=env.action_num)
                for _ in range(agent_num)])

# Set up a shared list to collect trajectories across processes
manager = multiprocessing.Manager()
trajectories_set = manager.list()

# Generate processes
processes = []
for p in range(process_num):
    process = multiprocessing.Process(target=env.run_multi,
                                      args=(per_tasks[p], trajectories_set))
    processes.append(process)
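# Launch the workers and wait for them to finish (a sketch; assumes
# env.run_multi appends each episode's trajectories to trajectories_set)
for process in processes:
    process.start()
for process in processes:
    process.join()
print('Collected {} trajectory sets'.format(len(trajectories_set)))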
# Set the iteration numbers and how frequently we evaluate/save the plot
# (reduced from 100/500/50/1000 for a quick run)
evaluate_every = 1
save_plot_every = 5
evaluate_num = 5
episode_num = 1

# The paths for saving the logs and learning curves
root_path = './experiments/nolimit_holdem_cfr_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'
log_reward_path = root_path + '_reward_log.txt'
csv_reward_path = root_path + '_reward_performance.csv'

# Set a global seed
set_global_seed(10)

# Initialize the CFR agent
agent = cfr_agent.CFRAgent(env)
# agent.load()  # If we have a saved model, load it first

# Evaluate CFR against a random agent
# (or against pre-trained NFSP:)
# eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])
eval_env.set_agents([agent, RandomAgent(action_num=env.action_num)])

# Init Loggers to plot the learning curves
logger = Logger(xlabel='iteration', ylabel='exploitability',
                legend='CFR on nolimit Holdem',
                log_path=log_path, csv_path=csv_path)
logger_reward = Logger(xlabel='iteration', ylabel='reward',
                       legend='CFR on nolimit Holdem',
                       log_path=log_reward_path, csv_path=csv_reward_path)
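# A sketch of the iteration loop that would follow, assuming the same
# pattern as the other CFR scripts here (compute_exploitability as in
# the traverse() helper below):
# for episode in range(episode_num):
#     agent.train()
#     if episode % evaluate_every == 0:
#         logger.add_point(x=episode,
#                          y=agent.compute_exploitability(evaluate_num))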
def traverse(self, evaluate_num):
    reward = []
    set_global_seed(random.randint(0, 100))
    reward.append(self.agent.compute_exploitability(evaluate_num))
    return np.mean(reward)
import rlcard
import torch
from rlcard.agents.reinforce_agent import ReinforceAgent
from rlcard.utils.utils import set_global_seed, tournament
from rlcard.utils.logger import Logger

episode_num = 100000
evaluate_num = 10000
evaluate_every = 1000

env = rlcard.make("blackjack")
eval_env = rlcard.make("blackjack")
log_dir = './experiments/blackjack_reinforce_result/'

set_global_seed(42)

agent = ReinforceAgent(scope="reinforce_agent",
                       action_num=env.action_num,
                       state_shape=env.state_shape,
                       discount_factor=0.99,
                       learning_rate=1e-6,
                       device=None)

env.set_agents([agent])
eval_env.set_agents([agent])

logger = Logger(log_dir)

for episode in range(episode_num):
    trajectories, _ = env.run(is_training=True)
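    # The rest of the training loop is a sketch following the pattern of
    # the other scripts here (assumes ReinforceAgent provides feed(), as
    # the other agents in this repo do):
    for ts in trajectories[0]:
        agent.feed(ts)
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep,
                               tournament(eval_env, evaluate_num)[0])

logger.close_files()
logger.plot('REINFORCE')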