    # First sample a policy for the episode
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        # Update the RL model with this player's transitions
        for ts in trajectories[i]:
            agents[i].feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log(
            '\n\n\n---------------------------------------------------------------\n'
            'Tournament ' + str(episode / evaluate_every))
        # tournament(eval_env2, 6)
        # exploitability.exploitability(eval_env, agents[0], 500)
        res = tournament(env, evaluate_num)
        logger.log_performance(env.timestep, res[0])
        res2 = tournament(eval_env, evaluate_num // 3)
        logger.log_performance(env.timestep, res2[0])
        res3 = tournament(eval_env2, evaluate_num // 3)
        logger.log_performance(env.timestep, res3[0])
        logger.log(str(episode_num) + " - " + str(episode) + '\n')
        logger.log(
            '\n\n----------------------------------------------------------------')
    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)

    # Extra logging: average both players' payoffs over the evaluation games
    if episode % evaluate_every == 0:
        reward = 0
        reward2 = 0
        for eval_episode in range(evaluate_num):
            _, payoffs = eval_env.run(is_training=False)
            reward += payoffs[0]
            reward2 += payoffs[1]

        logger.log("\n\n########## Evaluation {} ##########".format(episode))
        reward_text = "{}".format(float(reward) / evaluate_num)
        reward2_text = "{}".format(float(reward2) / evaluate_num)
        info = "Timestep: {} Average reward is {}, reward2 is {}".format(
            env.timestep, reward_text, reward2_text)
        logger.log(info)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep,
                               tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
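# The manual loop above computes the same statistic as rlcard's `tournament`
# helper used just below it: play `evaluate_num` games and average each
# player's payoff. A minimal sketch of that averaging, for reference only
# (the real implementation lives in rlcard.utils; this is an illustration,
# not the library source):
def average_payoffs_sketch(env, num):
    totals = [0.0] * env.player_num
    for _ in range(num):
        # Run one full evaluation game and accumulate per-player payoffs
        _, payoffs = env.run(is_training=False)
        for player, payoff in enumerate(payoffs):
            totals[player] += payoff
    return [total / num for total in totals]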
import os

import tensorflow as tf

import rlcard
from rlcard.utils import Logger
from eval_util import *

# Set the iteration numbers and how frequently we evaluate/save the plot
evaluate_num = 100
emu_num = 50

# Paths for the logs and the best model
log_dir = './experiments/doudizhu_mcts_vs_drqn_result/'
best_model_path = './models/doudizhu_train_drqn_as_L_vs_random_and_eval_vs_random_best.npy'

# Set a global seed

# Init a Logger to plot the learning curve
logger = Logger(log_dir)
logger.log("MCTS-UCT VS DRQN")

env = rlcard.make('doudizhu', config={'seed': 0, 'allow_step_back': True})

# Restrict TensorFlow to GPU 0 and grow GPU memory on demand
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
sess = tf.Session(config=config)

# DRQNAgent is this repo's recurrent DQN agent (defined elsewhere)
drqn_agent = DRQNAgent(sess,
                       scope='doudizhu_drqn',
                       action_num=env.action_num,
                       memory_init_size=3000,
                       memory_size=6000,
                       train_every_t=1,
                       state_shape=env.state_shape,
    # Feed transitions into agent memory, and train the agent.
    # Only player 0's (the landlord's) trajectories are fed here.
    for tss in trajectories[:1]:
        for ts in tss:
            agent.feed(ts)
    # print(episode)

    if episode % evaluate_every == 0:
        # Evaluate as landlord against two seeded random agents
        eval_env = rlcard.make('doudizhu',
                               config={'seed': 0, 'allow_step_back': True})
        eval_env.set_agents([agent,
                             SRandomAgent(eval_env.action_num, seed=0),
                             SRandomAgent(eval_env.action_num, seed=0)])
        time_start = time.time()
        payoffs1 = general_tournament(eval_env, evaluate_num, False)
        logger.log("episode:{} time:{} landlord winrate:{}".format(
            episode, time.time() - time_start, payoffs1[0]))
        L_WR_logger.log_performance(episode, payoffs1[0])

        # Evaluate as peasant (seat 2) against two seeded random agents;
        # both peasants share the same payoff, so index 1 is logged
        eval_env = rlcard.make('doudizhu',
                               config={'seed': 0, 'allow_step_back': True})
        eval_env.set_agents([SRandomAgent(eval_env.action_num, seed=0),
                             SRandomAgent(eval_env.action_num, seed=0),
                             agent])
        time_start = time.time()
        payoffs2 = general_tournament(eval_env, evaluate_num, False)
        logger.log("episode:{} time:{} peasant winrate:{}".format(
            episode, time.time() - time_start, payoffs2[1]))
        P_WR_logger.log_performance(episode, payoffs2[1])

        # Track the best landlord win rate and flag the model for saving
        # save_flag = False
        if payoffs1[0] > max_L_WR:
            max_L_WR = payoffs1[0]
            save_flag = True
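# `save_flag` gates a best-model save that falls outside this excerpt. Since
# `best_model_path` above ends in .npy, the weights are presumably dumped with
# numpy rather than a TF Saver. A sketch of that conditional save, where
# `get_weights` is a hypothetical accessor, not this repository's actual API:
import numpy as np

def save_best_model(agent, path, save_flag):
    # Persist the current weights only when a new best win rate was reached
    if save_flag:
        np.save(path, agent.get_weights())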
import time

import rlcard
from rlcard.utils import Logger, set_global_seed

from SeedRanomAgent import SRandomAgent
from eval_util import *

# Set the iteration numbers and how frequently we evaluate/save the plot
evaluate_num = 1000
emu_num = 50

log_dir = './experiments/doudizhu_random_vs_random_result/'

# Set a global seed
# Init a Logger to plot the learning curve
logger = Logger(log_dir)
logger.log("Random VS Random")

# Landlord
set_global_seed(0)
eval_env = rlcard.make('doudizhu', config={'seed': 0, 'allow_step_back': True})
eval_env.set_agents([
    SRandomAgent(eval_env.action_num, seed=0),
    SRandomAgent(eval_env.action_num, seed=0),
    SRandomAgent(eval_env.action_num, seed=0)
])
time_start = time.time()
logger.log("Random = landlord winrate:{} time:{}".format(
    general_tournament(eval_env, evaluate_num, True)[0],
    time.time() - time_start))
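# `general_tournament` is imported from the local eval_util module, which is
# not shown in this excerpt. A minimal sketch of the assumed behaviour, based
# on how it is called above (env, number of games, verbose flag) and on the
# fact that its results are logged as win rates; the body is an assumption,
# not the repository's actual implementation:
def general_tournament_sketch(env, num, verbose=False):
    wins = [0] * env.player_num
    for episode in range(num):
        # Play one full game with all agents acting greedily
        _, payoffs = env.run(is_training=False)
        for player, payoff in enumerate(payoffs):
            if payoff > 0:
                wins[player] += 1
        if verbose:
            print('episode {}: payoffs {}'.format(episode, payoffs))
    return [w / num for w in wins]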
def nfsp():
    import tensorflow as tf

    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")
    # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Make environments: one training env and two seeded evaluation envs
    env = rlcard.make('no-limit-holdem',
                      config={'game_player_num': 2, 'seed': 477})
    eval_env = rlcard.make('no-limit-holdem',
                           config={'seed': 12, 'game_player_num': 2})
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={'seed': 43, 'game_player_num': 2})
    # eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/no_all_in'

    # Set a global seed
    set_global_seed(477)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 2048
    evaluate_num = 32
    episode_num = 24576

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256

    agents = []
    with graph.as_default():
        """
        NFSPAgent.__init__(self, sess, scope, action_num=4, state_shape=None,
                           hidden_layers_sizes=None,
                           reservoir_buffer_capacity=int(1e6),
                           anticipatory_param=0.1, batch_size=256,
                           train_every=1, rl_learning_rate=0.1,
                           sl_learning_rate=0.005,
                           min_buffer_size_to_learn=1000,
                           q_replay_memory_size=30000,
                           q_replay_memory_init_size=1000,
                           q_update_target_estimator_every=1000,
                           q_discount_factor=0.99, q_epsilon_start=0.06,
                           q_epsilon_end=0, q_epsilon_decay_steps=int(1e6),
                           q_batch_size=256, q_train_every=1,
                           q_mlp_layers=None,
                           evaluate_with='average_policy')
        """
        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=80000,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      q_replay_memory_size=80000,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\iivan')
    print('-------------------------------------------------------------------------------------')
    # print(check_point_path)

    # TODO (today's project :)): run multiple TensorFlow sessions concurrently, see
    # https://stackoverflow.com/questions/33758669/running-multiple-tensorflow-sessions-concurrently
    with sess.as_default():
        with graph.as_default():
            # saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)

            env.set_agents(agents)
            eval_env.set_agents([agents[0], random_agent])
            eval_env2.set_agents([random_agent, agents[1]])
            # eval_env3.set_agents([agents[1], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                print(episode, end='\r')

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\n'
                        'Tournament ' + str(episode / evaluate_every))
                    # tournament(eval_env2, 6)
                    # exploitability.exploitability(eval_env, agents[0], 500)
                    res = tournament(env, evaluate_num)
                    logger.log_performance(env.timestep, res[0])
                    res2 = tournament(eval_env, evaluate_num // 3)
                    logger.log_performance(env.timestep, res2[0])
                    res3 = tournament(eval_env2, evaluate_num // 3)
                    logger.log_performance(env.timestep, res3[0])
                    logger.log(str(episode_num) + " - " + str(episode) + '\n')
                    logger.log(
                        '\n\n----------------------------------------------------------------')

                # Periodically save a checkpoint (skip episode 0)
                if episode % evaluate_every == 0 and episode != 0:
                    save_dir = ('models/nolimit_holdem_nfsp/no_all_in/cp/'
                                + str(episode // evaluate_every))
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            # Final evaluation after training
            logger.log(
                '\n\n\n---------------------------------------------------------------\n'
                'Tournament ' + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log(str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save the final model
            save_dir = 'models/nolimit_holdem_nfsp/no_all_in'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
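# Entry-point sketch for the script above: assuming the file is meant to be
# run directly, the training function would be invoked like this (the guard
# itself is an assumption; the original entry point is not shown in this
# excerpt):
if __name__ == '__main__':
    nfsp()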
def nfsp():
    import tensorflow as tf

    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")
    # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Make environments: one training env and two evaluation envs
    env = rlcard.make('no-limit-holdem',
                      config={'record_action': False, 'game_player_num': 2})
    eval_env = rlcard.make('no-limit-holdem',
                           config={'seed': 12, 'game_player_num': 2})
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={'seed': 43, 'game_player_num': 2})

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/1v1MCNFSPv3'

    # Set a global seed
    set_global_seed(0)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 250
    episode_num = 5000

    # The initial memory size
    memory_init_size = 1500

    # Train the agent every X steps
    train_every = 256

    agents = []
    with graph.as_default():
        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=.1,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.075,
                      rl_learning_rate=0.075,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every // 2,
                      q_train_every=train_every // 2,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\1v1MCNFSPv3\\cp\\10')
    print('-------------------------------------------------------------------------------------')
    # print(check_point_path)

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)
            # easy_agent = nfsp_agents[0]
            print(agents)
            # print(nfsp_agents)

            env.set_agents(agents)
            eval_env.set_agents(agents)
            eval_env2.set_agents([agents[0], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                table = []

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent.
                # Note: this repo's NFSPAgent.feed takes an extra `table`
                # argument, unlike the stock rlcard agent.
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts, table)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\n'
                        'Tournament ' + str(episode / evaluate_every))
                    res = tournament(eval_env, evaluate_num)
                    res2 = tournament(eval_env2, evaluate_num // 4)
                    logger.log_performance(env.timestep, res[0])
                    logger.log_performance(env.timestep, res2[0])
                    logger.log(str(episode_num) + " - " + str(episode) + '\n')
                    logger.log(
                        '\n\n----------------------------------------------------------------')

                # Periodically save a checkpoint (skip episode 0)
                if episode % evaluate_every == 0 and episode != 0:
                    save_dir = ('models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good'
                                + str(episode // evaluate_every))
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            # Final evaluation after training
            logger.log(
                '\n\n\n---------------------------------------------------------------\n'
                'Tournament ' + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log(str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save the final model
            save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
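# Restore sketch: the commented-out `saver.restore` call near the top of the
# function above shows how a run could resume from a checkpoint. Assuming the
# directory written by the final `saver.save`, and that this is called inside
# the same graph context, the restore step would look like this (a sketch,
# not part of the original script):
def restore_nfsp_checkpoint(sess, checkpoint_dir):
    import tensorflow as tf
    # Build a Saver over the variables defined in the current default graph
    saver = tf.train.Saver()
    # latest_checkpoint resolves the newest checkpoint file under the directory
    saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))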
        m_avg.append(rl_loss)
        t.set_description(
            "rl loss: {}, payoff: {}, epsilon: {}".format(
                round(m_avg.get(), 2),
                round(payoff_avg.get(), 2),
                round(agent.epsilons[min(agent.total_t,
                                         agent.epsilon_decay_steps - 1)], 2)),
            refresh=True)
        # q = env.agents[0].eval_step(state)[1]
        # probs = {ACTION_LIST[i]: round(q[i], 3) for i in range(len(q)) if q[i] != -100}
        # probs = sorted(probs.items(), key=lambda x: x[1], reverse=True)
        # tqdm.write(str(probs))

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == evaluate_every - 1:
        logger.log_performance(env.timestep,
                               tournament_tractor(eval_env, evaluate_num)[0])
        logger.log("rl loss: {}, payoff: {}, epsilon: {}".format(
            round(m_avg.get(), 2),
            round(payoff_avg.get(), 2),
            round(agent.epsilons[min(agent.total_t,
                                     agent.epsilon_decay_steps - 1)], 2)))
        saver.save(sess, os.path.join(save_dir, 'model'))

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('DQN')
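# `m_avg` and `payoff_avg` above expose `.append()` and `.get()`, but their
# class is not shown in this excerpt. A minimal sketch of a compatible
# moving-average helper, assuming a fixed window (the class name and window
# size are assumptions, not the repository's implementation):
from collections import deque

class MovingAverage:
    def __init__(self, window=100):
        # Keep only the most recent `window` values
        self.values = deque(maxlen=window)

    def append(self, value):
        self.values.append(value)

    def get(self):
        # Average of the current window; 0 when empty
        return sum(self.values) / len(self.values) if self.values else 0.0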