def __init__(self):
    ''' Build one DQN agent per UNO player and restore the pretrained weights
    '''
    import tensorflow as tf
    from rlcard.agents import DQNAgent

    uno_env = rlcard.make('uno')

    # Dedicated graph, and a session bound to that graph
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    # Every agent gets its own variable scope: 'dqn0', 'dqn1', ...
    with self.graph.as_default():
        self.dqn_agents = [
            DQNAgent(self.sess,
                     scope='dqn' + str(seat),
                     action_num=uno_env.action_num,
                     state_shape=uno_env.state_shape,
                     mlp_layers=[512, 512])
            for seat in range(uno_env.player_num)
        ]

    checkpoint_dir = os.path.join(ROOT_PATH, 'uno_dqn')

    # The Saver has to be created while the model graph is the default one
    with self.sess.as_default(), self.graph.as_default():
        restorer = tf.train.Saver()
        restorer.restore(self.sess, tf.train.latest_checkpoint(checkpoint_dir))
def __init__(self):
    ''' Build the Tractor DQN agent and restore the pretrained weights
    '''
    import tensorflow as tf

    self.graph = tf.Graph()

    # Mitigation for gpu memory issue
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    self.sess = tf.Session(graph=self.graph, config=session_config)

    tractor_env = rlcard.make('tractor')

    # Hyper-parameters handed to the agent constructor
    agent_settings = dict(
        scope='dqn',
        action_num=tractor_env.action_num,
        state_shape=tractor_env.state_shape,
        mlp_layers=[2048, 2048],
        replay_memory_size=100000,
        update_target_estimator_every=100,
        discount_factor=0.5,
        epsilon_start=1,
        epsilon_end=0.1,
        epsilon_decay_steps=100000,
        batch_size=256,
        learning_rate=0.00002,
        use_rule_policy=False,
    )

    # Only a single agent is created; it is still kept in a list
    with self.graph.as_default():
        self.dqn_agents = []
        self.dqn_agents.append(DQNAgent(self.sess, **agent_settings))

    checkpoint_dir = os.path.join(TRACTOR_PATH, 'tractor_dqn_345k')

    # The Saver has to be created while the model graph is the default one
    with self.sess.as_default(), self.graph.as_default():
        restorer = tf.train.Saver()
        restorer.restore(self.sess, tf.train.latest_checkpoint(checkpoint_dir))
def __init__(self):
    ''' Load pretrained model

    Builds a single DQN agent for Limit Hold'em in a dedicated graph and
    session, then restores the latest checkpoint found under
    ROOT_PATH/limit_holdem_dqn.
    '''
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    env = rlcard.make('limit-holdem')
    with self.graph.as_default():
        self.dqn_agents = []
        agent = DQNAgent(self.sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=1000,
                         train_every=1,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        self.dqn_agents.append(agent)

    check_point_path = os.path.join(ROOT_PATH, 'limit_holdem_dqn')

    # The Saver must be created while the model graph is the default graph.
    # (A leftover debug print that polluted stdout on every load was removed.)
    with self.sess.as_default():
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
def main():
    ''' Train a DQN agent on Blackjack, log its evaluation reward and save the model
    '''
    # Training environment and a separate evaluation environment
    env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})
    eval_env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})

    # Schedule: total iterations, evaluation period, games per evaluation
    iteration_num = 100000
    evaluate_every = 100
    evaluate_num = 10000

    # Replay memory warm-up size and training frequency (in steps)
    memory_init_size = 100
    train_every = 1

    # Where the logs / learning curves and the model are written
    log_dir = './experiments/blackjack_dqn_result/'

    set_global_seed(0)

    with tf.compat.v1.Session() as sess:
        # Global step variable
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # The same agent instance plays in both environments
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[10, 10])
        for target_env in (env, eval_env):
            target_env.set_agents([agent])

        sess.run(tf.compat.v1.global_variables_initializer())

        logger = Logger(log_dir)

        for iteration in range(iteration_num):
            # Play with training enabled and feed player 0's transitions
            trajectories, _ = env.run(is_training=True)
            for transition in trajectories[0]:
                agent.feed(transition)

            # Evaluate only on every `evaluate_every`-th iteration
            if iteration % evaluate_every != 0:
                continue
            timestep = env.timestep
            reward = tournament(eval_env, evaluate_num)[0]
            logger.log_performance(timestep, reward)

        # Finish logging and draw the learning curve
        logger.close_files()
        logger.plot('DQN')

        # Persist the trained model
        save_dir = 'models/blackjack_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.compat.v1.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
# The paths for saving the logs and learning curves log_dir = './experiments/uno_single_dqn_result/' # Set a global seed set_global_seed(0) with tf.compat.v1.Session() as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set up the agents agent = DQNAgent(sess, scope='dqn', action_num=env.action_num, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=env.state_shape, mlp_layers=[128, 128]) # Initialize global variables sess.run(tf.compat.v1.global_variables_initializer()) # Init a Logger to plot the learning curve logger = Logger(log_dir) state = env.reset() for timestep in range(timesteps): action = agent.step(state) next_state, reward, done = env.step(action) ts = (state, action, reward, next_state, done)
import tensorflow as tf import os # Make environment and enable human mode # Set 'record_action' to True because we need it to print results env = rlcard.make('limit-holdem', config={'record_action': True}) human_agent = HumanAgent(env.action_num) with tf.Session() as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) agent = DQNAgent(sess, scope='dqn', action_num=env.action_num, replay_memory_init_size=1000, train_every=1, state_shape=env.state_shape, mlp_layers=[512, 512]) saver = tf.train.Saver() save_dir = 'models/limit_holdem_dqn' saver.restore(sess, os.path.join(save_dir, 'model')) env.set_agents([human_agent, agent]) print(">> Limit Hold'em random agent") while (True): print(">> Start a new game")
# Train the agent every X steps train_every = 1 # The paths for saving the logs and learning curves log_dir = './experiments/gin_rummy_dqn_result/' # Set a global seed set_global_seed(0) agent = DQNAgent( scope='dqn', action_num=env.action_num, #replay_memory_size=20000, replay_memory_size=1000, #replay_memory_init_size=memory_init_size, replay_memory_init_size=500, train_every=train_every, #state_shape=env.state_shape, state_shape=[768], mlp_layers=[512, 512], device=torch.device('cpu')) random_agent = RandomAgent(action_num=eval_env.action_num) env.set_agents([agent, random_agent]) eval_env.set_agents([agent, random_agent]) # Init a Logger to plot the learning curve logger = Logger(log_dir) for episode in range(episode_num): print('epi: ', episode)
# latest_ckpt = tf.train.latest_checkpoint(check_point_path) sess = tf.Session(graph=graph, config=config) env = rlcard.make('tractor') with graph.as_default(): dqn_agents = [] for i in range(1): agent = DQNAgent(sess, scope='dqn', action_num=env.action_num, state_shape=env.state_shape, mlp_layers=[2048,2048], replay_memory_size=100000, update_target_estimator_every=100, discount_factor=0.5, epsilon_start=1, epsilon_end=0.1, epsilon_decay_steps=100000, batch_size=256, learning_rate=0.00002, use_rule_policy=False ) dqn_agents.append(agent) # check_point_path = os.path.join(TRACTOR_PATH, 'dqn_10k_blindcard') with sess.as_default(): with graph.as_default(): saver = tf.train.Saver() saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
log_dir = './experiments/nolimit_holdem_dqn_result/' # Set a global seed # set_global_seed(0) with tf.Session() as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) with tf.variable_scope('agent1'): # Set up the agents agent1 = DQNAgent(sess, scope='dqn', action_num=env.action_num, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=env.state_shape, mlp_layers=[1280, 1280]) with tf.variable_scope('agent2'): agent2 = DQNAgent(sess, scope='dqn', action_num=eval_env.action_num, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=eval_env.state_shape, mlp_layers=[512, 512]) random_agent = RandomAgent(action_num=eval_env.action_num) human_agent = NolimitholdemHumanAgent(eval_env.action_num)
log_dir = './experiments/uno_dqn_result/' # Set a global seed set_global_seed(0) with tf.Session() as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set up the agents agent = DQNAgent(sess, scope='dqn', action_num=env.action_num, replay_memory_size=20000, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=env.state_shape, mlp_layers=[60, 60, 60, 60, 60], batch_size=512) saver = tf.train.Saver() random_agent = RandomAgent(action_num=eval_env.action_num) env.set_agents([agent, random_agent]) eval_env.set_agents([agent, random_agent]) # Initialize global variables sess.run(tf.global_variables_initializer()) # saver.restore(sess, "models/uno_dqn5/model")
def main():
    ''' Train a DQN agent on No-Limit Hold'em against saved snapshots of itself

    The agent sits in the first seat. The opponent is first loaded from
    "model.pth" and is then replaced, every `selfplay_every` episodes, by a
    freshly saved snapshot of the learning agent. The final agent is written
    to "./model.pth".
    '''
    # Seed first. Previously the seed was only set after training had
    # finished, where it could no longer influence the run.
    set_seed(0)

    # Make environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    # NOTE(review): eval_env is never used below; evaluation runs on `env`.
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the iterations numbers and how frequently we evaluate performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # Replay-memory and training-frequency settings are not passed here, so
    # the agent runs with whatever defaults DQNAgent defines.
    agent = DQNAgent(num_actions=env.num_actions,
                     state_shape=env.state_shape[0],
                     mlp_layers=[64, 64, 64, 64],
                     device=device)
    agents = [agent, load_model("model.pth")]
    env.set_agents(agents)

    with Logger('./') as logger:
        for episode in range(iteration_num):
            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent.
            # Only the first seat (the learning agent) is trained.
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate against the current opponent
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(env, evaluate_num)[0])

            # Snapshot the agent and make the snapshot the new opponent
            if episode % selfplay_every == 0:
                save_path = os.path.join('./', str(episode) + "model.pth")
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(str(episode) + "model.pth")]
                env.set_agents(agents)

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
def main():
    ''' Train a DQN agent and an NFSP agent together on No-Limit Hold'em

    The training table has four seats: DQN, DQN, NFSP, random. Only the
    transitions of seat 0 (DQN) and seat 2 (NFSP) are fed back for learning.
    Evaluation pits the DQN agent against the NFSP agent; whenever the DQN
    reward beats the best one seen so far the model is saved as "model",
    and the last state is always saved as "model_final".
    '''
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'seed': 0,
                          'env_num': 16,
                          'game_player_num': 4
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 0,
                               'env_num': 16
                           })

    # Set the iterations numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    # A checkpoint is only written once the evaluation reward exceeds this
    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])

        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        # To resume from an earlier run, restore the checkpoint here:
        # saver.restore(sess, os.path.join(save_dir, 'model'))

        # Create the output directory once; exist_ok avoids the
        # check-then-create race of os.path.exists + os.makedirs.
        os.makedirs(save_dir, exist_ok=True)

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        try:
            for episode in range(episode_num):
                agent2.sample_episode_policy()

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agents
                for ts in trajectories[0]:
                    agent.feed(ts)
                for ts in trajectories[2]:
                    agent2.feed(ts)

                # Evaluate the performance: DQN agent against the NFSP agent
                if episode % evaluate_every == 0:
                    _reward = tournament(eval_env, evaluate_num)[0]
                    logger.log_performance(episode, _reward)
                    if _reward > _reward_max:
                        saver.save(sess, os.path.join(save_dir, 'model'))
                        _reward_max = _reward
        finally:
            # Close files in the logger even if training is interrupted
            logger.close_files()

        saver.save(sess, os.path.join(save_dir, 'model_final'))
def main():
    ''' Continue training a DQN agent on Leduc Hold'em through self-play

    Both training seats are played by the DQN agent. Evaluation is against
    the pretrained 'leduc-holdem-cfr' model. A checkpoint is written whenever
    the evaluation reward beats the best value seen so far. If a checkpoint
    from an earlier run exists it is restored first; otherwise training
    starts from freshly initialized weights.
    '''
    # Make environment
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})

    # Set the iterations numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 10000
    episode_num = 800000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    # A checkpoint is only written once the evaluation reward exceeds this
    _reward_max = -0.5

    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])
        cfr_agent = models.load('leduc-holdem-cfr').agents[0]
        env.set_agents([agent, agent])
        eval_env.set_agents([agent, cfr_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        saver = tf.train.Saver()
        save_dir = 'models/leduc_holdem_dqn'
        checkpoint_prefix = os.path.join(save_dir, 'model')

        # Resume only when there is something to resume from; restoring
        # unconditionally crashed on a first run without any checkpoint.
        if tf.train.checkpoint_exists(checkpoint_prefix):
            saver.restore(sess, checkpoint_prefix)
        else:
            print('No checkpoint found at', checkpoint_prefix,
                  '- training from scratch')

        try:
            for episode in range(episode_num):

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for ts in trajectories[0]:
                    agent.feed(ts)

                # Evaluate the performance against the CFR agent
                if episode % evaluate_every == 0:
                    _reward = tournament(eval_env, evaluate_num)[0]
                    logger.log_performance(episode, _reward)
                    if _reward > _reward_max:
                        # Save model
                        os.makedirs(save_dir, exist_ok=True)
                        saver.save(sess, checkpoint_prefix)
                        _reward_max = _reward
        finally:
            # Close files in the logger even if training is interrupted
            logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')
# Create the UNO environment with a fixed seed
env = rlcard.make('uno', config={'seed': 0})

# Seed globally as well
set_global_seed(0)

# The pretrained agents live in their own graph / session pair
graph = tf.Graph()
sess = tf.Session(graph=graph)

# One DQN agent per player, each under its own scope ('dqn0', 'dqn1', ...)
with graph.as_default():
    dqn_agents = []
    for i in range(env.player_num):
        agent = DQNAgent(
            sess,
            scope='dqn{}'.format(i),
            action_num=env.action_num,
            state_shape=env.state_shape,
            mlp_layers=[512, 512],
        )
        dqn_agents.append(agent)

# Location of the pretrained model; change the path to use your own model
check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/uno_dqn')

# Restore the latest checkpoint into the agents' graph
with sess.as_default(), graph.as_default():
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Number of games used for the evaluation that follows
evaluate_num = 1000
set_global_seed(0) with tf.Session() as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set up the agents agent = DQNAgent( sess, scope='dqn', replay_memory_size=replay_memory_size, replay_memory_init_size=memory_init_size, update_target_estimator_every=update_target_estimator_every, discount_factor=discount_factor, epsilon_start=epsilon_start, epsilon_end=epsilon_end, epsilon_decay_steps=epsilon_decay_steps, batch_size=batch_size, action_num=env.action_num, state_shape=env.state_shape, train_every=train_every, mlp_layers=mlp_layers, learning_rate=learning_rate) random_agent = RandomAgent(action_num=eval_env.action_num) agent_list = [agent, random_agent, random_agent] # default #deactivated at the moment because we might not need it if we use landlord score anyway for switching positions/roles if (landlord_score): agent_list = [agent, random_agent, random_agent]
memory_init_size = 1000 # Train the agent every X steps train_every = 1 # The paths for saving the logs and learning curves log_dir = './experiments/dqn_random_result/' # Set a global seed set_global_seed(0) # Set up the agents agent = DQNAgent(scope='dqn', action_num=env.action_num, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=env.state_shape, mlp_layers=[128, 128], device=torch.device('cpu')) random_agent = RandomAgent(action_num=eval_env.action_num) env.set_agents([agent, random_agent]) eval_env.set_agents([agent, random_agent]) # Init a Logger to plot the learning curve logger = Logger(log_dir) for episode in range(episode_num): # Generate data from the environment trajectories, _ = env.run(is_training=True)
def train(args):
    ''' Train a DQN or NFSP agent against random opponents and save it

    Args:
        args: namespace providing `algorithm` ('dqn' or 'nfsp'), `env`,
            `seed`, `log_dir`, `num_episodes`, `evaluate_every` and
            `num_eval_games`

    Raises:
        ValueError: if `args.algorithm` is neither 'dqn' nor 'nfsp'
    '''
    # Fail fast on an unknown algorithm. Previously `agent` was simply left
    # unbound and the function died later with an UnboundLocalError.
    if args.algorithm not in ('dqn', 'nfsp'):
        raise ValueError(
            "Unsupported algorithm {!r}; expected 'dqn' or 'nfsp'".format(
                args.algorithm))

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    else:
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )

    # The learning agent takes seat 0; every other seat plays randomly
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent.
            # Only the first seat (the learning agent) is trained.
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
# Mitigation for gpu memory issue # config = tf.ConfigProto() # config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = 0.9 # with tf.Session(config=config) as sess: with tf.Session() as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set up the agents agent = DQNAgent(sess, scope='dqn', action_num=env.action_num, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=env.state_shape, mlp_layers=[512, 512]) random_agent = RandomAgent(action_num=eval_env.action_num) env.set_agents([agent, random_agent, random_agent]) eval_env.set_agents([agent, random_agent, random_agent]) # Initialize global variables sess.run(tf.global_variables_initializer()) # Init a Logger to plot the learning curve logger = Logger(log_dir) for episode in range(episode_num):
with tf.Session(config=config) as sess: # Initialize a global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set up the agents for i in range(1): agent = DQNAgent(sess, scope='dqn' if i==0 else 'dqn' + str(i), action_num=env.action_num, replay_memory_init_size=memory_init_size, train_every=train_every, state_shape=env.state_shape, mlp_layers=[2048,2048], replay_memory_size=100000, update_target_estimator_every=500, discount_factor=0.99, epsilon_start=0.1, epsilon_end=0.1, epsilon_decay_steps=100000, batch_size=256, learning_rate=0.00002, use_rule_policy=False ) agents.append(agent) random_agent = RandomAgent(action_num=eval_env.action_num) rule_agent = TractorRuleAgent(action_num=eval_env.action_num) # 1 dqn agent vs 3 rule agent # env.set_agents([agent, rule_agent, rule_agent, rule_agent])