def __init__(self):
    ''' Load pretrained model
    '''
    import tensorflow as tf
    from rlcard.agents import NFSPAgent

    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    env = rlcard.make('leduc-holdem')
    with self.graph.as_default():
        self.nfsp_agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(self.sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[128, 128],
                              q_mlp_layers=[128, 128])
            self.nfsp_agents.append(agent)

    check_point_path = os.path.join(ROOT_PATH, 'leduc_holdem_nfsp')
    with self.sess.as_default():
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
def __init__(self):
    ''' Instantiate agent.
    '''
    super().__init__()
    self.wins = 0
    self.losses = 0

    # Set up the RL NFSP agent
    # Set the iteration numbers and how frequently we evaluate/save the plot
    evaluate_every = 10000
    evaluate_num = 10000
    episode_num = 100000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './training/nfsp/'

    # Set a global seed
    set_global_seed(0)

    # Set agent - TODO - determine PPE parameters
    self.agent = NFSPAgent(scope='nfsp',
                           action_num=3,
                           state_shape=54,  # note: other snippets pass state_shape as a list (e.g. env.state_shape)
                           hidden_layers_sizes=[512, 512],
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=train_every,
                           q_train_every=train_every,
                           q_mlp_layers=[512, 512],
                           device=torch.device('cpu'))

    # Init a Logger to plot the learning curve
    self.logger = Logger(log_dir)
def __init__(self):
    ''' Load pretrained model
    '''
    import tensorflow as tf
    from rlcard.agents import NFSPAgent, RandomAgent

    self.graph = tf.Graph()

    # Mitigation for GPU memory issue
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(graph=self.graph, config=config)

    env = rlcard.make('tractor')
    with self.graph.as_default():
        self.nfsp_agents = []
        # for i in range(env.player_num):
        #     agent = NFSPAgent(self.sess,
        #                       scope='nfsp' + str(i),
        #                       action_num=env.action_num,
        #                       state_shape=env.state_shape,
        #                       hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
        #                       q_mlp_layers=[512, 1024, 2048, 1024, 512])
        #     self.nfsp_agents.append(agent)
        for i in range(1):
            agent = NFSPAgent(self.sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[2048, 2048],
                              q_mlp_layers=[2048, 2048],
                              # evaluate_with='average_policy')
                              evaluate_with='best_response')
            self.nfsp_agents.append(agent)

    check_point_path = os.path.join(TRACTOR_PATH, 'nfsp_continue_350k_0.99')
    with self.sess.as_default():
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
                          anticipatory_param=0.5,
                          batch_size=256,
                          rl_learning_rate=0.00005,
                          sl_learning_rate=0.00001,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_size=int(1e5),
                          q_replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          q_train_every=train_every,
                          q_batch_size=256,
                          q_mlp_layers=[512, 1024, 2048, 1024, 512])
        agents.append(agent)
    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())
global_step = tf.Variable(0, name='global_step', trainable=False)

# Set up the agents
agents = []
for i in range(2):
    nfsp_agent = NFSPAgent(sess,
                           scope='nfsp' + str(i),
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
                           # hidden_layers_sizes=[512, 1024, 512],
                           # hidden_layers_sizes=[64],
                           anticipatory_param=0.5,
                           batch_size=256,
                           rl_learning_rate=0.00005,
                           sl_learning_rate=0.00001,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_size=int(1e5),
                           q_replay_memory_init_size=memory_init_size,
                           train_every=train_every,
                           q_train_every=train_every,
                           q_batch_size=256,
                           q_mlp_layers=[512, 1024, 2048, 1024, 512],
                           # q_mlp_layers=[512, 1024, 512],
                           # q_mlp_layers=[64],
                           reservoir_buffer_capacity=int(1e4))
    agents.append(nfsp_agent)
random_agent = RandomAgent(action_num=eval_env.action_num)
rule_agent = TractorRuleAgent(action_num=eval_env.action_num)
class NFSPPlayer(GinRummyPlayer):

    def __init__(self):
        # Load pretrained model from TensorFlow
        evaluate_every = 100
        evaluate_num = 100
        episode_num = 6000
        memory_init_size = 1000
        train_every = 64
        i = 0
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        self.env = rlcard.make('gin-rummy')
        with self.graph.as_default():
            self.agent = NFSPAgent(self.sess,
                                   scope='nfsp' + str(i),
                                   action_num=self.env.action_num,
                                   state_shape=[4, 52],
                                   hidden_layers_sizes=[128],
                                   anticipatory_param=0.5,
                                   batch_size=256,
                                   rl_learning_rate=0.01,
                                   sl_learning_rate=0.005,
                                   min_buffer_size_to_learn=memory_init_size,
                                   q_replay_memory_size=int(1e5),
                                   q_replay_memory_init_size=memory_init_size,
                                   train_every=train_every,
                                   q_train_every=train_every,
                                   q_batch_size=256,
                                   q_mlp_layers=[128])
        print("restoring checkpoint...")
        check_point_path = "gin_rummy_nfsp4"
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
        print("checkpoint restored!")

    def init_game_get_state(self):
        state = {}
        state['hand'] = np.zeros(52, dtype=int)
        for card in self.cards:
            state['hand'][card.getId()] = 1
        state['top_discard'] = np.zeros(52, dtype=int)
        state['dead_cards'] = np.zeros(52, dtype=int)
        state['opponent_known_cards'] = np.zeros(52, dtype=int)
        rep = [state['hand'], state['top_discard'], state['dead_cards'],
               state['opponent_known_cards']]  # , unknown_cards_rep]  # changed
        obs = np.array(rep)
        extracted_state = {'obs': obs, 'legal_actions': None}
        return extracted_state

    def get_state_index(self, label):
        inds = {'hand': 0, 'top_discard': 1, 'dead_cards': 2,
                'opponent_known_cards': 3}
        return inds[label]

    def set_discard(self, card):
        self.state['obs'][self.get_state_index('top_discard')] = np.zeros(52, dtype=int)
        self.state['obs'][self.get_state_index('top_discard')][card.getId()] = 1

    # Inform player of 0-based player number (0/1), starting player number (0/1), and dealt cards
    # @param playerNum player's 0-based player number (0/1)
    # @param startingPlayerNum starting player number (0/1)
    # @param cards dealt cards
    def startGame(self, playerNum: int, startingPlayerNum: int, cards: List[Card]) -> None:
        self.playerNum = playerNum
        self.startingPlayerNum = startingPlayerNum
        self.cards = list(cards)
        self.opponentKnocked = False
        self.drawDiscardBitstrings = []  # long[], or List[int]
        self.faceUpCard = None
        self.drawnCard = None
        self.state = self.init_game_get_state()
        self.action = None

    # ====================================
    # Action_ids:
    #   0 -> score_player_0_id
    #   1 -> score_player_1_id
    #   2 -> draw_card_id
    #   3 -> pick_up_discard_id
    #   4 -> declare_dead_hand_id
    #   5 -> gin_id
    #   6 to 57 -> discard_id card_id
    #   58 to 109 -> knock_id card_id
    # ====================================

    # Return whether or not player will draw the given face-up card on the draw pile.
    # @param card face-up card on the draw pile
    # @return whether or not player will draw the given face-up card on the draw pile
    def willDrawFaceUpCard(self, card: Card) -> bool:
        # Ask the agent whether to pick up the face-up card (action 3) or draw face down (action 2).
        self.faceUpCard = card
        # Update state
        self.set_discard(card)
        self.state['legal_actions'] = [2, 3]
        action, probs = self.agent.eval_step(self.state)
        return True if action == 3 else False
        # Previous rule-based logic: return True if the card would be part of a meld, False otherwise.
        # self.faceUpCard = card
        # newCards = list(self.cards)
        # newCards.append(card)
        # for meld in GinRummyUtil.cardsToAllMelds(newCards):
        #     if card in meld:
        #         return True
        # return False

    # Report that the given player has drawn a given card and, if known, what the card is.
    # If the card is unknown because it is drawn from the face-down draw pile, the drawnCard is null.
    # Note that a player that returns false for willDrawFaceUpCard will learn of their face-down draw from this method.
    # @param playerNum - player drawing a card
    # @param drawnCard - the card drawn or null, depending on whether the card is known to the player or not, respectively.
    def reportDraw(self, playerNum: int, drawnCard: Card) -> None:
        # Ignore other player draws. Add to cards if playerNum is this player.
        if playerNum == self.playerNum:
            self.cards.append(drawnCard)
            self.drawnCard = drawnCard
            # Add to state
            self.state['obs'][self.get_state_index('hand')][drawnCard.getId()] = 1
        # If the other player's card is not null, add it to the state
        elif drawnCard is not None:
            self.state['obs'][self.get_state_index('opponent_known_cards')][drawnCard.getId()] = 1

    # Get the player's discarded card. If you took the top card from the discard pile,
    # you must discard a different card.
    # If this is not a card in the player's possession, the player forfeits the game.
    # @return the player's chosen card for discarding
    def getDiscard(self) -> Card:
        # Let the agent choose a discard (any card except a face-up card that was just drawn).
        self.state['legal_actions'] = []
        for card in self.cards:
            if card == self.drawnCard and self.drawnCard == self.faceUpCard:
                continue
            self.state['legal_actions'].append(card.getId() + 6)
        action, probs = self.agent.eval_step(self.state)
        for card in self.cards:
            if card.getId() == action - 6:
                return card
        # Previous rule-based logic: discard a card leaving minimal deadwood points.
        # minDeadwood = float('inf')
        # candidateCards = []
        # for card in self.cards:
        #     # Cannot draw and discard face up card.
        #     if card == self.drawnCard and self.drawnCard == self.faceUpCard:
        #         continue
        #     # Disallow repeat of draw and discard.
        #     drawDiscard = [self.drawnCard, card]
        #     if GinRummyUtil.cardsToBitstring(drawDiscard) in self.drawDiscardBitstrings:
        #         continue
        #     remainingCards = list(self.cards)
        #     remainingCards.remove(card)
        #     bestMeldSets = GinRummyUtil.cardsToBestMeldSets(remainingCards)
        #     deadwood = GinRummyUtil.getDeadwoodPoints3(remainingCards) if len(bestMeldSets) == 0 \
        #         else GinRummyUtil.getDeadwoodPoints1(bestMeldSets[0], remainingCards)
        #     if deadwood <= minDeadwood:
        #         if deadwood < minDeadwood:
        #             minDeadwood = deadwood
        #             candidateCards.clear()
        #         candidateCards.append(card)
        # # Prevent future repeat of draw, discard pair.
        # discard = candidateCards[randint(0, len(candidateCards) - 1)]
        # drawDiscard = [self.drawnCard, discard]
        # self.drawDiscardBitstrings.append(GinRummyUtil.cardsToBitstring(drawDiscard))
        # return discard

    # Report that the given player has discarded a given card.
    # @param playerNum the discarding player
    # @param discardedCard the card that was discarded
    def reportDiscard(self, playerNum: int, discardedCard: Card) -> None:
        # Ignore other player discards. Remove from cards if playerNum is this player.
        if playerNum == self.playerNum:
            self.cards.remove(discardedCard)
            # Update state
            self.state['obs'][self.get_state_index('hand')][discardedCard.getId()] = 0
        else:
            self.state['obs'][self.get_state_index('opponent_known_cards')][discardedCard.getId()] = 0
        self.set_discard(discardedCard)

    # At the end of each turn, this method is called and the player that cannot (or will not) end the round will return a null value.
    # However, the first player to "knock" (that is, end the round), and then their opponent, will return an ArrayList of ArrayLists of melded cards.
    # All other cards are counted as "deadwood", unless they can be laid off (added to) the knocking player's melds.
    # When final melds have been reported for the other player, a player should return their final melds for the round.
    # @return null if continuing play and opponent hasn't melded, or an ArrayList of ArrayLists of melded cards.
    def getFinalMelds(self) -> List[List[Card]]:
        # Check if deadwood of maximal meld is low enough to go out.
        # TODO: maybe get action from agent
        bestMeldSets = GinRummyUtil.cardsToBestMeldSets(self.cards)  # List[List[List[Card]]]
        if not self.opponentKnocked and (len(bestMeldSets) == 0 or
                                         GinRummyUtil.getDeadwoodPoints1(bestMeldSets[0], self.cards) >
                                         GinRummyUtil.MAX_DEADWOOD):
            return None
        if len(bestMeldSets) == 0:
            return []
        return bestMeldSets[randint(0, len(bestMeldSets) - 1)]

    # When a player has ended play and formed melds, the melds (and deadwood) are reported to both players.
    # @param playerNum player that has revealed melds
    # @param melds an ArrayList of ArrayLists of melded cards with the last ArrayList (possibly empty) being deadwood.
    def reportFinalMelds(self, playerNum: int, melds: List[List[Card]]) -> None:
        # Melds ignored by simple player, but could affect which melds to make for complex player.
        if playerNum != self.playerNum:
            self.opponentKnocked = True
            # Add dead cards to state.
            for meld in melds:
                for card in meld:
                    self.state['obs'][self.get_state_index('dead_cards')][card.getId()] = 1

    # Report current player scores, indexed by 0-based player number.
    # @param scores current player scores, indexed by 0-based player number
    def reportScores(self, scores: List[int]) -> None:
        # Ignored by simple player, but could affect strategy of more complex player.
        return

    # Report layoff actions.
    # @param playerNum player laying off cards
    # @param layoffCard card being laid off
    # @param opponentMeld the opponent meld that card is being added to
    def reportLayoff(self, playerNum: int, layoffCard: Card, opponentMeld: List[Card]) -> None:
        # Ignored by simple player, but could affect strategy of more complex player.
        return

    # Report the final hands of players.
    # @param playerNum player of hand reported
    # @param hand complete hand of given player
    def reportFinalHand(self, playerNum: int, hand: List[Card]) -> None:
        # Ignored by simple player, but could affect strategy of more complex player.
        return
set_global_seed(0)

with tf.compat.v1.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[512, 512],
                          anticipatory_param=0.1,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          q_train_every=train_every,
                          q_mlp_layers=[512, 512])
        agents.append(agent)
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Initialize global variables
    sess.run(tf.compat.v1.global_variables_initializer())

    # Init a Logger to plot the learning curve
def __init__(self, env_name, max_episode_length=0, enable_record=False, record_path="1.mp4"):
    self.env_name = env_name
    self.env_type = None
    self.env = rlcard.make('no-limit-holdem',
                           config={
                               'record_action': True,
                               'game_player_num': 2,
                               'seed': 477
                           })
    # self.state, self.pointer = self.game.init_game()
    memory_init_size = 300

    # The paths for saving the logs and learning curves
    self.log_dir = './experiments/nolimit_holdem_nfsp_result/ivvan'

    self.evaluate_every = 512
    self.evaluate_num = 64
    self.episode_num = 20480

    # The initial memory size
    self.memory_init_size = 256

    # Train the agent every X steps
    self.train_every = 256

    self.agents = []
    self.agents.append(
        NFSPAgent(scope='nfsp' + str(0),
                  action_num=self.env.action_num,
                  state_shape=self.env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  rl_learning_rate=0.015,
                  sl_learning_rate=0.0075,
                  q_epsilon_start=.3,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_size=20480,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=self.train_every + 44,
                  q_train_every=self.train_every,
                  q_mlp_layers=[512, 512],
                  evaluate_with='average_policy'))
    self.agents.append(
        NFSPAgent(scope='nfsp' + str(1),
                  action_num=self.env.action_num,
                  state_shape=self.env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  rl_learning_rate=0.015,
                  sl_learning_rate=0.0075,
                  q_epsilon_start=.3,
                  q_replay_memory_size=20480,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=self.train_every + 44,
                  q_train_every=self.train_every,
                  q_mlp_layers=[512, 512],
                  evaluate_with='average_policy'))
    self.env.set_agents(self.agents)
    self.env.reset()  # initialize env to be equal to the game
    # print(self.state)
    # self.env = PokerState(self.state['hand'], self.state['public_cards'],
    #                       250 - self.state['all_chips'][0], 250 - self.state['all_chips'][1],
    #                       abs(self.state['all_chips'][0] - self.state['all_chips'][1]),
    #                       self.state['all_chips'][0] + self.state['all_chips'][1],
    #                       self.state['all_chips'][0], self.state['all_chips'][1])
    self.action_n = 6
    self.max_episode_length = self.env._max_episode_steps if max_episode_length == 0 else max_episode_length
    self.current_step_count = 0
    self.since_last_reset = 0
def main():
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'seed': 0,
                          'env_num': 16,
                          'game_player_num': 4
                      })
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 16})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        # saver.restore(sess, os.path.join(save_dir, 'model'))

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            agent2.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)
            for ts in trajectories[2]:
                agent2.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(sess, os.path.join(save_dir, 'model_final'))
train_every = 64

# The paths for saving the logs and learning curves
log_dir = './experiments/nfsp_random_result/'

# Set a global seed
set_global_seed(0)

# Set up the agents
# agents = []
# print(env.player_num)
agent = NFSPAgent(scope='nfsp',
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every,
                  q_train_every=train_every,
                  q_mlp_layers=[512, 512])
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

    # First sample a policy for the episode
def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0]
                )

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
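# A minimal command-line driver for the train() function above, provided as an
# illustrative sketch rather than the original script's CLI. The argument names
# and defaults are assumptions; they simply mirror the attributes that train()
# reads from args (env, algorithm, seed, num_episodes, evaluate_every,
# num_eval_games, log_dir).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='NFSP/DQN example in RLCard')
    parser.add_argument('--env', type=str, default='leduc-holdem')
    parser.add_argument('--algorithm', type=str, default='nfsp', choices=['dqn', 'nfsp'])
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_nfsp_result/')
    args = parser.parse_args()

    train(args)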
train_every = 64

# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_nfsp_result/'

# Set a global seed
set_global_seed(0)

# Set agents
agents = []
for i in range(env.player_num):
    agent = NFSPAgent(scope='nfsp' + str(i),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[128, 128],
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[128, 128],
                      device=torch.device('cpu'))
    agents.append(agent)
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents(agents)
eval_env.set_agents([agents[0], random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
# Make environment
env = rlcard.make('leduc-holdem', config={'seed': 0})

# Set a global seed
set_global_seed(0)

# Load pretrained model
graph = tf.Graph()
sess = tf.Session(graph=graph)
with graph.as_default():
    nfsp_agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          q_mlp_layers=[128, 128])
        nfsp_agents.append(agent)

# We have a pretrained model here. Change the path for your model.
check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
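# A possible continuation of the snippet above (a sketch, not the original source):
# pit the restored NFSP agent against a random agent and average payoffs with
# rlcard's tournament utility. The extra imports and the reuse of `env` for
# evaluation are assumptions made for this illustration.
from rlcard.agents import RandomAgent
from rlcard.utils import tournament

random_agent = RandomAgent(action_num=env.action_num)
env.set_agents([nfsp_agents[0], random_agent])
reward = tournament(env, evaluate_num)[0]
print('Average reward of the pretrained NFSP agent vs. a random agent:', reward)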
# Set up the agents
agents = []
for i in range(1):
    nfsp_agent = NFSPAgent(sess,
                           scope='nfsp' + str(i),
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[2048, 2048],
                           # anticipatory_param=0.1,
                           anticipatory_param=0.9,
                           batch_size=256,
                           train_every=train_every,
                           rl_learning_rate=0.00002,
                           sl_learning_rate=0.0002,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           q_update_target_estimator_every=500,
                           q_discount_factor=0.99,
                           q_epsilon_start=1,
                           q_epsilon_end=0.1,
                           q_epsilon_decay_steps=100000,
                           q_batch_size=256,
                           q_train_every=train_every,
                           q_mlp_layers=[2048, 2048],
                           reservoir_buffer_capacity=500000,
                           q_replay_memory_size=100000,
                           # evaluate_with='average_policy')
                           evaluate_with='best_response')
    agents.append(nfsp_agent)
random_agent = RandomAgent(action_num=eval_env.action_num)
def run():
    torch.multiprocessing.freeze_support()
    env = rlcard.make('no-limit-holdem',
                      config={
                          'record_action': True,
                          'game_player_num': 2,
                          'env_num': 8,
                          'use_raw': True
                      })
    # eval_env = rlcard.make('no-limit-holdem', config={'seed': 12, 'game_player_num': 2})
    # eval_env2 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})
    # eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 1024
    evaluate_num = 32
    episode_num = 20480

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256

    agents = []
    agents.append(
        NFSPAgent(scope='nfsp' + str(0),
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  rl_learning_rate=0.015,
                  sl_learning_rate=0.0075,
                  q_epsilon_start=.3,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_size=20480,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every + 44,
                  q_train_every=train_every,
                  q_mlp_layers=[512, 512],
                  evaluate_with='best_response'))
    agents.append(
        NFSPAgent(scope='nfsp' + str(1),
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  rl_learning_rate=0.015,
                  sl_learning_rate=0.0075,
                  q_epsilon_start=.3,
                  q_replay_memory_size=20480,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every + 44,
                  q_train_every=train_every,
                  q_mlp_layers=[512, 512],
                  evaluate_with='best_response'))

    # 7, 5 - all in junkies
    check_point_path = os.path.join('models/ivvan/cp/8/model-nfsp1.pth')
    checkpoint = torch.load(check_point_path)
    check_point_path = os.path.join('models/ivvan/cp/8/model-nfsp0.pth')
    checkpoint2 = torch.load(check_point_path)
    # for agent in agents:
    #     agent.load(checkpoint)
    agents[1].load(checkpoint)
    agents[0].load(checkpoint2)

    human = nolimit_holdem_human_agent.HumanAgent(env.action_num)
    env.set_agents([agents[0], agents[1]])

    while True:
        print(">> Start a new game")

        trajectories, payoffs = env.run(is_training=False)
        if len(trajectories[0]) == 0:
            # The bot folded immediately
            continue

        # If the human does not take the final action, we need to
        # print the other players' actions
        final_state = trajectories[0][-1][-2]
        # print(final_state, 'waa')
        action_record = final_state['action_record']
        state = final_state['raw_obs']
        _action_list = []
        for i in range(1, len(action_record) + 1):
            if action_record[-i][0] == state['current_player']:
                break
            _action_list.insert(0, action_record[-i])
        for pair in _action_list:
            print('>> Player', pair[0], 'chooses', pair[1])

        # Let's take a look at what the agent's cards are
        print('=============== NFSP Agent ===============')
        print_card(env.get_perfect_information()['hand_cards'][1])

        print('=============== Result ===============')
        if payoffs[0] > 0:
            print('You win {} chips!'.format(payoffs[0]))
        elif payoffs[0] == 0:
            print('It is a tie.')
        else:
            print('You lose {} chips!'.format(-payoffs[0]))
        print('')
        input("Press any key to continue...")
# The initial memory size
memory_init_size = 256

# Train the agent every X steps
train_every = 256

agents = []
agents.append(
    NFSPAgent(scope='nfsp' + str(0),
              action_num=env.action_num,
              state_shape=env.state_shape,
              hidden_layers_sizes=[512, 512],
              anticipatory_param=0.1,
              rl_learning_rate=0.015,
              sl_learning_rate=0.0075,
              q_epsilon_start=.3,
              min_buffer_size_to_learn=memory_init_size,
              q_replay_memory_size=20480,
              q_replay_memory_init_size=memory_init_size,
              train_every=train_every + 44,
              q_train_every=train_every,
              q_mlp_layers=[512, 512],
              evaluate_with='average_policy'))
agents.append(
    NFSPAgent(scope='nfsp' + str(1),
              action_num=env.action_num,
              state_shape=env.state_shape,
              hidden_layers_sizes=[512, 512],
              anticipatory_param=0.1,
              rl_learning_rate=0.015,
def main():
    wandb_config = wandb.config
    config = {}
    hyperparams = {}
    for key in wandb_config.keys():
        if key in default_config:
            config[key] = wandb_config[key]
        elif key in default_hyperparams:
            hyperparams[key] = wandb_config[key]

    # Make environment
    env = make("yaniv", config=config)
    eval_env = make("yaniv", config=config)

    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(scope="nfsp" + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          device=torch.device("cuda"),
                          **hyperparams)
        agents.append(agent)

        if load_model is not None:
            state_dict = torch.load(load_model)
            policy_dict = state_dict[load_scope]
            agent.policy_network.load_state_dict(policy_dict)

            q_key = load_scope + "_dqn_q_estimator"
            agent._rl_agent.q_estimator.qnet.load_state_dict(state_dict[q_key])
            target_key = load_scope + "_dqn_target_estimator"
            agent._rl_agent.target_estimator.qnet.load_state_dict(state_dict[target_key])

    rule_agent = YanivNoviceRuleAgent(single_step=config["single_step_actions"])
    random_agent = RandomAgent(action_num=env.action_num)

    def agent_feed(agent, trajectories):
        for transition in trajectories:
            agent.feed(transition)

    def save_function(agent, model_dir):
        torch.save(agent.get_state_dict(),
                   os.path.join(model_dir, "model_{}.pth".format(i)))

    e = ExperimentRunner(
        env,
        eval_env,
        log_every=100,
        save_every=100,
        base_dir="yaniv_nfsp_pytorch",
        config=config,
        training_agent=agents[0],
        vs_agent=agents[1],
        feed_function=agent_feed,
        save_function=save_function,
    )

    e.run_training(
        episode_num=50000,
        eval_every=200,
        eval_vs=[random_agent, rule_agent],
        eval_num=100,
    )