def test_train(self):
    norm_step = 1100
    memory_init_size = 100
    step_num = 1500

    agent = DQNAgent(scope='dqn',
                     replay_memory_size=500,
                     replay_memory_init_size=memory_init_size,
                     update_target_estimator_every=100,
                     norm_step=norm_step,
                     state_shape=[2],
                     mlp_layers=[10, 10],
                     device=torch.device('cpu'))

    predicted_action = agent.eval_step({'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    for step in range(step_num):
        ts = [{'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              np.random.randint(2),
              0,
              {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              True]
        agent.feed(ts)
        if step > norm_step + memory_init_size:
            agent.train()

    predicted_action = agent.step({'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)
def test_train(self):
    memory_init_size = 100
    step_num = 500

    agent = DQNAgent(scope='dqn',
                     replay_memory_size=200,
                     replay_memory_init_size=memory_init_size,
                     update_target_estimator_every=100,
                     state_shape=[2],
                     mlp_layers=[10, 10],
                     device=torch.device('cpu'))

    predicted_action, _ = agent.eval_step({'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    for _ in range(step_num):
        ts = [{'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              np.random.randint(2),
              0,
              {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              True]
        agent.feed(ts)

    state_dict = agent.get_state_dict()
    self.assertIsInstance(state_dict, dict)

    predicted_action = agent.step({'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)
def test_init(self):
    agent = DQNAgent(scope='dqn',
                     replay_memory_size=0,
                     replay_memory_init_size=0,
                     update_target_estimator_every=0,
                     discount_factor=0,
                     epsilon_start=0,
                     epsilon_end=0,
                     epsilon_decay_steps=0,
                     batch_size=0,
                     action_num=2,
                     state_shape=[1],
                     mlp_layers=[10, 10],
                     device=torch.device('cpu'))

    self.assertEqual(agent.replay_memory_init_size, 0)
    self.assertEqual(agent.update_target_estimator_every, 0)
    self.assertEqual(agent.discount_factor, 0)
    self.assertEqual(agent.epsilon_decay_steps, 0)
    self.assertEqual(agent.batch_size, 0)
    self.assertEqual(agent.action_num, 2)
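# A minimal sketch of the test harness the methods above would typically sit in.
# The DQNAgent import path is an assumption; adjust it to the actual package layout.
import unittest

import numpy as np
import torch

from rlcard.agents.dqn_agent_pytorch import DQNAgent  # assumed import path


class TestDQNAgentPytorch(unittest.TestCase):
    # The test_init / test_train methods shown above go here.
    #
    # The 5-element transition fed to agent.feed(ts) unpacks as
    # [state, action, reward, next_state, done], where each state is a dict
    # with 'obs' (numpy array) and 'legal_actions' (list of action ids).
    pass


if __name__ == '__main__':
    unittest.main()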
class NFSPAgent(object):
    ''' An approximate clone of rlcard.agents.nfsp_agent that uses
    pytorch instead of tensorflow. Note that this implementation
    differs from Heinrich and Silver (2016) in that the supervised
    training minimizes cross-entropy with respect to the stored
    action probabilities rather than the realized actions.
    '''

    def __init__(self,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.5,
                 batch_size=256,
                 rl_learning_rate=0.0001,
                 sl_learning_rate=0.00001,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=1,
                 q_epsilon_end=0.1,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_norm_step=1000,
                 q_mlp_layers=None,
                 device=None):
        ''' Initialize the NFSP agent.

        Args:
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layer sizes of the average policy network.
            reservoir_buffer_capacity (int): The size of the buffer for the average policy.
            anticipatory_param (float): The hyper-parameter that balances the RL/average policy.
            batch_size (int): The batch size for training the average policy.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): The learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size before the average policy starts learning.
            q_replay_memory_size (int): The memory size of the inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of the inner DQN agent.
            q_update_target_estimator_every (int): How often the inner DQN agent updates its target network.
            q_discount_factor (float): The discount factor of the inner DQN agent.
            q_epsilon_start (float): The starting epsilon of the inner DQN agent.
            q_epsilon_end (float): The final epsilon of the inner DQN agent.
            q_epsilon_decay_steps (int): The epsilon decay steps of the inner DQN agent.
            q_batch_size (int): The batch size of the inner DQN agent.
            q_norm_step (int): The number of normalization steps of the inner DQN agent.
            q_mlp_layers (list): The layer sizes of the inner DQN agent.
            device (torch.device): Whether to run on CPU or GPU.
        '''
        self.scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes + [action_num]
        self._batch_size = batch_size
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None

        if device is None:
            self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Build the action-value network
        self._rl_agent = DQNAgent('dqn', q_replay_memory_size, q_replay_memory_init_size,
                                  q_update_target_estimator_every, q_discount_factor, q_epsilon_start,
                                  q_epsilon_end, q_epsilon_decay_steps, q_batch_size, action_num,
                                  state_shape, q_norm_step, q_mlp_layers, rl_learning_rate, device)

        # Build the average policy supervised model
        self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' Build the average policy network
        '''
        # Configure the average policy network
        policy_network = AveragePolicyNetwork(self._action_num, self._state_shape, self._layer_sizes)
        policy_network = policy_network.to(self.device)
        self.policy_network = policy_network
        self.policy_network.eval()
        print(self.policy_network)

        # Xavier init
        for p in self.policy_network.parameters():
            if len(p.data.shape) > 1:
                nn.init.xavier_uniform_(p.data)

        # Configure optimizer
        self.policy_network_optimizer = torch.optim.Adam(self.policy_network.parameters(),
                                                         lr=self._sl_learning_rate)

    def feed(self, ts):
        ''' Feed data to the inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
        self._rl_agent.feed(ts)

    def step(self, state, player_id):
        ''' Return the action to be taken.

        Args:
            state (dict): The current state.
            player_id (int): The id of the current player.

        Returns:
            action (int): An action id.
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            self._add_transition(obs, probs)
        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state, player_id):
        ''' Use the average policy for evaluation purposes

        Args:
            state (dict): The current state.
            player_id (int): The id of the current player.

        Returns:
            action (int): An action id.
        '''
        action = self._rl_agent.eval_step(state, player_id)

        return action

    def sample_episode_policy(self):
        ''' Sample the average/best_response policy
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probabilities given the observation and legal actions.
        Not connected to the computation graph.

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probabilities.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        info_state = torch.from_numpy(info_state).float().to(self.device)

        with torch.no_grad():
            log_action_probs = self.policy_network(info_state).cpu().numpy()

        action_probs = np.exp(log_action_probs)[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Add a new transition to the reservoir buffer.
        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_rl(self):
        ''' Update the inner RL agent
        '''
        return self._rl_agent.train()

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        self.policy_network_optimizer.zero_grad()
        self.policy_network.train()

        # (batch, state_size)
        info_states = torch.from_numpy(np.array(info_states)).float().to(self.device)

        # (batch, action_num)
        eval_action_probs = torch.from_numpy(np.array(action_probs)).float().to(self.device)

        # (batch, action_num)
        log_forecast_action_probs = self.policy_network(info_states)

        ce_loss = - (eval_action_probs * log_forecast_action_probs).sum(dim=-1).mean()
        ce_loss.backward()
        self.policy_network_optimizer.step()
        ce_loss = ce_loss.item()
        self.policy_network.eval()

        return ce_loss
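# A minimal sketch of how this NFSPAgent might be driven from an RLCard-style
# environment loop. `env`, `episode_num`, the self-play seating, and the use of
# trajectories[0] are assumptions for illustration, not part of the class above.
agent = NFSPAgent(scope='nfsp',
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[128, 128],
                  q_mlp_layers=[128, 128],
                  device=torch.device('cpu'))
env.set_agents([agent, agent])  # assumed two-player self-play setup

for episode in range(episode_num):
    # Pick best_response vs. average_policy mode for this episode.
    agent.sample_episode_policy()

    trajectories, _ = env.run(is_training=True)

    # Store transitions in the inner DQN's replay memory.
    for ts in trajectories[0]:
        agent.feed(ts)

    # Update the inner DQN and the average policy network; train_sl returns
    # None until the reservoir buffer is large enough.
    rl_loss = agent.train_rl()
    sl_loss = agent.train_sl()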
# The initial memory size
memory_init_size = 1000

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/limit_holdem_dqn_result/'

# Set a global seed
set_global_seed(0)

agent = DQNAgent(scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[128, 128],
                 device=torch.device('cpu'))
random_agent = RandomAgent(action_num=eval_env.action_num)
env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
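    # (Sketch of a plausible continuation of this loop; not part of the original
    #  script. `evaluate_every`, `evaluate_num`, and the `tournament` helper are
    #  assumed to be defined/imported elsewhere.)
    # Feed the transitions of the trained seat into the agent's replay memory;
    # the agent was constructed with train_every above, so feed is expected to
    # handle training internally once the memory is initialized.
    for ts in trajectories[0]:
        agent.feed(ts)

    # Periodically evaluate against the random agent and log the learning curve.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])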
class NFSPAgent(object):
    ''' An approximate clone of rlcard.agents.nfsp_agent that uses
    pytorch instead of tensorflow. Note that this implementation
    differs from Heinrich and Silver (2016) in that the supervised
    training minimizes cross-entropy with respect to the stored
    action probabilities rather than the realized actions.
    '''

    def __init__(self,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy',
                 device=None):
        ''' Initialize the NFSP agent.

        Args:
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layer sizes of the average policy network.
            reservoir_buffer_capacity (int): The size of the buffer for the average policy.
            anticipatory_param (float): The hyper-parameter that balances the RL/average policy.
            batch_size (int): The batch size for training the average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): The learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size before the average policy starts learning.
            q_replay_memory_size (int): The memory size of the inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of the inner DQN agent.
            q_update_target_estimator_every (int): How often the inner DQN agent updates its target network.
            q_discount_factor (float): The discount factor of the inner DQN agent.
            q_epsilon_start (float): The starting epsilon of the inner DQN agent.
            q_epsilon_end (float): The final epsilon of the inner DQN agent.
            q_epsilon_decay_steps (int): The epsilon decay steps of the inner DQN agent.
            q_batch_size (int): The batch size of the inner DQN agent.
            q_train_every (int): Train the inner DQN agent every X steps.
            q_mlp_layers (list): The layer sizes of the inner DQN agent.
            evaluate_with (string): Evaluate with the 'average_policy' or the 'best_response'.
            device (torch.device): Whether to run on CPU or GPU.
        '''
        self.use_raw = False
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes + [action_num]
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        # Lookup tables for decoding the card encoding in the observation
        # into rank/suit strings and numeric ranks.
        self.d = {0: 'A', 1: '2', 2: '3', 3: '4', 4: '5', 5: '6', 6: '7',
                  7: '8', 8: '9', 9: 'T', 10: 'J', 11: 'Q', 12: 'K'}
        self.s = {0: 's', 1: 'h', 2: 'd', 3: 'c'}
        self.c2n = {'2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8,
                    '9': 9, 'T': 10, 'J': 11, 'Q': 12, 'K': 13, 'A': 14}
        self.late_range = Range(
            '22+, A2s+, K2s+, Q2s+, J2s+, J8, T9, 98, 87, 76s, 65s, 54s, 98s+, K9+, Q8+, J7+, T6s+, A9+'
        )

        if device is None:
            self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Build the action-value network
        self._rl_agent = DQNAgent(scope + '_dqn', q_replay_memory_size, q_replay_memory_init_size,
                                  q_update_target_estimator_every, q_discount_factor, q_epsilon_start,
                                  q_epsilon_end, q_epsilon_decay_steps, q_batch_size, action_num,
                                  state_shape, q_train_every, q_mlp_layers, rl_learning_rate, device)
        self.MCTS = mcst.MCTS(psutil.cpu_count())

        # Build the average policy supervised model
        self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' Build the average policy network
        '''
        # Configure the average policy network
        policy_network = AveragePolicyNetwork(self._action_num, self._state_shape, self._layer_sizes)
        policy_network = policy_network.to(self.device)
        self.policy_network = policy_network
        self.policy_network.eval()

        # Xavier init
        for p in self.policy_network.parameters():
            if len(p.data.shape) > 1:
                nn.init.xavier_uniform_(p.data)

        # Configure optimizer
        self.policy_network_optimizer = torch.optim.Adam(self.policy_network.parameters(),
                                                         lr=self._sl_learning_rate)

    def feed(self, ts):
        ''' Feed data to the inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
        self._rl_agent.feed(ts)
        self.total_t += 1
        # print(len(self._reservoir_buffer))
        if self.total_t > 0 and len(self._reservoir_buffer) >= self._min_buffer_size_to_learn \
                and self.total_t % self._train_every == 0:
            sl_loss = self.train_sl()
            # Strip the replay memory and target network before putting a copy
            # of eval_step into the ray object store.
            rlcopy = copy.copy(self._rl_agent)
            rlcopy.memory = None
            rlcopy.target_estimator = None
            rlcopy.device = self.device
            self.obj_ref = ray.put(rlcopy.eval_step)
            print('\r\nINFO - Agent {}, step {}, sl-loss: {}'.format(self._scope, self.total_t, sl_loss),
                  end='\n\n')

    def step(self, state):
        ''' Return the action to be taken.

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
        '''
        # Decode the one-hot card section of the observation into rank/suit strings.
        cards = ''
        pos = 0
        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        # Split the decoded cards into the board (tab) and the private hand.
        tab = []
        handcards = cards
        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))
        # print(tab)
        hand = [x for x in hand if x not in tab]

        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]),
        #       abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1],
        #       min(state['obs'][-2:]), max(state['obs'][-2:]))
        # mcst.PokerState()

        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            # now = time.clock()
            probs = self._rl_agent.predict(obs)
            # For the early hands, we want to perform shallow searches;
            # there is too much variability to "investigate" any potential strategy.
            # gc.disable()
            # if len(tab) == 0:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=16384, processes=512, verbose=False)
            # # For each position after, our situation becomes more certain and we can
            # # parse deeper into the tree / investigate more in-depth strategies.
            # if len(tab) == 3:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=16384, processes=256, verbose=False)
            # if len(tab) == 4:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=16384, processes=128, verbose=False)
            # if len(tab) == 5:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=16384, processes=64, verbose=False)
            if len(tab) == 0:
                m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=128, verbose=False)
            # For each position after, our situation becomes more certain and we can
            # parse deeper into the tree / investigate more in-depth strategies.
            if len(tab) == 3:
                m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=64, verbose=False)
            if len(tab) == 4:
                m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=32, verbose=False)
            if len(tab) == 5:
                m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=32, verbose=False)
            # print(time.clock() - now)

            # Bias the DQN probabilities towards the MCTS-selected move and never
            # fold when calling is possible.
            probs = copy.deepcopy(probs)
            probs[m] += 1.5
            probs = remove_illegal(probs, legal_actions)
            if (probs[1] != 0):
                probs[0] = 0
            probs /= sum(probs)
            self._add_transition(obs, probs)
        elif self._mode == MODE.average_policy:
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)

        action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state):
        ''' Use the average policy (or the best response, depending on
        `evaluate_with`) for evaluation purposes.

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (numpy.array): The action probabilities.
        '''
        # Decode the one-hot card section of the observation into rank/suit strings.
        cards = ''
        pos = 0
        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards
        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))
        # print(tab)
        hand = [x for x in hand if x not in tab]
        print(state)

        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])

        if self.evaluate_with == 'best_response':
            legal_actions = state['legal_actions']
            action, probs = self._rl_agent.eval_step(state)
            print(probs, '------------')
            # gc.disable()
            # if len(tab) == 0:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=65536, processes=128, verbose=False)
            # # For each position after, our situation becomes more certain and we can
            # # parse deeper into the tree / investigate more in-depth strategies.
            # if len(tab) == 3:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=32768, processes=64, verbose=False)
            # if len(tab) == 4:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=32768, processes=32, verbose=False)
            # if len(tab) == 5:
            #     m, p = self.MCTS.UCT(rootstate=stt, itermax=32768, processes=32, verbose=False)
            m, p = self.MCTS.UCT(rootstate=stt, itermax=1048576 // 8, processes=8, verbose=False)
            probs = copy.deepcopy(probs)
            print(probs, m)
            probs[m] += 1.5
            probs = remove_illegal(probs, legal_actions)
            if (probs[1] != 0):
                probs[0] = 0
            probs /= sum(probs)
            action = np.random.choice(len(probs), p=probs)
            print(m, probs)
            # probs[m] += 1
            # probs = remove_illegal(probs, legal_actions)
            # probs /= sum(probs)
        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
        else:
            raise ValueError("'evaluate_with' should be either 'average_policy' or 'best_response'.")

        return action, probs

    def sample_episode_policy(self):
        ''' Sample the average/best_response policy
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probabilities given the observation and legal actions.
        Not connected to the computation graph.

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probabilities.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        info_state = torch.from_numpy(info_state).float().to(self.device)

        with torch.no_grad():
            log_action_probs = self.policy_network(info_state).cpu().numpy()

        action_probs = np.exp(log_action_probs)[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Add a new transition to the reservoir buffer.
        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        self.policy_network_optimizer.zero_grad()
        self.policy_network.train()

        # (batch, state_size)
        info_states = torch.from_numpy(np.array(info_states)).float().to(self.device)

        # (batch, action_num)
        eval_action_probs = torch.from_numpy(np.array(action_probs)).float().to(self.device)

        # (batch, action_num)
        log_forecast_action_probs = self.policy_network(info_states)

        ce_loss = -(eval_action_probs * log_forecast_action_probs).sum(dim=-1).mean()
        ce_loss.backward()
        self.policy_network_optimizer.step()
        ce_loss = ce_loss.item()
        self.policy_network.eval()

        return ce_loss

    def get_state_dict(self):
        ''' Get the state dict to save models

        Returns:
            (dict): A dict of model states
        '''
        state_dict = self._rl_agent.get_state_dict()
        state_dict[self._scope] = self.policy_network.state_dict()
        return state_dict

    def load(self, checkpoint):
        ''' Load model

        Args:
            checkpoint (dict): The loaded state dict
        '''
        self.policy_network.load_state_dict(checkpoint[self._scope])
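# A minimal sketch of checkpointing an NFSPAgent instance (`agent`) with the
# get_state_dict / load pair above. The file path and the use of
# torch.save / torch.load are illustrative assumptions.
state_dict = agent.get_state_dict()                 # inner DQN state plus the average policy network
torch.save(state_dict, './models/nfsp_checkpoint.pth')

checkpoint = torch.load('./models/nfsp_checkpoint.pth', map_location=agent.device)
agent.load(checkpoint)                              # note: load() restores only the average policy network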
memory_init_size = 1000

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/scopone_result/'

# Set a global seed
set_global_seed(0)

agents = [
    DQNAgent(scope=f'dqn_{i}',
             action_num=env.action_num,
             replay_memory_init_size=memory_init_size,
             train_every=train_every,
             state_shape=env.state_shape,
             mlp_layers=[128, 128],
             device=torch.device('cpu'))
    for i in range(env.player_num)
]
env.set_agents(agents)
eval_env.set_agents(agents)

logger = Logger(log_dir)

for episode in range(episode_num):

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for agent_idx, traj in enumerate(trajectories):
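        # (Sketch of a plausible loop body; not part of the original script.)
        # Each DQN agent is fed only the trajectory of its own seat, so every
        # player learns from its own perspective.
        for ts in traj:
            agents[agent_idx].feed(ts)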
memory_init_size = 1000
norm_step = 100

# The paths for saving the logs and learning curves
root_path = './experiments/limit_holdem_dqn_pytorch_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

agent = DQNAgent(scope='dqn',
                 action_num=env.action_num,
                 replay_memory_size=int(1e5),
                 replay_memory_init_size=memory_init_size,
                 norm_step=norm_step,
                 state_shape=env.state_shape,
                 mlp_layers=[512, 512],
                 device=torch.device('cpu'))
random_agent = RandomAgent(action_num=eval_env.action_num)
env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])

# Count the number of steps
step_counter = 0

# Init a Logger to plot the learning curve
logger = Logger(xlabel='timestep', ylabel='reward',
memory_init_size = 1000

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/doudizhu_dqn_result/'

# Set a global seed
set_global_seed(0)

# Set up the agents
agent = DQNAgent(scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[512, 512])
random_agent = RandomAgent(action_num=eval_env.action_num)
env.set_agents([agent, random_agent, random_agent])
eval_env.set_agents([agent, random_agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

    # Generate data from the environment
    'single_agent_mode': False,
    'active_player': 0,
    'record_action': False,
}

env = EuchreEnv(DEFAULT_CONFIG)
eval_env = EuchreEnv(DEFAULT_CONFIG)

# All random       = 0.00411
# Rule Agent Alone = 0.09185
# Rule Agent Team  = 0.19251
# Bid Rule         = 0.0438

agent = DQNAgent(scope='dqn',
                 action_num=env.action_num,
                 state_shape=env.state_shape,
                 batch_size=64,
                 mlp_layers=[64])

env.set_agents([
    agent,
    RandomAgent(action_num=env.action_num),
    RandomAgent(action_num=env.action_num),
    RandomAgent(action_num=env.action_num)
])
eval_env.set_agents([
    agent,
    RandomAgent(action_num=env.action_num),
    RandomAgent(action_num=env.action_num),
    RandomAgent(action_num=env.action_num)