def step(self, state, player_id):
    s = np.array(state['obs'])
    legal_actions = state["legal_actions"]
    if self.ran < -0.5:
        # Execute the supervised (imitation) policy network
        probs = self.snet.t_choose_action(
            np.expand_dims(s, 0), state["legal_actions"]).detach().cpu().numpy()
        self.agent_probs.append(probs)
        probs = remove_illegal(probs[0], legal_actions)
        action = np.random.choice(len(probs), p=probs)
        return int(action)
    else:
        # Execute the reinforcement-learning policy network
        probs = self.lnet.t_choose_action(
            np.expand_dims(s, 0), state["legal_actions"]).detach().cpu().numpy()
        self.agent_probs.append(probs)
        probs = remove_illegal(probs[0], legal_actions)
        action = np.random.choice(len(probs), p=probs)
        # Store the (observation, action) pair for later training
        memdic = {'obs': np.expand_dims(s, 0), 'action': action}
        self.mem.addmemory(memdic)
        return int(action)
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state

    Returns:
        action (int): An action id
    '''
    # Decode the one-hot card encoding (first 52 entries) into a card string
    cards = ''
    pos = 0
    for i in state['obs']:
        if i == 1 and pos < 52:
            cards += self.d[pos % 13] + self.s[pos // 13]
        pos += 1
    # Fold pre-flop hands that fall outside the late-position opening range
    if len(cards) == 4 and Combo(cards) not in self.late_range.combos:
        return 0

    # Separate board cards (tracked in `table`) from hole cards
    tab = []
    handcards = cards
    for i in table:
        tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
        handcards = handcards.replace(self.d[i % 13] + self.s[i // 13], '')
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))

    stt = mcst.PokerState(hand, tab,
                          250 - min(state['obs'][-2:]),
                          250 - max(state['obs'][-2:]),
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          min(state['obs'][-2:]),
                          max(state['obs'][-2:]))
    # Run UCT search from the reconstructed poker state
    m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)

    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        one_hot = np.eye(len(probs))[np.argmax(probs)]
        self._add_transition(obs, one_hot)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    # Bias the policy toward the UCT recommendation, then re-mask and renormalize
    probs = remove_illegal(probs, legal_actions)
    probs[m] += .65
    probs /= sum(probs)
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)

    return action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
        probs (list): The list of action probabilities
    '''
    # Decode the one-hot card encoding (first 52 entries) into a card string
    cards = ''
    pos = 0
    for i in state['obs']:
        if i == 1 and pos < 52:
            cards += self.d[pos % 13] + self.s[pos // 13]
        pos += 1
    # Fold pre-flop hands that fall outside the late-position opening range
    if len(cards) == 4 and Combo(cards) not in self.late_range.combos:
        return 0, 1

    # Separate board cards (tracked in `table`) from hole cards
    tab = []
    handcards = cards
    for i in table:
        tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
        handcards = handcards.replace(self.d[i % 13] + self.s[i // 13], '')
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))

    stt = mcst.PokerState(hand, tab,
                          250 - min(state['obs'][-2:]),
                          250 - max(state['obs'][-2:]),
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          min(state['obs'][-2:]),
                          max(state['obs'][-2:]))

    if self.evaluate_with == 'best_response':
        action, probs = self._rl_agent.eval_step(state)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        legal_actions = state['legal_actions']
        probs = self._act(obs)
        # Bias the average policy toward the UCT recommendation
        m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)
        probs[m] += .65
        probs /= sum(probs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'.")

    return action, probs
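# The boosting pattern in the two methods above (add a bonus to the action chosen by
# UCT, re-mask illegal actions, renormalize) is reused by the later MCTS-assisted
# agents as well. A hypothetical helper capturing it as a sketch; the name and the
# default bonus are illustrative only and do not appear in the original code. It
# relies on the same `remove_illegal` utility and `np` alias used throughout.
def mix_with_search(probs, search_action, legal_actions, bonus=0.65):
    mixed = np.array(probs, dtype=float)
    mixed[search_action] += bonus               # bias toward the UCT recommendation
    mixed = remove_illegal(mixed, legal_actions)
    mixed /= mixed.sum()                        # renormalize after masking
    return mixed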
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state

    Returns:
        action (int): An action id
    '''
    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(state)
        # Record the greedy action as a one-hot target for the supervised buffer
        one_hot = np.zeros(len(probs))
        one_hot[np.argmax(probs)] = 1
        self._add_transition(obs, one_hot)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    elif self._mode == MODE.rule_policy:
        action = TractorRuleAgent.step(state)
        probs = np.zeros(self._action_num, dtype=float)
        probs[action] = 1
        one_hot = np.eye(len(probs))[action]
        self._add_transition(obs, one_hot)
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)

    return action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
        info (dict): A dictionary containing information
    '''
    if self.evaluate_with == 'best_response':
        action, info = self._rl_agent.eval_step(state)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        legal_actions = list(state['legal_actions'].keys())
        probs = self._act(obs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        # Map each raw legal action to its probability for logging
        info = {}
        info['probs'] = {state['raw_legal_actions'][i]: float(probs[legal_actions[i]])
                         for i in range(len(state['legal_actions']))}
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'.")

    return action, info
def eval_step(self, state):
    obs = np.ravel(state["obs"])
    # Mask illegal actions out of the softmax distribution, then act greedily
    softmax_action = remove_illegal(self._softmax_action(obs), state["legal_actions"])
    action = np.argmax(softmax_action)
    return action, softmax_action
def predict(self, state):
    ''' Predict the action probabilities with an epsilon-greedy policy.

    Args:
        state (dict): current state

    Returns:
        probs (numpy.array): a 1-d array where each entry is the probability of
            choosing the corresponding action
    '''
    epsilon = self.epsilons[min(self.total_t, self.epsilon_decay_steps - 1)]
    # Spread the exploration mass epsilon over the legal actions
    A = np.ones(self.action_num, dtype=float) * epsilon / len(state['legal_actions'])
    q_values = self.q_estimator.predict(self.sess, np.expand_dims(state['obs'], 0))[0]
    # Find the legal action with the highest Q-value
    best_action = state['legal_actions'][0]
    best_action_q_value = q_values[best_action]
    for action in state['legal_actions']:
        if q_values[action] > best_action_q_value:
            best_action = action
            best_action_q_value = q_values[action]
    A[best_action] += (1.0 - epsilon)
    # TODO: no need to normalize in this function
    A = remove_illegal(A, state['legal_actions'])
    return A
def _pick_action(self, softmax_action, legal_actions):
    legal_probs = remove_illegal(softmax_action, legal_actions)
    if not np.any(legal_probs):
        # All legal probabilities are zero: fall back to a uniform random legal action
        action = np.random.choice(legal_actions)
    else:
        action = np.argmax(legal_probs)
    return action
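# For reference, a minimal sketch of what `remove_illegal` is assumed to do in the
# calls above: zero out illegal entries and renormalize over the legal ones. This is
# an assumption for illustration, not the project's actual utility; in particular,
# how an all-zero legal mass is handled may differ, which is why _pick_action above
# guards against it. It assumes `legal_actions` is a list/array of action ids.
import numpy as np

def remove_illegal_sketch(action_probs, legal_actions):
    probs = np.zeros(len(action_probs))
    probs[legal_actions] = np.asarray(action_probs, dtype=float)[legal_actions]
    if probs.sum() > 0:
        probs /= probs.sum()   # renormalize over the remaining legal mass
    return probs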
def eval_step(self, state, player_id):
    s = np.array(state['obs'])
    legal_actions = state["legal_actions"]
    # Evaluation uses the reinforcement-learning network only; no memory is stored
    probs = self.lnet.t_choose_action(
        np.expand_dims(s, 0), state["legal_actions"]).detach().cpu().numpy()
    probs = remove_illegal(probs[0], legal_actions)
    action = np.random.choice(len(probs), p=probs)
    return int(action)
def step(self, state):
    ''' Predict the action for generating training data, with the predictions
    disconnected from the computation graph.

    Args:
        state (dict): current state

    Returns:
        action (int): an action id
    '''
    A = self.predict(state['obs'])
    A = remove_illegal(A, state['legal_actions'])
    action = np.random.choice(np.arange(len(A)), p=A)
    return action
def eval_step(self, state):
    ''' Predict the action for evaluation purposes.

    Args:
        state (dict): current state

    Returns:
        best_action (int): an action id
        probs (numpy.array): the action probabilities after masking illegal actions
    '''
    q_values = self.q_estimator.predict_nograd(np.expand_dims(state['obs'], 0))[0]
    # Exponentiate the Q-values, restrict to legal actions, and act greedily
    probs = remove_illegal(np.exp(q_values), state['legal_actions'])
    best_action = np.argmax(probs)
    return best_action, probs
def step(self, state: dict):
    ''' Predict the action for generating training data.

    Args:
        state (dict): current state

    Returns:
        action (int): an action id
    '''
    # Sample according to probs (based on dqn_agent impl)
    state_obs = np.expand_dims(state['obs'], 0)
    A = self.policy.predict(self.sess, state_obs)[0]
    A = remove_illegal(A, state['legal_actions'])
    action = np.random.choice(np.arange(len(A)), p=A)
    return action
def step(self, state):
    ''' Predict the action for generating training data.

    Args:
        state (dict): current state

    Returns:
        action (int): an action id
    '''
    A = self.predict(state['obs'])
    A = remove_illegal(A, state['legal_actions'])
    action = np.random.choice(np.arange(len(A)), p=A)
    return action
def eval_step(self, state):
    ''' Predict the action given the state for evaluation.

    Args:
        state (dict): current state

    Returns:
        action (int): an action id
        action_prob (numpy.array): the action probabilities
    '''
    obs = state['obs']
    legal_actions = state['legal_actions']
    action_prob = self.action_probabilities(obs)
    action_prob = remove_illegal(action_prob, legal_actions)
    action_prob /= action_prob.sum()
    action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
    return action, action_prob
def eval_step(self, state: dict):
    ''' Predict the action given the current state for evaluation.

    Args:
        state (dict): current state

    Returns:
        best_action (int): an action id
        probs (list): a list of probabilities
    '''
    # Act greedily on the policy's probabilities (based on dqn_agent impl)
    state_obs = np.expand_dims(state['obs'], 0)
    A = self.policy.predict(self.sess, state_obs)[0]
    probs = remove_illegal(A, state['legal_actions'])
    best_action = np.argmax(probs)
    return best_action, probs
def eval_step(self, state): """ Predict the action for evaluation purpose. Args: state (numpy.array): current state Returns: action (int): an action id """ q_values = self.q_estimator.predict( self.sess, np.expand_dims(self.normalizer.normalize(state['obs']), 0))[0] probs = remove_illegal(np.exp(q_values), state['legal_actions']) best_action = np.argmax(probs) return best_action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
        probs (numpy.array): The action probabilities.
    '''
    if self.evaluate_with == 'best_response':
        action, probs = self._rl_agent.eval_step(state)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        legal_actions = state['legal_actions']
        probs = self._act(obs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'.")
    return action, probs
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state

    Returns:
        action (int): An action id
    '''
    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        self._add_transition(obs, probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)

    return action
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state

    Returns:
        action (int): An action id
    '''
    obs = state['obs']
    legal_actions = list(state['legal_actions'].keys())
    if self._mode == 'best_response':
        action = self._rl_agent.step(state)
        one_hot = np.zeros(self._num_actions)
        one_hot[action] = 1
        self._add_transition(obs, one_hot)
    elif self._mode == 'average_policy':
        probs = self._act(obs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

    return action
def step(self, states):
    A = self.predict(states["obs"])
    A = remove_illegal(A, states["legal_actions"])
    action = np.random.choice(np.arange(len(A)), p=A)
    return action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
        probs (list): The list of action probabilities
    '''
    # Decode the one-hot card encoding (first 52 entries) into a card string
    cards = ''
    pos = 0
    for i in state['obs']:
        if i == 1 and pos < 52:
            cards += self.d[pos % 13] + self.s[pos // 13]
        pos += 1

    # Separate board cards from hole cards
    tab = []
    handcards = cards
    legal_actions = state['legal_actions']
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    hand = [x for x in hand if x not in tab]

    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])
    par = mcst.MCTS(1)

    if self.evaluate_with == 'best_response':
        action, probs = self._rl_agent.eval_step(state)
        # Bias the best-response policy toward the UCT recommendation
        m = par.UCT(rootstate=stt, itermax=100000, processes=32, verbose=False)
        m = m[0]
        probs[m] += 1
        probs = remove_illegal(probs, legal_actions)
        probs /= sum(probs)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        probs = self._act(obs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'.")

    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    # Prefer calling over folding whenever calling is legal
    if action == 0 and 1 in legal_actions:
        action = 1
    return action, probs
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state

    Returns:
        action (int): An action id
    '''
    self.sample_episode_policy()

    # Decode the one-hot card encoding (first 52 entries) into a card string
    cards = ''
    pos = 0
    for i in state['obs']:
        if i == 1 and pos < 52:
            cards += self.d[pos % 13] + self.s[pos // 13]
        pos += 1

    # Separate board cards from hole cards
    tab = []
    handcards = cards
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    hand = [x for x in hand if x not in tab]

    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])

    obs = state['obs']
    legal_actions = state['legal_actions']
    par = mcst.MCTS(1)
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        # Bias the best-response policy toward the UCT recommendation
        m = par.UCT(rootstate=stt, itermax=50000, processes=16, verbose=False)
        m = m[0]
        probs[m] += 1
        probs = remove_illegal(probs, legal_actions)
        probs /= sum(probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    # Record the greedy action as a one-hot target for the supervised buffer
    one_hot = np.eye(len(probs))[np.argmax(probs)]
    self._add_transition(obs, one_hot)
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)

    return action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
        probs (list): The list of action probabilities
    '''
    # Decode the one-hot card encoding (first 52 entries) into a card string
    cards = ''
    pos = 0
    for i in state['obs']:
        if i == 1 and pos < 52:
            cards += self.d[pos % 13] + self.s[pos // 13]
        pos += 1

    # Separate board cards from hole cards
    tab = []
    handcards = cards
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    hand = [x for x in hand if x not in tab]

    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])

    if self.evaluate_with == 'best_response':
        legal_actions = state['legal_actions']
        action, probs = self._rl_agent.eval_step(state)
        # Run UCT search and bias the policy toward its recommendation.
        # (Earlier experiments scaled itermax and the process count with the street.)
        m, p = self.MCTS.UCT(rootstate=stt, itermax=1048576 // 8, processes=8, verbose=False)
        probs = copy.deepcopy(probs)
        probs[m] += 1.5
        probs = remove_illegal(probs, legal_actions)
        # Never fold when calling has non-zero probability
        if probs[1] != 0:
            probs[0] = 0
        probs /= sum(probs)
        action = np.random.choice(len(probs), p=probs)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        legal_actions = state['legal_actions']
        probs = self._act(obs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'.")

    return action, probs
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state

    Returns:
        action (int): An action id
    '''
    # Decode the one-hot card encoding (first 52 entries) into a card string
    cards = ''
    pos = 0
    for i in state['obs']:
        if i == 1 and pos < 52:
            cards += self.d[pos % 13] + self.s[pos // 13]
        pos += 1

    # Separate board cards from hole cards
    tab = []
    handcards = cards
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    hand = [x for x in hand if x not in tab]

    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])

    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        # For the early hands, perform shallow searches: there is too much variability
        # to investigate any potential strategy. For each position after, the situation
        # becomes more certain and we can parse deeper into the tree and investigate
        # more in-depth strategies. (Earlier experiments used itermax=16384 with
        # 64-512 processes per street.)
        if len(tab) == 0:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=128, verbose=False)
        if len(tab) == 3:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=64, verbose=False)
        if len(tab) == 4:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=32, verbose=False)
        if len(tab) == 5:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100, processes=32, verbose=False)
        # Bias the best-response policy toward the UCT recommendation
        probs = copy.deepcopy(probs)
        probs[m] += 1.5
        probs = remove_illegal(probs, legal_actions)
        # Never fold when calling has non-zero probability
        if probs[1] != 0:
            probs[0] = 0
        probs /= sum(probs)
        self._add_transition(obs, probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)

    return action
def eval_step(self, states):
    q_values = self.q_estimator.predict_nograd(np.expand_dims(states["obs"], 0))[0]
    probs = remove_illegal(np.exp(q_values), states["legal_actions"])
    best_action = np.argmax(probs)
    return best_action, probs
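# A minimal usage sketch (not part of the original code) showing how the step/eval_step
# methods above are typically driven. The environment API used here (reset, step,
# is_over, get_payoffs) is an assumption following the RLCard convention; adjust it to
# the actual environment in this project.
def evaluate_episode(env, agents):
    state, player_id = env.reset()
    while not env.is_over():
        result = agents[player_id].eval_step(state)
        # Some eval_step variants return only an action, others (action, probs/info)
        action = result[0] if isinstance(result, tuple) else result
        state, player_id = env.step(action)
    return env.get_payoffs()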