def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Decodes the agent's cards from the one-hot observation, builds an MCTS
    poker state, and (in 'average_policy' mode) biases the learned action
    distribution toward the move recommended by a UCT search.

    Args:
        state (dict): The current state. Must contain 'obs' (one-hot card
            flags in the first 52 slots, chip amounts in the last two slots)
            and 'legal_actions'.

    Returns:
        action (int): An action id.
        probs (list): The list of action probabilities.
    '''
    cards = ''
    pos = 0
    # print(table)
    # print(state['obs'])
    # Decode the first 52 one-hot entries of the observation into a card
    # string: self.d indexes ranks (pos % 13), self.s indexes suits
    # (pos // 13). `pos` tracks the bit index across the whole loop.
    for i in state['obs']:
        if (i == 1 and pos < 52):
            cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
        pos += 1
    # Pre-flop range filter: exactly two hole cards (4 chars) outside the
    # configured opening range -> play action 0 immediately.
    # NOTE(review): returns the scalar 1 where the docstring promises a
    # probability list — confirm callers tolerate this.
    if (len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        return 0, 1
    tab = []
    handcards = cards
    # NOTE(review): `table` is not defined in this method — presumably a
    # module-level global holding community-card indices; verify.
    # Strip each community card out of the decoded string so only the
    # private hole cards remain in `handcards`.
    for i in table:
        tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
        handcards = handcards.replace(
            self.d[i % 13] + '' + self.s[i // 13], '')
    # Convert the remaining 2-character card chunks into (number, suit)
    # tuples for the search state.
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    # Build the MCTS root: the last two obs slots appear to be the two
    # players' committed chips; 250 looks like a hard-coded starting stack.
    # NOTE(review): confirm the 250 stack size against the environment.
    stt = mcst.PokerState(hand, tab,
                          250 - min(state['obs'][-2:]),
                          250 - max(state['obs'][-2:]),
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          min(state['obs'][-2:]),
                          max(state['obs'][-2:]))
    # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
    # print(state)
    if self.evaluate_with == 'best_response':
        # Delegate entirely to the inner RL agent; the MCTS state built
        # above is unused on this path.
        action, probs = self._rl_agent.eval_step(state)
    elif self.evaluate_with == 'average_policy':
        # print('hi?')
        obs = state['obs']
        legal_actions = state['legal_actions']
        probs = self._act(obs)
        # Run a 750-iteration UCT search, boost the recommended action by a
        # fixed 0.65, renormalize, then mask illegal actions and sample.
        m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)
        probs[m] += .65
        probs /= sum(probs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        # print(action, m)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'."
        )
    return action, probs
def step(self, state): ''' Returns the action to be taken. Args: state (dict): The current state Returns: action (int): An action id ''' # print(table) cards = '' pos = 0 # print(state) # print(state['obs']) for i in state['obs']: if (i == 1 and pos < 52): cards = cards + self.d[pos % 13] + '' + self.s[pos // 13] pos += 1 if (len(cards) == 4 and not Combo(cards) in self.late_range.combos): return 0 tab = [] handcards = cards for i in table: tab.append((self.c2n[self.d[i % 13]], self.s[i // 13])) handcards = handcards.replace( self.d[i % 13] + '' + self.s[i // 13], '') hand = [] for i in range(0, len(handcards), 2): hand.append((self.c2n[handcards[i]], handcards[i + 1])) stt = mcst.PokerState(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:])) # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:])) m = mcst.UCT(rootstate=stt, itermax=750, verbose=False) obs = state['obs'] legal_actions = state['legal_actions'] if self._mode == MODE.best_response: probs = self._rl_agent.predict(obs) one_hot = np.eye(len(probs))[np.argmax(probs)] self._add_transition(obs, one_hot) elif self._mode == MODE.average_policy: probs = self._act(obs) probs = remove_illegal(probs, legal_actions) probs[m] += .65 probs /= sum(probs) probs = remove_illegal(probs, legal_actions) action = np.random.choice(len(probs), p=probs) # print(action, m) return action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Second-generation variant: community cards come from
    state['public_cards'] rather than a global, and the best-response path
    blends a parallel UCT search into the RL agent's distribution.

    Args:
        state (dict): The current state. Must contain 'obs',
            'legal_actions', 'public_cards', 'cur', and 'opp'.

    Returns:
        action (int): An action id.
        probs (list): The list of action probabilities.
    '''
    cards = ''
    pos = 0
    # Decode the first 52 one-hot obs entries into a rank+suit card string.
    for i in state['obs']:
        if (i == 1 and pos < 52):
            cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
        pos += 1
    # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
    #     return 0, 1
    tab = []
    handcards = cards
    legal_actions = state['legal_actions']
    # Community cards arrive as strings with suit first, e.g. 'SA';
    # convert each to a (number, lowercase-suit) tuple.
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    # Remove community cards that were also decoded from the observation
    # so only hole cards remain.
    hand = [x for x in hand if x not in tab]
    # NOTE(review): state['cur']/state['opp'] are presumably the two
    # players' stacks and obs[52]/obs[53] their current bets — confirm
    # against the environment's state encoding.
    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])
    par = mcst.MCTS(1)
    # print(state)
    if self.evaluate_with == 'best_response':
        action, probs = self._rl_agent.eval_step(state)
        # Heavy parallel UCT search; UCT returns a tuple whose first
        # element is the recommended action id.
        m = par.UCT(rootstate=stt, itermax=100000, processes=32,
                    verbose=False)
        print(m, probs)
        m = m[0]
        # Flat +1 boost for the searched move, then mask and renormalize.
        probs[m] += 1
        # if probs[1] == probs[3] and probs[3] == probs[4] and probs[4] == probs[5]:
        #     probs[2] /= 25
        #     probs[m] += 2
        # elif not m == 5:
        #     probs[m] += 2
        # else:
        #     probs[4] += 3
        # if(len(tab) == 0):
        #     probs[5] = 0
        # else:
        #     probs[5] /= 4
        probs = remove_illegal(probs, legal_actions)
        probs /= sum(probs)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        probs = self._act(obs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'."
        )
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    # Never play action 0 when action 1 is available.
    # NOTE(review): assumes 0 = fold and 1 = call/check — confirm against
    # the environment's action ids.
    if (action == 0 and 1 in legal_actions):
        action = 1
    # print(action, probs)
    return action, probs
def step(self, state):
    ''' Returns the action to be taken during training.

    Second-generation variant: samples a fresh episode policy, reads
    community cards from state['public_cards'], and blends a parallel UCT
    search into the best-response distribution.

    Args:
        state (dict): The current state, with 'obs', 'legal_actions',
            'public_cards', 'cur', and 'opp'.

    Returns:
        action (int): An action id.
    '''
    # Re-sample which policy (best response vs. average) this episode uses.
    self.sample_episode_policy()
    cards = ''
    pos = 0
    # Decode the first 52 one-hot obs entries into a rank+suit card string.
    for i in state['obs']:
        if (i == 1 and pos < 52):
            cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
        pos += 1
    # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
    #     return 0, 1
    tab = []
    handcards = cards
    # Community cards arrive suit-first (e.g. 'SA'); convert to
    # (number, lowercase-suit) tuples.
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    # print(tab)
    # Keep only hole cards: drop any decoded card that is on the board.
    hand = [x for x in hand if x not in tab]
    # NOTE(review): state['cur']/state['opp'] presumably stacks,
    # obs[52]/obs[53] current bets — confirm against the env encoding.
    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])
    # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
    # mcst.PokerState()
    obs = state['obs']
    legal_actions = state['legal_actions']
    par = mcst.MCTS(1)
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        # Lighter search than eval (50k iterations / 16 processes); UCT
        # returns a tuple whose first element is the action id.
        m = par.UCT(rootstate=stt, itermax=50000, processes=16,
                    verbose=False)
        m = m[0]
        probs[m] += 1
        probs = remove_illegal(probs, legal_actions)
        probs /= sum(probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
        # Store a one-hot of the greedy action for supervised training.
        one_hot = np.eye(len(probs))[np.argmax(probs)]
        self._add_transition(obs, one_hot)
    # NOTE(review): if _mode matches neither value, `probs` is unbound
    # here and this raises NameError — confirm _mode is always set.
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    # print(m, action)
    return action
def eval_step(self, state):
    ''' Use the average policy for evaluation purposes.

    Third-generation variant: in best-response mode, runs one large
    single-pool UCT search (the staged per-street searches are retained
    below as commented-out experiments), boosts the searched move by 1.5,
    and suppresses folding whenever calling carries probability mass.

    Args:
        state (dict): The current state, with 'obs', 'legal_actions',
            'public_cards', 'cur', and 'opp'.

    Returns:
        action (int): An action id.
        probs (list): The list of action probabilities.
    '''
    cards = ''
    pos = 0
    # Decode the first 52 one-hot obs entries into a rank+suit card string.
    for i in state['obs']:
        if (i == 1 and pos < 52):
            cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
        pos += 1
    # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
    #     return 0, 1
    tab = []
    handcards = cards
    # Community cards arrive suit-first (e.g. 'SA'); convert to
    # (number, lowercase-suit) tuples.
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    # print(tab)
    # Keep only hole cards: drop any decoded card that is on the board.
    hand = [x for x in hand if x not in tab]
    print(state)
    # NOTE(review): state['cur']/state['opp'] presumably stacks,
    # obs[52]/obs[53] current bets — confirm against the env encoding.
    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])
    if self.evaluate_with == 'best_response':
        legal_actions = state['legal_actions']
        action, probs = self._rl_agent.eval_step(state)
        print(probs, '------------')
        # gc.disable()
        # if len(tab) == 0:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 65536, processes = 128, verbose = False)
        # # for each position after, our situation becomes more certain and we can parse deeper into the tree/investigate more indepth strategies
        # if len(tab) == 3:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 32768, processes = 64, verbose = False)
        # if len(tab) == 4:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 32768, processes = 32, verbose = False)
        # if len(tab) == 5:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 32768, processes = 32, verbose = False)
        # One large search regardless of street; UCT returns the action id
        # and a second value (unused here beyond unpacking).
        m, p = self.MCTS.UCT(rootstate=stt, itermax=1048576 // 8,
                             processes=8, verbose=False)
        # Deep-copy before mutating so the RL agent's own array is not
        # modified in place.
        probs = copy.deepcopy(probs)
        print(probs, m)
        probs[m] += 1.5
        probs = remove_illegal(probs, legal_actions)
        # Suppress folding (index 0) whenever calling (index 1) has mass.
        # NOTE(review): assumes 0 = fold, 1 = call/check — confirm.
        if (probs[1] != 0):
            probs[0] = 0
        probs /= sum(probs)
        action = np.random.choice(len(probs), p=probs)
        print(m, probs)
        # probs[m] += 1
        # probs = remove_illegal(probs, legal_actions)
        # probs /= sum(probs)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        legal_actions = state['legal_actions']
        probs = self._act(obs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'."
        )
    return action, probs
def step(self, state):
    ''' Returns the action to be taken during training.

    Third-generation variant: in best-response mode, runs a shallow
    street-dependent UCT search (deeper commented-out settings retained
    below), boosts the searched move by 1.5, suppresses folding when
    calling carries mass, and stores the blended distribution as the
    supervised-learning target.

    Args:
        state (dict): The current state, with 'obs', 'legal_actions',
            'public_cards', 'cur', and 'opp'.

    Returns:
        action (int): An action id.
    '''
    cards = ''
    pos = 0
    # Decode the first 52 one-hot obs entries into a rank+suit card string.
    for i in state['obs']:
        if (i == 1 and pos < 52):
            cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
        pos += 1
    # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
    #     return 0, 1
    tab = []
    handcards = cards
    # Community cards arrive suit-first (e.g. 'SA'); convert to
    # (number, lowercase-suit) tuples.
    for i in state['public_cards']:
        tab.append((self.c2n[i[1]], i[0].lower()))
    hand = []
    for i in range(0, len(handcards), 2):
        hand.append((self.c2n[handcards[i]], handcards[i + 1]))
    # print(tab)
    # Keep only hole cards: drop any decoded card that is on the board.
    hand = [x for x in hand if x not in tab]
    # NOTE(review): state['cur']/state['opp'] presumably stacks,
    # obs[52]/obs[53] current bets — confirm against the env encoding.
    stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                          abs(state['obs'][-2] - state['obs'][-1]),
                          state['obs'][-2] + state['obs'][-1],
                          state['obs'][52], state['obs'][53])
    # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
    # mcst.PokerState()
    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        # now = time.clock()
        probs = self._rl_agent.predict(obs)
        # for the early hands, we want to perform shallow searches. Too much variability to "investigate" any potential strategy
        # gc.disable()
        # if len(tab) == 0:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 512, verbose = False)
        # # for each position after, our situation becomes more certain and we can parse deeper into the tree/investigate more indepth strategies
        # if len(tab) == 3:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 256, verbose = False)
        # if len(tab) == 4:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 128, verbose = False)
        # if len(tab) == 5:
        #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 64, verbose = False)
        # Street-dependent search: pre-flop / flop / turn / river.
        # NOTE(review): if len(tab) is ever outside {0, 3, 4, 5}, `m` is
        # never assigned and the boost below raises NameError — confirm
        # the environment guarantees these board sizes.
        if len(tab) == 0:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100,
                                 processes=128, verbose=False)
        # for each position after, our situation becomes more certain and we can parse deeper into the tree/investigate more indepth strategies
        if len(tab) == 3:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100,
                                 processes=64, verbose=False)
        if len(tab) == 4:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100,
                                 processes=32, verbose=False)
        if len(tab) == 5:
            m, p = self.MCTS.UCT(rootstate=stt, itermax=100,
                                 processes=32, verbose=False)
        # print(time.clock() - now)
        # Deep-copy before mutating so the predictor's array is untouched.
        probs = copy.deepcopy(probs)
        probs[m] += 1.5
        probs = remove_illegal(probs, legal_actions)
        # Suppress folding (index 0) whenever calling (index 1) has mass.
        # NOTE(review): assumes 0 = fold, 1 = call/check — confirm.
        if (probs[1] != 0):
            probs[0] = 0
        probs /= sum(probs)
        # Store the blended distribution itself (not a one-hot) as the
        # supervised target for the average-policy network.
        self._add_transition(obs, probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    # NOTE(review): if _mode matches neither value, `probs` is unbound
    # here and this raises NameError — confirm _mode is always set.
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    return action