Example 1
    def eval_step(self, state):
        ''' Use the average policy for evaluation purposes

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        # Decode the 52-bit one-hot card block of the observation into
        # a card string such as 'AsKh'.
        cards = ''
        for pos, bit in enumerate(state['obs']):
            if bit == 1 and pos < 52:
                cards += self.d[pos % 13] + self.s[pos // 13]
        # Preflop: fold immediately if the hole cards fall outside the
        # configured opening range.
        if len(cards) == 4 and Combo(cards) not in self.late_range.combos:
            return 0, 1  # fold, with all probability on that action

        # `table` is assumed to be a module-level list of public-card
        # indices; strip the board cards out of the hole-card string.
        tab = []
        handcards = cards
        for i in table:
            tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
            handcards = handcards.replace(self.d[i % 13] + self.s[i // 13], '')
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        # Build the search state: the last two observation entries are the
        # two players' chip counts (250 appears to be the starting stack).
        stt = mcst.PokerState(hand, tab, 250 - min(state['obs'][-2:]),
                              250 - max(state['obs'][-2:]),
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              min(state['obs'][-2:]), max(state['obs'][-2:]))

        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)

        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            # Bias the averaged policy toward the UCT-recommended action,
            # then renormalize and re-mask over the legal actions.
            m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)
            probs[m] += .65
            probs /= sum(probs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)

        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, probs
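Every example opens with the same decoding loop: the first 52 entries of state['obs'] form a one-hot card vector, rank-major within each suit, which self.d and self.s map back to rank and suit characters. A minimal standalone sketch of that decode, assuming concrete rank/suit orderings that the source does not actually show:

RANKS = ['A', '2', '3', '4', '5', '6', '7',
         '8', '9', 'T', 'J', 'Q', 'K']   # assumed ordering of self.d
SUITS = ['s', 'h', 'd', 'c']             # assumed ordering of self.s

def decode_cards(obs):
    # Turn the leading 52-bit one-hot block of an observation into a
    # concatenated card string such as 'AsKh'.
    cards = ''
    for pos, bit in enumerate(obs[:52]):
        if bit == 1:
            cards += RANKS[pos % 13] + SUITS[pos // 13]
    return cards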
Example 2
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        # Decode the 52-bit one-hot card block of the observation into
        # a card string such as 'AsKh'.
        cards = ''
        for pos, bit in enumerate(state['obs']):
            if bit == 1 and pos < 52:
                cards += self.d[pos % 13] + self.s[pos // 13]
        # Preflop: fold immediately if the hole cards fall outside the
        # configured opening range.
        if len(cards) == 4 and Combo(cards) not in self.late_range.combos:
            return 0
        # `table` is assumed to be a module-level list of public-card
        # indices; strip the board cards out of the hole-card string.
        tab = []
        handcards = cards
        for i in table:
            tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
            handcards = handcards.replace(self.d[i % 13] + self.s[i // 13], '')
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        # Build the search state (the last two observation entries are chip
        # counts; 250 appears to be the starting stack), then run UCT.
        stt = mcst.PokerState(hand, tab, 250 - min(state['obs'][-2:]),
                              250 - max(state['obs'][-2:]),
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              min(state['obs'][-2:]), max(state['obs'][-2:]))
        m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)

        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        # Bias the policy toward the UCT recommendation. The first
        # remove_illegal normalizes before the bonus; the second re-masks
        # in case the search picked an action that is currently illegal.
        probs = remove_illegal(probs, legal_actions)
        probs[m] += .65
        probs /= sum(probs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        return action
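The preflop gate Combo(cards) in self.late_range.combos matches the hand-range API of the poker package. A hedged sketch of how such a filter could be constructed; the range string below is purely illustrative, since the agent's actual range is not shown in the source:

from poker.hand import Combo, Range

# Illustrative opening range; the agent's real range string is not in
# the source.
late_range = Range('22+, A2s+, K9s+, QTs+, ATo+, KJo+')

def in_opening_range(cards):
    # Check a 4-character hole-card string such as 'AsKh' against the range.
    return Combo(cards) in late_range.combos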
Example 3
    def eval_step(self, state):
        ''' Use the average policy for evaluation purposes

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        # Decode the 52-bit one-hot card block of the observation into
        # a card string such as 'AsKh'.
        cards = ''
        for pos, bit in enumerate(state['obs']):
            if bit == 1 and pos < 52:
                cards += self.d[pos % 13] + self.s[pos // 13]
        # (The preflop range filter is disabled in this variant.)

        tab = []
        handcards = cards
        legal_actions = state['legal_actions']

        # Convert the board from state['public_cards'] (suit letter first,
        # rank second) and drop any board cards from the decoded hand.
        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        par = mcst.MCTS(1)
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)
            # Run a parallel UCT search and bias the policy toward its
            # recommendation. (Several hand-tuned probability tweaks were
            # tried here and have been disabled.)
            m = par.UCT(rootstate=stt,
                        itermax=100000,
                        processes=32,
                        verbose=False)
            m = m[0]
            probs[m] += 1
            probs = remove_illegal(probs, legal_actions)
            probs /= sum(probs)

        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            probs = self._act(obs)

        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        # Prefer calling over folding whenever a call is legal.
        if action == 0 and 1 in legal_actions:
            action = 1

        return action, probs
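All six examples lean on remove_illegal, RLCard's utility for masking a distribution down to the legal actions. Its behavior, paraphrased from memory of rlcard.utils (treat the details as an assumption, not the library's exact code):

import numpy as np

def remove_illegal(action_probs, legal_actions):
    # Zero out illegal actions and renormalize; if no legal action keeps
    # any mass, fall back to a uniform distribution over legal actions.
    probs = np.zeros(len(action_probs))
    probs[legal_actions] = np.array(action_probs)[legal_actions]
    if probs.sum() == 0:
        probs[legal_actions] = 1.0 / len(legal_actions)
    else:
        probs /= probs.sum()
    return probs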
Example 4
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''

        # Choose best-response vs. average-policy mode for this episode.
        self.sample_episode_policy()

        # Decode the 52-bit one-hot card block of the observation into
        # a card string such as 'AsKh'.
        cards = ''
        for pos, bit in enumerate(state['obs']):
            if bit == 1 and pos < 52:
                cards += self.d[pos % 13] + self.s[pos // 13]
        # (The preflop range filter is disabled in this variant.)

        tab = []
        handcards = cards

        # Convert the board from state['public_cards'] (suit letter first,
        # rank second) and drop any board cards from the decoded hand.
        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])

        obs = state['obs']
        legal_actions = state['legal_actions']

        par = mcst.MCTS(1)
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            # Bias the predicted policy toward the UCT recommendation.
            m = par.UCT(rootstate=stt,
                        itermax=50000,
                        processes=16,
                        verbose=False)
            m = m[0]
            probs[m] += 1
            probs = remove_illegal(probs, legal_actions)
            probs /= sum(probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        return action
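Every variant shares the same blending pattern: add a fixed bonus to the MCTS-recommended action, then re-mask and renormalize. Factored into a helper for clarity (the helper name and signature are mine, not the source's):

import numpy as np

def blend_with_mcts(probs, mcts_action, legal_actions, bonus=1.0):
    # Bias a learned policy toward the MCTS choice; the examples above
    # use bonuses between 0.65 and 1.5. Assumes at least one legal
    # action retains probability mass after masking.
    blended = np.array(probs, dtype=float)
    blended[mcts_action] += bonus
    masked = np.zeros_like(blended)
    masked[legal_actions] = blended[legal_actions]
    return masked / masked.sum()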
Example 5
    def eval_step(self, state):
        ''' Use the average policy for evaluation purposes

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        # Decode the 52-bit one-hot card block of the observation into
        # a card string such as 'AsKh'.
        cards = ''
        for pos, bit in enumerate(state['obs']):
            if bit == 1 and pos < 52:
                cards += self.d[pos % 13] + self.s[pos // 13]
        # (The preflop range filter is disabled in this variant.)

        tab = []
        handcards = cards

        # Convert the board from state['public_cards'] (suit letter first,
        # rank second) and drop any board cards from the decoded hand.
        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])

        if self.evaluate_with == 'best_response':
            legal_actions = state['legal_actions']

            action, probs = self._rl_agent.eval_step(state)
            # Run a deep parallel UCT search. (A street-dependent search
            # schedule like the one in step() was tried here and disabled.)
            m, p = self.MCTS.UCT(rootstate=stt,
                                 itermax=1048576 // 8,
                                 processes=8,
                                 verbose=False)
            # Copy before mutating so the RL agent's own vector is untouched.
            probs = copy.deepcopy(probs)
            probs[m] += 1.5

            probs = remove_illegal(probs, legal_actions)

            # Never fold while calling still carries probability mass.
            if probs[1] != 0:
                probs[0] = 0

            probs /= sum(probs)

            action = np.random.choice(len(probs), p=probs)

        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, probs
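Examples 5 and 6 add one more post-processing rule: never fold while a call still carries probability mass. As a standalone guard; that action id 0 is fold and id 1 is call/check is an assumption read off the code, not stated in the source:

import numpy as np

def forbid_needless_fold(probs):
    # Zero the fold action whenever calling has nonzero mass, then
    # renormalize. Assumes id 0 = fold and id 1 = call/check.
    probs = np.array(probs, dtype=float)
    if probs[1] != 0:
        probs[0] = 0
    return probs / probs.sum()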
Example 6
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''

        # Decode the 52-bit one-hot card block of the observation into
        # a card string such as 'AsKh'.
        cards = ''
        for pos, bit in enumerate(state['obs']):
            if bit == 1 and pos < 52:
                cards += self.d[pos % 13] + self.s[pos // 13]
        # (The preflop range filter is disabled in this variant.)

        tab = []
        handcards = cards

        # Convert the board from state['public_cards'] (suit letter first,
        # rank second) and drop any board cards from the decoded hand.
        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])

        obs = state['obs']
        legal_actions = state['legal_actions']

        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)

            # Early streets carry too much variability to investigate any
            # single strategy deeply, so the search stays shallow but wide;
            # as the board completes, fewer parallel processes are used.
            if len(tab) == 0:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=128,
                                     verbose=False)
            if len(tab) == 3:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=64,
                                     verbose=False)
            if len(tab) == 4:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=32,
                                     verbose=False)
            if len(tab) == 5:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=32,
                                     verbose=False)

            # Copy before mutating, bias toward the search result, and
            # apply the no-needless-fold rule before storing the transition.
            probs = copy.deepcopy(probs)
            probs[m] += 1.5
            probs = remove_illegal(probs, legal_actions)

            if probs[1] != 0:
                probs[0] = 0

            probs /= sum(probs)
            self._add_transition(obs, probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action
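The per-street if chain above encodes a single schedule: wide, shallow searches preflop, narrowing as the board completes. A table-driven sketch of the same idea; the dict and helper name are mine, and the fallback value is an assumption for board sizes the chain never handles:

# Parallel-rollout budget per street, mirroring the if chain above.
PROCESSES_BY_STREET = {0: 128, 3: 64, 4: 32, 5: 32}

def run_street_search(mcts, rootstate, tab, itermax=100):
    processes = PROCESSES_BY_STREET.get(len(tab), 32)  # fallback assumed
    return mcts.UCT(rootstate=rootstate, itermax=itermax,
                    processes=processes, verbose=False)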