def step(self, state, player_id):
        s = np.array(state['obs'])
        legal_actions = state["legal_actions"]
        #cardstr,other_two_action,one_last, two_last, three_last, legal_card = numpytostr(state)

        if self.ran < -0.5:
            #print("zhi xing jiangducelue")
            probs = self.snet.t_choose_action(np.expand_dims(
                s, 0), state["legal_actions"]).detach().cpu().numpy()
            self.agent_probs.append(probs)
            probs = remove_illegal(probs[0], legal_actions)
            action = np.random.choice(len(probs), p=probs)
            # print("player:", player_id, "手牌 is:", cardstr, "上一局:",0, "出牌:", ACTION_ID_TO_STR[action])
            return int(action)

        else:
            #print("zhi xing zhi qianghua")
            probs = self.lnet.t_choose_action(np.expand_dims(
                s, 0), state["legal_actions"]).detach().cpu().numpy()
            self.agent_probs.append(probs)

            probs = remove_illegal(probs[0], legal_actions)
            action = np.random.choice(len(probs), p=probs)

            memdic = {}
            memdic['obs'] = np.expand_dims(s, 0)
            memdic['action'] = action
            self.mem.addmemory(memdic)

            # print("player:", player_id, "手牌 is:", cardstr, "上一局:",0, "出牌:", ACTION_ID_TO_STR[action])
            return int(action)
Example #2
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        # print(table)
        cards = ''
        pos = 0
        # print(state)
        # print(state['obs'])
        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        if (len(cards) == 4 and Combo(cards) not in self.late_range.combos):
            return 0
        tab = []
        handcards = cards
        # `table` is assumed to be defined elsewhere (e.g. a module-level list
        # of dealt community-card ids); it is not part of this snippet.
        for i in table:
            tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
            handcards = handcards.replace(
                self.d[i % 13] + '' + self.s[i // 13], '')
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        stt = mcst.PokerState(hand, tab, 250 - min(state['obs'][-2:]),
                              250 - max(state['obs'][-2:]),
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              min(state['obs'][-2:]), max(state['obs'][-2:]))
        # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
        m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)

        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        probs[m] += .65
        probs /= sum(probs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        # print(action, m)
        return action
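Examples #2 and #3 (and the later MCTS-based examples) index `self.d`, `self.s`, and `self.c2n` without showing how those lookup tables are built. A hypothetical setup consistent with the `pos % 13` (rank) and `pos // 13` (suit) indexing used above — the exact ordering is an assumption, not taken from the original agents — might look like:

# Hypothetical rank/suit tables; the real agent may order these differently.
d = ['A', '2', '3', '4', '5', '6', '7', '8', '9', 'T', 'J', 'Q', 'K']  # rank char, indexed by pos % 13
s = ['s', 'h', 'd', 'c']                                               # suit char, indexed by pos // 13
c2n = {rank: i + 1 for i, rank in enumerate(d)}                        # illustrative rank-to-number map

# Under this assumption, card id 14 decodes as d[14 % 13] + s[14 // 13] == '2h'.
card = d[14 % 13] + s[14 // 13]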
Example #3
    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        cards = ''
        pos = 0
        # print(table)
        # print(state['obs'])
        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        if (len(cards) == 4 and Combo(cards) not in self.late_range.combos):
            return 0, 1

        tab = []
        handcards = cards
        for i in table:
            tab.append((self.c2n[self.d[i % 13]], self.s[i // 13]))
            handcards = handcards.replace(
                self.d[i % 13] + '' + self.s[i // 13], '')
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        stt = mcst.PokerState(hand, tab, 250 - min(state['obs'][-2:]),
                              250 - max(state['obs'][-2:]),
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              min(state['obs'][-2:]), max(state['obs'][-2:]))
        # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))

        # print(state)
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)

        elif self.evaluate_with == 'average_policy':
            #print('hi?')
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            m = mcst.UCT(rootstate=stt, itermax=750, verbose=False)
            probs[m] += .65
            probs /= sum(probs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)

            # print(action, m)
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, probs
Example #4
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            # probs = self._rl_agent.predict(obs)
            probs = self._rl_agent.predict(state)

            # one_hot = np.eye(len(probs))[np.argmax(probs)]
            one_hot = np.zeros(len(probs))
            one_hot[np.argmax(probs)] = 1

            self._add_transition(obs, one_hot)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        elif self._mode == MODE.rule_policy:
            action = TractorRuleAgent.step(state)
            probs = np.zeros(self._action_num, dtype=float)
            probs[action] = 1
            one_hot = np.eye(len(probs))[action]
            self._add_transition(obs, one_hot)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action
Example #5
    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            info (dict): A dictionary containing information
        '''
        if self.evaluate_with == 'best_response':
            action, info = self._rl_agent.eval_step(state)
        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = list(state['legal_actions'].keys())
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
            info = {}
            info['probs'] = {
                state['raw_legal_actions'][i]:
                float(probs[list(state['legal_actions'].keys())[i]])
                for i in range(len(state['legal_actions']))
            }
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, info
Example #6
 def eval_step(self, state):
     obs = np.ravel(state["obs"])
     softmax_action = remove_illegal(
         self._softmax_action(obs), state["legal_actions"]
     )
     action = np.argmax(softmax_action)
     return action, softmax_action
Example #7
    def predict(self, state):
        ''' Predict the action probabilities

        Args:
            state (dict): current state

        Returns:
            A (numpy.array): a 1-d array of action probabilities, epsilon-greedy over the legal actions
        '''
        epsilon = self.epsilons[min(self.total_t,
                                    self.epsilon_decay_steps - 1)]
        A = np.ones(self.action_num, dtype=float) * epsilon / len(
            state['legal_actions'])

        q_values = self.q_estimator.predict(self.sess,
                                            np.expand_dims(state['obs'], 0))[0]

        best_action = state['legal_actions'][0]
        best_action_q_value = q_values[best_action]
        for action in state['legal_actions']:
            if q_values[action] > best_action_q_value:
                best_action = action
                best_action_q_value = q_values[action]

        A[best_action] += (1.0 - epsilon)

        # TODO: no need to normalize in this function
        A = remove_illegal(A, state['legal_actions'])
        return A
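The `self.epsilons` table indexed in this `predict` is normally built once in the agent's constructor. A minimal sketch of one common construction, a linear decay from a start value to an end value over a fixed number of steps (the names and numbers below are illustrative assumptions, not taken from the snippet):

import numpy as np

# Hypothetical epsilon-decay schedule for an epsilon-greedy policy.
epsilon_start, epsilon_end, epsilon_decay_steps = 1.0, 0.1, 20000
epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

# At decision time the index is clamped to the last entry, exactly as in
# predict() above: epsilon stays at epsilon_end once decay has finished.
total_t = 50000
epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]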
Example #8
    def _pick_action(self, softmax_action, legal_actions):
        legal_probs = remove_illegal(softmax_action, legal_actions)
        # If every legal probability is zero, fall back to a uniform random legal action
        if not np.any(legal_probs):
            action = np.random.choice(legal_actions)
        else:
            action = np.argmax(legal_probs)

        return action
    def eval_step(self, state, player_id):
        s = np.array(state['obs'])
        legal_actions = state["legal_actions"]
        #cardstr, other_two_action, one_last, two_last, three_last, legal_card = numpytostr(state)
        #probs = self.lnet.t_choose_action(np.expand_dims(self.normalizer.normalize(s), 0),state["legal_actions"]).detach().cpu().numpy()

        probs = self.lnet.t_choose_action(np.expand_dims(
            s, 0), state["legal_actions"]).detach().cpu().numpy()
        probs = remove_illegal(probs[0], legal_actions)
        action = np.random.choice(len(probs), p=probs)
        #print("player:", player_id, "手牌 is:", cardstr, "上一局:",0, "出牌:", ACTION_ID_TO_STR[action])
        return int(action)
Example #10
 def step(self, state):
     ''' Predict the action for generating training data, with the
         prediction disconnected from the computation graph
     Args:
         state (dict): current state
     Returns:
         action (int): an action id
     '''
     A = self.predict(state['obs'])
     A = remove_illegal(A, state['legal_actions'])
     action = np.random.choice(np.arange(len(A)), p=A)
     return action
Example #11
 def eval_step(self, state):
     ''' Predict the action for evaluation purpose.
     Args:
         state (dict): current state
     Returns:
         action (int): an action id
         probs (numpy.array): action probabilities over the legal actions
     '''
     q_values = self.q_estimator.predict_nograd(
         np.expand_dims(state['obs'], 0))[0]
     probs = remove_illegal(np.exp(q_values), state['legal_actions'])
     best_action = np.argmax(probs)
     return best_action, probs
Example #12
 def step(self, state: dict):
     ''' Predict the action for generating training data
     Args:
         state (dict): current state
     Returns:
         action (int): an action id
     '''
     # Sample according to probs (based on dqn_agent impl)
     state_obs = np.expand_dims(state['obs'], 0)
     A = self.policy.predict(self.sess, state_obs)[0]
     A = remove_illegal(A, state['legal_actions'])
     action = np.random.choice(np.arange(len(A)), p=A)
     return action
Example #13
    def step(self, state):
        ''' Predict the action for generating training data

        Args:
            state (dict): current state

        Returns:
            action (int): an action id
        '''
        A = self.predict(state['obs'])
        A = remove_illegal(A, state['legal_actions'])
        action = np.random.choice(np.arange(len(A)), p=A)
        return action
Example #14
 def eval_step(self, state):
     ''' Predict the action given state for evaluation
     Args:
         state (dict): current state
     Returns:
         action (int): an action id
         action_prob (numpy.array): normalized action probabilities over the legal actions
     '''
     obs = state['obs']
     legal_actions = state['legal_actions']
     action_prob = self.action_probabilities(obs)
     action_prob = remove_illegal(action_prob, legal_actions)
     action_prob /= action_prob.sum()
     action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
     return action, action_prob
Example #15
 def eval_step(self, state: dict):
     ''' Predict the action given the current state for evaluation.
     Args:
         state (dict): current state
     Returns:
         action (int): an action id
         probs (list): a list of probabilities
     '''
     # Sample according to probs (based on dqn_agent impl)
     state_obs = np.expand_dims(state['obs'], 0)
     A = self.policy.predict(self.sess, state_obs)[0]
     probs = remove_illegal(A, state['legal_actions'])
     best_action = np.argmax(probs)
     return best_action, probs
Example #16
    def eval_step(self, state):
        """ Predict the action for evaluation purpose.

        Args:
            state (dict): current state

        Returns:
            action (int): an action id
        """
        q_values = self.q_estimator.predict(
            self.sess,
            np.expand_dims(self.normalizer.normalize(state['obs']), 0))[0]
        probs = remove_illegal(np.exp(q_values), state['legal_actions'])
        best_action = np.argmax(probs)
        return best_action
Example #17
 def eval_step(self, state):
     ''' Use the average policy for evaluation purpose
     Args:
         state (dict): The current state.
     Returns:
         action (int): An action id.
         probs (numpy.array): the action probabilities over the legal actions
     '''
     if self.evaluate_with == 'best_response':
         action, probs = self._rl_agent.eval_step(state)
     elif self.evaluate_with == 'average_policy':
         obs = state['obs']
         legal_actions = state['legal_actions']
         probs = self._act(obs)
         probs = remove_illegal(probs, legal_actions)
         action = np.random.choice(len(probs), p=probs)
     else:
         raise ValueError("'evaluate_with' should be either 'average_policy' or 'best_response'.")
     return action, probs
Example #18
    def step(self, state):
        ''' Returns the action to be taken.
        Args:
            state (dict): The current state
        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            self._add_transition(obs, probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action
Example #19
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = list(state['legal_actions'].keys())
        if self._mode == 'best_response':
            action = self._rl_agent.step(state)
            one_hot = np.zeros(self._num_actions)
            one_hot[action] = 1
            self._add_transition(obs, one_hot)

        elif self._mode == 'average_policy':
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)

        return action
Example #20
 def step(self, states):
     A = self.predict(states["obs"])
     A = remove_illegal(A, states["legal_actions"])
     action = np.random.choice(np.arange(len(A)), p=A)
     return action
Example #21
    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        cards = ''
        pos = 0
        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards
        legal_actions = state['legal_actions']

        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        par = mcst.MCTS(1)
        # print(state)
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)
            m = par.UCT(rootstate=stt,
                        itermax=100000,
                        processes=32,
                        verbose=False)
            print(m, probs)
            m = m[0]
            probs[m] += 1
            # if probs[1] == probs[3] and probs[3] == probs[4] and probs[4] == probs[5]:
            #     probs[2] /= 25
            #     probs[m] += 2

            # elif not m == 5:
            #     probs[m] += 2
            # else:
            #     probs[4] += 3

            # if(len(tab) == 0):
            #     probs[5] = 0
            # else:
            #     probs[5] /= 4

            probs = remove_illegal(probs, legal_actions)
            probs /= sum(probs)

        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            probs = self._act(obs)

        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        if (action == 0 and 1 in legal_actions):
            action = 1


#        print(action, probs)
        return action, probs
Example #22
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''

        self.sample_episode_policy()

        cards = ''
        pos = 0

        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards

        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        # print(tab)

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
        # mcst.PokerState()

        obs = state['obs']
        legal_actions = state['legal_actions']

        par = mcst.MCTS(1)
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            m = par.UCT(rootstate=stt,
                        itermax=50000,
                        processes=16,
                        verbose=False)
            m = m[0]
            probs[m] += 1
            probs = remove_illegal(probs, legal_actions)
            probs /= sum(probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        # print(m, action)
        return action
Example #23
    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        cards = ''
        pos = 0

        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards

        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        # print(tab)

        hand = [x for x in hand if x not in tab]
        print(state)
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])

        if self.evaluate_with == 'best_response':
            legal_actions = state['legal_actions']

            action, probs = self._rl_agent.eval_step(state)
            print(probs, '------------')
            # gc.disable()
            # if len(tab) == 0:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 65536, processes = 128, verbose = False)

            # # for each position after, our situation becomes more certain and we can parse deeper into the tree/investigate more in-depth strategies
            # if len(tab) == 3:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 32768, processes = 64, verbose = False)

            # if len(tab) == 4:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 32768, processes = 32, verbose = False)

            # if len(tab) == 5:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 32768, processes = 32, verbose = False)

            m, p = self.MCTS.UCT(rootstate=stt,
                                 itermax=1048576 // 8,
                                 processes=8,
                                 verbose=False)
            probs = copy.deepcopy(probs)
            print(probs, m)
            probs[m] += 1.5

            probs = remove_illegal(probs, legal_actions)

            if (probs[1] != 0):
                probs[0] = 0

            probs /= sum(probs)

            action = np.random.choice(len(probs), p=probs)
            print(m, probs)
            # probs[m] += 1
            # probs = remove_illegal(probs, legal_actions)
            # probs /= sum(probs)

        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, probs
Example #24
    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''

        cards = ''
        pos = 0

        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards

        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        # print(tab)

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
        # mcst.PokerState()

        obs = state['obs']
        legal_actions = state['legal_actions']

        if self._mode == MODE.best_response:
            #            now = time.clock()

            probs = self._rl_agent.predict(obs)

            # for the early hands we only perform shallow searches: there is too much variability to "investigate" any potential strategy
            # gc.disable()
            # if len(tab) == 0:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 512, verbose = False)

            # # for each position after, our situation becomes more certain and we can parse deeper into the tree/investigate more indepth strategies
            # if len(tab) == 3:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 256, verbose = False)

            # if len(tab) == 4:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 128, verbose = False)

            # if len(tab) == 5:
            #     m, p = self.MCTS.UCT(rootstate = stt, itermax = 16384, processes = 64, verbose = False)

            if len(tab) == 0:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=128,
                                     verbose=False)

            # for each position after, our situation becomes more certain and we can parse deeper into the tree/investigate more in-depth strategies
            if len(tab) == 3:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=64,
                                     verbose=False)

            if len(tab) == 4:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=32,
                                     verbose=False)

            if len(tab) == 5:
                m, p = self.MCTS.UCT(rootstate=stt,
                                     itermax=100,
                                     processes=32,
                                     verbose=False)


#            print(time.clock() - now)

            probs = copy.deepcopy(probs)
            probs[m] += 1.5
            probs = remove_illegal(probs, legal_actions)

            if (probs[1] != 0):
                probs[0] = 0

            probs /= sum(probs)
            self._add_transition(obs, probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action
Example #25
 def eval_step(self, states):
     q_values = self.q_estimator.predict_nograd(
         np.expand_dims(states["obs"], 0))[0]
     probs = remove_illegal(np.exp(q_values), states["legal_actions"])
     best_action = np.argmax(probs)
     return best_action, probs
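Every example above funnels its raw network output through `remove_illegal` before sampling or taking an argmax. The utility itself is not shown in these snippets; a minimal sketch of the behavior the call sites rely on (zero out illegal action ids, renormalize, and fall back to a uniform distribution over the legal actions when everything was masked to zero) could look like the following. This is an inference from how the function is used above, not the toolkit's actual source:

import numpy as np

def remove_illegal_sketch(action_probs, legal_actions):
    ''' Hypothetical re-implementation inferred from the call sites above.

    Args:
        action_probs (numpy.array): unnormalized probabilities over all actions
        legal_actions (list): ids of the currently legal actions

    Returns:
        probs (numpy.array): probabilities with illegal entries zeroed and the
            legal entries renormalized to sum to 1
    '''
    action_probs = np.asarray(action_probs, dtype=float)
    legal = list(legal_actions)
    probs = np.zeros_like(action_probs)
    probs[legal] = action_probs[legal]
    if probs.sum() == 0:
        # Every legal entry was zero: fall back to a uniform legal distribution.
        probs[legal] = 1.0 / len(legal)
    else:
        probs /= probs.sum()
    return probs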