Example #1
    # return the chosen minor cards in char form; `func` (the policy network) and
    # `action_space` come from the enclosing scope, which is not shown in this excerpt
    def inference_minor_util60(role_id, handcards, num, is_pair, dup_mask, main_cards_char):
        for main_card in main_cards_char:
            handcards.remove(main_card)

        s = get_mask(handcards, action_space, None).astype(np.float32)
        outputs = []
        minor_type = 1 if is_pair else 0
        for i in range(num):
            input_single, input_pair, _, _ = get_masks(handcards, None)
            _, _, _, _, _, _, minor_response_prob = func(
                [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.array([minor_type])]
            )

            # give minor cards
            mask = None
            if is_pair:
                mask = np.concatenate([input_pair, [0, 0]]) * dup_mask
            else:
                mask = input_single * dup_mask

            minor_response = take_action_from_prob(minor_response_prob, mask)
            dup_mask[minor_response] = 0

            # convert network output to char cards
            handcards.remove(to_char(minor_response + 3))
            if is_pair:
                handcards.remove(to_char(minor_response + 3))
            s = get_mask(handcards, action_space, None).astype(np.float32)

            # save to output
            outputs.append(to_char(minor_response + 3))
            if is_pair:
                outputs.append(to_char(minor_response + 3))
        return outputs
Example #2
 def player_cards(self):
     other_two = self.get_last_two_handcards()
     curr_idx = self.get_current_idx()
     return {
         self.agent_names[(curr_idx + 2) % 3]: to_char(other_two[1]),
         self.agent_names[(curr_idx + 1) % 3]: to_char(other_two[0]),
         self.agent_names[curr_idx]: self.get_curr_handcards()
     }
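
The (curr_idx + k) % 3 arithmetic above maps the current seat to the other two players; a minimal standalone check (toy values, not repo code):

agent_names = ['agent1', 'agent2', 'agent3']
curr_idx = 2
assert agent_names[(curr_idx + 1) % 3] == 'agent1'  # next player
assert agent_names[(curr_idx + 2) % 3] == 'agent2'  # the player after that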
Example #3
    def get_state_and_action_spaces(self, action=None):

        def cards_char2embedding(cards_char):
            test = (action_space_onehot60 == Card.char2onehot60(cards_char))
            test = np.all(test, axis=1)
            target = np.where(test)[0]
            return self.encoding[target[0]]

        last_two_cards_char = self.player.get_last_two_cards()
        last_two_cards_char = [to_char(cards) for cards in last_two_cards_char]
        last_cards_char = last_two_cards_char[0]
        if not last_cards_char:
            last_cards_char = last_two_cards_char[1]
        curr_cards_char = to_char(self.player.get_curr_handcards())
        if self._comb_mask:
            # print(curr_cards_char, last_cards_char)
            combs = self.get_combinations(curr_cards_char, last_cards_char)
            if len(combs) > self.num_actions[0]:
                combs, self._fine_mask = self.subsample_combs_masks(combs, self._fine_mask, self.num_actions[0])
            # TODO: utilize temporal relations to speedup
            available_actions = [[action_space[idx] for idx in comb] for comb in combs]
            # print(available_actions)
            # print('-------------------------------------------')
            assert len(combs) > 0
            if self._fine_mask is not None:
                self._fine_mask = self.pad_fine_mask(self._fine_mask)
            self.pad_action_space(available_actions)
            state = [np.stack([self.encoding[idx] for idx in comb]) for comb in combs]
            assert len(state) > 0
            prob_state = self.player.get_state_prob()
            # test = action_space_onehot60 == Card.char2onehot60(last_cards_char)
            # test = np.all(test, axis=1)
            # target = np.where(test)[0]
            # assert target.size == 1
            extra_state = np.concatenate([cards_char2embedding(last_two_cards_char[0]), cards_char2embedding(last_two_cards_char[1]), prob_state])
            for i in range(len(state)):
                state[i] = np.concatenate([state[i], np.tile(extra_state[None, :], [state[i].shape[0], 1])], axis=-1)
            state = self.pad_state(state)
            assert state.shape[0] == self.num_actions[0] and state.shape[1] == self.num_actions[1]
        else:
            assert action is not None
            if self._fine_mask is not None:
                self._fine_mask = self._fine_mask[action]
            available_actions = self._action_space[action]
            state = self._current_ob[action:action+1, :, :]
            state = np.repeat(state, self.num_actions[0], axis=0)
            assert state.shape[0] == self.num_actions[0] and state.shape[1] == self.num_actions[1]
        return state, available_actions
Example #4
    def _populate_exp(self):
        """Populate a transition by epsilon-greedy."""
        old_s = self._current_ob
        if self.rng.rand() <= self.exploration:
            act = self.rng.choice(range(self.num_actions))
        else:
            mask = get_mask(to_char(self.player.get_curr_handcards()),
                            action_space,
                            to_char(self.player.get_last_outcards()))
            q_values = self.predictor(old_s[None, ...])[0][0]
            q_values[mask == 0] = np.nan
            act = np.nanargmax(q_values)
            assert act < self.num_actions
        reward, isOver, _ = self.player.step_manual(to_value(
            action_space[act]))

        # step for AI
        while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = self.player.step_auto()
            isOver = (reward != 0)
        if ROLE_ID_TO_TRAIN == 2:
            reward = -reward
        self._current_game_score.feed(reward)

        if isOver:
            # print('lord wins' if reward > 0 else 'farmer wins')
            self._player_scores.feed(self._current_game_score.sum)
            # print(self._current_game_score.sum)
            while True:
                self.player.reset()
                # init_cards = np.arange(36)
                # self.player.prepare_manual(init_cards)
                self.player.prepare()
                early_stop = False
                while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                    _, reward, _ = self.player.step_auto()
                    isOver = (reward != 0)
                    if isOver:
                        print('prestart ends too early! now resetting env')
                        early_stop = True
                        break
                if early_stop:
                    continue
                self._current_ob = self.get_state()
                break
            self._current_game_score.reset()
        self._current_ob = self.get_state()
        self.mem.append(Experience(old_s, act, reward, isOver))
Example #5
def play_one_episode(env, func):

    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        role_id = env.get_role_ID()
        if role_id == ROLE_ID_TO_TRAIN:
            s = get_state(env)
            mask = get_mask(to_char(env.get_curr_handcards()), action_space,
                            to_char(env.get_last_outcards()))
            q_values = func(s[None, ...])[0][0]
            q_values[mask == 0] = np.nan
            act = np.nanargmax(q_values)
            intention = to_value(action_space[act])
            r, _, _ = env.step_manual(intention)
        else:
            intention, r, _ = env.step_auto()
    return int(r > 0)
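
The NaN-masking used above (set illegal actions to NaN, then nanargmax) works in isolation too; a minimal sketch with toy numbers, independent of the environment:

import numpy as np

q_values = np.array([0.9, 0.5, 0.7], dtype=np.float32)
mask = np.array([0, 1, 1])          # action 0 is illegal here
q_values[mask == 0] = np.nan
assert np.nanargmax(q_values) == 2  # best legal action, not the masked 0.9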
Example #6
def play_one_episode(env, func):
    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        role_id = env.get_role_ID()
        if role_id == ROLE_ID_TO_TRAIN:
            handcards = to_char(env.get_curr_handcards())
            last_two_cards = env.get_last_two_cards()
            last_two_cards = [to_char(cards) for cards in last_two_cards]
            prob_state = env.get_state_prob()
            # print(agent, handcards)

            action = func.predict(handcards, last_two_cards, prob_state)
            # print(agent, ' gives ', action)
            intention = to_value(action)
            r, _, _ = env.step_manual(intention)
        else:
            intention, r, _ = env.step_auto()
    return int(r > 0)
Example #7
 def step_auto(self):
     idx = self.get_current_idx()
     # print(idx)
     intention, r, _ = super().step_auto()
     intention = to_char(intention)
     if len(intention) > 0:
         self.controller = self.agent_names[idx]
     # print(self.agent_names[idx], 'gives', intention, self.controller)
     assert np.all(self.get_state_prob() >= 0) and np.all(self.get_state_prob() <= 1)
     # print(intention)
     return r, r != 0
Example #8
def get_state(env):
    def cards_char2embedding(cards_char):
        test = (action_space_onehot60 == Card.char2onehot60(cards_char))
        test = np.all(test, axis=1)
        target = np.where(test)[0]
        return encoding[target[0]]

    s = env.get_state_prob()
    s = np.concatenate([Card.val2onehot60(env.get_curr_handcards()), s])
    last_two_cards_char = env.get_last_two_cards()
    last_two_cards_char = [to_char(c) for c in last_two_cards_char]
    return np.concatenate(
        [s, cards_char2embedding(last_two_cards_char[0]), cards_char2embedding(last_two_cards_char[1])])
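
cards_char2embedding above finds the row of action_space_onehot60 that equals a given one-hot vector and returns the embedding at that index; the same row-matching pattern in isolation (toy arrays, illustrative names only):

import numpy as np

onehot_table = np.eye(4)        # stand-in for action_space_onehot60
query = onehot_table[2].copy()  # stand-in for Card.char2onehot60(...)
match = np.where(np.all(onehot_table == query, axis=1))[0]
assert match[0] == 2            # row index used to look up the embedding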
Example #9
def play_one_episode(env, func, role_id):

    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        if env.get_role_ID() == role_id:
            handcards = to_char(env.get_curr_handcards())
            last_two_cards = env.get_last_two_cards()
            last_two_cards = [to_char(cards) for cards in last_two_cards]
            prob_state = env.get_state_prob()
            # print(agent, handcards)

            action = func.predict(handcards, last_two_cards, prob_state)
            # print(agent, ' gives ', action)
            intention = to_value(action)
            assert intention is not None
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention), file=f)
        else:
            intention, r, _ = env.step_auto()

    return int(r > 0)
Example #10
def ai_play():
    data = request.json
    print(data)
    pos = int(data['current_player'])

    player_cards = data['player_cards']
    my_cards = trans_cards(player_cards.split("|")[pos])
    # the current player leads a new trick if it was also the last to play
    if int(data['last_player']) == pos:
        last_move = []
    else:
        last_move = trans_cards(data['last_move'])

    intention = to_char(
        CEnv.step_auto_static(Card.char2color(my_cards), to_value(last_move)))

    res = trans_cards_reverse(intention)
    if res == "":
        res = 'P'
    print("result is {}".format(res))
    return jsonify({'move': res})
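
A request to this handler might look as follows; the field names and the 'P' pass marker come from the code above, while the route, host, port and card encoding are assumptions:

import requests

payload = {
    'current_player': '1',
    'last_player': '0',
    'player_cards': '334455|56789TJ|99TTJJQ',  # one hand per player, separated by '|'
    'last_move': '3',
}
resp = requests.post('http://localhost:5000/ai_play', json=payload)
print(resp.json()['move'])  # 'P' means pass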
Example #11
 def get_mask(self):
     if self.act == ACT_TYPE.PASSIVE:
         decision_mask, response_mask, bomb_mask, _ = get_mask_alter(
             self.curr_handcards_char, to_char(self.last_cards_value),
             self.category)
         if self.mode == MODE.PASSIVE_DECISION:
             return decision_mask
         elif self.mode == MODE.PASSIVE_RESPONSE:
             return response_mask
         elif self.mode == MODE.PASSIVE_BOMB:
             return bomb_mask
         elif self.mode == MODE.MINOR_RESPONSE:
             input_single, input_pair, _, _ = get_masks(
                 self.curr_handcards_char, None)
             if self.minor_type == 1:
                 mask = np.append(input_pair, [0, 0])
             else:
                 mask = input_single
             for v in set(self.intention):
                 mask[v - 3] = 0
             return mask
     elif self.act == ACT_TYPE.ACTIVE:
         decision_mask, response_mask, _, length_mask = get_mask_alter(
             self.curr_handcards_char, [], self.category)
         if self.mode == MODE.ACTIVE_DECISION:
             return decision_mask
         elif self.mode == MODE.ACTIVE_RESPONSE:
             return response_mask[self.active_decision]
         elif self.mode == MODE.ACTIVE_SEQ:
             return length_mask[self.active_decision][self.active_response]
         elif self.mode == MODE.MINOR_RESPONSE:
             input_single, input_pair, _, _ = get_masks(
                 self.curr_handcards_char, None)
             if self.minor_type == 1:
                 mask = np.append(input_pair, [0, 0])
             else:
                 mask = input_single
             for v in set(self.intention):
                 mask[v - 3] = 0
             return mask
Example #12
    def run(self):
        player = self._build_player()
        context = zmq.Context()
        c2s_socket = context.socket(zmq.PUSH)
        c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
        c2s_socket.set_hwm(10)
        c2s_socket.connect(self.c2s)

        s2c_socket = context.socket(zmq.DEALER)
        s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
        s2c_socket.connect(self.s2c)

        player.reset()
        # init_cards = np.arange(52)
        # init_cards = np.append(init_cards[::4], init_cards[1::4])
        # player.prepare_manual(init_cards)
        player.prepare()
        r, is_over = 0, False
        lstm_state = np.zeros([1024 * 2])
        while True:
            role_id = player.get_role_ID()
            if role_id in ROLE_IDS_TO_TRAIN:
                prob_state, all_state, curr_handcards_value, last_cards_value, last_category = \
                    player.get_state_prob(), player.get_state_all_cards(), player.get_curr_handcards(), player.get_last_outcards(), player.get_last_outcategory_idx()
                prob_state = np.concatenate(
                    [Card.val2onehot60(curr_handcards_value), prob_state])
                # after taking the last action, get to this state and get this reward/isOver.
                # If isOver, get to the next-episode state immediately.
                # This tuple is not the same as the one put into the memory buffer

                is_active = last_cards_value.size == 0
                mask = get_mask(
                    to_char(curr_handcards_value), action_space,
                    None if is_active else to_char(last_cards_value))
                if is_active:
                    mask[0] = 0
                last_two_cards = player.get_last_two_cards()
                last_two_cards_onehot = np.concatenate([
                    Card.val2onehot60(last_two_cards[0]),
                    Card.val2onehot60(last_two_cards[1])
                ])
                c2s_socket.send(dumps(
                    (self.identity, role_id, prob_state, all_state,
                     last_two_cards_onehot, mask, 0 if is_active else 1,
                     lstm_state, r, is_over)),
                                copy=False)
                action_idx, lstm_state = loads(
                    s2c_socket.recv(copy=False).bytes)

                r, is_over, _ = player.step_manual(
                    to_value(action_space[action_idx]))
            else:
                _, r, _ = player.step_auto()
                is_over = (r != 0)
            if is_over:
                # print('{} over with reward {}'.format(self.identity, r))
                # logger.info('{} over with reward {}'.format(self.identity, r))
                # sys.stdout.flush()
                player.reset()
                player.prepare()
                lstm_state = np.zeros([1024 * 2])
Example #13
 def get_last_two_cards(self):
     last_two_cards = super().get_last_two_cards()
     last_two_cards = [to_char(c) for c in last_two_cards]
     return last_two_cards
Example #14
 def get_last_outcards(self):
     return to_char(super().get_last_outcards())
Example #15
 def intention(self, env):
     intention = to_char(
         CEnv.step_auto_static(Card.char2color(env.get_curr_handcards()),
                               to_value(env.get_last_outcards())))
     return intention
Example #16
def data_generator(rng):
    env = Env(rng.randint(1 << 31))
    # logger.info('called')

    while True:
        env.reset()
        env.prepare()
        r = 0
        while r == 0:
            last_cards_value = env.get_last_outcards()
            last_cards_char = to_char(last_cards_value)
            last_out_cards = Card.val2onehot60(last_cards_value)
            last_category_idx = env.get_last_outcategory_idx()
            curr_cards_char = to_char(env.get_curr_handcards())
            is_active = last_cards_value.size == 0

            s = env.get_state_prob()
            # s = s[:60]
            intention, r, category_idx = env.step_auto()

            if category_idx == 14:
                continue
            minor_cards_targets = pick_minor_targets(category_idx,
                                                     to_char(intention))
            # self, state, last_cards, passive_decision_target, passive_bomb_target, passive_response_target,
            # active_decision_target, active_response_target, seq_length_target, minor_response_target, minor_type, mode
            if not is_active:
                if category_idx == Category.QUADRIC.value and category_idx != last_category_idx:
                    passive_decision_input = 1
                    passive_bomb_input = intention[0] - 3
                    yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
                    yield s, last_out_cards, 0, passive_bomb_input, 0, 0, 0, 0, 0, 0, 1

                else:
                    if category_idx == Category.BIGBANG.value:
                        passive_decision_input = 2
                        yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
                    else:
                        if category_idx != Category.EMPTY.value:
                            passive_decision_input = 3
                            # OFFSET_ONE
                            # Feb 1: removed the relative-card output since the shift is hard for the network to learn
                            passive_response_input = intention[0] - 3
                            if passive_response_input < 0:
                                print("negative passive response input; clamping to 0")
                                passive_response_input = 0
                            yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
                            yield s, last_out_cards, 0, 0, passive_response_input, 0, 0, 0, 0, 0, 2
                        else:
                            passive_decision_input = 0
                            yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0

            else:
                seq_length = get_seq_length(category_idx, intention)

                # ACTIVE OFFSET ONE!
                active_decision_input = category_idx - 1
                active_response_input = intention[0] - 3
                yield s, last_out_cards, 0, 0, 0, active_decision_input, 0, 0, 0, 0, 3
                yield s, last_out_cards, 0, 0, 0, 0, active_response_input, 0, 0, 0, 4

                if seq_length is not None:
                    # length offset one
                    seq_length_input = seq_length - 1
                    yield s, last_out_cards, 0, 0, 0, 0, 0, seq_length_input, 0, 0, 5

            if minor_cards_targets is not None:
                main_cards = pick_main_cards(category_idx, to_char(intention))
                handcards = curr_cards_char.copy()
                state = s.copy()
                for main_card in main_cards:
                    handcards.remove(main_card)
                cards_onehot = Card.char2onehot60(main_cards)

                # we must make the order in each 4 batch correct...
                discard_onehot_from_s_60(state, cards_onehot)

                is_pair = False
                minor_type = 0
                if category_idx == Category.THREE_TWO.value or category_idx == Category.THREE_TWO_LINE.value:
                    is_pair = True
                    minor_type = 1
                for target in minor_cards_targets:
                    target_val = Card.char2value_3_17(target) - 3
                    yield state.copy(), last_out_cards, 0, 0, 0, 0, 0, 0, target_val, minor_type, 6
                    cards = [target]
                    handcards.remove(target)
                    if is_pair:
                        if target not in handcards:
                            print('something wrong...')
                            print('minor', target)
                            print('main_cards', main_cards)
                            print('handcards', handcards)
                            print('intention', intention)
                            print('category_idx', category_idx)
                        else:
                            handcards.remove(target)
                            cards.append(target)

                    # correct for one-hot state
                    cards_onehot = Card.char2onehot60(cards)

                    # print(s.shape)
                    # print(cards_onehot.shape)
                    discard_onehot_from_s_60(state, cards_onehot)
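
Each yield above is an 11-tuple whose slot order is spelled out in the signature comment; purely as a reading aid, the slots could be named with a namedtuple (hypothetical helper, not part of the source):

from collections import namedtuple

Sample = namedtuple('Sample', [
    'state', 'last_cards',
    'passive_decision_target', 'passive_bomb_target', 'passive_response_target',
    'active_decision_target', 'active_response_target', 'seq_length_target',
    'minor_response_target', 'minor_type', 'mode'
])

# e.g. Sample(*next(data_generator(rng))) labels one yielded tuple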
Example #17
def play_one_episode(env, func):
    def take_action_from_prob(prob, mask):
        prob = prob[0]
        # push illegal actions below any legal probability so argmax never picks them
        prob[mask == 0] = -1
        return np.argmax(prob)

    env.reset()
    # init_cards = np.arange(52)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    # env.prepare_manual(init_cards)
    env.prepare()
    r = 0
    lstm_state = np.zeros([1024 * 2])
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_two_cards = env.get_last_two_cards()
        last_two_cards_onehot = np.concatenate([
            Card.val2onehot60(last_two_cards[0]),
            Card.val2onehot60(last_two_cards[1])
        ])
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = last_cards_value.size == 0

        s = env.get_state_prob()
        s = np.concatenate([Card.char2onehot60(curr_cards_char), s])
        # print(s.shape)

        role_id = env.get_role_ID()
        # print('%s current cards' % ('lord' if role_id == 2 else 'farmer'), curr_cards_char)

        if role_id in ROLE_IDS_TO_TRAIN:
            if is_active:
                # first get mask
                mask = get_mask(curr_cards_char, action_space, None)
                # not valid for active
                mask[0] = 0

                active_prob, _, lstm_state = func(np.array([role_id]),
                                                  s.reshape(1, -1),
                                                  np.zeros([1, 120]),
                                                  lstm_state.reshape(1, -1))

                # make decision depending on output
                action_idx = take_action_from_prob(active_prob, mask)
            else:
                # print('last cards char', last_cards_char)
                mask = get_mask(curr_cards_char, action_space, last_cards_char)

                _, passive_prob, lstm_state = func(
                    np.array([role_id]), s.reshape(1, -1),
                    last_two_cards_onehot.reshape(1, -1),
                    lstm_state.reshape(1, -1))

                action_idx = take_action_from_prob(passive_prob, mask)

            # since step_auto needs the full last card group info, we do not explicitly feed the card type
            intention = to_value(action_space[action_idx])
            assert intention is not None
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention))
        else:
            intention, r, _ = env.step_auto()
            # print('farmer gives', to_char(intention))
    # if r > 0:
    #     print('farmer wins')
    # else:
    #     print('lord wins')
    return int(r > 0)
Example #18
def play_one_episode(env, func):
    def take_action_from_prob(prob, mask):
        prob = prob[0]
        # push illegal actions below any legal probability so argmax never picks them
        prob[mask == 0] = -1
        return np.argmax(prob)

    # return the chosen minor cards in char form
    def inference_minor_util60(role_id, handcards, num, is_pair, dup_mask, main_cards_char):
        for main_card in main_cards_char:
            handcards.remove(main_card)

        s = get_mask(handcards, action_space, None).astype(np.float32)
        outputs = []
        minor_type = 1 if is_pair else 0
        for i in range(num):
            input_single, input_pair, _, _ = get_masks(handcards, None)
            _, _, _, _, _, _, minor_response_prob = func(
                [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.array([minor_type])]
            )

            # give minor cards
            mask = None
            if is_pair:
                mask = np.concatenate([input_pair, [0, 0]]) * dup_mask
            else:
                mask = input_single * dup_mask

            minor_response = take_action_from_prob(minor_response_prob, mask)
            dup_mask[minor_response] = 0

            # convert network output to char cards
            handcards.remove(to_char(minor_response + 3))
            if is_pair:
                handcards.remove(to_char(minor_response + 3))
            s = get_mask(handcards, action_space, None).astype(np.float32)

            # save to output
            outputs.append(to_char(minor_response + 3))
            if is_pair:
                outputs.append(to_char(minor_response + 3))
        return outputs

    def inference_minor_cards60(role_id, category, s, handcards, seq_length, dup_mask, main_cards_char):
        if category == Category.THREE_ONE.value:
            return inference_minor_util60(role_id, handcards, 1, False, dup_mask, main_cards_char)
        if category == Category.THREE_TWO.value:
            return inference_minor_util60(role_id, handcards, 1, True, dup_mask, main_cards_char)
        if category == Category.THREE_ONE_LINE.value:
            return inference_minor_util60(role_id, handcards, seq_length, False, dup_mask, main_cards_char)
        if category == Category.THREE_TWO_LINE.value:
            return inference_minor_util60(role_id, handcards, seq_length, True, dup_mask, main_cards_char)
        if category == Category.FOUR_TWO.value:
            return inference_minor_util60(role_id, handcards, 2, False, dup_mask, main_cards_char)

    env.reset()
    init_cards = np.arange(21)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    env.prepare_manual(init_cards)
    r = 0
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_out_cards = Card.val2onehot60(last_cards_value)
        last_category_idx = env.get_last_outcategory_idx()
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = last_cards_value.size == 0

        s = get_mask(curr_cards_char, action_space, None if is_active else last_cards_char).astype(np.float32)
        last_state = get_mask(last_cards_char, action_space, None).astype(np.float32)
        # print(s.shape)

        role_id = env.get_role_ID()
        # print('%s current cards' % ('lord' if role_id == 2 else 'farmer'), curr_cards_char)

        intention = None
        if role_id == 2:
            if is_active:

                # first get mask
                decision_mask, response_mask, _, length_mask = get_mask_alter(curr_cards_char, [], last_category_idx)

                _, _, _, active_decision_prob, active_response_prob, active_seq_prob, _ = func(
                    [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.zeros([s.shape[0]])]
                )

                # make decision depending on output
                active_decision = take_action_from_prob(active_decision_prob, decision_mask)

                active_category_idx = active_decision + 1

                # get response
                active_response = take_action_from_prob(active_response_prob, response_mask[active_decision])

                seq_length = 0
                # next sequence length
                if active_category_idx == Category.SINGLE_LINE.value or \
                        active_category_idx == Category.DOUBLE_LINE.value or \
                        active_category_idx == Category.TRIPLE_LINE.value or \
                        active_category_idx == Category.THREE_ONE_LINE.value or \
                        active_category_idx == Category.THREE_TWO_LINE.value:
                    seq_length = take_action_from_prob(active_seq_prob, length_mask[active_decision][active_response]) + 1

                # give main cards
                intention = give_cards_without_minor(active_response, last_cards_value, active_category_idx, seq_length)

                # then give minor cards
                if active_category_idx == Category.THREE_ONE.value or \
                        active_category_idx == Category.THREE_TWO.value or \
                        active_category_idx == Category.THREE_ONE_LINE.value or \
                        active_category_idx == Category.THREE_TWO_LINE.value or \
                        active_category_idx == Category.FOUR_TWO.value:
                    dup_mask = np.ones([15])
                    if seq_length > 0:
                        for i in range(seq_length):
                            dup_mask[intention[0] - 3 + i] = 0
                    else:
                        dup_mask[intention[0] - 3] = 0
                    intention = np.concatenate([intention,
                                                to_value(inference_minor_cards60(role_id, active_category_idx, s.copy(),
                                                                                 curr_cards_char.copy(), seq_length,
                                                                                 dup_mask, to_char(intention)))])
            else:
                # print(to_char(last_cards_value), is_bomb, last_category_idx)
                decision_mask, response_mask, bomb_mask, _ = get_mask_alter(curr_cards_char, to_char(last_cards_value),
                                                                            last_category_idx)

                passive_decision_prob, passive_bomb_prob, passive_response_prob, _, _, _, _ = func(
                    [np.array([role_id]), s.reshape(1, -1), last_state.reshape(1, -1), np.zeros([s.shape[0]])])

                passive_decision = take_action_from_prob(passive_decision_prob, decision_mask)

                if passive_decision == 0:
                    intention = np.array([])
                elif passive_decision == 1:

                    passive_bomb = take_action_from_prob(passive_bomb_prob, bomb_mask)

                    # converting 0-based index to 3-based value
                    intention = np.array([passive_bomb + 3] * 4)

                elif passive_decision == 2:
                    intention = np.array([16, 17])
                elif passive_decision == 3:
                    passive_response = take_action_from_prob(passive_response_prob, response_mask)

                    intention = give_cards_without_minor(passive_response, last_cards_value, last_category_idx, None)
                    if last_category_idx == Category.THREE_ONE.value or \
                            last_category_idx == Category.THREE_TWO.value or \
                            last_category_idx == Category.THREE_ONE_LINE.value or \
                            last_category_idx == Category.THREE_TWO_LINE.value or \
                            last_category_idx == Category.FOUR_TWO.value:
                        dup_mask = np.ones([15])
                        seq_length = get_seq_length(last_category_idx, last_cards_value)
                        if seq_length:
                            for i in range(seq_length):
                                dup_mask[intention[0] - 3 + i] = 0
                        else:
                            dup_mask[intention[0] - 3] = 0
                        intention = np.concatenate([intention,
                                                    to_value(inference_minor_cards60(role_id, last_category_idx, s.copy(),
                                                                                     curr_cards_char.copy(), seq_length,
                                                                                     dup_mask, to_char(intention)))])
            # since step_auto needs the full last card group info, we do not explicitly feed the card type
            assert intention is not None
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention))
        else:
            intention, r, _ = env.step_auto()
            # print('farmer gives', to_char(intention))
    # if r > 0:
    #     print('farmer wins')
    # else:
    #     print('lord wins')
    return int(r > 0)
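
dup_mask above vetoes card values already consumed by the main cards (and by earlier minors) so the network cannot pick them again; the veto in isolation, over the 15 value slots for 3..17:

import numpy as np

dup_mask = np.ones(15)
dup_mask[7 - 3] = 0       # value 7 is already part of the main cards
prob = np.full(15, 0.5)
prob[7 - 3] = 0.9         # the network would prefer the forbidden value
prob[dup_mask == 0] = -1  # same veto as take_action_from_prob
assert np.argmax(prob) != 7 - 3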
Example #19
 def state(self):
     return get_mask(
         self.handcards_char, action_space,
         None if self.act == ACT_TYPE.ACTIVE else to_char(
             self.last_cards_value)).astype(np.float32)
Example #20
 def step(self, action):
     if self.act == ACT_TYPE.PASSIVE:
         if self.mode == MODE.PASSIVE_DECISION:
             if action == 0 or action == 2:
                 self.finished = True
                 if action == 2:
                     self.intention = np.array([16, 17])
                     self.card_type = Category.BIGBANG.value
                 else:
                     self.card_type = Category.EMPTY.value
                 return
             elif action == 1:
                 self.mode = MODE.PASSIVE_BOMB
                 return
             elif action == 3:
                 self.mode = MODE.PASSIVE_RESPONSE
                 return
             else:
                 raise Exception('unexpected action')
         elif self.mode == MODE.PASSIVE_BOMB:
             # convert to value input
             self.intention = np.array([action + 3] * 4)
             self.finished = True
             self.card_type = Category.QUADRIC.value
             return
         elif self.mode == MODE.PASSIVE_RESPONSE:
             self.intention = give_cards_without_minor(
                 action, self.last_cards_value, self.category, None)
             if self.category == Category.THREE_ONE.value or \
                     self.category == Category.THREE_TWO.value or \
                     self.category == Category.THREE_ONE_LINE.value or \
                     self.category == Category.THREE_TWO_LINE.value or \
                     self.category == Category.FOUR_TWO.value:
                 if self.category == Category.THREE_TWO.value or self.category == Category.THREE_TWO_LINE.value:
                     self.minor_type = 1
                 self.mode = MODE.MINOR_RESPONSE
                 # modify the state for minor cards
                 # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(self.intention))
                 intention_char = to_char(self.intention)
                 for c in intention_char:
                     self.handcards_char.remove(c)
                 self.minor_length = get_seq_length(self.category,
                                                    self.last_cards_value)
                 if self.minor_length is None:
                     self.minor_length = 2 if self.category == Category.FOUR_TWO.value else 1
                 self.card_type = self.category
                 return
             else:
                 self.finished = True
                 self.card_type = self.category
                 return
         elif self.mode == MODE.MINOR_RESPONSE:
             minor_value_cards = [action + 3] * (1 if self.minor_type == 0 else 2)
             # modify the state for minor cards
             minor_char = to_char(minor_value_cards)
             for c in minor_char:
                 self.handcards_char.remove(c)
             # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(minor_value_cards))
             self.intention = np.append(self.intention, minor_value_cards)
             assert self.minor_length > 0
             self.minor_length -= 1
             if self.minor_length == 0:
                 self.finished = True
                 return
             else:
                 return
     elif self.act == ACT_TYPE.ACTIVE:
         if self.mode == MODE.ACTIVE_DECISION:
             self.category = action + 1
             self.active_decision = action
             self.mode = MODE.ACTIVE_RESPONSE
             self.card_type = self.category
             return
         elif self.mode == MODE.ACTIVE_RESPONSE:
             if self.category == Category.SINGLE_LINE.value or \
                     self.category == Category.DOUBLE_LINE.value or \
                     self.category == Category.TRIPLE_LINE.value or \
                     self.category == Category.THREE_ONE_LINE.value or \
                     self.category == Category.THREE_TWO_LINE.value:
                 self.active_response = action
                 self.mode = MODE.ACTIVE_SEQ
                 return
             elif self.category == Category.THREE_ONE.value or \
                     self.category == Category.THREE_TWO.value or \
                     self.category == Category.FOUR_TWO.value:
                 if self.category == Category.THREE_TWO.value or self.category == Category.THREE_TWO_LINE.value:
                     self.minor_type = 1
                 self.mode = MODE.MINOR_RESPONSE
                 self.intention = give_cards_without_minor(
                     action, np.array([]), self.category, None)
                 # modify the state for minor cards
                 intention_char = to_char(self.intention)
                 for c in intention_char:
                     self.handcards_char.remove(c)
                 # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(self.intention))
                 self.minor_length = 2 if self.category == Category.FOUR_TWO.value else 1
                 return
             else:
                 self.intention = give_cards_without_minor(
                     action, np.array([]), self.category, None)
                 self.finished = True
                 return
         elif self.mode == MODE.ACTIVE_SEQ:
             self.minor_length = action + 1
             self.intention = give_cards_without_minor(
                 self.active_response, np.array([]), self.category,
                 action + 1)
             if self.category == Category.THREE_ONE_LINE.value or \
                     self.category == Category.THREE_TWO_LINE.value:
                 if self.category == Category.THREE_TWO.value or self.category == Category.THREE_TWO_LINE.value:
                     self.minor_type = 1
                 self.mode = MODE.MINOR_RESPONSE
                 # modify the state for minor cards
                 intention_char = to_char(self.intention)
                 for c in intention_char:
                     self.handcards_char.remove(c)
                 # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(self.intention))
             else:
                 self.finished = True
             return
         elif self.mode == MODE.MINOR_RESPONSE:
             minor_value_cards = [action + 3] * (1 if self.minor_type == 0 else 2)
             # modify the state for minor cards
             minor_char = to_char(minor_value_cards)
             for c in minor_char:
                 self.handcards_char.remove(c)
             # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(minor_value_cards))
             self.intention = np.append(self.intention, minor_value_cards)
             assert self.minor_length > 0
             self.minor_length -= 1
             if self.minor_length == 0:
                 self.finished = True
                 return
             else:
                 return
Example #21
 def step_auto(self):
     intention, r, _ = super().step_auto()
     intention = to_char(intention)
     assert np.all(self.get_state_prob() >= 0) and np.all(self.get_state_prob() <= 1)
     # print(intention)
     return r, r != 0
Example #22
class Env:
    total_cards = sorted(to_char(np.arange(3, 16)) * 4 + ['*', '$'], key=lambda k: Card.cards_to_value[k])

    def __init__(self, agent_names=('agent1', 'agent2', 'agent3')):
        seed = (id(self) + int(datetime.now().strftime("%Y%m%d%H%M%S%f"))) % 4294967295
        np.random.seed(seed)
        self.agent_names = agent_names
        self.reset()

    def get_all_agent_names(self):
        return self.agent_names

    def get_curr_agent_name(self):
        return self.curr_player

    def reset(self):
        self.histories = {n: [] for n in self.agent_names}
        self.player_cards = {n: [] for n in self.agent_names}
        self.extra_cards = []
        self.lord = None
        self.controller = None
        self.last_cards_char = []
        self.out_cards = [[] for _ in range(3)]
        self.curr_player = None

    def get_role_ID(self):
        curr_idx = self.get_current_idx()
        assert 0 <= curr_idx <= 2
        if curr_idx == 0:
            return 2
        if curr_idx == 1:
            return 3
        return 1

    def get_current_idx(self):
        return self.agent_names.index(self.curr_player)

    def prepare(self):
        cards = Env.total_cards.copy()
        np.random.shuffle(cards)
        self.extra_cards = cards[17:20]
        self.player_cards[self.agent_names[0]] = sorted(cards[:20], key=lambda k: Card.cards_to_value[k])
        self.player_cards[self.agent_names[1]] = sorted(cards[20:37], key=lambda k: Card.cards_to_value[k])
        self.player_cards[self.agent_names[2]] = sorted(cards[37:], key=lambda k: Card.cards_to_value[k])
        self.lord = self.agent_names[0]
        self.controller = self.lord
        self.curr_player = self.lord

    def step(self, intention):
        print(self.get_curr_agent_name() + str(self.get_curr_handcards()) + " play:")
        print(str(intention))
        self.out_cards[self.agent_names.index(self.curr_player)] = intention
        if len(intention) == 0:
            self.curr_player = self.agent_names[(self.agent_names.index(self.curr_player) + 1) % len(self.agent_names)]
            return self.curr_player, False
        else:
            self.last_cards_char = intention
            self.controller = self.curr_player
            for card in intention:
                self.player_cards[self.curr_player].remove(card)

            self.histories[self.curr_player].extend(intention)
            if len(self.player_cards[self.curr_player]) == 0:
                if self.curr_player == self.lord:
                    print("winner is landlord")
                else:
                    print("winner is farmer")
                return self.curr_player, True
            else:
                self.curr_player = self.agent_names[
                    (self.agent_names.index(self.curr_player) + 1) % len(self.agent_names)]
                return self.curr_player, False

    def get_last_outcards(self):
        return self.last_cards_char.copy() if self.curr_player != self.controller else []

    def get_last_two_cards(self):
        return [self.out_cards[(self.agent_names.index(self.curr_player) + 2) % len(self.agent_names)].copy(),
                self.out_cards[(self.agent_names.index(self.curr_player) + 1) % len(self.agent_names)].copy()]

    def get_curr_handcards(self):
        return self.player_cards[self.curr_player].copy()

    def get_state_prob(self):
        total_cards = np.ones([60])
        total_cards[53:56] = 0
        total_cards[57:60] = 0
        player_idx = self.get_current_idx()
        remain_cards = total_cards - Card.char2onehot60(self.get_curr_handcards()
                                                        + self.histories[self.agent_names[player_idx]]
                                                        + self.histories[self.agent_names[(player_idx + 1) % 3]]
                                                        + self.histories[self.agent_names[(player_idx + 2) % 3]])
        # sanity check
        # remain_cards_check = Card.char2onehot60(self.player_cards[self.agent_names[(player_idx + 1) % 3]] + self.player_cards[self.agent_names[(player_idx + 2) % 3]])
        # remain_cards_cp = remain_cards.copy()
        # normalize(remain_cards_cp, 0, 60)
        # assert np.all(remain_cards_cp == remain_cards_check)
        next_cnt = len(self.player_cards[self.agent_names[(player_idx + 1) % len(self.agent_names)]])
        next_next_cnt = len(self.player_cards[self.agent_names[(player_idx + 2) % len(self.agent_names)]])
        right_prob_state = remain_cards * (next_cnt / (next_cnt + next_next_cnt))
        left_prob_state = remain_cards * (next_next_cnt / (next_cnt + next_next_cnt))
        prob_state = np.concatenate([right_prob_state, left_prob_state])
        return prob_state
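
A minimal loop driving the pure-Python Env above, assuming the repo's Card/to_char helpers are importable; the one-card policy is a toy and Env.step does not validate move legality:

env = Env()
env.prepare()
game_over = False
while not game_over:
    hand = env.get_curr_handcards()
    _, game_over = env.step([hand[-1]])  # toy policy: always shed the highest card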
Example #23
 def get_curr_handcards(self):
     return to_char(super().get_curr_handcards())
Example #24
    def run(self):
        player = self._build_player()
        context = zmq.Context()
        c2s_socket = context.socket(zmq.PUSH)
        c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
        c2s_socket.set_hwm(10)
        c2s_socket.connect(self.c2s)

        s2c_socket = context.socket(zmq.DEALER)
        s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
        s2c_socket.connect(self.s2c)

        player.reset()
        init_cards = np.arange(21)
        # init_cards = np.append(init_cards[::4], init_cards[1::4])
        player.prepare_manual(init_cards)
        r, is_over = 0, False
        while True:
            all_state, role_id, curr_handcards_value, last_cards_value, last_category = \
                player.get_state_all_cards(), player.get_role_ID(), player.get_curr_handcards(), player.get_last_outcards(), player.get_last_outcategory_idx()
            # after taking the last action, get to this state and get this reward/isOver.
            # If isOver, get to the next-episode state immediately.
            # This tuple is not the same as the one put into the memory buffer
            is_active = (last_cards_value.size == 0)
            all_state = np.stack([
                get_mask(
                    Card.onehot2char(all_state[i * 60:(i + 1) * 60]),
                    action_space,
                    None if is_active else to_char(last_cards_value)).astype(
                        np.float32) for i in range(3)
            ]).reshape(-1)
            last_state = get_mask(to_char(last_cards_value), action_space,
                                  None).astype(np.float32)

            if role_id == 2:
                st = SubState(
                    ACT_TYPE.PASSIVE if last_cards_value.size > 0
                    else ACT_TYPE.ACTIVE, all_state,
                    to_char(curr_handcards_value), last_cards_value,
                    last_category)
                if last_cards_value.size > 0:
                    assert last_category > 0
                first_st = True
                while not st.finished:
                    c2s_socket.send(dumps(
                        (self.identity, role_id,
                         st.state, st.all_state, last_state, first_st,
                         st.get_mask(), st.minor_type, st.mode, r, is_over)),
                                    copy=False)
                    first_st = False
                    action = loads(s2c_socket.recv(copy=False).bytes)
                    # logger.info('received action {}'.format(action))
                    # print(action)
                    st.step(action)

                # print(st.intention)
                assert st.card_type != -1
                r, is_over, category_idx = player.step_manual(st.intention)
            else:
                _, r, _ = player.step_auto()
                is_over = (r != 0)
            if is_over:
                # print('{} over with reward {}'.format(self.identity, r))
                # logger.info('{} over with reward {}'.format(self.identity, r))
                # sys.stdout.flush()
                player.reset()
                player.prepare_manual(init_cards)
Example #25
def play_one_episode(env, func):
    env.reset()
    env.prepare()
    r = 0
    stats = [StatCounter() for _ in range(7)]
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_out_cards = Card.val2onehot60(last_cards_value)
        last_category_idx = env.get_last_outcategory_idx()
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = last_cards_value.size == 0

        s = env.get_state_prob()
        intention, r, category_idx = env.step_auto()

        if category_idx == 14:
            continue
        minor_cards_targets = pick_minor_targets(category_idx,
                                                 to_char(intention))

        if not is_active:
            if category_idx == Category.QUADRIC.value and category_idx != last_category_idx:
                passive_decision_input = 1
                passive_bomb_input = intention[0] - 3
                passive_decision_prob, passive_bomb_prob, _, _, _, _, _ = func(
                    [
                        s.reshape(1, -1),
                        last_out_cards.reshape(1, -1),
                        np.zeros([s.shape[0]])
                    ])
                stats[0].feed(
                    int(passive_decision_input == np.argmax(
                        passive_decision_prob)))
                stats[1].feed(
                    int(passive_bomb_input == np.argmax(passive_bomb_prob)))

            else:
                if category_idx == Category.BIGBANG.value:
                    passive_decision_input = 2
                    passive_decision_prob, _, _, _, _, _, _ = func([
                        s.reshape(1, -1),
                        last_out_cards.reshape(1, -1),
                        np.zeros([s.shape[0]])
                    ])
                    stats[0].feed(
                        int(passive_decision_input == np.argmax(
                            passive_decision_prob)))
                else:
                    if category_idx != Category.EMPTY.value:
                        passive_decision_input = 3
                        # OFFSET_ONE
                        # Feb 1: removed the relative-card output since the shift is hard for the network to learn
                        passive_response_input = intention[0] - 3
                        if passive_response_input < 0:
                            print("negative passive response input; clamping to 0")
                            passive_response_input = 0
                        passive_decision_prob, _, passive_response_prob, _, _, _, _ = func(
                            [
                                s.reshape(1, -1),
                                last_out_cards.reshape(1, -1),
                                np.zeros([s.shape[0]])
                            ])
                        stats[0].feed(
                            int(passive_decision_input == np.argmax(
                                passive_decision_prob)))
                        stats[2].feed(
                            int(passive_response_input == np.argmax(
                                passive_response_prob)))
                    else:
                        passive_decision_input = 0
                        passive_decision_prob, _, _, _, _, _, _ = func([
                            s.reshape(1, -1),
                            last_out_cards.reshape(1, -1),
                            np.zeros([s.shape[0]])
                        ])
                        stats[0].feed(
                            int(passive_decision_input == np.argmax(
                                passive_decision_prob)))

        else:
            seq_length = get_seq_length(category_idx, intention)

            # ACTIVE OFFSET ONE!
            active_decision_input = category_idx - 1
            active_response_input = intention[0] - 3
            _, _, _, active_decision_prob, active_response_prob, active_seq_prob, _ = func(
                [
                    s.reshape(1, -1),
                    last_out_cards.reshape(1, -1),
                    np.zeros([s.shape[0]])
                ])

            stats[3].feed(
                int(active_decision_input == np.argmax(active_decision_prob)))
            stats[4].feed(
                int(active_response_input == np.argmax(active_response_prob)))

            if seq_length is not None:
                # length offset one
                seq_length_input = seq_length - 1
                stats[5].feed(
                    int(seq_length_input == np.argmax(active_seq_prob)))

        if minor_cards_targets is not None:
            main_cards = pick_main_cards(category_idx, to_char(intention))
            handcards = curr_cards_char.copy()
            state = s.copy()
            for main_card in main_cards:
                handcards.remove(main_card)
            cards_onehot = Card.char2onehot60(main_cards)

            # we must make the order in each 4 batch correct...
            discard_onehot_from_s_60(state, cards_onehot)

            is_pair = False
            minor_type = 0
            if category_idx == Category.THREE_TWO.value or category_idx == Category.THREE_TWO_LINE.value:
                is_pair = True
                minor_type = 1
            for target in minor_cards_targets:
                target_val = Card.char2value_3_17(target) - 3
                _, _, _, _, _, _, minor_response_prob = func([
                    state.copy().reshape(1, -1),
                    last_out_cards.reshape(1, -1),
                    np.array([minor_type])
                ])
                stats[6].feed(
                    int(target_val == np.argmax(minor_response_prob)))
                cards = [target]
                handcards.remove(target)
                if is_pair:
                    if target not in handcards:
                        logger.warn('something wrong...')
                        logger.warn('minor: {}'.format(target))
                        logger.warn('main_cards: {}'.format(main_cards))
                        logger.warn('handcards: {}'.format(handcards))
                    else:
                        handcards.remove(target)
                        cards.append(target)

                # correct for one-hot state
                cards_onehot = Card.char2onehot60(cards)

                # print(s.shape)
                # print(cards_onehot.shape)
                discard_onehot_from_s_60(state, cards_onehot)
    return stats
Example #26
 def ccardgroup2char(cg):
     return [to_char(int(c) + 3) for c in cg.cards]
Example #27
    def _populate_exp(self):
        """ populate a transition by epsilon-greedy"""
        old_s = self._current_ob
        comb_mask = self._comb_mask
        if not self._comb_mask and self._fine_mask is not None:
            fine_mask = self._fine_mask if self._fine_mask.shape[0] == max(self.num_actions[0], self.num_actions[1]) \
                else np.pad(self._fine_mask, (0, max(self.num_actions[0], self.num_actions[1]) - self._fine_mask.shape[0]), 'constant', constant_values=(0, 0))
        else:
            fine_mask = np.ones(
                [max(self.num_actions[0], self.num_actions[1])], dtype=bool)
        last_cards_value = self.player.get_last_outcards()
        if self.rng.rand() <= self.exploration:
            if not self._comb_mask and self._fine_mask is not None:
                q_values = np.random.rand(self.num_actions[1])
                q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
                act = np.nanargmax(q_values)
                # print(q_values)
                # print(act)
            else:
                act = self.rng.choice(
                    range(self.num_actions[0 if comb_mask else 1]))
        else:
            q_values = self.predictor(old_s[None, :, :, :],
                                      np.array([comb_mask]),
                                      np.array([fine_mask]))[0][0]
            if not self._comb_mask and self._fine_mask is not None:
                q_values = q_values[:self.num_actions[1]]
                assert np.all(q_values[np.where(np.logical_not(
                    self._fine_mask))[0]] < -100)
                q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
            act = np.nanargmax(q_values)
            assert act < self.num_actions[0 if comb_mask else 1]
            # print(q_values)
            # print(act)
            # clamp action to valid range
            act = min(act, self.num_actions[0 if comb_mask else 1] - 1)
        if comb_mask:
            reward = 0
            isOver = False
        else:
            if last_cards_value.size > 0:
                if act > 0:
                    if not CardGroup.to_cardgroup(
                            self._action_space[act]).bigger_than(
                                CardGroup.to_cardgroup(
                                    to_char(last_cards_value))):
                        print('warning, some error happened')
            # print(to_char(self.player.get_curr_handcards()))
            reward, isOver, _ = self.player.step_manual(
                to_value(self._action_space[act]))

            # print(self._action_space[act])

        # step for AI
        while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = self.player.step_auto()
            isOver = (reward != 0)
        # if landlord negate the reward
        if ROLE_ID_TO_TRAIN == 2:
            reward = -reward
        self._current_game_score.feed(reward)

        if isOver:
            # print('lord wins' if reward > 0 else 'farmer wins')
            self._player_scores.feed(self._current_game_score.sum)
            # print(self._current_game_score.sum)
            while True:
                self.player.reset()
                # init_cards = np.arange(36)
                # self.player.prepare_manual(init_cards)
                self.player.prepare()
                self._comb_mask = True
                early_stop = False
                while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                    _, reward, _ = self.player.step_auto()
                    isOver = (reward != 0)
                    if isOver:
                        print('prestart ends too early! now resetting env')
                        early_stop = True
                        break
                if early_stop:
                    continue
                self._current_ob, self._action_space = self.get_state_and_action_spaces()
                break
            self._current_game_score.reset()
        else:
            self._comb_mask = not self._comb_mask
        self._current_ob, self._action_space = self.get_state_and_action_spaces(
            act if not self._comb_mask else None)
        self.mem.append(
            Experience(old_s, act, reward, isOver, comb_mask, fine_mask))
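
The fine-mask branch above pads a short boolean mask with False entries up to the fixed action width; the padding call in isolation:

import numpy as np

fine_mask = np.array([True, True, False])
width = 6
padded = np.pad(fine_mask, (0, width - fine_mask.shape[0]),
                'constant', constant_values=(0, 0))
assert padded.tolist() == [True, True, False, False, False, False]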