def get_combinations(self, curr_cards_char, last_cards_char):
    """Enumerate decompositions of the current hand into action indices.

    Hands with more than 10 cards are solved with
    ``get_combinations_nosplit`` over the augmented action space; smaller
    hands are solved with ``get_combinations_recursive`` over the regular
    action space.

    Args:
        curr_cards_char: cards currently in hand, as card characters.
        last_cards_char: cards that have to be beaten. An empty sequence
            means there is nothing to respond to.

    Returns:
        A tuple ``(combs, fine_mask)``.

        ``combs`` is a list of lists of action indices. When
        ``last_cards_char`` is non-empty, every combination is prefixed with
        index 0 and only combinations containing at least one action that is
        ``bigger_than`` ``last_cards_char`` are kept.

        ``fine_mask`` is ``None`` when ``last_cards_char`` is empty.
        Otherwise it is a boolean array of shape
        ``(len(combs), self.num_actions[1])`` where ``fine_mask[i][j]`` is
        True iff ``combs[i][j]`` is one of the actions that beats
        ``last_cards_char``.
    """
    is_passive = len(last_cards_char) > 0
    # NOTE(review): index 0 is presumably the "pass" action -- it is
    # prepended to every combination only when responding. Confirm.
    prefix = [0] if is_passive else []

    if len(curr_cards_char) > 10:
        card_mask = Card.char2onehot60(curr_cards_char).astype(np.uint8)
        mask = augment_action_space_onehot60
        # A row is invalid as soon as it needs a card the hand does not hold.
        missing = np.expand_dims(1 - card_mask, 0) * mask
        invalid_row_idx = set(np.where(missing > 0)[0])
        if not is_passive:
            invalid_row_idx.add(0)

        valid_row_idx = [i for i in range(len(augment_action_space))
                         if i not in invalid_row_idx]
        mask = mask[valid_row_idx, :]
        idx_mapping = dict(zip(range(mask.shape[0]), valid_row_idx))

        # augment mask
        # TODO: known issue: 555444666 will not decompose into 5554 and 66644
        combs = get_combinations_nosplit(mask, card_mask)
        combs = [prefix + [clamp_action_idx(idx_mapping[idx]) for idx in comb]
                 for comb in combs]

        candidate_idxs = valid_row_idx
        candidate_actions = augment_action_space
    else:
        mask = get_mask_onehot60(curr_cards_char, action_space, None) \
            .reshape(len(action_space), 15, 4).sum(-1).astype(np.uint8)
        valid = mask.sum(-1) > 0
        cards_target = Card.char2onehot60(curr_cards_char) \
            .reshape(-1, 4).sum(-1).astype(np.uint8)

        # do not feed empty to C++, which will cause infinite loop
        combs = get_combinations_recursive(mask[valid, :], cards_target)
        idx_mapping = dict(zip(range(valid.shape[0]), np.where(valid)[0]))
        combs = [prefix + [idx_mapping[idx] for idx in comb] for comb in combs]

        candidate_idxs = []
        if is_passive:
            # Index 0 becomes a candidate only after the solver has run.
            valid[0] = True
            candidate_idxs = [idx for idx in range(len(action_space))
                              if valid[idx]]
        candidate_actions = action_space

    if not is_passive:
        return combs, None

    idx_must_be_contained = set(
        [idx for idx in candidate_idxs
         if CardGroup.to_cardgroup(candidate_actions[idx])
            .bigger_than(CardGroup.to_cardgroup(last_cards_char))])
    combs = [comb for comb in combs
             if not idx_must_be_contained.isdisjoint(comb)]

    # ``np.bool`` was removed in NumPy 1.24; ``np.bool_`` is the supported
    # spelling and produces the same dtype.
    fine_mask = np.zeros([len(combs), self.num_actions[1]], dtype=np.bool_)
    for i, comb in enumerate(combs):
        for j, action_idx in enumerate(comb):
            if action_idx in idx_must_be_contained:
                fine_mask[i][j] = True
    return combs, fine_mask
def dancing_link():
    """Debug driver for the no-split combination solver.

    Runs ``get_combinations_nosplit`` on a fixed hand (``3 3 3 4 4 4``)
    against a fixed last play (``3 3``), then prints the set of beating
    action indices, the fine mask, the elapsed time, the number of
    combinations and each combination's actions.
    """
    import timeit

    env = Pyenv()
    env.reset()
    env.prepare()
    # The environment's hand is fetched but the run uses a fixed hand.
    env.get_handcards()
    cards = ['3', '3', '3', '4', '4', '4']

    started = timeit.default_timer()
    hand_onehot = Card.char2onehot60(cards).astype(np.uint8)
    last_cards = ['3', '3']

    all_rows = augment_action_space_onehot60
    # Positive entries mark rows that need a card missing from the hand.
    overflow = np.expand_dims(1 - hand_onehot, 0) * all_rows
    blocked_rows = set(np.where(overflow > 0)[0])

    # Row 0 is skipped on purpose (the range starts at 1).
    usable_rows = []
    for row in range(1, len(augment_action_space)):
        if row not in blocked_rows:
            usable_rows.append(row)

    def beats_last(row):
        group = CardGroup.to_cardgroup(augment_action_space[row])
        return group.bigger_than(CardGroup.to_cardgroup(last_cards))

    must_contain = set([row for row in usable_rows if beats_last(row)])
    print(must_contain)

    sub_mask = all_rows[usable_rows, :]
    row_of = dict(zip(range(sub_mask.shape[0]), usable_rows))

    # augment mask
    # TODO: known issue: 555444666 will not decompose into 5554 and 66644
    raw_combs = get_combinations_nosplit(
        sub_mask, Card.char2onehot60(cards).astype(np.uint8))
    clamped = [[clamp_action_idx(row_of[k]) for k in comb]
               for comb in raw_combs]
    combs = [comb for comb in clamped if not must_contain.isdisjoint(comb)]

    fine_mask = np.zeros([len(combs), 21])
    for r, comb in enumerate(combs):
        for c, action_idx in enumerate(comb):
            if action_idx in must_contain:
                fine_mask[r][c] = 1
    print(fine_mask)

    finished = timeit.default_timer()
    print(finished - started)
    print(len(combs))
    for comb in combs:
        for action_idx in comb:
            print(action_space[action_idx], end=', ')
        print()
def get_state_prob(self):
    """Return the probabilistic card state seen by the current player.

    Starts from a 60-slot vector of ones with slots 53-55 and 57-59
    cleared, subtracts the one-hot encoding of the current hand plus all
    three players' play histories, and splits what remains between the two
    other players in proportion to their card counts.

    Returns:
        The concatenation of the next player's share followed by the
        next-next player's share (two 60-slot vectors).
    """
    deck = np.ones([60])
    for lo, hi in ((53, 56), (57, 60)):
        deck[lo:hi] = 0

    me = self.get_current_idx()
    names = self.agent_names

    # NOTE(review): the history lookups wrap with a literal 3 while the
    # card-count lookups below wrap with len(self.agent_names).
    seen = self.get_curr_handcards()
    seen = seen + self.histories[names[me]]
    seen = seen + self.histories[names[(me + 1) % 3]]
    seen = seen + self.histories[names[(me + 2) % 3]]
    unseen = deck - Card.char2onehot60(seen)

    # sanity check
    # remain_cards_check = Card.char2onehot60(self.player_cards[self.agent_names[(player_idx + 1) % 3]] + self.player_cards[self.agent_names[(player_idx + 2) % 3]])
    # remain_cards_cp = remain_cards.copy()
    # normalize(remain_cards_cp, 0, 60)
    # assert np.all(remain_cards_cp == remain_cards_check)
    n_players = len(names)
    right_cnt = len(self.player_cards[names[(me + 1) % n_players]])
    left_cnt = len(self.player_cards[names[(me + 2) % n_players]])

    right_share = unseen * (right_cnt / (right_cnt + left_cnt))
    left_share = unseen * (left_cnt / (right_cnt + left_cnt))
    return np.concatenate([right_share, left_share])
def get_data(self):
    """Yield an endless stream of randomly chosen one-hot encoded actions.

    Every action in ``action_space`` is encoded once with
    ``Card.char2onehot60``. Each yielded item is a one-element list
    holding the encoding picked by ``self.rng.randint``.
    """
    encoded_actions = list(map(Card.char2onehot60, action_space))
    total = len(encoded_actions)
    while True:
        pick = self.rng.randint(0, total)
        yield [encoded_actions[pick]]
def recursive():
    """Debug driver for the recursive combination solver.

    Runs ``get_combinations_recursive`` once on a fixed hand
    (``J 10 10 7 7 6``), prints the intermediate arrays and the solver
    output, records the elapsed time in a ``StatCounter``, drops into
    ``pdb``, then prints every combination and the average time.
    """
    import pdb
    import timeit

    env = Pyenv()
    timer_stats = StatCounter()
    for _ in range(1):
        env.reset()
        env.prepare()
        # The environment's hand is fetched and sliced, but the run below
        # uses a fixed hand instead.
        env.get_curr_handcards()[:15]
        cards = ['J', '10', '10', '7', '7', '6']

        per_action = get_mask_onehot60(cards, action_space, None)
        per_action = per_action.reshape(len(action_space), 15, 4)
        mask = per_action.sum(-1).astype(np.uint8)
        valid = mask.sum(-1) > 0
        cards_target = Card.char2onehot60(cards).reshape(-1, 4).sum(-1)
        cards_target = cards_target.astype(np.uint8)

        t_start = timeit.default_timer()
        print(cards_target)
        print(mask[valid])
        combs = get_combinations_recursive(mask[valid, :], cards_target)
        print(combs)

        # Translate solver row numbers back to action-space indices.
        to_action_idx = dict(zip(range(valid.shape[0]), np.where(valid)[0]))
        combs = [[to_action_idx[k] for k in comb] for comb in combs]
        t_stop = timeit.default_timer()
        timer_stats.feed(t_stop - t_start)

        print(len(combs))
        pdb.set_trace()
        for comb in combs:
            for action_idx in comb:
                print(action_space[action_idx], end=', ')
            print()
    print(timer_stats.average)
def __init__(self, input_names, output_names):
    """Store the tensor names and pre-encode the whole action space.

    Args:
        input_names: stored unchanged as ``self.input_names``.
        output_names: stored unchanged as ``self.output_names``.
    """
    encoded_actions = list(map(Card.char2onehot60, action_space))
    self.action_space_onehot = np.array(encoded_actions)
    self.input_names = input_names
    self.output_names = output_names
def cards_char2embedding(cards_char):
    """Return the ``encoding`` row of the action equal to ``cards_char``.

    The cards are one-hot encoded and compared against every row of
    ``action_space_onehot60``; the first fully matching row selects the
    entry of ``encoding`` that is returned.

    Raises:
        IndexError: if no row of ``action_space_onehot60`` matches.
    """
    query = Card.char2onehot60(cards_char)
    row_matches = np.all(action_space_onehot60 == query, axis=1)
    hits = np.where(row_matches)[0]
    first_hit = hits[0]
    return encoding[first_hit]
def data_generator(rng):
    """Yield supervised training samples from endless auto-played games.

    Each sample is an 11-tuple::

        (state, last_cards, passive_decision_target, passive_bomb_target,
         passive_response_target, active_decision_target,
         active_response_target, seq_length_target, minor_response_target,
         minor_type, mode)

    ``mode`` (0-6) names the single target slot that carries a label for
    that sample; all other target slots are 0.

    Args:
        rng: random generator; ``rng.randint(1 << 31)`` seeds the ``Env``.
    """

    def sample(state, last_out_cards, mode, value, minor_type=0):
        # Target slot ``mode`` receives ``value``; the other six stay 0.
        targets = [0] * 7
        targets[mode] = value
        return (state, last_out_cards, *targets, minor_type, mode)

    env = Env(rng.randint(1 << 31))
    # logger.info('called')
    while True:
        env.reset()
        env.prepare()
        r = 0
        while r == 0:
            last_cards_value = env.get_last_outcards()
            last_cards_char = to_char(last_cards_value)
            last_out_cards = Card.val2onehot60(last_cards_value)
            last_category_idx = env.get_last_outcategory_idx()
            curr_cards_char = to_char(env.get_curr_handcards())
            is_active = last_cards_value.size == 0

            s = env.get_state_prob()
            # s = s[:60]
            intention, r, category_idx = env.step_auto()

            if category_idx == 14:
                continue
            minor_cards_targets = pick_minor_targets(category_idx,
                                                     to_char(intention))

            if is_active:
                seq_length = get_seq_length(category_idx, intention)
                # ACTIVE OFFSET ONE!
                decision_label = category_idx - 1
                response_label = intention[0] - 3
                yield sample(s, last_out_cards, 3, decision_label)
                yield sample(s, last_out_cards, 4, response_label)
                if seq_length is not None:
                    # length offset one
                    yield sample(s, last_out_cards, 5, seq_length - 1)
            elif (category_idx == Category.QUADRIC.value
                    and category_idx != last_category_idx):
                bomb_label = intention[0] - 3
                yield sample(s, last_out_cards, 0, 1)
                yield sample(s, last_out_cards, 1, bomb_label)
            elif category_idx == Category.BIGBANG.value:
                yield sample(s, last_out_cards, 0, 2)
            elif category_idx != Category.EMPTY.value:
                # OFFSET_ONE
                # 1st, Feb - remove relative card output since shift is hard for the network to learn
                response_label = intention[0] - 3
                if response_label < 0:
                    print("something bad happens")
                    response_label = 0
                yield sample(s, last_out_cards, 0, 3)
                yield sample(s, last_out_cards, 2, response_label)
            else:
                yield sample(s, last_out_cards, 0, 0)

            if minor_cards_targets is None:
                continue

            main_cards = pick_main_cards(category_idx, to_char(intention))
            handcards = curr_cards_char.copy()
            state = s.copy()
            for main_card in main_cards:
                handcards.remove(main_card)
            # we must make the order in each 4 batch correct...
            discard_onehot_from_s_60(state, Card.char2onehot60(main_cards))

            is_pair = category_idx in (Category.THREE_TWO.value,
                                       Category.THREE_TWO_LINE.value)
            minor_type = 1 if is_pair else 0

            for target in minor_cards_targets:
                target_val = Card.char2value_3_17(target) - 3
                yield sample(state.copy(), last_out_cards, 6, target_val,
                             minor_type)
                cards = [target]
                handcards.remove(target)
                if is_pair:
                    if target in handcards:
                        handcards.remove(target)
                        cards.append(target)
                    else:
                        print('something wrong...')
                        print('minor', target)
                        print('main_cards', main_cards)
                        print('handcards', handcards)
                        print('intention', intention)
                        print('category_idx', category_idx)
                # correct for one-hot state
                discard_onehot_from_s_60(state, Card.char2onehot60(cards))
def play_one_episode(env, func):
    """Auto-play one game and score ``func``'s predictions against it.

    At every step the environment's own move (``env.step_auto``) is the
    label. ``func`` is called with ``[state, last_cards, minor_type]`` and
    must return seven outputs; the argmax of each relevant output is
    compared with the label.

    Args:
        env: game environment, reset and prepared by this function.
        func: predictor returning, in order, passive decision, passive
            bomb, passive response, active decision, active response,
            sequence length and minor response probabilities.

    Returns:
        A list of seven ``StatCounter`` objects, indexed like ``func``'s
        outputs, each fed 1 for a correct prediction and 0 otherwise.
    """
    PASSIVE_DECISION, PASSIVE_BOMB, PASSIVE_RESPONSE = 0, 1, 2
    ACTIVE_DECISION, ACTIVE_RESPONSE, SEQ_LENGTH, MINOR_RESPONSE = 3, 4, 5, 6

    env.reset()
    env.prepare()
    r = 0
    stats = [StatCounter() for _ in range(7)]
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_out_cards = Card.val2onehot60(last_cards_value)
        last_category_idx = env.get_last_outcategory_idx()
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = last_cards_value.size == 0

        s = env.get_state_prob()
        intention, r, category_idx = env.step_auto()

        if category_idx == 14:
            continue
        minor_cards_targets = pick_minor_targets(category_idx,
                                                 to_char(intention))

        # Map output head -> label for every head that applies this step.
        labels = {}
        if is_active:
            seq_length = get_seq_length(category_idx, intention)
            # ACTIVE OFFSET ONE!
            labels[ACTIVE_DECISION] = category_idx - 1
            labels[ACTIVE_RESPONSE] = intention[0] - 3
            if seq_length is not None:
                # length offset one
                labels[SEQ_LENGTH] = seq_length - 1
        elif (category_idx == Category.QUADRIC.value
                and category_idx != last_category_idx):
            labels[PASSIVE_DECISION] = 1
            labels[PASSIVE_BOMB] = intention[0] - 3
        elif category_idx == Category.BIGBANG.value:
            labels[PASSIVE_DECISION] = 2
        elif category_idx != Category.EMPTY.value:
            labels[PASSIVE_DECISION] = 3
            # OFFSET_ONE
            # 1st, Feb - remove relative card output since shift is hard for the network to learn
            passive_response_input = intention[0] - 3
            if passive_response_input < 0:
                print("something bad happens")
                passive_response_input = 0
            labels[PASSIVE_RESPONSE] = passive_response_input
        else:
            labels[PASSIVE_DECISION] = 0

        # Every branch above needs exactly one forward pass on the same
        # inputs, so it is issued once here.
        # NOTE(review): the minor-type input has length s.shape[0] here but
        # length 1 in the minor-card call below -- confirm this is intended.
        probs = func([
            s.reshape(1, -1),
            last_out_cards.reshape(1, -1),
            np.zeros([s.shape[0]])
        ])
        for head, label in labels.items():
            stats[head].feed(int(label == np.argmax(probs[head])))

        if minor_cards_targets is None:
            continue

        main_cards = pick_main_cards(category_idx, to_char(intention))
        handcards = curr_cards_char.copy()
        state = s.copy()
        for main_card in main_cards:
            handcards.remove(main_card)
        # we must make the order in each 4 batch correct...
        discard_onehot_from_s_60(state, Card.char2onehot60(main_cards))

        is_pair = category_idx in (Category.THREE_TWO.value,
                                   Category.THREE_TWO_LINE.value)
        minor_type = 1 if is_pair else 0

        for target in minor_cards_targets:
            target_val = Card.char2value_3_17(target) - 3
            minor_response_prob = func([
                state.copy().reshape(1, -1),
                last_out_cards.reshape(1, -1),
                np.array([minor_type])
            ])[MINOR_RESPONSE]
            stats[MINOR_RESPONSE].feed(
                int(target_val == np.argmax(minor_response_prob)))

            cards = [target]
            handcards.remove(target)
            if is_pair:
                if target not in handcards:
                    # Values go through %s placeholders: passing them as
                    # bare extra arguments makes logging report a format
                    # error instead of emitting the message.
                    logger.warning('something wrong...')
                    logger.warning('minor %s', target)
                    logger.warning('main_cards %s', main_cards)
                    logger.warning('handcards %s', handcards)
                else:
                    handcards.remove(target)
                    cards.append(target)
            # correct for one-hot state
            discard_onehot_from_s_60(state, Card.char2onehot60(cards))
    return stats
def run(self):
    """Main loop of the simulator.

    Connects ZeroMQ sockets to the coordinator, the experience receivers
    and the manager, then loops forever: fetch the current screen through
    the manager, detect which buttons are visible, and act on them
    according to ``self.state`` (CALLING or PLAYING). While playing, the
    predictor chooses the cards to play, transitions are pushed to the
    experience socket selected by ``self.current_lord_pos``, and the
    chosen cards are clicked through the manager.

    The loop idles while ``self.toggle.value`` is 0 and never returns.

    NOTE(review): SOURCE had its indentation collapsed; the nesting below
    was reconstructed from data flow and should be checked against the
    original file.
    """
    logger.info('simulator main loop')
    context = zmq.Context()

    # simulator -> coordinator (PUSH) and coordinator -> simulator (DEALER)
    sim2coord_socket = context.socket(zmq.PUSH)
    sim2coord_socket.setsockopt(zmq.IDENTITY, self.name.encode('utf-8'))
    sim2coord_socket.set_hwm(2)
    sim2coord_socket.connect(self.sim2coord)

    coord2sim_socket = context.socket(zmq.DEALER)
    coord2sim_socket.setsockopt(zmq.IDENTITY, self.name.encode('utf-8'))
    coord2sim_socket.set_hwm(2)
    coord2sim_socket.connect(self.coord2sim)

    # one PUSH socket per experience endpoint in self.sim2exps
    sim2exp_sockets = []
    for sim2exp in self.sim2exps:
        sim2exp_socket = context.socket(zmq.PUSH)
        sim2exp_socket.setsockopt(zmq.IDENTITY, self.name.encode('utf-8'))
        sim2exp_socket.set_hwm(2)
        sim2exp_socket.connect(sim2exp)
        sim2exp_sockets.append(sim2exp_socket)

    # simulator -> manager (PUSH) and manager -> simulator (DEALER)
    sim2mgr_socket = context.socket(zmq.PUSH)
    sim2mgr_socket.setsockopt(zmq.IDENTITY, self.name.encode('utf-8'))
    sim2mgr_socket.set_hwm(2)
    sim2mgr_socket.connect(self.sim2mgr)

    mgr2sim_socket = context.socket(zmq.DEALER)
    mgr2sim_socket.setsockopt(zmq.IDENTITY, self.name.encode('utf-8'))
    mgr2sim_socket.set_hwm(2)
    mgr2sim_socket.connect(self.mgr2sim)

    # while True:
    #     time.sleep(0.3)
    #     print(self.name)
    #     sim2exp_sockets[1].send(dumps([self.name, 'haha']))

    # print('main loop')
    # while True:
    #     time.sleep(0.3)
    #     msg = loads(coord2sim_socket.recv(copy=False).bytes)
    #     print(msg)
    #     sim2coord_socket.send(dumps([self.name, self.agent_names[0], np.arange(10)]))

    # Each request_* helper sends one message to the manager and blocks
    # until the manager's reply arrives.
    def request_screen():
        """Ask the manager for a screen capture and return its reply."""
        sim2mgr_socket.send(dumps([self.name, SimulatorManager.MSG_TYPE.SCREEN, []]))
        return loads(mgr2sim_socket.recv(copy=False).bytes)

    def request_click(bbox):
        """Ask the manager to click the centre of ``bbox``.

        The centre is shifted by ``self.window_rect`` plus a fixed (6, 46)
        offset (presumably the window border and title bar -- confirm).
        """
        sim2mgr_socket.send(dumps([self.name, SimulatorManager.MSG_TYPE.CLICK,
                                   [(bbox[0] + bbox[2]) // 2 + self.window_rect[0] + 6,
                                    (bbox[1] + bbox[3]) // 2 + self.window_rect[1] + 46]]))
        return loads(mgr2sim_socket.recv(copy=False).bytes)

    def request_lock():
        """Send a LOCK message to the manager and return its reply."""
        sim2mgr_socket.send(dumps([self.name, SimulatorManager.MSG_TYPE.LOCK, []]))
        return loads(mgr2sim_socket.recv(copy=False).bytes)

    def request_unlock():
        """Send an UNLOCK message to the manager and return its reply."""
        sim2mgr_socket.send(dumps([self.name, SimulatorManager.MSG_TYPE.UNLOCK, []]))
        return loads(mgr2sim_socket.recv(copy=False).bytes)

    def spin_lock_on_button():
        """Poll the screen until a button is detected or the toggle is off.

        Returns the detected button dict; it may be empty when the loop
        was left because ``self.toggle.value`` became 0.
        """
        act = dict()
        while not act:
            self.current_screen = request_screen()
            # the latest capture is written to disk on every poll
            cv2.imwrite('debug.png', self.current_screen)
            act = get_current_button_action(self.current_screen)
            if self.toggle.value == 0:
                break
        return act

    def discard(act, bboxes, idxs):
        """Click card slots until ``diff`` reports none left, then play.

        Not called from the main loop below.
        """
        def diff(idxs, cards):
            # Slots to click: detected cards whose index is in ``idxs``
            # and undetected (None) slots whose index is not.
            res = []
            for i in range(len(cards)):
                if cards[i] is not None:
                    if i in idxs:
                        res.append(i)
                else:
                    if i not in idxs:
                        res.append(i)
            return res

        differences = diff(idxs, get_cards_bboxes(request_screen(), self.templates, bboxes=bboxes)[0])
        print(differences)
        request_lock()
        while len(differences) > 0:
            for d in differences:
                request_click(bboxes[d])
            # request_click(bboxes[differences[0]])
            # time.sleep(0.3)
            differences = diff(idxs, get_cards_bboxes(request_screen(), self.templates, bboxes=bboxes)[0])
            print(differences)
        # click whichever "play" button variant is currently shown
        if 'chupai' in act:
            request_click(act['chupai'])
        elif 'alone_chupai' in act:
            request_click(act['alone_chupai'])
        elif 'ming_chupai' in act:
            request_click(act['ming_chupai'])
        request_unlock()

    game_cnt = 0
    while True:
        import psutil
        # print('memory usage is: ', psutil.virtual_memory())
        if self.toggle.value == 0:
            # simulator switched off: idle and poll the toggle again
            time.sleep(0.2)
            continue
        print('new round')
        self.current_screen = request_screen()
        act = spin_lock_on_button()
        if not act:
            continue
        print(act)
        if 'start' in act:
            request_click(act['start'])
            continue
        if self.state == Simulator.State.CALLING:
            # state has changed
            if 'reverse' in act:
                self.state = Simulator.State.PLAYING
                self.current_lord_pos = who_is_lord(self.current_screen)
                # retry on fresh captures while who_is_lord returns a
                # negative position
                while self.current_lord_pos < 0:
                    self.current_screen = request_screen()
                    self.current_lord_pos = who_is_lord(self.current_screen)
                    print('current lord pos ', self.current_lord_pos)
                    if self.toggle.value == 0:
                        break
                continue
            if 'continuous defeat' in act:
                request_click(act['continuous defeat'])
                continue
            print('calling', act)
            handcards, _ = get_cards_bboxes(self.current_screen, self.templates, 0)
            cards_value, _ = CEnv.get_cards_value(Card.char2color(handcards))
            print('cards value: ', cards_value)
            # assert 'jiaodizhu' in act
            # hand value below 10 clicks 'bujiao', otherwise 'jiaodizhu'
            request_click(act['bujiao']) if cards_value < 10 else request_click(act['jiaodizhu'])
        elif self.state == Simulator.State.PLAYING:
            if 'defeat' in act or 'victory' in act:
                request_click(act['defeat'] if 'defeat' in act else act['victory'])
                if self.cached_msg is None:
                    print('other player wins in one step!!!')
                    continue
                win = is_win(self.current_screen)
                state, action, fine_mask = self.cached_msg
                # terminal transition: the fourth field is True and the
                # third field is +1 on a win, -1 otherwise
                if win:
                    sim2exp_sockets[self.current_lord_pos].send(dumps([[state, state], action, 1, True, False, [fine_mask, fine_mask]]))
                    self.win_rates[self.agent_names[self.current_lord_pos]].feed(1.)
                else:
                    sim2exp_sockets[self.current_lord_pos].send(dumps([[state, state], action, -1, True, False, [fine_mask, fine_mask]]))
                    self.win_rates[self.agent_names[self.current_lord_pos]].feed(0.)

                game_cnt += 1
                # log and reset the win rates every 100 finished games
                if game_cnt % 100 == 0:
                    for agent in self.agent_names:
                        if self.win_rates[agent].count > 0:
                            logger.info('[last-100]{} win rate: {}'.format(agent, self.win_rates[agent].average))
                            self.win_rates[agent].reset()

                self.reset_episode()
                continue
            # test if we have cached msg not sent
            print('playing', act)
            left_cards, _ = get_cards_bboxes(self.current_screen, self.mini_templates, 1)
            right_cards, _ = get_cards_bboxes(self.current_screen, self.mini_templates, 2)
            # a None entry in either list: click 'buchu' and retry later
            if None in left_cards or None in right_cards:
                request_click(act['buchu'])
                time.sleep(1.)
                continue
            assert None not in left_cards
            assert None not in right_cards
            # history[1] accumulates the right player's cards, history[2]
            # the left player's, history[0] our own (extended below)
            self.history[1].extend(right_cards)
            self.history[2].extend(left_cards)
            # last_cards = left_cards
            # if not left_cards:
            #     last_cards = right_cards
            # print('last cards', last_cards)
            total_cards = np.ones([60])
            total_cards[53:56] = 0
            total_cards[57:60] = 0
            handcards, bboxes = get_cards_bboxes(self.current_screen, self.templates, 0)
            handcards = [card for card in handcards if card is not None]
            # cards not in our hand and not in any recorded history
            remain_cards = total_cards - Card.char2onehot60(handcards + self.history[0] + self.history[1] + self.history[2])
            print('current handcards: ', handcards)
            # left_cnt, right_cnt = get_opponent_cnts(self.current_screen, self.tiny_templates)
            # print('left cnt: ', left_cnt, 'right cnt: ', right_cnt)
            # remaining counts derived from 17 starting cards, plus 3 for
            # the side selected by current_lord_pos
            left_cnt = 17 - len(self.history[2])
            right_cnt = 17 - len(self.history[1])
            if self.current_lord_pos == 1:
                left_cnt += 3
            if self.current_lord_pos == 2:
                right_cnt += 3
            # assert left_cnt > 0 and right_cnt > 0
            # to be the same as C++ side, right comes before left

            right_prob_state = remain_cards * (right_cnt / (left_cnt + right_cnt))
            left_prob_state = remain_cards * (left_cnt / (left_cnt + right_cnt))
            prob_state = np.concatenate([right_prob_state, left_prob_state])
            # assert prob_state.size == 120
            # assert np.all(prob_state < 1.) and np.all(prob_state >= 0.)
            # print(prob_state)
            intention, buffer_comb, buffer_fine = self.predictor.predict(handcards, [left_cards, right_cards], prob_state, self, sim2coord_socket, coord2sim_socket)
            # the previous step's cached message, if any, is sent now that
            # the next state (buffer_comb[0]) is available
            if self.cached_msg is not None:
                state, action, fine_mask = self.cached_msg
                sim2exp_sockets[self.current_lord_pos].send(
                    dumps([[state, buffer_comb[0]], action, 0, False, False, [fine_mask, buffer_comb[2]]]))

            sim2exp_sockets[self.current_lord_pos].send(
                dumps([[buffer_comb[0], buffer_fine[0]], buffer_comb[1], 0, False, True, [buffer_comb[2], buffer_fine[2]]]))
            self.cached_msg = buffer_fine

            self.history[0].extend(intention)
            print('intention is: ', intention)
            intention.sort(key=lambda k: Card.cards_to_value[k])
            if len(intention) == 0:
                request_click(act['buchu'])
            else:
                # two-pointer walk matching the sorted intention against
                # the hand; assumes handcards is in the same order and
                # contains every intended card -- TODO confirm
                i = 0
                j = 0
                to_click = []
                to_click_idxs = []
                while j < len(intention):
                    if handcards[i] == intention[j]:
                        to_click_idxs.append(i)
                        to_click.append(bboxes[i])
                        i += 1
                        j += 1
                    else:
                        i += 1
                for bbox in to_click:
                    request_click(bbox)
                    time.sleep(0.5)
                # fixed screen position clicked after the cards are
                # selected (presumably the play button -- confirm)
                request_click([1310, 760, 1310, 760])
            time.sleep(1.)
def play_one_episode(env, func):
    """Play one game, letting ``func`` act for the roles being trained.

    For roles in ``ROLE_IDS_TO_TRAIN`` the action is the argmax of the
    network output restricted to legal actions; every other role is played
    by ``env.step_auto``. A flat LSTM state of size 2048 is threaded
    through successive ``func`` calls.

    Returns:
        ``int(r > 0)`` for the reward ``r`` that ended the game.
    """

    def take_action_from_prob(prob, mask):
        scores = prob[0]
        # to avoid numeric difficulty
        scores[mask == 0] = -1
        return np.argmax(scores)

    env.reset()
    # init_cards = np.arange(52)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    # env.prepare_manual(init_cards)
    env.prepare()

    r = 0
    lstm_state = np.zeros([1024 * 2])
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_two_cards = env.get_last_two_cards()
        last_two_cards_onehot = np.concatenate(
            [Card.val2onehot60(last_two_cards[k]) for k in (0, 1)])
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = last_cards_value.size == 0

        s = env.get_state_prob()
        s = np.concatenate([Card.char2onehot60(curr_cards_char), s])
        # print(s.shape)

        role_id = env.get_role_ID()
        # print('%s current cards' % ('lord' if role_id == 2 else 'farmer'), curr_cards_char)

        if role_id not in ROLE_IDS_TO_TRAIN:
            intention, r, _ = env.step_auto()
            # print('farmer gives', to_char(intention))
            continue

        role_input = np.array([role_id])
        if is_active:
            # first get mask
            mask = get_mask(curr_cards_char, action_space, None)
            # not valid for active
            mask[0] = 0
            chosen_prob, _, lstm_state = func(
                role_input, s.reshape(1, -1), np.zeros([1, 120]),
                lstm_state.reshape(1, -1))
        else:
            # print('last cards char', last_cards_char)
            mask = get_mask(curr_cards_char, action_space, last_cards_char)
            _, chosen_prob, lstm_state = func(
                role_input, s.reshape(1, -1),
                last_two_cards_onehot.reshape(1, -1),
                lstm_state.reshape(1, -1))

        # make decision depending on output
        action_idx = take_action_from_prob(chosen_prob, mask)

        # since step auto needs full last card group info, we do not explicitly feed card type
        intention = to_value(action_space[action_idx])
        r, _, _ = env.step_manual(intention)
        # print('lord gives', to_char(intention))
        assert (intention is not None)
    # if r > 0:
    #     print('farmer wins')
    # else:
    #     print('lord wins')
    return int(r > 0)