def set_flags(self, flag, cond):
    """Apply the flag updates attached to the outcome of a flag check.

    ``flag`` may carry a ``'true'`` and/or ``'false'`` list of update
    entries; the list matching the truthiness of ``cond`` is applied to
    ``self.rule_flags``. Each entry is a mapping with ``'name'``,
    ``'value'`` and ``'op'`` keys. Entries with an empty name are skipped.

    Raises:
        Exception: if an entry with a non-empty name uses an unknown
            operator.
    """
    branch = 'true' if cond else 'false'
    updates = flag.get(branch, None)
    if updates is None:
        return

    for entry in updates:
        target = entry['name']
        operand = entry['value']
        op = entry['op'].lower()

        if not target:
            # Nameless entries are ignored, whatever their operator is.
            continue

        if op in ('set', '='):
            self.rule_flags[target] = operand
        elif op in ('decr', '-=', 'incr', '+='):
            current = to_value(self.rule_flags[target])
            delta = to_value(operand)
            if op in ('decr', '-='):
                updated = current - delta
            else:
                updated = current + delta
            # Arithmetic results are stored back as strings.
            self.rule_flags[target] = str(updated)
        else:
            raise Exception(f"Unknown flag operator: '{op}'")
def intention(self, env):
    """Choose the current player's move with ``mcsearch``.

    Reads the current hand, both opponents' hands and the last played
    cards from ``env``, converts them to the C-side card types, runs
    ``mcsearch`` and returns the chosen cards as a list of card chars.
    """
    names = env.agent_names

    hand_chars = env.get_curr_handcards()
    own_ccards = [CCard(to_value(ch) - 3) for ch in hand_chars]

    seat = env.get_current_idx()
    next_name = names[(seat + 1) % 3]
    after_next_name = names[(seat + 2) % 3]
    hidden_chars = env.player_cards[next_name] + env.player_cards[after_next_name]
    hidden_ccards = [CCard(to_value(ch) - 3) for ch in hidden_chars]
    next_hand_size = len(env.player_cards[next_name])

    # Wrap the last played cards into a C-side card group.
    last_group = CardGroup.to_cardgroup(env.get_last_outcards())
    last_cg = CCardGroup(
        [CCard(to_value(ch) - 3) for ch in last_group.cards],
        CCategory(last_group.type),
        last_group.value,
        last_group.len,
    )

    # Seats are passed to the search relative to the lord's seat.
    current_offset = (names.index(env.curr_player) - names.index(env.lord) + 3) % 3
    controller_offset = (names.index(env.controller) - names.index(env.lord) + 3) % 3

    chosen = mcsearch(own_ccards, hidden_ccards, next_hand_size, last_cg,
                      current_offset, controller_offset, 10, 50, 500)
    return [to_char(int(card) + 3) for card in chosen.cards]
def step_auto(self):
    """Let ``mcsearch`` pick the current player's move and play it.

    Gathers the current hand, both opponents' hands and the last played
    cards, converts them to the C-side card types, runs ``mcsearch`` and
    forwards the chosen cards to ``self.step``.

    Returns:
        Whatever ``self.step`` returns for the chosen move.
    """
    names = self.agent_names

    hand_chars = self.get_curr_handcards()
    own_ccards = [CCard(to_value(ch) - 3) for ch in hand_chars]

    seat = self.get_current_idx()
    next_name = names[(seat + 1) % 3]
    after_next_name = names[(seat + 2) % 3]
    hidden_chars = self.player_cards[next_name] + self.player_cards[after_next_name]
    hidden_ccards = [CCard(to_value(ch) - 3) for ch in hidden_chars]
    next_hand_size = len(self.player_cards[next_name])

    # Wrap the last played cards into a C-side card group.
    last_group = CardGroup.to_cardgroup(self.get_last_outcards())
    last_cg = CCardGroup(
        [CCard(to_value(ch) - 3) for ch in last_group.cards],
        CCategory(last_group.type),
        last_group.value,
        last_group.len,
    )

    # Seats are passed to the search relative to the lord's seat.
    current_offset = (names.index(self.curr_player) - names.index(self.lord) + 3) % 3
    controller_offset = (names.index(self.controller) - names.index(self.lord) + 3) % 3

    chosen = mcsearch(own_ccards, hidden_ccards, next_hand_size, last_cg,
                      current_offset, controller_offset, 10, 50, 500)
    move_chars = [to_char(int(card) + 3) for card in chosen.cards]
    return self.step(move_chars)
def step(self, intention):
    """Play ``intention`` for the current player and track the controller.

    The acting seat is captured before the move is applied. When the
    category reported by ``step_manual`` is positive, that seat's agent
    becomes ``self.controller``.

    Returns:
        The ``(r, done)`` pair reported by ``step_manual``.
    """
    acting_idx = self.get_current_idx()
    outcome = self.step_manual(to_value(intention))
    r, done, category = outcome
    if category > 0:
        self.controller = self.agent_names[acting_idx]
    return r, done
def evaluate_flag(self, flag):
    """Evaluate a flag check, including one optional nested check.

    ``flag`` must provide ``'name'``, ``'value'`` and ``'condition'``.
    The stored value of the named flag (empty string when absent) is
    compared with ``'value'`` using the given condition. An empty name
    always evaluates to ``True``. A nested check under ``'and'`` or
    ``'or'`` is always evaluated (no short-circuit) and combined with the
    result. ``self.set_flags`` is then called with the final result.

    Raises:
        Exception: if a required key is missing.
        SyntaxError: on an unknown condition, or when both ``'and'`` and
            ``'or'`` are present.
    """
    try:
        flag_name = flag['name']
        wanted = flag['value']
        condition = flag['condition']
    except KeyError as error:
        raise Exception(f"Missing key in flagCheck: '{error}'")

    key = condition.lower()
    actual = self.rule_flags.get(flag_name, "")

    # Symbolic spellings map to their word form.
    aliases = {
        '==': 'equals', '!=': 'notequals',
        '@': 'contains', '!@': 'notcontains',
        '~': 'search', '!~': 'notsearch',
        '<': 'lessthan', '<=': 'lessthanequals',
        '>': 'greaterthan', '>=': 'greaterthanequals',
    }
    # Lazily evaluated so only the selected comparison runs.
    checks = {
        'equals': lambda: actual == wanted,
        'notequals': lambda: actual != wanted,
        'contains': lambda: wanted in actual,
        'notcontains': lambda: wanted not in actual,
        'search': lambda: re.search(wanted, actual) is not None,
        'notsearch': lambda: re.search(wanted, actual) is None,
        'lessthan': lambda: to_value(actual) < to_value(wanted),
        'lessthanequals': lambda: to_value(actual) <= to_value(wanted),
        'greaterthan': lambda: to_value(actual) > to_value(wanted),
        'greaterthanequals': lambda: to_value(actual) >= to_value(wanted),
    }

    if not flag_name:
        # The operator is not validated when the name is empty.
        outcome = True
    else:
        canonical = aliases.get(key, key)
        if canonical not in checks:
            raise SyntaxError(f"Unknown flag condition operator: '{condition}'")
        outcome = checks[canonical]()

    has_and = 'and' in flag
    has_or = 'or' in flag
    if has_and and has_or:
        raise SyntaxError("flag only allows either 'and' or 'or', not both")
    if has_and:
        nested = self.evaluate_flag(flag['and'])
        outcome = outcome and nested
    elif has_or:
        nested = self.evaluate_flag(flag['or'])
        outcome = outcome or nested

    self.set_flags(flag, outcome)
    return outcome
def check_criteria(self, action):
    """Check the optional ``'addon'`` criterion of an action against the page.

    The criterion names an element by XPath (key ``'xpath'`` or
    ``'elementFinder'``), an expected ``'value'`` and a ``'condition'``.
    The element's ``value`` attribute is used for ``input`` tags and its
    text for every other tag.

    Returns:
        ``True`` when no XPath is configured; ``False`` when the element
        cannot be found; otherwise the result of the comparison.

    Raises:
        Exception: if a required key is missing (chained to the original
            ``KeyError``) or the condition operator is unknown.
    """
    try:
        criterion = action['addon']
        xpath = dict_gets(criterion, ('xpath', 'elementFinder'))
        if not xpath:
            return True
        uv = criterion['value']
        operator = criterion['condition']
    except KeyError as error:
        raise Exception(f"Missing key in addon: '{error}'") from error

    try:
        elem = self.driver.find_element_by_xpath(xpath)
        # Read the tag once; the former separate 'label' branch was
        # identical to the fallback and cost a second property lookup.
        if elem.tag_name == 'input':
            ev = elem.get_attribute('value')
        else:
            ev = elem.text

        op = operator.lower()
        if op in ('equals', '=='):
            result = ev == uv
        elif op in ('notequals', '!='):
            result = ev != uv
        elif op in ('contains', '@'):
            result = uv in ev
        elif op in ('notcontains', '!@'):
            result = uv not in ev
        elif op in ('search', '~'):
            result = re.search(uv, ev) is not None
        elif op in ('notsearch', '!~'):
            result = re.search(uv, ev) is None
        elif op in ('lessthan', '<'):
            result = to_value(ev) < to_value(uv)
        elif op in ('lessthanequals', '<='):
            result = to_value(ev) <= to_value(uv)
        elif op in ('greaterthan', '>'):
            result = to_value(ev) > to_value(uv)
        elif op in ('greaterthanequals', '>='):
            result = to_value(ev) >= to_value(uv)
        else:
            raise Exception(f"Unknown condition operator: '{operator}'")
    except NoSuchElementException:
        # A missing element means the criterion is not met.
        result = False
    return result
def _populate_exp(self):
    """Populate one transition into the replay memory by epsilon-greedy.

    Picks an action (random with probability ``self.exploration``,
    otherwise the best legal Q-value), plays it, lets the other roles
    play via ``step_auto`` until it is the trained role's turn again or
    the game ends, and appends ``Experience(old_s, act, reward, isOver)``
    to ``self.mem``. When the game ends the environment is reset and
    advanced to the trained role's first turn.
    """
    old_s = self._current_ob
    if self.rng.rand() <= self.exploration:
        act = self.rng.choice(range(self.num_actions))
    else:
        mask = get_mask(to_char(self.player.get_curr_handcards()),
                        action_space,
                        to_char(self.player.get_last_outcards()))
        q_values = self.predictor(old_s[None, ...])[0][0]
        # Illegal actions are excluded from the argmax.
        q_values[mask == 0] = np.nan
        act = np.nanargmax(q_values)
        assert act < self.num_actions

    reward, isOver, _ = self.player.step_manual(to_value(action_space[act]))

    # Let the other roles play until it is our turn again or the game ends.
    while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
        _, reward, _ = self.player.step_auto()
        isOver = (reward != 0)
    if ROLE_ID_TO_TRAIN == 2:
        reward = -reward
    self._current_game_score.feed(reward)

    if isOver:
        self._player_scores.feed(self._current_game_score.sum)
        while True:
            self.player.reset()
            self.player.prepare()
            early_stop = False
            while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                # Use a separate name: this loop used to overwrite
                # ``reward``/``isOver``, so the terminal transition of the
                # finished game was stored as reward 0 / not over.
                _, prestart_reward, _ = self.player.step_auto()
                if prestart_reward != 0:
                    print('prestart ends too early! now resetting env')
                    early_stop = True
                    break
            if early_stop:
                continue
            self._current_ob = self.get_state()
            break
        self._current_game_score.reset()

    self._current_ob = self.get_state()
    self.mem.append(Experience(old_s, act, reward, isOver))
def play_one_episode(env, func):
    """Play one full game and report whether the final reward was positive.

    The trained role acts greedily on the masked Q-values returned by
    ``func``; every other role is played by ``env.step_auto``.

    Returns:
        1 if the final reward is greater than zero, otherwise 0.
    """
    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        if env.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, r, _ = env.step_auto()
            continue

        state = get_state(env)
        legal = get_mask(to_char(env.get_curr_handcards()),
                         action_space,
                         to_char(env.get_last_outcards()))
        scores = func(state[None, ...])[0][0]
        # Illegal actions are excluded from the argmax.
        scores[legal == 0] = np.nan
        best = np.nanargmax(scores)
        move = to_value(action_space[best])
        r, _, _ = env.step_manual(move)
    return int(r > 0)
def play_one_episode(env, func):
    """Play one full game and report whether the final reward was positive.

    On the trained role's turn the move comes from ``func.predict``;
    every other role is played by ``env.step_auto``.

    Returns:
        1 if the final reward is greater than zero, otherwise 0.
    """
    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        if env.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, r, _ = env.step_auto()
            continue

        hand = to_char(env.get_curr_handcards())
        recent = [to_char(cards) for cards in env.get_last_two_cards()]
        prob_state = env.get_state_prob()
        chosen = func.predict(hand, recent, prob_state)
        r, _, _ = env.step_manual(to_value(chosen))
    return int(r > 0)
def ai_play():
    """Answer a move request with the rule-based AI's move.

    Reads the JSON request body, which must provide ``current_player``,
    ``player_cards`` ('|'-separated hands), ``last_player`` and
    ``last_move``. When the requesting player also made the last move,
    the AI plays with no cards to beat.

    Returns:
        A JSON response ``{'move': <cards>}``, where an empty move is
        reported as ``'P'``.
    """
    data = request.json
    print(data)
    pos = int(data['current_player'])
    my_cards = trans_cards(data['player_cards'].split("|")[pos])

    # Only translate last_move when it is actually used; it was previously
    # translated unconditionally and then overwritten in both branches.
    if int(data['last_player']) == pos:
        last_move = []
    else:
        last_move = trans_cards(data['last_move'])

    intention = to_char(
        CEnv.step_auto_static(Card.char2color(my_cards), to_value(last_move)))
    res = trans_cards_reverse(intention)
    if res == "":
        res = 'P'
    print("result is {}".format(res))
    return jsonify({'move': res})
def play_one_episode(env, func, role_id):
    """Play one full game with ``func`` acting as ``role_id``.

    On ``role_id``'s turn the move comes from ``func.predict``; every
    other role is played by ``env.step_auto``.

    Returns:
        1 if the final reward is greater than zero, otherwise 0.
    """
    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        if env.get_role_ID() != role_id:
            _, r, _ = env.step_auto()
            continue

        hand = to_char(env.get_curr_handcards())
        recent = [to_char(cards) for cards in env.get_last_two_cards()]
        prob_state = env.get_state_prob()
        chosen = func.predict(hand, recent, prob_state)
        move = to_value(chosen)
        r, _, _ = env.step_manual(move)
        assert move is not None
    return int(r > 0)
def _populate_exp(self):
    """Populate one transition into the replay memory by epsilon-greedy.

    Each call handles one of two alternating stages, selected by
    ``self._comb_mask``. In the combination stage an action is only
    chosen and recorded with reward 0. In the fine stage the chosen
    entry of ``self._action_space`` is played, the other roles move via
    ``step_auto``, and the resulting reward is recorded. The transition
    is appended to ``self.mem`` as
    ``Experience(old_s, act, reward, isOver, comb_mask, fine_mask)``.
    """
    old_s = self._current_ob
    comb_mask = self._comb_mask
    max_actions = max(self.num_actions[0], self.num_actions[1])
    stage_actions = self.num_actions[0 if comb_mask else 1]
    use_fine_mask = not self._comb_mask and self._fine_mask is not None

    if use_fine_mask:
        if self._fine_mask.shape[0] == max_actions:
            fine_mask = self._fine_mask
        else:
            # Pad the mask with zeros up to the larger action count.
            fine_mask = np.pad(self._fine_mask,
                               (0, max_actions - self._fine_mask.shape[0]),
                               'constant', constant_values=(0, 0))
    else:
        # ``np.bool`` is a removed alias in NumPy 1.24-1.26; use builtin bool.
        fine_mask = np.ones([max_actions], dtype=bool)

    last_cards_value = self.player.get_last_outcards()
    if self.rng.rand() <= self.exploration:
        if use_fine_mask:
            # Random choice restricted to entries allowed by the fine mask.
            q_values = np.random.rand(self.num_actions[1])
            q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
            act = np.nanargmax(q_values)
        else:
            act = self.rng.choice(range(stage_actions))
    else:
        q_values = self.predictor(old_s[None, :, :, :],
                                  np.array([comb_mask]),
                                  np.array([fine_mask]))[0][0]
        if use_fine_mask:
            q_values = q_values[:self.num_actions[1]]
            masked_out = np.where(np.logical_not(self._fine_mask))[0]
            assert np.all(q_values[masked_out] < -100)
            q_values[masked_out] = np.nan
        act = np.nanargmax(q_values)
        assert act < stage_actions

    # clamp action to valid range
    act = min(act, stage_actions - 1)

    if comb_mask:
        reward = 0
        isOver = False
    else:
        if last_cards_value.size > 0:
            if act > 0:
                if not CardGroup.to_cardgroup(
                        self._action_space[act]).bigger_than(
                            CardGroup.to_cardgroup(
                                to_char(last_cards_value))):
                    print('warning, some error happened')
        reward, isOver, _ = self.player.step_manual(
            to_value(self._action_space[act]))

        # Let the other roles play until it is our turn again or the game ends.
        while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = self.player.step_auto()
            isOver = (reward != 0)
        # if landlord negate the reward
        if ROLE_ID_TO_TRAIN == 2:
            reward = -reward
        self._current_game_score.feed(reward)

    if isOver:
        self._player_scores.feed(self._current_game_score.sum)
        while True:
            self.player.reset()
            self.player.prepare()
            self._comb_mask = True
            early_stop = False
            while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                # Use a separate name: this loop used to overwrite
                # ``reward``/``isOver``, so the terminal transition of the
                # finished game was stored as reward 0 / not over.
                _, prestart_reward, _ = self.player.step_auto()
                if prestart_reward != 0:
                    print('prestart ends too early! now resetting env')
                    early_stop = True
                    break
            if early_stop:
                continue
            self._current_ob, self._action_space = self.get_state_and_action_spaces()
            break
        self._current_game_score.reset()
    else:
        self._comb_mask = not self._comb_mask

    self._current_ob, self._action_space = self.get_state_and_action_spaces(
        act if not self._comb_mask else None)
    self.mem.append(
        Experience(old_s, act, reward, isOver, comb_mask, fine_mask))
def play_one_episode(env, func):
    """Play one game where role 2 is driven by the network ``func``.

    The game is dealt manually from ``np.arange(21)``. On role 2's turn
    the move is assembled in stages from the outputs of ``func``: a
    decision, then the main cards, then any minor (attached) cards. All
    other roles are played by ``env.step_auto``.

    Returns:
        1 if the final reward ``r`` is greater than zero, otherwise 0.
    """
    def take_action_from_prob(prob, mask):
        # Pick the highest-probability entry allowed by ``mask``.
        # NOTE: writes -1 into the first row of ``prob`` in place.
        prob = prob[0]
        # to avoid numeric difficulty
        prob[mask == 0] = -1
        return np.argmax(prob)

    # return char minor cards output
    def inference_minor_util60(role_id, handcards, num, is_pair, dup_mask, main_cards_char):
        # Choose ``num`` minor cards (each added twice when ``is_pair``) one
        # at a time. ``handcards`` and ``dup_mask`` are mutated in place:
        # main cards and chosen minors are removed from ``handcards``, and
        # every chosen index is zeroed in ``dup_mask`` so it is not reused.
        for main_card in main_cards_char:
            handcards.remove(main_card)

        s = get_mask(handcards, action_space, None).astype(np.float32)
        outputs = []
        minor_type = 1 if is_pair else 0
        for i in range(num):
            input_single, input_pair, _, _ = get_masks(handcards, None)
            # Only the last network output is used here.
            _, _, _, _, _, _, minor_response_prob = func(
                [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.array([minor_type])]
            )

            # give minor cards
            mask = None
            if is_pair:
                # The pair mask is extended by two zero entries before being
                # combined with dup_mask.
                mask = np.concatenate([input_pair, [0, 0]]) * dup_mask
            else:
                mask = input_single * dup_mask

            minor_response = take_action_from_prob(minor_response_prob, mask)
            dup_mask[minor_response] = 0

            # convert network output to char cards
            handcards.remove(to_char(minor_response + 3))
            if is_pair:
                handcards.remove(to_char(minor_response + 3))
            # Recompute the state from the reduced hand for the next pick.
            s = get_mask(handcards, action_space, None).astype(np.float32)

            # save to output
            outputs.append(to_char(minor_response + 3))
            if is_pair:
                outputs.append(to_char(minor_response + 3))
        return outputs

    def inference_minor_cards60(role_id, category, s, handcards, seq_length, dup_mask, main_cards_char):
        # Dispatch on the category to decide how many minor cards to pick
        # and whether they are pairs. ``s`` is not used. Falls through
        # (returning None) for any other category.
        if category == Category.THREE_ONE.value:
            return inference_minor_util60(role_id, handcards, 1, False, dup_mask, main_cards_char)
        if category == Category.THREE_TWO.value:
            return inference_minor_util60(role_id, handcards, 1, True, dup_mask, main_cards_char)
        if category == Category.THREE_ONE_LINE.value:
            return inference_minor_util60(role_id, handcards, seq_length, False, dup_mask, main_cards_char)
        if category == Category.THREE_TWO_LINE.value:
            return inference_minor_util60(role_id, handcards, seq_length, True, dup_mask, main_cards_char)
        if category == Category.FOUR_TWO.value:
            return inference_minor_util60(role_id, handcards, 2, False, dup_mask, main_cards_char)

    env.reset()
    init_cards = np.arange(21)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    env.prepare_manual(init_cards)
    r = 0
    # The loop runs until a step reports a non-zero reward.
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_out_cards = Card.val2onehot60(last_cards_value)
        last_category_idx = env.get_last_outcategory_idx()
        curr_cards_char = to_char(env.get_curr_handcards())
        # "Active" means there are no last cards to respond to.
        is_active = True if last_cards_value.size == 0 else False

        s = get_mask(curr_cards_char, action_space, None if is_active else last_cards_char).astype(np.float32)
        last_state = get_mask(last_cards_char, action_space, None).astype(np.float32)
        # print(s.shape)

        role_id = env.get_role_ID()
        # print('%s current cards' % ('lord' if role_id == 2 else 'farmer'), curr_cards_char)

        intention = None
        if role_id == 2:
            if is_active:

                # first get mask
                decision_mask, response_mask, _, length_mask = get_mask_alter(curr_cards_char, [], last_category_idx)

                # Active play feeds a zero array in place of the last state.
                _, _, _, active_decision_prob, active_response_prob, active_seq_prob, _ = func(
                    [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.zeros([s.shape[0]])]
                )

                # make decision depending on output
                active_decision = take_action_from_prob(active_decision_prob, decision_mask)

                # Decisions are 0-based; category indices are offset by one.
                active_category_idx = active_decision + 1

                # get response
                active_response = take_action_from_prob(active_response_prob, response_mask[active_decision])

                seq_length = 0
                # next sequence length
                if active_category_idx == Category.SINGLE_LINE.value or \
                        active_category_idx == Category.DOUBLE_LINE.value or \
                        active_category_idx == Category.TRIPLE_LINE.value or \
                        active_category_idx == Category.THREE_ONE_LINE.value or \
                        active_category_idx == Category.THREE_TWO_LINE.value:
                    seq_length = take_action_from_prob(active_seq_prob, length_mask[active_decision][active_response]) + 1

                # give main cards
                intention = give_cards_without_minor(active_response, last_cards_value, active_category_idx, seq_length)

                # then give minor cards
                if active_category_idx == Category.THREE_ONE.value or \
                        active_category_idx == Category.THREE_TWO.value or \
                        active_category_idx == Category.THREE_ONE_LINE.value or \
                        active_category_idx == Category.THREE_TWO_LINE.value or \
                        active_category_idx == Category.FOUR_TWO.value:
                    # Block the main cards' values from being picked as minors.
                    dup_mask = np.ones([15])
                    if seq_length > 0:
                        for i in range(seq_length):
                            dup_mask[intention[0] - 3 + i] = 0
                    else:
                        dup_mask[intention[0] - 3] = 0
                    # Copies are passed because the helper mutates the hand.
                    intention = np.concatenate([intention, to_value(inference_minor_cards60(role_id, active_category_idx, s.copy(), curr_cards_char.copy(), seq_length, dup_mask, to_char(intention)))])
            else:
                # print(to_char(last_cards_value), is_bomb, last_category_idx)
                decision_mask, response_mask, bomb_mask, _ = get_mask_alter(curr_cards_char, to_char(last_cards_value), last_category_idx)
                passive_decision_prob, passive_bomb_prob, passive_response_prob, _, _, _, _ = func(
                    [np.array([role_id]), s.reshape(1, -1), last_state.reshape(1, -1), np.zeros([s.shape[0]])])
                passive_decision = take_action_from_prob(passive_decision_prob, decision_mask)
                # Decision 0 plays no cards, 1 plays four of a kind, 2 plays
                # the fixed pair [16, 17], 3 responds in the last category.
                if passive_decision == 0:
                    intention = np.array([])
                elif passive_decision == 1:
                    passive_bomb = take_action_from_prob(passive_bomb_prob, bomb_mask)
                    # converting 0-based index to 3-based value
                    intention = np.array([passive_bomb + 3] * 4)

                elif passive_decision == 2:
                    intention = np.array([16, 17])
                elif passive_decision == 3:
                    passive_response = take_action_from_prob(passive_response_prob, response_mask)
                    intention = give_cards_without_minor(passive_response, last_cards_value, last_category_idx, None)
                    if last_category_idx == Category.THREE_ONE.value or \
                            last_category_idx == Category.THREE_TWO.value or \
                            last_category_idx == Category.THREE_ONE_LINE.value or \
                            last_category_idx == Category.THREE_TWO_LINE.value or \
                            last_category_idx == Category.FOUR_TWO.value:
                        # Block the main cards' values from being picked as minors.
                        dup_mask = np.ones([15])
                        seq_length = get_seq_length(last_category_idx, last_cards_value)
                        if seq_length:
                            for i in range(seq_length):
                                dup_mask[intention[0] - 3 + i] = 0
                        else:
                            dup_mask[intention[0] - 3] = 0
                        intention = np.concatenate([intention, to_value(inference_minor_cards60(role_id, last_category_idx, s.copy(), curr_cards_char.copy(), seq_length, dup_mask, to_char(intention)))])
            # since step auto needs full last card group info, we do not explicitly feed card type
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention))
            # NOTE(review): this check runs after the move has already been
            # submitted — confirm whether it was meant to precede step_manual.
            assert (intention is not None)
        else:
            intention, r, _ = env.step_auto()
            # print('farmer gives', to_char(intention))
    # if r > 0:
    #     print('farmer wins')
    # else:
    #     print('lord wins')
    return int(r > 0)
def auto_shot_poker(self):
    """Choose this AI player's cards and schedule sending them to the server.

    When the three players hold at most ``AiPlayer.MCT_THRESH`` cards in
    total, the move comes from ``mcsearch``. Otherwise it comes from
    ``self.predictor.predict``, and the top combinations and groups it
    reports are also scheduled to be sent. The chosen move is sent via
    ``self.to_server`` after a 2 second delay.
    """
    def pokers_to_char(cards):
        # Convert pokers to card chars, renaming the 'w', 'W' and '0'
        # symbols produced by ``rule._to_cards`` to '*', '$' and '10'.
        cards = rule._to_cards(cards)
        for i, card in enumerate(cards):
            if card == 'w':
                cards[i] = '*'
            elif card == 'W':
                cards[i] = '$'
            elif card == '0':
                cards[i] = '10'
        return cards

    def char_to_pokers(cards):
        # Inverse of pokers_to_char; works on a copy and resolves the
        # chars against this player's own hand.
        cards = cards.copy()
        for i, card in enumerate(cards):
            if card == '*':
                cards[i] = 'w'
            elif card == '$':
                cards[i] = 'W'
            elif card == '10':
                cards[i] = '0'
        return rule._to_pokers(self.hand_pokers, cards)

    def char2ccardgroup(chars):
        # Wrap card chars into a C-side card group.
        cg = CardGroup.to_cardgroup(chars)
        ccg = CCardGroup([CCard(to_value(c) - 3) for c in cg.cards], CCategory(cg.type), cg.value, cg.len)
        return ccg

    def ccardgroup2char(cg):
        # Convert a C-side card group back to card chars.
        return [to_char(int(c) + 3) for c in cg.cards]

    handcards_char = pokers_to_char(self.hand_pokers)
    total_cards_cnt = sum(
        [len(self.table.players[i].hand_pokers) for i in range(3)])
    if total_cards_cnt <= AiPlayer.MCT_THRESH:
        # Few cards left on the table: use the search.
        chandcards = [CCard(to_value(c) - 3) for c in handcards_char]
        unseen_cards = pokers_to_char(
            self.table.players[(self.table.whose_turn + 1) % 3].hand_pokers +
            self.table.players[(self.table.whose_turn + 2) % 3].hand_pokers)
        cunseen_cards = [CCard(to_value(c) - 3) for c in unseen_cards]
        next_handcards_cnt = len(
            self.table.players[(self.table.whose_turn + 1) % 3].hand_pokers)
        # When the current seat is the controller there is nothing to beat.
        last_shot_poker = self.table.last_shot_poker if self.table.whose_turn != self.table.controller else []
        last_cg = char2ccardgroup(pokers_to_char(last_shot_poker))
        # NOTE(review): a controller value of 0 is falsy and would also be
        # replaced here — confirm whether seat 0 is a valid controller.
        if not self.table.controller:
            self.table.controller = self.table.whose_turn
        # Seats are passed to the search relative to the lord's seat.
        caction = mcsearch(
            chandcards, cunseen_cards, next_handcards_cnt, last_cg,
            (self.table.whose_turn - self.table.lord_turn + 3) % 3,
            (self.table.controller - self.table.lord_turn + 3) % 3,
            10, 50, 500)
        intention = ccardgroup2char(caction)
    else:
        last_two_cards = self.table.get_last_two_cards()
        last_two_cards = [pokers_to_char(c) for c in last_two_cards]
        # # last_cards_char = ['10', 'J', 'Q', 'K', 'A']
        # # print(handcards_char)
        # # print(last_cards_char)
        # if self.table.last_shot_seat == self.seat:
        #     last_cards_char = []

        # Start from a 60-slot vector of ones with slots 53-55 and 57-59
        # cleared, then subtract this player's hand and all three seats'
        # history to get the cards that remain unseen.
        total_cards = np.ones([60])
        total_cards[53:56] = 0
        total_cards[57:60] = 0
        remain_cards = total_cards - Card.char2onehot60(
            handcards_char +
            pokers_to_char(self.table.history[self.seat] +
                           self.table.history[(self.seat + 1) % 3] +
                           self.table.history[(self.seat + 2) % 3]))
        next_cnt = len(self.table.players[(self.seat + 1) % 3].hand_pokers)
        next_next_cnt = len(self.table.players[(self.seat + 2) % 3].hand_pokers)
        # Weight the unseen cards by each opponent's share of the cards
        # the two opponents hold.
        next_state = remain_cards * (next_cnt / (next_cnt + next_next_cnt))
        next_next_state = remain_cards * (next_next_cnt / (next_cnt + next_next_cnt))
        prob_state = np.concatenate([next_state, next_next_state])
        assert np.all(prob_state < 1.) and np.all(prob_state >= 0.)
        # print(self.table.last_shot_poker)
        # print(self.hand_pokers)
        # print(self.table.players[self.seat].hand_pokers)
        intention, combs, groups = self.predictor.predict(
            handcards_char, last_two_cards, prob_state)
        # print(intention)

        top_k = 5
        top_combs = combs[:top_k]
        a, q = zip(*top_combs)
        # Truncate each combination (in place) right after the group at
        # which the accumulated cards equal the whole hand.
        for comb in a:
            test = []
            for i, c in enumerate(comb):
                test += c
                if collections.Counter(test) == collections.Counter(
                        handcards_char):
                    del comb[i + 1:]
                    break
        top_combs = list(
            zip([[char_to_pokers(c) for c in comb] for comb in a], q))
        # print(top_combs)

        top_groups = groups[:top_k]
        a, q = zip(*top_groups)
        top_groups = list(zip([char_to_pokers(g) for g in a], q))
        # print(top_groups)

        # if not self.table.last_shot_poker or self.table.last_shot_seat == self.seat:
        #     pokers.append(self.hand_pokers[0])
        # else:
        #     pokers = rule.cards_above(self.hand_pokers, self.table.last_shot_poker)

        packet_comb = [Pt.REQ_Q_COMB, top_combs]
        packet_fine = [Pt.REQ_Q_FINE, top_groups]
        # Combinations are scheduled after 1 second, groups after 2 seconds.
        IOLoop.current().call_later(1, self.to_server, packet_comb)
        IOLoop.current().call_later(2, self.to_server, packet_fine)
    pokers = char_to_pokers(intention)
    packet = [Pt.REQ_SHOT_POKER, pokers]
    # IOLoop.current().add_callback(self.to_server, packet)
    IOLoop.current().call_later(2, self.to_server, packet)
def char2ccardgroup(chars):
    """Convert card chars into a ``CCardGroup``.

    The chars are first parsed with ``CardGroup.to_cardgroup``; each card
    value is shifted down by 3 before being wrapped in ``CCard``.
    """
    group = CardGroup.to_cardgroup(chars)
    ccards = [CCard(to_value(card) - 3) for card in group.cards]
    return CCardGroup(ccards, CCategory(group.type), group.value, group.len)
def run(self):
    """Run the simulator loop: play games and exchange data over ZeroMQ.

    On a trained role's turn the current state is pushed to the server
    together with the reward and over-flag of the previous action; the
    action index and LSTM state received in reply are then applied.
    Other roles are played by ``step_auto``. The loop never returns.
    """
    player = self._build_player()
    context = zmq.Context()

    # Outgoing channel: states are pushed to the server.
    push_sock = context.socket(zmq.PUSH)
    push_sock.setsockopt(zmq.IDENTITY, self.identity)
    push_sock.set_hwm(10)
    push_sock.connect(self.c2s)

    # Incoming channel: actions come back from the server.
    reply_sock = context.socket(zmq.DEALER)
    reply_sock.setsockopt(zmq.IDENTITY, self.identity)
    reply_sock.connect(self.s2c)

    player.reset()
    player.prepare()
    r = 0
    is_over = False
    lstm_state = np.zeros([1024 * 2])

    while True:
        role_id = player.get_role_ID()
        if role_id not in ROLE_IDS_TO_TRAIN:
            _, r, _ = player.step_auto()
            is_over = (r != 0)
        else:
            prob_state = player.get_state_prob()
            all_state = player.get_state_all_cards()
            hand_values = player.get_curr_handcards()
            last_values = player.get_last_outcards()
            player.get_last_outcategory_idx()  # queried as before; result unused
            prob_state = np.concatenate(
                [Card.val2onehot60(hand_values), prob_state])

            # The r / is_over sent below belong to the previous action;
            # they are not the tuple stored in the memory buffer.
            is_active = not last_values.size > 0
            mask = get_mask(to_char(hand_values), action_space,
                            None if is_active else to_char(last_values))
            if is_active:
                mask[0] = 0

            recent = player.get_last_two_cards()
            recent_onehot = np.concatenate([
                Card.val2onehot60(recent[0]),
                Card.val2onehot60(recent[1]),
            ])

            message = (self.identity, role_id, prob_state, all_state,
                       recent_onehot, mask, 0 if is_active else 1,
                       lstm_state, r, is_over)
            push_sock.send(dumps(message), copy=False)
            action_idx, lstm_state = loads(reply_sock.recv(copy=False).bytes)
            r, is_over, _ = player.step_manual(
                to_value(action_space[action_idx]))

        if is_over:
            # Start a fresh game with a cleared recurrent state.
            player.reset()
            player.prepare()
            lstm_state = np.zeros([1024 * 2])
# python env usage env = Env(['1', '2', '3']) agent_names = ['1', '2', '3'] for _ in range(1): env.reset() env.prepare() done = False while not done: print('here') handcards = env.get_curr_handcards() env.get_state_prob() t = time.perf_counter() chandcards = [CCard(to_value(c) - 3) for c in handcards] unseen_cards = env.player_cards[agent_names[(env.get_current_idx() + 1) % len(env.agent_names)]].copy() \ + env.player_cards[agent_names[(env.get_current_idx() + 2) % len(env.agent_names)]].copy() print('here') cunseen_cards = [CCard(to_value(c) - 3) for c in unseen_cards] print('here') next_handcards_cnt = len(env.player_cards[agent_names[(env.get_current_idx() + 1) % len(env.agent_names)]]) last_cg = char2ccardgroup(env.get_last_outcards()) caction = mcsearch(chandcards, cunseen_cards, next_handcards_cnt, last_cg, env.agent_names.index(env.curr_player), env.agent_names.index(env.controller)) action = ccardgroup2char(caction) print(action) winner, done = env.step(action) if done: for agent_name in agent_names:
def intention(self, env):
    """Return the move chosen by ``CEnv.step_auto_static`` as card chars.

    The current hand is converted with ``Card.char2color`` and the last
    played cards with ``to_value`` before being handed to the static
    stepper.
    """
    colored_hand = Card.char2color(env.get_curr_handcards())
    last_values = to_value(env.get_last_outcards())
    chosen = CEnv.step_auto_static(colored_hand, last_values)
    return to_char(chosen)
def step(self, intention):
    """Play ``intention`` via ``step_manual``.

    Returns:
        The first two of the three values reported by ``step_manual``,
        as ``(r, done)``.
    """
    outcome = self.step_manual(to_value(intention))
    r, done, _ = outcome
    return r, done
def play_one_episode(env, func):
    """Play one full game with the recurrent policy ``func``.

    Roles in ``ROLE_IDS_TO_TRAIN`` act on the masked output of ``func``
    (its first output when leading, its second when responding), carrying
    the LSTM state between turns. Other roles use ``env.step_auto``.

    Returns:
        1 if the final reward is greater than zero, otherwise 0.
    """
    def pick_masked_argmax(prob, mask):
        # Writes -1 into the first row in place so masked entries never win.
        row = prob[0]
        row[mask == 0] = -1
        return np.argmax(row)

    env.reset()
    env.prepare()
    r = 0
    lstm_state = np.zeros([1024 * 2])
    while r == 0:
        last_values = env.get_last_outcards()
        last_chars = to_char(last_values)
        recent = env.get_last_two_cards()
        recent_onehot = np.concatenate([
            Card.val2onehot60(recent[0]),
            Card.val2onehot60(recent[1]),
        ])
        hand_chars = to_char(env.get_curr_handcards())
        leading = last_values.size == 0

        state = env.get_state_prob()
        state = np.concatenate([Card.char2onehot60(hand_chars), state])

        role_id = env.get_role_ID()
        if role_id not in ROLE_IDS_TO_TRAIN:
            _, r, _ = env.step_auto()
            continue

        if leading:
            mask = get_mask(hand_chars, action_space, None)
            # Index 0 is not valid when leading.
            mask[0] = 0
            last_input = np.zeros([1, 120])
        else:
            mask = get_mask(hand_chars, action_space, last_chars)
            last_input = recent_onehot.reshape(1, -1)

        active_prob, passive_prob, lstm_state = func(
            np.array([role_id]), state.reshape(1, -1),
            last_input, lstm_state.reshape(1, -1))
        chosen_prob = active_prob if leading else passive_prob
        action_idx = pick_masked_argmax(chosen_prob, mask)

        # step_manual receives the full card values of the chosen action.
        move = to_value(action_space[action_idx])
        r, _, _ = env.step_manual(move)
        assert move is not None
    return int(r > 0)