def get_combinations(self, curr_cards_char, last_cards_char):
    """Enumerate ways to split the hand into actions, plus a per-slot mask.

    Hands with more than 10 cards are searched with
    ``get_combinations_nosplit`` over the augmented action space; smaller
    hands use ``get_combinations_recursive`` over ``action_space``.

    Args:
        curr_cards_char: cards currently in hand (sequence of card chars).
        last_cards_char: cards that have to be beaten; empty when there is
            nothing to beat.

    Returns:
        ``(combs, fine_mask)``. ``combs`` is a list of lists of action
        indices. When ``last_cards_char`` is non-empty every list starts
        with index 0 (presumably the pass action - TODO confirm) and only
        lists containing at least one action bigger than ``last_cards_char``
        are kept. ``fine_mask`` is ``None`` when ``last_cards_char`` is
        empty; otherwise it is a boolean array of shape
        ``(len(combs), self.num_actions[1])`` that is True at ``[i, j]``
        when ``combs[i][j]`` is bigger than ``last_cards_char``.
    """
    following = len(last_cards_char) > 0
    prefix = [0] if following else []
    # Converted once instead of once per candidate action.
    last_group = CardGroup.to_cardgroup(last_cards_char) if following else None
    idx_must_be_contained = None

    if len(curr_cards_char) > 10:
        card_mask = Card.char2onehot60(curr_cards_char).astype(np.uint8)
        mask = augment_action_space_onehot60
        # A row is invalid when it uses a card slot the hand does not hold.
        a = np.expand_dims(1 - card_mask, 0) * mask
        invalid_row_idx = set(np.where(a > 0)[0])
        if not following:
            invalid_row_idx.add(0)

        valid_row_idx = [i for i in range(len(augment_action_space))
                         if i not in invalid_row_idx]
        mask = mask[valid_row_idx, :]
        idx_mapping = dict(zip(range(mask.shape[0]), valid_row_idx))

        # augment mask
        # TODO: known issue: 555444666 will not decompose into 5554 and 66644
        combs = get_combinations_nosplit(mask, card_mask)
        combs = [prefix + [clamp_action_idx(idx_mapping[idx]) for idx in comb]
                 for comb in combs]

        if following:
            # NOTE(review): these are un-clamped augmented indices while
            # `combs` holds clamped ones - confirm the two index spaces agree.
            idx_must_be_contained = {
                idx for idx in valid_row_idx
                if CardGroup.to_cardgroup(augment_action_space[idx]).bigger_than(last_group)
            }
    else:
        mask = get_mask_onehot60(curr_cards_char, action_space, None) \
            .reshape(len(action_space), 15, 4).sum(-1).astype(np.uint8)
        valid = mask.sum(-1) > 0
        cards_target = Card.char2onehot60(curr_cards_char) \
            .reshape(-1, 4).sum(-1).astype(np.uint8)
        # do not feed empty to C++, which will cause infinite loop
        combs = get_combinations_recursive(mask[valid, :], cards_target)
        idx_mapping = dict(zip(range(valid.shape[0]), np.where(valid)[0]))
        combs = [prefix + [idx_mapping[idx] for idx in comb] for comb in combs]

        if following:
            valid[0] = True
            idx_must_be_contained = {
                idx for idx in range(len(action_space))
                if valid[idx]
                and CardGroup.to_cardgroup(action_space[idx]).bigger_than(last_group)
            }

    if not following:
        return combs, None

    # Keep only combinations that contain at least one beating action.
    combs = [comb for comb in combs if not idx_must_be_contained.isdisjoint(comb)]
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    fine_mask = np.zeros([len(combs), self.num_actions[1]], dtype=bool)
    for i, comb in enumerate(combs):
        for j, idx in enumerate(comb):
            if idx in idx_must_be_contained:
                fine_mask[i, j] = True
    return combs, fine_mask
def dancing_link():
    """Ad-hoc timing/debug run of the no-split combination search.

    Creates and prepares a ``Pyenv``, then replaces its dealt hand with the
    fixed hand ``3 3 3 4 4 4`` played against a last move of ``3 3``. Prints
    the set of action indices that beat the last move, the resulting mask,
    the elapsed time, the number of combinations and each combination's
    actions.
    """
    import timeit

    env = Pyenv()
    env.reset()
    env.prepare()
    # print(env.get_handcards())
    # The dealt hand is fetched and then overridden by a fixed test hand.
    cards = env.get_handcards()
    cards = ['3', '3', '3', '4', '4', '4']

    begin = timeit.default_timer()
    card_mask = Card.char2onehot60(cards).astype(np.uint8)
    # mask = get_mask_onehot60(cards, action_space, None).astype(np.uint8)
    last_cards = ['3', '3']
    mask = augment_action_space_onehot60
    blocked = np.expand_dims(1 - card_mask, 0) * mask
    row_idx = set(np.where(blocked > 0)[0])

    # Row 0 is always skipped here (range starts at 1).
    valid_row_idx = [
        row for row in range(1, len(augment_action_space))
        if row not in row_idx
    ]
    idx_must_be_contained = {
        row for row in valid_row_idx
        if CardGroup.to_cardgroup(augment_action_space[row])
        .bigger_than(CardGroup.to_cardgroup(last_cards))
    }
    print(idx_must_be_contained)

    mask = mask[valid_row_idx, :]
    idx_mapping = dict(zip(range(mask.shape[0]), valid_row_idx))

    # augment mask
    # TODO: known issue: 555444666 will not decompose into 5554 and 66644
    raw_combs = get_combinations_nosplit(
        mask, Card.char2onehot60(cards).astype(np.uint8))
    combs = []
    for raw in raw_combs:
        mapped = [clamp_action_idx(idx_mapping[k]) for k in raw]
        combs.append(mapped)
    combs = [c for c in combs if not idx_must_be_contained.isdisjoint(c)]

    fine_mask = np.zeros((len(combs), 21))
    for r, comb in enumerate(combs):
        for c, action_idx in enumerate(comb):
            if action_idx in idx_must_be_contained:
                fine_mask[r][c] = 1
    print(fine_mask)

    end = timeit.default_timer()
    print(end - begin)
    print(len(combs))
    for comb in combs:
        for action_idx in comb:
            print(action_space[action_idx], end=', ')
        print()
def step(self, i, a, single_step=False):
    """Apply player ``i``'s action, then let following scripted players respond.

    Args:
        i: index into ``self.players`` of the acting player.
        a: index into ``self.action_space``; ``0`` is treated as a pass.
        single_step: when true no other player is advanced and
            ``self.next_turn`` is incremented modulo 3.

    Returns:
        ``(result, done)``:
        ``(2, True)`` / ``(1, True)`` when player ``i`` empties its hand as
        lord / non-lord; ``(-1, True)`` when a non-trainable player has no
        cards left; ``(0, False)`` otherwise.
    """
    player = self.players[i]
    if a != 0:
        player.discard(self.action_space[a])
        self.last_player = player
        self.last_cards = CardGroup.to_cardgroup(self.action_space[a])
        self.history += self.last_cards.cards
        self.log(i, self.last_cards.cards, False)
        if not player.cards:
            return 2 if player.is_lord else 1, True
    else:
        self.log(i, [], True)

    if single_step:
        self.next_turn = (self.next_turn + 1) % 3
        return 0, False

    ai = 0
    for k in range(i + 1, i + 3):
        ai = k % 3
        responder = self.players[ai]
        if responder.trainable:
            break
        # Guard kept from the original: never ask an empty hand to respond.
        if not responder.cards:
            return -1, True
        self.last_player, self.last_cards, passed = responder.respond(
            self.last_player, self.last_cards,
            self.players[(ai - 1) % 3], self.players[(ai + 1) % 3])
        self.log(ai, self.last_cards.cards, passed)
        if not passed:
            self.history += self.last_cards.cards
        # The hand can only become empty as a result of the response above,
        # so the terminal check has to run after it; checking only before
        # the response reported the end of the game one round late.
        if not responder.cards:
            # TODO: add coordination rewards
            return -1, True
    self.next_turn = ai % 3
    return 0, False
def step(self, intention):
    """Play the agent's move and let the scripted opponent answer.

    Args:
        intention: cards the agent plays; an empty value means the agent
            passes.

    Returns:
        ``(result, done)``: ``(1, True)`` when the agent's hand is empty
        after its move, ``(-1, True)`` when the opponent's hand is empty,
        ``(0, False)`` otherwise.
    """
    if not intention:
        # Agent passes: the opponent leads with the first non-empty entry of
        # ``action_space`` that it can form from its hand.
        self.controller = 1
        for candidate in action_space:
            if not candidate or not counter_subset(candidate, self.oppo_cards):
                continue
            self.last_cards = candidate
            # The original built a card group here without using it; the
            # call is kept so the sequence of calls is unchanged.
            CardGroup.to_cardgroup(candidate)
            for card in candidate:
                self.oppo_cards.remove(card)
                self.history[1].append(card)
            return (0, False) if self.oppo_cards else (-1, True)
        # No playable action found: falls through to the code below.

    self.controller = 0
    self.last_cards = intention
    for card in intention:
        self.agent_cards.remove(card)
        self.history[0].append(card)
    if not self.agent_cards:
        return 1, True

    agent_group = CardGroup.to_cardgroup(intention)
    for candidate in action_space:
        if not candidate or not counter_subset(candidate, self.oppo_cards):
            continue
        if not CardGroup.to_cardgroup(candidate).bigger_than(agent_group):
            continue
        # First playable action that beats the agent's move is played.
        for card in candidate:
            self.oppo_cards.remove(card)
            self.history[1].append(card)
        self.last_cards = candidate
        self.controller = 1
        break

    return (0, False) if self.oppo_cards else (-1, True)
def get_mask(self, i):
    """Return a boolean mask over ``self.action_space`` for player ``i``.

    An entry is True when the action can be formed from the player's cards
    (``counter_subset``) and, if another player made the last play, the
    action at index >= 1 is bigger than ``self.last_cards``. Index 0 is
    cleared when nobody has played yet or when player ``i`` made the last
    play (index 0 is presumably the pass action - TODO confirm).

    Args:
        i: index into ``self.players``.

    Returns:
        1-D ``numpy`` boolean array with one entry per action.
    """
    player = self.players[i]
    actions = self.action_space
    # Allocate the mask from the action count. ``np.zeros_like`` on a list
    # of variable-length actions needs a ragged-to-object-array conversion,
    # which raises ValueError on NumPy >= 1.24.
    mask = np.zeros(len(actions), dtype=bool)
    for j, action in enumerate(actions):
        if counter_subset(action, player.cards):
            mask[j] = True

    if self.last_player is None or self.last_player is player:
        mask[0] = False
        return mask

    # Someone else played last: only strictly bigger groups stay available.
    for j in range(1, mask.size):
        if mask[j] and not CardGroup.to_cardgroup(actions[j]).bigger_than(self.last_cards):
            mask[j] = False
    return mask
def char2ccardgroup(chars):
    """Convert card chars into a ``CCardGroup``.

    The chars are first parsed with ``CardGroup.to_cardgroup``; each card of
    the parsed group becomes ``CCard(to_value(card) - 3)`` and the group's
    ``type``, ``value`` and ``len`` are passed through (``type`` wrapped in
    ``CCategory``).
    """
    group = CardGroup.to_cardgroup(chars)
    # Offset of 3 presumably rebases the lowest card value to 0 - TODO confirm.
    c_cards = [CCard(to_value(ch) - 3) for ch in group.cards]
    category = CCategory(group.type)
    return CCardGroup(c_cards, category, group.value, group.len)
def _populate_exp(self):
    """Populate one transition using an epsilon-greedy policy.

    Chooses an action at the current stage (combination stage when
    ``self._comb_mask`` is true, card-group stage otherwise). At the
    card-group stage it steps the environment and lets the other agents'
    predictors play until it is this agent's turn again or the game ends.
    It then updates the score trackers and the current observation and
    appends an ``Experience`` to ``self.mem``.

    Reward at game end: ``1`` when the winner is this agent, or when the
    positions of the winner and this agent in ``get_all_agent_names()`` sum
    to 3 (presumably the two farmers - TODO confirm); ``-1`` otherwise.

    Raises:
        Exception: when the chosen non-pass action is not bigger than the
            last played cards.
    """
    old_s = self._current_ob
    comb_mask = self._comb_mask
    # One mask width is used for both stages: the larger action count.
    mask_width = max(self.num_actions[0], self.num_actions[1])
    n_actions = self.num_actions[0 if comb_mask else 1]
    use_fine_mask = not self._comb_mask and self._fine_mask is not None

    if use_fine_mask:
        if self._fine_mask.shape[0] == mask_width:
            fine_mask = self._fine_mask
        else:
            fine_mask = np.pad(self._fine_mask,
                               (0, mask_width - self._fine_mask.shape[0]),
                               'constant', constant_values=(0, 0))
    else:
        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
        fine_mask = np.ones([mask_width], dtype=bool)

    last_cards_char = self.player.get_last_outcards()
    if self.rng.rand() <= self.exploration:
        if use_fine_mask:
            # Random scores with masked-out slots set to NaN, so the argmax
            # is a uniform pick among allowed slots.
            # NOTE(review): draws from the global np.random, not self.rng.
            q_values = np.random.rand(self.num_actions[1])
            q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
            act = np.nanargmax(q_values)
        else:
            act = self.rng.choice(range(n_actions))
    else:
        q_values = self.curr_predictor(old_s[None, :, :, :],
                                       np.array([comb_mask]),
                                       np.array([fine_mask]))[0][0]
        if use_fine_mask:
            q_values = q_values[:self.num_actions[1]]
            masked_out = np.where(np.logical_not(self._fine_mask))[0]
            assert np.all(q_values[masked_out] < -100)
            q_values[masked_out] = np.nan
        act = np.nanargmax(q_values)
        assert act < n_actions
    # clamp action to valid range
    act = min(act, n_actions - 1)

    winner = -1
    reward = 0
    if comb_mask:
        isOver = False
    else:
        if len(last_cards_char) > 0 and act > 0 \
                and not CardGroup.to_cardgroup(self._action_space[act]).bigger_than(
                    CardGroup.to_cardgroup(last_cards_char)):
            print('warning, some error happened, ', self._action_space[act], last_cards_char)
            raise Exception("card comparison error")
        winner, isOver = self.player.step(self._action_space[act])

        # step for AI farmers
        while not isOver and self.player.get_curr_agent_name() != self.agent_name:
            handcards = self.player.get_curr_handcards()
            last_two_cards = self.player.get_last_two_cards()
            prob_state = self.player.get_state_prob()
            action = self.predictors[self.player.get_curr_agent_name()].predict(
                handcards, last_two_cards, prob_state)
            winner, isOver = self.player.step(action)

        if isOver:
            if self.agent_name == winner:
                reward = 1
            else:
                names = self.player.get_all_agent_names()
                if names.index(winner) + names.index(self.agent_name) == 3:
                    reward = 1
                else:
                    reward = -1
    self._current_game_score.feed(reward)

    if isOver:
        self._player_scores.feed(self._current_game_score.sum)
        self.player.reset()
        self.player.prepare()
        self._comb_mask = True
        # NOTE(review): assumes prestart() refreshes _current_ob and
        # _action_space for the new game - confirm.
        self.prestart()
        self._current_game_score.reset()
    else:
        # Alternate between the combination stage and the card-group stage.
        self._comb_mask = not self._comb_mask
        self._current_ob, self._action_space = self.get_state_and_action_spaces(
            act if not self._comb_mask else None)
    self.mem.append(Experience(old_s, act, reward, isOver, comb_mask, fine_mask))
def _populate_exp(self):
    """Populate one transition using an epsilon-greedy policy.

    Chooses an action at the current stage (combination stage when
    ``self._comb_mask`` is true, card-group stage otherwise). At the
    card-group stage it plays the action with ``step_manual`` and then calls
    ``step_auto`` until the role to train is to move again or a non-zero
    reward ends the game. The reward is negated when ``ROLE_ID_TO_TRAIN``
    is 2. When the game is over the environment is reset and auto-stepped
    until the role to train is to move. Finally an ``Experience`` is
    appended to ``self.mem``.
    """
    old_s = self._current_ob
    comb_mask = self._comb_mask
    # One mask width is used for both stages: the larger action count.
    mask_width = max(self.num_actions[0], self.num_actions[1])
    n_actions = self.num_actions[0 if comb_mask else 1]
    use_fine_mask = not self._comb_mask and self._fine_mask is not None

    if use_fine_mask:
        if self._fine_mask.shape[0] == mask_width:
            fine_mask = self._fine_mask
        else:
            fine_mask = np.pad(self._fine_mask,
                               (0, mask_width - self._fine_mask.shape[0]),
                               'constant', constant_values=(0, 0))
    else:
        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
        fine_mask = np.ones([mask_width], dtype=bool)

    last_cards_value = self.player.get_last_outcards()
    if self.rng.rand() <= self.exploration:
        if use_fine_mask:
            # Random scores with masked-out slots set to NaN, so the argmax
            # is a uniform pick among allowed slots.
            # NOTE(review): draws from the global np.random, not self.rng.
            q_values = np.random.rand(self.num_actions[1])
            q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
            act = np.nanargmax(q_values)
        else:
            act = self.rng.choice(range(n_actions))
    else:
        q_values = self.predictor(old_s[None, :, :, :],
                                  np.array([comb_mask]),
                                  np.array([fine_mask]))[0][0]
        if use_fine_mask:
            q_values = q_values[:self.num_actions[1]]
            masked_out = np.where(np.logical_not(self._fine_mask))[0]
            assert np.all(q_values[masked_out] < -100)
            q_values[masked_out] = np.nan
        act = np.nanargmax(q_values)
        assert act < n_actions
    # clamp action to valid range
    act = min(act, n_actions - 1)

    if comb_mask:
        reward = 0
        isOver = False
    else:
        if last_cards_value.size > 0 and act > 0 \
                and not CardGroup.to_cardgroup(self._action_space[act]).bigger_than(
                    CardGroup.to_cardgroup(to_char(last_cards_value))):
            print('warning, some error happened')
        reward, isOver, _ = self.player.step_manual(
            to_value(self._action_space[act]))

        # step for AI
        while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = self.player.step_auto()
            isOver = (reward != 0)
        # if landlord negate the reward
        if ROLE_ID_TO_TRAIN == 2:
            reward = -reward
    self._current_game_score.feed(reward)

    if isOver:
        self._player_scores.feed(self._current_game_score.sum)
        while True:
            self.player.reset()
            self.player.prepare()
            self._comb_mask = True
            early_stop = False
            while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                # Warm-up results get their own local. Reusing `reward` and
                # `isOver` here overwrote the finished transition's values
                # before it was stored below.
                _, warmup_reward, _ = self.player.step_auto()
                if warmup_reward != 0:
                    print('prestart ends too early! now resetting env')
                    early_stop = True
                    break
            if early_stop:
                continue
            self._current_ob, self._action_space = self.get_state_and_action_spaces()
            break
        self._current_game_score.reset()
    else:
        # Alternate between the combination stage and the card-group stage.
        self._comb_mask = not self._comb_mask
        self._current_ob, self._action_space = self.get_state_and_action_spaces(
            act if not self._comb_mask else None)
    self.mem.append(
        Experience(old_s, act, reward, isOver, comb_mask, fine_mask))
def respond(self, last_player, cards, before_player, next_player):
    """Choose this player's answer to the current table state.

    Human players are prompted on stdin until they pass or enter a valid
    play. Scripted players pick from ``self.candidates`` (refreshed through
    ``CardGroup.analyze`` when ``self.need_analyze`` is set).

    Args:
        last_player: player who made the last play, or ``None``.
        cards: the card group of the last play.
        before_player: the player seated before this one.
        next_player: the player seated after this one.

    Returns:
        ``(player, group, passed)``. On a pass this is
        ``(last_player, cards, True)`` unchanged; otherwise
        ``(self, played_group, False)``.
    """
    if self.is_human:
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3: raw_input was renamed to input
        # Loop instead of recursing so repeated bad input cannot exhaust
        # the call stack.
        while True:
            print("your cards: ", end='')
            print(self.cards)
            intend = read_line("enter your intention(0 for pass): ")
            intend = intend.strip().split(',')
            if intend[0] == '0':
                return last_player, cards, True
            if not counter_subset(intend, self.cards) or \
                    not CardGroup.isvalid(intend):
                print("invalid intention, try again")
                continue
            if last_player is not None and last_player != self \
                    and not CardGroup.to_cardgroup(intend).bigger_than(cards):
                print('you must give bigger cards')
                continue
            self.discard(intend)
            return self, CardGroup.to_cardgroup(intend), False

    if self.need_analyze:
        self.candidates = CardGroup.analyze(self.cards)
        self.need_analyze = False

    if last_player is None or self is last_player:
        # This player leads.
        # NOTE: fallbacks below re-read self.candidates after discard(),
        # exactly as the original did.
        if CardGroup.folks(self.cards) == 2:
            self.discard(self.candidates[-1].cards)
            return self, self.candidates[-1], False
        elif not next_player.is_lord and len(next_player.cards) == 1:
            # Next player is not the lord and holds one card: lead a single.
            for group in self.candidates:
                if group.type == 'single':
                    self.discard(group.cards)
                    return self, group, False
            self.discard(self.candidates[0].cards)
            return self, self.candidates[0], False
        elif next_player.is_lord and len(next_player.cards) == 1:
            # Next player is the lord and holds one card: avoid singles.
            for group in self.candidates:
                if group.type != 'single':
                    self.discard(group.cards)
                    return self, group, False
            self.discard(self.candidates[-1].cards)
            return self, self.candidates[-1], False
        else:
            for group in self.candidates:
                if group.type != 'single' or \
                        Card.to_value(group.cards[0]) < Card.to_value('A'):
                    self.discard(group.cards)
                    return self, group, False
            self.discard(self.candidates[0].cards)
            return self, self.candidates[0], False
    elif not last_player.is_lord:
        # The last play came from a non-lord player.
        if CardGroup.folks(self.cards) <= 2:
            for c in self.candidates:
                if c.bigger_than(cards):
                    self.discard(c.cards)
                    return self, c, False
            return last_player, cards, True
        elif before_player.is_lord and last_player is not before_player:
            return last_player, cards, True
        else:
            for c in self.candidates:
                if c.bigger_than(cards) and cards.type not in ['bomb', 'bigbang'] \
                        and Card.to_value(c.cards[0]) < Card.to_value('A'):
                    self.discard(c.cards)
                    return self, c, False
            return last_player, cards, True
    else:
        # The last play came from the lord: try a non-bomb answer first.
        for c in self.candidates:
            if c.bigger_than(cards) and c.type not in ['bomb', 'bigbang']:
                self.discard(c.cards)
                return self, c, False
        # use bomb
        for c in self.candidates:
            if c.bigger_than(cards):
                self.discard(c.cards)
                return self, c, False
        return last_player, cards, True