def update_strategy_search(self, traverser, state, node_map, action_map, continuation, leaf=False):
    """Update the average-strategy counts for ``traverser`` during search.

    Walks the game tree from ``state``. On the traverser's turn, samples one
    action from the current regret-matched strategy and increments its
    ``strategy_sum`` tally; on opponents' turns, recurses down every valid
    action. Once ``leaf`` is True the walk uses ``continuation`` nodes (four
    abstract continuation actions) instead of ``node_map`` and stops recursing.

    Args:
        traverser: index of the player whose strategy counts are updated.
        state: current game state (project type; assumed to expose
            ``terminal``, ``turn``, ``info_set()``, ``valid_actions()``,
            ``take()`` and ``round`` — TODO confirm against State).
        node_map: per-player dict mapping info sets to regret Nodes.
        action_map: per-player dict caching each info set's valid actions.
        continuation: per-player dict of leaf continuation Nodes.
        leaf: True once the walk has crossed into a new betting round.
    """
    if state.terminal:
        return
    turn = state.turn
    info_set = state.info_set()
    # Cache the valid actions for this info set on first visit.
    if info_set not in action_map[turn]:
        action_map[turn][info_set] = {'actions': state.valid_actions()}
    valid_actions = action_map[turn][info_set]['actions']
    if leaf is True:
        # Leaf nodes choose among 4 abstract continuation strategies
        # (indices 0-3), not the concrete game actions.
        if info_set not in continuation[turn]:
            continuation[turn][info_set] = Node([i for i in range(4)])
        node = continuation[turn][info_set]
    else:
        if info_set not in node_map[turn]:
            node_map[turn][info_set] = Node(valid_actions)
        node = node_map[turn][info_set]
    strategy = node.strategy()
    if turn == traverser:
        # Sample a single action proportional to the current strategy
        # and count it toward the average strategy.
        actions = list(strategy.keys())
        probs = list(strategy.values())
        random_action = actions[np.random.choice(len(actions), p=probs)]
        node.strategy_sum[random_action] += 1
        new_state = state.take(random_action, deep=True)
        if leaf is False:
            # Becomes a leaf when the action advances the betting round.
            self.update_strategy_search(
                traverser, new_state, node_map, action_map, continuation,
                leaf=new_state.round != state.round)
    else:
        if leaf is False:
            # Opponent turns: explore all actions so the traverser's
            # reachable info sets are all visited.
            for action in valid_actions:
                new_state = state.take(action, deep=True)
                self.update_strategy_search(
                    traverser, new_state, node_map, action_map, continuation,
                    leaf=new_state.round != state.round)
def test_init():
    """A fresh Node produces a normalized strategy, tallies one visit,
    and starts with zero accumulated regret."""
    node = Node(['F', 'C', 'R'])
    current = node.strategy()
    assert sum(current.values()) == 1, node
    assert sum(node.strategy_sum.values()) == 1, node
    assert sum(node.regret_sum.values()) == 0, node
def test_weighting():
    """Calling strategy(w) accumulates the weight w into strategy_sum
    while the returned strategy stays normalized."""
    node = Node(['F', 'C', 'R'])
    weighted = node.strategy(.5)
    assert sum(weighted.values()) == 1, node
    assert sum(node.strategy_sum.values()) == .5, node.strategy_sum
    assert sum(node.regret_sum.values()) == 0, node
def test_regrets():
    """Positive regrets are regret-matched into the current strategy."""
    node = Node(['F', 'C', 'R'])
    node.regret_sum = {'F': .5, 'C': .5, 'R': 0}
    current = node.strategy()
    assert current == {'F': .5, 'C': .5, 'R': 0}, current
    assert sum(node.strategy_sum.values()) == 1, node.strategy_sum
def update_strategy(traverser, state, node_map, action_map):
    """Update the average-strategy counts for ``traverser`` (blueprint version).

    Recursively walks the game tree from ``state``. On the traverser's turn a
    single action is sampled from the regret-matched strategy and counted in
    ``strategy_sum``; on other players' turns every valid action is explored.
    This is the non-search counterpart of ``update_strategy_search`` (no leaf
    handling or continuation nodes).

    Args:
        traverser: index of the player whose average strategy is updated.
        state: current game state (project type — assumed to expose
            ``terminal``, ``turn``, ``info_set()``, ``valid_actions()`` and
            ``take()``; TODO confirm against State).
        node_map: per-player dict mapping info sets to regret Nodes.
        action_map: per-player dict caching each info set's valid actions.
    """
    if state.terminal:
        return
    turn = state.turn
    info_set = state.info_set()
    # Cache the valid actions for this info set on first visit.
    if info_set not in action_map[turn]:
        action_map[turn][info_set] = {'actions': state.valid_actions()}
    valid_actions = action_map[turn][info_set]['actions']
    if info_set not in node_map[turn]:
        node_map[turn][info_set] = Node(valid_actions)
    node = node_map[turn][info_set]
    strategy = node.strategy()
    if turn == traverser:
        # Sample one action in proportion to the current strategy and
        # count it toward the time-averaged strategy.
        actions = list(strategy.keys())
        probs = list(strategy.values())
        random_action = actions[np.random.choice(len(actions), p=probs)]
        node.strategy_sum[random_action] += 1
        new_state = state.take(random_action, deep=True)
        update_strategy(traverser, new_state, node_map, action_map)
    else:
        # Opponent turns: descend every action so all of the traverser's
        # reachable info sets get visited.
        for action in valid_actions:
            new_state = state.take(action, deep=True)
            update_strategy(traverser, new_state, node_map, action_map)
def test_average():
    """avg_strategy remains a probability distribution both before and
    after a weighted strategy update."""
    node = Node(['F', 'C', 'R'])
    avg = node.avg_strategy()
    assert sum(avg.values()) == 1, avg
    weighted = node.strategy(.5)
    assert sum(weighted.values()) == 1, node
    assert sum(node.strategy_sum.values()) == .5, node.strategy_sum
    assert sum(node.regret_sum.values()) == 0, node
    avg = node.avg_strategy()
    assert sum(avg.values()) == 1, avg
def test_update_strategy():
    """update_strategy visits the seeded nodes and accumulates strategy
    mass for the traverser (player 0) in a 3-card Kuhn game."""
    num_players = 2
    node_map = {i: {} for i in range(num_players)}
    action_map = {i: {} for i in range(num_players)}
    # Seed two of player 0's info sets with regrets so regret matching
    # produces non-uniform strategies.
    n1 = Node(['F', 'C', '1R'])
    n1.regret_sum = {'F': 0, 'C': 1, '1R': 0}
    n2 = Node(['F', 'C', '1R'])
    n2.regret_sum = {'F': 1, 'C': 0, '1R': 1}
    node_map[0]['As || [[]]'] = n1
    node_map[0]["As || [['C', '1R']]"] = n2
    cards = [Card(14, 1), Card(13, 1), Card(12, 1)]
    state = State(cards, num_players, 1, kuhn_eval)
    update_strategy(0, state, node_map, action_map)
    assert sum(n1.strategy_sum.values()) > 0, \
        f'Util\n{n1}, \nNodes\n{node_map}, Actions\n{json.dumps(action_map, indent=4)}'
    # BUG FIX: this failure message previously interpolated n1, hiding the
    # state of the node actually under test.
    assert sum(n2.strategy_sum.values()) > 0, \
        f'Util\n{n2}, \nNodes\n{node_map}, Actions\n{json.dumps(action_map, indent=4)}'
def accumulate_regrets(traverser, state, node_map, action_map, prune=False):
    """Accumulate counterfactual regrets for ``traverser`` (MCCFR traversal).

    On the traverser's turn every valid action is expanded (optionally
    skipping actions whose regret has fallen to REGRET_MIN or below when
    ``prune`` is set) and the regret of each explored action relative to the
    node's expected utility is added to ``regret_sum``. On other players'
    turns a single action is sampled from their current strategy.

    NOTE(review): a second, differently-signed ``accumulate_regrets`` exists
    in this file (vanilla CFR with reach probabilities); whichever is defined
    later shadows the other — confirm this duplication is intentional.

    Args:
        traverser: index of the player whose regrets are accumulated.
        state: current game state (project type).
        node_map: per-player dict mapping info sets to regret Nodes.
        action_map: per-player dict caching each info set's valid actions.
        prune: when True, skip actions with regret <= REGRET_MIN.

    Returns:
        Per-player utility vector (numpy array indexed by player; its length
        is len(node_map), i.e. the number of players).
    """
    if state.terminal:
        util = state.utility()
        return util
    turn = state.turn
    info_set = state.info_set()
    # Cache the valid actions for this info set on first visit.
    if info_set not in action_map[turn]:
        action_map[turn][info_set] = {'actions': state.valid_actions()}
    valid_actions = action_map[turn][info_set]['actions']
    if info_set not in node_map[turn]:
        node_map[turn][info_set] = Node(valid_actions)
    node = node_map[turn][info_set]
    strategy = node.strategy()
    if turn == traverser:
        util = {a: 0 for a in valid_actions}
        # One utility slot per player (node_map is keyed by player index).
        node_util = np.zeros(len(node_map))
        # Track which actions were actually expanded so only their regrets
        # are updated below (pruned actions keep their regret unchanged).
        explored = set(valid_actions)
        for action in valid_actions:
            if prune is True and node.regret_sum[action] <= REGRET_MIN:
                explored.remove(action)
            else:
                new_state = state.take(action, deep=True)
                returned = accumulate_regrets(traverser, new_state, node_map,
                                              action_map, prune=prune)
                util[action] = returned[turn]
                # Expected value over the current strategy.
                node_util += returned * strategy[action]
        for action in explored:
            # Counterfactual regret: action value minus expected value.
            regret = util[action] - node_util[turn]
            node.regret_sum[action] += regret
        return node_util
    else:
        # Non-traverser turns: sample a single action (external sampling).
        actions = list(strategy.keys())
        probs = list(strategy.values())
        random_action = actions[np.random.choice(len(actions), p=probs)]
        new_state = state.take(random_action, deep=True)
        return accumulate_regrets(traverser, new_state, node_map, action_map,
                                  prune=prune)
def accumulate_regrets(state, node_map, action_map, probs):
    """Accumulate counterfactual regrets for all players (vanilla CFR).

    Full-width traversal: every action at every node is expanded, the
    acting player's reach probability weights the average-strategy update
    (via ``node.strategy(probs[state.turn])``), and the product of the
    other players' reach probabilities weights the regret update.

    NOTE(review): shadows/is shadowed by the other ``accumulate_regrets``
    in this file (the traverser/prune MCCFR variant) — confirm which one
    callers intend to use.

    Args:
        state: current game state (project type).
        node_map: per-player dict mapping info sets to regret Nodes.
        action_map: per-player dict caching each info set's valid actions
            (stored directly as a list here, unlike the {'actions': ...}
            wrapper used elsewhere in this file).
        probs: per-player reach probabilities for reaching ``state``.

    Returns:
        Per-player utility vector (numpy array of length len(node_map)).
    """
    if state.terminal:
        util = state.utility()
        return util
    info_set = state.info_set()
    if info_set not in action_map[state.turn]:
        action_map[state.turn][info_set] = state.valid_actions()
    valid_actions = action_map[state.turn][info_set]
    if info_set not in node_map[state.turn]:
        node_map[state.turn][info_set] = Node(valid_actions)
    node = node_map[state.turn][info_set]
    # Acting player's reach probability weights the strategy_sum update.
    strategy = node.strategy(probs[state.turn])
    util = {a: 0 for a in valid_actions}
    node_util = np.zeros(len(node_map))
    for action in valid_actions:
        # Scale only the acting player's reach probability by the
        # probability of taking this action.
        new_prob = [
            p if i != state.turn else p * strategy[action]
            for i, p in enumerate(probs)
        ]
        new_state = state.take(action, deep=True)
        returned = accumulate_regrets(new_state, node_map, action_map,
                                      new_prob)
        util[action] = returned[state.turn]
        node_util += returned * strategy[action]
    # Counterfactual reach: product of everyone else's reach probabilities.
    reach_prob = 1
    for p, prob in enumerate(probs):
        if p != state.turn:
            reach_prob *= prob
    for action in valid_actions:
        regret = util[action] - node_util[state.turn]
        node.regret_sum[action] += regret * reach_prob
    return node_util
def pluribus_turn(self, state, blueprint, action_map, cards):
    """Play one of Pluribus's turns by sampling from the blueprint's
    average strategy, then freeze the chosen action for search.

    Args:
        state: current game state (project type); mutated in place by
            ``state.take(sampled)``.
        blueprint: per-player dict mapping info sets to blueprint Nodes.
        action_map: per-player dict caching each info set's valid actions;
            the sampled action is recorded under the 'frozen' key.
        cards: passed through to ``check_round`` (semantics defined there).
    """
    # Compute the info set and turn once; the originals re-called
    # state.info_set()/state.turn repeatedly, which was redundant and
    # risked inconsistency if the state were mutated mid-function.
    info_set = state.info_set()
    turn = state.turn
    if info_set not in action_map[turn]:
        action_map[turn][info_set] = {'actions': state.valid_actions()}
    valid_actions = action_map[turn][info_set]['actions']
    if info_set not in blueprint[turn]:
        blueprint[turn][info_set] = Node(valid_actions)
    node = blueprint[turn][info_set]
    # Sample from the time-averaged (blueprint) strategy, not the
    # current regret-matched one.
    strategy = node.avg_strategy()
    actions = list(strategy.keys())
    probs = list(strategy.values())
    sampled = actions[np.random.choice(len(actions), p=probs)]
    print(f"Pluribus played {sampled}")
    # Freeze the sampled action so search can restrict this info set to it.
    action_map[turn][info_set]['frozen'] = sampled
    state.take(sampled)
    self.check_round(state, self.root, blueprint, action_map, cards)
def accumulate_regrets_search(self, traverser, state, node_map, action_map, continuations, prune=False, leaf=False):
    """Accumulate counterfactual regrets for ``traverser`` during search.

    Like the plain ``accumulate_regrets`` MCCFR traversal, but once the walk
    crosses into a new betting round (``leaf``) it switches to continuation
    nodes over four abstract strategies ("NULL", "F", "C", "4R") and values
    actions via ``self.rollout`` instead of recursing further.

    Args:
        traverser: index of the player whose regrets are accumulated.
        state: current game state (project type).
        node_map: per-player dict mapping info sets to regret Nodes.
        action_map: per-player dict caching each info set's valid actions;
            may carry a 'frozen' action recorded by pluribus_turn.
        continuations: per-player dict of leaf continuation Nodes.
        prune: when True, skip non-leaf actions with regret <= REGRET_MIN.
        leaf: True once the walk has crossed into a new betting round.

    Returns:
        Per-player utility vector (numpy array of length len(node_map)).
    """
    if state.terminal:
        util = state.utility()
        return util
    turn = state.turn
    info_set = state.info_set()
    if info_set not in action_map[turn]:
        action_map[turn][info_set] = {'actions': state.valid_actions()}
    valid_actions = action_map[turn][info_set]['actions']
    # BUG FIX: the original tested `'fixed' in valid_actions`, i.e. searched
    # for the literal string 'fixed' among the game actions (never true), and
    # read a 'fixed' key. The action frozen by pluribus_turn is stored under
    # the 'frozen' key of the info-set entry, so check that dict and key.
    if 'frozen' in action_map[turn][info_set]:
        valid_actions = [action_map[turn][info_set]['frozen']]
    if leaf is True:
        # Leaf nodes choose among abstract continuation strategies.
        if info_set not in continuations[turn]:
            continuations[turn][info_set] = Node(["NULL", "F", "C", "4R"])
        node = continuations[turn][info_set]
        valid_actions = ["NULL", "F", "C", "4R"]
    else:
        if info_set not in node_map[turn]:
            node_map[turn][info_set] = Node(valid_actions)
        node = node_map[turn][info_set]
    strategy = node.strategy()
    if turn == traverser:
        util = {a: 0 for a in valid_actions}
        # One utility slot per player (node_map is keyed by player index).
        node_util = np.zeros(len(node_map))
        # Only actions actually expanded get their regret updated below.
        explored = set(valid_actions)
        for action in valid_actions:
            if prune is True and leaf is False and node.regret_sum[
                    action] <= REGRET_MIN:
                explored.remove(action)
            else:
                if leaf is True:
                    # Value leaf actions by rollout instead of recursion.
                    returned = self.rollout(traverser, state, action)
                else:
                    new_state = state.take(action, deep=True)
                    returned = self.accumulate_regrets_search(
                        traverser, new_state, node_map, action_map,
                        continuations, prune=prune,
                        leaf=new_state.round != state.round)
                util[action] = returned[turn]
                node_util += returned * strategy[action]
        for action in explored:
            # Counterfactual regret: action value minus expected value.
            regret = util[action] - node_util[turn]
            node.regret_sum[action] += regret
        return node_util
    else:
        if leaf is True:
            # Opponents at a leaf play the "NULL" continuation.
            return self.rollout(traverser, state, "NULL")
        # Non-traverser turns: sample a single action (external sampling).
        actions = list(strategy.keys())
        probs = list(strategy.values())
        random_action = actions[np.random.choice(len(actions), p=probs)]
        new_state = state.take(random_action, deep=True)
        return self.accumulate_regrets_search(
            traverser, new_state, node_map, action_map, continuations,
            prune=prune, leaf=new_state.round != state.round)