def test_merchant(self) -> None:
    self.game.new_game()
    p_state: PlayerState = self.game.state.player_states[0]

    # Inject cards into hand
    merchant = Merchant()
    first_silver = Silver()
    second_silver = Silver()
    self.game.state.inject(0, merchant)
    self.game.state.inject(0, first_silver)
    self.game.state.inject(0, second_silver)
    self.game.state.inject(0, Estate())
    self.game.state.inject(0, Estate())
    self.game.state.advance_next_decision()

    # Action Phase Decision -- Play Merchant
    r = DecisionResponse([merchant])
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    # Treasure Phase Decision -- Play All Treasures
    r = DecisionResponse([first_silver])
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    r = DecisionResponse([second_silver])
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    self.assertEqual(p_state.coins, 5)

def testChapelHeuristic(self) -> None:
    self.game.new_game()
    state: State = self.game.state
    state.inject(0, Chapel())
    state.advance_next_decision()

    # Action Phase decision: defaults to playing Chapel
    r: DecisionResponse = DecisionResponse([])
    self.players[0].makeDecision(state, r)
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    # Should auto trash 3 Copper and 1 Estate
    r = DecisionResponse([])
    self.players[0].makeDecision(state, r)
    self.game.state.process_decision(r)

    # Process TrashCard events
    self.game.state.advance_next_decision()

    n_copper = state.get_card_count(0, Copper)
    n_estate = state.get_card_count(0, Estate)
    self.assertTrue(n_copper == 3 or n_copper == 4)
    self.assertEqual(n_copper + n_estate, 6)

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    p: int = s.player
    if s.phase == Phase.ActionPhase:
        assert False, 'GreedyPlayer does not support action cards yet'
    elif s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
    else:
        choices = d.card_choices + [None]
        X = s.lookahead_batch_featurize(choices).cpu()
        label_idx = np.argmin(self.model.classes_) if p == 1 else np.argmax(self.model.classes_)
        y = self.model.predict_proba(X)
        if self.train:
            card = np.random.choice(choices, p=softmax(y[:, label_idx], t=self.tau))
        else:
            card = choices[np.argmax(y[:, label_idx])]
        response.single_card = card

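# The temperature-scaled softmax used above is not part of this listing. Below is a
# minimal sketch of such a helper, assuming the same softmax(x, t) signature seen in
# GreedyPlayer.makeDecision; the project's actual implementation may differ.
import numpy as np

def softmax(x: np.ndarray, t: float = 1.0) -> np.ndarray:
    # Scale by the temperature: small t sharpens toward argmax, large t flattens toward uniform.
    z = x / t
    # Subtract the max for numerical stability before exponentiating.
    z = z - np.max(z)
    e = np.exp(z)
    return e / e.sum()
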
def testGreedyActionHeuristic(self) -> None:
    self.game.new_game()
    state: State = self.game.state
    state.inject(0, Laboratory())
    state.inject(0, Village())
    state.inject(0, Smithy())
    state.advance_next_decision()

    # Action Phase: Play Lab
    r = DecisionResponse([])
    self.players[0].makeDecision(state, r)
    self.assertTrue(isinstance(r.cards[0], Laboratory))
    state.process_decision(r)
    state.advance_next_decision()

    r = DecisionResponse([])
    self.players[0].makeDecision(state, r)
    self.assertTrue(isinstance(r.cards[0], Village))
    state.process_decision(r)
    state.advance_next_decision()

    r = DecisionResponse([])
    self.players[0].makeDecision(state, r)
    self.assertTrue(isinstance(r.cards[0], Smithy))
    state.process_decision(r)
    state.advance_next_decision()

    self.assertEqual(state.get_zone_card_count(0, Zone.Hand), 10)

def makePhaseDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    player = d.controlling_player
    if s.phase == Phase.ActionPhase:
        self.heuristic.makeGreedyActionDecision(s, response)
    elif s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
    else:
        if not self.train:
            remove_first_card(Curse(), d.card_choices)
        response.single_card = self.heuristic.agenda.buy(s, player, d.card_choices)
    return

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    if s.phase == Phase.ActionPhase:
        assert False, 'MCTS does not support action cards yet'
    elif s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
    else:
        choices = d.card_choices + [None]
        # The next node in the tree is the one that maximizes the UCB1 score
        card = self.rollout.select(choices, state=s)
        response.single_card = card

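# The comment above refers to the UCB1 score. A minimal sketch of UCB1 for a child
# node is shown below, assuming hypothetical fields child.v (total value), child.n
# (visit count), and the parent's visit count parent_n; the GameTree's actual
# selection rule may differ in its exploration constant and tie-breaking.
import math

def ucb1(child, parent_n: int, c: float = math.sqrt(2)) -> float:
    # Unvisited children are explored first.
    if child.n == 0:
        return float('inf')
    # Exploitation term (average value) plus exploration bonus.
    return child.v / child.n + c * math.sqrt(math.log(parent_n) / child.n)
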
def step(self, action: DecisionResponse) -> Tuple[State, int, bool, Any]:
    s: State = self.game.state
    d: DecisionState = s.decision
    if s.phase != Phase.BuyPhase:
        raise ValueError('Cannot step from any phase other than Buy Phase.')

    p: Player = self.game.players[d.controlling_player].controller
    s.process_decision(action)
    s.advance_next_decision()

    # Skip all non-Buy phases until end of game
    while s.phase != Phase.BuyPhase and not self._done:
        response = DecisionResponse([])
        p = self.game.players[d.controlling_player].controller
        p.makeDecision(s, response)
        s.process_decision(response)
        s.advance_next_decision()

    reward = 0
    if self._done:
        p0win = self.game.is_winner(0)
        p1win = self.game.is_winner(1)
        if p0win and p1win:
            reward = 0
        elif p0win:
            reward = 1
        else:
            reward = -1

    return s, reward, self._done, None

def train_elog(env: Environment, epochs: int, train_epochs_interval: int):
    for epoch in tqdm(range(epochs)):
        state = env.reset()
        done = False
        data = {
            'features': [],
            'rewards': [],
            'cards': [],
            'idxs': state.feature.idxs
        }

        while not done:
            action = DecisionResponse([])
            d: DecisionState = state.decision
            player: Player = env.players[d.controlling_player]
            player.makeDecision(state, action)

            x = state.feature.to_numpy()
            data['features'].append(x)
            data['cards'].append(action.single_card)

            obs, reward, done, _ = env.step(action)

        data['rewards'].extend([reward] * (len(data['features']) - len(data['rewards'])))

        for player in env.players:
            if isinstance(player, RolloutPlayer):
                player.rollout.update(**data)
                if (epoch + 1) % train_epochs_interval == 0:
                    player.rollout.learn()

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    player: int = d.controlling_player
    if d.type != DecisionType.DecisionSelectCards and d.type != DecisionType.DecisionDiscreteChoice:
        logging.error('Invalid decision type')

    if not d.active_card:
        self.makePhaseDecision(s, response)
    elif s.events:
        event = s.events[-1]
        if isinstance(event, PutOnDeckDownToN):
            self.heuristic.makePutDownOnDeckDecision(s, response)
        elif isinstance(event, DiscardDownToN):
            self.heuristic.makeDiscardDownDecision(s, response)
        elif isinstance(event, RemodelExpand):
            if not event.trashed_card:
                def scoringFunction(card: Card):
                    if isinstance(card, Curse):
                        return 19
                    elif isinstance(card, Estate):
                        return 18
                    elif isinstance(card, VictoryCard):
                        return -200 + card.get_coin_cost()
                    return -card.get_coin_cost()

                response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
            else:
                response.cards.append(self.heuristic.agenda.forceBuy(s, player, d.card_choices))
    else:
        self.heuristic.makeBaseDecision(s, response)

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    p: int = s.player
    if s.phase == Phase.ActionPhase:
        assert False, 'MCTS does not support action cards yet'
    elif s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
    else:
        choices = d.card_choices + [None]
        X = s.lookahead_batch_featurize(choices)
        vals = self.model(X).detach().cpu().numpy()
        choice = self.select(p, choices, vals)
        response.single_card = choice

def makeCopyDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision

    def scoringFunction(card: Card):
        return card.get_coin_cost()

    response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)

def run(self):
    s = self.game.state
    d: DecisionState = s.decision
    tree_score = 0

    # Run the game up to game end or until the turn limit is reached
    while d.type != DecisionType.DecisionGameOver and s.player_states[0]._turns < self.T:
        if d.text:
            logging.info(d.text)

        response = DecisionResponse([])
        player = self.game.players[d.controlling_player]
        next_node = player.controller.makeDecision(s, response)

        if s.phase == Phase.BuyPhase:
            # Apply selection until a leaf node is reached
            if next_node:
                assert next_node == self.player.node
                self.player.node.n += 1
            elif not self.expanded:
                # Expand one node
                cards = list(filter(lambda x: not isinstance(x, Curse), d.card_choices + [None]))
                self.player.node.add_unique_children(cards)
                self.expanded = True
                self.player.node = self.player.node.get_child_node(response.single_card)
                self.player.node.n += 1
                # Uncomment to track UCT score within the tree
                tree_score = self.game.get_player_scores()[0]
                self.data.update_split_scores(tree_score, False, self.iter)
            elif self.rollout_model == Rollout.HistoryHeuristic:
                self.rollout_cards.append(response.single_card)

        s.process_decision(response)
        s.advance_next_decision()

    score = self.game.get_player_scores()[0]

    # Update data
    self.data.update_split_scores(score - tree_score, True, self.iter)

    # Backpropagate
    delta = score
    self.player.node.v += delta
    self.player.node = self.player.node.parent
    while self.player.node != self.player.root:
        self.player.node.update_v(lambda x: sum(x) / len(x))
        self.player.node = self.player.node.parent

    # Update history heuristic
    if self.rollout_model == Rollout.HistoryHeuristic:
        self.rollout.update(cards=self.rollout_cards, score=score)
    elif self.rollout_model == Rollout.LinearRegression:
        counts = self.game.state.get_card_counts(0)
        self.rollout.update(counts=counts, score=score, i=self.iter)

    return self.game.get_player_scores()[0]

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    if s.phase == Phase.ActionPhase:
        if not d.active_card:
            self.heuristic.makeGreedyActionDecision(s, response)
        elif s.events:
            event = s.events[-1]
            if isinstance(event, DiscardDownToN):
                self.heuristic.makeDiscardDownDecision(s, response)
            elif isinstance(event, MoatReveal):
                self.heuristic.makeBaseDecision(s, response)
            else:
                raise ValueError(f'Event {type(event)} not supported')
        else:
            self.heuristic.makeBaseDecision(s, response)
    elif s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
    else:
        # Remove Curse
        choices = list(filter(lambda x: not isinstance(x, Curse), d.card_choices + [None]))

        # Rollout (out-of-tree) case; tree actually isn't that good
        if not self.tree.in_tree or not self.use_tree:
            logging.log(level=BUY, msg='Rollout')
            response.single_card = self.rollout.select(choices, state=s)
            return

        # The next node in the tree is the one that maximizes the UCB1 score
        try:
            # Remove Copper and Victory cards -- tree never gets that deep anyways
            tree_choices = list(
                filter(lambda x: not isinstance(x, Copper) and not issubclass(type(x), VictoryCard), choices))
            card = self.tree.select(tree_choices)
            logging.log(level=BUY, msg=f'Selection: {self.tree.node.n}')
        except ValueError:
            card = self.rollout.select(choices, state=s)
        response.single_card = card

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    p: int = s.player
    if s.phase == Phase.ActionPhase:
        assert False, 'GreedyMLPPlayer does not support action cards yet'
    elif s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
    else:
        choices = d.card_choices + [None]
        X = s.lookahead_batch_featurize(choices)
        label_idx = 0 if p == 1 else 2
        y_pred = self.model.forward(X)
        card_idx = torch.argmax(y_pred[:, label_idx])
        response.single_card = choices[card_idx]

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    if s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
        return

    if d.type == DecisionType.DecisionSelectCards:
        cardsToPick = -1
        d.print_card_choices()
        while cardsToPick < d.min_cards or cardsToPick > d.max_cards:
            text = ''
            while not text:
                text = input(f'Pick between {d.min_cards} and {d.max_cards} of the above cards:\n')
            cardsToPick = int(text)

        responseIdxs = []
        for i in range(cardsToPick):
            cardIdx = -1
            while cardIdx == -1 or cardIdx in responseIdxs or cardIdx >= len(d.card_choices):
                d.print_card_choices()
                text = ''
                while not text:
                    text = input('Choose another card:\n')
                cardIdx = int(text)
            responseIdxs.append(cardIdx)
            response.cards.append(d.card_choices[cardIdx])
    elif d.type == DecisionType.DecisionDiscreteChoice:
        choice = -1
        while choice == -1 or choice > d.min_cards:
            text = ''
            while not text:
                text = input('Please make a discrete choice from the above cards:\n')
            choice = int(text)
            d.print_card_choices()
        response.choice = choice
    else:
        logging.error(f'Player {s.player} given invalid decision type.')

def simulate(env: Environment,
             n: int,
             tree: GameTree,
             turn_log=False,
             action_log=False,
             card_log=False) -> SimulationData:
    # TODO: Clean this up
    sim_data = SimulationData(Supply(env.config).get_supply_card_types())

    for i in tqdm(range(n)):
        state: State = env.reset()
        if tree:
            tree.reset(state)
        done = False
        t_start = time.time()
        starting_player_buy = None

        while not done:
            action: DecisionResponse = DecisionResponse([])
            d: DecisionState = state.decision
            pid: int = d.controlling_player
            player = env.players[pid]
            player.makeDecision(state, action)

            if state.phase == Phase.ActionPhase:
                # +1 to turns to get current turn
                sim_data.update_action(i, pid, state.player_states[pid].turns + 1, action.cards[0])

            if state.phase == Phase.BuyPhase and tree:
                tree.advance(action.single_card)

            log_buy = (state.phase == Phase.BuyPhase)

            obs, reward, done, _ = env.step(action)

            if turn_log and log_buy:
                if pid == 0:
                    starting_player_buy = action.single_card
                else:
                    sim_data.update_turn(i, 0, state.player_states[0].turns, state.get_player_score(0),
                                         starting_player_buy, state.get_coin_density(0))
                    sim_data.update_turn(i, 1, state.player_states[1].turns, state.get_player_score(1),
                                         action.single_card, state.get_coin_density(1))

            if card_log and log_buy:
                if pid == 1:
                    sim_data.update_card(i, 0, state.player_states[0].turns, state.get_card_counts(0))
                    sim_data.update_card(i, 1, state.player_states[1].turns, state.get_card_counts(1))

        # Log the starting player's final turn if the game ended on an extra turn
        if state.player_states[0].turns > state.player_states[1].turns:
            sim_data.update_card(i, 0, state.player_states[0].turns, state.get_card_counts(0))
            sim_data.update_turn(i, 0, state.player_states[0].turns, state.get_player_score(0),
                                 starting_player_buy, state.get_coin_density(0))

        t_end = time.time()
        sim_data.update(env.game, t_end - t_start)

    sim_data.finalize(env.game)
    print('===SUMMARY===')
    print(sim_data.summary)
    return sim_data

def makeDiscardDownDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision

    def scoringFunction(card: Card):
        if isinstance(card, VictoryCard):
            return 20
        elif isinstance(card, Curse):
            return 19
        elif isinstance(card, Copper):
            return 18
        return -card.get_coin_cost()

    response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)

def run(self, T=None):
    d = self.state.decision
    self.state.advance_next_decision()
    while d.type != DecisionType.DecisionGameOver:
        if T is not None and all(t.turns >= T for t in self.state.player_states):
            break
        if d.text:
            logging.info(d.text)
        response = DecisionResponse([])
        player = self.players[self.state.decision.controlling_player]
        player.controller.makeDecision(self.state, response)
        self.state.process_decision(response)
        self.state.advance_next_decision()

def makeDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision

    # Do not allow RandomPlayer to purchase curses
    if s.phase == Phase.BuyPhase and not self.train:
        remove_first_card(Curse(), d.card_choices)

    # Ensure random player plays all treasures
    if s.phase == Phase.TreasurePhase:
        response.single_card = d.card_choices[0]
        return

    if d.type == DecisionType.DecisionSelectCards:
        cards_to_pick = d.min_cards
        if d.max_cards > d.min_cards:
            cards_to_pick = random.randint(d.min_cards, d.max_cards)
        response.cards = random.sample(d.card_choices, k=min(cards_to_pick, len(d.card_choices)))
    elif d.type == DecisionType.DecisionDiscreteChoice:
        response.choice = random.randint(0, d.min_cards)
    else:
        logging.error('Invalid decision type')

def makeGreedyActionDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    assert d.min_cards == 0 and d.max_cards == 1, 'Invalid decision parameters'

    def scoringFunction(card: Card):
        '''Play all cantrips first, then greedily'''
        cantrip_bonus = 7
        score = min(card.get_coin_cost(), 6)
        if is_cantrip(card):
            score += cantrip_bonus
        return score

    cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
    response.cards = cards

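# Worked example of the scoring above, assuming is_cantrip treats Laboratory
# (+2 Cards, +1 Action) as a cantrip: Laboratory scores min(5, 6) + 7 = 12,
# Village scores min(3, 6) + 7 = 10, and Smithy (no +Action) scores min(4, 6) = 4,
# which matches the Laboratory -> Village -> Smithy play order asserted in
# testGreedyActionHeuristic.
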
def makePutDownOnDeckDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision

    def scoringFunction(card: Card):
        if has_excess_actions(s.decision.card_choices):
            if isinstance(card, ActionCard):
                return 100 - card.get_plus_actions()
            return -card.get_coin_cost()
        elif has_treasure_cards(s.decision.card_choices):
            if isinstance(card, TreasureCard):
                return 100 - card.get_treasure()
            return -card.get_coin_cost()
        else:
            return -card.get_coin_cost()

    response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)

def reset(self, **kwargs) -> State:
    if self.randomize_player_order:
        np.random.shuffle(self.players)
    self.game = Game(self.config, self.players)
    self.game.new_game()
    self.game.state.advance_next_decision()

    s: State = self.game.state
    d: DecisionState = s.decision

    # Advance past any initial non-Buy decisions so stepping starts in the Buy Phase
    while s.phase != Phase.BuyPhase and not self._done:
        response = DecisionResponse([])
        p = self.game.players[d.controlling_player].controller
        p.makeDecision(s, response)
        s.process_decision(response)
        s.advance_next_decision()

    return self.game.state

def test_vassal_effect_play_action(self) -> None:
    self.game.new_game()
    p_state: PlayerState = self.game.state.player_states[0]
    opp_state: PlayerState = self.game.state.player_states[1]

    card = Bandit()
    p_state._deck[-1] = card
    first_discarded = opp_state._deck[-1]
    second_discarded = opp_state._deck[-2]

    effect = VassalEffect()

    # Play Bandit
    r = DecisionResponse([], 1)
    effect.play_action(self.game.state)
    self.game.state.advance_next_decision()
    self.game.state.process_decision(r)

    # Process Bandit events
    self.game.state.advance_next_decision()

    self.assertIn(card, p_state._play_area)
    self.assertIn(first_discarded, opp_state._discard)
    self.assertIn(second_discarded, opp_state._discard)

def sample_training_batch(n: int,
                          p: float,
                          config: GameConfig,
                          players: Iterable[Player],
                          win_loss=False) -> Tuple[np.array, np.array]:
    env = DefaultEnvironment(config, players)
    X = []
    y = []
    rng = np.random.default_rng()

    print('Generating training data from self-play...')
    for epoch in tqdm(range(n)):
        state: State = env.reset()
        done = False
        while not done:
            action = DecisionResponse([])
            d = state.decision
            player = players[d.controlling_player]
            player.makeDecision(state, action)
            obs, reward, done, _ = env.step(action)

            feature = obs.feature.to_numpy()
            if 0 < p <= 1:
                if rng.uniform(0, 1) < p:
                    X.append(feature)
            else:
                if obs.player_states[d.controlling_player].turns < p:
                    X.append(feature)
            if p <= 0:
                X.append(feature)

        y.extend([reward] * (len(X) - len(y)))

    y = np.array(y)
    if win_loss:
        y[y == -1] = 0

    return np.array(X), y

def test_event_sentry(self) -> None:
    self.game.new_game()

    # Inject Sentry in player's hand
    sentry = Sentry()
    self.game.state.inject(0, sentry)
    self.game.state.advance_next_decision()

    # Action Phase Decision
    r = DecisionResponse([])
    r.cards = [sentry]
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    # Choose to trash one card
    d = self.game.state.decision
    trashed = d.card_choices[0]
    r = DecisionResponse([trashed])
    self.game.state.process_decision(r)

    # Trash card
    self.game.state.advance_next_decision()
    self.assertEqual(self.game.state.trash, [trashed])

    # Choose to discard one card
    d = self.game.state.decision
    discarded = d.card_choices[0]
    r = DecisionResponse([discarded])
    self.game.state.process_decision(r)

    # Discard card
    self.game.state.advance_next_decision()

    d = self.game.state.decision
    p_state: PlayerState = self.game.state.player_states[0]
    self.assertEqual(p_state._discard, [discarded])
    self.assertIsNone(d.active_card)

def test_moat_reveal(self) -> None:
    self.game.new_game()

    # Inject necessary cards into players' hands
    attack_card = Militia()
    moat_card = Moat()
    self.game.state.inject(0, attack_card)
    self.game.state.inject(1, moat_card)
    self.game.state.advance_next_decision()

    # Action Phase decision
    r = DecisionResponse([])
    r.cards = [attack_card]
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    # MoatReveal reaction
    r = DecisionResponse([])
    r.choice = 0
    self.game.state.process_decision(r)
    self.game.state.advance_next_decision()

    self.assertEqual(self.game.state.events, [])

def makeBaseDecision(self, s: State, response: DecisionResponse):
    d: DecisionState = s.decision
    card = d.active_card
    player = s.decision.controlling_player
    p_state: PlayerState = s.player_states[player]

    if isinstance(card, Cellar):
        num_discarded = 0
        for c in d.card_choices:
            if isinstance(c, VictoryCard) or c.get_coin_cost() < 2:
                response.cards.append(c)
    elif isinstance(card, Chapel):
        treasureValue = s.get_total_coin_count(player)
        trashCoppers = (treasureValue > 3)
        num_discarded = 0
        for c in d.card_choices:
            trashCoppers = (treasureValue > 3)
            if num_discarded == 4:
                break
            if isinstance(c, Curse):
                response.cards.append(c)
                num_discarded += 1
            elif isinstance(c, Copper) and trashCoppers:
                response.cards.append(c)
                num_discarded += 1
                treasureValue -= 1
            elif isinstance(c, Estate):
                response.cards.append(c)
                num_discarded += 1
            elif isinstance(c, Chapel):
                response.cards.append(c)
                num_discarded += 1
    elif isinstance(card, Moat):
        response.choice = 0
    elif isinstance(card, Bureaucrat):
        response.cards.append(d.card_choices[0])
    elif isinstance(card, Militia):
        self.makeDiscardDownDecision(s, response)
    elif isinstance(card, ThroneRoom):
        self.makeCopyDecision(s, response)
    elif isinstance(card, Library):
        if s.player_states[s.player].actions == 0:
            response.choice = 0
        else:
            response.choice = 1
    elif isinstance(card, Mine):
        event = s.events[-1]
        if not event.trashed_card:
            def scoringFunction(card: Card):
                if isinstance(card, Gold) and s.supply[Gold] > 0:
                    return 20
                if isinstance(card, Silver) and s.supply[Silver] > 0:
                    return 19
                if isinstance(card, Copper) and s.supply[Copper] > 0:
                    return 18
                return -card.get_coin_cost()

            response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
        else:
            response.cards.append(self.agenda.forceBuy(s, player, d.card_choices))
    elif isinstance(card, Harbinger):
        def scoringFunction(card: Card):
            if has_excess_actions(p_state.hand):
                if isinstance(card, ActionCard):
                    return 100 + card.get_coin_cost()
                else:
                    return card.get_coin_cost()
            else:
                return card.get_coin_cost()

        response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
    elif isinstance(card, Artisan):
        event = s.events[-1]
        if not event.gained_card:
            response.cards.append(self.agenda.forceBuy(s, player, d.card_choices))
        else:
            self.makePutDownOnDeckDecision(s, response)
    elif isinstance(card, Poacher):
        self.makeDiscardDownDecision(s, response)
    else:
        logging.error('Unexpected decision')

def train_mcts(env: Environment,
               tree: GameTree,
               path: str,
               rollout_path: str,
               epochs: int,
               train_epochs_interval: int = 1000,
               train_epochs_cap=10000,
               save_epochs=1000,
               scoring='win_loss'):
    for epoch in tqdm(range(epochs)):
        state: State = env.reset()
        tree.reset(state)
        done = False
        expanded = False
        flip = False
        data = {
            'features': [],
            'rewards': [],
            'cards': [],
            'idxs': state.feature.idxs
        }
        data['model_name'] = os.path.split(path)[-1]

        while not done:
            action = DecisionResponse([])
            d: DecisionState = state.decision
            player: Player = env.players[d.controlling_player]

            # Add any states now visible due to randomness
            if tree.in_tree:
                cards = d.card_choices + [None]
                tree.node.add_unique_children(cards)

            player.makeDecision(state, action)

            if isinstance(player, MCTSPlayer):
                x = state.feature.to_numpy()
                data['features'].append(x)
                data['cards'].append(action.single_card)

            # Advance to the next node within the tree, implicitly adding a node the first time we exit the tree
            if tree.in_tree:
                tree.advance(action.single_card)

            # First time we go out of tree, enter rollout phase
            if not expanded and not tree.in_tree:
                # Previous node is starting player action, so current node is opponent player action.
                flip = (state.player == 1)
                expanded = True

            obs, reward, done, _ = env.step(action)

        data['rewards'].extend([reward] * (len(data['features']) - len(data['rewards'])))

        start_idx = 1 if flip else 0
        p0_score, p1_score = state.get_player_score(0), state.get_player_score(1)

        if scoring == 'score':
            p0_reward, p1_reward = p0_score, p1_score
        elif scoring == 'win_loss':
            if reward == 0:
                p0_reward, p1_reward = 1 / 2, 1 / 2
            elif reward == 1:
                p0_reward, p1_reward = 1, 0
            else:
                p0_reward, p1_reward = 0, 1
        elif scoring == 'score_ratio':
            min_score = min(p0_score, p1_score)
            if min_score < 0:
                p0_score_nonneg, p1_score_nonneg = p0_score + abs(min_score), p1_score + abs(min_score)
            else:
                p0_score_nonneg, p1_score_nonneg = p0_score, p1_score
            if p0_score_nonneg == 0 and p1_score_nonneg == 0:
                p0_reward, p1_reward = 0, 0
            else:
                total_score = p0_score_nonneg + p1_score_nonneg
                p0_reward, p1_reward = p0_score / total_score, p1_score / total_score

        tree.node.backpropagate((p0_reward, p1_reward), start_idx=start_idx)

        if save_epochs > 0 and epoch % save_epochs == 0:
            save(path, tree._root)
            for player in env.players:
                if isinstance(player, MCTSPlayer):
                    player.rollout.save(rollout_path)
                    break  # MCTS players share the tree, so only update once

        for player in env.players:
            if isinstance(player, MCTSPlayer):
                player.rollout.update(**data)
                if (epoch + 1) % train_epochs_interval == 0 and (epoch + 1) < train_epochs_cap:
                    player.rollout.learn()

    for player in env.players:
        if isinstance(player, MCTSPlayer):
            player.rollout.save(rollout_path)
            break

    save(path, tree._root)

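# Worked example of the scoring modes above, with illustrative final scores
# p0_score=20, p1_score=14 and env reward=1 (player 0 won):
#   scoring='score'       -> (p0_reward, p1_reward) = (20, 14)
#   scoring='win_loss'    -> (1, 0); a tie (reward=0) would give (1/2, 1/2)
#   scoring='score_ratio' -> (20/34, 14/34) ~= (0.59, 0.41)
# These per-player rewards are then backpropagated through the shared game tree.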