Example #1
    def train(self,
              n: int,
              output_iters: int,
              save_model=False,
              model_dir=model_dir,
              model_name='mcts',
              save_data=False,
              data_dir=data_dir,
              data_name='data'):

        avg = 0
        for i in tqdm(range(n)):
            # initialize new game
            self.reset(i)
            self.run()
            self.data.update(self.game, self.player, i)

            avg = sum(self.data.scores) / (i + 1)

            if i > 0 and i % output_iters == 0:
                print(
                    f'Last {output_iters} avg: {sum(self.data.scores[i-output_iters:i]) / output_iters}'
                )
                print(f'Total {i} avg: {avg}')

        if save_model:
            save(os.path.join(model_dir, model_name), self.player.root)
            save(os.path.join(model_dir, f'{model_name}_rollout'),
                 self.rollout)
        if save_data:
            self.data.update_dataframes()
            self.data.augment_avg_scores(100)
            save(os.path.join(data_dir, data_name), self.data)
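The reporting inside this training loop can be reproduced in isolation. A minimal standalone sketch, using a made-up scores list in place of self.data.scores (the values and names below are illustrative, not from the source):

# scores stands in for self.data.scores; output_iters plays the same role as in train().
scores = [3, 5, 2, 8, 6, 4, 7, 1, 9]
output_iters = 3
for i in range(len(scores)):
    avg = sum(scores[:i + 1]) / (i + 1)
    if i > 0 and i % output_iters == 0:
        window = scores[i - output_iters:i]
        print(f'Last {output_iters} avg: {sum(window) / output_iters}')
        print(f'Total {i} avg: {avg}')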
def main(args: ArgumentParser):
    if args.debug:
        logging.basicConfig(level=logging.INFO)

    config = GameConfig(prosperity=args.prosperity, num_players=len(args.players), sandbox=args.sandbox, feature_type=args.ftype, device=args.device)

    if args.tree_path:
        tree = GameTree.load(args.tree_path, False)
    else:
        tree = None

    players = load_players(args.players, args.models, tree=tree, train=False, rollout_type=args.rollout_type)
    logger = logging.getLogger()

    if args.log_buys:
        logger.setLevel(BUY)

    env = DefaultEnvironment(config, players, logger=logger)
    sim_data = simulate(env, args.n, tree)

    if args.save_data:
        save(args.data_path, sim_data)
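main() only reads attributes off args (debug, prosperity, players, models, sandbox, ftype, device, tree_path, rollout_type, log_buys, n, save_data, data_path). A parser along the following lines would supply them; the flag spellings, types, and defaults are assumptions, only the attribute names come from the function body:

import argparse

def build_parser() -> argparse.ArgumentParser:
    # Hypothetical parser: every destination matches an attribute main() uses; defaults are guesses.
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--prosperity', action='store_true')
    parser.add_argument('--players', nargs='+', required=True)
    parser.add_argument('--models', nargs='*', default=[])
    parser.add_argument('--sandbox', action='store_true')
    parser.add_argument('--ftype', default='full')
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--tree-path', default=None)
    parser.add_argument('--rollout-type', default='random')
    parser.add_argument('--log-buys', action='store_true')
    parser.add_argument('-n', type=int, default=100)
    parser.add_argument('--save-data', action='store_true')
    parser.add_argument('--data-path', default='sim_data')
    return parser

if __name__ == '__main__':
    main(build_parser().parse_args())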
def train_mcts(env: Environment,
               tree: GameTree,
               path: str,
               rollout_path: str,
               epochs: int,
               train_epochs_interval: int = 1000,
               train_epochs_cap=10000,
               save_epochs=1000,
               scoring='win_loss'):
    for epoch in tqdm(range(epochs)):
        state: State = env.reset()
        tree.reset(state)
        done = False
        expanded = False
        flip = False
        data = {
            'features': [],
            'rewards': [],
            'cards': [],
            'idxs': state.feature.idxs
        }
        data['model_name'] = os.path.split(path)[-1]
        while not done:
            action = DecisionResponse([])
            d: DecisionState = state.decision
            player: Player = env.players[d.controlling_player]

            # Add any states now visible due to randomness
            if tree.in_tree:
                cards = d.card_choices + [None]
                tree.node.add_unique_children(cards)

            player.makeDecision(state, action)

            if isinstance(player, MCTSPlayer):
                x = state.feature.to_numpy()
                data['features'].append(x)
                data['cards'].append(action.single_card)

            # Advance to the next node within the tree, implicitly adding a node the first time we exit tree
            if tree.in_tree:
                tree.advance(action.single_card)

            # First time we go out of tree, enter rollout phase
            if not expanded and not tree.in_tree:
                # Previous node is starting player action, so current node is opponent player action.
                flip = (state.player == 1)
                expanded = True

            obs, reward, done, _ = env.step(action)

        data['rewards'].extend([reward] *
                               (len(data['features']) - len(data['rewards'])))
        start_idx = 1 if flip else 0
        p0_score, p1_score = state.get_player_score(0), state.get_player_score(1)
        if scoring == 'score':
            p0_reward, p1_reward = p0_score, p1_score
        elif scoring == 'win_loss':
            if reward == 0:
                p0_reward, p1_reward = 1 / 2, 1 / 2
            elif reward == 1:
                p0_reward, p1_reward = 1, 0
            else:
                p0_reward, p1_reward = 0, 1
        elif scoring == 'score_ratio':
            # Shift both scores so the smaller one is non-negative before taking the ratio.
            min_score = min(p0_score, p1_score)
            if min_score < 0:
                p0_score_nonneg = p0_score + abs(min_score)
                p1_score_nonneg = p1_score + abs(min_score)
            else:
                p0_score_nonneg, p1_score_nonneg = p0_score, p1_score
            if p0_score_nonneg == 0 and p1_score_nonneg == 0:
                p0_reward, p1_reward = 0, 0
            else:
                total_score = p0_score_nonneg + p1_score_nonneg
                p0_reward = p0_score_nonneg / total_score
                p1_reward = p1_score_nonneg / total_score

        tree.node.backpropagate((p0_reward, p1_reward), start_idx=start_idx)

        if save_epochs > 0 and epoch % save_epochs == 0:
            save(path, tree._root)

            for player in env.players:
                if isinstance(player, MCTSPlayer):
                    player.rollout.save(rollout_path)
                    break

        # mcts players share the tree, so only update once
        for player in env.players:
            if isinstance(player, MCTSPlayer):
                player.rollout.update(**data)
                if (epoch + 1) % train_epochs_interval == 0 and (
                        epoch + 1) < train_epochs_cap:
                    player.rollout.learn()

    for player in env.players:
        if isinstance(player, MCTSPlayer):
            player.rollout.save(rollout_path)
            break
    save(path, tree._root)
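The end-of-game reward assignment above can be factored into a standalone helper. A minimal sketch mirroring the three scoring branches (the function name and signature below are illustrative, not part of the source):

from typing import Tuple

def assign_rewards(reward: int, p0_score: float, p1_score: float,
                   scoring: str = 'win_loss') -> Tuple[float, float]:
    """Mirror of the per-epoch reward assignment in train_mcts (illustrative only)."""
    if scoring == 'score':
        return p0_score, p1_score
    if scoring == 'win_loss':
        # reward encodes the outcome: 0 = tie, 1 = player 0 wins, anything else = player 1 wins.
        if reward == 0:
            return 0.5, 0.5
        return (1.0, 0.0) if reward == 1 else (0.0, 1.0)
    if scoring == 'score_ratio':
        # Shift both scores so the smaller one is non-negative, then normalize.
        shift = abs(min(p0_score, p1_score, 0))
        p0, p1 = p0_score + shift, p1_score + shift
        total = p0 + p1
        return (0.0, 0.0) if total == 0 else (p0 / total, p1 / total)
    raise ValueError(f'unknown scoring mode: {scoring}')

# A tie under win_loss scoring splits the reward evenly.
assert assign_rewards(0, 30, 30) == (0.5, 0.5)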
Example #4
    def save(self, path: str):
        state_dict = {}
        state_dict['models'] = self.models
        save(path, state_dict)
Example #5
    def save(self, path: str):
        state_dict = {'mast': self.mast, 'tau': self.tau}
        save(path, state_dict)
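Both save methods delegate to a module-level save helper whose implementation is not shown in these examples. A plausible pickle-based sketch of that helper and a matching load counterpart, offered purely as an assumption about its behavior:

import os
import pickle
from typing import Any

def save(path: str, obj: Any) -> None:
    # Assumed behavior: serialize obj to path with pickle, creating parent directories as needed.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load(path: str) -> Any:
    with open(path, 'rb') as f:
        return pickle.load(f)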