def profile_model(model_path):
    """Benchmark the model's forward pass at several batch sizes.

    For a fresh game and a late-stage game, repeats the forward pass N
    times per batch size in B (module-level constants) and prints wall
    time, per-call latency, and throughput.

    Args:
        model_path: checkpoint path passed to load_diplomacy_model.
    """
    late_game = load_late_game()

    model = load_diplomacy_model(model_path, map_location="cuda", eval=True)

    for game_name, game in [("new_game", Game()), ("late_game", late_game)]:
        print("\n#", game_name)
        inputs = FeatureEncoder().encode_inputs([game])
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

        for batch_size in B:
            # Tile each encoded feature along a new leading batch dimension.
            b_inputs = {
                k: v.repeat((batch_size, ) + (1, ) * (len(v.shape) - 1))
                for k, v in inputs.items()
            }
            with torch.no_grad():
                # CUDA kernel launches are asynchronous: without an explicit
                # synchronize, time.time() would measure launch overhead
                # rather than actual GPU execution time.
                torch.cuda.synchronize()
                tic = time.time()
                for _ in range(N):
                    order_idxs, order_scores, cand_scores, final_scores = model(
                        **b_inputs, temperature=1.0)
                torch.cuda.synchronize()
                toc = time.time() - tic

                print(
                    f"[B={batch_size}] {toc}s / {N}, latency={1000*toc/N}ms, throughput={N*batch_size/toc}/s"
                )
    def __init__(
        self,
        *,
        model_path,
        value_model_path=None,
        max_batch_size,
        max_rollout_length=3,
        rollout_temperature,
        rollout_top_p=1.0,
        n_rollout_procs=70,
        device=0,
        mix_square_ratio_scoring=0,
        clear_old_all_possible_orders=False,
        **kwargs,
    ):
        """Set up rollout configuration, models, and the C++ thread pool.

        value_model_path defaults to None, in which case the policy model
        doubles as the value model.
        """
        super().__init__(**kwargs)

        # Rollout configuration.
        self.max_batch_size = max_batch_size
        self.max_rollout_length = max_rollout_length
        self.n_rollout_procs = n_rollout_procs
        self.rollout_temperature = rollout_temperature
        self.rollout_top_p = rollout_top_p
        self.mix_square_ratio_scoring = mix_square_ratio_scoring
        self.clear_old_all_possible_orders = clear_old_all_possible_orders

        if torch.cuda.is_available():
            self.device = parse_device(device)
        else:
            self.device = "cpu"

        # Loading model to gpu right away will load optimizer state we don't care about.
        self.model = load_diplomacy_model(model_path, eval=True)
        self.model.to(self.device)

        if value_model_path is None:
            # No dedicated value model: reuse the policy model.
            self.value_model = self.model
        else:
            # Loading model to gpu right away will load optimizer state we don't care about.
            self.value_model = load_diplomacy_model(value_model_path, eval=True)
            self.value_model.to(self.device)

        self.thread_pool = pydipcc.ThreadPool(
            n_rollout_procs, ORDER_VOCABULARY_TO_IDX, get_order_vocabulary_idxs_len()
        )
 def __init__(self,
              model_path,
              temperature,
              top_p=1.0,
              device=_DEFAULT_DEVICE):
     """Load the model onto `device` and record sampling parameters.

     A single-threaded pydipcc.ThreadPool is created for order encoding.
     """
     self.temperature = temperature
     self.top_p = top_p
     self.device = device
     self.model = load_diplomacy_model(
         model_path, map_location=device, eval=True
     )
     self.thread_pool = pydipcc.ThreadPool(
         1, ORDER_VOCABULARY_TO_IDX, get_order_vocabulary_idxs_len()
     )
    def _init_state(self, device):
        """Build the TrainerState: model (optionally re-initialized),
        optimizer, and the training args stored in the checkpoint."""
        model = load_diplomacy_model(self.cfg.model_path,
                                     map_location=device,
                                     eval=True)

        if self.cfg.reset_agent_weights:
            # Keep the architecture but discard the pretrained weights.
            def _reinitialize(module):
                if hasattr(module, "reset_parameters"):
                    module.reset_parameters()

            model.apply(_reinitialize)

        # NOTE(review): this unpickles the entire checkpoint a second time
        # just to read "args" — presumably acceptable at startup; confirm.
        checkpoint_args = torch.load(self.cfg.model_path,
                                     map_location="cpu")["args"]

        self.state = TrainerState(
            model=model,
            optimizer=build_optimizer(model, self.cfg.optimizer),
            epoch_id=0,
            global_step=0,
            args=checkpoint_args,
        )