def _initialize_cpe(
     self,
     reward_network,
     q_network_cpe,
     q_network_cpe_target,
     optimizer: Optimizer__Union,
 ) -> None:
     if self.calc_cpe_in_training:
         assert reward_network is not None, "reward_network is required for CPE"
         # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network`.
         self.reward_network = reward_network
         # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network_optimizer`.
         self.reward_network_optimizer = optimizer.make_optimizer_scheduler(
             self.reward_network.parameters())
         assert (
             q_network_cpe is not None and q_network_cpe_target is not None
         ), "q_network_cpe and q_network_cpe_target are required for CPE"
         # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe`.
         self.q_network_cpe = q_network_cpe
         # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_target`.
         self.q_network_cpe_target = q_network_cpe_target
         # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_optimizer`.
         self.q_network_cpe_optimizer = optimizer.make_optimizer_scheduler(
             self.q_network_cpe.parameters())
         num_output_nodes = len(self.metrics_to_score) * self.num_actions
         # pyre-fixme[16]: `RLTrainer` has no attribute `reward_idx_offsets`.
         self.reward_idx_offsets = torch.arange(
             0,
             num_output_nodes,
             self.num_actions,
             device=self.device,
             dtype=torch.long,
         )
     else:
         self.reward_network = None
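
The make_optimizer_scheduler calls above return a dictionary that later snippets unpack with ["optimizer"]. A minimal sketch of that pattern, using a hypothetical stand-in for the Optimizer__Union wrapper (not the ReAgent API itself):

import torch
import torch.nn as nn


class OptimizerUnionSketch:
    """Hypothetical stand-in mirroring how the trainers above call
    make_optimizer_scheduler(...) and index the result with ["optimizer"]."""

    @classmethod
    def default(cls) -> "OptimizerUnionSketch":
        return cls()

    def make_optimizer_scheduler(self, params):
        # Return the optimizer (and, in the real library, possibly a scheduler)
        # inside a dict so callers can pick out the pieces they need.
        return {"optimizer": torch.optim.Adam(params, lr=1e-3)}


net = nn.Linear(4, 2)
opt = OptimizerUnionSketch.default().make_optimizer_scheduler(net.parameters())["optimizer"]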
Example #2
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        minibatch_size: int = 1024,
        parameters: Seq2SlateParameters = field(  # noqa: B008
            default_factory=Seq2SlateParameters),
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        policy_gradient_interval: int = 1,
        print_interval: int = 100,
    ) -> None:
        self.seq2slate_net = seq2slate_net
        self.parameters = parameters
        self.use_gpu = use_gpu
        self.policy_gradient_interval = policy_gradient_interval
        self.print_interval = print_interval

        self.minibatch_size = minibatch_size
        self.minibatch = 0

        self.baseline_net = baseline_net
        self.baseline_warmup_num_batches = baseline_warmup_num_batches

        self.rl_opt = policy_optimizer.make_optimizer_scheduler(
            self.seq2slate_net.parameters())["optimizer"]
        self.rl_opt.zero_grad()
        if self.baseline_net:
            self.baseline_opt = baseline_optimizer.make_optimizer_scheduler(
                # pyre-fixme[16]: `Optional` has no attribute `parameters`.
                self.baseline_net.parameters())["optimizer"]
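
The constructor above zeroes rl_opt immediately and takes a policy_gradient_interval; a plausible reading (an assumption here, not the library's actual train step) is the usual gradient-accumulation pattern, sketched with illustrative tensors:

import torch
import torch.nn as nn

model = nn.Linear(8, 1)                      # stand-in for the policy network
rl_opt = torch.optim.Adam(model.parameters(), lr=1e-3)
policy_gradient_interval = 2
rl_opt.zero_grad()

for minibatch, batch in enumerate(torch.randn(6, 4, 8), start=1):
    loss = model(batch).mean()               # stand-in for the policy-gradient loss
    loss.backward()                          # gradients accumulate across minibatches
    if minibatch % policy_gradient_interval == 0:
        rl_opt.step()                        # apply the accumulated gradients
        rl_opt.zero_grad()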
 def __init__(
     self,
     imitator,
     use_gpu: bool = False,
     rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
     minibatch_size: int = 1024,
     minibatches_per_step: int = 1,
     optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
 ) -> None:
     super().__init__(rl, use_gpu=use_gpu)
     self.minibatch_size = minibatch_size
     self.minibatches_per_step = minibatches_per_step or 1
     self.imitator = imitator
     self.imitator_optimizer = optimizer.make_optimizer_scheduler(
         imitator.parameters())
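
For context, a minimal imitation (behavioral-cloning) update consistent with the constructor above; the loop body is a generic sketch, not the trainer's actual train() method, and the network shapes are illustrative:

import torch
import torch.nn as nn
import torch.nn.functional as F

num_actions = 4
imitator = nn.Linear(10, num_actions)            # stand-in policy network
imitator_optimizer = torch.optim.Adam(imitator.parameters(), lr=1e-3)

states = torch.randn(32, 10)                     # one minibatch of states
logged_actions = torch.randint(0, num_actions, (32,))

logits = imitator(states)
loss = F.cross_entropy(logits, logged_actions)   # imitate the logged actions
imitator_optimizer.zero_grad()
loss.backward()
imitator_optimizer.step()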
 def __init__(
     self,
     seq2slate_net: Seq2SlateTransformerNet,
     parameters: Seq2SlateParameters,
     minibatch_size: int,
     use_gpu: bool = False,
     policy_optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
     print_interval: int = 100,
 ) -> None:
     self.parameters = parameters
     self.use_gpu = use_gpu
     self.print_interval = print_interval
     self.seq2slate_net = seq2slate_net
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.optimizer = policy_optimizer.make_optimizer_scheduler(
         self.seq2slate_net.parameters())["optimizer"]
     # TODO: T62269969 add baseline_net in training
     self.kl_div_loss = nn.KLDivLoss(reduction="none")
 def __init__(
     self,
     seq2slate_net: Seq2SlateTransformerNet,
     minibatch_size: int = 1024,
     loss_reporter=None,
     use_gpu: bool = False,
     policy_optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
 ) -> None:
     self.loss_reporter = loss_reporter
     self.use_gpu = use_gpu
     self.seq2slate_net = seq2slate_net
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.optimizer = policy_optimizer.make_optimizer_scheduler(
         self.seq2slate_net.parameters())["optimizer"]
     self.log_softmax = nn.LogSoftmax(dim=1)
     self.kl_loss = nn.KLDivLoss(reduction="batchmean")
     if self.loss_reporter is None:
         self.loss_reporter = NoOpLossReporter()
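
The two modules created above compose as follows in standard PyTorch: nn.KLDivLoss expects log-probabilities as input and probabilities as target. The tensors below are illustrative only:

import torch
import torch.nn as nn

log_softmax = nn.LogSoftmax(dim=1)
kl_loss = nn.KLDivLoss(reduction="batchmean")

scores = torch.randn(8, 5)                              # raw model scores
target_probs = torch.softmax(torch.randn(8, 5), dim=1)  # reference distribution

loss = kl_loss(log_softmax(scores), target_probs)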
 def __init__(
     self,
     reward_net: ModelBase,
     optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default
     ),
     loss_type: LossFunction = LossFunction.MSE,
     reward_ignore_threshold: Optional[float] = None,
     weighted_by_inverse_propensity: bool = False,
 ) -> None:
     self.reward_net = reward_net
     self.minibatch = 0
     self.opt = optimizer.make_optimizer_scheduler(self.reward_net.parameters())[
         "optimizer"
     ]
     self.loss_type = loss_type
     self.reward_ignore_threshold = reward_ignore_threshold
     self.weighted_by_inverse_propensity = weighted_by_inverse_propensity
     self.loss_fn = _get_loss_function(
         loss_type, reward_ignore_threshold, weighted_by_inverse_propensity
     )
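
A note on the field(default_factory=...)  # noqa: B008 defaults used throughout these signatures: the pattern expresses a dataclass-style per-instance default (ReAgent's own config machinery resolves them; the plain-dataclass sketch below only illustrates the idea):

from dataclasses import dataclass, field
from typing import List


@dataclass
class TrainerConfigSketch:
    # default_factory builds a fresh default for every instance, avoiding the
    # shared-mutable-default pitfall that flake8-bugbear rule B008 warns about.
    learning_rates: List[float] = field(default_factory=lambda: [1e-3])


cfg = TrainerConfigSketch()          # cfg.learning_rates == [1e-3]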