Example #1
 def handle(self, tdp: TrainingDataPage) -> None:
     if not self.trainer.calc_cpe_in_training:
         return
     if isinstance(tdp, TrainingDataPage):
         if isinstance(self.trainer, DQNTrainer):
             # This is required until we get rid of TrainingDataPage
             if self.trainer.maxq_learning:
                 edp = EvaluationDataPage.create_from_training_batch(
                     tdp.as_discrete_maxq_training_batch(), self.trainer)
             else:
                 edp = EvaluationDataPage.create_from_training_batch(
                     tdp.as_discrete_sarsa_training_batch(), self.trainer)
         else:
             edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
     elif isinstance(tdp, TrainingBatch):
         if isinstance(self.trainer, SACTrainer):
             # TODO: Implement CPE for continuous algos
             edp = None
         else:
             edp = EvaluationDataPage.create_from_training_batch(
                 tdp, self.trainer)
     if self.evaluation_data is None:
         self.evaluation_data = edp
     else:
         self.evaluation_data = self.evaluation_data.append(edp)
Example #2
 def handle(self, tdp: TrainingBatch) -> None:
     if not self.trainer.calc_cpe_in_training:
         return
     if isinstance(tdp, TrainingBatch):
         if isinstance(self.trainer, DQNTrainer):
             # This is required until we get rid of TrainingBatch
             if self.trainer.maxq_learning:
                 edp = EvaluationDataPage.create_from_training_batch(
                     tdp, self.trainer
                 )
             else:
                 edp = EvaluationDataPage.create_from_training_batch(
                     tdp, self.trainer
                 )
         else:
             edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
     elif isinstance(tdp, TrainingBatch):
         # TODO: Perhaps we can make an RLTrainer param to check if continuous?
         if isinstance(self.trainer, SACTrainer):
             # TODO: Implement CPE for continuous algos
             edp = None
         else:
             edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
     if self.evaluation_data is None:
         self.evaluation_data = edp
     else:
         self.evaluation_data = self.evaluation_data.append(edp)
Example #3
    def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
        cpe_details = CpeDetails()

        self.score_cpe("Reward", edp, cpe_details.reward_estimates)

        if (
            self.metrics_to_score is not None
            and edp.logged_metrics is not None
            and self.action_names is not None
        ):
            for i, metric in enumerate(self.metrics_to_score):
                logger.info(
                    "--------- Running CPE on metric: {} ---------".format(metric)
                )

                metric_reward_edp = edp.set_metric_as_reward(i, len(self.action_names))

                cpe_details.metric_estimates[metric] = CpeEstimateSet()
                self.score_cpe(
                    metric, metric_reward_edp, cpe_details.metric_estimates[metric]
                )

        # Compute MC Loss on Aggregate Reward
        cpe_details.mc_loss = float(
            torch.mean(torch.abs(edp.logged_values - edp.model_values))
        )

        return cpe_details
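The `mc_loss` reported above is simply the mean absolute error between the logged values and the model's value estimates. A minimal standalone illustration of the same formula, using made-up tensors rather than a real `EvaluationDataPage`:

import torch

# Illustrative logged and model values for three transitions (made-up numbers).
logged_values = torch.tensor([[1.0], [2.0], [3.0]])
model_values = torch.tensor([[1.5], [2.5], [2.5]])

# Same formula as cpe_details.mc_loss above: mean absolute error.
mc_loss = float(torch.mean(torch.abs(logged_values - model_values)))
print(mc_loss)  # 0.5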
Example #4
 def create_edp(self, environment, samples, epsilon_model):
     """Generate a EvaluationDataPage such that the model policy is epsilon
     greedy with parameter epsilon_model. The true values of this policy are
     used for the model_values* data.
     """
     tdp = environment.preprocess_samples(samples,
                                          len(samples.mdp_ids),
                                          do_shuffle=False)[0]
     # compute rewards, probs, values for all actions of each sampled state
     model_rewards = environment.true_rewards_all_actions_for_sample(
         samples.states)
     model_propensities = environment.policy_probabilities_for_sample(
         samples.states, epsilon_model)
     model_values = environment.true_epsilon_values_all_actions_for_sample(
         samples.states, epsilon_model)
     # compute rewards for logged action
     model_rewards_logged_action = environment.true_rewards_for_sample(
         samples.states, samples.actions)
     edp = EvaluationDataPage(
         mdp_id=np.array(samples.mdp_ids).reshape(-1, 1),
         sequence_number=torch.tensor(samples.sequence_numbers,
                                      dtype=torch.int),
         logged_propensities=tdp.propensities,
         logged_rewards=tdp.rewards,
         action_mask=tdp.actions,
         model_propensities=torch.tensor(model_propensities,
                                         dtype=torch.float32),
         model_rewards=torch.tensor(model_rewards, dtype=torch.float32),
         model_rewards_for_logged_action=torch.tensor(
             model_rewards_logged_action, dtype=torch.float32),
         model_values=torch.tensor(model_values, dtype=torch.float32),
         model_values_for_logged_action=None,
         possible_actions_mask=tdp.possible_actions_mask,
     )
     return edp
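The `model_propensities` built above are epsilon-greedy action probabilities for the model policy. A minimal sketch of the standard epsilon-greedy definition, assuming discrete actions and a known greedy action per state (the helper name below is illustrative, not part of the environment API):

import numpy as np

def epsilon_greedy_probabilities(greedy_actions, num_actions, epsilon):
    # Every action gets epsilon / num_actions probability mass;
    # the greedy action additionally receives the remaining (1 - epsilon).
    probs = np.full((len(greedy_actions), num_actions), epsilon / num_actions)
    probs[np.arange(len(greedy_actions)), greedy_actions] += 1.0 - epsilon
    return probs

# e.g. three states, two actions, epsilon = 0.2
print(epsilon_greedy_probabilities(np.array([0, 1, 1]), 2, 0.2))
# [[0.9 0.1]
#  [0.1 0.9]
#  [0.1 0.9]]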
Example #5
 def handle(self, tdp: TrainingDataPage) -> None:
     if not self.trainer.calc_cpe_in_training:
         return
     if isinstance(tdp, TrainingDataPage):
         edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
     elif isinstance(tdp, TrainingBatch):
         if isinstance(self.trainer, (_DQNTrainer, SACTrainer)):
             # TODO: Implement CPE for modular DQNTrainer & continuous algos
             edp = None
         else:
             edp = EvaluationDataPage.create_from_training_batch(
                 tdp, self.trainer)
     if self.evaluation_data is None:
         self.evaluation_data = edp
     else:
         self.evaluation_data = self.evaluation_data.append(edp)
Example #6
    def evaluate(self, eval_tdp: PreprocessedTrainingBatch):
        seq2slate_net = self.trainer.seq2slate_net
        baseline_net = self.trainer.baseline_net

        seq2slate_net_prev_mode = seq2slate_net.training
        baseline_net_prev_mode = baseline_net.training
        seq2slate_net.eval()
        baseline_net.eval()

        log_prob = (
            seq2slate_net(
                eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
            )
            .log_probs.detach()
            .flatten()
            .cpu()
            .numpy()
        )
        b = baseline_net(eval_tdp.training_input).squeeze().detach()
        advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()

        self.baseline_loss.append(
            F.mse_loss(b, eval_tdp.training_input.slate_reward).item())
        self.advantages.append(advantage)
        self.log_probs.append(log_prob)

        seq2slate_net.train(seq2slate_net_prev_mode)
        baseline_net.train(baseline_net_prev_mode)

        if not self.calc_cpe:
            return

        edp = EvaluationDataPage.create_from_training_batch(
            eval_tdp, self.trainer, self.reward_network)
        if self.eval_data_pages is None:
            self.eval_data_pages = edp
        else:
            self.eval_data_pages = self.eval_data_pages.append(edp)
Example #7
 def handle(self, tdp: PreprocessedTrainingBatch) -> None:
     if not self.trainer.calc_cpe_in_training:
         return
     # TODO: Perhaps we can make an RLTrainer param to check if continuous?
     if isinstance(self.trainer, (SACTrainer, TD3Trainer)):
         # TODO: Implement CPE for continuous algos
         edp = None
     else:
         edp = EvaluationDataPage.create_from_training_batch(tdp, self.trainer)
     if self.evaluation_data is None:
         self.evaluation_data = edp
     else:
         self.evaluation_data = self.evaluation_data.append(edp)
Example #8
    def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
        cpe_details = CpeDetails()

        cpe_details.reward_estimates = self.score_cpe("Reward", edp)

        if (self.metrics_to_score is not None
                and edp.logged_metrics is not None
                and self.action_names is not None):
            for i, metric in enumerate(self.metrics_to_score):
                logger.info(
                    "--------- Running CPE on metric: {} ---------".format(
                        metric))

                metric_reward_edp = edp.set_metric_as_reward(
                    i, len(self.action_names))

                cpe_details.metric_estimates[metric] = self.score_cpe(
                    metric, metric_reward_edp)

        if self.action_names is not None:
            if edp.optimal_q_values is not None:
                value_means = edp.optimal_q_values.mean(dim=0)
                cpe_details.q_value_means = {
                    action: float(value_means[i])
                    for i, action in enumerate(self.action_names)
                }
                value_stds = edp.optimal_q_values.std(dim=0)  # type: ignore
                cpe_details.q_value_stds = {
                    action: float(value_stds[i])
                    for i, action in enumerate(self.action_names)
                }
            if edp.eval_action_idxs is not None:
                cpe_details.action_distribution = {
                    action: float(
                        (edp.eval_action_idxs == i).sum())  # type: ignore
                    / edp.eval_action_idxs.shape[0]
                    for i, action in enumerate(self.action_names)
                }

        # Compute MC Loss on Aggregate Reward
        cpe_details.mc_loss = float(
            torch.mean(torch.abs(edp.logged_values -
                                 edp.model_values))  # type: ignore
        )

        return cpe_details
Example #9
    def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
        cpe_details = CpeDetails()

        self.score_cpe("Reward", edp, cpe_details.reward_estimates)

        if self.metrics_to_score is not None:
            for i, metric in enumerate(self.metrics_to_score):
                logger.info(
                    "--------- Running CPE on metric: {} ---------".format(
                        metric))

                metric_reward_edp = edp.set_metric_as_reward(
                    i, len(self.action_names))

                cpe_details.metric_estimates[metric] = CpeEstimateSet()
                self.score_cpe(metric, metric_reward_edp,
                               cpe_details.metric_estimates[metric])

        # Compute MC Loss on Aggregate Reward
        cpe_details.mc_loss = float(
            torch.mean(torch.abs(edp.logged_values - edp.model_values)))

        return cpe_details
Example #10
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch,
                                                action_names)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        eval_batch_idx = 0
        while True:
            batch = eval_dataset.read_batch(eval_batch_idx)
            eval_batch_idx += 1
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch,
                                                action_names)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() -
                                                            start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
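The workflow above is driven entirely by a plain `params` dictionary. The sketch below shows a hypothetical shape for that dictionary, with keys inferred from the function body; paths and hyper-parameter values are placeholders, and the nested `rl`, `training`, and `rainbow` blocks must match the corresponding parameter classes of the library version in use:

params = {
    "model_output_path": "/tmp/dqn_model",             # placeholder path
    "training_data_path": "/tmp/training_data.json",   # placeholder path
    "eval_data_path": "/tmp/eval_data.json",           # placeholder path
    "state_norm_data_path": "/tmp/state_norm.json",    # placeholder path
    "actions": ["action_A", "action_B"],               # illustrative action names
    "epochs": 10,
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "rl": {"gamma": 0.99},                 # passed to RLParameters(**params["rl"])
    "training": {"minibatch_size": 1024},  # passed to TrainingParameters(**params["training"])
    "rainbow": {},                         # passed to RainbowDQNParameters(**params["rainbow"])
}
# train_network(params)  # run once real paths and full parameter sets are filled in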
Example #11
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDataset(params["eval_data_path"],
                               batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
            metrics_to_score=trainer.metrics_to_score,
        )
    else:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
            metrics_to_score=trainer.metrics_to_score,
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        for batch_idx in range(num_batches):
            batch = eval_dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example #12
    def test_seq2slate_eval_data_page(self):
        """
        Create 3 slate ranking logs and evaluate using Direct Method, Inverse
        Propensity Scores, and Doubly Robust.

        The logs are as follows:
        state: [1, 0, 0], [0, 1, 0], [0, 0, 1]
        indices in logged slates: [3, 2], [3, 2], [3, 2]
        model output indices: [2, 3], [3, 2], [2, 3]
        logged reward: 4, 5, 7
        logged propensities: 0.2, 0.5, 0.4
        predicted rewards on logged slates: 2, 4, 6
        predicted rewards on model output slates: 1, 4, 5

        Direct Method uses the predicted rewards on the model output slates.
        Thus the result is expected to be (1 + 4 + 5) / 3.

        Inverse Propensity Scores scale the logged reward by 1.0 / logged propensity
        whenever the model output slate matches the logged slate.
        Since only the second log matches the model output, the IPS result
        is expected to be 5 / 0.5 / 3.

        Doubly Robust is the sum of the Direct Method result and the propensity-scaled
        reward difference; the latter is defined as:
        1.0 / logged_propensities * (logged reward - predicted reward on logged slate)
         * Indicator(model slate == logged slate)
        Since only the second logged slate matches the model output slate,
        the DR result is expected to be (1 + 4 + 5) / 3 + 1.0 / 0.5 * (5 - 4) / 3.
        """
        batch_size = 3
        state_dim = 3
        src_seq_len = 2
        tgt_seq_len = 2
        candidate_dim = 2

        reward_net = FakeSeq2SlateRewardNetwork()
        seq2slate_net = FakeSeq2SlateTransformerNet()
        baseline_net = nn.Linear(1, 1)
        trainer = Seq2SlateTrainer(
            seq2slate_net,
            baseline_net,
            parameters=None,
            minibatch_size=3,
            use_gpu=False,
        )

        src_seq = torch.eye(candidate_dim).repeat(batch_size, 1, 1)
        tgt_out_idx = torch.LongTensor([[3, 2], [3, 2], [3, 2]])
        tgt_out_seq = src_seq[
            torch.arange(batch_size).repeat_interleave(tgt_seq_len),  # type: ignore
            tgt_out_idx.flatten() - 2,
        ].reshape(batch_size, tgt_seq_len, candidate_dim)

        ptb = rlt.PreprocessedTrainingBatch(
            training_input=rlt.PreprocessedRankingInput(
                state=rlt.PreprocessedFeatureVector(
                    float_features=torch.eye(state_dim)),
                src_seq=rlt.PreprocessedFeatureVector(float_features=src_seq),
                tgt_out_seq=rlt.PreprocessedFeatureVector(
                    float_features=tgt_out_seq),
                src_src_mask=torch.ones(batch_size, src_seq_len, src_seq_len),
                tgt_out_idx=tgt_out_idx,
                tgt_out_probs=torch.tensor([0.2, 0.5, 0.4]),
                slate_reward=torch.tensor([4.0, 5.0, 7.0]),
            ),
            extras=rlt.ExtraData(
                sequence_number=torch.tensor([0, 0, 0]),
                mdp_id=np.array(["0", "1", "2"]),
            ),
        )

        edp = EvaluationDataPage.create_from_training_batch(
            ptb, trainer, reward_net)
        doubly_robust_estimator = DoublyRobustEstimator()
        direct_method, inverse_propensity, doubly_robust = doubly_robust_estimator.estimate(
            edp)
        logger.info(f"{direct_method}, {inverse_propensity}, {doubly_robust}")

        avg_logged_reward = (4 + 5 + 7) / 3
        self.assertAlmostEqual(direct_method.raw, (1 + 4 + 5) / 3, delta=1e-6)
        self.assertAlmostEqual(direct_method.normalized,
                               direct_method.raw / avg_logged_reward,
                               delta=1e-6)
        self.assertAlmostEqual(inverse_propensity.raw, 5 / 0.5 / 3, delta=1e-6)
        self.assertAlmostEqual(
            inverse_propensity.normalized,
            inverse_propensity.raw / avg_logged_reward,
            delta=1e-6,
        )
        self.assertAlmostEqual(doubly_robust.raw,
                               direct_method.raw + 1 / 0.5 * (5 - 4) / 3,
                               delta=1e-6)
        self.assertAlmostEqual(doubly_robust.normalized,
                               doubly_robust.raw / avg_logged_reward,
                               delta=1e-6)
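For reference, the values asserted above can be reproduced directly from the numbers listed in the docstring. A short standalone check using plain arithmetic (no library code involved):

logged_rewards = [4.0, 5.0, 7.0]
logged_propensities = [0.2, 0.5, 0.4]
predicted_on_logged = [2.0, 4.0, 6.0]  # predicted rewards on the logged slates
predicted_on_model = [1.0, 4.0, 5.0]   # predicted rewards on the model output slates
slate_matches = [False, True, False]   # only the second model slate equals the logged slate

n = len(logged_rewards)
direct_method = sum(predicted_on_model) / n  # (1 + 4 + 5) / 3
ips = sum(
    r / p for r, p, m in zip(logged_rewards, logged_propensities, slate_matches) if m
) / n  # 5 / 0.5 / 3
doubly_robust = direct_method + sum(
    (r - q) / p
    for r, q, p, m in zip(
        logged_rewards, predicted_on_logged, logged_propensities, slate_matches
    )
    if m
) / n  # DM + 1.0 / 0.5 * (5 - 4) / 3
print(direct_method, ips, doubly_robust)  # approximately 3.333, 3.333, 4.0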
Example #13
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        batch_idx = -1
        while True:
            batch_idx += 1
            report_training_status(batch_idx, num_batches, epoch, int(params["epochs"]))
            batch = dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example #14
    def evaluate(self, eval_tdp: PreprocessedTrainingBatch):
        seq2slate_net = self.trainer.seq2slate_net
        baseline_net = self.trainer.baseline_net

        seq2slate_net_prev_mode = seq2slate_net.training
        baseline_net_prev_mode = baseline_net.training
        seq2slate_net.eval()
        baseline_net.eval()

        logged_slate_log_prob = (
            seq2slate_net(
                eval_tdp.training_input, mode=Seq2SlateMode.PER_SEQ_LOG_PROB_MODE
            )
            .log_probs.detach()
            .flatten()
            .cpu()
            .numpy()
        )
        b = baseline_net(eval_tdp.training_input).squeeze().detach()
        advantage = (eval_tdp.training_input.slate_reward - b).flatten().cpu().numpy()

        self.baseline_loss.append(
            F.mse_loss(b, eval_tdp.training_input.slate_reward).item())
        self.advantages.append(advantage)
        self.logged_slate_log_probs.append(logged_slate_log_prob)

        ranked_slate_output = seq2slate_net(eval_tdp.training_input,
                                            Seq2SlateMode.RANK_MODE,
                                            greedy=True)
        ranked_slate_prob = (torch.prod(
            torch.gather(
                ranked_slate_output.ranked_tgt_out_probs,
                2,
                ranked_slate_output.ranked_tgt_out_idx.unsqueeze(-1),
            ).squeeze(),
            -1,
        ).cpu().numpy())
        self.ranked_slate_probs.append(ranked_slate_prob)

        seq2slate_net.train(seq2slate_net_prev_mode)
        baseline_net.train(baseline_net_prev_mode)

        if not self.calc_cpe:
            return

        edp_g = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=True,
        )
        if self.eval_data_pages_g is None:
            self.eval_data_pages_g = edp_g
        else:
            self.eval_data_pages_g = self.eval_data_pages_g.append(edp_g)

        edp_ng = EvaluationDataPage.create_from_tensors_seq2slate(
            seq2slate_net,
            self.reward_network,
            eval_tdp.training_input,
            eval_greedy=False,
        )
        if self.eval_data_pages_ng is None:
            self.eval_data_pages_ng = edp_ng
        else:
            self.eval_data_pages_ng = self.eval_data_pages_ng.append(edp_ng)
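The `ranked_slate_prob` computed in the example above is the product, over slate positions, of the probabilities the model assigned to the items it actually placed at those positions. A standalone toy illustration of the same gather-and-product pattern, with made-up shapes and values:

import torch

# Toy setup: 2 slates, 3 positions, 4 candidate items.
ranked_tgt_out_probs = torch.tensor(
    [
        [[0.7, 0.1, 0.1, 0.1], [0.2, 0.6, 0.1, 0.1], [0.25, 0.25, 0.25, 0.25]],
        [[0.4, 0.4, 0.1, 0.1], [0.1, 0.1, 0.5, 0.3], [0.1, 0.2, 0.3, 0.4]],
    ]
)
ranked_tgt_out_idx = torch.tensor([[0, 1, 2], [1, 2, 3]])

# Pick, per position, the probability of the chosen item, then multiply across positions.
per_position = torch.gather(
    ranked_tgt_out_probs, 2, ranked_tgt_out_idx.unsqueeze(-1)
).squeeze(-1)
slate_prob = torch.prod(per_position, -1)
print(slate_prob)  # tensor([0.1050, 0.0800])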